├── KMSD.py
├── README.md
├── master
    ├── share_code
    │   ├── scrapy.cfg
    │   ├── share_code
    │   │   ├── __init__.py
    │   │   ├── __init__.pyc
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-36.pyc
    │   │   │   ├── items.cpython-36.pyc
    │   │   │   ├── middlewares.cpython-36.pyc
    │   │   │   ├── pipelines.cpython-36.pyc
    │   │   │   └── settings.cpython-36.pyc
    │   │   ├── algorithm
    │   │   │   ├── best_params.csv
    │   │   │   ├── similarity.py
    │   │   │   ├── similarity.pyc
    │   │   │   └── test1.py
    │   │   ├── auto_ip.py
    │   │   ├── auto_ip_multi_pro.py
    │   │   ├── example.py
    │   │   ├── extensions.pyc
    │   │   ├── extensionsItem.py
    │   │   ├── extensionsItem.pyc
    │   │   ├── extensionsTime.py
    │   │   ├── extensionsTime.pyc
    │   │   ├── items.py
    │   │   ├── items.pyc
    │   │   ├── middlewares.py
    │   │   ├── middlewares.pyc
    │   │   ├── pipelines.py
    │   │   ├── pipelines.pyc
    │   │   ├── settings.py
    │   │   ├── settings.pyc
    │   │   ├── spiders
    │   │   │   ├── ShareSpider.py
    │   │   │   ├── ShareSpider.pyc
    │   │   │   ├── __init__.py
    │   │   │   ├── __init__.pyc
    │   │   │   └── __pycache__
    │   │   │   │   ├── ShareSpider.cpython-36.pyc
    │   │   │   │   └── __init__.cpython-36.pyc
    │   │   └── untitled2.py
    │   └── zookeeper.out
    └── zoo_detect
    │   └── zoo_watcher.py
├── nupic_output.py
├── share_experiment.py
├── similarity.py
├── slave
    ├── share_code
    │   ├── scrapy.cfg
    │   └── share_code
    │   │   ├── __init__.py
    │   │   ├── __init__.pyc
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-36.pyc
    │   │       ├── items.cpython-36.pyc
    │   │       ├── middlewares.cpython-36.pyc
    │   │       ├── pipelines.cpython-36.pyc
    │   │       └── settings.cpython-36.pyc
    │   │   ├── auto_ip.py
    │   │   ├── auto_ip_multi_pro.py
    │   │   ├── example.py
    │   │   ├── extensions.pyc
    │   │   ├── extensionsItem.py
    │   │   ├── extensionsItem.pyc
    │   │   ├── extensionsTime.py
    │   │   ├── extensionsTime.pyc
    │   │   ├── items.py
    │   │   ├── items.pyc
    │   │   ├── middlewares.py
    │   │   ├── middlewares.pyc
    │   │   ├── pipelines.py
    │   │   ├── pipelines.pyc
    │   │   ├── settings.py
    │   │   ├── settings.pyc
    │   │   ├── spiders
    │   │       ├── SlaveSpider.py
    │   │       ├── SlaveSpider.pyc
    │   │       ├── __init__.py
    │   │       ├── __init__.pyc
    │   │       └── __pycache__
    │   │       │   ├── SlaveSpider.cpython-36.pyc
    │   │       │   └── __init__.cpython-36.pyc
    │   │   └── untitled2.py
    └── zoo_detect
    │   └── zoo_watcher.py
├── test_nupic.py
├── testbp.py
├── 参考博客
├── 各组件安装文档说明
├── 启动步骤
└── 笔记摘录


/KMSD.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | from similarity import MSD
  3 | import pandas as pd
  4 | import numpy as np
  5 | import copy
  6 | import random
  7 | 
  8 | def local_MSD_layer(a,b):   #对应于出现断层数据
  9 |     l1=np.nonzero(a!=-1)[0]
 10 |     l2=np.nonzero(b!=-1)[0]
 11 |     l=list(set(l1) & set(l2))
 12 |     sim=MSD(a[l],b[l])
 13 |     return sim
 14 | 
 15 | #在这里我将欧式距离改为了MSD距离
 16 | def distEclud(vecA,vecB):
 17 |     return local_MSD_layer(vecA,vecB)
 18 | #随机初始化K个质心(质心满足数据边界之内)
 19 | def randCent(dataSet,k):
 20 |     #得到数据样本的维度
 21 |     n=np.shape(dataSet)[1]
 22 |     #初始化为一个(k,n)的矩阵
 23 |     centroids=np.mat(np.zeros((k,n)))
 24 |     #遍历数据集的每一维度
 25 |     for j in range(n):
 26 |         #得到该列数据的最小值
 27 |         minJ=min(dataSet[:,j])
 28 |         #得到该列数据的范围(最大值-最小值)
 29 |         rangeJ=float(max(dataSet[:,j])-minJ)
 30 |         #k个质心向量的第j维数据值随机为位于(最小值，最大值)内的某一值
 31 |         centroids[:,j]=minJ+rangeJ*np.random.rand(k,1)
 32 |     #返回初始化得到的k个质心向量
 33 |     return np.array(centroids)
 34 | #k-均值聚类算法
 35 | #@dataSet:聚类数据集
 36 | #@k:用户指定的k个类
 37 | #@distMeas:距离计算方法，默认欧氏距离distEclud()
 38 | #@createCent:获得k个质心的方法，默认随机获取randCent()
 39 | def kMeans(dataSet,k,distMeas=distEclud,createCent=randCent):
 40 |     dataSet=np.array(dataSet)
 41 |     #获取数据集样本数
 42 |     m=np.shape(dataSet)[0]
 43 |     #初始化一个(m,2)的矩阵
 44 |     clusterAssment=np.mat(np.zeros((m,2)))
 45 |     #创建初始的k个质心向量
 46 |     centroids=createCent(dataSet,k)
 47 |     #聚类结果是否发生变化的布尔类型
 48 |     clusterChanged=True
 49 |     #只要聚类结果一直发生变化，就一直执行聚类算法，直至所有数据点聚类结果不变化
 50 |     while clusterChanged:
 51 |         #聚类结果变化布尔类型置为false
 52 |         clusterChanged=False
 53 |         #遍历数据集每一个样本向量
 54 |         for i in range(m):
 55 |             #初始化最小距离最正无穷；最小距离对应索引为-1
 56 |             minDist=np.inf;minIndex=-1
 57 |             #循环k个类的质心
 58 |             for j in range(k):
 59 |                 #计算数据点到质心的欧氏距离
 60 |                 distJI=distMeas(centroids[j,:],dataSet[i,:].reshape(centroids[j,:].shape))
 61 |                 #如果距离小于当前最小距离
 62 |                 if distJI<minDist:
 63 |                     #当前距离定为当前最小距离；最小距离对应索引对应为j(第j个类)
 64 |                     minDist=distJI;minIndex=j
 65 |         #当前聚类结果中第i个样本的聚类结果发生变化：布尔类型置为true，继续聚类算法
 66 |         if clusterAssment[i,0] !=minIndex:clusterChanged=True
 67 |         #更新当前变化样本的聚类结果和平方误差
 68 | 	print('minDist: ',minDist)
 69 |         clusterAssment[i,:]=minIndex,minDist**2
 70 |     #打印k-均值聚类的质心
 71 |     print centroids
 72 |     #遍历每一个质心
 73 |     for cent in range(k):
 74 |         #将数据集中所有属于当前质心类的样本通过条件过滤筛选出来
 75 |         ptsInClust=dataSet[np.nonzero(clusterAssment[:,0].A==cent)[0]]
 76 |         #计算这些数据的均值（axis=0：求列的均值），作为该类质心向量
 77 |         centroids[cent,:]=np.mean(ptsInClust,axis=0)
 78 |     #返回k个聚类，聚类结果及误差
 79 |     return centroids,clusterAssment
 80 | #算法如下：
 81 | #如果两个序列的0占比不多于30%，则取出这些序列作为新的DD，然后聚类，其他的作为最后一个聚类
 82 | def kmeans_MSD(DD,k=3):  #返回dict,value是DF
 83 |     newD=[]
 84 |     leftD=[]
 85 |     for i in range(DD.shape[1]):
 86 | 	if len(np.nonzero(DD[:,i]==0)[0])<=int(0.3*DD.shape[0]):
 87 | 	    if len(newD)==0:
 88 | 		newD=copy.deepcopy(DD[:,i]).reshape(-1,1)
 89 | 	    else:
 90 | 		newD=np.concatenate((newD,copy.deepcopy(DD[:,i]).reshape(-1,1)),axis=1)
 91 |    	else:
 92 | 	    if len(leftD)==0:
 93 | 		leftD=copy.deepcopy(DD[:,i]).reshape(-1,1)
 94 | 	    else:
 95 | 		leftD=np.concatenate((leftD,copy.deepcopy(DD[:,i]).reshape(-1,1)),axis=1)
 96 |     if isinstance(leftD,np.ndarray):leftD=leftD.T
 97 |     if isinstance(newD,np.ndarray):newD=newD.T
 98 |     print('KMSD newD leftD: ',newD.shape,leftD.shape)
 99 |     centroids,clusterAss= kMeans(copy.deepcopy(newD),k) 
100 |     print('clusterAss: ',clusterAss.shape)
101 |     return centroids,np.array(clusterAss)
102 | 	
103 | 
104 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 项目中文名：基于scrapy-redis的股票分布式爬虫实现及其股票预测算法研究
 2 | 
 3 | ## 写在前面：   
 4 | 这是我的个人项目，之前学习了大数据和AI的大量知识，但一直没有机会去做一个全面的实践。后来决定利用大数据的
 5 | 技术做一个爬虫系统，然后在数据上面做一些AI层面的东西。然后就完成这个项目了。这个项目的主要意义在，里面几乎包含了最经
 6 | 典的大数据技术和一些和比较基础但经典的AI技术，对（大数据AI）有兴趣的读者，可以仔细看看这个项目，甚至可以接着这个项目
 7 | 往下做，因为这个项目完成度还不是很高。有大量可优化的地方。若真的完成一遍，对大数据+AI这方面的知识把握程度一定会有一个
 8 | 质的变化----这个项目所包含的组件和技术，还有这个项目的各个模块，是很接近企业的实际情况，只不过企业的更大更完善。有兴趣
 9 | 的学生可以玩玩，也可以联系我。我的博客：https://blog.csdn.net/jiede1/  我的微信：jiedemaikena
10 | 
11 | ## 项目意义：  
12 | 大数据方面： 利用scrapy-redis插件实现爬虫数据的分布式爬取，加快爬取速度。我选择了东方财富网的股票数据。  
13 | AI方面： 鉴于项目选择了股票数据的爬取，因此使用神经网络以及HTM算法分别二次实现预测算法，对股票的“每日的开盘价”进行预测。
14 | 
15 | ## 环境：   
16 | 三台服务器，Centos6系统。（实际上由于是分布式的，可以部署任意多台服务器，笔者三台）
17 | 
18 | ## 技术点：  
19 | scrapy  单机爬虫框架  
20 | redis   高速的key-value数据库  
21 | zookeeper  分布式监控框架  
22 | hadoop    分布式存储系统  
23 | BP neural network 反向传播神经网络   
24 | HTM algotithm  类脑AI算法，可用于对时间序列的预测和异常检测
25 | 
26 | ## 项目效果：  
27 | 该项目完善的实现了数据的分布式爬取和存储功能，也实现了两个预测算法进行预测，其中神经网络预测效果并不良好，  
28 | 但HTM算法效果突出。对股票预测有兴趣的人，可以进一步研究此算法。
29 | 
30 | 
31 | ## 各目录意义：   
32 | master  spider的master端的源码  
33 | slave spider的slave端的源码  
34 | KSMD.py 基于MSD相似度的kmeans算法实现
35 | nupic_output.py nupic算法的output  
36 | share_experiment.py Nupic算法的swarm自动寻最优参数  
37 | similarity.py MSD相似度
38 | test_nupic.py Nupic算法预测股票
39 | testbp.py BP神经网络预测股票
40 |  
41 |  
42 |  
43 |  
44 | 
45 | 


--------------------------------------------------------------------------------
/master/share_code/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = share_code.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = share_code
12 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/__init__.py


--------------------------------------------------------------------------------
/master/share_code/share_code/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/__init__.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/__pycache__/middlewares.cpython-36.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/algorithm/best_params.csv:
--------------------------------------------------------------------------------
1 | {'shuffle': False, 'hidden_layer_sizes': (7,), 'activation': 'relu', 'max_iter': 20000, 'solver': 'adam', 'learning_rate': 'adaptive', 'momentum': 0}


--------------------------------------------------------------------------------
/master/share_code/share_code/algorithm/similarity.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | '''
 3 | complete Time Series Trend Similarity
 4 | D(msd)= D(Euclid)*(2-ASD/SAD)
 5 | ASD --- Difference in absolute value
 6 | SAD --- manhaton distance
 7 | '''
 8 | import numpy as np
 9 | 
10 | def MSD(a,b):
11 |     if len(a)!=len(b):
12 | 	print('length not equal,quit')
13 | 	return 
14 |     if not isinstance(a,np.ndarray):
15 | 	a=np.array(a).reshape(-1)
16 |     if not isinstance(b,np.ndarray):
17 |         b=np.array(b).reshape(-1)
18 |     if (a==b).all():return 0
19 |     #欧式距离
20 |     Deuclid=np.sqrt((a-b).dot(a-b))
21 |     print('Deuclid: ',Deuclid)
22 |     #曼哈顿距离
23 |     Dmahat=sum(abs(a-b))
24 |     print('mahaton: ',Dmahat)
25 |     #ASD
26 |     ASD=abs(sum(a-b))
27 |     print('ASD: ',ASD)
28 |     #MSD
29 |     msd=Deuclid*(2-ASD/Dmahat)
30 |     print('msd: ',msd)
31 |     return msd
32 |     
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/algorithm/similarity.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/algorithm/similarity.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/algorithm/test1.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | #  version-1  #
  3 | #将hdfs数据本地化---读入DataFrame----利用BP+SMD
  4 | #下面讲述如何利用BP+SMD
  5 | #将数据的‘开盘价’取出，形成一列。对所有股票操作，形成多列。使用SMD
  6 | import time
  7 | import numpy as np
  8 | import pandas as pd
  9 | import sklearn
 10 | from hdfs.client import Client
 11 | from similarity import MSD
 12 | import copy
 13 | from sklearn.neural_network import MLPRegressor
 14 | from sklearn import preprocessing
 15 | from sklearn.model_selection import GridSearchCV
 16 | '''
 17 | filepath='/sdbadmin/hadoop/input'
 18 | try:
 19 |     client=Client('http://192.168.111.130:50070')
 20 | except Exception as e:
 21 |     print(e)
 22 | 
 23 | dirs=client.list(filepath)
 24 | #将hdfs本地化
 25 | print('there are %d shares'%(len(dirs)))
 26 | 
 27 | try:
 28 |     for i in range(len(dirs)):
 29 |         client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i])
 30 | except Exception as e:
 31 |     print(e)
 32 | 
 33 | min_max_scaler = preprocessing.MinMaxScaler()
 34 | DD=pd.DataFrame([])
 35 | for i in range(len(dirs)):
 36 |     df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0)
 37 |     if len(DD)==len(df):
 38 | 	trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]))
 39 |         DD[dirs[i].strip().split('.')[0]]=trun
 40 |     else:
 41 | 	cols=DD.columns
 42 | 	DD=pd.concat([DD,copy.deepcopy(df.iloc[:,5])],axis=1)   #长度不一致的合并
 43 | 	f=list(cols)
 44 | 	f.append(dirs[i].strip().split('.')[0])
 45 | 	DD.columns=f
 46 | DD.fillna(-1, inplace=True)
 47 | 
 48 | print('DD shape: ',np.shape(DD))
 49 | print(DD) #[6690 rows x 169 columns]
 50 | '''
 51 | 
 52 | 
 53 | #计算MSD
 54 | #只针对非-1序列值
 55 | #设定距离长为100,计算得到的share称为 局部msd最优share
 56 | #算法如下：
 57 | '''
 58 | 1.选取要进行预测的share
 59 | 2.计算剩余的share中，msd最小的20条局部msd最优share
 60 | 3.对2中share计算百分比*share值，作为输入，输出为下一刻的目标值,取70%长度作为训练
 61 | '''
 62 | def local_MSD(a,b,start=0,end=100): #局部最优，默认100
 63 |     return MSD(a.iloc[start:end,],b.iloc[start:end,])
 64 | 
 65 | def local_MSD_layer(a,b):   #对应于出现断层数据
 66 |     l1=np.nonzero(a!=-1)[0]
 67 |     l2=np.nonzero(b!=-1)[0]
 68 |     l=list(set(l1) & set(l2))
 69 |     sim=local_MSD(a.iloc[l,],b.iloc[l,],end=len(a))
 70 |     return sim
 71 | def sortedTw(a):
 72 |     res=sorted(a,reverse=True)
 73 |     D={}
 74 |     for i in range(20):
 75 | 	val=res[i].values()[0].split('-')[1]
 76 | 	D[val]=res[i].keys()[0]   #序列号做keys,相似度做value
 77 |     return D
 78 | def update_train(tempDF,D,detect,trainD,trainL):
 79 |     #key=D.keys()
 80 |     #value=D.values()
 81 |     #tempLength=len(tempDF)
 82 |     FG=[]
 83 |     for key,value in D.items():
 84 |         print('key,value:',key,value)
 85 |         temp=copy.deepcopy(tempDF.iloc[:,int(key)])
 86 | 	temp=temp/float(value) if value!=0 else temp
 87 | 	FG.append(temp)
 88 |     if len(trainD)!=0:
 89 |         trainD=np.concatenate([trainD,np.array(FG).T],axis=0)  #向下拼接
 90 | 	trainL=np.concatenate([trainL,tempDF.iloc[:,detect].values.reshape(-1)],axis=0)
 91 |     else:
 92 | 	trainD=np.array(FG).T
 93 | 	trainL=tempDF.iloc[:,detect].values.reshape(-1)
 94 |     print('trainD,trainL:',trainD.shape,trainL.shape)
 95 |     return trainD,trainL
 96 | 
 97 | def create_examples(DF,detect,pro=1.0,msd_len=100):  #DF是pandas，pro可选，这里默认为1，意味着train/test数据集都生成，msd_len默认msd距离长为100,detect是目标列
 98 |     length=int(len(DF)*pro)
 99 |     print('length: ',length)
100 |     i=0
101 |     Data=np.array([])   #train data
102 |     Label=np.array([])   #train label
103 |     while i<length:
104 | 	start=i
105 | 	i+=100
106 |         end=start+100 if start+100<=length else length  #训练集不得超过length
107 |         tempDF=DF.iloc[start:end,:]
108 | 	if len(np.nonzero(tempDF.iloc[:,detect]==-1)[0])>=int(0.1*len(tempDF)):    #限制为10%的好处是，可以避免这种情况：两个序列中，有一个序列非常多-1，导致计算不准确
109 | 	    print('Not satisfied!')
110 | 	    continue
111 | 	simL=[]
112 | 	for j in range(tempDF.shape[1]):
113 | 	    if j!=detect:  #自己不会跟自己计算
114 | 		l=len(np.nonzero(tempDF.iloc[:,j]==-1)[0])
115 | 		if l<=int(0.1*len(tempDF)):    #如果-1占比不超过10%,计算
116 | 		    sim=local_MSD_layer(tempDF.iloc[:,detect],tempDF.iloc[:,j]) 
117 | 		    simL.append({sim:str(detect)+'-'+str(j)})  #相似度做keys
118 | 		else:
119 | 		    print('%d col is not suitable for %d because -1 too much '%(j,detect))
120 |         D=sortedTw(simL)   #得到与detect符合的20个序列序号，以及相似度
121 | 	Data,Label = update_train(tempDF,D,detect,Data,Label)   #更新train训练集和标签	
122 | 	print('Data print: ',Data)
123 | 	#ime.sleep(6)
124 |     return Data,Label
125 | 
126 | def percent(a):
127 |     a=pd.DataFrame(a)
128 |     a.columns=['origin','predict']
129 |     b=[]
130 |     for i in range(len(a)):
131 | 	d=abs(a.iloc[i,0]-a.iloc[i,1])/a.iloc[i,1] if a.iloc[i,1] else 'Error'
132 | 	if not isinstance(d,str):
133 | 	    d=str(d*100)+'%'
134 | 	b.append(d)
135 |     a['loss']=b
136 |     return a
137 | def use_algorithm(Data,Label,pro=0.7):   #利用sklearn的神经网络实现预测，默认0.7训练集
138 |     #from sklearn.neural_network import MLPRegressor    
139 |     train_length=int(pro*len(Data))
140 |     trainD=Data[:train_length,:]
141 |     testD=Data[train_length:,:]
142 |     trainL=Label[:train_length]
143 |     testL=Label[train_length:]
144 |     print('trainD,trainL,testD,testL: ',trainD.shape,trainL.shape,testD.shape,testL.shape)
145 |     #exit(1)
146 |     # 神经网络
147 |     parameters = {  
148 |         'hidden_layer_sizes': [(15,),(7,),(21,),(15,3),(7,3),(21,3),(7,3,3),(7,5,3)], 
149 | 	'max_iter': [20000,100000],
150 | 	'momentum': [0,0.5,1],
151 | 	'learning_rate': ['adaptive','constant','invscaling'],\
152 | 	'solver': ['sgd','adam'],\
153 | 	'shuffle': [False],\
154 | 	'activation': ['logistic','relu','tanh']
155 |     }
156 |     mlp = MLPRegressor()
157 |     clf = GridSearchCV(mlp, parameters)
158 |     model=clf.fit(trainD,trainL)
159 |     bestp=clf.best_params_   
160 |     with open('best_params.csv','w') as f:
161 | 	f.write(str(bestp))
162 | 	f.close()
163 |     with open('best_params.csv','r') as f:
164 |         bestp=eval(f.read())
165 |         f.close()
166 |     print('bestp: ',bestp)
167 |     #clf = MLPRegressor(hidden_layer_sizes=(15,), max_iter=100000,learning_rate='adaptive',solver='sgd',shuffle=False,activation='logistic')
168 |     #model = clf.fit(trainD,trainL)
169 |     predictD = model.predict(testD)   #predict data
170 | 
171 |     nnDF=np.concatenate([predictD.reshape(-1,1),testL.reshape(-1,1)],axis=1)
172 |     loss=percent(nnDF)
173 |     print('loss: ',loss)
174 |     #coefficient of determination ---- 1.0 is the best
175 |     score = model.score(testD,testL)
176 |     print('score: ',score)
177 | 
178 | if __name__=='__main__':       
179 | 
180 |     filepath='/sdbadmin/hadoop/input'
181 |     try:
182 |         client=Client('http://192.168.111.130:50070')
183 |     except Exception as e:
184 |         print(e)
185 | 
186 |     dirs=client.list(filepath)
187 |     #将hdfs本地化
188 |     print('there are %d shares'%(len(dirs)))
189 |     '''
190 |     try:
191 |         for i in range(len(dirs)):
192 |             client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i])
193 |     except Exception as e:
194 |         print(e)
195 |     '''
196 |     min_max_scaler = preprocessing.MinMaxScaler()
197 |     DD=pd.DataFrame([])
198 |     for i in range(len(dirs)):
199 |         df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0)
200 |         if len(DD)==len(df) or len(DD)==0 and len(df)!=0:
201 |             trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1))
202 |             DD[dirs[i].strip().split('.')[0]]=trun.ravel()	
203 |         elif len(df)!=0:
204 |             cols=DD.columns
205 | 	    trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1))
206 |             DD=pd.concat([DD,pd.DataFrame(trun)],axis=1)   #长度不一致的合并
207 |             f=list(cols)
208 |             f.append(dirs[i].strip().split('.')[0])
209 |             DD.columns=f
210 |     DD.fillna(-1, inplace=True)
211 | 
212 |     print('DD shape: ',np.shape(DD))
213 |     print(DD) #[6690 rows x 169 columns]
214 | 
215 |     Data,Label=create_examples(DD,detect=0)	
216 | 	
217 |     use_algorithm(Data,Label)		
218 | 
219 | 
220 | '''
221 | for i in range(y):  #遍历所有share
222 |     print('Now is %s'%DD.columns[i])
223 |     for j in range(y):
224 | 	if i!=j:
225 | 	    l1=np.nonzero(DD.iloc[:,i]!=-1)[0]
226 | 	    l2=np.nonzero(DD.iloc[:,j]!=-1)[0]
227 | 	    l=set(l1) & set(l2)
228 | 	    tempDF=copy.deepcopy(DD.iloc[list(l),[i,j]])
229 | 	    print('tempDF shape: ',tempDF.shape)
230 | 	    train_len=int(len(tempDF)*0.7)
231 | 	    test_len=int(len(tempDF)*0.3)
232 | '''	     
233 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/auto_ip.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Fri Apr 13 21:51:29 2018
  4 | 
  5 | @author: Administrator
  6 | """
  7 | import urllib
  8 | import urllib2
  9 | import time
 10 | import redis
 11 | from scrapy.selector import Selector
 12 | import requests
 13 | 
 14 | def get_url(url):     # 国内高匿代理的链接
 15 |     url_list = []
 16 |     for i in range(1,100):
 17 |         url_new = url + str(i)
 18 |         url_list.append(url_new)
 19 |     return url_list
 20 | def get_content(url):     # 获取网页内容
 21 |     user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'
 22 |     headers = {'User-Agent': user_agent}
 23 |     req = requests.get(url=url, headers=headers)
 24 |     
 25 |     '''
 26 |     req = urllib.request.Request(url=url, headers=headers)
 27 |     res = urllib.request.urlopen(req)
 28 |     content = res.read()
 29 |     return content.decode('utf-8')    
 30 |     '''
 31 |     return req
 32 | 
 33 | def get_info(content):      # 提取网页信息 / ip 端口
 34 |     datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract()
 35 |     datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract()
 36 |     datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract()
 37 |     
 38 |     '''
 39 |     datas_ip = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract()
 40 |     datas_head = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract()
 41 |     datas_port =Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract()
 42 |     '''
 43 |     
 44 |     #写入redis
 45 |     print('head: ',datas_head)
 46 |     try:        
 47 |         R=redis.Redis(host='localhost',port='6379')
 48 |     except Exception as e:
 49 |         print(e)
 50 |     count=0
 51 |     for head,ip,port in zip(datas_head,datas_ip,datas_port):
 52 |         p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count)
 53 |         if p:
 54 |             count+=1
 55 |     #print(datas_ip,datas_port)
 56 |     return datas_head,datas_ip,datas_port
 57 | def verify_ip(head,ip,port):    # 验证ip有效 
 58 |     user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 
 59 |     headers = {'User-Agent':user_agent}
 60 |     if 'https' in head:
 61 |         proxy = {'https':'%s://%s:%s'%(head.lower(),ip,port)}
 62 |     else:
 63 |         proxy = {'http':'%s://%s:%s'%(head.lower(),ip,port)}
 64 |     print(proxy)
 65 |     
 66 |     #proxy_handler = urllib.request.ProxyHandler(proxy)
 67 |     proxy_handler=urllib2.ProxyHandler(proxy)
 68 |     #opener = urllib.request.build_opener(proxy_handler)
 69 |     opener = urllib2.build_opener(proxy_handler)
 70 |     urllib2.install_opener(opener)
 71 | 
 72 |     #test_url = "https://www.baidu.com/"
 73 |     test_url = "http://quote.eastmoney.com/stocklist.html#sh"
 74 |     req = urllib2.Request(url=test_url,headers=headers)
 75 |     time.sleep(3)
 76 |     
 77 |     try:   
 78 |         R=redis.Redis(host='localhost',port='6379')
 79 |     except Exception as e:
 80 |         print(e)
 81 |     count=0
 82 |     try:
 83 |         res = urllib2.urlopen(req,timeout=1)
 84 |         #time.sleep(3)
 85 |         content = res.read()
 86 |         if content:
 87 |             print('that is ok')
 88 |             R.zadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port),count)
 89 |             count+=1
 90 |         else:
 91 |             print('its not ok')
 92 |     except urllib2.URLError as e:
 93 |         print(e.reason)
 94 |     except Exception as e:
 95 | 	print(e)
 96 | 
 97 | if __name__ == '__main__':
 98 |     url = 'http://www.xicidaili.com/nn/'
 99 |     url_list = get_url(url)
100 |     for i in url_list:
101 |         print(i)
102 |         content = get_content(i)
103 |         time.sleep(3)
104 |         data=get_info(content)
105 |         for head,ip,port in zip(data[0],data[1],data[2]):
106 |             verify_ip(head,ip,port)
107 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/auto_ip_multi_pro.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Wed Apr 11 00:01:28 2018
  4 | 
  5 | @author: Administrator
  6 | """
  7 | from scrapy.selector import Selector
  8 | import time
  9 | import requests
 10 | import redis
 11 | from multiprocessing import Process, Queue  
 12 | 
 13 | 
 14 | def get_url(url):     # 国内高匿代理的链接
 15 |     url_list = []
 16 |     for i in range(1,100):
 17 |         url_new = url + str(i)
 18 |         url_list.append(url_new)
 19 |     return url_list
 20 | 
 21 | def get_content(url):     # 获取网页内容
 22 |     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
 23 |     headers = {'User-Agent': user_agent}
 24 |     req = requests.get(url=url, headers=headers)
 25 |     return req
 26 | 
 27 | def get_info(content):      # 提取网页信息 / ip 端口
 28 |     datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract()
 29 |     datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract()
 30 |     datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract()
 31 |     
 32 |     #写入redis
 33 |     print('head: ',datas_head)
 34 |     try:        
 35 |         R=redis.Redis(host='localhost',port='6379')
 36 |     except Exception as e:
 37 |         print(e)
 38 |     count=0
 39 |     for head,ip,port in zip(datas_head,datas_ip,datas_port):
 40 |         p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count)
 41 |         if p:
 42 |             count+=1
 43 |     #print(datas_ip,datas_port)
 44 |     return datas_head,datas_ip,datas_port
 45 |    
 46 | def verify_ip_one(old_queue,new_queue):    # 验证ip有效性
 47 |     while 1:
 48 |         data=old_queue.get()
 49 |         print(data)
 50 |         if data==0:
 51 |             break
 52 |         head=data[0].lower()
 53 |         ip=data[1]
 54 |         port=data[2]
 55 |         print('head,data,ip')
 56 |         
 57 |         user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'
 58 |         accept_encoding ='gzip, deflate, sdch'  
 59 |         accept_language ='zh-CN,zh;q=0.8'  
 60 |         headers = {'User-Agent':user_agent,'Accept-Encoding':accept_encoding,'Accept-Language':accept_language}
 61 |         if 'https' in head:
 62 |             proxy = {'https':'%s://%s:%s'%(head,ip,port)}
 63 |         else:
 64 |             proxy = {'http':'%s://%s:%s'%(head,ip,port)}
 65 |         print(proxy)
 66 |     
 67 |         test_url = "https://www.baidu.com/"
 68 |         
 69 |         try:
 70 |             req = requests.get(url=test_url,proxies=proxy,headers=headers)
 71 |             status_code=req.status_code
 72 |             if status_code==200:
 73 |                 print('that is ok')
 74 |                 print(str(ip) + u":" + str(port))
 75 |                 new_queue.put([head,ip,port])
 76 |             else:
 77 |                 print('its not ok')
 78 |         except Exception as e:
 79 |             print('fall down')
 80 | 
 81 | def verif_ip(data):
 82 |     old_queue=Queue()
 83 |     for head,ip,port in zip(data[0],data[1],data[2]):
 84 |         old_queue.put([head,ip,port])               #往没验证过的queue加入数据
 85 |     print('verify ip.....')
 86 |     print('old_queue: ',old_queue.qsize())
 87 |     old_queue.put(0)  #终止条件
 88 |     new_queue=Queue()
 89 |     works = []  
 90 |     for i in range(1):  
 91 |         print('process %s'%i)
 92 |         works.append(Process(target=verify_ip_one, args=(old_queue,new_queue)))
 93 |     for work in works:
 94 |         print('process start')
 95 |         work.start()     
 96 |         work.join()
 97 |     '''
 98 |     for work in works:
 99 |         work.join() 
100 |     '''
101 |     try:   
102 |         R=redis.Redis(host='localhost',port='6379')
103 |     except Exception as e:
104 |         print(e)
105 |     for i in range(new_queue.qsize()):
106 |         head,ip,port=new_queue.get()
107 |         R.sadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port))
108 |         print('insert one row')
109 |     
110 |         
111 | if __name__ == '__main__':
112 |     url = 'http://www.xicidaili.com/nn/'
113 |     url_list = get_url(url)
114 |     for i in url_list:
115 |         print(i)
116 |         content = get_content(i)
117 |         time.sleep(3)
118 |         data=get_info(content)
119 |         verif_ip(data)
120 |         '''
121 |         for head,ip,port in zip(data[0],data[1],data[2]):
122 |             verif_ip(head.lower(),ip,port)
123 |         '''
124 |             


--------------------------------------------------------------------------------
/master/share_code/share_code/example.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 13 17:35:23 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | 
 8 | from multiprocessing import Process, Queue
 9 |  
10 | def f(q,n):
11 |     q.put([42, n, 'hello'])
12 |  
13 | if __name__ == '__main__':
14 |     q = Queue()
15 |     p_list=[]
16 |     for i in range(3):
17 |         p = Process(target=f, args=(q,i))
18 |         p_list.append(p)
19 |         p.start()
20 |     print(q.get())
21 |     print(q.get())
22 |     print(q.get())
23 |     for i in p_list:
24 |         i.join()


--------------------------------------------------------------------------------
/master/share_code/share_code/extensions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/extensions.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/extensionsItem.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 20 19:18:54 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | import time
 8 | import logging
 9 | from scrapy import signals
10 | from scrapy.exceptions import NotConfigured
11 | logger = logging.getLogger(__name__)
12 | 
13 | class SpiderOpenCloseLogging(object):
14 | 
15 |     def __init__(self, item_count,idle_number,crawler):
16 |         self.item_count = item_count
17 | 
18 |         self.items_scraped = 0
19 | 
20 | 	self.idle_count = 0
21 | 
22 | 	self.idle_list = []
23 | 	
24 | 	self.crawler = crawler
25 | 
26 | 	self.idle_number = idle_number
27 | 
28 |     @classmethod
29 |     def from_crawler(cls, crawler):
30 |         # first check if the extension should be enabled and raise
31 | 
32 |         # NotConfigured otherwise
33 | 
34 |         if not crawler.settings.getbool('MYEXT_ENABLED'):
35 | 
36 |             raise NotConfigured
37 | 	#idle_number
38 | 	idle_number = crawler.settings.getint('IDLE_NUMBER', 10)
39 | 
40 |         # get the number of items from settings
41 | 
42 |         item_count = crawler.settings.getint('ITEM_NUMBER', 10000000)
43 | 
44 |         # instantiate the extension object
45 | 
46 |         ext = cls(item_count,idle_number,crawler)
47 | 
48 |         # connect the extension object to signals
49 | 
50 |         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
51 | 
52 |         crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
53 | 
54 |         crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
55 | 
56 |         crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
57 | 
58 |         # return the extension object
59 | 
60 |         return ext
61 | 
62 |     def spider_opened(self, spider):
63 |         logger.info("opened spider %s redis spider Idle, Continuous idle limit： %d", spider.name, self.idle_number)
64 | 
65 |     def spider_closed(self, spider,reason='finished'):  #默认结束reason是finished，如果spider是被引擎的 close_spider 方法所关闭，则其为调用该方法时传入的 reason 参数(默认为 'cancelled')
66 |         logger.info("closed spider %s, idle count %d , Continuous idle count %d ,closed reason %s",
67 |                 spider.name, self.idle_count, len(self.idle_list),reason)
68 | 
69 |     def item_scraped(self, item, spider):
70 | 	if item:
71 |             self.items_scraped += 1
72 | 	    print('self.items: ',item)
73 |             if self.items_scraped % self.item_count == 0:
74 |                 spider.log("scraped %d items" % self.items_scraped)
75 | 
76 |     def spider_idle(self, spider):
77 |         self.idle_count += 1                        # 空闲计数
78 |         print('idle_count:',self.idle_count)
79 |         self.idle_list.append(time.time())       # 每次触发 spider_idle时，记录下触发时间戳
80 |         idle_list_len = len(self.idle_list)         # 获取当前已经连续触发的次数
81 |         if idle_list_len > 8:
82 |            # 连续触发的次数达到配置次数后关闭爬虫
83 |            logger.info('\n continued idle number exceed {} Times'
84 |                         '\n meet the idle shutdown conditions, will close the reptile operation'
85 |                         '\n idle start time: {},  close spider time: {}'.format(8,
86 |                                                                               self.idle_list[0], self.idle_list[-1]))
87 | 	   self.crawler.engine.close_spider(spider, 'closespider_ForNullRun')
88 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/extensionsItem.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/extensionsItem.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/extensionsTime.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | 
 3 | from time import time
 4 | 
 5 | from scrapy.exceptions import NotConfigured
 6 | from twisted.internet import task
 7 | from scrapy import signals
 8 | 
 9 | 
10 | class Latencies(object):
11 |     """
12 |     An extension that measures throughput and latencies.
13 |     """
14 |     @classmethod
15 |     def from_crawler(cls, crawler):
16 |         return cls(crawler)
17 | 
18 |     def __init__(self, crawler):
19 |         self.crawler = crawler
20 |         self.interval = crawler.settings.getfloat('LATENCIES_INTERVAL')
21 | 
22 |         if not self.interval:
23 |             raise NotConfigured
24 | 
25 |         cs = crawler.signals
26 |         cs.connect(self._spider_opened, signal=signals.spider_opened)
27 |         cs.connect(self._spider_closed, signal=signals.spider_closed)
28 |         cs.connect(self._request_scheduled, signal=signals.request_scheduled)
29 |         cs.connect(self._response_received, signal=signals.response_received)
30 |         cs.connect(self._item_scraped, signal=signals.item_scraped)
31 | 
32 |         self.latency, self.proc_latency, self.items = 0, 0, 0
33 | 
34 |     def _spider_opened(self, spider):
35 |         self.task = task.LoopingCall(self._log, spider)
36 |         self.task.start(self.interval)
37 | 
38 |     def _spider_closed(self, spider, reason):
39 |         if self.task.running:
40 |             self.task.stop()
41 | 
42 |     def _request_scheduled(self, request, spider):
43 |         request.meta['schedule_time'] = time()
44 | 
45 |     def _response_received(self, response, request, spider):
46 |         request.meta['received_time'] = time()
47 | 
48 |     def _item_scraped(self, item, response, spider):
49 |         self.latency += time() - response.meta['schedule_time']
50 |         self.proc_latency += time() - response.meta['received_time']
51 |         self.items += 1
52 | 
53 |     def _log(self, spider):
54 |         irate = float(self.items) / self.interval  #interval 时间内处理item数
55 |         latency = self.latency / self.items if self.items else 0  #单个时延
56 |         proc_latency = self.proc_latency / self.items if self.items else 0  #单个响应时间
57 | 
58 |         spider.logger.info(("Scraped %d items at %.1f items/s, avg latency: "
59 |                             "%.2f s and avg time in pipelines: %.2f s") %
60 |                           (self.items, irate, latency, proc_latency))
61 | 
62 |         self.latency, self.proc_latency, self.items = 0, 0, 0
63 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/extensionsTime.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/extensionsTime.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | from scrapy.loader import ItemLoader
10 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
11 | 
12 | 
13 | class ShareCodeItem(scrapy.Item):
14 |     # define the fields for your item here like:
15 |     # name = scrapy.Field()
16 |     number = scrapy.Field()
17 |     
18 |     data = scrapy.Field()
19 | 
20 | class ShareLoader(ItemLoader):
21 |     default_item_class = ShareCodeItem
22 |     default_input_processor = MapCompose(lambda s: s.lstrip().replace('\r',''))#去掉左边空格
23 |     default_output_processor = Join()
24 |     description_out = Join()
25 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/items.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | import scrapy
  9 | from scrapy import signals
 10 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
 11 | from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
 12 | import random
 13 | import redis
 14 | 
 15 | 
 16 | class ShareCodeSpiderMiddleware(object):
 17 |     # Not all methods need to be defined. If a method is not defined,
 18 |     # scrapy acts as if the spider middleware does not modify the
 19 |     # passed objects.
 20 | 
 21 |     @classmethod
 22 |     def from_crawler(cls, crawler):
 23 |         # This method is used by Scrapy to create your spiders.
 24 |         s = cls()
 25 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 26 |         return s
 27 | 
 28 |     def process_spider_input(self, response, spider):
 29 |         # Called for each response that goes through the spider
 30 |         # middleware and into the spider.
 31 | 
 32 |         # Should return None or raise an exception.
 33 |         return None
 34 | 
 35 |     def process_spider_output(self, response, result, spider):
 36 |         # Called with the results returned from the Spider, after
 37 |         # it has processed the response.
 38 | 
 39 |         # Must return an iterable of Request, dict or Item objects.
 40 |         for i in result:
 41 |             yield i
 42 | 
 43 |     def process_spider_exception(self, response, exception, spider):
 44 |         # Called when a spider or process_spider_input() method
 45 |         # (from other spider middleware) raises an exception.
 46 | 
 47 |         # Should return either None or an iterable of Response, dict
 48 |         # or Item objects.
 49 |         pass
 50 | 
 51 |     def process_start_requests(self, start_requests, spider):
 52 |         # Called with the start requests of the spider, and works
 53 |         # similarly to the process_spider_output() method, except
 54 |         # that it doesn’t have a response associated.
 55 | 
 56 |         # Must return only requests (not items).
 57 |         for r in start_requests:
 58 |             yield r
 59 | 
 60 |     def spider_opened(self, spider):
 61 |         spider.logger.info('Spider opened: %s' % spider.name)
 62 | 
 63 | 
 64 | class ShareCodeDownloaderMiddleware(object):
 65 |     # Not all methods need to be defined. If a method is not defined,
 66 |     # scrapy acts as if the downloader middleware does not modify the
 67 |     # passed objects.
 68 | 
 69 |     @classmethod
 70 |     def from_crawler(cls, crawler):
 71 |         # This method is used by Scrapy to create your spiders.
 72 |         s = cls()
 73 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 74 |         return s
 75 | 
 76 |     def process_request(self, request, spider):
 77 |         # Called for each request that goes through the downloader
 78 |         # middleware.
 79 | 
 80 |         # Must either:
 81 |         # - return None: continue processing this request
 82 |         # - or return a Response object
 83 |         # - or return a Request object
 84 |         # - or raise IgnoreRequest: process_exception() methods of
 85 |         #   installed downloader middleware will be called
 86 |         return None
 87 | 
 88 |     def process_response(self, request, response, spider):
 89 |         # Called with the response returned from the downloader.
 90 | 
 91 |         # Must either;
 92 |         # - return a Response object
 93 |         # - return a Request object
 94 |         # - or raise IgnoreRequest
 95 |         return response
 96 | 
 97 |     def process_exception(self, request, exception, spider):
 98 |         # Called when a download handler or a process_request()
 99 |         # (from other downloader middleware) raises an exception.
100 | 
101 |         # Must either:
102 |         # - return None: continue processing this exception
103 |         # - return a Response object: stops process_exception() chain
104 |         # - return a Request object: stops process_exception() chain
105 |         pass
106 | 
107 |     def spider_opened(self, spider):
108 |         spider.logger.info('Spider opened: %s' % spider.name)
109 | 
110 | class DoubleUserAgentMiddleware(UserAgentMiddleware):
111 |     '''
112 |     设置User-Agent
113 |     '''
114 | 
115 |     def __init__(self, user_agent):
116 |         self.user_agent = user_agent
117 | 
118 |     @classmethod
119 |     def from_crawler(cls, crawler):
120 |         return cls(
121 |             user_agent=crawler.settings.get('DOUBLE_USER_AGENT')
122 |         )
123 | 
124 |     def process_request(self, request, spider):
125 |         agent = random.choice(self.user_agent)
126 |         request.headers['User-Agent'] = agent
127 | 
128 | class AuToIpMiddleware(HttpProxyMiddleware):
129 |         '''设置自动IP'''
130 |         def __init__(self,ipaddr=''):
131 |             super(AuToIpMiddleware,self).__init__
132 |             self.ipaddr=ipaddr
133 |             
134 |         def process_request(self, request, spider):
135 |             '''对request对象加上proxy'''  
136 |             self.ipaddr=self.get_random_proxy()
137 |             print('The chosen ip is: '+str(self.ipaddr))
138 |             request.meta["proxy"] = self.ipaddr
139 |             
140 |         def process_response(self, request, response, spider):  
141 |             '''对返回的response处理'''  
142 |             # 如果返回的response状态不是200，重新生成当前request对象  
143 |             if response.status != 200 : 
144 | 		if 'Unauthorized' in response.body: 
145 | 		    print('length: ',len(response.body))
146 | 		    print('content: ',response.body)
147 |                     proxy = self.get_random_proxy()  
148 |                     print("Response ip:"+proxy)  
149 |                     # 对当前reque加上代理  
150 |                     request.meta['proxy'] = self.ipaddr
151 |                     return request  
152 | 		return request
153 |             return response  
154 |         
155 |         def get_random_proxy(self):
156 |             try:        
157 |                 R=redis.Redis(host='localhost',port='6379')
158 |             except Exception as e:
159 |                 print(e)
160 |             length=R.zcard('share:auto_ip_pool_ok')
161 |             number=random.randint(0,length-1)
162 |             ipaddr=R.zrange('share:auto_ip_pool_ok',number,number)[0]  #取出随机一个ip
163 |             ipaddr=ipaddr.decode('utf-8')
164 |             return ipaddr
165 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/middlewares.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | #from hdfs import Client
 8 | from scrapy.exceptions import DropItem
 9 | import logging
10 | logger=logging.getLogger(__name__)
11 | 
12 | class ShareCodePipeline(object):
13 |      # 初始化方法
14 |     def __init__(self):
15 |         logger.info("ShareCodePipeline __init__")
16 | 	'''
17 |    	try:
18 |             self.client=Client('http://192.168.111.130:50070')
19 |         except Exception as e:
20 |             print(e)
21 | 	'''
22 |  
23 |     def process_item(self, item, spider):
24 |         logger.info("ShareCodePipeline process_item")
25 |         
26 |         if item['number']:
27 |             number=item['number']
28 | 	    logger.info('number exists')
29 |         else:
30 |            raise DropItem('Missing number in %s'%item)
31 |         if item['data']:
32 |             data=item['data']
33 | 	    logger.info('data exists')
34 |             #print('data:',data,'\n\n')
35 |         else:
36 |            raise DropItem('Missing data in %s'%item)
37 |         
38 |         data_str=data.encode('utf-8')   #内含中文，先编码成utf-8
39 | 	logger.info('ShareCodePipeline process_item success')
40 |         '''       
41 |         try:
42 |             print('begin write')
43 |             self.client.write('/sdbadmin/hadoop/input/'+str(number)+'.csv',data=data_str)
44 |             print('end write')
45 |         except Exception as e:
46 |             print(e)
47 |         '''
48 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/pipelines.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for share_code project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'share_code'
 13 | 
 14 | SPIDER_MODULES = ['share_code.spiders']
 15 | NEWSPIDER_MODULE = 'share_code.spiders'
 16 | 
 17 | 
 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 19 | #USER_AGENT = 'share_code (+http://www.yourdomain.com)'
 20 | 
 21 | # Obey robots.txt rules
 22 | ROBOTSTXT_OBEY = True
 23 | 
 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 25 | #CONCURRENT_REQUESTS = 32
 26 | 
 27 | # Configure a delay for requests for the same website (default: 0)
 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 29 | # See also autothrottle settings and docs
 30 | #DOWNLOAD_DELAY = 3
 31 | # The download delay setting will honor only one of:
 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 33 | #CONCURRENT_REQUESTS_PER_IP = 16
 34 | 
 35 | # Disable cookies (enabled by default)
 36 | COOKIES_ENABLED = False
 37 | 
 38 | # Disable Telnet Console (enabled by default)
 39 | #TELNETCONSOLE_ENABLED = False
 40 | 
 41 | # Override the default request headers:
 42 | #DEFAULT_REQUEST_HEADERS = {
 43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 44 | #   'Accept-Language': 'en',
 45 | #}
 46 | 
 47 | # Enable or disable spider middlewares
 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 49 | #SPIDER_MIDDLEWARES = {
 50 | #    'share_code.middlewares.ShareCodeSpiderMiddleware': 543,
 51 | #}
 52 | 
 53 | # Enable or disable downloader middlewares
 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 55 | DOWNLOADER_MIDDLEWARES = {
 56 |     'scrapy.downloadermiddleware.useragent.UserAgentMiddleware': None, 
 57 |     'share_code.middlewares.DoubleUserAgentMiddleware': 240,
 58 |     'share_code.middlewares.AuToIpMiddleware': None,  #340
 59 |     'share_code.middlewares.ShareCodeDownloaderMiddleware': 543,
 60 |     'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':800,
 61 |     'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None   
 62 | }
 63 | 
 64 | # Enable or disable extensions
 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
 66 | #EXTENSIONS = {
 67 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 68 | #}
 69 | 
 70 | # Configure item pipelines
 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 72 | #ITEM_PIPELINES = {
 73 | #    'share_code.pipelines.ShareCodePipeline': 300,
 74 | #}
 75 | 
 76 | # Enable and configure the AutoThrottle extension (disabled by default)
 77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
 78 | #AUTOTHROTTLE_ENABLED = True
 79 | # The initial download delay
 80 | #AUTOTHROTTLE_START_DELAY = 5
 81 | # The maximum download delay to be set in case of high latencies
 82 | #AUTOTHROTTLE_MAX_DELAY = 60
 83 | # The average number of requests Scrapy should be sending in parallel to
 84 | # each remote server
 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 86 | # Enable showing throttling stats for every response received:
 87 | #AUTOTHROTTLE_DEBUG = False
 88 | 
 89 | # Enable and configure HTTP caching (disabled by default)
 90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 91 | #HTTPCACHE_ENABLED = True
 92 | #HTTPCACHE_EXPIRATION_SECS = 0
 93 | #HTTPCACHE_DIR = 'httpcache'
 94 | #HTTPCACHE_IGNORE_HTTP_CODES = []
 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 96 | 
 97 | MYEXT_ENABLED = True
 98 | 
 99 | LATENCIES_INTERVAL = 5
100 | ITEM_NUMBERS=10000000
101 | IDLE_NUMBERS=8
102 | 
103 | MEMUSAGE_NOTIFY_MAIL = ['3081881935@qq.com']
104 | MEMUSAGE_REPORT = True
105 | MEMUSAGE_ENABLED = True
106 | MEMUSAGE_LIMIT_MB = 2048
107 | MEMDEBUG_ENABLED = True
108 | MEMDEBUG_NOTIFY = []
109 | 
110 | EXTENSIONS = {
111 |     'share_code.extensionsItem.SpiderOpenCloseLogging': 100,
112 |     'share_code.extensionsTime.Latencies': 120,
113 |     'scrapy.contrib.memusage.MemoryUsage': 50,
114 |     'scrapy.contrib.memdebug.MemoryDebugger': 60
115 | }
116 | 
117 | ITEM_PIPELINES = {
118 |         'share_code.pipelines.ShareCodePipeline':100
119 |         }
120 | 
121 | # Enables scheduling storing requests queue in redis.
122 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
123 | 
124 | # Ensure all spiders share same duplicates filter through redis.
125 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
126 | 
127 | REDIS_START_URLS_AS_SET = True
128 | 
129 | REDIS_HOST = '192.168.111.130' 
130 | REDIS_PORT = '6379'
131 | 
132 | DOUBLE_USER_AGENT = [
133 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
134 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
135 |     "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
136 |     "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
137 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
138 |     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
139 |     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
140 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
141 |     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
142 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
143 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
144 |     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
145 |     "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
146 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
147 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
148 |     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
149 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
150 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
151 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
152 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
153 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
154 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
155 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
156 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
157 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
158 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
159 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
160 |     "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
161 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
162 |     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
163 |     "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
164 |     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
165 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
166 |     "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
167 |     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
168 |     ]
169 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/settings.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/spiders/ShareSpider.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | #redis-cli sadd share:start_urls http://quote.eastmoney.com/stocklist.html#sh
  4 | #scrapy crawl share
  5 | #del share:download_url share:temp_urls share:share_code
  6 | 
  7 | 
  8 | from scrapy_redis.spiders import RedisSpider
  9 | import redis
 10 | import scrapy
 11 | from scrapy import log
 12 | from share_code.items import ShareLoader
 13 | from scrapy.selector import Selector
 14 | import re
 15 | from kazoo.client import KazooClient
 16 | #rom scrapy import log
 17 | 
 18 | class ShareSpider(RedisSpider):
 19 |     name = "share"
 20 |     #allowed_domains = ["share.org"]redis_key = 'share:start_urls'
 21 |     #start_urls = 'http://quote.eastmoney.com/stocklist.html#sh'
 22 |     redis_key = 'share:start_urls'
 23 |     
 24 |     temp_url_piece = 'http://quotes.money.163.com/trade/lsjysj_'  #中间url的片段
 25 |     download_url_piece = 'http://quotes.money.163.com/service/chddata.html?code=0'
 26 | 
 27 |     zk = KazooClient(hosts='127.0.0.1:2181')  
 28 |     zk.start()  
 29 |     
 30 |     # Ensure a path, create if necessary  
 31 |     zk.ensure_path("/ip_process")  
 32 |       
 33 |     # Create a node with data  
 34 |     zk.create("/ip_process/192.168.111.130",  
 35 |               value=b"ok", ephemeral=True)  
 36 | 
 37 |     pool=redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
 38 |     
 39 |     def parse(self, response):
 40 |         self.log('parse begin!',level=log.INFO)
 41 |         #r = ShareLoader(response=response)
 42 |         #response.encoding='GB2312'
 43 |         node_list = Selector(response=response).xpath('//*[@id="quotesearch"]/ul[1]/li/a/text()').extract()
 44 |         #print(node_list)
 45 |         code_list = []
 46 |         count = 1 #zadd score
 47 |         totalCount = 0 #total share url 
 48 |         
 49 |         R = redis.Redis(connection_pool=self.pool)
 50 |         
 51 |         self.log('Redis connect success!',level=log.INFO)
 52 |         for node in node_list:
 53 |             #r.add_value('number',node)
 54 |             try:
 55 |                 code = re.match(r'.*?\((\d+)\)', node).group(1)
 56 |                 print ('code: ',code)
 57 |                 code_list.append(code)
 58 |                 p0 = R.zadd('share:share_code',code,count)  #增加code进sorted_set
 59 |                 if p0==1:  #无重复
 60 |                     print('Add share code success')
 61 |                     totalCount += 1
 62 |                     print('code totalCount +=1')
 63 |                     p1 = R.sadd('share:temp_urls',self.temp_url_piece +str(code) + '.html')
 64 |                     if p1 ==1: #无重复
 65 |                         print('share:temp_urls count: ',R.scard('share:temp_urls'))
 66 |                         temp_url = self.temp_url_piece +str(code) + '.html'
 67 |                         count += 1
 68 |                         yield scrapy.Request(url=temp_url,callback=self.parse2,meta={'code':code})
 69 |                         self.log('parse end!',level=log.INFO)
 70 | 		else:
 71 | 		    print('Add share code duplicate')
 72 |             except Exception as e:
 73 |                 print(e)
 74 |                 continue
 75 |                         
 76 |     def parse2(self,response):
 77 | 	#print('response body: ',response.body)
 78 |         if response.status == 200:
 79 | 	    #print('url: ',response.url)
 80 | 	    #print('response.body: ',response.body)
 81 |             code = response.meta['code']
 82 |             #print('__init__: ',Selector(response=response).xpath('//input[@name="date_start_type"]/@value').extract())
 83 |             if Selector(response=response).xpath('//input[@name="date_start_type"]/@value').extract():
 84 |                 start_date = Selector(response=response).xpath('//input[@name="date_start_type"]/@value').extract()[0].replace('-','')
 85 | 	    else:
 86 | 		start_date = Selector(response=response).xpath('//input[@type="text"]/@value').extract()[0].replace('-','')
 87 |             end_date = Selector(response=response).xpath('//input[@name="date_end_type"]/@value').extract()[0].replace('-','')
 88 |             print('start_date: ',start_date)
 89 |             download_url = self.download_url_piece+str(code)+"&start="+str(start_date)+"&end="+str(end_date)+"&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP"
 90 |             #yield scrapy.Request(url=download_url,meta={'code',code},callback=self.parse3)
 91 |             r = ShareLoader(response=response)
 92 |             pool=redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
 93 |             R = redis.Redis(connection_pool=pool)
 94 |             
 95 |             p2 = R.sadd('share:download_url',download_url)    
 96 |                 
 97 |             if p2:
 98 |                 print('download_url success')
 99 |                 print('download_url: ',download_url)
100 |                 
101 |                 r.add_value('number',code)
102 |                 r.add_value('data',download_url)
103 |                 self.log('Add download_url one more', level=log.INFO) 
104 |                 
105 |             return r.load_item()
106 |         else:
107 |             self.log('Response not 200!! ',level=log.WARNING)
108 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/spiders/ShareSpider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/ShareSpider.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/master/share_code/share_code/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/__init__.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/spiders/__pycache__/ShareSpider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/__pycache__/ShareSpider.cpython-36.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/master/share_code/share_code/untitled2.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 20 01:08:28 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | 
 8 | class zoo:
 9 |     b=1
10 |     def __init__(self):
11 |         print('init',self.b)
12 |     @classmethod
13 |     def ok(cls,iin):
14 |         print('classmethod: ',iin)
15 |     print('hello')
16 |     global a
17 |     a=1
18 |     a+=1
19 |     print(a)
20 | zoo()


--------------------------------------------------------------------------------
/master/share_code/zookeeper.out:
--------------------------------------------------------------------------------
 1 | 2018-05-12 17:40:42,831 [myid:] - INFO  [main:QuorumPeerConfig@134] - Reading configuration from: /opt/zookeeper-3.4.10/bin/../conf/zoo.cfg
 2 | 2018-05-12 17:40:42,875 [myid:] - INFO  [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.131 to address: /192.168.111.131
 3 | 2018-05-12 17:40:42,877 [myid:] - INFO  [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.128 to address: /192.168.111.128
 4 | 2018-05-12 17:40:42,880 [myid:] - INFO  [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.129 to address: /192.168.111.129
 5 | 2018-05-12 17:40:42,881 [myid:] - INFO  [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.130 to address: /192.168.111.130
 6 | 2018-05-12 17:40:42,882 [myid:] - WARN  [main:QuorumPeerConfig@352] - Non-optimial configuration, consider an odd number of servers.
 7 | 2018-05-12 17:40:42,882 [myid:] - INFO  [main:QuorumPeerConfig@396] - Defaulting to majority quorums
 8 | 2018-05-12 17:40:42,893 [myid:0] - INFO  [main:DatadirCleanupManager@78] - autopurge.snapRetainCount set to 3
 9 | 2018-05-12 17:40:42,893 [myid:0] - INFO  [main:DatadirCleanupManager@79] - autopurge.purgeInterval set to 0
10 | 2018-05-12 17:40:42,894 [myid:0] - INFO  [main:DatadirCleanupManager@101] - Purge task is not scheduled.
11 | 2018-05-12 17:40:42,926 [myid:0] - INFO  [main:QuorumPeerMain@127] - Starting quorum peer
12 | 2018-05-12 17:40:42,947 [myid:0] - INFO  [main:NIOServerCnxnFactory@89] - binding to port 0.0.0.0/0.0.0.0:2181
13 | 2018-05-12 17:40:42,949 [myid:0] - ERROR [main:QuorumPeerMain@89] - Unexpected exception, exiting abnormally
14 | java.net.BindException: 地址已在使用
15 | 	at sun.nio.ch.Net.bind0(Native Method)
16 | 	at sun.nio.ch.Net.bind(Net.java:433)
17 | 	at sun.nio.ch.Net.bind(Net.java:425)
18 | 	at sun.nio.ch.ServerSocketChannelImpl.bind(ServerSocketChannelImpl.java:223)
19 | 	at sun.nio.ch.ServerSocketAdaptor.bind(ServerSocketAdaptor.java:74)
20 | 	at sun.nio.ch.ServerSocketAdaptor.bind(ServerSocketAdaptor.java:67)
21 | 	at org.apache.zookeeper.server.NIOServerCnxnFactory.configure(NIOServerCnxnFactory.java:90)
22 | 	at org.apache.zookeeper.server.quorum.QuorumPeerMain.runFromConfig(QuorumPeerMain.java:130)
23 | 	at org.apache.zookeeper.server.quorum.QuorumPeerMain.initializeAndRun(QuorumPeerMain.java:111)
24 | 	at org.apache.zookeeper.server.quorum.QuorumPeerMain.main(QuorumPeerMain.java:78)
25 | 


--------------------------------------------------------------------------------
/master/zoo_detect/zoo_watcher.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 20 00:57:45 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | 
 8 | from kazoo.client import KazooClient  
 9 |   
10 | import time  
11 |   
12 | import logging  
13 | logging.basicConfig()  
14 |   
15 | zk = KazooClient(hosts='127.0.0.1:2181')  
16 | zk.start()  
17 |   
18 | # Determine if a node exists  
19 | while True:  
20 |     for ip in ['192.168.111.130']:
21 |         if zk.exists("/ip_process/" + ip):  
22 |             print ("%s is alive!"%ip)  
23 |         else:  
24 |             print ("%s is dead!"%ip) 
25 |             break  
26 |     time.sleep(6)  
27 |   
28 | zk.stop()  
29 | 


--------------------------------------------------------------------------------
/nupic_output.py:
--------------------------------------------------------------------------------
  1 | <<<<<<< HEAD
  2 | # ----------------------------------------------------------------------
  3 | # Numenta Platform for Intelligent Computing (NuPIC)
  4 | # Copyright (C) 2013, Numenta, Inc.  Unless you have an agreement
  5 | # with Numenta, Inc., for a separate license for this software code, the
  6 | # following terms and conditions apply:
  7 | #
  8 | # This program is free software: you can redistribute it and/or modify
  9 | # it under the terms of the GNU General Public License version 3 as
 10 | # published by the Free Software Foundation.
 11 | #
 12 | # This program is distributed in the hope that it will be useful,
 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 15 | # See the GNU General Public License for more details.
 16 | #
 17 | # You should have received a copy of the GNU General Public License
 18 | # along with this program.  If not, see http://www.gnu.org/licenses.
 19 | #
 20 | # http://numenta.org/licenses/
 21 | # ----------------------------------------------------------------------
 22 | import csv
 23 | from collections import deque
 24 | from abc import ABCMeta, abstractmethod
 25 | from nupic.data.inference_shifter import InferenceShifter
 26 | # Some users might not have matplotlib, and will only be using NuPICFileOutput.
 27 | # So we can attempt to import and swallow any import errors that occur.
 28 | try:
 29 |   import matplotlib.pyplot as plt
 30 |   import matplotlib.gridspec as gridspec
 31 | except ImportError:
 32 |   pass
 33 | 
 34 | 
 35 | WINDOW = 360
 36 | 
 37 | 
 38 | class NuPICOutput(object):
 39 | 
 40 |   __metaclass__ = ABCMeta
 41 | 
 42 | 
 43 |   def __init__(self, name, show_anomaly_score=False):
 44 |     self.name = name
 45 |     self.show_anomaly_score = show_anomaly_score
 46 | 
 47 | 
 48 |   @abstractmethod
 49 |   def write(self, index, value, prediction_result, prediction_step=1):
 50 |     pass
 51 | 
 52 | 
 53 |   @abstractmethod
 54 |   def close(self):
 55 |     pass
 56 | 
 57 | 
 58 | 
 59 | class NuPICFileOutput(NuPICOutput):
 60 | 
 61 | 
 62 |   def __init__(self, *args, **kwargs):
 63 |     super(NuPICFileOutput, self).__init__(*args, **kwargs)
 64 |     self.linecount = 0
 65 |     output_filename = '/home/sdbadmin/runSwarm/'+"%s.csv" % self.name
 66 |     print "Preparing to output to %s" % output_filename
 67 |     self.file = open(output_filename, 'w')
 68 |     self.writer = csv.writer(self.file)
 69 |     header_row = ['Time', 'value', 'prediction']
 70 |     if self.show_anomaly_score:
 71 |       header_row.append('anomaly score')
 72 |     self.writer.writerow(header_row)
 73 | 
 74 | 
 75 |   def write(self, index, value, prediction_result, prediction_step=1):
 76 |     prediction = prediction_result.inferences\
 77 |       ['multiStepBestPredictions'][prediction_step]
 78 |     output_row = [index, value, prediction]
 79 |     if self.show_anomaly_score:
 80 |       output_row.append(prediction_result.inferences['anomalyScore'])
 81 |     self.writer.writerow(output_row)
 82 |     self.linecount = self.linecount + 1
 83 | 
 84 | 
 85 |   def close(self):
 86 |     self.file.close()
 87 |     print "Done. Wrote %i data lines to %s." % (self.linecount, self.file.name)
 88 | 
 89 | 
 90 | 
 91 | class NuPICPlotOutput(NuPICOutput):
 92 | 
 93 | 
 94 |   def __init__(self, *args, **kwargs):
 95 |     super(NuPICPlotOutput, self).__init__(*args, **kwargs)
 96 |     # turn matplotlib interactive mode on (ion)
 97 |     plt.ion()
 98 |     plt.figure(figsize=(14, 10))
 99 |     gs = gridspec.GridSpec(2, 1, height_ratios=[3,1])
100 |     # plot title, legend, etc
101 |     plt.title('Sine prediction example')
102 |     plt.ylabel('Sine (rad)')
103 |     # The shifter will align prediction and actual values.
104 |     self.shifter = InferenceShifter()
105 |     # Keep the last WINDOW predicted and actual values for plotting.
106 |     self.actual_history = deque([0.0] * WINDOW, maxlen=360)
107 |     self.predicted_history = deque([0.0] * WINDOW, maxlen=360)
108 |     if self.show_anomaly_score:
109 |       self.anomaly_score = deque([0.0] * WINDOW, maxlen=360)
110 |     # Initialize the plot lines that we will update with each new record.
111 |     if self.show_anomaly_score:
112 |       plt.subplot(gs[0])
113 |     self.actual_line, = plt.plot(range(WINDOW), self.actual_history)
114 |     self.predicted_line, = plt.plot(range(WINDOW), self.predicted_history)
115 |     plt.legend(tuple(['actual','predicted']), loc=3)
116 |     if self.show_anomaly_score:
117 |       plt.subplot(gs[1])
118 |       self.anomaly_score_line, = plt.plot(range(WINDOW), self.anomaly_score, 'r-')
119 |       plt.legend(tuple(['anomaly score']), loc=3)
120 | 
121 |     # Set the y-axis range.
122 |     self.actual_line.axes.set_ylim(-1, 1)
123 |     self.predicted_line.axes.set_ylim(-1, 1)
124 |     if self.show_anomaly_score:
125 |       self.anomaly_score_line.axes.set_ylim(-1, 1)
126 | 
127 | 
128 | 
129 |   def write(self, index, value, prediction_result, prediction_step=1):
130 |     shifted_result = self.shifter.shift(prediction_result)
131 |     # shifted_result = prediction_result
132 |     # Update the trailing predicted and actual value deques.
133 |     inference = shifted_result.inferences\
134 |       ['multiStepBestPredictions'][prediction_step]
135 |     if inference is not None:
136 |       self.actual_history.append(shifted_result.rawInput['sine'])
137 |       self.predicted_history.append(inference)
138 |       if self.show_anomaly_score:
139 |         anomaly_score = prediction_result.inferences['anomalyScore']
140 |         self.anomaly_score.append(anomaly_score)
141 | 
142 |     # Redraw the chart with the new data.
143 |     self.actual_line.set_ydata(self.actual_history)  # update the data
144 |     self.predicted_line.set_ydata(self.predicted_history)  # update the data
145 |     if self.show_anomaly_score:
146 |       self.anomaly_score_line.set_ydata(self.anomaly_score)  # update the data
147 |     plt.draw()
148 |     plt.tight_layout()
149 | 
150 | 
151 | 
152 |   def close(self):
153 |     plt.ioff()
154 |     plt.show()
155 | 
156 | 
157 | 
158 | NuPICOutput.register(NuPICFileOutput)
159 | NuPICOutput.register(NuPICPlotOutput)
160 | =======
161 | # ----------------------------------------------------------------------
162 | # Numenta Platform for Intelligent Computing (NuPIC)
163 | # Copyright (C) 2013, Numenta, Inc.  Unless you have an agreement
164 | # with Numenta, Inc., for a separate license for this software code, the
165 | # following terms and conditions apply:
166 | #
167 | # This program is free software: you can redistribute it and/or modify
168 | # it under the terms of the GNU General Public License version 3 as
169 | # published by the Free Software Foundation.
170 | #
171 | # This program is distributed in the hope that it will be useful,
172 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
173 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
174 | # See the GNU General Public License for more details.
175 | #
176 | # You should have received a copy of the GNU General Public License
177 | # along with this program.  If not, see http://www.gnu.org/licenses.
178 | #
179 | # http://numenta.org/licenses/
180 | # ----------------------------------------------------------------------
181 | import csv
182 | from collections import deque
183 | from abc import ABCMeta, abstractmethod
184 | from nupic.data.inference_shifter import InferenceShifter
185 | # Some users might not have matplotlib, and will only be using NuPICFileOutput.
186 | # So we can attempt to import and swallow any import errors that occur.
187 | try:
188 |   import matplotlib.pyplot as plt
189 |   import matplotlib.gridspec as gridspec
190 | except ImportError:
191 |   pass
192 | 
193 | 
194 | WINDOW = 360
195 | 
196 | 
197 | class NuPICOutput(object):
198 | 
199 |   __metaclass__ = ABCMeta
200 | 
201 | 
202 |   def __init__(self, name, show_anomaly_score=False):
203 |     self.name = name
204 |     self.show_anomaly_score = show_anomaly_score
205 | 
206 | 
207 |   @abstractmethod
208 |   def write(self, index, value, prediction_result, prediction_step=1):
209 |     pass
210 | 
211 | 
212 |   @abstractmethod
213 |   def close(self):
214 |     pass
215 | 
216 | 
217 | 
218 | class NuPICFileOutput(NuPICOutput):
219 | 
220 | 
221 |   def __init__(self, *args, **kwargs):
222 |     super(NuPICFileOutput, self).__init__(*args, **kwargs)
223 |     self.linecount = 0
224 |     output_filename = '/home/sdbadmin/runSwarm/'+"%s.csv" % self.name
225 |     print "Preparing to output to %s" % output_filename
226 |     self.file = open(output_filename, 'w')
227 |     self.writer = csv.writer(self.file)
228 |     header_row = ['Time', 'value', 'prediction']
229 |     if self.show_anomaly_score:
230 |       header_row.append('anomaly score')
231 |     self.writer.writerow(header_row)
232 | 
233 | 
234 |   def write(self, index, value, prediction_result, prediction_step=1):
235 |     prediction = prediction_result.inferences\
236 |       ['multiStepBestPredictions'][prediction_step]
237 |     output_row = [index, value, prediction]
238 |     if self.show_anomaly_score:
239 |       output_row.append(prediction_result.inferences['anomalyScore'])
240 |     self.writer.writerow(output_row)
241 |     self.linecount = self.linecount + 1
242 | 
243 | 
244 |   def close(self):
245 |     self.file.close()
246 |     print "Done. Wrote %i data lines to %s." % (self.linecount, self.file.name)
247 | 
248 | 
249 | 
250 | class NuPICPlotOutput(NuPICOutput):
251 | 
252 | 
253 |   def __init__(self, *args, **kwargs):
254 |     super(NuPICPlotOutput, self).__init__(*args, **kwargs)
255 |     # turn matplotlib interactive mode on (ion)
256 |     plt.ion()
257 |     plt.figure(figsize=(14, 10))
258 |     gs = gridspec.GridSpec(2, 1, height_ratios=[3,1])
259 |     # plot title, legend, etc
260 |     plt.title('Sine prediction example')
261 |     plt.ylabel('Sine (rad)')
262 |     # The shifter will align prediction and actual values.
263 |     self.shifter = InferenceShifter()
264 |     # Keep the last WINDOW predicted and actual values for plotting.
265 |     self.actual_history = deque([0.0] * WINDOW, maxlen=360)
266 |     self.predicted_history = deque([0.0] * WINDOW, maxlen=360)
267 |     if self.show_anomaly_score:
268 |       self.anomaly_score = deque([0.0] * WINDOW, maxlen=360)
269 |     # Initialize the plot lines that we will update with each new record.
270 |     if self.show_anomaly_score:
271 |       plt.subplot(gs[0])
272 |     self.actual_line, = plt.plot(range(WINDOW), self.actual_history)
273 |     self.predicted_line, = plt.plot(range(WINDOW), self.predicted_history)
274 |     plt.legend(tuple(['actual','predicted']), loc=3)
275 |     if self.show_anomaly_score:
276 |       plt.subplot(gs[1])
277 |       self.anomaly_score_line, = plt.plot(range(WINDOW), self.anomaly_score, 'r-')
278 |       plt.legend(tuple(['anomaly score']), loc=3)
279 | 
280 |     # Set the y-axis range.
281 |     self.actual_line.axes.set_ylim(-1, 1)
282 |     self.predicted_line.axes.set_ylim(-1, 1)
283 |     if self.show_anomaly_score:
284 |       self.anomaly_score_line.axes.set_ylim(-1, 1)
285 | 
286 | 
287 | 
288 |   def write(self, index, value, prediction_result, prediction_step=1):
289 |     shifted_result = self.shifter.shift(prediction_result)
290 |     # shifted_result = prediction_result
291 |     # Update the trailing predicted and actual value deques.
292 |     inference = shifted_result.inferences\
293 |       ['multiStepBestPredictions'][prediction_step]
294 |     if inference is not None:
295 |       self.actual_history.append(shifted_result.rawInput['sine'])
296 |       self.predicted_history.append(inference)
297 |       if self.show_anomaly_score:
298 |         anomaly_score = prediction_result.inferences['anomalyScore']
299 |         self.anomaly_score.append(anomaly_score)
300 | 
301 |     # Redraw the chart with the new data.
302 |     self.actual_line.set_ydata(self.actual_history)  # update the data
303 |     self.predicted_line.set_ydata(self.predicted_history)  # update the data
304 |     if self.show_anomaly_score:
305 |       self.anomaly_score_line.set_ydata(self.anomaly_score)  # update the data
306 |     plt.draw()
307 |     plt.tight_layout()
308 | 
309 | 
310 | 
311 |   def close(self):
312 |     plt.ioff()
313 |     plt.show()
314 | 
315 | 
316 | 
317 | NuPICOutput.register(NuPICFileOutput)
318 | NuPICOutput.register(NuPICPlotOutput)
319 | >>>>>>> 86b35818ed8f44ee23353ebaffc6adce33faf04a
320 | 


--------------------------------------------------------------------------------
/share_experiment.py:
--------------------------------------------------------------------------------
  1 | <<<<<<< HEAD
  2 | #!/usr/bin/python
  3 | 
  4 | # ----------------------------------------------------------------------
  5 | # Numenta Platform for Intelligent Computing (NuPIC)
  6 | # Copyright (C) 2013, Numenta, Inc.  Unless you have an agreement
  7 | # with Numenta, Inc., for a separate license for this software code, the
  8 | # following terms and conditions apply:
  9 | #
 10 | # This program is free software: you can redistribute it and/or modify
 11 | # it under the terms of the GNU General Public License version 3 as
 12 | # published by the Free Software Foundation.
 13 | #
 14 | # This program is distributed in the hope that it will be useful,
 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 17 | # See the GNU General Public License for more details.
 18 | #
 19 | # You should have received a copy of the GNU General Public License
 20 | # along with this program.  If not, see http://www.gnu.org/licenses.
 21 | #
 22 | # http://numenta.org/licenses/
 23 | # ----------------------------------------------------------------------
 24 | import csv
 25 | from nupic.frameworks.opf.model_factory import ModelFactory
 26 | from nupic_output import NuPICFileOutput, NuPICPlotOutput
 27 | from nupic.swarming import permutations_runner
 28 | 
 29 | 
 30 | # Change this to switch from a matplotlib plot to file output.
 31 | PLOT = False
 32 | SWARM_CONFIG = {
 33 |   "includedFields": [
 34 |     {
 35 |       "fieldName": "value",
 36 |       "fieldType": "float",
 37 |       "maxValue": 1.0,
 38 |       "minValue": 0.0
 39 |     }
 40 |   ],
 41 |   "streamDef": {
 42 |     "info": "value",
 43 |     "version": 1,
 44 |     "streams": [
 45 |       {
 46 |         "info": "choice.csv",
 47 |         "source": "file://home/sdbadmin/runSwarm/choice.csv",
 48 |         "columns": [
 49 |           "*"
 50 |         ]
 51 |       }
 52 |     ]
 53 |   },
 54 |   "inferenceType": "TemporalAnomaly",
 55 |   "inferenceArgs": {
 56 |     "predictionSteps": [
 57 |       3
 58 |     ],
 59 |     "predictedField": "value"
 60 |   },
 61 |   "swarmSize": "medium"
 62 | }
 63 | 
 64 | 
 65 | 
 66 | def swarm_over_data(filename):
 67 |     config=SWARM_CONFIG
 68 |     print('filename: ',filename)
 69 |     
 70 |     print config['streamDef']['streams']
 71 |     config['streamDef']['streams'][0]['info']=filename
 72 |     config['streamDef']['streams'][0]['source']="file://runSwarm/"+filename
 73 |     return permutations_runner.runWithConfig(config,
 74 |         {'maxWorkers': 4, 'overwrite': True})
 75 | 
 76 | 
 77 | 
 78 | def run_sine_experiment():
 79 |   input_file = "sine.csv"
 80 |   generate_data.run(input_file)
 81 |   model_params = swarm_over_data()
 82 |   if PLOT:
 83 |     output = NuPICPlotOutput("sine_output", show_anomaly_score=True)
 84 |   else:
 85 |     output = NuPICFileOutput("sine_output", show_anomaly_score=True)
 86 |   model = ModelFactory.create(model_params)
 87 |   model.enableInference({"predictedField": "sine"})
 88 | 
 89 |   with open(input_file, "rb") as sine_input:
 90 |     csv_reader = csv.reader(sine_input)
 91 | 
 92 |     # skip header rows
 93 |     csv_reader.next()
 94 |     csv_reader.next()
 95 |     csv_reader.next()
 96 | 
 97 |     # the real data
 98 |     for row in csv_reader:
 99 |       angle = float(row[0])
100 |       sine_value = float(row[1])
101 |       result = model.run({"sine": sine_value})
102 |       output.write(angle, sine_value, result, prediction_step=1)
103 | 
104 |   output.close()
105 | 
106 | def generate_data(a,filename):
107 |     print "Generating data into %s" % filename
108 |     fileHandle = open('/home/sdbadmin/runSwarm/'+filename,"w")
109 |     writer = csv.writer(fileHandle)
110 |     writer.writerow(["Time","value"])
111 |     writer.writerow(["int","float"])
112 |     writer.writerow(["",""])
113 | 
114 |     for i in range(len(a)):
115 | 	time=i
116 | 	value=a[i]
117 |         writer.writerow([time, value])
118 | 
119 |     fileHandle.close()
120 |     print "Generated %i rows of output data into %s" % (len(a), filename)
121 | 
122 | 
123 | def swarm(a,number,col):
124 |     generate_data(a,str(col)+'.csv')
125 |     model_params = swarm_over_data(filename=str(col)+'.csv')
126 | 
127 |     '''model params save'''
128 |     import json
129 |     fp=file('/home/sdbadmin/'+str(col)+'_swarmParams.csv','w')
130 |     json.dump(model_params,fp)
131 |     fp.close()
132 | 
133 |     if PLOT:
134 |         output = NuPICPlotOutput(str(col)+"_swarm__output", show_anomaly_score=True)
135 |     else:
136 |         output = NuPICFileOutput(str(col)+"_swarm_output", show_anomaly_score=True)
137 |     model = ModelFactory.create(model_params)
138 |     model.enableInference({"predictedField": "value"})
139 |     
140 |     input_file='/home/sdbadmin/runSwarm/'+str(col)+'.csv'
141 |     with open(input_file, "rb") as sine_input:
142 |         csv_reader = csv.reader(sine_input)
143 |         # the real data
144 | 
145 | 	# skip header rows
146 |         csv_reader.next()
147 |         csv_reader.next()
148 |         csv_reader.next()
149 | 
150 |         for row in csv_reader:
151 |             time=float(row[0])
152 |             value = float(row[1])
153 |             result = model.run({"value": value})
154 |             output.write(time,value, result, prediction_step=3)
155 | 
156 |     output.close()
157 |     return model_params
158 | =======
159 | #!/usr/bin/python
160 | 
161 | # ----------------------------------------------------------------------
162 | # Numenta Platform for Intelligent Computing (NuPIC)
163 | # Copyright (C) 2013, Numenta, Inc.  Unless you have an agreement
164 | # with Numenta, Inc., for a separate license for this software code, the
165 | # following terms and conditions apply:
166 | #
167 | # This program is free software: you can redistribute it and/or modify
168 | # it under the terms of the GNU General Public License version 3 as
169 | # published by the Free Software Foundation.
170 | #
171 | # This program is distributed in the hope that it will be useful,
172 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
173 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
174 | # See the GNU General Public License for more details.
175 | #
176 | # You should have received a copy of the GNU General Public License
177 | # along with this program.  If not, see http://www.gnu.org/licenses.
178 | #
179 | # http://numenta.org/licenses/
180 | # ----------------------------------------------------------------------
181 | import csv
182 | from nupic.frameworks.opf.model_factory import ModelFactory
183 | from nupic_output import NuPICFileOutput, NuPICPlotOutput
184 | from nupic.swarming import permutations_runner
185 | 
186 | 
187 | # Change this to switch from a matplotlib plot to file output.
188 | PLOT = False
189 | SWARM_CONFIG = {
190 |   "includedFields": [
191 |     {
192 |       "fieldName": "value",
193 |       "fieldType": "float",
194 |       "maxValue": 1.0,
195 |       "minValue": 0.0
196 |     }
197 |   ],
198 |   "streamDef": {
199 |     "info": "value",
200 |     "version": 1,
201 |     "streams": [
202 |       {
203 |         "info": "choice.csv",
204 |         "source": "file://home/sdbadmin/runSwarm/choice.csv",
205 |         "columns": [
206 |           "*"
207 |         ]
208 |       }
209 |     ]
210 |   },
211 |   "inferenceType": "TemporalAnomaly",
212 |   "inferenceArgs": {
213 |     "predictionSteps": [
214 |       3
215 |     ],
216 |     "predictedField": "value"
217 |   },
218 |   "swarmSize": "medium"
219 | }
220 | 
221 | 
222 | 
223 | def swarm_over_data(filename):
224 |     config=SWARM_CONFIG
225 |     print('filename: ',filename)
226 |     
227 |     print config['streamDef']['streams']
228 |     config['streamDef']['streams'][0]['info']=filename
229 |     config['streamDef']['streams'][0]['source']="file://runSwarm/"+filename
230 |     return permutations_runner.runWithConfig(config,
231 |         {'maxWorkers': 4, 'overwrite': True})
232 | 
233 | 
234 | 
235 | def run_sine_experiment():
236 |   input_file = "sine.csv"
237 |   generate_data.run(input_file)
238 |   model_params = swarm_over_data()
239 |   if PLOT:
240 |     output = NuPICPlotOutput("sine_output", show_anomaly_score=True)
241 |   else:
242 |     output = NuPICFileOutput("sine_output", show_anomaly_score=True)
243 |   model = ModelFactory.create(model_params)
244 |   model.enableInference({"predictedField": "sine"})
245 | 
246 |   with open(input_file, "rb") as sine_input:
247 |     csv_reader = csv.reader(sine_input)
248 | 
249 |     # skip header rows
250 |     csv_reader.next()
251 |     csv_reader.next()
252 |     csv_reader.next()
253 | 
254 |     # the real data
255 |     for row in csv_reader:
256 |       angle = float(row[0])
257 |       sine_value = float(row[1])
258 |       result = model.run({"sine": sine_value})
259 |       output.write(angle, sine_value, result, prediction_step=1)
260 | 
261 |   output.close()
262 | 
263 | def generate_data(a,filename):
264 |     print "Generating data into %s" % filename
265 |     fileHandle = open('/home/sdbadmin/runSwarm/'+filename,"w")
266 |     writer = csv.writer(fileHandle)
267 |     writer.writerow(["Time","value"])
268 |     writer.writerow(["int","float"])
269 |     writer.writerow(["",""])
270 | 
271 |     for i in range(len(a)):
272 | 	time=i
273 | 	value=a[i]
274 |         writer.writerow([time, value])
275 | 
276 |     fileHandle.close()
277 |     print "Generated %i rows of output data into %s" % (len(a), filename)
278 | 
279 | 
280 | def swarm(a,number,col):
281 |     generate_data(a,str(col)+'.csv')
282 |     model_params = swarm_over_data(filename=str(col)+'.csv')
283 | 
284 |     '''model params save'''
285 |     import json
286 |     fp=file('/home/sdbadmin/'+str(col)+'_swarmParams.csv','w')
287 |     json.dump(model_params,fp)
288 |     fp.close()
289 | 
290 |     if PLOT:
291 |         output = NuPICPlotOutput(str(col)+"_swarm__output", show_anomaly_score=True)
292 |     else:
293 |         output = NuPICFileOutput(str(col)+"_swarm_output", show_anomaly_score=True)
294 |     model = ModelFactory.create(model_params)
295 |     model.enableInference({"predictedField": "value"})
296 |     
297 |     input_file='/home/sdbadmin/runSwarm/'+str(col)+'.csv'
298 |     with open(input_file, "rb") as sine_input:
299 |         csv_reader = csv.reader(sine_input)
300 |         # the real data
301 | 
302 | 	# skip header rows
303 |         csv_reader.next()
304 |         csv_reader.next()
305 |         csv_reader.next()
306 | 
307 |         for row in csv_reader:
308 |             time=float(row[0])
309 |             value = float(row[1])
310 |             result = model.run({"value": value})
311 |             output.write(time,value, result, prediction_step=3)
312 | 
313 |     output.close()
314 |     return model_params
315 | >>>>>>> 86b35818ed8f44ee23353ebaffc6adce33faf04a
316 | 


--------------------------------------------------------------------------------
/similarity.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | '''
 3 | complete Time Series Trend Similarity
 4 | D(msd)= D(Euclid)*(2-ASD/SAD)
 5 | ASD --- Difference in absolute value
 6 | SAD --- manhaton distance
 7 | '''
 8 | import numpy as np
 9 | 
10 | def MSD(a,b):
11 |     if len(a)!=len(b):
12 | 	print('a,b:',len(a),len(b))
13 |         print('length not equal,quit')
14 |         return
15 |     if not isinstance(a,np.ndarray):
16 |         a=np.array(a).reshape(-1)
17 |     if not isinstance(b,np.ndarray):
18 |         b=np.array(b).reshape(-1)
19 |     a=a.reshape(-1)
20 |     b=b.reshape(-1)
21 |     if (a==b).all():return 0
22 |     #欧式距离
23 |     Deuclid=np.linalg.norm(a-b)
24 |     #print('Deuclid: ',Deuclid)
25 |     #曼哈顿距离
26 |     Dmahat=sum(abs(a-b))
27 |     #print('mahaton: ',Dmahat)
28 |     #ASD
29 |     ASD=abs(sum(a-b))
30 |     #print('ASD: ',ASD)
31 |     #MSD
32 |     msd=Deuclid*(2-ASD/Dmahat)
33 |     #print('msd: ',msd)
34 |     return msd
35 | 


--------------------------------------------------------------------------------
/slave/share_code/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = share_code.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = share_code
12 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__init__.py


--------------------------------------------------------------------------------
/slave/share_code/share_code/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__init__.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/middlewares.cpython-36.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/pipelines.cpython-36.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/auto_ip.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Fri Apr 13 21:51:29 2018
  4 | 
  5 | @author: Administrator
  6 | """
  7 | import urllib
  8 | import urllib2
  9 | import time
 10 | import redis
 11 | from scrapy.selector import Selector
 12 | import requests
 13 | 
 14 | def get_url(url):     # 国内高匿代理的链接
 15 |     url_list = []
 16 |     for i in range(1,100):
 17 |         url_new = url + str(i)
 18 |         url_list.append(url_new)
 19 |     return url_list
 20 | def get_content(url):     # 获取网页内容
 21 |     user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'
 22 |     headers = {'User-Agent': user_agent}
 23 |     req = requests.get(url=url, headers=headers)
 24 |     
 25 |     '''
 26 |     req = urllib.request.Request(url=url, headers=headers)
 27 |     res = urllib.request.urlopen(req)
 28 |     content = res.read()
 29 |     return content.decode('utf-8')    
 30 |     '''
 31 |     return req
 32 | 
 33 | def get_info(content):      # 提取网页信息 / ip 端口
 34 |     datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract()
 35 |     datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract()
 36 |     datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract()
 37 |     
 38 |     '''
 39 |     datas_ip = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract()
 40 |     datas_head = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract()
 41 |     datas_port =Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract()
 42 |     '''
 43 |     
 44 |     #写入redis
 45 |     print('head: ',datas_head)
 46 |     try:        
 47 |         R=redis.Redis(host='localhost',port='6379')
 48 |     except Exception as e:
 49 |         print(e)
 50 |     count=0
 51 |     for head,ip,port in zip(datas_head,datas_ip,datas_port):
 52 |         p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count)
 53 |         if p:
 54 |             count+=1
 55 |     #print(datas_ip,datas_port)
 56 |     return datas_head,datas_ip,datas_port
 57 | def verify_ip(head,ip,port):    # 验证ip有效 
 58 |     user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 
 59 |     headers = {'User-Agent':user_agent}
 60 |     if 'https' in head:
 61 |         proxy = {'https':'%s://%s:%s'%(head.lower(),ip,port)}
 62 |     else:
 63 |         proxy = {'http':'%s://%s:%s'%(head.lower(),ip,port)}
 64 |     print(proxy)
 65 |     
 66 |     #proxy_handler = urllib.request.ProxyHandler(proxy)
 67 |     proxy_handler=urllib2.ProxyHandler(proxy)
 68 |     #opener = urllib.request.build_opener(proxy_handler)
 69 |     opener = urllib2.build_opener(proxy_handler)
 70 |     urllib2.install_opener(opener)
 71 | 
 72 |     #test_url = "https://www.baidu.com/"
 73 |     test_url = "http://quote.eastmoney.com/stocklist.html#sh"
 74 |     req = urllib2.Request(url=test_url,headers=headers)
 75 |     time.sleep(3)
 76 |     
 77 |     try:   
 78 |         R=redis.Redis(host='localhost',port='6379')
 79 |     except Exception as e:
 80 |         print(e)
 81 |     count=0
 82 |     try:
 83 |         res = urllib2.urlopen(req,timeout=3)
 84 |         #time.sleep(3)
 85 |         content = res.read()
 86 |         if content:
 87 |             print('that is ok')
 88 |             R.zadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port),count)
 89 |             count+=1
 90 |         else:
 91 |             print('its not ok')
 92 |     except urllib2.URLError as e:
 93 |         print(e.reason)
 94 |     except Exception as e:
 95 | 	print(e)
 96 | 
 97 | if __name__ == '__main__':
 98 |     url = 'http://www.xicidaili.com/nn/'
 99 |     url_list = get_url(url)
100 |     for i in url_list:
101 |         print(i)
102 |         content = get_content(i)
103 |         time.sleep(3)
104 |         data=get_info(content)
105 |         for head,ip,port in zip(data[0],data[1],data[2]):
106 |             verify_ip(head,ip,port)
107 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/auto_ip_multi_pro.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Wed Apr 11 00:01:28 2018
  4 | 
  5 | @author: Administrator
  6 | """
  7 | from scrapy.selector import Selector
  8 | import time
  9 | import requests
 10 | import redis
 11 | from multiprocessing import Process, Queue  
 12 | 
 13 | 
 14 | def get_url(url):     # 国内高匿代理的链接
 15 |     url_list = []
 16 |     for i in range(1,100):
 17 |         url_new = url + str(i)
 18 |         url_list.append(url_new)
 19 |     return url_list
 20 | 
 21 | def get_content(url):     # 获取网页内容
 22 |     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
 23 |     headers = {'User-Agent': user_agent}
 24 |     req = requests.get(url=url, headers=headers)
 25 |     return req
 26 | 
 27 | def get_info(content):      # 提取网页信息 / ip 端口
 28 |     datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract()
 29 |     datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract()
 30 |     datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract()
 31 |     
 32 |     #写入redis
 33 |     print('head: ',datas_head)
 34 |     try:        
 35 |         R=redis.Redis(host='localhost',port='6379')
 36 |     except Exception as e:
 37 |         print(e)
 38 |     count=0
 39 |     for head,ip,port in zip(datas_head,datas_ip,datas_port):
 40 |         p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count)
 41 |         if p:
 42 |             count+=1
 43 |     #print(datas_ip,datas_port)
 44 |     return datas_head,datas_ip,datas_port
 45 |    
 46 | def verify_ip_one(old_queue,new_queue):    # 验证ip有效性
 47 |     while 1:
 48 |         data=old_queue.get()
 49 |         print(data)
 50 |         if data==0:
 51 |             break
 52 |         head=data[0].lower()
 53 |         ip=data[1]
 54 |         port=data[2]
 55 |         print('head,data,ip')
 56 |         
 57 |         user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'
 58 |         accept_encoding ='gzip, deflate, sdch'  
 59 |         accept_language ='zh-CN,zh;q=0.8'  
 60 |         headers = {'User-Agent':user_agent,'Accept-Encoding':accept_encoding,'Accept-Language':accept_language}
 61 |         if 'https' in head:
 62 |             proxy = {'https':'%s://%s:%s'%(head,ip,port)}
 63 |         else:
 64 |             proxy = {'http':'%s://%s:%s'%(head,ip,port)}
 65 |         print(proxy)
 66 |     
 67 |         test_url = "https://www.baidu.com/"
 68 |         
 69 |         try:
 70 |             req = requests.get(url=test_url,proxies=proxy,headers=headers)
 71 |             status_code=req.status_code
 72 |             if status_code==200:
 73 |                 print('that is ok')
 74 |                 print(str(ip) + u":" + str(port))
 75 |                 new_queue.put([head,ip,port])
 76 |             else:
 77 |                 print('its not ok')
 78 |         except Exception as e:
 79 |             print('fall down')
 80 | 
 81 | def verif_ip(data):
 82 |     old_queue=Queue()
 83 |     for head,ip,port in zip(data[0],data[1],data[2]):
 84 |         old_queue.put([head,ip,port])               #往没验证过的queue加入数据
 85 |     print('verify ip.....')
 86 |     print('old_queue: ',old_queue.qsize())
 87 |     old_queue.put(0)  #终止条件
 88 |     new_queue=Queue()
 89 |     works = []  
 90 |     for i in range(1):  
 91 |         print('process %s'%i)
 92 |         works.append(Process(target=verify_ip_one, args=(old_queue,new_queue)))
 93 |     for work in works:
 94 |         print('process start')
 95 |         work.start()     
 96 |         work.join()
 97 |     '''
 98 |     for work in works:
 99 |         work.join() 
100 |     '''
101 |     try:   
102 |         R=redis.Redis(host='localhost',port='6379')
103 |     except Exception as e:
104 |         print(e)
105 |     for i in range(new_queue.qsize()):
106 |         head,ip,port=new_queue.get()
107 |         R.sadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port))
108 |         print('insert one row')
109 |     
110 |         
111 | if __name__ == '__main__':
112 |     url = 'http://www.xicidaili.com/nn/'
113 |     url_list = get_url(url)
114 |     for i in url_list:
115 |         print(i)
116 |         content = get_content(i)
117 |         time.sleep(3)
118 |         data=get_info(content)
119 |         verif_ip(data)
120 |         '''
121 |         for head,ip,port in zip(data[0],data[1],data[2]):
122 |             verif_ip(head.lower(),ip,port)
123 |         '''
124 |             


--------------------------------------------------------------------------------
/slave/share_code/share_code/example.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 13 17:35:23 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | 
 8 | from multiprocessing import Process, Queue
 9 |  
10 | def f(q,n):
11 |     q.put([42, n, 'hello'])
12 |  
13 | if __name__ == '__main__':
14 |     q = Queue()
15 |     p_list=[]
16 |     for i in range(3):
17 |         p = Process(target=f, args=(q,i))
18 |         p_list.append(p)
19 |         p.start()
20 |     print(q.get())
21 |     print(q.get())
22 |     print(q.get())
23 |     for i in p_list:
24 |         i.join()


--------------------------------------------------------------------------------
/slave/share_code/share_code/extensions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/extensions.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/extensionsItem.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 20 19:18:54 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | import time
 8 | import logging
 9 | from scrapy import signals
10 | from scrapy.exceptions import NotConfigured
11 | logger = logging.getLogger(__name__)
12 | 
13 | class SpiderOpenCloseLogging(object):
14 | 
15 |     def __init__(self, item_count,idle_number,crawler):
16 |         self.item_count = item_count
17 | 
18 |         self.items_scraped = 0
19 | 
20 | 	self.idle_count = 0
21 | 
22 | 	self.idle_list = []
23 | 	
24 | 	self.crawler = crawler
25 | 
26 | 	self.idle_number = idle_number
27 | 
28 |     @classmethod
29 |     def from_crawler(cls, crawler):
30 |         # first check if the extension should be enabled and raise
31 | 
32 |         # NotConfigured otherwise
33 | 
34 |         if not crawler.settings.getbool('MYEXT_ENABLED'):
35 | 
36 |             raise NotConfigured
37 | 	#idle_number
38 | 	idle_number = crawler.settings.getint('IDLE_NUMBER', 20)
39 | 
40 |         # get the number of items from settings
41 | 
42 |         item_count = crawler.settings.getint('ITEM_NUMBER', 10)
43 | 
44 |         # instantiate the extension object
45 | 
46 |         ext = cls(item_count,idle_number,crawler)
47 | 
48 |         # connect the extension object to signals
49 | 
50 |         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
51 | 
52 |         crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
53 | 
54 |         crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
55 | 
56 |         crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
57 | 
58 |         # return the extension object
59 | 
60 |         return ext
61 | 
62 |     def spider_opened(self, spider):
63 |         logger.info("opened spider %s redis spider Idle, Continuous idle limit： %d", spider.name, self.idle_number)
64 | 
65 |     def spider_closed(self, spider,reason='finished'):  #默认结束reason是finished，如果spider是被引擎的 close_spider 方法所关闭，则其为调用该方法时传入的 reason 参数(默认为 'cancelled')
66 |         logger.info("closed spider %s, idle count %d , Continuous idle count %d ,closed reason %s",
67 |                 spider.name, self.idle_count, len(self.idle_list),reason)
68 | 
69 |     def item_scraped(self, item, spider):
70 |         self.items_scraped += 1
71 | 	if item:
72 | 		print('items add one ')
73 | 	else:
74 | 		print('item is None')
75 | 		print('self.idle_count: ',self.idle_count)
76 |         if self.items_scraped % self.item_count == 0:
77 |             spider.log("scraped %d items" % self.items_scraped)
78 | 
79 |     def spider_idle(self, spider):
80 |         self.idle_count += 1                        # 空闲计数
81 |         print('idle_count:',self.idle_count)
82 |         self.idle_list.append(time.time())       # 每次触发 spider_idle时，记录下触发时间戳
83 |         idle_list_len = len(self.idle_list)         # 获取当前已经连续触发的次数
84 |         if idle_list_len > self.idle_number:
85 |            # 连续触发的次数达到配置次数后关闭爬虫
86 |            logger.info('\n continued idle number exceed {} Times'
87 |                         '\n meet the idle shutdown conditions, will close the reptile operation'
88 |                         '\n idle start time: {},  close spider time: {}'.format(self.idle_count,
89 |                                                                               self.idle_list[0], self.idle_list[-1]))
90 | 	   self.crawler.engine.close_spider(spider, 'closespider_ForNullRun')
91 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/extensionsItem.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/extensionsItem.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/extensionsTime.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | 
 3 | from time import time
 4 | 
 5 | from scrapy.exceptions import NotConfigured
 6 | from twisted.internet import task
 7 | from scrapy import signals
 8 | 
 9 | 
10 | class Latencies(object):
11 |     """
12 |     An extension that measures throughput and latencies.
13 |     """
14 |     @classmethod
15 |     def from_crawler(cls, crawler):
16 |         return cls(crawler)
17 | 
18 |     def __init__(self, crawler):
19 |         self.crawler = crawler
20 |         self.interval = crawler.settings.getfloat('LATENCIES_INTERVAL')
21 | 
22 |         if not self.interval:
23 |             raise NotConfigured
24 | 
25 |         cs = crawler.signals
26 |         cs.connect(self._spider_opened, signal=signals.spider_opened)
27 |         cs.connect(self._spider_closed, signal=signals.spider_closed)
28 |         cs.connect(self._request_scheduled, signal=signals.request_scheduled)
29 |         cs.connect(self._response_received, signal=signals.response_received)
30 |         cs.connect(self._item_scraped, signal=signals.item_scraped)
31 | 
32 |         self.latency, self.proc_latency, self.items = 0, 0, 0
33 | 
34 |     def _spider_opened(self, spider):
35 |         self.task = task.LoopingCall(self._log, spider)
36 |         self.task.start(self.interval)
37 | 
38 |     def _spider_closed(self, spider, reason):
39 |         if self.task.running:
40 |             self.task.stop()
41 | 
42 |     def _request_scheduled(self, request, spider):
43 |         request.meta['schedule_time'] = time()
44 | 
45 |     def _response_received(self, response, request, spider):
46 |         request.meta['received_time'] = time()
47 | 
48 |     def _item_scraped(self, item, response, spider):
49 |         self.latency += time() - response.meta['schedule_time']
50 |         self.proc_latency += time() - response.meta['received_time']
51 |         self.items += 1
52 | 
53 |     def _log(self, spider):
54 |         irate = float(self.items) / self.interval  #interval 时间内处理item数
55 |         latency = self.latency / self.items if self.items else 0  #单个时延
56 |         proc_latency = self.proc_latency / self.items if self.items else 0  #单个响应时间
57 | 
58 |         spider.logger.info(("Scraped %d items at %.1f items/s, avg latency: "
59 |                             "%.2f s and avg time in pipelines: %.2f s") %
60 |                           (self.items, irate, latency, proc_latency))
61 | 
62 |         self.latency, self.proc_latency, self.items = 0, 0, 0
63 | 	spider.logger.info('now init values again: %d %d %d'%(self.latency, self.proc_latency, self.items))
64 | 
65 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/extensionsTime.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/extensionsTime.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | from scrapy.loader import ItemLoader
10 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
11 | 
12 | 
13 | class ShareCodeItem(scrapy.Item):
14 |     # define the fields for your item here like:
15 |     # name = scrapy.Field()
16 |     number = scrapy.Field()
17 |     
18 |     data = scrapy.Field()
19 | 
20 | class ShareLoader(ItemLoader):
21 |     default_item_class = ShareCodeItem
22 |     default_input_processor = MapCompose(lambda s: s.lstrip().replace('\r',''))#去掉左边空格
23 |     default_output_processor = Join()
24 |     description_out = Join()
25 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/items.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | import scrapy
  9 | from scrapy import signals
 10 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
 11 | from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
 12 | import random
 13 | import redis
 14 | 
 15 | 
 16 | class ShareCodeSpiderMiddleware(object):
 17 |     # Not all methods need to be defined. If a method is not defined,
 18 |     # scrapy acts as if the spider middleware does not modify the
 19 |     # passed objects.
 20 | 
 21 |     @classmethod
 22 |     def from_crawler(cls, crawler):
 23 |         # This method is used by Scrapy to create your spiders.
 24 |         s = cls()
 25 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 26 |         return s
 27 | 
 28 |     def process_spider_input(self, response, spider):
 29 |         # Called for each response that goes through the spider
 30 |         # middleware and into the spider.
 31 | 
 32 |         # Should return None or raise an exception.
 33 |         return None
 34 | 
 35 |     def process_spider_output(self, response, result, spider):
 36 |         # Called with the results returned from the Spider, after
 37 |         # it has processed the response.
 38 | 
 39 |         # Must return an iterable of Request, dict or Item objects.
 40 |         for i in result:
 41 |             yield i
 42 | 
 43 |     def process_spider_exception(self, response, exception, spider):
 44 |         # Called when a spider or process_spider_input() method
 45 |         # (from other spider middleware) raises an exception.
 46 | 
 47 |         # Should return either None or an iterable of Response, dict
 48 |         # or Item objects.
 49 |         pass
 50 | 
 51 |     def process_start_requests(self, start_requests, spider):
 52 |         # Called with the start requests of the spider, and works
 53 |         # similarly to the process_spider_output() method, except
 54 |         # that it doesn’t have a response associated.
 55 | 
 56 |         # Must return only requests (not items).
 57 |         for r in start_requests:
 58 |             yield r
 59 | 
 60 |     def spider_opened(self, spider):
 61 |         spider.logger.info('Spider opened: %s' % spider.name)
 62 | 
 63 | 
 64 | class ShareCodeDownloaderMiddleware(object):
 65 |     # Not all methods need to be defined. If a method is not defined,
 66 |     # scrapy acts as if the downloader middleware does not modify the
 67 |     # passed objects.
 68 | 
 69 |     @classmethod
 70 |     def from_crawler(cls, crawler):
 71 |         # This method is used by Scrapy to create your spiders.
 72 |         s = cls()
 73 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 74 |         return s
 75 | 
 76 |     def process_request(self, request, spider):
 77 |         # Called for each request that goes through the downloader
 78 |         # middleware.
 79 | 
 80 |         # Must either:
 81 |         # - return None: continue processing this request
 82 |         # - or return a Response object
 83 |         # - or return a Request object
 84 |         # - or raise IgnoreRequest: process_exception() methods of
 85 |         #   installed downloader middleware will be called
 86 |         return None
 87 | 
 88 |     def process_response(self, request, response, spider):
 89 |         # Called with the response returned from the downloader.
 90 | 
 91 |         # Must either;
 92 |         # - return a Response object
 93 |         # - return a Request object
 94 |         # - or raise IgnoreRequest
 95 |         return response
 96 | 
 97 |     def process_exception(self, request, exception, spider):
 98 |         # Called when a download handler or a process_request()
 99 |         # (from other downloader middleware) raises an exception.
100 | 
101 |         # Must either:
102 |         # - return None: continue processing this exception
103 |         # - return a Response object: stops process_exception() chain
104 |         # - return a Request object: stops process_exception() chain
105 |         pass
106 | 
107 |     def spider_opened(self, spider):
108 |         spider.logger.info('Spider opened: %s' % spider.name)
109 | 
110 | class DoubleUserAgentMiddleware(UserAgentMiddleware):
111 |     '''
112 |     设置User-Agent
113 |     '''
114 | 
115 |     def __init__(self, user_agent):
116 |         self.user_agent = user_agent
117 | 
118 |     @classmethod
119 |     def from_crawler(cls, crawler):
120 |         return cls(
121 |             user_agent=crawler.settings.get('DOUBLE_USER_AGENT')
122 |         )
123 | 
124 |     def process_request(self, request, spider):
125 |         agent = random.choice(self.user_agent)
126 |         request.headers['User-Agent'] = agent
127 | 
128 | class AuToIpMiddleware(HttpProxyMiddleware):
129 |         '''设置自动IP'''
130 |         def __init__(self,ipaddr=''):
131 |             super(AuToIpMiddleware,self).__init__
132 |             self.ipaddr=ipaddr
133 |             
134 |         def process_request(self, request, spider):
135 |             '''对request对象加上proxy'''  
136 |             self.ipaddr=self.get_random_proxy()
137 |             print('The ip is: '+str(self.ipaddr))
138 |             request.meta["proxy"] = self.ipaddr
139 |             
140 |         def process_response(self, request, response, spider):  
141 |             '''对返回的response处理'''  
142 |             # 如果返回的response状态不是200，重新生成当前request对象  
143 |             if response.status != 200:  
144 |                 proxy = self.get_random_proxy()  
145 |                 print("this is response ip:"+proxy)  
146 |                 # 对当前reque加上代理  
147 |                 request.meta['proxy'] = self.ipaddr
148 |                 return request  
149 |             return response  
150 |         
151 |         def get_random_proxy(self):
152 |             try:        
153 |                 R=redis.Redis(host='192.168.111.130',port='6379')
154 |             except Exception as e:
155 |                 print(e)
156 |             length=R.zcard('share:auto_ip_pool_ok')
157 |             number=random.randint(0,length-1)
158 |             ipaddr=R.zrange('share:auto_ip_pool_ok',number,number)[0]  #取出随机一个ip
159 |             ipaddr=ipaddr.decode('utf-8')
160 |             return ipaddr
161 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/middlewares.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | from hdfs import Client
 8 | from scrapy.exceptions import DropItem
 9 | 
10 | 
11 | class ShareCodePipeline(object):
12 |     def __init__(self,client):
13 |         print("ShareCodePipeline __init__")
14 | 	self.client=client
15 | 	'''
16 | 	try:
17 |             client=Client('http://192.168.111.130:50070')
18 | 	except Exception as e:
19 |             print(e)
20 | 	self.client=client
21 | 	'''
22 |     @classmethod
23 |     def from_settings(cls,settings):
24 | 	hdfs_master=settings['HDFS_MASTER']
25 | 	hdfs_address=settings['HDFS_ADDRESS']
26 | 	try:
27 |        	    client=Client('http://'+str(hdfs_master)+':'+str(hdfs_address))	
28 | 	except Exception as e:
29 | 	    print(e)
30 | 
31 | 	return cls(client)
32 | 
33 |     def process_item(self, item, spider):
34 |         print("ShareCodePipeline process_item")
35 |         
36 |         if item['number']:
37 |             number=item['number']
38 |         else:
39 |            raise DropItem('Missing number in %s'%item)
40 |         if item['data']:
41 |             data=item['data']
42 |         else:
43 |            raise DropItem('Missing data in %s'%item)
44 |         
45 |         data_str=data   #内含中文，先编码成utf-8
46 |         '''
47 | 	try:
48 |             client=Client('http://192.168.111.130:50070')
49 | 	except Exception as e:
50 |             print(e)
51 | 	'''
52 | 
53 |         try:
54 |             print('begin write')
55 | 	    if not self.client.content('/sdbadmin/hadoop/input/'+str(number)+'.csv',strict=False):
56 |             	self.client.write('/sdbadmin/hadoop/input/'+str(number)+'.csv',data=data_str,encoding='utf-8')
57 | 		print('hdfs client close!')
58 |             	print('end write')
59 | 	    else:
60 |   		print('dupilicate data!')
61 |         except Exception as e:
62 |             print(e)
63 | 
64 |         return item 
65 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/pipelines.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for share_code project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'share_code'
 13 | 
 14 | SPIDER_MODULES = ['share_code.spiders']
 15 | NEWSPIDER_MODULE = 'share_code.spiders'
 16 | 
 17 | 
 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 19 | #USER_AGENT = 'share_code (+http://www.yourdomain.com)'
 20 | 
 21 | # Obey robots.txt rules
 22 | ROBOTSTXT_OBEY = True
 23 | 
 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 25 | #CONCURRENT_REQUESTS = 32
 26 | 
 27 | # Configure a delay for requests for the same website (default: 0)
 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 29 | # See also autothrottle settings and docs
 30 | #DOWNLOAD_DELAY = 3
 31 | # The download delay setting will honor only one of:
 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 33 | #CONCURRENT_REQUESTS_PER_IP = 16
 34 | 
 35 | # Disable cookies (enabled by default)
 36 | COOKIES_ENABLED = False
 37 | 
 38 | # Disable Telnet Console (enabled by default)
 39 | #TELNETCONSOLE_ENABLED = False
 40 | 
 41 | # Override the default request headers:
 42 | #DEFAULT_REQUEST_HEADERS = {
 43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 44 | #   'Accept-Language': 'en',
 45 | #}
 46 | 
 47 | # Enable or disable spider middlewares
 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 49 | #SPIDER_MIDDLEWARES = {
 50 | #    'share_code.middlewares.ShareCodeSpiderMiddleware': 543,
 51 | #}
 52 | 
 53 | # Enable or disable downloader middlewares
 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 55 | DOWNLOADER_MIDDLEWARES = {
 56 |     'scrapy.downloadermiddleware.useragent.UserAgentMiddleware': None, 
 57 |     'share_code.middlewares.DoubleUserAgentMiddleware': 240,
 58 |     'share_code.middlewares.AuToIpMiddleware': None, #340
 59 |     'share_code.middlewares.ShareCodeDownloaderMiddleware': 543,
 60 |     'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware':None,   
 61 | }
 62 | 
 63 | # Enable or disable extensions
 64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
 65 | #EXTENSIONS = {
 66 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 67 | #}
 68 | 
 69 | # Configure item pipelines
 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 71 | ITEM_PIPELINES = {
 72 |     'share_code.pipelines.ShareCodePipeline': None,
 73 | }
 74 | 
 75 | # Enable and configure the AutoThrottle extension (disabled by default)
 76 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
 77 | #AUTOTHROTTLE_ENABLED = True
 78 | # The initial download delay
 79 | #AUTOTHROTTLE_START_DELAY = 5
 80 | # The maximum download delay to be set in case of high latencies
 81 | #AUTOTHROTTLE_MAX_DELAY = 60
 82 | # The average number of requests Scrapy should be sending in parallel to
 83 | # each remote server
 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 85 | # Enable showing throttling stats for every response received:
 86 | #AUTOTHROTTLE_DEBUG = False
 87 | 
 88 | # Enable and configure HTTP caching (disabled by default)
 89 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 90 | #HTTPCACHE_ENABLED = True
 91 | #HTTPCACHE_EXPIRATION_SECS = 0
 92 | #HTTPCACHE_DIR = 'httpcache'
 93 | #HTTPCACHE_IGNORE_HTTP_CODES = []
 94 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 95 | 
 96 | MYEXT_ENABLED = True
 97 | 
 98 | LATENCIES_INTERVAL = 5
 99 | ITEM_NUMBERS=10
100 | IDLE_NUMBERS=20
101 | 
102 | MEMUSAGE_NOTIFY_MAIL = ['3081881935@qq.com']
103 | MEMUSAGE_REPORT = True
104 | MEMUSAGE_ENABLED = True
105 | MEMUSAGE_LIMIT_MB = 4112
106 | MEMDEBUG_ENABLED = True
107 | MEMDEBUG_NOTIFY = []
108 | 
109 | EXTENSIONS = {
110 |     'share_code.extensionsItem.SpiderOpenCloseLogging': 100,
111 |     'share_code.extensionsTime.Latencies': 120,
112 |     'scrapy.contrib.memusage.MemoryUsage': 50,
113 |     'scrapy.contrib.memdebug.MemoryDebugger': 60
114 | }
115 | 
116 | ITEM_PIPELINES = {
117 |         'share_code.pipelines.ShareCodePipeline':100
118 |         }
119 | 
120 | # Enables scheduling storing requests queue in redis.
121 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
122 | 
123 | # Ensure all spiders share same duplicates filter through redis.
124 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
125 | 
126 | REDIS_START_URLS_AS_SET = True
127 | 
128 | REDIS_HOST = '192.168.111.130' 
129 | REDIS_PORT = '6379'
130 | 
131 | HDFS_MASTER = '192.168.111.130'
132 | HDFS_ADDRESS = '50070'
133 | 
134 | DOUBLE_USER_AGENT = [
135 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
136 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
137 |     "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
138 |     "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
139 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
140 |     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
141 |     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
142 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
143 |     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
144 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
145 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
146 |     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
147 |     "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
148 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
149 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
150 |     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
151 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
152 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
153 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
154 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
155 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
156 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
157 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
158 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
159 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
160 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
161 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
162 |     "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
163 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
164 |     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
165 |     "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
166 |     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
167 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
168 |     "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
169 |     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
170 |     ]
171 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/settings.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/spiders/SlaveSpider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Mar 26 13:42:52 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | import time
 8 | from scrapy_redis.spiders import RedisSpider
 9 | import redis
10 | from scrapy import log
11 | from share_code.items import ShareLoader
12 | #from scrapy.selector import Selector
13 | import re
14 | from kazoo.client import KazooClient   
15 | import sys
16 | reload(sys)
17 | sys.setdefaultencoding('gbk')
18 | 
19 | class SlaveSpider(RedisSpider):
20 |     name = "spider"
21 |     download_delay=2
22 |     #allowed_domains = ["spider.org"]
23 |     redis_key = 'share:download_url'
24 |     
25 |     zk = KazooClient(hosts='127.0.0.1:2181')  
26 |     zk.start()  
27 |     
28 |     # Ensure a path, create if necessary  
29 |     zk.ensure_path("/ip_process")  
30 |       
31 |     # Create a node with data  
32 |     zk.create("/ip_process/192.168.111.129",  
33 |              value=b"ok", ephemeral=True)  
34 |     
35 |     pool=redis.ConnectionPool(host='192.168.111.130', port=6379, decode_responses=True)
36 |       
37 | 
38 |     def parse(self, response):
39 | 	#print('response.body: ',response.body.decode('gbk'))  #str
40 | 	print('response.encoding: ',response.encoding)
41 | 	#print('body: ',response.body)
42 |         #print('response.request.meta: ',response.request.meta)
43 |         self.log('parse begin!',level=log.INFO)
44 |         try:
45 |             #global pool
46 |             R=redis.Redis(connection_pool=self.pool)
47 |         except Exception as e:
48 |             self.log(e,level=log.ERROR)
49 |         print('response.url: ',response.url)    
50 |         if R.sismember('share:dupefilter_bak',response.url):  #如果url不重复
51 |             self.log('download_url repeat! stop this chance and continue',level=log.DEBUG)   
52 |             #return None  
53 |         else:    
54 |             #打印user-agent
55 |             print('user-agent: ',response.request.headers['User-Agent'])
56 | 	    time.sleep(10)
57 |             el=ShareLoader(response=response)
58 |            
59 |             #text=str(response.body,encoding='gbk')
60 | 	    text=str(response.body) if type(response.body) != 'str' else response.body
61 |             el.add_value('data',text)
62 |             
63 |             pat=re.compile('.*?code=0(\d+).*?')  #正则表达式，取出code
64 |             download_url=response.url
65 |             code=pat.findall(download_url)
66 |             print('code: ',code)
67 |             
68 |             el.add_value('number',code)
69 |                 
70 |             R.sadd('share:dupefilter_bak',download_url) #储存已经spider的网页url，实现去重     
71 |             
72 |             self.log('parse end!',level=log.INFO)
73 |             
74 |             return el.load_item()
75 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/spiders/SlaveSpider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/SlaveSpider.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/slave/share_code/share_code/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/__init__.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/spiders/__pycache__/SlaveSpider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/__pycache__/SlaveSpider.cpython-36.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/slave/share_code/share_code/untitled2.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 20 01:08:28 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | 
 8 | class zoo:
 9 |     b=1
10 |     def __init__(self):
11 |         print('init',self.b)
12 |     @classmethod
13 |     def ok(cls,iin):
14 |         print('classmethod: ',iin)
15 |     print('hello')
16 |     global a
17 |     a=1
18 |     a+=1
19 |     print(a)
20 | zoo()


--------------------------------------------------------------------------------
/slave/zoo_detect/zoo_watcher.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri Apr 20 00:57:45 2018
 4 | 
 5 | @author: Administrator
 6 | """
 7 | 
 8 | from kazoo.client import KazooClient  
 9 |   
10 | import time  
11 |   
12 | import logging  
13 | logging.basicConfig()  
14 |   
15 | zk = KazooClient(hosts='127.0.0.1:2181')  
16 | zk.start()  
17 |   
18 | # Determine if a node exists  
19 | while True:  
20 |     for ip in ['192.168.111.129']:
21 |         if zk.exists("/ip_process/" + ip):  
22 |             print ("%s is alive!"%ip)  
23 |         else:  
24 |             print ("%s is dead!"%ip) 
25 |             break  
26 |     time.sleep(5)  
27 |   
28 | zk.stop()  
29 | 


--------------------------------------------------------------------------------
/test_nupic.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | 
  3 | import time
  4 | import numpy as np
  5 | import pandas as pd
  6 | import sklearn
  7 | from hdfs.client import Client
  8 | from similarity import MSD
  9 | import copy
 10 | from sklearn.neural_network import MLPRegressor
 11 | from sklearn import preprocessing
 12 | from sklearn.model_selection import GridSearchCV
 13 | from KMSD import kmeans_MSD
 14 | from share_experiment import swarm
 15 | from nupic.frameworks.opf.model_factory import ModelFactory
 16 | from nupic_output import NuPICFileOutput, NuPICPlotOutput
 17 | from nupic.swarming import permutations_runner
 18 | '''
 19 | filepath='/sdbadmin/hadoop/input'
 20 | try:
 21 |     client=Client('http://192.168.111.130:50070')
 22 | except Exception as e:
 23 |     print(e)
 24 | 
 25 | dirs=client.list(filepath)
 26 | '''
 27 | 
 28 | import os
 29 | filepath='/opt/share_code_data'
 30 | dirs=os.listdir(filepath)
 31 | 
 32 | #将hdfs本地化
 33 | print('there are %d shares'%(len(dirs)))
 34 | '''
 35 | try:
 36 |     for i in range(len(dirs)):
 37 |         client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i])
 38 | 	print('download success')
 39 | except Exception as e:
 40 |     print(e)
 41 | '''
 42 | #即使是使用nupic，也考虑预处理
 43 | min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
 44 | DD=pd.DataFrame([])
 45 | for i in range(len(dirs)):
 46 |     df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0) #利用第一列做索引
 47 |     if len(DD)==len(df) and len(df)!=0:
 48 |         trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1, 1) )
 49 |         DD[dirs[i].strip().split('.')[0]]=trun
 50 |     elif len(df)!=0:
 51 |         cols=DD.columns
 52 | 	trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1))
 53 |         DD=pd.concat([DD,pd.DataFrame(trun)],axis=1)   #长度不一致的合并
 54 |         f=list(cols)
 55 |         f.append(dirs[i].strip().split('.')[0])
 56 |         DD.columns=f
 57 | DD.fillna(0, inplace=True)
 58 | 
 59 | print('DD shape: ',np.shape(DD))
 60 | print(DD.head()) #[6690 rows x 169 columns]
 61 | #time.sleep(6)
 62 | clusters_DF=kmeans_MSD(DD.values,k=3)   # 返回一个dict，里面包含了所有的聚类,k是聚类数
 63 | clusterAss=clusters_DF[1]
 64 | 
 65 | newD=[]
 66 | leftD=[]
 67 | for i in range(DD.shape[1]):
 68 |     if len(np.nonzero(DD.values [:,i]==0)[0])<=int(0.3*DD.shape[0]):
 69 |         if len(newD)==0:
 70 |             newD=copy.deepcopy(DD.iloc[:,i])
 71 |         else:
 72 |             newD=pd.concat([newD,copy.deepcopy(DD.iloc[:,i])],axis=1)
 73 |     else:
 74 |         if len(leftD)==0:
 75 |             leftD=copy.deepcopy(DD.iloc[:,i])
 76 |         else:
 77 |             leftD=pd.concat([leftD,copy.deepcopy(DD.iloc[:,i])],axis=1)
 78 | 
 79 | ser=set(list(np.array(clusterAss[:,0])))  #聚类数
 80 | print('cluster leibie: ',ser)
 81 | 
 82 | #对newD的数据进行操作
 83 | for clu in ser:
 84 |     if str(clu)=='0.0':continue
 85 |     print('newD:',newD.shape,clusterAss.shape)
 86 |     Data=copy.deepcopy(newD.iloc[:,np.nonzero(clusterAss[:,0]==clu)[0]])    #选出某一聚类的所有数据
 87 |     clusterA=copy.deepcopy(clusterAss[np.nonzero(clusterAss[:,0]==clu)[0]])  #选出某一聚类的所有类分类结果数据
 88 |     minV=min(clusterA[:,1])  #选出最短距离
 89 |     index=list(clusterA[:,1]).index(minV)   #找出最短距离对应的share数据是哪一条
 90 | 
 91 |     data=copy.deepcopy(Data.iloc[:,index].values)
 92 |     data=data[np.nonzero(data!=0)[0]]   #swarm只考虑非0数据
 93 |     print('Data.columns: ',Data.columns)
 94 |     #time.sleep(6)
 95 |     paras=swarm(data,number=index,col=Data.columns[index])   #运行swarm
 96 |     print('paras: ',paras)   #best params
 97 |     import csv
 98 |     model = ModelFactory.create(paras)
 99 |     model.enableInference({"predictedField": "value"})
100 | 
101 |     output = NuPICFileOutput("output"+str(clu), show_anomaly_score=True)
102 | 
103 |     for i in range(Data.shape[1]):
104 | 	input_file='/opt/share_code_data/'+str(Data.columns[i])+'.csv'
105 |         with open(input_file, "rb") as sine_input:
106 |             csv_reader = csv.reader(sine_input)
107 |             # the real data
108 |            
109 |             # skip header rows
110 |        	    csv_reader.next()
111 |        	    csv_reader.next()
112 |             csv_reader.next()
113 |     
114 |        	    for row in csv_reader: 
115 |                 timeS=row[0]
116 |             	value = float(row[6])  
117 |     	    	result = model.run({"value": value})
118 |             	output.write(timeS,value, result, prediction_step=3)
119 | 
120 |             
121 |     
122 | #对leftD数据进行操作
123 | maxl=0;maxi=-1
124 | for i in range(leftD.shape[1]):    #找出最长的非0序列
125 |     if maxl<len(np.nonzero(leftD.iloc[:,i]==0)):maxl=len(np.nonzero(leftD.iloc[:,i]==0));maxi=i
126 | if maxi!=-1:
127 |     data=leftD.values[np.nonzero(leftD.values[:,maxi]!=0)[0],maxi]   #swarm只考虑非0数据
128 |     paras=swarm(data,number=maxi,col=leftD.columns[maxi])   #运行swarm
129 |     print('paras: ',paras)   #best params
130 |     model = ModelFactory.create(paras)
131 |     model.enableInference({"predictedField": "value"})
132 | 
133 |     output = NuPICFileOutput("leftD_output", show_anomaly_score=True)
134 | 
135 |     for i in range(leftD.shape[1]):
136 |         input_file='/opt/share_code_data/'+str(leftD.columns[i])+'.csv'
137 |         with open(input_file, "rb") as sine_input:
138 |             csv_reader = csv.reader(sine_input)
139 |             # the real data
140 | 
141 |             # skip header rows
142 |             csv_reader.next()
143 |             csv_reader.next()
144 |             csv_reader.next()
145 | 
146 |             for row in csv_reader:
147 |                 timeS=row[0]
148 |                 value = float(row[6])
149 |                 result = model.run({"value": value})
150 |                 output.write(timeS,value, result, prediction_step=3)
151 | 
152 | 


--------------------------------------------------------------------------------
/testbp.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | #  version-1  #
  3 | #将hdfs数据本地化---读入DataFrame----利用BP+SMD
  4 | #下面讲述如何利用BP+SMD
  5 | #将数据的‘开盘价’取出，形成一列。对所有股票操作，形成多列。使用SMD
  6 | import time
  7 | import numpy as np
  8 | import pandas as pd
  9 | import sklearn
 10 | from hdfs.client import Client
 11 | from similarity import MSD
 12 | import copy
 13 | from sklearn.neural_network import MLPRegressor
 14 | from sklearn import preprocessing
 15 | from sklearn.model_selection import GridSearchCV
 16 | '''
 17 | filepath='/sdbadmin/hadoop/input'
 18 | try:
 19 |     client=Client('http://192.168.111.130:50070')
 20 | except Exception as e:
 21 |     print(e)
 22 | 
 23 | dirs=client.list(filepath)
 24 | #将hdfs本地化
 25 | print('there are %d shares'%(len(dirs)))
 26 | 
 27 | try:
 28 |     for i in range(len(dirs)):
 29 |         client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i])
 30 | except Exception as e:
 31 |     print(e)
 32 | 
 33 | min_max_scaler = preprocessing.MinMaxScaler()
 34 | DD=pd.DataFrame([])
 35 | for i in range(len(dirs)):
 36 |     df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0)
 37 |     if len(DD)==len(df):
 38 | 	trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]))
 39 |         DD[dirs[i].strip().split('.')[0]]=trun
 40 |     else:
 41 | 	cols=DD.columns
 42 | 	DD=pd.concat([DD,copy.deepcopy(df.iloc[:,5])],axis=1)   #长度不一致的合并
 43 | 	f=list(cols)
 44 | 	f.append(dirs[i].strip().split('.')[0])
 45 | 	DD.columns=f
 46 | DD.fillna(-1, inplace=True)
 47 | 
 48 | print('DD shape: ',np.shape(DD))
 49 | print(DD) #[6690 rows x 169 columns]
 50 | '''
 51 | 
 52 | 
 53 | #计算MSD
 54 | #只针对非-1序列值
 55 | #设定距离长为100,计算得到的share称为 局部msd最优share
 56 | #算法如下：
 57 | '''
 58 | 1.选取要进行预测的share
 59 | 2.计算剩余的share中，msd最小的20条局部msd最优share
 60 | 3.对2中share计算百分比*share值，作为输入，输出为下一刻的目标值,取70%长度作为训练
 61 | '''
 62 | def local_MSD(a,b,start=0,end=100): #局部最优，默认100
 63 |     return MSD(a.iloc[start:end,],b.iloc[start:end,])
 64 | 
 65 | def local_MSD_layer(a,b):   #对应于出现断层数据
 66 |     l1=np.nonzero(a!=-1)[0]
 67 |     l2=np.nonzero(b!=-1)[0]
 68 |     l=list(set(l1) & set(l2))
 69 |     sim=local_MSD(a.iloc[l,],b.iloc[l,],end=len(a))
 70 |     return sim
 71 | def sortedTw(a):
 72 |     res=sorted(a,reverse=True)
 73 |     D={}
 74 |     for i in range(20):
 75 | 	val=res[i].values()[0].split('-')[1]
 76 | 	D[val]=res[i].keys()[0]   #序列号做keys,相似度做value
 77 |     return D
 78 | def update_train(tempDF,D,detect,trainD,trainL):
 79 |     #key=D.keys()
 80 |     #value=D.values()
 81 |     #tempLength=len(tempDF)
 82 |     FG=[]
 83 |     for key,value in D.items():   #value 就是相似度 ， key是对应序列
 84 |         print('key,value:',key,value)
 85 |         temp=copy.deepcopy(tempDF.iloc[:,int(key)])
 86 | 	temp=temp/float(value) if value!=0 else temp
 87 | 	FG.append(temp)
 88 |     if len(trainD)!=0:
 89 |         trainD=np.concatenate([trainD,np.array(FG).T],axis=0)  #向下拼接
 90 | 	trainL=np.concatenate([trainL,tempDF.iloc[:,detect].values.reshape(-1)],axis=0)
 91 |     else:
 92 | 	trainD=np.array(FG).T
 93 | 	trainL=tempDF.iloc[:,detect].values.reshape(-1)
 94 |     print('trainD,trainL:',trainD.shape,trainL.shape)
 95 |     return trainD,trainL
 96 | 
 97 | def create_examples(DF,detect,pro=1.0,msd_len=100):  #DF是pandas，pro可选，这里默认为1，意味着train/test数据集都生成，msd_len默认msd距离长为100,detect是目标列
 98 |     length=int(len(DF)*pro)
 99 |     print('length: ',length)
100 |     i=0
101 |     Data=np.array([])   #train data
102 |     Label=np.array([])   #train label
103 |     while i<length:
104 | 	start=i
105 | 	i+=msd_len
106 |         end=start+msd_len if start+msd_len<=length else length  #训练集不得超过length
107 |         tempDF=DF.iloc[start:end,:]   #截取msd_len长度的Data
108 | 	if len(np.nonzero(tempDF.iloc[:,detect]==-1)[0])>=int(0.1*len(tempDF)):    #对于目标列，限制为10%的好处是，可以避免这种情况：两个序列中，有一个序列非常多-1，导致计算不准确
109 | 	    print('Not satisfied!')
110 | 	    continue
111 | 	simL=[]
112 | 	for j in range(tempDF.shape[1]):  #遍历所有股票数
113 | 	    if j!=detect:  #自己不会跟自己计算
114 | 		l=len(np.nonzero(tempDF.iloc[:,j]==-1)[0])
115 | 		if l<=int(0.1*len(tempDF)):    #如果-1占比不超过10%,计算
116 | 		    sim=local_MSD_layer(tempDF.iloc[:,detect],tempDF.iloc[:,j]) 
117 | 		    simL.append({sim:str(detect)+'-'+str(j)})  #相似度做keys
118 | 		else:
119 | 		    print('%d col is not suitable for %d line because -1 too much '%(j,detect))
120 |         D=sortedTw(simL)   #得到与detect符合的20个序列序号，以及相似度
121 | 	Data,Label = update_train(tempDF,D,detect,Data,Label)   #更新train训练集和标签	
122 | 	print('Data print: ',Data)
123 | 	#time.sleep(6)
124 |     return Data,Label
125 | 
126 | def percent(a):
127 |     a=pd.DataFrame(a)
128 |     a.columns=['origin','predict']
129 |     b=[]
130 |     for i in range(len(a)):
131 | 	d=abs(a.iloc[i,0]-a.iloc[i,1])/a.iloc[i,1] if a.iloc[i,1] else 'Error'
132 | 	if not isinstance(d,str):
133 | 	    d=str(d*100)+'%'
134 | 	b.append(d)
135 |     a['loss']=b
136 |     return a
137 | def use_algorithm(Data,Label,pro=0.7):   #利用sklearn的神经网络实现预测，默认0.7训练集
138 |     #from sklearn.neural_network import MLPRegressor    
139 |     train_length=int(pro*len(Data))
140 |     trainD=Data[:train_length,:]
141 |     testD=Data[train_length:,:]
142 |     trainL=Label[:train_length]
143 |     testL=Label[train_length:]
144 |     print('trainD,trainL,testD,testL: ',trainD.shape,trainL.shape,testD.shape,testL.shape)
145 |     #exit(1)
146 |     # 神经网络
147 |     parameters = {  
148 |         'hidden_layer_sizes': [(15,),(15,7,3),(21,3),(7,3,3),(15,7,5)], 
149 | 	'max_iter': [20000,100000,200000],
150 | 	'momentum': [0,0.5,0.7,1],
151 | 	'learning_rate': ['adaptive','constant','invscaling'],\
152 | 	'solver': ['sgd','adam'],\
153 | 	'shuffle': [False],\
154 | 	'activation': ['logistic','relu','tanh']
155 |     }
156 |     mlp = MLPRegressor()
157 |     clf = GridSearchCV(mlp, parameters)
158 |     model=clf.fit(trainD,trainL)
159 |     bestp=clf.best_params_   
160 |     with open('best_params.csv','w') as f:
161 | 	f.write(str(bestp))
162 | 	f.close()
163 |     with open('best_params.csv','r') as f:
164 |         bestp=eval(f.read())
165 |         f.close()
166 |     print('bestp: ',bestp)
167 |     #clf = MLPRegressor(hidden_layer_sizes=(15,), max_iter=100000,learning_rate='adaptive',solver='sgd',shuffle=False,activation='logistic')
168 |     #model = clf.fit(trainD,trainL)
169 |     predictD = model.predict(testD)   #predict data
170 | 
171 |     nnDF=np.concatenate([predictD.reshape(-1,1),testL.reshape(-1,1)],axis=1)
172 |     loss=percent(nnDF)
173 |     print('loss: ',loss)
174 |     #coefficient of determination ---- 1.0 is the best
175 |     score = model.score(testD,testL)
176 |     print('score: ',score)
177 |     loss.to_csv('/home/sdbadmin/loss.csv',index=False)   #将预测结果本地化
178 | 
179 | if __name__=='__main__':       
180 |     
181 |     import time
182 |     beginTime=time.time()
183 |     '''
184 |     filepath='/sdbadmin/hadoop/input'
185 |     try:
186 |         client=Client('http://192.168.111.130:50070')
187 |     except Exception as e:
188 |         print(e)
189 | 
190 |     dirs=client.list(filepath)
191 |     '''
192 |     import os
193 |     filepath='/opt/share_code_data'
194 |     dirs=os.listdir(filepath)
195 |     #将hdfs本地化
196 |     print('there are %d shares'%(len(dirs)))
197 |     '''
198 |     try:
199 |         for i in range(len(dirs)):
200 |             client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i])
201 |     except Exception as e:
202 |         print(e)
203 |     '''
204 |     min_max_scaler = preprocessing.MinMaxScaler()
205 |     DD=pd.DataFrame([])
206 |     for i in range(len(dirs)):
207 | 	print('which csv: ',dirs[i])
208 |         df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0)
209 |         if len(DD)==len(df) or len(DD)==0 and len(df)!=0:
210 |             trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1))
211 |             DD[dirs[i].strip().split('.')[0]]=trun.ravel()	
212 |         elif len(df)!=0:
213 |             cols=DD.columns
214 | 	    try:
215 | 	    	trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1))
216 | 	    except Exception as e:
217 | 		print(e)
218 | 		print(df.iloc[:,5])
219 | 		time.sleep(56)
220 |             DD=pd.concat([DD,pd.DataFrame(trun)],axis=1)   #长度不一致的合并
221 |             f=list(cols)
222 |             f.append(dirs[i].strip().split('.')[0])
223 |             DD.columns=f
224 |     DD.fillna(-1, inplace=True)
225 | 
226 |     print('DD shape: ',np.shape(DD))
227 |     print(DD) #[6690 rows x 169 columns]
228 | 
229 |     Data,Label=create_examples(DD,detect=0)	
230 | 	
231 |     use_algorithm(Data,Label)		
232 |     endTime=time.time()
233 |     print('\nalgorithm runTime: ',endTime-beginTime)
234 | 
235 | '''
236 | for i in range(y):  #遍历所有share
237 |     print('Now is %s'%DD.columns[i])
238 |     for j in range(y):
239 | 	if i!=j:
240 | 	    l1=np.nonzero(DD.iloc[:,i]!=-1)[0]
241 | 	    l2=np.nonzero(DD.iloc[:,j]!=-1)[0]
242 | 	    l=set(l1) & set(l2)
243 | 	    tempDF=copy.deepcopy(DD.iloc[list(l),[i,j]])
244 | 	    print('tempDF shape: ',tempDF.shape)
245 | 	    train_len=int(len(tempDF)*0.7)
246 | 	    test_len=int(len(tempDF)*0.3)
247 | '''	     
248 | 


--------------------------------------------------------------------------------
/参考博客:
--------------------------------------------------------------------------------
 1 |  解决 Scrapy-Redis 空跑问题，链接跑完后自动关闭爬虫
 2 |  https://my.oschina.net/2devil/blog/1631116
 3 |  
 4 |  Scrapy之Extension实例——计算吞吐量及时延
 5 |  https://blog.csdn.net/q_an1314/article/details/51188137
 6 |  
 7 |  使用Python进行分布式系统协调 (ZooKeeper/Consul/etcd)
 8 |  https://blog.csdn.net/younger_china/article/details/53063426
 9 |  
10 |  在Scrapy中使用IP池或用户代理（python3）
11 |  https://www.cnblogs.com/xiaomingzaixian/p/7121280.html
12 |  
13 |  爬取网易财经中股票的历史交易数据
14 |  https://blog.csdn.net/pythoncodez/article/details/77623287
15 |  
16 |  使用scrapy框架爬取股票数据
17 |  https://blog.csdn.net/u010986776/article/details/79261999
18 |  
19 |  https://blog.csdn.net/u012150179/article/details/38091411
20 |  scrapy-redis实现爬虫分布式爬取分析与实现
21 |  
22 |  使用scrapy-redis构建简单的分布式爬虫
23 |  https://blog.csdn.net/howtogetout/article/details/51633814
24 |  
25 |  scrapy-redis介绍（一）
26 |  https://blog.csdn.net/hjhmpl123/article/details/53292602
27 | 


--------------------------------------------------------------------------------
/各组件安装文档说明:
--------------------------------------------------------------------------------
 1 | #1.zookeeper 安装
 2 | 
 3 | [root@sdb3 opt]# tar -zxf zookeeper-3.4.10.tar.gz 
 4 | [root@sdb3 opt]# chown -R sdbadmin:sdbadmin_group zookeeper-3.4.10
 5 | [root@sdb3 opt]# su – sdbadmin
 6 | [sdbadmin@sdb3 src]$ cd ../conf/
 7 | [sdbadmin@sdb3 conf]$ vim zoo.cfg
 8 | tickTime = 2000
 9 | dataDir = /opt/zookeeper-3.4.10/data
10 | clientPort = 2181
11 | initLimit = 5
12 | syncLimit = 2
13 | server.0=192.168.111.130:2888:3888
14 | server.1=192.168.111.129:2888:3888
15 | server.2=192.168.111.128:2888:3888
16 | [sdbadmin@sdb3 conf]$ cd ../
17 | [sdbadmin@sdb3 zookeeper-3.4.10]$ mkdir data
18 | [sdbadmin@sdb3 zookeeper-3.4.10]$ cd data
19 | [sdbadmin@sdb3 data]$ echo '2' > myid
20 | ---------------------------------------------------
21 | 
22 | #2.hive安装
23 | 
24 | 利用MySQL储存hive的元数据信息，因此先安装MySQL
25 | 
26 | yum install mysql
27 | yum install mysql-server
28 | yum install mysql-devel
29 | 
30 | 下载hive安装包
31 | tar -zxcf /opt/apache-hive-1.2.2-bin.tar.gz
32 | 配置hive与MySQL的集成，储存元信息
33 | 
34 | CREATE USER 'sdbadmin'@'%' IDENTIFIED BY '';
35 | GRANT ALL ON *.* TO 'sdbadmin'@'%';
36 | flush privileges;
37 | create databases hive;
38 | 
39 | #配置hive
40 | 具体参考https://www.cnblogs.com/kinginme/p/7233315.html
41 | https://blog.csdn.net/jssg_tzw/article/details/72354470
42 | https://blog.csdn.net/jssg_tzw/article/detahttps://mp.weixin.qq.com/s?__biz=MzIzODExMDE5MA==&mid=2694182433&idx=1&sn=687b754cddc7255026434c683f487ac0#rdils/72354470
43 | 
44 | 
45 | #hive学习
46 | https://mp.weixin.qq.com/s?__biz=MzIzODExMDE5MA==&mid=2694182433&idx=1&sn=687b754cddc7255026434c683f487ac0#rd
47 | https://blog.csdn.net/jssg_tzw/article/details/72354470
48 | https://www.cnblogs.com/kinginme/p/7233315.html
49 | 
50 | #利用hive读取hdfs数据到外部表
51 |  hive> create external table os
52 |     > (dt string, StockCode string, Name string, ClosingPrice float, HighestPrice float, LowestPrice float, OpeningPrice float, LastClose float, Change float, QuoteChange float, Turnover float, DealAmount float, DealValue float, TotalMarketValue float, CirculationMarketValue float)
53 |     > row format delimited fields terminated by ',' lines terminated by '\n'
54 |     > stored as textfile
55 | > tblproperties ("skip.header.line.count"="1");
56 | 
57 | >load data inpath '/sdbadmin/hadoop/input/601799.csv' into table os;
58 | 	>select * from os;
59 |   ---------------------------------------------------------
60 |   
61 | #3.安装redis
62 | 
63 | $ wget http://download.redis.io/releases/redis-2.8.17.tar.gz
64 | $ tar xzf redis-2.8.17.tar.gz
65 | $ cd redis-2.8.17
66 | $ make
67 | $ cd src
68 | $ ./redis-server
69 | ---------------------------------------
70 | 
71 | #4.安装scrapy
72 | 
73 | wget https://twistedmatrix.com/Releases/Twisted/13.1/Twisted-13.1.0.tar.bz2
74 | 1.	wget https://twistedmatrix.com/Releases/Twisted/17.1/Twisted-17.1.0.tar.bz2  
75 | 2.	tar -jxvf Twisted-17.1.0.tar.bz2  
76 | 3.	cd Twisted-17.1.0  
77 | 4.	python setup.py install  
78 | 5.	cd ..  
79 | 6.	pip install scrapy  
80 | 
81 | # 安装scrapy-redis
82 | 7.  pip install scrapy-redis
83 | 
84 | 解决scrapy任何地方启动：
85 | 进入 Python 的主目录，如cd /usr/local/python3.6/bin，查找 scrapy 项 
86 | （2）检查 cd /usr/bin/ | ll | grep scrapy，查看是否存在 
87 | （3）不存在则执行
88 |  ln -s /usr/local/python-2.7/bin/scrapy /usr/bin/scrapy 
89 | （4）回到shell，执行 scrapy version，成功
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/启动步骤:
--------------------------------------------------------------------------------
 1 | #项目的框架有点大，所以启动步骤有点麻烦，下面贴出了master/slave主机的启动步骤。
 2 | 
 3 | #Master主机（Master端的程序负责给定初始url，程序基于此下载好目标url，储存进redis，供slave端程序调用）
 4 | 1.	[sdbadmin@sdb1 ~]$ /opt/zookeeper-3.4.10/bin/zkServer.sh start   #启动zookeeper
 5 | 2.	[sdbadmin@sdb1 ~]$ /opt/hadoop-2.7.2/sbin/start-all.sh   #启动Hadoop
 6 | 3.	[sdbadmin@sdb1 ~]$ /opt/redis-2.8.17/src/redis-server   #启动redis
 7 | 4.	[sdbadmin@sdb1 ~]$hdfs dfs -mkdir -p /sdbadmin/hadoop/input
 8 | 
 9 | 给定初始url，准备工作完毕
10 | 5.  [sdbadmin@sdb1 share_code]$ /opt/redis-2.8.17/src/redis-cli sadd share:start_urls http://quote.eastmoney.com/stocklist.html#sh
11 | 
12 | 进入爬虫所在目录，开始跑爬虫程序
13 | 6.  [sdbadmin@sdb1 share_code]$ scrapy crawl share
14 | 
15 | #Slave端主机（Slave端程序负责从redis中读取目标url，下载好股票数据，并存进hdfs）
16 | 1.  [root@sdb2 ~]# service mysqld start   #启动MySQL 为hive做准备
17 | 2.  [sdbadmin@sdb2 ~]# hive --service metastore #启动hive
18 | 3.  [sdbadmin@sdb2 share_code]# scrapy crawl spider  #进入爬虫所在目录，运行爬虫
19 | 
20 | 


--------------------------------------------------------------------------------
/笔记摘录:
--------------------------------------------------------------------------------
 1 | 在redis的服务器中，会至少存在三个队列： 
 2 | a.用于请求对象去重的集合，队列的名称为spider.name:dupefilter，其中spider.name就是我们自定义的spider的名字，下同。 
 3 | b.待抓取的request对象的有序集合，队列的名称为spider.name:requests 
 4 | c.保存提取到item的列表，队列的名称为spider.name:items 
 5 | d.可能存在存放初始url的集合或者是列表，队列的名称可能是spider.name:start_urls
 6 | 
 7 | 内存监控：
 8 | MEMUSAGE_NOTIFY_MAIL = ['3081881935@qq.com']
 9 | MEMUSAGE_REPORT = True
10 | MEMUSAGE_ENABLED = True
11 | MEMUSAGE_LIMIT_MB = 2048
12 | MEMDEBUG_ENABLED = True
13 | MEMDEBUG_NOTIFY = []
14 | 
15 | EXTENSIONS = {
16 |     'share_code.extensionsItem.SpiderOpenCloseLogging': 100,
17 |     'share_code.extensionsTime.Latencies': None,
18 |     'scrapy.contrib.memusage.MemoryUsage': 50,
19 |     'scrapy.contrib.memdebug.MemoryDebugger': 60
20 | }
21 | 
22 | 基于形态相似距离的时间序列相似度计算  李中刘洋洋 
23 | https://wenku.baidu.com/view/58dfefbc2b160b4e777fcf77.html
24 | 
25 | 
26 | 隐藏层大小：（输入大小+输出大小）*2/3
27 | 
28 | hive -e "select * from /sdbadmin/hadoop/input/900915.csv" >> res1.csv新建csv文件，在此之前先将hdfs数据导入hive。
29 | 
30 | 启动hive：
31 | Service mysqld start
32 | hive --service metastore
33 | hive
34 | 
35 | 
36 | scrapy_redis原理
37 | 
38 | (https://blog.csdn.net/hjhmpl123/article/details/53292602)
39 | scrapy-redis原理: 
40 | 1.spider解析下载器下载下来的response,返回item或者是links 
41 | 2.item或者links经过spidermiddleware的process_spider_out()方法，交给engine。 
42 | 3.engine将item交给itempipeline,将links交给调度器 
43 | 4.在调度器中，先将request对象利用scrapy内置的指纹函数，生成一个指纹对象 
44 | 5.如果request对象中的dont_filter参数设置为False,并且该request对象的指纹不在信息指纹的队列中，那么就把该request对象放到优先级的队列中 
45 | 6.从优先级队列中获取request对象，交给engine 
46 | 7.engine将request对象交给下载器下载，期间会通过downloadmiddleware的process_request()方法 
47 | 8.下载器完成下载，获得response对象，将该对象交给engine,期间会通过downloadmiddleware的process_response()方法 
48 | 9.engine将获得的response对象交给spider进行解析，期间会经过spidermiddleware的process_spider_input()方法 
49 | 10.从第一步开始循环
50 | 
51 | 
52 | 
53 | zookeeper 路由和负载均衡的实现。
54 | 在zookeeper中，一但服务器与zookeeper集群断开连接，znode节点已经不存在，此时通过注册相应的watcher机制，服务消费者能够第一时间获取服务提供者信息的变更。利用znode的特点和watcher机制将其作为动态注册和获取服务信息的配置中心，统一管理服务名称和其对应的服务器列表，能够实时的感知到后端服务器的状态，从而保持服务配置信息能够一致以及进行简单的扩容。
55 | Zookeeper类似于一棵节点树，当服务提供者启动时，将服务器的名称、地址以节点的信息添加到配置中心，而服务消费中通过服务器名称以及配置中心来获得需要调用的服务节点下的服务地址，再利用负载均衡算法选取其中的某一台服务器进行调用。
56 | 
57 | 
58 | 
59 | HIVE 介绍
60 | (https://mp.weixin.qq.com/s?__biz=MzIzODExMDE5MA==&mid=2694182433&idx=1&sn=687b754cddc7255026434c683f487ac0#rd)
61 | （1）hive 是基于 Hadoop 的一个数据仓库工具，可以将结构化的数据文件映射为一张数据库表，并提供完整的 sql 查询功能，可以将 sql 语句转换为 MapReduce 任务进行运行。其优点是学习成本低，可以通过类 SQL 语句快速实现简单的 MapReduce 统计，不必开发专门的 MapReduce 应用，十分适合数据仓库的统计分析。
62 | （2）Hive 是建立在 Hadoop 上的数据仓库基础构架。它提供了一系列的工具，可以用来进行数据提取转化加载（ETL），这是一种可以存储、查询和分析存储在 Hadoop 中的大规模数据的机制。Hive 定义了简单的类 SQL 查询语言，称为 HQL，它允许熟悉 SQL 的用户查询数据。同时，这个语言也允许熟悉 MapReduce 开发者的开发自定义的 mapper 和 reducer 来处理内建的 mapper 和 reducer 无法完成的复杂的分析工作。
63 | 
64 | 


--------------------------------------------------------------------------------