├── PCA 与LDA.py
├── README.md
├── mapreduce .py
├── 二值化.py
├── 二元变量的相似度计算.py
├── 互信息法.py
├── 区间缩放.py
├── 单变量选择.py
├── 卡方检验.py
├── 基于惩罚项的特征选择法.py
├── 基于树模型的特征选择法.py
├── 对缺省值的处理.py
├── 归一化.py
├── 数据变换.py
├── 数据标准化.py
├── 数据预处理与特征选择.py
├── 方差判别特征相似程度.py
├── 最大互信息系统.py
├── 特征提取.py
├── 特征的相关系数法.py
├── 皮尔逊相关系数.py
└── 距离相关系数.py

/PCA 与LDA.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 16:13:57 2017

@author: Lenovo-Y430p
"""
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

iris = load_iris()
# Principal component analysis (PCA); returns the reduced data.
# n_components is the number of principal components to keep.
print(PCA(n_components=2).fit_transform(iris.data))

# sklearn.lda was removed; LDA now lives in sklearn.discriminant_analysis.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Linear discriminant analysis (LDA); returns the reduced data.
# n_components is the dimensionality after reduction; LDA is supervised,
# so it also takes the target.
print(LDA(n_components=2).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# data-analysis
A feature-extraction and feature-scoring project built on self-written
functions (missing-value handling, univariate correlation analysis,
feature scoring, dimensionality reduction).
--------------------------------------------------------------------------------
/mapreduce .py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 18 09:59:37 2017

@author: Lenovo-Y430p
"""
import sys

# ---- mapper ----
# Read from the standard input stream.
for line in sys.stdin:
    # Strip surrounding whitespace.
    line = line.strip()
    # Split the line into a list of words.
    words = line.split()
    # Emit one key/value pair per word.
    for word in words:
        # Map output: key is the word, value is 1. The shuffle step that
        # follows sorts by key before the reducer runs; the map and shuffle
        # stages execute on the local node.
        print('%s\t%s' % (word, 1))

# ---- reducer ----
current_word = None
current_count = 0
word = None
# Input comes from STDIN (the sorted map output).
for line in sys.stdin:
    # Remove leading and trailing whitespace.
    line = line.strip()

    # Parse the input we got from mapper.py.
    word, count = line.split('\t', 1)

    # Convert count (currently a string) to int.
    try:
        count = int(count)
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line.
        continue

    # This IF-switch only works because Hadoop sorts map output
    # by key (here: word) before it is passed to the reducer.
    if current_word == word:
        current_count += count
    else:
        if current_word:
            # Write result to STDOUT.
            print('%s\t%s' % (current_word, current_count))
        current_count = count
        current_word = word

# Do not forget to output the last word if needed!
if current_word == word:
    print('%s\t%s' % (current_word, current_count))
--------------------------------------------------------------------------------
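As written, the mapper and reducer loops share one file, so the second loop
would find stdin already exhausted; in Hadoop Streaming they run as separate
scripts. A minimal local simulation of the map-shuffle-reduce flow, assuming
the two loops are split into hypothetical mapper.py and reducer.py files and
an input.txt exists:

import subprocess
# sort -k1,1 stands in for Hadoop's shuffle, which groups identical keys so
# the reducer's current_word comparison sees each word contiguously.
cmd = "cat input.txt | python mapper.py | sort -k1,1 | python reducer.py"
print(subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout)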
/二值化.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 14:22:12 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import Binarizer
import numpy as np

iris = load_iris()
# Binarization with the threshold set to 3; returns the binarized data.
# print(Binarizer(threshold=3).fit_transform(iris.data))

def Binarizer1(threshold):
    # Hand-rolled equivalent of sklearn's Binarizer: values >= threshold
    # become 1, everything else becomes 0 (modifies iris.data in place).
    m = np.shape(iris.data)[0]
    n = np.shape(iris.data)[1]
    for i in range(m):
        for j in range(n):
            if iris.data[i, j] >= threshold:
                iris.data[i, j] = 1
            else:
                iris.data[i, j] = 0
    return iris.data

def main():
    k = Binarizer1(3)
    print(k)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/二元变量的相似度计算.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 20:48:21 2017

@author: Lenovo-Y430p
"""
# Compute the Jaccard coefficient.
'''
When the data are binary variables there are only two states, 0 and 1,
and the usual similarity measures no longer apply. The Jaccard coefficient
measures the similarity of two sets of binary (or multi-valued) attributes:

    J(p, q) = |p ∩ q| / |p ∪ q|
'''
def jaccard(p, q):
    # Intersection: elements of p that also appear in q.
    c = [v for v in p if v in q]
    # |p ∪ q| = |p| + |q| - |p ∩ q|
    return float(len(c)) / (len(p) + len(q) - len(c))

# Note: both inputs must be deduplicated before use.
# Test it on a small example:
p = ['shirt', 'shoes', 'pants', 'socks']
q = ['shirt', 'shoes']
print(jaccard(p, q))  # Result: 0.5
--------------------------------------------------------------------------------
/互信息法.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 16:07:34 2017

@author: Lenovo-Y430p
"""
from sklearn.feature_selection import SelectKBest
from minepy import MINE
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()

# MINE's API is not functional in style; mic() wraps it as a plain function
# returning a (score, p-value) pair, with the p-value fixed at 0.5.
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

# Score function for SelectKBest: per-feature MIC scores and p-values.
def mic_score(X, Y):
    result = np.array([mic(x, Y) for x in X.T])
    return result[:, 0], result[:, 1]

# Select the K best features and return the reduced data.
print(SelectKBest(mic_score, k=2).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/区间缩放.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 11:03:39 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Min-max (interval) scaling.
iris = load_iris()
# The one-line sklearn version:
# print(MinMaxScaler().fit_transform(iris.data))

# Hand-rolled implementation: collect each column's max and min.
def minmax():
    m = np.shape(iris.data)[1]
    temp1 = []
    temp2 = []
    for i in range(m):
        t = np.max(iris.data[:, i])
        t1 = np.min(iris.data[:, i])
        temp1.append(float('%.2f' % t))   # the precision can be fixed while converting to float
        temp2.append(float('%.2f' % t1))
    return np.mat(temp1), np.mat(temp2)

def main():
    maxvalue, minvalue = minmax()
    offset = maxvalue - minvalue
    # Per-column rescaling: (x - min) / (max - min).
    temp = (iris.data - minvalue) / offset
    print(temp)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
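The same rescaling can be written without explicit loops. A minimal vectorized
sketch of the (x - min) / (max - min) formula used above:

import numpy as np
from sklearn.datasets import load_iris

data = load_iris().data
# Per-column min-max scaling via broadcasting, matching MinMaxScaler().
scaled = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
print(scaled)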
/单变量选择.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 17:00:42 2017

@author: Lenovo-Y430p
"""
# 1 Univariate feature selection

# 1.1 Pearson correlation, range [-1, 1]
# A way to understand the relation between a feature and the response
# variable; it measures the linear correlation between variables.

import numpy as np
from scipy.stats import pearsonr  # pearsonr comes from scipy
from sklearn.datasets import load_iris

iris = load_iris()
# pearsonr expects 1-D inputs, so score each feature column separately.
for i in range(iris.data.shape[1]):
    print("feature %d" % i, pearsonr(iris.data[:, i], iris.target))
# print("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size)))

# Obvious drawback: as a feature-ranking mechanism it is only sensitive to
# linear relations; even when two variables have a perfect one-to-one
# relation, the Pearson correlation can be close to 0.
a = np.random.uniform(-1, 1, 100000)  # uniform(low, high, size) random numbers
print(pearsonr(a, a**2)[0])

# 1.2 Mutual information and maximal information coefficient (MIC), range [0, 1]
# Raw mutual information is awkward to use directly for feature selection;
# MIC first finds an optimal discretization and then turns the mutual
# information into a score in [0, 1]. minepy provides the MIC implementation.
from minepy import MINE

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print(m.mic())

# 1.3 Distance correlation, range [0, 1]
# Designed to overcome the weakness of the Pearson correlation: in the
# x vs x^2 example, a Pearson correlation of 0 does not let us conclude that
# the two variables are independent (the relation may be nonlinear), but a
# distance correlation of 0 does imply independence.
def dist(x, y):
    # 1-D only: matrix of pairwise absolute differences.
    return np.abs(x[:, None] - y)

def d_n(x):
    # Double-center the distance matrix.
    d = dist(x, x)
    dn = d - d.mean(0) - d.mean(1)[:, None] + d.mean()
    return dn

def dcov_all(x, y):
    dnx = d_n(x)
    dny = d_n(y)

    denom = np.prod(dnx.shape)
    dc = (dnx * dny).sum() / denom   # distance covariance statistic
    dvx = (dnx**2).sum() / denom     # distance variance of x
    dvy = (dny**2).sum() / denom     # distance variance of y
    dr = dc / (np.sqrt(dvx) * np.sqrt(dvy))  # distance correlation statistic
    return dc, dr, dvx, dvy

import matplotlib.pyplot as plt

fig = plt.figure()
for case in range(1, 5):
    np.random.seed(9854673)
    x = np.linspace(-1, 1, 501)
    if case == 1:
        y = -x**2 + 0.2 * np.random.rand(len(x))
    elif case == 2:
        y = np.cos(x * 2 * np.pi) + 0.1 * np.random.rand(len(x))
    elif case == 3:
        x = np.sin(x * 2 * np.pi) + 0.0 * np.random.rand(len(x))  # circle
    elif case == 4:
        x = np.sin(x * 1.5 * np.pi) + 0.1 * np.random.rand(len(x))  # bretzel
    dc, dr, dvx, dvy = dcov_all(x, y)
    print(dc, dr, dvx, dvy)

    ax = fig.add_subplot(2, 2, case)
    # ax.set_xlim(-1, 1)
    ax.plot(x, y, '.')
    yl = ax.get_ylim()
    ax.text(-0.95, yl[0] + 0.9 * np.diff(yl), 'dr=%4.2f' % dr)

plt.show()

# 1.4 Model-based ranking
# Fit the machine-learning model you actually intend to use on each single
# feature against the response. Linear relations: equivalent to Pearson;
# nonlinear relations: tree-based methods (decision tree, random forest).
# sklearn.cross_validation was renamed to sklearn.model_selection.
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

# Load the Boston housing dataset as an example
# (note: load_boston was removed in scikit-learn 1.2).
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]
# Keep the trees fairly shallow, and use cross-validation.
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
scores = []
for i in range(X.shape[1]):
    score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                            cv=ShuffleSplit(n_splits=3, test_size=0.3))
    scores.append((round(np.mean(score), 3), names[i]))
print(sorted(scores, reverse=True))
--------------------------------------------------------------------------------
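A quick numeric contrast between sections 1.1 and 1.3, assuming the pearsonr
import and the dcov_all function defined in the file above are in scope:

# For y = x^2 the Pearson r is ~0, while the distance correlation statistic
# dr stays clearly away from 0, flagging the nonlinear dependence.
a = np.random.uniform(-1, 1, 1000)
dc, dr, dvx, dvy = dcov_all(a, a**2)
print(pearsonr(a, a**2)[0], dr)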
-*- 2 | """ 3 | Created on Fri Jun 30 16:03:18 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | from sklearn.feature_selection import SelectKBest 8 | from sklearn.feature_selection import chi2 9 | from sklearn.datasets import load_iris 10 | iris=load_iris() 11 | #选择K个最好的特征,返回选择特征后的数据 12 | ''' 13 | 卡方检验就是统计样本的实际观测值与理论推断值之间的偏离程度, 14 | 实际观测值与理论推断值之间的偏离程度就决定卡方值的大小, 15 | 卡方值越大,越不符合;卡方值越小,偏差越小,越趋于符合, 16 | 若两个值完全相等时,卡方值就为0,表明理论值完全符合。 17 | 检验样本是否符合正太分布 18 | ''' 19 | #文本分析,和筛选异常用户 20 | print(SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)) 21 | 22 | -------------------------------------------------------------------------------- /基于惩罚项的特征选择法.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 30 16:08:26 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | #基于惩罚项的特征选择法 8 | from sklearn.feature_selection import SelectFromModel 9 | from sklearn.linear_model import LogisticRegression 10 | from numpy import array 11 | from sklearn.datasets import load_iris 12 | iris=load_iris() 13 | #带L1惩罚项的逻辑回归作为基模型的特征选择 14 | print(SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(iris.data, iris.target)) 15 | 16 | class LR(LogisticRegression): 17 | def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0, 18 | fit_intercept=True, intercept_scaling=1, class_weight=None, 19 | random_state=None, solver='liblinear', max_iter=100, 20 | multi_class='ovr', verbose=0, warm_start=False, n_jobs=1): 21 | 22 | #权值相近的阈值 23 | self.threshold = threshold 24 | LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C, 25 | fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, 26 | random_state=random_state, solver=solver, max_iter=max_iter, 27 | multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs) 28 | #使用同样的参数创建L2逻辑回归 29 | self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight = class_weight, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs) 30 | 31 | def fit(self, X, y, sample_weight=None): 32 | #训练L1逻辑回归 33 | super(LR, self).fit(X, y, sample_weight=sample_weight) 34 | self.coef_old_ = self.coef_.copy() 35 | #训练L2逻辑回归 36 | self.l2.fit(X, y, sample_weight=sample_weight) 37 | 38 | cntOfRow, cntOfCol = self.coef_.shape 39 | #权值系数矩阵的行数对应目标值的种类数目 40 | for i in range(cntOfRow): 41 | for j in range(cntOfCol): 42 | coef = self.coef_[i][j] 43 | #L1逻辑回归的权值系数不为0 44 | if coef != 0: 45 | idx = [j] 46 | #对应在L2逻辑回归中的权值系数 47 | coef1 = self.l2.coef_[i][j] 48 | for k in range(cntOfCol): 49 | coef2 = self.l2.coef_[i][k] 50 | #在L2逻辑回归中,权值系数之差小于设定的阈值,且在L1中对应的权值为0 51 | if abs(coef1-coef2) < self.threshold and j != k and self.coef_[i][k] == 0: 52 | idx.append(k) 53 | #计算这一类特征的权值系数均值 54 | mean = coef / len(idx) 55 | self.coef_[i][idx] = mean 56 | return self 57 | #带L1和L2惩罚项的逻辑回归作为基模型的特征选择 58 | #参数threshold为权值系数之差的阈值 59 | print(SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target)) 60 | -------------------------------------------------------------------------------- /基于树模型的特征选择法.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 19 14:58:19 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | from sklearn.feature_selection import SelectFromModel 8 | from 
/基于惩罚项的特征选择法.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 16:08:26 2017

@author: Lenovo-Y430p
"""
# Penalty-based feature selection.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

iris = load_iris()
# Feature selection with L1-penalized logistic regression as the base model
# (liblinear is the solver that supports the L1 penalty).
print(SelectFromModel(
    LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
).fit_transform(iris.data, iris.target))

class LR(LogisticRegression):
    def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Threshold below which two weights count as "close".
        self.threshold = threshold
        LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C,
                                    fit_intercept=fit_intercept,
                                    intercept_scaling=intercept_scaling,
                                    class_weight=class_weight,
                                    random_state=random_state, solver=solver,
                                    max_iter=max_iter, multi_class=multi_class,
                                    verbose=verbose, warm_start=warm_start,
                                    n_jobs=n_jobs)
        # Create an L2 logistic regression with the same parameters.
        self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept,
                                     intercept_scaling=intercept_scaling,
                                     class_weight=class_weight,
                                     random_state=random_state, solver=solver,
                                     max_iter=max_iter, multi_class=multi_class,
                                     verbose=verbose, warm_start=warm_start,
                                     n_jobs=n_jobs)

    def fit(self, X, y, sample_weight=None):
        # Train the L1 logistic regression.
        super(LR, self).fit(X, y, sample_weight=sample_weight)
        self.coef_old_ = self.coef_.copy()
        # Train the L2 logistic regression.
        self.l2.fit(X, y, sample_weight=sample_weight)

        cntOfRow, cntOfCol = self.coef_.shape
        # The number of rows of the weight matrix equals the number of classes.
        for i in range(cntOfRow):
            for j in range(cntOfCol):
                coef = self.coef_[i][j]
                # The L1 weight is nonzero:
                if coef != 0:
                    idx = [j]
                    # The corresponding weight in the L2 model.
                    coef1 = self.l2.coef_[i][j]
                    for k in range(cntOfCol):
                        coef2 = self.l2.coef_[i][k]
                        # Two L2 weights differ by less than the threshold,
                        # and the other feature's L1 weight is 0:
                        if abs(coef1 - coef2) < self.threshold and j != k and self.coef_[i][k] == 0:
                            idx.append(k)
                    # Spread the weight evenly over this group of features.
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self

# Feature selection with logistic regression carrying both L1 and L2
# penalties as the base model; threshold is the allowed difference
# between weights.
print(SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/基于树模型的特征选择法.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 14:58:19 2017

@author: Lenovo-Y430p
"""
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris

iris = load_iris()
# Feature selection with GBDT as the base model.
print(SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/对缺省值的处理.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 14:41:37 2017

@author: Lenovo-Y430p
"""
from numpy import nan
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the current equivalent.
from sklearn.impute import SimpleImputer

# Missing-value imputation; returns the data with missing values filled in.
# missing_values is the representation of missing entries (default NaN);
# strategy is the fill rule (default 'mean').
imp = SimpleImputer(missing_values=nan, strategy='mean')
X = [[1, 2], [nan, 3], [7, 6]]
Y = [[nan, 2], [6, nan], [7, 6]]
print(imp.fit(X))
print(imp.transform(X))
print(imp.transform(Y))
--------------------------------------------------------------------------------
/归一化.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 11:33:41 2017

@author: Lenovo-Y430p
"""
# Normalization.
from sklearn.datasets import load_iris
from sklearn.preprocessing import Normalizer
import numpy as np

# Normalize; returns the normalized data.
iris = load_iris()
print(Normalizer().fit_transform(iris.data))

# Normalization turns each sample into a unit vector. It works row by row,
# which gives a common scale when kernel functions compute similarities;
# the L2 norm is used here.
def guiyi():
    for i in range(np.shape(iris.data)[0]):
        # L2 norm of row i.
        temp = np.sqrt(np.sum(np.multiply(iris.data[i, :], iris.data[i, :])))
        print(temp)
        iris.data[i, :] /= temp
    print(iris.data)

def main():
    guiyi()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/数据变换.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 15:27:50 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import PolynomialFeatures

# Polynomial transformation; the degree parameter defaults to 2.
iris = load_iris()
print(PolynomialFeatures().fit_transform(iris.data))

from numpy import log1p
from sklearn.preprocessing import FunctionTransformer
# Custom transformation with the log function;
# the first argument is a univariate function.
print(FunctionTransformer(log1p).fit_transform(iris.data))
--------------------------------------------------------------------------------
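What the degree-2 polynomial expansion above actually produces, shown on a
tiny two-feature example (the columns are 1, a, b, a^2, a*b, b^2):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2.0, 3.0]])
print(PolynomialFeatures(degree=2).fit_transform(X))
# [[1. 2. 3. 4. 6. 9.]]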
/数据标准化.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 10:59:15 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import numpy as np

iris = load_iris()

# One-off transformation: the fitted statistics cannot be saved.
def bianhuan():
    X_scaled = preprocessing.scale(iris.data)
    print(X_scaled)
    print(X_scaled.mean(axis=0))  # means (all ~0 after scaling)
    print(X_scaled.std(axis=0))   # standard deviations (all 1)

# Fitted-scaler form: the statistics are kept and can be saved for reuse.
def baocuan():
    scaler = StandardScaler().fit(iris.data)
    print(scaler)
    # StandardScaler(copy=True, with_mean=True, with_std=True)
    print(scaler.mean_)
    print(scaler.scale_)  # per-feature std; the old std_ attribute was removed
    # Applying the scaler to the input data gives the same result as above.
    print(scaler.transform(iris.data))

if __name__ == '__main__':
    x = np.mean(iris.data, axis=0)  # axis=0 is column-wise, axis=1 row-wise
    # print(x)
    c = np.multiply((iris.data - x), (iris.data - x))
    b = c / len(iris.data)
    t = np.sum(b, axis=0)  # the builtin sum has no axis keyword; use np.sum
    t = np.sqrt(t)
    # print(t)
    # baocuan()
    print((iris.data - x) / t)  # broadcasting applies here
    # Standardization formula: x = (x - mean(x)) / std(x),
    # i.e. each feature becomes standard-normal-like.
--------------------------------------------------------------------------------
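Why the fitted-scaler form matters: the mean and scale learned on the training
data can be reapplied to unseen rows. A short sketch with a made-up sample:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(load_iris().data)
new_sample = np.array([[5.0, 3.5, 1.5, 0.2]])  # hypothetical measurement
print(scaler.transform(new_sample))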
/数据预处理与特征选择.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 12:49:48 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris

iris = load_iris()
iris.data
iris.target

# Standardization; returns the standardized data.
from sklearn.preprocessing import StandardScaler
StandardScaler().fit_transform(iris.data)

# Min-max (interval) scaling.
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler().fit_transform(iris.data)

# Normalization; returns the normalized data.
from sklearn.preprocessing import Normalizer
Normalizer().fit_transform(iris.data)

# Binarization with the threshold set to 3; returns the binarized data.
from sklearn.preprocessing import Binarizer
Binarizer(threshold=3).fit_transform(iris.data)

# One-hot encoding of the iris target values; returns the encoded data.
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))

# Missing-value imputation; returns the data with missing values filled in.
# missing_values is the representation of missing entries (default NaN);
# strategy is the fill rule (default mean).
from numpy import vstack, array, nan
from sklearn.impute import SimpleImputer  # replaces the removed sklearn.preprocessing.Imputer
SimpleImputer().fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))

# Polynomial transformation; the degree parameter defaults to 2.
from sklearn.preprocessing import PolynomialFeatures
PolynomialFeatures().fit_transform(iris.data)

# Custom transformation with the log function; the first argument is a
# univariate function.
from numpy import log1p
from sklearn.preprocessing import FunctionTransformer
FunctionTransformer(log1p).fit_transform(iris.data)

# Variance-threshold selection; threshold is the variance cut-off.
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(iris.data)

# Select the K best features; returns the selected features.
# The score function takes the feature matrix and the target vector and
# returns (scores, p-values), one pair per feature; here the score is the
# Pearson correlation. k is the number of features to keep.
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

def pearsonr_score(X, Y):
    result = array([pearsonr(x, Y) for x in X.T])
    return result[:, 0], result[:, 1]

SelectKBest(pearsonr_score, k=2).fit_transform(iris.data, iris.target)

# Select the K best features by the chi-square test.
from sklearn.feature_selection import chi2
SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

# MINE's API is not functional in style; mic() wraps it as a plain function
# returning a (score, p-value) pair, with the p-value fixed at 0.5.
from minepy import MINE

def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

def mic_score(X, Y):
    result = array([mic(x, Y) for x in X.T])
    return result[:, 0], result[:, 1]

SelectKBest(mic_score, k=2).fit_transform(iris.data, iris.target)

# Recursive feature elimination; estimator is the base model and
# n_features_to_select the number of features to keep.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)

# Feature selection with L1-penalized logistic regression as the base model
# (liblinear is the solver that supports the L1 penalty).
from sklearn.feature_selection import SelectFromModel
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(iris.data, iris.target)

class LR(LogisticRegression):
    def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Threshold below which two weights count as "close".
        self.threshold = threshold
        LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C,
                                    fit_intercept=fit_intercept,
                                    intercept_scaling=intercept_scaling,
                                    class_weight=class_weight,
                                    random_state=random_state, solver=solver,
                                    max_iter=max_iter, multi_class=multi_class,
                                    verbose=verbose, warm_start=warm_start,
                                    n_jobs=n_jobs)
        # Create an L2 logistic regression with the same parameters.
        self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept,
                                     intercept_scaling=intercept_scaling,
                                     class_weight=class_weight,
                                     random_state=random_state, solver=solver,
                                     max_iter=max_iter, multi_class=multi_class,
                                     verbose=verbose, warm_start=warm_start,
                                     n_jobs=n_jobs)

    def fit(self, X, y, sample_weight=None):
        # Train the L1 logistic regression.
        super(LR, self).fit(X, y, sample_weight=sample_weight)
        self.coef_old_ = self.coef_.copy()
        # Train the L2 logistic regression.
        self.l2.fit(X, y, sample_weight=sample_weight)

        cntOfRow, cntOfCol = self.coef_.shape
        # The number of rows of the weight matrix equals the number of classes.
        for i in range(cntOfRow):
            for j in range(cntOfCol):
                coef = self.coef_[i][j]
                # The L1 weight is nonzero:
                if coef != 0:
                    idx = [j]
                    # The corresponding weight in the L2 model.
                    coef1 = self.l2.coef_[i][j]
                    for k in range(cntOfCol):
                        coef2 = self.l2.coef_[i][k]
                        # Two L2 weights differ by less than the threshold,
                        # and the other feature's L1 weight is 0:
                        if abs(coef1 - coef2) < self.threshold and j != k and self.coef_[i][k] == 0:
                            idx.append(k)
                    # Spread the weight evenly over this group of features.
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self

# Feature selection with logistic regression carrying both L1 and L2
# penalties as the base model; threshold is the allowed difference
# between weights.
SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target)

# Feature selection with GBDT as the base model.
from sklearn.ensemble import GradientBoostingClassifier
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)

# Principal component analysis (PCA); returns the reduced data.
# n_components is the number of principal components to keep.
from sklearn.decomposition import PCA
PCA(n_components=2).fit_transform(iris.data)

# Linear discriminant analysis (LDA); returns the reduced data.
# n_components is the dimensionality after reduction.
# (sklearn.lda was removed; use sklearn.discriminant_analysis.)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
LDA(n_components=2).fit_transform(iris.data, iris.target)
--------------------------------------------------------------------------------
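The steps catalogued above can also be chained into a single estimator. A
sketch using sklearn's Pipeline (MinMaxScaler keeps the values non-negative,
which the chi2 score requires):

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

iris = load_iris()
pipe = Pipeline([
    ("scale", MinMaxScaler()),           # preprocessing
    ("select", SelectKBest(chi2, k=3)),  # feature selection
    ("pca", PCA(n_components=2)),        # dimensionality reduction
])
print(pipe.fit_transform(iris.data, iris.target))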
/方差判别特征相似程度.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 15:35:51 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
import numpy as np
from sklearn.feature_selection import VarianceThreshold

iris = load_iris()
# Variance-threshold selection; returns the selected features.
# The threshold parameter is the variance cut-off.
print(VarianceThreshold(threshold=3).fit_transform(iris.data))

def variance():
    # Per-column variance, the quantity VarianceThreshold compares against:
    # the mean squared deviation from each column's mean.
    x = np.mean(iris.data, axis=0)
    temp = np.multiply(iris.data - x, iris.data - x)
    temp1 = np.sum(temp, axis=0) / len(iris.data)
    print(temp1)

def main():
    variance()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/最大互信息系统.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 18:23:35 2017

@author: Lenovo-Y430p
"""
# 1.2 Mutual information and maximal information coefficient (MIC), range [0, 1]
# Raw mutual information is awkward to use directly for feature selection;
# MIC first finds an optimal discretization and then turns the mutual
# information into a score in [0, 1]. minepy provides the MIC implementation.
import numpy as np
from minepy import MINE

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print(m.mic())
--------------------------------------------------------------------------------
/特征提取.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 19:41:18 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_boston
# RandomizedLasso was removed in scikit-learn 0.21; this script needs an
# older release (<= 0.20).
from sklearn.linear_model import (LinearRegression, Ridge,
                                  Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from minepy import MINE

np.random.seed(0)

size = 750
X = np.random.uniform(0, 1, (size, 14))
# x0..x4 are useful for the output:
# the "Friedman #1" regression problem.
Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5)**2 +
     10 * X[:, 3] + 5 * X[:, 4] + np.random.normal(0, 1))
# Add 5 additional correlated variables: x9..x13 are noisy copies of x0..x4.
X[:, 9:] = X[:, :5] + np.random.normal(0, .025, (size, 5))
# Everything else is pure noise.
names = ["x%s" % i for i in range(0, 14)]
ranks = {}

def rank_to_dict(ranks, names, order=1):
    # Rescale the raw scores to [0, 1] so the methods are comparable.
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
    ranks = list(map(lambda x: round(x, 2), ranks))
    return dict(zip(names, ranks))

lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["reg"] = rank_to_dict(np.abs(lr.coef_), names)

ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

lasso = Lasso(alpha=0.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

# Stop the search when 5 features are left (they will get equal scores).
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X, Y)
ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

rf = RandomForestRegressor()
rf.fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

f, pval = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

'''
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:, i], Y)
    m = mine.mic()
    mic_scores.append(m)

ranks["MIC"] = rank_to_dict(mic_scores, names)
'''

# Average each feature's score across all methods.
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name]
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")

print("\t%s" % "\t".join(methods))
for name in names:
    print("%s\t%s" % (name, "\t".join(map(str,
          [ranks[method][name] for method in methods]))))
--------------------------------------------------------------------------------
"\t".join(methods)) 88 | for name in names: 89 | print ("%s\t%s" % (name, "\t".join(map(str, 90 | [ranks[method][name] for method in methods])))) 91 | -------------------------------------------------------------------------------- /特征的相关系数法.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 30 15:49:15 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | from sklearn.feature_selection import SelectKBest 8 | from scipy.stats import pearsonr 9 | from sklearn.datasets import load_iris 10 | from numpy import array 11 | iris=load_iris() 12 | #选择K个最好的特征,返回选择特征后的数据 13 | #第一个参数为计算评估特征是否好的函数,该函数输入特征矩阵和目标向量,输出二元组(评分,P值)的数组,数组第i项为第i个特征的评分和P值。在此定义为计算相关系数 14 | #参数k为选择的特征个数 15 | print(SelectKBest(lambda X, Y: array(map(lambda x:pearsonr(x, Y), X.T)).T, k=2).fit_transform(iris.data, iris.target)) 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /皮尔逊相关系数.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 30 18:08:48 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | #皮尔逊相关系数(来处理线性关系) 8 | #1.1 Pearson相关系数 (Pearson Correlation) [-1,1] 9 | #理解特征和响应变量之间关系的方法,该方法衡量的是变量之间的线性相关性,可以理解为两个向量之间的夹角 10 | #之所以能度量相似程度就是因为时距离均值的大小,同正同负表征了变化的方向和大小。但因为离散程度不同 11 | #需要加入对标准差的除法 12 | ''' 13 | 统计学上规定的P值意义见下表 14 | P值 碰巧的概率 对无效假设 统计意义 15 | P>0.05 碰巧出现的可能性大于5% 不能否定无效假设 两组差别无显著意义 16 | P<0.05 碰巧出现的可能性小于5% 可以否定无效假设 两组差别有显著意义 17 | P <0.01 碰巧出现的可能性小于1% 可以否定无效假设 两者差别有非常显著意义 18 | ''' 19 | import numpy as np 20 | from scipy.stats import pearsonr #从scipy中引入pearsonr 21 | from sklearn.datasets import load_iris 22 | from numpy import * 23 | iris=load_iris() 24 | m=shape(iris.data)[1] 25 | for i in range(m): 26 | print ("i", pearsonr(iris.data[:,i], iris.target)) 27 | #print ("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))) 28 | #明显缺陷:作为特征排序机制,他只对线性关系敏感.即便两个变量具有一一对应的关系,Pearson相关性也可能会接近0 29 | a = np.random.uniform(-1, 1, 100000) #uniform(low,high,size) 随机数 30 | print (pearsonr(a, a**2)) 31 | #返回两个值评分和p值,p值越大越坏 32 | def pearson(x,y): 33 | xmean=mean(x,axis=0) 34 | ymean=mean(y,axis=0) 35 | covxy=dot((x-xmean).T,y-ymean) 36 | xvar=dot((x-xmean).T,x-xmean) 37 | yvar=dot((y-ymean).T,y-ymean) 38 | p=covxy/sqrt(xvar*yvar) 39 | print(p) 40 | def main(): 41 | iris=load_iris() 42 | m=shape(iris.data)[1] 43 | for i in range(m): 44 | pearson(iris.data[:,i], iris.target) 45 | if __name__=='__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /距离相关系数.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 19 20:52:11 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | #计算欧几里德距离: 8 | def euclidean(p,q): 9 | #如果两数据集数目不同,计算两者之间都对应有的数 10 | same = 0 11 | for i in p: 12 | if i in q: 13 | same +=1 14 | 15 | #计算欧几里德距离,并将其标准化 16 | e = sum([(p[i] - q[i])**2 for i in range(same)]) 17 | return 1/(1+e**.5) 18 | --------------------------------------------------------------------------------