├── PCA 与LDA.py
├── README.md
├── mapreduce .py
├── 二值化.py
├── 二元变量的相似度计算.py
├── 互信息法.py
├── 区间缩放.py
├── 单变量选择.py
├── 卡方检验.py
├── 基于惩罚项的特征选择法.py
├── 基于树模型的特征选择法.py
├── 对缺省值的处理.py
├── 归一化.py
├── 数据变换.py
├── 数据标准化.py
├── 数据预处理与特征选择.py
├── 方差判别特征相似程度.py
├── 最大互信息系统.py
├── 特征提取.py
├── 特征的相关系数法.py
├── 皮尔逊相关系数.py
└── 距离相关系数.py

/PCA 与LDA.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 16:13:57 2017

@author: Lenovo-Y430p
"""
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

iris = load_iris()
# Principal component analysis (PCA); returns the reduced data.
# n_components is the number of principal components to keep.
print(PCA(n_components=2).fit_transform(iris.data))

# sklearn.lda was removed; LDA now lives in sklearn.discriminant_analysis.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Linear discriminant analysis (LDA); returns the reduced data.
# n_components is the dimensionality after reduction; LDA is supervised,
# so it also takes the target.
print(LDA(n_components=2).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# data-analysis
A feature-extraction and feature-scoring project built on self-written
functions (missing-value handling, univariate correlation analysis,
feature scoring, dimensionality reduction).
--------------------------------------------------------------------------------
/mapreduce .py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 18 09:59:37 2017

@author: Lenovo-Y430p
"""
import sys

# ---- mapper ----
# Read from the standard input stream.
for line in sys.stdin:
    # Strip surrounding whitespace.
    line = line.strip()
    # Split the line into a list of words.
    words = line.split()
    # Emit one key/value pair per word.
    for word in words:
        # Map output: key is the word, value is 1. The shuffle step that
        # follows sorts by key before the reducer runs; the map and shuffle
        # stages execute on the local node.
        print('%s\t%s' % (word, 1))

# ---- reducer ----
current_word = None
current_count = 0
word = None
# Input comes from STDIN (the sorted map output).
for line in sys.stdin:
    # Remove leading and trailing whitespace.
    line = line.strip()

    # Parse the input we got from mapper.py.
    word, count = line.split('\t', 1)

    # Convert count (currently a string) to int.
    try:
        count = int(count)
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line.
        continue

    # This IF-switch only works because Hadoop sorts map output
    # by key (here: word) before it is passed to the reducer.
    if current_word == word:
        current_count += count
    else:
        if current_word:
            # Write result to STDOUT.
            print('%s\t%s' % (current_word, current_count))
        current_count = count
        current_word = word

# Do not forget to output the last word if needed!
if current_word == word:
    print('%s\t%s' % (current_word, current_count))
--------------------------------------------------------------------------------
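As written, the mapper and reducer loops share one file, so the second loop
would find stdin already exhausted; in Hadoop Streaming they run as separate
scripts. A minimal local simulation of the map-shuffle-reduce flow, assuming
the two loops are split into hypothetical mapper.py and reducer.py files and
an input.txt exists:

import subprocess
# sort -k1,1 stands in for Hadoop's shuffle, which groups identical keys so
# the reducer's current_word comparison sees each word contiguously.
cmd = "cat input.txt | python mapper.py | sort -k1,1 | python reducer.py"
print(subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout)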
/二值化.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 14:22:12 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import Binarizer
import numpy as np

iris = load_iris()
# Binarization with the threshold set to 3; returns the binarized data.
# print(Binarizer(threshold=3).fit_transform(iris.data))

def Binarizer1(threshold):
    # Hand-rolled equivalent of sklearn's Binarizer: values >= threshold
    # become 1, everything else becomes 0 (modifies iris.data in place).
    m = np.shape(iris.data)[0]
    n = np.shape(iris.data)[1]
    for i in range(m):
        for j in range(n):
            if iris.data[i, j] >= threshold:
                iris.data[i, j] = 1
            else:
                iris.data[i, j] = 0
    return iris.data

def main():
    k = Binarizer1(3)
    print(k)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/二元变量的相似度计算.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 20:48:21 2017

@author: Lenovo-Y430p
"""
# Compute the Jaccard coefficient.
'''
When the data are binary variables there are only two states, 0 and 1,
and the usual similarity measures no longer apply. The Jaccard coefficient
measures the similarity of two sets of binary (or multi-valued) attributes:

    J(p, q) = |p ∩ q| / |p ∪ q|
'''
def jaccard(p, q):
    # Intersection: elements of p that also appear in q.
    c = [v for v in p if v in q]
    # |p ∪ q| = |p| + |q| - |p ∩ q|
    return float(len(c)) / (len(p) + len(q) - len(c))

# Note: both inputs must be deduplicated before use.
# Test it on a small example:
p = ['shirt', 'shoes', 'pants', 'socks']
q = ['shirt', 'shoes']
print(jaccard(p, q))  # Result: 0.5
--------------------------------------------------------------------------------
/互信息法.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 16:07:34 2017

@author: Lenovo-Y430p
"""
from sklearn.feature_selection import SelectKBest
from minepy import MINE
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()

# MINE's API is not functional in style; mic() wraps it as a plain function
# returning a (score, p-value) pair, with the p-value fixed at 0.5.
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

# Score function for SelectKBest: per-feature MIC scores and p-values.
def mic_score(X, Y):
    result = np.array([mic(x, Y) for x in X.T])
    return result[:, 0], result[:, 1]

# Select the K best features and return the reduced data.
print(SelectKBest(mic_score, k=2).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/区间缩放.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 11:03:39 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Min-max (interval) scaling.
iris = load_iris()
# The one-line sklearn version:
# print(MinMaxScaler().fit_transform(iris.data))

# Hand-rolled implementation: collect each column's max and min.
def minmax():
    m = np.shape(iris.data)[1]
    temp1 = []
    temp2 = []
    for i in range(m):
        t = np.max(iris.data[:, i])
        t1 = np.min(iris.data[:, i])
        temp1.append(float('%.2f' % t))   # the precision can be fixed while converting to float
        temp2.append(float('%.2f' % t1))
    return np.mat(temp1), np.mat(temp2)

def main():
    maxvalue, minvalue = minmax()
    offset = maxvalue - minvalue
    # Per-column rescaling: (x - min) / (max - min).
    temp = (iris.data - minvalue) / offset
    print(temp)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
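The same rescaling can be written without explicit loops. A minimal vectorized
sketch of the (x - min) / (max - min) formula used above:

import numpy as np
from sklearn.datasets import load_iris

data = load_iris().data
# Per-column min-max scaling via broadcasting, matching MinMaxScaler().
scaled = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
print(scaled)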
/单变量选择.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 17:00:42 2017

@author: Lenovo-Y430p
"""
# 1 Univariate feature selection

# 1.1 Pearson correlation, range [-1, 1]
# A way to understand the relation between a feature and the response
# variable; it measures the linear correlation between variables.

import numpy as np
from scipy.stats import pearsonr  # pearsonr comes from scipy
from sklearn.datasets import load_iris

iris = load_iris()
# pearsonr expects 1-D inputs, so score each feature column separately.
for i in range(iris.data.shape[1]):
    print("feature %d" % i, pearsonr(iris.data[:, i], iris.target))
# print("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size)))

# Obvious drawback: as a feature-ranking mechanism it is only sensitive to
# linear relations; even when two variables have a perfect one-to-one
# relation, the Pearson correlation can be close to 0.
a = np.random.uniform(-1, 1, 100000)  # uniform(low, high, size) random numbers
print(pearsonr(a, a**2)[0])

# 1.2 Mutual information and maximal information coefficient (MIC), range [0, 1]
# Raw mutual information is awkward to use directly for feature selection;
# MIC first finds an optimal discretization and then turns the mutual
# information into a score in [0, 1]. minepy provides the MIC implementation.
from minepy import MINE

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print(m.mic())

# 1.3 Distance correlation, range [0, 1]
# Designed to overcome the weakness of the Pearson correlation: in the
# x vs x^2 example, a Pearson correlation of 0 does not let us conclude that
# the two variables are independent (the relation may be nonlinear), but a
# distance correlation of 0 does imply independence.
def dist(x, y):
    # 1-D only: matrix of pairwise absolute differences.
    return np.abs(x[:, None] - y)

def d_n(x):
    # Double-center the distance matrix.
    d = dist(x, x)
    dn = d - d.mean(0) - d.mean(1)[:, None] + d.mean()
    return dn

def dcov_all(x, y):
    dnx = d_n(x)
    dny = d_n(y)

    denom = np.prod(dnx.shape)
    dc = (dnx * dny).sum() / denom   # distance covariance statistic
    dvx = (dnx**2).sum() / denom     # distance variance of x
    dvy = (dny**2).sum() / denom     # distance variance of y
    dr = dc / (np.sqrt(dvx) * np.sqrt(dvy))  # distance correlation statistic
    return dc, dr, dvx, dvy

import matplotlib.pyplot as plt

fig = plt.figure()
for case in range(1, 5):
    np.random.seed(9854673)
    x = np.linspace(-1, 1, 501)
    if case == 1:
        y = -x**2 + 0.2 * np.random.rand(len(x))
    elif case == 2:
        y = np.cos(x * 2 * np.pi) + 0.1 * np.random.rand(len(x))
    elif case == 3:
        x = np.sin(x * 2 * np.pi) + 0.0 * np.random.rand(len(x))  # circle
    elif case == 4:
        x = np.sin(x * 1.5 * np.pi) + 0.1 * np.random.rand(len(x))  # bretzel
    dc, dr, dvx, dvy = dcov_all(x, y)
    print(dc, dr, dvx, dvy)

    ax = fig.add_subplot(2, 2, case)
    # ax.set_xlim(-1, 1)
    ax.plot(x, y, '.')
    yl = ax.get_ylim()
    ax.text(-0.95, yl[0] + 0.9 * np.diff(yl), 'dr=%4.2f' % dr)

plt.show()

# 1.4 Model-based ranking
# Fit the machine-learning model you actually intend to use on each single
# feature against the response. Linear relations: equivalent to Pearson;
# nonlinear relations: tree-based methods (decision tree, random forest).
# sklearn.cross_validation was renamed to sklearn.model_selection.
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

# Load the Boston housing dataset as an example
# (note: load_boston was removed in scikit-learn 1.2).
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]
# Keep the trees fairly shallow, and use cross-validation.
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
scores = []
for i in range(X.shape[1]):
    score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                            cv=ShuffleSplit(n_splits=3, test_size=0.3))
    scores.append((round(np.mean(score), 3), names[i]))
print(sorted(scores, reverse=True))
--------------------------------------------------------------------------------
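A quick numeric contrast between sections 1.1 and 1.3, assuming the pearsonr
import and the dcov_all function defined in the file above are in scope:

# For y = x^2 the Pearson r is ~0, while the distance correlation statistic
# dr stays clearly away from 0, flagging the nonlinear dependence.
a = np.random.uniform(-1, 1, 1000)
dc, dr, dvx, dvy = dcov_all(a, a**2)
print(pearsonr(a, a**2)[0], dr)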
-*- 2 | """ 3 | Created on Fri Jun 30 16:03:18 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | from sklearn.feature_selection import SelectKBest 8 | from sklearn.feature_selection import chi2 9 | from sklearn.datasets import load_iris 10 | iris=load_iris() 11 | #选择K个最好的特征,返回选择特征后的数据 12 | ''' 13 | 卡方检验就是统计样本的实际观测值与理论推断值之间的偏离程度, 14 | 实际观测值与理论推断值之间的偏离程度就决定卡方值的大小, 15 | 卡方值越大,越不符合;卡方值越小,偏差越小,越趋于符合, 16 | 若两个值完全相等时,卡方值就为0,表明理论值完全符合。 17 | 检验样本是否符合正太分布 18 | ''' 19 | #文本分析,和筛选异常用户 20 | print(SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)) 21 | 22 | -------------------------------------------------------------------------------- /基于惩罚项的特征选择法.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 30 16:08:26 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | #基于惩罚项的特征选择法 8 | from sklearn.feature_selection import SelectFromModel 9 | from sklearn.linear_model import LogisticRegression 10 | from numpy import array 11 | from sklearn.datasets import load_iris 12 | iris=load_iris() 13 | #带L1惩罚项的逻辑回归作为基模型的特征选择 14 | print(SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(iris.data, iris.target)) 15 | 16 | class LR(LogisticRegression): 17 | def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0, 18 | fit_intercept=True, intercept_scaling=1, class_weight=None, 19 | random_state=None, solver='liblinear', max_iter=100, 20 | multi_class='ovr', verbose=0, warm_start=False, n_jobs=1): 21 | 22 | #权值相近的阈值 23 | self.threshold = threshold 24 | LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C, 25 | fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, 26 | random_state=random_state, solver=solver, max_iter=max_iter, 27 | multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs) 28 | #使用同样的参数创建L2逻辑回归 29 | self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight = class_weight, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs) 30 | 31 | def fit(self, X, y, sample_weight=None): 32 | #训练L1逻辑回归 33 | super(LR, self).fit(X, y, sample_weight=sample_weight) 34 | self.coef_old_ = self.coef_.copy() 35 | #训练L2逻辑回归 36 | self.l2.fit(X, y, sample_weight=sample_weight) 37 | 38 | cntOfRow, cntOfCol = self.coef_.shape 39 | #权值系数矩阵的行数对应目标值的种类数目 40 | for i in range(cntOfRow): 41 | for j in range(cntOfCol): 42 | coef = self.coef_[i][j] 43 | #L1逻辑回归的权值系数不为0 44 | if coef != 0: 45 | idx = [j] 46 | #对应在L2逻辑回归中的权值系数 47 | coef1 = self.l2.coef_[i][j] 48 | for k in range(cntOfCol): 49 | coef2 = self.l2.coef_[i][k] 50 | #在L2逻辑回归中,权值系数之差小于设定的阈值,且在L1中对应的权值为0 51 | if abs(coef1-coef2) < self.threshold and j != k and self.coef_[i][k] == 0: 52 | idx.append(k) 53 | #计算这一类特征的权值系数均值 54 | mean = coef / len(idx) 55 | self.coef_[i][idx] = mean 56 | return self 57 | #带L1和L2惩罚项的逻辑回归作为基模型的特征选择 58 | #参数threshold为权值系数之差的阈值 59 | print(SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target)) 60 | -------------------------------------------------------------------------------- /基于树模型的特征选择法.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 19 14:58:19 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | from sklearn.feature_selection import SelectFromModel 8 | from 
/基于惩罚项的特征选择法.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 16:08:26 2017

@author: Lenovo-Y430p
"""
# Penalty-based feature selection.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

iris = load_iris()
# Feature selection with L1-penalized logistic regression as the base model
# (liblinear is the solver that supports the L1 penalty).
print(SelectFromModel(
    LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
).fit_transform(iris.data, iris.target))

class LR(LogisticRegression):
    def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Threshold below which two weights count as "close".
        self.threshold = threshold
        LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C,
                                    fit_intercept=fit_intercept,
                                    intercept_scaling=intercept_scaling,
                                    class_weight=class_weight,
                                    random_state=random_state, solver=solver,
                                    max_iter=max_iter, multi_class=multi_class,
                                    verbose=verbose, warm_start=warm_start,
                                    n_jobs=n_jobs)
        # Create an L2 logistic regression with the same parameters.
        self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept,
                                     intercept_scaling=intercept_scaling,
                                     class_weight=class_weight,
                                     random_state=random_state, solver=solver,
                                     max_iter=max_iter, multi_class=multi_class,
                                     verbose=verbose, warm_start=warm_start,
                                     n_jobs=n_jobs)

    def fit(self, X, y, sample_weight=None):
        # Train the L1 logistic regression.
        super(LR, self).fit(X, y, sample_weight=sample_weight)
        self.coef_old_ = self.coef_.copy()
        # Train the L2 logistic regression.
        self.l2.fit(X, y, sample_weight=sample_weight)

        cntOfRow, cntOfCol = self.coef_.shape
        # The number of rows of the weight matrix equals the number of classes.
        for i in range(cntOfRow):
            for j in range(cntOfCol):
                coef = self.coef_[i][j]
                # The L1 weight is nonzero:
                if coef != 0:
                    idx = [j]
                    # The corresponding weight in the L2 model.
                    coef1 = self.l2.coef_[i][j]
                    for k in range(cntOfCol):
                        coef2 = self.l2.coef_[i][k]
                        # Two L2 weights differ by less than the threshold,
                        # and the other feature's L1 weight is 0:
                        if abs(coef1 - coef2) < self.threshold and j != k and self.coef_[i][k] == 0:
                            idx.append(k)
                    # Spread the weight evenly over this group of features.
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self

# Feature selection with logistic regression carrying both L1 and L2
# penalties as the base model; threshold is the allowed difference
# between weights.
print(SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/基于树模型的特征选择法.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 14:58:19 2017

@author: Lenovo-Y430p
"""
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris

iris = load_iris()
# Feature selection with GBDT as the base model.
print(SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target))
--------------------------------------------------------------------------------
/对缺省值的处理.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 14:41:37 2017

@author: Lenovo-Y430p
"""
from numpy import nan
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the current equivalent.
from sklearn.impute import SimpleImputer

# Missing-value imputation; returns the data with missing values filled in.
# missing_values is the representation of missing entries (default NaN);
# strategy is the fill rule (default 'mean').
imp = SimpleImputer(missing_values=nan, strategy='mean')
X = [[1, 2], [nan, 3], [7, 6]]
Y = [[nan, 2], [6, nan], [7, 6]]
print(imp.fit(X))
print(imp.transform(X))
print(imp.transform(Y))
--------------------------------------------------------------------------------
/归一化.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 11:33:41 2017

@author: Lenovo-Y430p
"""
# Normalization.
from sklearn.datasets import load_iris
from sklearn.preprocessing import Normalizer
import numpy as np

# Normalize; returns the normalized data.
iris = load_iris()
print(Normalizer().fit_transform(iris.data))

# Normalization turns each sample into a unit vector. It works row by row,
# which gives a common scale when kernel functions compute similarities;
# the L2 norm is used here.
def guiyi():
    for i in range(np.shape(iris.data)[0]):
        # L2 norm of row i.
        temp = np.sqrt(np.sum(np.multiply(iris.data[i, :], iris.data[i, :])))
        print(temp)
        iris.data[i, :] /= temp
    print(iris.data)

def main():
    guiyi()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/数据变换.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 15:27:50 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import PolynomialFeatures

# Polynomial transformation; the degree parameter defaults to 2.
iris = load_iris()
print(PolynomialFeatures().fit_transform(iris.data))

from numpy import log1p
from sklearn.preprocessing import FunctionTransformer
# Custom transformation with the log function;
# the first argument is a univariate function.
print(FunctionTransformer(log1p).fit_transform(iris.data))
--------------------------------------------------------------------------------
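What the degree-2 polynomial expansion above actually produces, shown on a
tiny two-feature example (the columns are 1, a, b, a^2, a*b, b^2):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2.0, 3.0]])
print(PolynomialFeatures(degree=2).fit_transform(X))
# [[1. 2. 3. 4. 6. 9.]]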
/数据标准化.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 10:59:15 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import numpy as np

iris = load_iris()

# One-off transformation: the fitted statistics cannot be saved.
def bianhuan():
    X_scaled = preprocessing.scale(iris.data)
    print(X_scaled)
    print(X_scaled.mean(axis=0))  # means (all ~0 after scaling)
    print(X_scaled.std(axis=0))   # standard deviations (all 1)

# Fitted-scaler form: the statistics are kept and can be saved for reuse.
def baocuan():
    scaler = StandardScaler().fit(iris.data)
    print(scaler)
    # StandardScaler(copy=True, with_mean=True, with_std=True)
    print(scaler.mean_)
    print(scaler.scale_)  # per-feature std; the old std_ attribute was removed
    # Applying the scaler to the input data gives the same result as above.
    print(scaler.transform(iris.data))

if __name__ == '__main__':
    x = np.mean(iris.data, axis=0)  # axis=0 is column-wise, axis=1 row-wise
    # print(x)
    c = np.multiply((iris.data - x), (iris.data - x))
    b = c / len(iris.data)
    t = np.sum(b, axis=0)  # the builtin sum has no axis keyword; use np.sum
    t = np.sqrt(t)
    # print(t)
    # baocuan()
    print((iris.data - x) / t)  # broadcasting applies here
    # Standardization formula: x = (x - mean(x)) / std(x),
    # i.e. each feature becomes standard-normal-like.
--------------------------------------------------------------------------------
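Why the fitted-scaler form matters: the mean and scale learned on the training
data can be reapplied to unseen rows. A short sketch with a made-up sample:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(load_iris().data)
new_sample = np.array([[5.0, 3.5, 1.5, 0.2]])  # hypothetical measurement
print(scaler.transform(new_sample))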
/数据预处理与特征选择.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 12:49:48 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris

iris = load_iris()
iris.data
iris.target

# Standardization; returns the standardized data.
from sklearn.preprocessing import StandardScaler
StandardScaler().fit_transform(iris.data)

# Min-max (interval) scaling.
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler().fit_transform(iris.data)

# Normalization; returns the normalized data.
from sklearn.preprocessing import Normalizer
Normalizer().fit_transform(iris.data)

# Binarization with the threshold set to 3; returns the binarized data.
from sklearn.preprocessing import Binarizer
Binarizer(threshold=3).fit_transform(iris.data)

# One-hot encoding of the iris target values; returns the encoded data.
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))

# Missing-value imputation; returns the data with missing values filled in.
# missing_values is the representation of missing entries (default NaN);
# strategy is the fill rule (default mean).
from numpy import vstack, array, nan
from sklearn.impute import SimpleImputer  # replaces the removed sklearn.preprocessing.Imputer
SimpleImputer().fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))

# Polynomial transformation; the degree parameter defaults to 2.
from sklearn.preprocessing import PolynomialFeatures
PolynomialFeatures().fit_transform(iris.data)

# Custom transformation with the log function; the first argument is a
# univariate function.
from numpy import log1p
from sklearn.preprocessing import FunctionTransformer
FunctionTransformer(log1p).fit_transform(iris.data)

# Variance-threshold selection; threshold is the variance cut-off.
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(iris.data)

# Select the K best features; returns the selected features.
# The score function takes the feature matrix and the target vector and
# returns (scores, p-values), one pair per feature; here the score is the
# Pearson correlation. k is the number of features to keep.
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

def pearsonr_score(X, Y):
    result = array([pearsonr(x, Y) for x in X.T])
    return result[:, 0], result[:, 1]

SelectKBest(pearsonr_score, k=2).fit_transform(iris.data, iris.target)

# Select the K best features by the chi-square test.
from sklearn.feature_selection import chi2
SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

# MINE's API is not functional in style; mic() wraps it as a plain function
# returning a (score, p-value) pair, with the p-value fixed at 0.5.
from minepy import MINE

def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

def mic_score(X, Y):
    result = array([mic(x, Y) for x in X.T])
    return result[:, 0], result[:, 1]

SelectKBest(mic_score, k=2).fit_transform(iris.data, iris.target)

# Recursive feature elimination; estimator is the base model and
# n_features_to_select the number of features to keep.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)

# Feature selection with L1-penalized logistic regression as the base model
# (liblinear is the solver that supports the L1 penalty).
from sklearn.feature_selection import SelectFromModel
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(iris.data, iris.target)

class LR(LogisticRegression):
    def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Threshold below which two weights count as "close".
        self.threshold = threshold
        LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C,
                                    fit_intercept=fit_intercept,
                                    intercept_scaling=intercept_scaling,
                                    class_weight=class_weight,
                                    random_state=random_state, solver=solver,
                                    max_iter=max_iter, multi_class=multi_class,
                                    verbose=verbose, warm_start=warm_start,
                                    n_jobs=n_jobs)
        # Create an L2 logistic regression with the same parameters.
        self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept,
                                     intercept_scaling=intercept_scaling,
                                     class_weight=class_weight,
                                     random_state=random_state, solver=solver,
                                     max_iter=max_iter, multi_class=multi_class,
                                     verbose=verbose, warm_start=warm_start,
                                     n_jobs=n_jobs)

    def fit(self, X, y, sample_weight=None):
        # Train the L1 logistic regression.
        super(LR, self).fit(X, y, sample_weight=sample_weight)
        self.coef_old_ = self.coef_.copy()
        # Train the L2 logistic regression.
        self.l2.fit(X, y, sample_weight=sample_weight)

        cntOfRow, cntOfCol = self.coef_.shape
        # The number of rows of the weight matrix equals the number of classes.
        for i in range(cntOfRow):
            for j in range(cntOfCol):
                coef = self.coef_[i][j]
                # The L1 weight is nonzero:
                if coef != 0:
                    idx = [j]
                    # The corresponding weight in the L2 model.
                    coef1 = self.l2.coef_[i][j]
                    for k in range(cntOfCol):
                        coef2 = self.l2.coef_[i][k]
                        # Two L2 weights differ by less than the threshold,
                        # and the other feature's L1 weight is 0:
                        if abs(coef1 - coef2) < self.threshold and j != k and self.coef_[i][k] == 0:
                            idx.append(k)
                    # Spread the weight evenly over this group of features.
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self

# Feature selection with logistic regression carrying both L1 and L2
# penalties as the base model; threshold is the allowed difference
# between weights.
SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target)

# Feature selection with GBDT as the base model.
from sklearn.ensemble import GradientBoostingClassifier
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)

# Principal component analysis (PCA); returns the reduced data.
# n_components is the number of principal components to keep.
from sklearn.decomposition import PCA
PCA(n_components=2).fit_transform(iris.data)

# Linear discriminant analysis (LDA); returns the reduced data.
# n_components is the dimensionality after reduction.
# (sklearn.lda was removed; use sklearn.discriminant_analysis.)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
LDA(n_components=2).fit_transform(iris.data, iris.target)
--------------------------------------------------------------------------------
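The steps catalogued above can also be chained into a single estimator. A
sketch using sklearn's Pipeline (MinMaxScaler keeps the values non-negative,
which the chi2 score requires):

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

iris = load_iris()
pipe = Pipeline([
    ("scale", MinMaxScaler()),           # preprocessing
    ("select", SelectKBest(chi2, k=3)),  # feature selection
    ("pca", PCA(n_components=2)),        # dimensionality reduction
])
print(pipe.fit_transform(iris.data, iris.target))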
/方差判别特征相似程度.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 15:35:51 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_iris
import numpy as np
from sklearn.feature_selection import VarianceThreshold

iris = load_iris()
# Variance-threshold selection; returns the selected features.
# The threshold parameter is the variance cut-off.
print(VarianceThreshold(threshold=3).fit_transform(iris.data))

def variance():
    # Per-column variance, the quantity VarianceThreshold compares against:
    # the mean squared deviation from each column's mean.
    x = np.mean(iris.data, axis=0)
    temp = np.multiply(iris.data - x, iris.data - x)
    temp1 = np.sum(temp, axis=0) / len(iris.data)
    print(temp1)

def main():
    variance()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/最大互信息系统.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 30 18:23:35 2017

@author: Lenovo-Y430p
"""
# 1.2 Mutual information and maximal information coefficient (MIC), range [0, 1]
# Raw mutual information is awkward to use directly for feature selection;
# MIC first finds an optimal discretization and then turns the mutual
# information into a score in [0, 1]. minepy provides the MIC implementation.
import numpy as np
from minepy import MINE

m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print(m.mic())
--------------------------------------------------------------------------------
/特征提取.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 19:41:18 2017

@author: Lenovo-Y430p
"""
from sklearn.datasets import load_boston
# RandomizedLasso was removed in scikit-learn 0.21; this script needs an
# older release (<= 0.20).
from sklearn.linear_model import (LinearRegression, Ridge,
                                  Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from minepy import MINE

np.random.seed(0)

size = 750
X = np.random.uniform(0, 1, (size, 14))
# x0..x4 are useful for the output:
# the "Friedman #1" regression problem.
Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5)**2 +
     10 * X[:, 3] + 5 * X[:, 4] + np.random.normal(0, 1))
# Add 5 additional correlated variables: x9..x13 are noisy copies of x0..x4.
X[:, 9:] = X[:, :5] + np.random.normal(0, .025, (size, 5))
# Everything else is pure noise.
names = ["x%s" % i for i in range(0, 14)]
ranks = {}

def rank_to_dict(ranks, names, order=1):
    # Rescale the raw scores to [0, 1] so the methods are comparable.
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
    ranks = list(map(lambda x: round(x, 2), ranks))
    return dict(zip(names, ranks))

lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["reg"] = rank_to_dict(np.abs(lr.coef_), names)

ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

lasso = Lasso(alpha=0.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

# Stop the search when 5 features are left (they will get equal scores).
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X, Y)
ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

rf = RandomForestRegressor()
rf.fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

f, pval = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

'''
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:, i], Y)
    m = mine.mic()
    mic_scores.append(m)

ranks["MIC"] = rank_to_dict(mic_scores, names)
'''

# Average each feature's score across all methods.
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name]
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")

print("\t%s" % "\t".join(methods))
for name in names:
    print("%s\t%s" % (name, "\t".join(map(str,
          [ranks[method][name] for method in methods]))))
--------------------------------------------------------------------------------
"\t".join(methods)) 88 | for name in names: 89 | print ("%s\t%s" % (name, "\t".join(map(str, 90 | [ranks[method][name] for method in methods])))) 91 | -------------------------------------------------------------------------------- /特征的相关系数法.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 30 15:49:15 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | from sklearn.feature_selection import SelectKBest 8 | from scipy.stats import pearsonr 9 | from sklearn.datasets import load_iris 10 | from numpy import array 11 | iris=load_iris() 12 | #选择K个最好的特征,返回选择特征后的数据 13 | #第一个参数为计算评估特征是否好的函数,该函数输入特征矩阵和目标向量,输出二元组(评分,P值)的数组,数组第i项为第i个特征的评分和P值。在此定义为计算相关系数 14 | #参数k为选择的特征个数 15 | print(SelectKBest(lambda X, Y: array(map(lambda x:pearsonr(x, Y), X.T)).T, k=2).fit_transform(iris.data, iris.target)) 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /皮尔逊相关系数.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 30 18:08:48 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | #皮尔逊相关系数(来处理线性关系) 8 | #1.1 Pearson相关系数 (Pearson Correlation) [-1,1] 9 | #理解特征和响应变量之间关系的方法,该方法衡量的是变量之间的线性相关性,可以理解为两个向量之间的夹角 10 | #之所以能度量相似程度就是因为时距离均值的大小,同正同负表征了变化的方向和大小。但因为离散程度不同 11 | #需要加入对标准差的除法 12 | ''' 13 | 统计学上规定的P值意义见下表 14 | P值 碰巧的概率 对无效假设 统计意义 15 | P>0.05 碰巧出现的可能性大于5% 不能否定无效假设 两组差别无显著意义 16 | P<0.05 碰巧出现的可能性小于5% 可以否定无效假设 两组差别有显著意义 17 | P <0.01 碰巧出现的可能性小于1% 可以否定无效假设 两者差别有非常显著意义 18 | ''' 19 | import numpy as np 20 | from scipy.stats import pearsonr #从scipy中引入pearsonr 21 | from sklearn.datasets import load_iris 22 | from numpy import * 23 | iris=load_iris() 24 | m=shape(iris.data)[1] 25 | for i in range(m): 26 | print ("i", pearsonr(iris.data[:,i], iris.target)) 27 | #print ("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))) 28 | #明显缺陷:作为特征排序机制,他只对线性关系敏感.即便两个变量具有一一对应的关系,Pearson相关性也可能会接近0 29 | a = np.random.uniform(-1, 1, 100000) #uniform(low,high,size) 随机数 30 | print (pearsonr(a, a**2)) 31 | #返回两个值评分和p值,p值越大越坏 32 | def pearson(x,y): 33 | xmean=mean(x,axis=0) 34 | ymean=mean(y,axis=0) 35 | covxy=dot((x-xmean).T,y-ymean) 36 | xvar=dot((x-xmean).T,x-xmean) 37 | yvar=dot((y-ymean).T,y-ymean) 38 | p=covxy/sqrt(xvar*yvar) 39 | print(p) 40 | def main(): 41 | iris=load_iris() 42 | m=shape(iris.data)[1] 43 | for i in range(m): 44 | pearson(iris.data[:,i], iris.target) 45 | if __name__=='__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /距离相关系数.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 19 20:52:11 2017 4 | 5 | @author: Lenovo-Y430p 6 | """ 7 | #计算欧几里德距离: 8 | def euclidean(p,q): 9 | #如果两数据集数目不同,计算两者之间都对应有的数 10 | same = 0 11 | for i in p: 12 | if i in q: 13 | same +=1 14 | 15 | #计算欧几里德距离,并将其标准化 16 | e = sum([(p[i] - q[i])**2 for i in range(same)]) 17 | return 1/(1+e**.5) 18 | --------------------------------------------------------------------------------