├── README.md ├── kmm.py ├── TrAdaboost2.py ├── TrAdaboostreg.py ├── kmmClassification.py ├── TrAdaboostkmm.py ├── SPY.py ├── Mutisource.py ├── TrReg.py ├── test.py └── TrAdaboost.R2.py /README.md: -------------------------------------------------------------------------------- 1 | # TransferLearning 2 | instance based Transfer learning, TrAdaboost, mutisource-trAdaBoost regresion 3 | -------------------------------------------------------------------------------- /kmm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import math 5 | import matplotlib.pyplot as plt 6 | from cvxopt import matrix, solvers 7 | def kernel_mean_matching(X, Z, kern='lin', B=1.0, eps=None): 8 | nx = X.shape[0] 9 | nz = Z.shape[0] 10 | if eps == None: 11 | eps = B / math.sqrt(nz) 12 | if kern == 'lin': 13 | K = np.dot(Z, Z.T) 14 | kappa = np.sum(np.dot(Z, X.T) * float(nz) / float(nx), axis=1) 15 | elif kern == 'rbf': 16 | K = compute_rbf(Z, Z) 17 | kappa = np.sum(compute_rbf(Z, X), axis=1) * float(nz) / float(nx) 18 | else: 19 | raise ValueError('unknown kernel') 20 | 21 | K = matrix(K) 22 | kappa = matrix(kappa) 23 | G = matrix(np.r_[np.ones((1, nz)), -np.ones((1, nz)), np.eye(nz), -np.eye(nz)]) 24 | h = matrix(np.r_[nz * (1 + eps), nz * (eps - 1), B * np.ones((nz,)), np.zeros((nz,))]) 25 | 26 | sol = solvers.qp(K, -kappa, G, h) 27 | coef = np.array(sol['x']) 28 | return coef 29 | 30 | 31 | def compute_rbf(X, Z, sigma=1.0): 32 | K = np.zeros((X.shape[0], Z.shape[0]), dtype=float) 33 | for i, vx in enumerate(X): 34 | K[i, :] = np.exp(-np.sum((vx - Z) ** 2, axis=1) / (2.0 * sigma)) 35 | return K 36 | x = 11*np.random.random(200)- 6.0 37 | y = x**2 + 10*np.random.random(200) - 5 38 | Z = np.c_[x, y] 39 | 40 | x = 2*np.random.random(10) - 6.0 41 | y = x**2 + 10*np.random.random(10) - 5 42 | X = np.c_[x, y] 43 | 44 | 45 | 46 | from sklearn.model_selection import train_test_split 47 | from sklearn.linear_model import LogisticRegression 48 | from sklearn import metrics 49 | # 样本重标记 50 | lable_spy_a = np.zeros([200, 1]) 51 | lable_spy_s = np.ones([10, 1]) 52 | 53 | 54 | trans_data = np.concatenate((X, Z), axis=0) 55 | trans_label = np.concatenate((lable_spy_s,lable_spy_a), axis=0) 56 | 57 | X_train, X_test, y_train, y_test = train_test_split(trans_data, trans_label, test_size=0.33, random_state=42) 58 | clf = LogisticRegression(penalty='l1',class_weight='balanced') 59 | # gnb = BernoulliNB() 60 | clf.fit(X_train, y_train) 61 | print("LR的预测精度", metrics.confusion_matrix(y_test, clf.predict(X_test))) 62 | print("LR的预测精度", metrics.accuracy_score(y_test, clf.predict(X_test))) 63 | 64 | 65 | 66 | 67 | 68 | coef = clf.predict_proba(Z)[:, -1].tolist() 69 | 70 | 71 | # coef = kernel_mean_matching(X, Z, kern='rbf', B=10) 72 | # print(coef) 73 | # print(coef.shape) 74 | # print(Z.shape) 75 | # 76 | # plt.close() 77 | # plt.figure() 78 | # plt.scatter(Z[:,0], Z[:,1], color='black', marker='x') 79 | # plt.scatter(X[:,0], X[:,1], color='red') 80 | # plt.scatter(Z[:,0], Z[:,1], color='green', s=coef*10, alpha=0.5) 81 | # plt.show() 82 | # np.sum(coef > 1e-2) 83 | 84 | # 85 | # 86 | # print(coef) 87 | # print(Z.shape) 88 | # 89 | # plt.close() 90 | # plt.figure() 91 | # 92 | # w=clf.coef_ 93 | # p=clf.intercept_ 94 | # print(w) 95 | # print(p) 96 | # x = np.mat(np.arange(min(Z[:,0]),max(Z[:,0]), 0.1)) 97 | # y = (-p[0]- w[0,0] * x) / w[0,1] 98 | # 99 | # coef = np.asarray(coef) 100 | # plt.plot(x.transpose(), y.transpose()) 101 | # # 
plt.scatter(Z[:,0], Z[:,1], color='black', marker='x') 102 | # plt.scatter(X[:,0], X[:,1], color='red', marker='x') 103 | # plt.scatter(Z[:,0], Z[:,1], color='green',marker='o',s=coef*80, alpha=0.5) 104 | # plt.show() 105 | -------------------------------------------------------------------------------- /TrAdaboost2.py: -------------------------------------------------------------------------------- 1 | import sklearn.svm 2 | from sklearn.datasets import fetch_20newsgroups 3 | from dataQuality.kmm import * 4 | # ala=np.concatenate((trans_A, label_A.reshape(row_A,1)[:,-1:]), axis=1) 5 | # s=np.concatenate((trans_S, label_S.reshape(row_S,1)[:,-1:]), axis=1) 6 | # 初始化权重 7 | # coef = kernel_mean_matching(s,ala, 8 | # kern='rbf', B=10) 9 | # code by chenchiwei 10 | # -*- coding: UTF-8 -*- 11 | import numpy as np 12 | from sklearn import tree 13 | from scipy import sparse 14 | from sklearn import metrics 15 | from sklearn import svm 16 | # H 测试样本分类结果 17 | # TrainS 原训练样本 np数组 18 | # TrainA 辅助训练样本 19 | # LabelS 原训练样本标签 20 | # LabelA 辅助训练样本标签 21 | # Test 测试样本 22 | # N 迭代次数 23 | from KMM import kmmClassification 24 | def tradaboost(trans_S, trans_A, label_S, label_A, test,test_label, N): 25 | trans_data = sparse.vstack((trans_A, trans_S)) 26 | trans_label = np.concatenate((label_A, label_S), axis=0) 27 | 28 | row_A = trans_A.shape[0] 29 | row_S = trans_S.shape[0] 30 | row_T = test.shape[0] 31 | 32 | # print('目标源的大小',row_S,'辅助源的大小',row_A,'测试集的大小',row_T) 33 | test_data = sparse.vstack((trans_data, test)) 34 | 35 | 36 | # coef = kmmClassification.getBeta(trans_A,test.toarray(),49098) 37 | # weights_A = coef 38 | # weights_A = np.asarray(weights_A).reshape(row_A,1) 39 | # total=sum(weights_A[:,0]) 40 | # for j in range(row_A): 41 | # weights_A[j,0] = weights_A[j,0]/total 42 | # weights_S = np.ones([row_S, 1]) * np.mean(weights_A) 43 | weights_S = np.ones([row_S, 1])/row_S 44 | weights_A = np.ones([row_A, 1])/row_A 45 | # weights_S = np.ones([row_S, 1]) 46 | # weights_A = np.ones([row_A, 1]) 47 | weights = np.concatenate((weights_A, weights_S), axis=0) 48 | 49 | bata = 1 / (1 + np.sqrt(2.0 * np.log(row_A/ N))) 50 | 51 | #bata = 1/(1+np.sqrt(2.0*np.log(row_A)/N)); 52 | 53 | # 存储每次迭代的标签和bata值? 
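    # Background on the update rule used below: in Dai et al.'s TrAdaBoost the fixed
    # auxiliary-domain factor is beta = 1 / (1 + sqrt(2 * ln(n_A) / N)), i.e.
    # np.log(row_A) / N as in the commented-out line above, whereas the active line
    # uses np.log(row_A / N). Each round t then sets beta_t = e_t / (1 - e_t) from the
    # weighted error on the target (S) samples, multiplies the target weights by
    # beta_t ** (-|h(x) - y|) and the auxiliary weights by beta ** (|h(x) - y|), so
    # hard target samples gain weight while persistently misclassified auxiliary
    # samples are gradually discounted.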
54 | bata_T = np.zeros([1, N]) 55 | result_label = np.ones([row_A + row_S + row_T, N]) 56 | 57 | predict = np.zeros([row_T]) 58 | 59 | # trans_data = np.asarray(trans_data, order='C') 60 | # trans_label = np.asarray(trans_label, order='C') 61 | # test_data = np.asarray(test_data, order='C') 62 | 63 | # print(trans_data.shape) 64 | # print(test_data.shape) 65 | accuracy_scorelist=[] 66 | f1_scorelist=[] 67 | recall_scorelist=[] 68 | for i in range(N): 69 | P = calculate_P(weights, trans_label) 70 | 71 | result_label[:, i] = train_classify(trans_data, trans_label, 72 | test_data, P) 73 | 74 | error_rate = 0.0 75 | for j in range(row_A, row_A + row_S): 76 | error_rate += (weights[j] * abs(result_label[j, i] - trans_label[j])) 77 | error_rate = error_rate / sum(weights[row_A:]) 78 | 79 | 80 | #error_rate = calculate_error_rate(label_S, result_label[row_A:row_A + row_S, i], 81 | # weights[row_A:row_A + row_S, :]) 82 | #print ('Error rate:', error_rate) 83 | # if error_rate != 1: 84 | # bata_T[0, i] = error_rate / (1.0 - error_rate) 85 | # if error_rate >= 0.5 and error_rate != 1: 86 | # bata_T[0, i] = 0.45 / (0.51) 87 | # if error_rate == 1: 88 | # bata_T[0, i] = 0.4 89 | 90 | if error_rate >= 0.5: 91 | #error_rate = 0.5 92 | error_rate = 0.499; 93 | if error_rate == 0: 94 | #error_rate = 0.000001 95 | #error_rate=0.0001 96 | error_rate = 0.001 97 | 98 | bata_T[0, i] = error_rate / (1 - error_rate) 99 | # Ct = 2 * (1 - error_rate); 100 | # 调整源域样本权重 101 | for j in range(row_S): 102 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], 103 | (-np.abs(result_label[row_A + j, i] - trans_label[row_A+j]))) 104 | 105 | # 调整辅域样本权重 106 | for j in range(row_A): 107 | weights[j] = weights[j] * np.power(bata, np.abs(result_label[j, i] - trans_label[j])) 108 | 109 | 110 | ##每次迭代完成计算下在测试集合上的误差 111 | # predic_temp = np.zeros([row_T]) 112 | # iteration = i; 113 | # for i in range(row_T): 114 | # left = np.sum( 115 | # result_label[row_A + row_S + i, int(np.ceil(iteration / 2)):iteration] * np.log(1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 116 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 117 | # if left >= right: 118 | # predic_temp[i] = 1 119 | # else: 120 | # predic_temp[i] = 0 121 | # accuracy_scorelist.append(metrics.accuracy_score(test_label, predic_temp)) 122 | # recall_scorelist.append(metrics.recall_score(test_label, predic_temp)) 123 | # f1_scorelist.append(metrics.f1_score(test_label, predic_temp)) 124 | # print bata_T 125 | for i in range(row_T): 126 | # 跳过训练数据的标签 127 | # left = np.sum( 128 | # result_label[row_A + row_S + i, int(np.ceil(N / 2)):N] * np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 129 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 130 | left = np.sum( 131 | result_label[row_A + row_S + i, 0:N] * np.log(1 / bata_T[0, 0:N])) 132 | right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:N])) 133 | if left >= right: 134 | predict[i] = 1 135 | else: 136 | predict[i] = 0 137 | # print left, right, predict[i] 138 | # predict[i] = left - right; 139 | return predict, accuracy_scorelist, recall_scorelist, f1_scorelist 140 | 141 | 142 | def calculate_P(weights, label): 143 | total = np.sum(weights) 144 | return np.asarray(weights)/total 145 | 146 | from sklearn.linear_model import LogisticRegression 147 | 148 | def train_classify(trans_data, trans_label, test_data, P): 149 | clf = LogisticRegression() 150 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 151 | return clf.predict(test_data) 152 | 153 | 154 | # 
def calculate_error_rate(label_R, label_H, weight): 155 | # total = np.sum(weight) 156 | # #return np.sum((weight[:, 0] / total)* np.abs(label_R - label_H)) 157 | # return return_correct_rate(label_R,label_H) 158 | -------------------------------------------------------------------------------- /TrAdaboostreg.py: -------------------------------------------------------------------------------- 1 | # code by chenchiwei 2 | # -*- coding: UTF-8 -*- 3 | import numpy as np 4 | from sklearn import tree 5 | from sklearn import svm 6 | import math 7 | from sklearn.metrics import r2_score 8 | from sklearn.tree import DecisionTreeRegressor 9 | 10 | def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 11 | out = np.cumsum(arr, axis=axis, dtype=np.float64) 12 | return out 13 | 14 | 15 | # H 测试样本分类结果 16 | # TrainS 原训练样本 np数组 17 | # TrainA 辅助训练样本 18 | # LabelS 原训练样本标签 19 | # LabelA 辅助训练样本标签 20 | # Test 测试样本 21 | # N 迭代次数 22 | def tradaboost(trans_S, trans_A, label_S, label_A, test, y_target_test,N,islog): 23 | trans_data = np.concatenate((trans_A, trans_S), axis=0) 24 | trans_label = np.concatenate((label_A, label_S), axis=0) 25 | 26 | row_A = trans_A.shape[0] 27 | row_S = trans_S.shape[0] 28 | row_T = test.shape[0] 29 | 30 | test_data = np.concatenate((trans_data, test), axis=0) 31 | 32 | # 初始化权重 33 | weights_A = np.ones([row_A, 1])/row_A 34 | weights_S = np.ones([row_S, 1])/row_S 35 | weights = np.concatenate((weights_A, weights_S), axis=0) 36 | 37 | bata = 1 / (1 + np.sqrt(2 * np.log(row_A/N))) 38 | 39 | # 存储每次迭代的标签和bata值 40 | bata_T = np.zeros([1, N]) 41 | result_label = np.ones([row_A + row_S + row_T, N]) 42 | 43 | predict = np.zeros([row_T]) 44 | 45 | #print ('params initial finished.') 46 | 47 | 48 | for i in range(N): 49 | #将权重向量归一化 50 | P = calculate_P(weights) 51 | 52 | 53 | result_label[:, i] = train_classify(trans_data, trans_label, 54 | test_data, weights,test,y_target_test,islog) 55 | 56 | temp0 = np.abs(result_label[:row_A + row_S, i] - trans_label) 57 | error_max0 = temp0.max() 58 | temp = np.abs(result_label[row_A:row_A + row_S, i] - label_S) 59 | error_max = temp.max() 60 | if error_max0==0.0 or error_max==0.0: 61 | N=i; 62 | break 63 | temp2 = np.abs(result_label[:row_A, i] - label_A) 64 | error_max2 = temp2.max() 65 | error_rate = 0.0 66 | for j in range(row_A, row_A + row_S): 67 | error_rate += (weights[j] * ((abs(result_label[j, i] - trans_label[j])/error_max0))) 68 | error_rate = error_rate / sum(weights[row_A:]) 69 | if error_rate >= 0.5: 70 | error_rate = 0.499; 71 | if error_rate == 0: 72 | error_rate=0.001 73 | bata_T[0, i] = error_rate / (1 - error_rate) 74 | 75 | for j in range(row_S): 76 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], -( 77 | np.abs(result_label[row_A + j, i] - label_S[j]) / error_max0)) 78 | # 调整辅域样本权重 79 | for j in range(row_A): 80 | if islog: 81 | if (abs(result_label[j, i] - label_A[j]) >0):#0.02872 82 | weights[j] = weights[j] * np.power(bata, np.abs((result_label[j, i] - label_A[j])/error_max0)) 83 | else: 84 | weights[j] = weights[j] * np.power(bata, np.abs((result_label[j, i] - label_A[j]) / error_max0)) 85 | # bata_T[0,:]=bata_T[0,:]/np.sum(bata_T[0,:]) 86 | 87 | # 88 | predictions=result_label[row_A + row_S:,int(np.ceil(N / 2)):N] 89 | # Sort the predictions 90 | sorted_idx = np.argsort(predictions, axis=1) 91 | # Find index of median prediction for each sample 92 | bata_T = np.log(1/bata_T[0, int(np.ceil(N / 2)):N]) 93 | #bata_T = bata_T[0, int(np.ceil(N / 2)):N] 94 | bata_T[:] = bata_T[:] / np.sum(bata_T[:]) 95 | 
weight_cdf = stable_cumsum(bata_T[sorted_idx], axis=1) 96 | median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 97 | median_idx = median_or_above.argmax(axis=1) 98 | median_estimators = sorted_idx[np.arange(test.shape[0]), median_idx] 99 | # Return median predictions 100 | return predictions[np.arange(test.shape[0]), median_estimators] 101 | # 102 | # for i in range(row_T): 103 | # # 跳过训练数据的标签 104 | # # predict[i]=np.median( 105 | # # result_label[row_A + row_S + i, :] * np.log(1 / bata_T[0, :])) 106 | # # predict[i] = np.sum( 107 | # # result_label[row_A + row_S + i, :] * (1-bata_T[0, :])) 108 | # 109 | # predict[i] = weighted_median(result_label[row_A + row_S + i,int(np.ceil(N / 2)):N], 110 | # np.log(1 / bata_T[0,int(np.ceil(N / 2)):N])) 111 | # return predict 112 | 113 | def calculate_P(weights): 114 | total = np.sum(weights) 115 | return weights/total 116 | 117 | 118 | def train_classify(trans_data, trans_label, test_data, P,test,y_target_test,islog,): 119 | # if islog: 120 | # clf = svm.SVR(C=100) 121 | # else: 122 | clf = DecisionTreeRegressor(max_depth=3) 123 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 124 | return clf.predict(test_data) 125 | 126 | def weighted_median(values, weights): 127 | ''' compute the weighted median of values list. The 128 | weighted median is computed as follows: 129 | 1- sort both lists (values and weights) based on values. 130 | 2- select the 0.5 point from the weights and return the corresponding values as results 131 | e.g. values = [1, 3, 0] and weights=[0.1, 0.3, 0.6] assuming weights are probabilities. 132 | sorted values = [0, 1, 3] and corresponding sorted weights = [0.6, 0.1, 0.3] the 0.5 point on 133 | weight corresponds to the first item which is 0. so the weighted median is 0.''' 134 | 135 | #convert the weights into probabilities 136 | sum_weights = sum(weights) 137 | weights = np.array([(w*1.0)/sum_weights for w in weights]) 138 | #sort values and weights based on values 139 | values = np.array(values) 140 | sorted_indices = np.argsort(values) 141 | values_sorted = values[sorted_indices] 142 | weights_sorted = weights[sorted_indices] 143 | #select the median point 144 | it = np.nditer(weights_sorted, flags=['f_index']) 145 | accumulative_probability = 0 146 | median_index = -1 147 | while not it.finished: 148 | accumulative_probability += it[0] 149 | if accumulative_probability > 0.5: 150 | median_index = it.index 151 | return values_sorted[median_index] 152 | elif accumulative_probability == 0.5: 153 | median_index = it.index 154 | it.iternext() 155 | next_median_index = it.index 156 | return np.mean(values_sorted[[median_index, next_median_index]]) 157 | it.iternext() 158 | 159 | return values_sorted[median_index] 160 | 161 | from sklearn.ensemble import AdaBoostRegressor -------------------------------------------------------------------------------- /kmmClassification.py: -------------------------------------------------------------------------------- 1 | import math, numpy, sklearn.metrics.pairwise as sk 2 | from cvxopt import matrix, solvers 3 | import random, sys 4 | from sklearn import svm 5 | 6 | FixedBetaValue = 1.0 7 | 8 | """ 9 | Compute instance (importance) weights using Kernel Mean Matching. 10 | Returns a list of instance weights for training data. 
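In terms of the code below, kmm() solves the quadratic program
    minimize over beta:   (1/2) * beta' K beta + kappa' beta
where K is the RBF kernel over the training points and
kappa = -(n_tr / n_te) * K(train, test) * 1, i.e. it matches the beta-weighted
training mean to the test mean in kernel feature space, subject to
0 <= beta_i <= 1000 and n_tr * (1 - eps) <= sum(beta) <= n_tr * (1 + eps),
with eps = (sqrt(n_tr) - 1) / sqrt(n_tr).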
11 | """ 12 | def kmm(Xtrain, Xtest, sigma): 13 | n_tr = len(Xtrain) 14 | n_te = len(Xtest) 15 | 16 | # calculate Kernel 17 | print('Computing kernel for training data ...') 18 | # 0.001取得最好的结果0.01 19 | K_ns = sk.rbf_kernel(Xtrain, Xtrain) 20 | # make it symmetric 21 | K = 0.9 * (K_ns + K_ns.transpose()) 22 | 23 | # calculate kappa 24 | print('Computing kernel for kappa ...') 25 | kappa_r = sk.rbf_kernel(Xtrain, Xtest) 26 | ones = numpy.ones(shape=(n_te, 1)) 27 | kappa = numpy.dot(kappa_r, ones) 28 | kappa = -(float(n_tr) / float(n_te)) * kappa 29 | 30 | # calculate eps 31 | eps = (math.sqrt(n_tr) - 1) / math.sqrt(n_tr) 32 | 33 | # constraints 34 | A0 = numpy.ones(shape=(1, n_tr)) 35 | A1 = -numpy.ones(shape=(1, n_tr)) 36 | A = numpy.vstack([A0, A1, -numpy.eye(n_tr), numpy.eye(n_tr)]) 37 | b = numpy.array([[n_tr * (eps + 1), n_tr * (eps - 1)]]) 38 | b = numpy.vstack([b.T, -numpy.zeros(shape=(n_tr, 1)), numpy.ones(shape=(n_tr, 1)) * 1000]) 39 | 40 | print('Solving quadratic program for beta ...') 41 | P = matrix(K, tc='d') 42 | q = matrix(kappa, tc='d') 43 | G = matrix(A, tc='d') 44 | h = matrix(b, tc='d') 45 | beta = solvers.qp(P, q, G, h) 46 | return [i for i in beta['x']] 47 | 48 | 49 | """ 50 | Kernel width is the median of distances between instances of sparse data 51 | """ 52 | def computeKernelWidth(data): 53 | dist = [] 54 | for i in range(len(data)): 55 | for j in range(i + 1, len(data)): 56 | # s = self.__computeDistanceSq(data[i], data[j]) 57 | # dist.append(math.sqrt(s)) 58 | dist.append(numpy.sqrt(numpy.sum((numpy.array(data[i]) - numpy.array(data[j])) ** 2))) 59 | return numpy.median(numpy.array(dist)) 60 | 61 | 62 | def read_data_set(filename): 63 | with open(filename) as f: 64 | data = f.readlines() 65 | 66 | maxvar = 0 67 | classList = [] 68 | data_set = [] 69 | for i in data: 70 | d = {} 71 | if filename.endswith('.arff'): 72 | if '@' not in i: 73 | features = i.strip().split(',') 74 | class_name = features.pop() 75 | if class_name not in classList: 76 | classList.append(class_name) 77 | d[-1] = float(classList.index(class_name)) 78 | for j in range(len(features)): 79 | d[j] = float(features[j]) 80 | maxvar = len(features) 81 | else: 82 | continue 83 | data_set.append(d) 84 | return (data_set, classList, maxvar) 85 | 86 | 87 | def getFixedBeta(value, count): 88 | beta = [] 89 | for c in range(count): 90 | beta.append(value) 91 | return beta 92 | 93 | 94 | def getBeta(trainX, testX, maxvar): 95 | beta = [] 96 | # gammab = 0.001 97 | gammab = computeKernelWidth(trainX) 98 | print("Gammab:", gammab) 99 | 100 | beta = kmm(trainX, testX, gammab) 101 | print("{0} Beta: {1}".format(len(beta), beta)) 102 | 103 | return beta 104 | 105 | 106 | def checkAccuracy(result, testY): 107 | p = 0 108 | for i, v in enumerate(result): 109 | if v == testY[i]: 110 | p += 1 111 | acc = p * 100 / len(result) 112 | # print(result) 113 | print("ACC:{0}%, Total:{1}/{2} with positive {3}".format(acc, len(result), len(testY), p)) 114 | return acc 115 | 116 | 117 | def separateData(data, maxvar): 118 | dataY = [] 119 | dataX = [] 120 | 121 | for d in data: 122 | dataY.append(d[-1]) 123 | 124 | covar = [] 125 | for c in range(maxvar): 126 | if c in d: 127 | covar.append(d[c]) 128 | else: 129 | covar.append(0.0) 130 | dataX.append(covar) 131 | return (dataX, dataY) 132 | 133 | 134 | def buildModel(trainX, trainY, beta, testX, testY, svmParam, maxvar,testdata): 135 | # Tune parameters here... 
136 | #csf = svm.SVC(C=float(svmParam['c']), kernel='rbf', gamma=float(svmParam['g']), probability=True) 137 | 138 | train = separateData(testdata, maxvar) 139 | # H 测试样本分类结果 140 | # TrainS 原训练样本 np数组 141 | # TrainA 辅助训练样本 142 | # LabelS 原训练样本标签 143 | # LabelA 辅助训练样本标签 144 | # Test 测试样本 145 | # N 迭代次数 146 | beta = getBeta(train[0], trainX, maxvar) 147 | pred = tradaboost(trainX, train[0], trainY, train[1], testX, 4,beta) 148 | #csf.fit(trainX, trainY, sample_weight=beta) 149 | 150 | beta_fixed = getFixedBeta(FixedBetaValue, len(trainX)) 151 | csf2 = svm.SVC(C=float(svmParam['c']), kernel='rbf', gamma=float(svmParam['g']), probability=False) 152 | csf2.fit(trainX, trainY, sample_weight=beta_fixed) 153 | 154 | # predict and gather results 155 | #result = csf.predict(testX) 156 | acc = checkAccuracy(pred, testY) 157 | 158 | result2 = csf2.predict(testX) 159 | acc2 = checkAccuracy(result2, testY) 160 | 161 | return (acc, acc2) 162 | 163 | from TR.TrAdaboost2 import * 164 | def train(traindata, testdata, maxvar): 165 | svmParam = {'c': 131072, 'g': 0.0001} 166 | train = separateData(traindata[:250], maxvar) 167 | trainX = train[0] 168 | trainY = train[1] 169 | 170 | print("trainX"+str(len(trainX))) 171 | 172 | test = separateData(traindata[250:], maxvar) 173 | testX = test[0] 174 | testY = test[1] 175 | 176 | print("testX"+str(len(testX))) 177 | 178 | print(type(trainX)) 179 | beta = getBeta(trainX, testX, maxvar) 180 | 181 | # Model training 182 | result = buildModel(trainX, trainY, beta, testX, testY, svmParam, maxvar,testdata) 183 | return result 184 | 185 | #MAIN METHOD 186 | def main(): 187 | #reading train data file 188 | train_data_set,train_classList,train_maxVar=read_data_set("./apps_data_k100/datafile-qrsnc2a7k20.c0.d4.C23.N2000.t16.T4.D1.E1.F1.G1.H1.I1.B16.J8.K300.L0.05.M100.A1.V0.P0.G0.l0.0.b600-train.arff") 189 | # reading test data file 190 | test_data_set,test_classList,test_maxVar=read_data_set("./apps_data_k100/datafile-qrsnc2a7k20.c0.d4.C23.N2000.t16.T4.D1.E1.F1.G1.H1.I1.B16.J8.K300.L0.05.M100.A1.V0.P0.G0.l0.0.b600-test.arff") 191 | if(train_maxVar>=test_maxVar): 192 | mxVar=train_maxVar 193 | else: 194 | mxVar=test_maxVar 195 | #Gathering Accuracies 196 | res1,res2=train(train_data_set,test_data_set,mxVar) 197 | print("Accuracy without KMM:{0}%".format(res1)) 198 | print("Accuracy with KMM:{0}%".format(res2)) 199 | 200 | if __name__ == '__main__': 201 | main() -------------------------------------------------------------------------------- /TrAdaboostkmm.py: -------------------------------------------------------------------------------- 1 | import sklearn.svm 2 | from sklearn.datasets import fetch_20newsgroups 3 | from dataQuality.kmm import * 4 | # ala=np.concatenate((trans_A, label_A.reshape(row_A,1)[:,-1:]), axis=1) 5 | # s=np.concatenate((trans_S, label_S.reshape(row_S,1)[:,-1:]), axis=1) 6 | # 初始化权重 7 | # coef = kernel_mean_matching(s,ala, 8 | # kern='rbf', B=10) 9 | # code by chenchiwei 10 | # -*- coding: UTF-8 -*- 11 | import numpy as np 12 | from sklearn import tree 13 | from scipy import sparse 14 | from sklearn import metrics 15 | 16 | from sklearn import svm 17 | # H 测试样本分类结果 18 | # TrainS 原训练样本 np数组 19 | # TrainA 辅助训练样本 20 | # LabelS 原训练样本标签 21 | # LabelA 辅助训练样本标签 22 | # Test 测试样本 23 | # N 迭代次数 24 | from KMM import kmmClassification 25 | def tradaboost(trans_S, trans_A, label_S, label_A, test,test_label, N,eliminate=False): 26 | 27 | 28 | 29 | 30 | coef = kmmClassification.getBeta(trans_A,test.toarray(),49098) 31 | #排除一些低权重的样本 32 | if eliminate: 33 | 
percenttile=np.percentile(coef, 7) 34 | indexlist=[] 35 | for index,x in enumerate(coef): 36 | if(x= 0.5 and error_rate != 1: 102 | # bata_T[0, i] = 0.45 / (0.51) 103 | # if error_rate == 1: 104 | # bata_T[0, i] = 0.4 105 | if error_rate >= 0.5: 106 | # error_rate = 0.5 107 | error_rate = 0.499; 108 | if error_rate == 0: 109 | # error_rate = 0.000001 110 | # error_rate=0.0001 111 | error_rate = 0.001 112 | 113 | bata_T[0, i] = error_rate / (1 - error_rate) 114 | 115 | # 调整源域样本权重 116 | for j in range(row_S): 117 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], 118 | (-np.abs(result_label[row_A + j, i] - trans_label[row_A+j]))) 119 | 120 | # 调整辅域样本权重 121 | for j in range(row_A): 122 | weights[j] = weights[j] * np.power(bata, np.abs(result_label[j, i] - trans_label[j])) 123 | 124 | 125 | 126 | ##每次迭代完成计算下在测试集合上的误差 127 | # predic_temp=np.zeros([row_T]) 128 | # iteration=i+1; 129 | # for j in range(row_T): 130 | # left = np.sum( 131 | # result_label[row_A + row_S + j, 0:iteration] * np.log(1 / bata_T[0, 0:iteration])) 132 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:iteration])) 133 | # # left = np.sum( 134 | # # result_label[row_A + row_S + i, int(np.ceil(iteration / 2)):iteration] * np.log( 135 | # # 1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 136 | # # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 137 | # if left >= right: 138 | # predic_temp[j] = 1 139 | # else: 140 | # predic_temp[j] = 0 141 | # accuracy_scorelist.append(metrics.accuracy_score(test_label, predic_temp)) 142 | # recall_scorelist.append(metrics.recall_score(test_label, predic_temp)) 143 | # f1_scorelist.append(metrics.f1_score(test_label, predic_temp)) 144 | 145 | # print bata_T 146 | for i in range(row_T): 147 | # 跳过训练数据的标签 148 | # left = np.sum( 149 | # result_label[row_A + row_S + i, int(np.ceil(N / 2)):N] * np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 150 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 151 | left = np.sum( 152 | result_label[row_A + row_S + i, 0:N] * np.log(1 / bata_T[0, 0:N])) 153 | right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:N])) 154 | if left >= right: 155 | predict[i] = 1 156 | else: 157 | predict[i] = 0 158 | # print left, right, predict[i] 159 | # predict[i]=left-right; 160 | 161 | 162 | print(accuracy_scorelist) 163 | return predict,accuracy_scorelist,recall_scorelist,f1_scorelist 164 | 165 | 166 | def calculate_P(weights, label): 167 | total = np.sum(weights) 168 | return np.asarray(weights)/total 169 | 170 | from sklearn.linear_model import LogisticRegression 171 | 172 | def train_classify(trans_data, trans_label, test_data, P): 173 | clf = LogisticRegression() 174 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 175 | return clf.predict(test_data) 176 | 177 | 178 | # def calculate_error_rate(label_R, label_H, weight): 179 | # total = np.sum(weight) 180 | # #return np.sum((weight[:, 0] / total)* np.abs(label_R - label_H)) 181 | # return return_correct_rate(label_R,label_H) 182 | -------------------------------------------------------------------------------- /SPY.py: -------------------------------------------------------------------------------- 1 | import sklearn.svm 2 | from sklearn.datasets import fetch_20newsgroups 3 | from dataQuality.kmm import * 4 | # ala=np.concatenate((trans_A, label_A.reshape(row_A,1)[:,-1:]), axis=1) 5 | # s=np.concatenate((trans_S, label_S.reshape(row_S,1)[:,-1:]), axis=1) 6 | # 初始化权重 7 | # coef = kernel_mean_matching(s,ala, 8 | # kern='rbf', B=10) 9 | # code by 
chenchiwei 10 | # -*- coding: UTF-8 -*- 11 | import numpy as np 12 | from sklearn import tree 13 | from scipy import sparse 14 | from sklearn import metrics 15 | from sklearn.model_selection import train_test_split 16 | from sklearn import svm 17 | # H 测试样本分类结果 18 | # TrainS 原训练样本 np数组 19 | # TrainA 辅助训练样本 20 | # LabelS 原训练样本标签 21 | # LabelA 辅助训练样本标签 22 | # Test 测试样本 23 | # N 迭代次数 24 | from KMM import kmmClassification 25 | from sklearn.naive_bayes import BernoulliNB 26 | def tradaboost(trans_S, trans_A, label_S, label_A, test,test_label, N,eliminate=True): 27 | 28 | 29 | row_A = trans_A.shape[0] 30 | row_S = trans_S.shape[0] 31 | row_T = test.shape[0] 32 | # 样本重标记 33 | lable_spy_a = np.zeros([row_A, 1]) 34 | lable_spy_s = np.ones([row_S, 1]) 35 | 36 | trans_data = sparse.vstack((trans_A, trans_S)) 37 | trans_label=np.concatenate((lable_spy_a, lable_spy_s), axis=0) 38 | 39 | X_train, X_test, y_train, y_test = train_test_split(trans_data, trans_label, test_size = 0.33, random_state = 42) 40 | clf = LogisticRegression(penalty='l1',class_weight='balanced') 41 | clf = LogisticRegression(class_weight='balanced') 42 | #gnb = BernoulliNB() 43 | clf.fit(X_train, y_train) 44 | print("LR的预测精度",metrics.confusion_matrix(y_test, clf.predict(X_test))) 45 | print("LR的预测精度", metrics.accuracy_score(y_test, clf.predict(X_test))) 46 | # clf.predict(X_test) 47 | # predict=clf.predict_proba(X_test)[:,-1] 48 | # np.sort(predict) 49 | #排除一些低权重的样本 50 | # 排除一些低权重的样本 51 | # print(clf.predict_proba(trans_A)[:,-1]) 52 | 53 | 54 | weights_A = clf.predict_proba(trans_A)[:,-1].tolist() 55 | 56 | if eliminate: 57 | percenttile=np.percentile(weights_A, 32)#32 58 | indexlist=[] 59 | for index,x in enumerate(weights_A): 60 | if(x= 0.5 and error_rate != 1: 122 | # bata_T[0, i] = 0.45 / (0.51) 123 | # if error_rate == 1: 124 | # bata_T[0, i] = 0.4 125 | if error_rate >= 0.5: 126 | # error_rate = 0.5 127 | error_rate = 0.499; 128 | if error_rate == 0: 129 | # error_rate = 0.000001 130 | # error_rate=0.0001 131 | error_rate = 0.001 132 | 133 | bata_T[0, i] = error_rate / (1 - error_rate) 134 | # 调整源域样本权重 135 | for j in range(row_S): 136 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], 137 | (-np.abs(result_label[row_A + j, i] - trans_label[row_A+j]))) 138 | 139 | # 调整辅域样本权重 140 | for j in range(row_A): 141 | weights[j] = weights[j] * np.power(bata, np.abs(result_label[j, i] - trans_label[j])) 142 | 143 | 144 | 145 | ##每次迭代完成计算下在测试集合上的误差 146 | # predic_temp=np.zeros([row_T]) 147 | # iteration=i; 148 | # for i in range(row_T): 149 | # left = np.sum( 150 | # result_label[row_A + row_S + i, 0:iteration] * np.log(1 / bata_T[0, 0:iteration])) 151 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:iteration])) 152 | # if left >= right: 153 | # predic_temp[i] = 1 154 | # else: 155 | # predic_temp[i] = 0 156 | # accuracy_scorelist.append(metrics.accuracy_score(test_label, predic_temp)) 157 | # recall_scorelist.append(metrics.recall_score(test_label, predic_temp)) 158 | # f1_scorelist.append(metrics.f1_score(test_label, predic_temp)) 159 | 160 | # print bata_T 161 | for i in range(row_T): 162 | # 跳过训练数据的标签 163 | # left = np.sum( 164 | # result_label[row_A + row_S + i, int(np.ceil(N / 2)):N] * np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 165 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 166 | left = np.sum( 167 | result_label[row_A + row_S + i, 0:N] * np.log(1 / bata_T[0, 0:N])) 168 | right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:N])) 169 | if left >= right: 170 | predict[i] = 1 171 | else: 172 | 
predict[i] = 0 173 | # print left, right, predict[i] 174 | # predict[i]=left-right; 175 | 176 | 177 | print(accuracy_scorelist) 178 | return predict,accuracy_scorelist,recall_scorelist,f1_scorelist 179 | 180 | 181 | def calculate_P(weights, label): 182 | total = np.sum(weights) 183 | return np.asarray(weights)/total 184 | 185 | from sklearn.linear_model import LogisticRegression 186 | 187 | def train_classify(trans_data, trans_label, test_data, P): 188 | clf = LogisticRegression() 189 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 190 | return clf.predict(test_data) 191 | 192 | 193 | # def calculate_error_rate(label_R, label_H, weight): 194 | # total = np.sum(weight) 195 | # #return np.sum((weight[:, 0] / total)* np.abs(label_R - label_H)) 196 | # return return_correct_rate(label_R,label_H) 197 | -------------------------------------------------------------------------------- /Mutisource.py: -------------------------------------------------------------------------------- 1 | # code by chenchiwei 2 | # -*- coding: UTF-8 -*- 3 | import numpy as np 4 | from sklearn import tree 5 | from sklearn import svm 6 | import math 7 | from sklearn.tree import DecisionTreeRegressor 8 | 9 | def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 10 | out = np.cumsum(arr, axis=axis, dtype=np.float64) 11 | return out 12 | 13 | 14 | # TrainS 原训练样本 np数组 15 | # TrainA 辅助训练样本 16 | # LabelS 原训练样本标签 17 | # LabelA 辅助训练样本标签 18 | # Test 测试样本 19 | # N 迭代次数 20 | 21 | # source_sum 源领域的数目 22 | # soucenum 源领域所有样本的数目 23 | 24 | def Mutisource_tradaboost(trans_S, trans_A_list, label_S, label_A_list, test, N,source_sum,soucenum): 25 | 26 | 27 | #首先计算bata 28 | bata = 1 / (1 + np.sqrt(2 * np.log(soucenum) / N)) 29 | row_S = trans_S.shape[0] 30 | row_T = test.shape[0] 31 | weights_S= np.ones([row_S, 1])/row_S 32 | weights_A=[] 33 | train=[] 34 | train_lable=[] 35 | test_data=[] 36 | result_label=[] 37 | # 存储每次迭代的标签和bata值 38 | bata_T = np.zeros([1, N]) 39 | result_labelsum = np.zeros([row_S + row_T, N]) 40 | for i in range(source_sum): 41 | row_A = trans_A_list[i].shape[0] 42 | weights_A.append(np.ones([row_A, 1])/row_A) 43 | train.append(np.concatenate((trans_A_list[i], trans_S), axis=0)) 44 | train_lable.append(np.concatenate((label_A_list[i], label_S), axis=0)) 45 | test_data.append( np.concatenate((train[i], test), axis=0)) 46 | result_label.append(np.ones([row_A + row_S + row_T, N])) 47 | #生成初始的权重 48 | for i in range(N): 49 | #将权重向量归一化 50 | error_list=[] 51 | max_error_list=[] 52 | for j in range(source_sum): 53 | row_A = trans_A_list[j].shape[0] 54 | weights = np.concatenate((weights_A[j], weights_S), axis=0) 55 | P = calculate_P(weights) 56 | result_label[j][:, i] = train_classify(train[j], train_lable[j], 57 | test_data[j], P) 58 | temp = np.abs(result_label[j][row_A:row_A + row_S, i] - train_lable[j][row_A:]) 59 | #temp = np.abs(result_label[j][:row_A + row_S, i] - train_lable[j]) 60 | error_max = temp.max() 61 | max_error_list.append(error_max) 62 | error_rate = 0.0 63 | for m in range(row_A, row_A + row_S): 64 | error_rate += (weights_S[m-row_A] * ((abs(result_label[j][m, i] - train_lable[j][m]) / error_max))) 65 | error_rate = error_rate / sum(weights_S) 66 | error_list.append(error_rate) 67 | g=[] 68 | #g=error_list; 69 | for j in range(source_sum): 70 | g.append(math.exp(1-error_list[j])/math.exp(error_list[j])) 71 | g = [x/sum(g) for x in g] 72 | 73 | for j in range(source_sum): 74 | row_A = trans_A_list[j].shape[0] 75 | result_labelsum[:, i]=result_labelsum[:, i]+g[j]*result_label[j][row_A:row_A + 
row_S+row_T, i] 76 | 77 | temp = np.abs(result_labelsum[:row_S, i] - label_S) 78 | error_max = temp.max() 79 | error_rate = 0.0 80 | for m in range(row_S): 81 | error_rate += (weights_S[m] * ((abs(result_labelsum[m, i] - label_S[m]) / error_max))) 82 | error_rate = error_rate / sum(weights_S) 83 | 84 | 85 | #更新样本权重 86 | if error_rate >= 0.5: 87 | error_rate = 0.499; 88 | if error_rate == 0: 89 | error_rate = 0.001 90 | 91 | bata_T[0, i] = error_rate / (1 - error_rate) 92 | for j in range(row_S): 93 | weights_S[j] = weights_S[j] * np.power(bata_T[0, i], 1-((abs(result_labelsum[j, i] - label_S[j]) /error_max))) 94 | for j in range(source_sum): 95 | row_A = trans_A_list[j].shape[0] 96 | temp = np.abs(result_label[j][:row_A, i] - label_A_list[j]) 97 | error_max = temp.max() 98 | for m in range(row_A): 99 | if (abs(result_label[j][m, i] - label_A_list[j][m]) > 0): 100 | weights_A[j][m]=weights_A[j][m] * np.power(bata, abs(result_label[j][m, i] - label_A_list[j][m]) / error_max) 101 | 102 | # bata_T[0, i] = (1/2)*math.log((1 - error_rate)/error_rate) 103 | # for j in range(row_S): 104 | # weights_S[j] = weights_S[j] * np.exp(bata_T[0, i]*((abs(result_labelsum[j, i] - label_S[j]) / error_max))) 105 | # for j in range(source_sum): 106 | # row_A = trans_A_list[j].shape[0] 107 | # temp = np.abs(result_label[j][:row_A, i] - label_A_list[j]) 108 | # error_max = temp.max() 109 | # for m in range(row_A): 110 | # if (abs(result_label[j][m, i] - label_A_list[j][m]) > 0.04): 111 | # weights_A[j][m] = weights_A[j][m] * np.exp(-bata*abs( 112 | # result_label[j][m, i] - label_A_list[j][m]) / error_max) 113 | 114 | # 115 | # predictions=result_labelsum[row_S:,0:N] 116 | # # Sort the predictions 117 | # sorted_idx = np.argsort(predictions, axis=1) 118 | # # Find index of median prediction for each sample 119 | # #bata_T = 1/bata_T[0, 0:N] 120 | # bata_T = np.log(1/bata_T[0, :N]) 121 | # bata_T[:] = bata_T[:] / np.sum(bata_T[:]) 122 | # weight_cdf = stable_cumsum(bata_T[sorted_idx], axis=1) 123 | # median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 124 | # median_idx = median_or_above.argmax(axis=1) 125 | # median_estimators = sorted_idx[np.arange(test.shape[0]), median_idx] 126 | # # Return median predictions 127 | # return predictions[np.arange(test.shape[0]), median_estimators] 128 | 129 | 130 | predictions = result_labelsum[row_S:, int(np.ceil(N / 2)):N] 131 | # Sort the predictions 132 | sorted_idx = np.argsort(predictions, axis=1) 133 | # Find index of median prediction for each sample 134 | bata_T = np.log(1 / bata_T[0, int(np.ceil(N / 2)):N]) 135 | #bata_T = bata_T[0, int(np.ceil(N / 2)):N] 136 | bata_T[:] = bata_T[:] / np.sum(bata_T[:]) 137 | weight_cdf = stable_cumsum(bata_T[sorted_idx], axis=1) 138 | median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 139 | median_idx = median_or_above.argmax(axis=1) 140 | median_estimators = sorted_idx[np.arange(test.shape[0]), median_idx] 141 | # Return median predictions 142 | return predictions[np.arange(test.shape[0]), median_estimators] 143 | 144 | 145 | 146 | def calculate_P(weights): 147 | total = np.sum(weights) 148 | return weights/total 149 | 150 | from sklearn import neighbors 151 | def train_classify(trans_data, trans_label, test_data, P): 152 | clf = DecisionTreeRegressor(max_depth=3) 153 | #clf = neighbors.KNeighborsRegressor() 154 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 155 | return clf.predict(test_data) 156 | 157 | def weighted_median(values, weights): 158 | ''' compute the weighted median of 
values list. The 159 | weighted median is computed as follows: 160 | 1- sort both lists (values and weights) based on values. 161 | 2- select the 0.5 point from the weights and return the corresponding values as results 162 | e.g. values = [1, 3, 0] and weights=[0.1, 0.3, 0.6] assuming weights are probabilities. 163 | sorted values = [0, 1, 3] and corresponding sorted weights = [0.6, 0.1, 0.3] the 0.5 point on 164 | weight corresponds to the first item which is 0. so the weighted median is 0.''' 165 | 166 | #convert the weights into probabilities 167 | sum_weights = sum(weights) 168 | weights = np.array([(w*1.0)/sum_weights for w in weights]) 169 | #sort values and weights based on values 170 | values = np.array(values) 171 | sorted_indices = np.argsort(values) 172 | values_sorted = values[sorted_indices] 173 | weights_sorted = weights[sorted_indices] 174 | #select the median point 175 | it = np.nditer(weights_sorted, flags=['f_index']) 176 | accumulative_probability = 0 177 | median_index = -1 178 | while not it.finished: 179 | accumulative_probability += it[0] 180 | if accumulative_probability > 0.5: 181 | median_index = it.index 182 | return values_sorted[median_index] 183 | elif accumulative_probability == 0.5: 184 | median_index = it.index 185 | it.iternext() 186 | next_median_index = it.index 187 | return np.mean(values_sorted[[median_index, next_median_index]]) 188 | it.iternext() 189 | 190 | return values_sorted[median_index] 191 | 192 | from sklearn.ensemble import AdaBoostRegressor -------------------------------------------------------------------------------- /TrReg.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import copy 4 | from sklearn.tree import DecisionTreeRegressor 5 | import matplotlib.pyplot as plt 6 | from sklearn.ensemble import AdaBoostRegressor 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.model_selection import KFold 9 | from sklearn.metrics import r2_score 10 | from TR.TrAdaboostreg import * 11 | from TR.TTemp import * 12 | from TR.Mutisource import * 13 | ##============================================================================= 14 | 15 | # Example 1 16 | ##============================================================================= 17 | 18 | # 1. define the data generating function 19 | def response(x, d, random_state,rng_temp): 20 | """ 21 | x is the input variable 22 | d controls the simularity of different tasks 23 | """ 24 | # a1 = np.random.normal(1, 0.1 * d) 25 | # a2 = np.random.normal(1, 0.1 * d) 26 | # b1 = np.random.normal(1, 0.1 * d) 27 | # b2 = np.random.normal(1, 0.1 * d) 28 | # c1 = np.random.normal(1, 0.05 * d) 29 | # c2 = np.random.normal(1, 0.05 * d) 30 | a1 = rng_temp.normal(1, 0.1 * d) 31 | a2 = rng_temp.normal(1, 0.1 * d) 32 | b1 = rng_temp.normal(1, 0.1 * d) 33 | b2 = rng_temp.normal(1, 0.1 * d) 34 | c1 = rng_temp.normal(1, 0.05 * d) 35 | c2 = rng_temp.normal(1, 0.05 * d) 36 | y = a1 * np.sin(b1 * x + c1).ravel() + a2 * np.sin(b2 * 6 * x + c2).ravel() + random_state.normal(0, 0.1, 37 | x.shape[0]) 38 | return y 39 | 40 | 41 | # ============================================================================== 42 | 43 | # 2. 
decide the degree of similarity of multiple data sources using d 44 | 45 | d = 2 46 | # ============================================================================== 47 | rng = np.random.RandomState(11) 48 | rng_temp = np.random.RandomState(11) 49 | # 3.1 create source data and target data 50 | n_source1 = 50 51 | #x_source1 = np.linspace(0, 6, n_source1)[:, np.newaxis] 52 | 53 | x_source1=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 54 | y_source1 = response(x_source1, 0.5, rng,rng_temp) 55 | n_source2 = 100 56 | #x_source2 = np.linspace(0, 6, n_source2)[:, np.newaxis] 57 | x_source2=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 58 | y_source2 = response(x_source2, 2.0, rng,rng_temp) 59 | n_source3 = 100 60 | #x_source3 = np.linspace(0, 6, n_source3)[:, np.newaxis] 61 | x_source3=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 62 | y_source3 = response(x_source3, 6.5, rng,rng_temp) 63 | n_source4 = 100 64 | #x_source4 = np.linspace(0, 6, n_source4)[:, np.newaxis] 65 | x_source4=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 66 | y_source4 = response(x_source4, 6.0, rng,rng_temp) 67 | n_source5 = 100 68 | #x_source5 = np.linspace(0, 6, n_source5)[:, np.newaxis] 69 | x_source5=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 70 | y_source5 = response(x_source5, 5.0, rng,rng_temp) 71 | 72 | # 3.2 create target data (n_target_train and n_target_test are the sample size of train and test datasets) 73 | d=0.05 74 | rng_temp2 = np.random.RandomState(43) 75 | # a1 = np.random.normal(1, 0.1 * d) 76 | # a2 = np.random.normal(1, 0.1 * d) 77 | # b1 = np.random.normal(1, 0.1 * d) 78 | # b2 = np.random.normal(1, 0.1 * d) 79 | # c1 = np.random.normal(1, 0.05 * d) 80 | # c2 = np.random.normal(1, 0.05 * d) 81 | a1 = rng_temp2.normal(1, 0.1 * d) 82 | a2 = rng_temp2.normal(1, 0.1 * d) 83 | b1 = rng_temp2.normal(1, 0.1 * d) 84 | b2 = rng_temp2.normal(1, 0.1 * d) 85 | c1 = rng_temp2.normal(1, 0.05 * d) 86 | c2 = rng_temp2.normal(1, 0.05 * d) 87 | 88 | # target_train 89 | # ============================================================================== 90 | 91 | n_target_train = 40 92 | 93 | # ============================================================================== 94 | #x_target_train = np.linspace(0, 6, n_target_train)[:, np.newaxis] 95 | 96 | x_target_train=6*rng_temp2.random_sample(n_target_train)[:, np.newaxis] 97 | 98 | 99 | y_target_train = a1 * np.sin(b1 * x_target_train + c1).ravel() + a2 * np.sin( 100 | b2 * 6 * x_target_train + c2).ravel() + rng.normal(0, 0.1, x_target_train.shape[0]) 101 | 102 | # target_test 103 | n_target_test = 600 104 | #x_target_test = np.linspace(0, 6, n_target_test)[:, np.newaxis] 105 | 106 | x_target_test=6*rng_temp2.random_sample(n_target_test)[:, np.newaxis] 107 | y_target_test = a1 * np.sin(b1 * x_target_test + c1).ravel() + a2 * np.sin( 108 | b2 * 6 * x_target_test + c2).ravel() + rng.normal(0, 0.1, x_target_test.shape[0]) 109 | 110 | 111 | X = np.concatenate((x_source1, x_source2, x_source3, x_source4, x_source5)) 112 | y = np.concatenate((y_source1, y_source2, y_source3, y_source4, y_source5)) 113 | 114 | 115 | # ============================================================================== 116 | from sklearn import neighbors 117 | 118 | clf = DecisionTreeRegressor(max_depth=3) 119 | #clf = neighbors.KNeighborsRegressor() 120 | 121 | clf.fit(x_target_train,y_target_train) 122 | predict1=clf.predict(x_target_test) 123 | mse_twostageboost = mean_squared_error(y_target_test, predict1) 124 | print("MSE of tree:", mse_twostageboost) 125 | 
print("r2 of tree:", r2_score(y_target_test, predict1)) 126 | # ============================================================================== 127 | 128 | 129 | 130 | xlist=[x_source1,x_source2,x_source3,x_source4,x_source5] 131 | ylist=[y_source1, y_source2, y_source3, y_source4, y_source5] 132 | reslisttempx2 = [] 133 | reslisttemp2 = []; 134 | for i in range(4, 50, 1): 135 | predict = Mutisource_tradaboost( 136 | x_target_train, xlist, y_target_train, ylist, x_target_test, i,5,4*100+50) 137 | mse_twostageboost = mean_squared_error(y_target_test, predict) 138 | reslisttemp2.append(r2_score(y_target_test, predict)) 139 | reslisttempx2.append(i) 140 | print("r2 of tradaboost:", reslisttemp2) 141 | 142 | 143 | 144 | print("MSE of muti:", mse_twostageboost) 145 | print("r2 of muti:", r2_score(y_target_test, predict)) 146 | reslisttempx1 = [] 147 | reslisttemp1 = []; 148 | for i in range(4, 50, 1): 149 | predict = tradaboost( 150 | x_target_train, X, y_target_train, y, x_target_test, y_target_test, i, True) 151 | mse_twostageboost = mean_squared_error(y_target_test, predict) 152 | reslisttemp1.append(r2_score(y_target_test, predict)) 153 | reslisttempx1.append(i) 154 | print("r2 of tradaboost:", reslisttemp1) 155 | # 156 | # 157 | # reslisttempx2 = [] 158 | # reslisttemp2 = []; 159 | # for i in range(20, 300, 10): 160 | # predict = tradaboost( 161 | # x_target_train, X, y_target_train, y, x_target_test, y_target_test, i, False) 162 | # mse_twostageboost = mean_squared_error(y_target_test, predict) 163 | # reslisttemp2.append(r2_score(y_target_test, predict)) 164 | # reslisttempx2.append(i) 165 | # print("r2 of tradaboost:", reslisttemp2) 166 | # 167 | plt.plot(reslisttempx1, reslisttemp1, marker='*', linestyle='dashed', linewidth=1, label="tradaboost") 168 | plt.plot(reslisttempx2, reslisttemp2, marker='+', linestyle='dashed', linewidth=1, label="mutisource vfkmm-tradaboost") 169 | plt.plot(range(4, 50, 1), [0.70]*46, marker='_', linestyle='dashed', linewidth=1, label="baseline") 170 | 171 | plt.xlabel("Iterations") 172 | plt.ylabel("score") 173 | plt.legend(loc="lower right") 174 | plt.show() 175 | 176 | 177 | 178 | predict = tradaboost( 179 | x_target_train, X, y_target_train, y, x_target_test,y_target_test, 300,True) 180 | mse_twostageboost = mean_squared_error(y_target_test, predict) 181 | print("MSE of tradaboost:", mse_twostageboost) 182 | print("r2 of tradaboost:", r2_score(y_target_test, predict)) 183 | 184 | 185 | # 186 | # 187 | predict2 = tradaboost( 188 | x_target_train, X, y_target_train, y, x_target_test,y_target_test, 300,False) 189 | mse_twostageboost = mean_squared_error(y_target_test, predict2) 190 | print("MSE of tradaboost margin:", mse_twostageboost) 191 | print("r2 of tradaboost: margin", r2_score(y_target_test, predict2)) 192 | 193 | 194 | 195 | 196 | # 4.4 Plot the results 197 | plt.figure() 198 | plt.scatter(x_target_train, y_target_train, c="k", label="target_train") 199 | plt.plot(x_target_test, y_target_test, c="b", label="target_test", linewidth=0.5) 200 | plt.plot(x_target_test, predict1, c="r", label="AdaBoostRegressor", linewidth=2) 201 | # plt.plot(x_target_test, predict, c="g", label="VFKMM-TrAdaBoost without margin", linewidth=2) 202 | # plt.plot(x_target_test, predict2, c="y", label="VFKMM-TrAdaBoost", linewidth=2) 203 | plt.xlabel("x") 204 | plt.ylabel("y") 205 | plt.legend(loc="lower left") 206 | plt.title("mutisource VFKMM-TrAdaBoost Regressor") 207 | plt.legend() 208 | # plt.show() 209 | 
-------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import math 4 | from sklearn.datasets import make_gaussian_quantiles 5 | import sklearn.svm 6 | from sklearn.datasets import fetch_20newsgroups 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | from sklearn.naive_bayes import MultinomialNB 10 | from sklearn.pipeline import Pipeline 11 | from scipy import interp 12 | from scipy import sparse 13 | from sklearn.linear_model import LogisticRegression 14 | import TR.TrAdaboostkmm as kmmtr 15 | import TR.TrAdaboost2 as tr 16 | import TR.SPY as SPY 17 | 18 | 19 | import matplotlib as mpl 20 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 21 | from text import util 22 | from sklearn.metrics import roc_curve, auc 23 | 24 | from TR.classification_report import * 25 | import time 26 | 27 | 28 | def main(): 29 | 30 | 31 | 32 | # ''' 33 | # 定义一个二分类问题,分类rec和sci,但是目标领域和源领域的数据来源不同 34 | # 如何控制源领域和目标领域数据的数量 35 | # ''' 36 | 37 | 38 | # target_categories = ["rec.sport.hockey", "rec.motorcycles","sci.crypt", "sci.electronics"] 39 | # source_categories = ["rec.sport.baseball", "rec.autos","sci.med", "sci.space"] 40 | 41 | 42 | target_categories = ["rec.autos", "rec.sport.baseball","sci.med", "sci.space"] 43 | source_categories = ["rec.motorcycles", "rec.sport.hockey","sci.crypt", "sci.electronics"] 44 | 45 | #实验组1 46 | target_categories = ["rec.autos", "sci.med"] 47 | source_categories = ["rec.sport.hockey", "sci.electronics"] 48 | 49 | #实验组2 50 | target_categories = ["comp.graphics", "rec.autos"] 51 | source_categories = ["comp.os.ms-windows.misc", "rec.sport.hockey"] 52 | 53 | # target_categories = ["rec.sport.hockey", "rec.motorcycles"] 54 | # source_categories = ["sci.med", "sci.space"] 55 | 56 | target_categories = ["sci.crypt", "sci.space", "talk.politics.guns", "talk.politics.mideast"] 57 | source_categories = ["sci.electronics", "sci.med", "talk.politics.misc", "talk.religion.misc"] 58 | 59 | 60 | # 实验组1 61 | target_categories = ["rec.autos", "sci.med"] 62 | source_categories = ["rec.sport.hockey", "sci.electronics"] 63 | 64 | target = fetch_20newsgroups(subset='test',categories = target_categories, shuffle = True, random_state = 42) 65 | source= fetch_20newsgroups(subset='test',categories = source_categories, shuffle = True, random_state = 42) 66 | 67 | # source.data = source.data[0:1000] 68 | # source.target = source.target[0:1000] 69 | # 70 | target.data = target.data[0:400] 71 | target.target = target.target[0:400] 72 | 73 | print(target.target) 74 | print(target.target_names) 75 | print(source.target_names) 76 | 77 | print('目标源的大小', len(target.data), '辅助源的大小', len(source.data)) 78 | 79 | # # 80 | # target.target[target.target == 0] = 0 81 | # target.target[target.target == 1] = 0 82 | # target.target[target.target == 2] = 1 83 | # target.target[target.target == 3] = 1 84 | # # print(type(target.target)) 85 | # # print(target.target) 86 | # 87 | # source.target[source.target == 0] = 0 88 | # source.target[source.target == 1] = 0 89 | # source.target[source.target == 2] = 1 90 | # source.target[source.target == 3] = 1 91 | 92 | 93 | merge_target_source = np.concatenate((target.data, source.data), axis=0) 94 | merge_target_source_label = np.concatenate((target.target, source.target), axis=0) 95 | # print(set(merge_target_source_label)) 96 
| 97 | # refine emails - delete unwanted text form them 98 | util.refine_all_emails(merge_target_source) 99 | # feature Extractoin 100 | # BOW Bag Of Words 101 | TFIDF = util.bagOfWords(merge_target_source) 102 | #TFIDF = sklearn.feature_extraction.text.TfidfTransformer(use_idf=False).fit(TFIDF) 103 | #TFIDF = sklearn.feature_extraction.text.TfidfTransformer.transform(TFIDF) 104 | 105 | length=len(target.data) 106 | X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(TFIDF[:length], merge_target_source_label[:length], test_size=0.6,random_state = 0) 107 | 108 | #X_train_temp, X_test, y_train_temp, y_test=sklearn.cross_validation.train_test_split(X_test,y_test) 109 | 110 | print("测试集的大小",y_test.shape) 111 | TFIDF = np.array(TFIDF.toarray()) 112 | merge_target_source_label=np.array(merge_target_source_label) 113 | print((X_train.shape)) 114 | print((TFIDF.shape)) 115 | # build classifier 116 | # clf = sklearn.svm.LinearSVC() 117 | 118 | 119 | clf = LogisticRegression() 120 | 121 | 122 | # print("辅助数据集和目标数据集一起训练",split_test_classifier(clf, X, 123 | # np.concatenate((y_train[:,None], merge_target_source_label[length:,None]), axis=0) 124 | # ,X_test[0:200,:],y_test[0:200,None])) 125 | X=sparse.vstack((X_train[:,:], TFIDF[length:,:])) 126 | print("辅助数据集和目标数据集一起训练", split_test_classifier(clf, X, 127 | np.concatenate( 128 | (y_train[:, None], merge_target_source_label[length:, None]), axis=0) 129 | , X_test, y_test)) 130 | # X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(TFIDF, merge_target_source_label, test_size=0.4,random_state = 0) 131 | 132 | 133 | clf = LogisticRegression() 134 | # print("目标数据集单独训练",split_test_classifier(clf,X_train, y_train 135 | # , X_test[0:200,:], y_test[0:200,None])) 136 | print("目标数据集单独训练",split_test_classifier(clf,X_train, y_train 137 | , X_test, y_test)) 138 | #def fit(self, diff_train, diff_label, same_train, same_train_label, MAX_ITERATION=30): 139 | #model.fit(TFIDF[length:,:], merge_target_source_label[length:,None], X_train, y_train[:,None],MAX_ITERATION=100) 140 | 141 | # 142 | # predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = SPY.tradaboost(X_train, TFIDF[length:, :], 143 | # y_train[:, None], 144 | # merge_target_source_label[ 145 | # length:, None], 146 | # X_test, y_test, 55, True) 147 | # print_classification_report('LR-TRadaboost', predict1, y_test) 148 | 149 | start =time.clock() 150 | predict2, accuracy_scorelist2, recall_scorelist2, f1_scorelist2 = tr.tradaboost( 151 | X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, 85) 152 | print_classification_report('TRadaboost', predict2, y_test) 153 | end = time.clock() 154 | print('Running time: %s Seconds'%(end-start)) 155 | 156 | # 原生的tradaboost 157 | Predict = []; 158 | 159 | reslist = []; 160 | reslistx = []; 161 | # for i in range(5, 200, 10): 162 | # predict2, accuracy_scorelist2, recall_scorelist2, f1_scorelist2 = tr.tradaboost( 163 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, i) 164 | # reslist.append(return_correct_rate(predict2, y_test)) 165 | # reslistx.append(i) 166 | #Predict = predict2 167 | 168 | # 169 | # reslist=[0.775, 0.825, 0.875, 0.875, 0.8875, 0.8958333333333334, 0.8958333333333334, 0.8791666666666667, 0.8875, 170 | # 0.8833333333333333, 0.8833333333333333, 0.8916666666666667, 0.9166666666666666, 0.9208333333333333, 0.9125, 171 | # 0.9208333333333333, 0.9125, 0.9125, 0.9041666666666667, 0.9] 172 | 173 
| #print_classification_report('TRadaboost', Predict, y_test) 174 | print(reslist) 175 | 176 | 177 | # plt.plot(reslistx, reslist, marker='+', linestyle='dashed', linewidth=1,label="tradaboost") # plt.plot(range(5,31,5), accuracy_scorelist[4:30:5],marker='x', linestyle='dashed',linewidth=1,label="vfkmm without eliminate") 178 | # plt.xlabel("迭代次数") 179 | # plt.ylabel("score") 180 | # plt.legend(loc="lower right") 181 | # plt.show() 182 | 183 | 184 | start = time.clock() 185 | #kmm排除低权重的样本 186 | predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = kmmtr.tradaboost(X_train, TFIDF[length:, :], 187 | y_train[:, None], 188 | merge_target_source_label[length:, None], 189 | X_test, y_test,20, True) 190 | # 191 | # 192 | print_classification_report('KMM-TRadaboost',predict1,y_test) 193 | 194 | 195 | end = time.clock() 196 | print('Running time: %s Seconds' % (end - start)) 197 | 198 | SPY 199 | reslisttempx = [] 200 | reslisttemp = []; 201 | for i in range(5, 80, 2): 202 | # predict3, accuracy_scorelist, recall_scorelist, f1_scorelist = SPY.tradaboost( 203 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, i, 204 | # True) 205 | # reslisttemp.append(return_correct_rate(predict3, y_test)) 206 | reslisttempx.append(i) 207 | #Predict = predict3 208 | #print_classification_report('lr-TRadaboost', Predict, y_test) 209 | reslisttemp=[0.875, 0.8791666666666667, 0.8958333333333334, 0.9083333333333333, 0.9041666666666667, 0.8916666666666667, 0.9, 0.9, 0.9041666666666667, 0.9083333333333333, 0.9, 0.9, 0.9083333333333333, 0.9041666666666667, 0.9041666666666667, 0.9083333333333333, 0.9041666666666667, 0.9041666666666667, 0.9041666666666667, 0.9041666666666667, 0.9041666666666667, 0.9125, 0.9208333333333333, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.925, 0.925, 0.9208333333333333, 0.9208333333333333, 0.9166666666666666, 0.9166666666666666, 0.9083333333333333, 0.9041666666666667] 210 | 211 | print(reslisttemp) 212 | print(reslisttempx) 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | reslisttempx1 = [] 221 | reslisttemp1 = []; 222 | for i in range(5, 80, 2): 223 | predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = kmmtr.tradaboost(X_train, TFIDF[length:, :], 224 | y_train[:, None], 225 | merge_target_source_label[ 226 | length:, None], 227 | X_test, y_test, i, True) 228 | reslisttemp1.append(return_correct_rate(predict1, y_test)) 229 | reslisttempx1.append(i) 230 | Predict = predict1 231 | print_classification_report('KMM-TRadaboost', Predict, y_test) 232 | print(reslisttemp1) 233 | print(reslisttempx1) 234 | 235 | plt.plot(reslisttempx1, reslisttemp1, marker='o', linestyle='dashed', linewidth=1, label="vfkmm tradaboost") 236 | plt.plot(reslistx, reslist, marker='+', linestyle='dashed', linewidth=1,label="tradaboost") # plt.plot(range(5,31,5), accuracy_scorelist[4:30:5],marker='x', linestyle='dashed',linewidth=1,label="vfkmm without eliminate") 237 | plt.plot(reslisttempx, reslisttemp, marker='x', linestyle='dashed', linewidth=1, label="LR tradaboost") 238 | plt.xlabel("迭代次数") 239 | plt.ylabel("score") 240 | plt.legend(loc="lower right") 241 | plt.show() 242 | 243 | 244 | 245 | # # SPY 246 | # reslisttempx=[] 247 | # reslisttemp = []; 248 | # for i in range(5,100,2): 249 | # predict3, accuracy_scorelist, recall_scorelist, f1_scorelist = SPY.tradaboost( 250 | # X_train, TFIDF[length:, :], y_train[:, None], 
251 | # True)
252 | # reslisttemp.append(return_correct_rate(predict3, y_test))
253 | # reslisttempx.append(i)
254 | # Predict = predict3
255 | # print_classification_report('lr-TRadaboost', Predict, y_test)
256 | # print(reslisttemp)
257 | 
258 | # predict, accuracy_scorelist, recall_scorelist, f1_scorelist = SPY.tradaboost(
259 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, 62, True)
260 | # print_classification_report('SPY', predict, y_test)
261 | #
262 | # predict2, accuracy_scorelist2, recall_scorelist2, f1_scorelist2 = tr.tradaboost(
263 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, 100)
264 | # print_classification_report('TRadaboost', predict2, y_test)
265 | 
266 | 
267 | 
268 | 
269 | 
270 | 
271 | 
272 | # # KMM: drop the low-weight samples
273 | # predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = kmmtr.tradaboost(X_train, TFIDF[length:, :],
274 | # y_train[:, None],
275 | # merge_target_source_label[length:, None],
276 | # X_test, y_test, 65, True)
277 | # #
278 | # #
279 | # print_classification_report('KMM-TRadaboost', predict1, y_test)
280 | # X_test, i), y_test)
281 | # plot the ROC curves and compute the AUC; first rescale the returned score lists to [0, 1]
282 | # min_max_scaler = sklearn.preprocessing.MinMaxScaler()
283 | #
284 | #
285 | # predict = np.asarray(predict)
286 | # predict = predict.reshape(len(y_test), 1)
287 | # predict = min_max_scaler.fit_transform(predict)
288 | # predict = predict.tolist()
289 | # fpr1, tpr1, thresholds1 = roc_curve(y_test, predict)
290 | # roc_auc1 = auc(fpr1, tpr1)
291 | #
292 | # # predict1 = np.asarray(predict1)
293 | # # predict1 = predict1.reshape(len(y_test), 1)
294 | # # predict1 = min_max_scaler.fit_transform(predict1)
295 | # # predict1 = predict1.tolist()
296 | # mean_tpr = 0.0
297 | # mean_fpr = np.linspace(0, 1, 100)
298 | # all_tpr = []
299 | # fpr, tpr, thresholds = roc_curve(y_test, predict1)  # the positive class must be given explicitly via pos_label if it is not 1
300 | # mean_tpr += interp(mean_fpr, fpr, tpr)  # interpolate mean_tpr at the mean_fpr grid using scipy's interp()
301 | # mean_tpr[0] = 0.0
302 | # roc_auc = auc(fpr, tpr)
303 | # plt.plot(fpr, tpr, lw=1, label='vfkmm-tradaboost AUC = %0.2f' % roc_auc)
304 | # plt.plot(fpr1, tpr1, lw=1, label='tradaboost AUC = %0.2f' % roc_auc1)
305 | # plt.legend(loc='lower right')
306 | # plt.plot([0, 1], [0, 1], 'm--', c='#666666')
307 | # plt.show()
308 | 
309 | 
310 | 
311 | plt.plot(reslisttempx, reslisttemp, marker='x', linestyle='dashed', linewidth=1, label="LR tradaboost")
312 | plt.plot(reslisttempx1, reslisttemp1, marker='o', linestyle='dashed', linewidth=1, label="vfkmm tradaboost")
313 | plt.plot(reslistx, reslist, marker='+', linestyle='dashed', linewidth=1, label="tradaboost") # plt.plot(range(5,31,5), accuracy_scorelist[4:30:5], marker='x', linestyle='dashed', linewidth=1, label="vfkmm without eliminate")
314 | # plt.plot(range(5,31,5), accuracy_scorelist1[4:30:5], marker='o', linestyle='dashed', linewidth=1, label="vfkmm eliminate")
315 | plt.xlabel("number of iterations")
316 | plt.ylabel("score")
317 | plt.legend(loc="lower right")
318 | plt.show()
319 | #res = return_correct_rate(tradaboost(X_train, TFIDF[length:,:], y_train[:,None], merge_target_source_label[length:,None], X_test[0:200,:], 100), y_test[0:200,None])
320 | 
321 | 
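The commented-out block above outlines how the ROC curves of the two boosters were compared: rescale each returned score list to [0, 1] with MinMaxScaler, compute fpr/tpr with roc_curve, and plot both curves with their AUC. A self-contained sketch of that comparison on made-up scores (standing in for predict and predict1) could look like the following; the toy labels and score vectors are illustrative only.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import MinMaxScaler

# made-up binary labels and raw scores standing in for y_test, predict (tradaboost)
# and predict1 (vfkmm-tradaboost)
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
scores_a = np.array([0.2, 0.4, 0.35, 0.8, 0.1, 0.7, 0.65, 0.3])
scores_b = np.array([0.3, 0.2, 0.6, 0.9, 0.15, 0.8, 0.7, 0.25])

# rescale the scores to [0, 1], as in the commented code above
scaler = MinMaxScaler()
scores_a = scaler.fit_transform(scores_a.reshape(-1, 1)).ravel()
scores_b = scaler.fit_transform(scores_b.reshape(-1, 1)).ravel()

fpr_a, tpr_a, _ = roc_curve(y_true, scores_a)  # pos_label defaults to 1 here
fpr_b, tpr_b, _ = roc_curve(y_true, scores_b)
plt.plot(fpr_a, tpr_a, lw=1, label='tradaboost AUC = %0.2f' % auc(fpr_a, tpr_a))
plt.plot(fpr_b, tpr_b, lw=1, label='vfkmm-tradaboost AUC = %0.2f' % auc(fpr_b, tpr_b))
plt.plot([0, 1], [0, 1], linestyle='--', c='#666666')  # chance line
plt.legend(loc='lower right')
plt.show()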
322 | from sklearn import svm
323 | def naive_model_return_error(train, y, test, test_y):
324 | """implement a comparative method as a naive model"""
325 | #model = sklearn.linear_model.LogisticRegression(C=10000, penalty='l1', tol=0.0001)
326 | model = svm.SVC(C=131072, gamma=0.0001, kernel='rbf', probability=True)
327 | model.fit(train, y)
328 | preds = model.predict(test)
329 | c = 0
330 | for i in range(len(preds)):
331 | if preds[i] == test_y[i]:
332 | c += 1
333 | res = c / len(test_y)
334 | return res
335 | 
336 | 
337 | def return_correct_rate(preds, target):
338 | c = 0
339 | for i in range(len(preds)):
340 | if preds[i] == target[i]:
341 | c += 1
342 | res = c / len(target)
343 | #print("accuracy", np.mean(preds == target), 'recall', recall_score(preds, target), 'F1 score', f1_score(preds, target))
344 | return res
345 | 
346 | def precision_score(y_true, y_pred):
347 | return ((y_true == 1) * (y_pred == 1)).sum() / (y_pred == 1).sum()
348 | def recall_score(y_true, y_pred):
349 | return ((y_true == 1) * (y_pred == 1)).sum() / (y_true == 1).sum()
350 | def f1_score(y_true, y_pred):
351 | num = 2 * precision_score(y_true, y_pred) * recall_score(y_true, y_pred)
352 | deno = (precision_score(y_true, y_pred) + recall_score(y_true, y_pred))
353 | return num / deno
354 | def split_test_classifier(clf, X, y, X_test, y_test):
355 | 
356 | clf.fit(X, y)
357 | # predict
358 | y_predicted = clf.predict(X_test)
359 | # calculate precision
360 | print_classification_report('', y_predicted, y_test)
361 | print("accuracy", np.mean(y_predicted == y_test), 'recall', recall_score(y_test, y_predicted), 'F1 score', f1_score(y_test, y_predicted))
362 | return np.mean(y_predicted == y_test)
363 | 
364 | def plot_results(i, results_list, labels_list):
365 | colors_list = ['red', 'blue', 'black', 'green', 'cyan', 'yellow']
366 | 
367 | if not len(results_list) == len(labels_list):
368 | raise ValueError("results_list and labels_list must have the same length")
369 | 
370 | for (result, label, color) in zip(results_list, labels_list, colors_list):
371 | plt.plot(i, result, color=color, lw=2.0, label=label)
372 | plt.legend()
373 | plt.show()
374 | if __name__ == "__main__":
375 | main()
376 | 
377 | 
378 | 
--------------------------------------------------------------------------------
/TrAdaboost.R2.py:
--------------------------------------------------------------------------------
1 | """
2 | An example showing the usage of the TwoStageTrAdaBoostR2 algorithm.
3 | Example starts at line 396.
4 | """ 5 | 6 | import numpy as np 7 | import copy 8 | from sklearn.tree import DecisionTreeRegressor 9 | import matplotlib.pyplot as plt 10 | from sklearn.ensemble import AdaBoostRegressor 11 | from sklearn.metrics import mean_squared_error 12 | from sklearn.model_selection import KFold 13 | from sklearn.metrics import r2_score 14 | 15 | # import matplotlib as mpl 16 | # mpl.rcParams['font.sans-serif'] = [u'SimHei'] 17 | ##============================================================================= 18 | 19 | # copy the two classes from TwoStageTrAdaBoostR2 algorithm 20 | 21 | ##============================================================================= 22 | class Stage2_TrAdaBoostR2: 23 | def __init__(self, 24 | base_estimator=DecisionTreeRegressor(max_depth=4), 25 | sample_size=None, 26 | n_estimators=50, 27 | learning_rate=1., 28 | loss='linear', 29 | random_state=np.random.mtrand._rand, 30 | margin=True): 31 | self.base_estimator = base_estimator 32 | self.sample_size = sample_size 33 | self.n_estimators = n_estimators 34 | self.learning_rate = learning_rate 35 | self.loss = loss 36 | self.random_state = random_state 37 | self.margin=margin 38 | 39 | 40 | def fit(self, X, y, sample_weight=None): 41 | # Check parameters 42 | if self.learning_rate <= 0: 43 | raise ValueError("learning_rate must be greater than zero") 44 | 45 | if sample_weight is None: 46 | # Initialize weights to 1 / n_samples 47 | sample_weight = np.empty(X.shape[0], dtype=np.float64) 48 | sample_weight[:] = 1. / X.shape[0] 49 | else: 50 | # Normalize existing weights 51 | sample_weight = sample_weight / sample_weight.sum(dtype=np.float64) 52 | 53 | # Check that the sample weights sum is positive 54 | if sample_weight.sum() <= 0: 55 | raise ValueError( 56 | "Attempting to fit with a non-positive " 57 | "weighted number of samples.") 58 | 59 | if self.sample_size is None: 60 | raise ValueError("Additional input required: sample size of source and target is missing") 61 | elif np.array(self.sample_size).sum() != X.shape[0]: 62 | raise ValueError("Input error: the specified sample size does not equal to the input size") 63 | 64 | # Clear any previous fit results 65 | self.estimators_ = [] 66 | self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) 67 | self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) 68 | 69 | for iboost in range( 70 | self.n_estimators): # this for loop is sequential and does not support parallel(revison is needed if making parallel) 71 | # Boosting step 72 | sample_weight, estimator_weight, estimator_error = self._stage2_adaboostR2( 73 | iboost, 74 | X, y, 75 | sample_weight) 76 | # Early termination 77 | if sample_weight is None: 78 | break 79 | 80 | self.estimator_weights_[iboost] = estimator_weight 81 | self.estimator_errors_[iboost] = estimator_error 82 | 83 | # Stop if error is zero 84 | if estimator_error == 0: 85 | break 86 | 87 | sample_weight_sum = np.sum(sample_weight) 88 | 89 | # Stop if the sum of sample weights has become non-positive 90 | if sample_weight_sum <= 0: 91 | break 92 | 93 | if iboost < self.n_estimators - 1: 94 | # Normalize 95 | sample_weight /= sample_weight_sum 96 | return self 97 | 98 | def _stage2_adaboostR2(self, iboost, X, y, sample_weight): 99 | 100 | estimator = copy.deepcopy( 101 | self.base_estimator) # some estimators allow for specifying random_state estimator = base_estimator(random_state=random_state) 102 | 103 | ## using sampling method to account for sample_weight as discussed in Drucker's paper 104 | # Weighted sampling 
of the training set with replacement 105 | cdf = np.cumsum(sample_weight) 106 | cdf /= cdf[-1] 107 | uniform_samples = self.random_state.random_sample(X.shape[0]) 108 | bootstrap_idx = cdf.searchsorted(uniform_samples, side='right') 109 | # searchsorted returns a scalar 110 | bootstrap_idx = np.array(bootstrap_idx, copy=False) 111 | 112 | # Fit on the bootstrapped sample and obtain a prediction 113 | # for all samples in the training set 114 | estimator.fit(X[bootstrap_idx], y[bootstrap_idx]) 115 | y_predict = estimator.predict(X) 116 | 117 | self.estimators_.append(estimator) # add the fitted estimator 118 | 119 | 120 | error_vect = np.abs(y_predict - y) 121 | 122 | if(self.margin): 123 | error_vect[error_vect <0.05] = 0 124 | error_max = error_vect.max() 125 | 126 | if error_max != 0.: 127 | error_vect /= error_max 128 | 129 | if self.loss == 'square': 130 | error_vect **= 2 131 | elif self.loss == 'exponential': 132 | error_vect = 1. - np.exp(- error_vect) 133 | 134 | # Calculate the average loss 135 | estimator_error = (sample_weight * error_vect).sum() 136 | 137 | if estimator_error <= 0: 138 | # Stop if fit is perfect 139 | return sample_weight, 1., 0. 140 | 141 | elif estimator_error >= 0.5: 142 | # Discard current estimator only if it isn't the only one 143 | if len(self.estimators_) > 1: 144 | self.estimators_.pop(-1) 145 | return None, None, None 146 | 147 | beta = estimator_error / (1. - estimator_error) 148 | # avoid overflow of np.log(1. / beta) 149 | if beta < 1e-308: 150 | beta = 1e-308 151 | estimator_weight = self.learning_rate * np.log(1. / beta) 152 | 153 | # Boost weight using AdaBoost.R2 alg except the weight of the source data 154 | # the weight of the source data are remained 155 | source_weight_sum = np.sum(sample_weight[:-self.sample_size[-1]]) / np.sum(sample_weight) 156 | target_weight_sum = np.sum(sample_weight[-self.sample_size[-1]:]) / np.sum(sample_weight) 157 | 158 | if not iboost == self.n_estimators - 1: 159 | sample_weight[-self.sample_size[-1]:] *= np.power( 160 | beta,(error_vect[-self.sample_size[-1]:]) * self.learning_rate) 161 | #(1. - error_vect[-self.sample_size[-1]:]) * self.learning_rate) 162 | # make the sum weight of the source data not changing 163 | source_weight_sum_new = np.sum(sample_weight[:-self.sample_size[-1]]) / np.sum(sample_weight) 164 | target_weight_sum_new = np.sum(sample_weight[-self.sample_size[-1]:]) / np.sum(sample_weight) 165 | if source_weight_sum_new != 0. 
and target_weight_sum_new != 0.: 166 | sample_weight[:-self.sample_size[-1]] = sample_weight[:-self.sample_size[ 167 | -1]] * source_weight_sum / source_weight_sum_new 168 | sample_weight[-self.sample_size[-1]:] = sample_weight[-self.sample_size[ 169 | -1]:] * target_weight_sum / target_weight_sum_new 170 | 171 | return sample_weight, estimator_weight, estimator_error 172 | 173 | def predict(self, X): 174 | # Evaluate predictions of all estimators 175 | predictions = np.array([ 176 | est.predict(X) for est in self.estimators_[:len(self.estimators_)]]).T 177 | 178 | # Sort the predictions 179 | sorted_idx = np.argsort(predictions, axis=1) 180 | 181 | # Find index of median prediction for each sample 182 | weight_cdf = np.cumsum(self.estimator_weights_[sorted_idx], axis=1) 183 | median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 184 | median_idx = median_or_above.argmax(axis=1) 185 | 186 | median_estimators = sorted_idx[np.arange(X.shape[0]), median_idx] 187 | 188 | # Return median predictions 189 | return predictions[np.arange(X.shape[0]), median_estimators] 190 | 191 | 192 | class TwoStageTrAdaBoostR2: 193 | def __init__(self, 194 | base_estimator=DecisionTreeRegressor(max_depth=4), 195 | sample_size=None, 196 | n_estimators=50, 197 | steps=10, 198 | fold=5, 199 | learning_rate=1., 200 | loss='linear', 201 | random_state=np.random.mtrand._rand, 202 | margin=True 203 | ): 204 | self.base_estimator = base_estimator 205 | self.sample_size = sample_size 206 | self.n_estimators = n_estimators 207 | self.steps = steps 208 | self.fold = fold 209 | self.learning_rate = learning_rate 210 | self.loss = loss 211 | self.random_state = random_state 212 | self.margin=margin 213 | 214 | def fit(self, X, y, sample_weight=None): 215 | # Check parameters 216 | if self.learning_rate <= 0: 217 | raise ValueError("learning_rate must be greater than zero") 218 | 219 | if sample_weight is None: 220 | # Initialize weights to 1 / n_samples 221 | sample_weight = np.empty(X.shape[0], dtype=np.float64) 222 | sample_weight[:] = 1. 
/ X.shape[0]
223 | else:
224 | # Normalize existing weights
225 | sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)
226 | 
227 | # Check that the sample weights sum is positive
228 | if sample_weight.sum() <= 0:
229 | raise ValueError(
230 | "Attempting to fit with a non-positive "
231 | "weighted number of samples.")
232 | 
233 | if self.sample_size is None:
234 | raise ValueError("Additional input required: sample size of source and target is missing")
235 | elif np.array(self.sample_size).sum() != X.shape[0]:
236 | raise ValueError("Input error: the specified sample size does not equal the input size")
237 | 
238 | X_source = X[:-self.sample_size[-1]]
239 | y_source = y[:-self.sample_size[-1]]
240 | X_target = X[-self.sample_size[-1]:]
241 | y_target = y[-self.sample_size[-1]:]
242 | 
243 | self.models_ = []
244 | self.errors_ = []
245 | for istep in range(self.steps):
246 | model = Stage2_TrAdaBoostR2(self.base_estimator,
247 | sample_size=self.sample_size,
248 | n_estimators=self.n_estimators,
249 | learning_rate=self.learning_rate, loss=self.loss,
250 | random_state=self.random_state, margin=self.margin)
251 | model.fit(X, y, sample_weight=sample_weight)
252 | self.models_.append(model)
253 | # cv training
254 | kf = KFold(n_splits=self.fold)
255 | error = []
256 | target_weight = sample_weight[-self.sample_size[-1]:]
257 | source_weight = sample_weight[:-self.sample_size[-1]]
258 | for train, test in kf.split(X_target):
259 | sample_size = [self.sample_size[0], len(train)]
260 | model = Stage2_TrAdaBoostR2(self.base_estimator,
261 | sample_size=sample_size,
262 | n_estimators=self.n_estimators,
263 | learning_rate=self.learning_rate, loss=self.loss,
264 | random_state=self.random_state, margin=self.margin)
265 | X_train = np.concatenate((X_source, X_target[train]))
266 | y_train = np.concatenate((y_source, y_target[train]))  # source labels plus the target CV-train fold labels
267 | X_test = X_target[test]
268 | y_test = y_target[test]
269 | # make sure the summed weight of the target data does not change with the CV split's sampling
270 | target_weight_train = target_weight[train] * np.sum(target_weight) / np.sum(target_weight[train])
271 | model.fit(X_train, y_train, sample_weight=np.concatenate((source_weight, target_weight_train)))
272 | y_predict = model.predict(X_test)
273 | error.append(mean_squared_error(y_predict, y_test))
274 | 
275 | self.errors_.append(np.array(error).mean())
276 | 
277 | sample_weight = self._twostage_adaboostR2(istep, X, y, sample_weight)
278 | 
279 | if sample_weight is None:
280 | break
281 | if np.array(error).mean() == 0:
282 | break
283 | 
284 | sample_weight_sum = np.sum(sample_weight)
285 | 
286 | # Stop if the sum of sample weights has become non-positive
287 | if sample_weight_sum <= 0:
288 | break
289 | 
290 | if istep < self.steps - 1:
291 | # Normalize
292 | sample_weight /= sample_weight_sum
293 | return self
294 | 
295 | def _twostage_adaboostR2(self, istep, X, y, sample_weight):
296 | 
297 | estimator = copy.deepcopy(
298 | self.base_estimator) # some estimators allow for specifying random_state estimator = base_estimator(random_state=random_state)
299 | 
300 | ## using sampling method to account for sample_weight as discussed in Drucker's paper
301 | # Weighted sampling of the training set with replacement
302 | cdf = np.cumsum(sample_weight)
303 | cdf /= cdf[-1]
304 | uniform_samples = self.random_state.random_sample(X.shape[0])
305 | bootstrap_idx = cdf.searchsorted(uniform_samples, side='right')
306 | # searchsorted returns a scalar
307 | bootstrap_idx = np.array(bootstrap_idx, copy=False)
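# Illustration of the weighted bootstrap above: turning sample_weight into a CDF and
# mapping uniform draws back through searchsorted picks index i with probability
# proportional to sample_weight[i]. For example, with sample_weight = [0.1, 0.2, 0.7]
# the CDF is [0.1, 0.3, 1.0]; a uniform draw of 0.25 lands between 0.1 and 0.3 and
# selects index 1, while any draw above 0.3 selects index 2, so heavily weighted
# samples are resampled more often. This is the resampling trick from Drucker's
# AdaBoost.R2 for base estimators that do not accept sample_weight directly.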
308 | 309 | # Fit on the bootstrapped sample and obtain a prediction 310 | # for all samples in the training set 311 | estimator.fit(X[bootstrap_idx], y[bootstrap_idx]) 312 | y_predict = estimator.predict(X) 313 | 314 | error_vect = np.abs(y_predict - y) 315 | error_max = error_vect.max() 316 | 317 | if error_max != 0.: 318 | error_vect /= error_max 319 | 320 | if self.loss == 'square': 321 | error_vect **= 2 322 | elif self.loss == 'exponential': 323 | error_vect = 1. - np.exp(- error_vect) 324 | 325 | # Update the weight vector 326 | beta = self._beta_binary_search(istep, sample_weight, error_vect, stp=1e-30) 327 | 328 | if not istep == self.steps - 1: 329 | sample_weight[:-self.sample_size[-1]] *= np.power( 330 | beta, 331 | (error_vect[:-self.sample_size[-1]]) * self.learning_rate) 332 | return sample_weight 333 | 334 | def _beta_binary_search(self, istep, sample_weight, error_vect, stp): 335 | # calculate the specified sum of weight for the target data 336 | n_target = self.sample_size[-1] 337 | n_source = np.array(self.sample_size).sum() - n_target 338 | theoretical_sum = n_target / (n_source + n_target) + istep / (self.steps - 1) * ( 339 | 1 - n_target / (n_source + n_target)) 340 | # for the last iteration step, beta is 0. 341 | if istep == self.steps - 1: 342 | beta = 0. 343 | return beta 344 | # binary search for beta 345 | L = 0. 346 | R = 1. 347 | beta = (L + R) / 2 348 | sample_weight_ = copy.deepcopy(sample_weight) 349 | sample_weight_[:-n_target] *= np.power( 350 | beta, 351 | (error_vect[:-n_target]) * self.learning_rate) 352 | sample_weight_ /= np.sum(sample_weight_, dtype=np.float64) 353 | updated_weight_sum = np.sum(sample_weight_[-n_target:], dtype=np.float64) 354 | 355 | while np.abs(updated_weight_sum - theoretical_sum) > 0.01: 356 | if updated_weight_sum < theoretical_sum: 357 | R = beta - stp 358 | if R > L: 359 | beta = (L + R) / 2 360 | sample_weight_ = copy.deepcopy(sample_weight) 361 | sample_weight_[:-n_target] *= np.power( 362 | beta, 363 | (error_vect[:-n_target]) * self.learning_rate) 364 | sample_weight_ /= np.sum(sample_weight_, dtype=np.float64) 365 | updated_weight_sum = np.sum(sample_weight_[-n_target:], dtype=np.float64) 366 | else: 367 | print("At step:", istep + 1) 368 | print("Binary search's goal not meeted! Value is set to be the available best!") 369 | print("Try reducing the search interval. Current stp interval:", stp) 370 | break 371 | 372 | elif updated_weight_sum > theoretical_sum: 373 | L = beta + stp 374 | if L < R: 375 | beta = (L + R) / 2 376 | sample_weight_ = copy.deepcopy(sample_weight) 377 | sample_weight_[:-n_target] *= np.power( 378 | beta, 379 | (error_vect[:-n_target]) * self.learning_rate) 380 | sample_weight_ /= np.sum(sample_weight_, dtype=np.float64) 381 | updated_weight_sum = np.sum(sample_weight_[-n_target:], dtype=np.float64) 382 | else: 383 | print("At step:", istep + 1) 384 | print("Binary search's goal not meeted! Value is set to be the available best!") 385 | print("Try reducing the search interval. 
Current stp interval:", stp) 386 | break 387 | return beta 388 | 389 | def predict(self, X): 390 | # select the model with the least CV error 391 | fmodel = self.models_[np.array(self.errors_).argmin()] 392 | predictions = fmodel.predict(X) 393 | return predictions 394 | 395 | ##============================================================================= 396 | 397 | 398 | # end copying the two classes 399 | 400 | ##============================================================================= 401 | 402 | # Example 1 403 | 404 | ##============================================================================= 405 | 406 | # 1. define the data generating function 407 | def response(x, d, random_state): 408 | """ 409 | x is the input variable 410 | d controls the simularity of different tasks 411 | """ 412 | a1 = np.random.normal(1, 0.1 * d) 413 | a2 = np.random.normal(1, 0.1 * d) 414 | b1 = np.random.normal(1, 0.1 * d) 415 | b2 = np.random.normal(1, 0.1 * d) 416 | c1 = np.random.normal(1, 0.05 * d) 417 | c2 = np.random.normal(1, 0.05 * d) 418 | y = a1 * np.sin(b1 * x + c1).ravel() + a2 * np.sin(b2 * 6 * x + c2).ravel() + random_state.normal(0, 0.1, 419 | x.shape[0]) 420 | return y 421 | 422 | 423 | # ============================================================================== 424 | 425 | # 2. decide the degree of similarity of multiple data sources using d 426 | 427 | d = 5 428 | # ============================================================================== 429 | rng = np.random.RandomState(1) 430 | 431 | # 3.1 create source data and target data 432 | n_source1 = 100 433 | x_source1 = np.linspace(0, 6, n_source1)[:, np.newaxis] 434 | y_source1 = response(x_source1, d, rng) 435 | n_source2 = 100 436 | x_source2 = np.linspace(0, 6, n_source2)[:, np.newaxis] 437 | y_source2 = response(x_source2, d, rng) 438 | n_source3 = 100 439 | x_source3 = np.linspace(0, 6, n_source3)[:, np.newaxis] 440 | y_source3 = response(x_source3, d, rng) 441 | n_source4 = 100 442 | x_source4 = np.linspace(0, 6, n_source4)[:, np.newaxis] 443 | y_source4 = response(x_source4, d, rng) 444 | n_source5 = 100 445 | x_source5 = np.linspace(0, 6, n_source5)[:, np.newaxis] 446 | y_source5 = response(x_source5, d, rng) 447 | 448 | # 3.2 create target data (n_target_train and n_target_test are the sample size of train and test datasets) 449 | a1 = np.random.normal(1, 0.1 * d) 450 | a2 = np.random.normal(1, 0.1 * d) 451 | b1 = np.random.normal(1, 0.1 * d) 452 | b2 = np.random.normal(1, 0.1 * d) 453 | c1 = np.random.normal(1, 0.05 * d) 454 | c2 = np.random.normal(1, 0.05 * d) 455 | 456 | # target_train 457 | # ============================================================================== 458 | 459 | n_target_train = 15 460 | 461 | # ============================================================================== 462 | x_target_train = np.linspace(0, 6, n_target_train)[:, np.newaxis] 463 | y_target_train = a1 * np.sin(b1 * x_target_train + c1).ravel() + a2 * np.sin( 464 | b2 * 6 * x_target_train + c2).ravel() + rng.normal(0, 0.1, x_target_train.shape[0]) 465 | 466 | # target_test 467 | n_target_test = 600 468 | x_target_test = np.linspace(0, 6, n_target_test)[:, np.newaxis] 469 | y_target_test = a1 * np.sin(b1 * x_target_test + c1).ravel() + a2 * np.sin( 470 | b2 * 6 * x_target_test + c2).ravel() + rng.normal(0, 0.1, x_target_test.shape[0]) 471 | 472 | # 3.3 plot the generated data 473 | plt.figure() 474 | plt.plot(x_source1, y_source1, c="r", label="source1", linewidth=1) 475 | plt.plot(x_source2, y_source2, c="y", label="source2", 
linewidth=1) 476 | plt.plot(x_source3, y_source3, c="g", label="source3", linewidth=1) 477 | plt.plot(x_source4, y_source4, c="c", label="source4", linewidth=1) 478 | plt.plot(x_source5, y_source5, c="m", label="source5", linewidth=1) 479 | plt.plot(x_target_test, y_target_test, c="b", label="target_test", linewidth=0.5) 480 | plt.scatter(x_target_train, y_target_train, c="k", label="target_train") 481 | plt.xlabel("x") 482 | plt.ylabel("y") 483 | plt.title("Multiple datasets") 484 | plt.legend() 485 | plt.show() 486 | 487 | # 4. transfer learning regressiong for the target_train data 488 | # 4.1 data combination and initial setting specification 489 | X = np.concatenate((x_source1, x_source2, x_source3, x_source4, x_source5, x_target_train)) 490 | y = np.concatenate((y_source1, y_source2, y_source3, y_source4, y_source5, y_target_train)) 491 | sample_size = [n_source1 + n_source2 + n_source3 + n_source4 + n_source5, n_target_train] 492 | 493 | # ============================================================================== 494 | 495 | n_estimators = 10 496 | steps = 100 497 | fold = 5 498 | random_state = np.random.RandomState(1) 499 | 500 | # ============================================================================== 501 | 502 | # 4.2 TwoStageAdaBoostR2 503 | regr_1 = TwoStageTrAdaBoostR2(DecisionTreeRegressor(max_depth=6), 504 | n_estimators=n_estimators, sample_size=sample_size, 505 | steps=steps, fold=fold, 506 | random_state=random_state) 507 | regr_1.fit(X, y) 508 | y_pred1 = regr_1.predict(x_target_test) 509 | 510 | # 4.3 As comparision, use AdaBoostR2 without transfer learning 511 | # ============================================================================== 512 | 513 | # ============================================================================== 514 | 515 | # 4.2 TwoStageAdaBoostR2 without margin 516 | regr_3 = TwoStageTrAdaBoostR2(DecisionTreeRegressor(max_depth=6), 517 | n_estimators=n_estimators, sample_size=sample_size, 518 | steps=steps, fold=fold, 519 | random_state=random_state,margin=False) 520 | regr_3.fit(X, y) 521 | y_pred3 = regr_3.predict(x_target_test) 522 | 523 | # 4.3 As comparision, use AdaBoostR2 without transfer learning 524 | # ============================================================================== 525 | regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), 526 | n_estimators=n_estimators) 527 | # ============================================================================== 528 | regr_2.fit(x_target_train, y_target_train) 529 | y_pred2 = regr_2.predict(x_target_test) 530 | 531 | 532 | 533 | # 4.5 Calculate mse 534 | mse_twostageboost = mean_squared_error(y_target_test, y_pred1) 535 | mse_adaboost = mean_squared_error(y_target_test, y_pred2) 536 | mse_twostageboost3 = mean_squared_error(y_target_test, y_pred3) 537 | print("MSE of regular AdaboostR2:", mse_adaboost) 538 | print("MSE of TwoStageTrAdaboostR2:", mse_twostageboost) 539 | print("MSE of TwoStageTrAdaboostR2 without margin:", mse_twostageboost3) 540 | # ============================================================================== 541 | print("r2 of regular AdaboostR2:", r2_score(y_target_test, y_pred2)) 542 | print("r2 of TwoStageTrAdaboostR2:", r2_score(y_target_test, y_pred1)) 543 | print("r2 of TwoStageTrAdaboostR2 without margin:", r2_score(y_target_test, y_pred3)) 544 | 545 | 546 | 547 | # 4.4 Plot the results 548 | # plt.figure() 549 | # plt.scatter(x_target_train, y_target_train, c="k", label="target_train") 550 | # plt.plot(x_target_test, y_target_test, c="b", 
label="target_test", linewidth=0.5) 551 | # plt.plot(x_target_test, y_pred1, c="r", label="VFKMM-TrAdaBoost", linewidth=2) 552 | # plt.plot(x_target_test, y_pred2, c="y", label="AdaBoostRegressor", linewidth=2) 553 | # plt.plot(x_target_test, y_pred3, c="g", label="VFKMM-TrAdaBoost without margin", linewidth=2) 554 | # plt.xlabel("x") 555 | # plt.ylabel("y") 556 | # plt.legend(loc="lower left") 557 | # plt.title("mutisource VFKMM-TrAdaBoost Regressor") 558 | # plt.legend() 559 | # plt.show() 560 | 561 | --------------------------------------------------------------------------------