├── README.md ├── kmm.py ├── TrAdaboost2.py ├── TrAdaboostreg.py ├── kmmClassification.py ├── TrAdaboostkmm.py ├── SPY.py ├── Mutisource.py ├── TrReg.py ├── test.py └── TrAdaboost.R2.py /README.md: -------------------------------------------------------------------------------- 1 | # TransferLearning 2 | instance based Transfer learning, TrAdaboost, mutisource-trAdaBoost regresion 3 | -------------------------------------------------------------------------------- /kmm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import math 5 | import matplotlib.pyplot as plt 6 | from cvxopt import matrix, solvers 7 | def kernel_mean_matching(X, Z, kern='lin', B=1.0, eps=None): 8 | nx = X.shape[0] 9 | nz = Z.shape[0] 10 | if eps == None: 11 | eps = B / math.sqrt(nz) 12 | if kern == 'lin': 13 | K = np.dot(Z, Z.T) 14 | kappa = np.sum(np.dot(Z, X.T) * float(nz) / float(nx), axis=1) 15 | elif kern == 'rbf': 16 | K = compute_rbf(Z, Z) 17 | kappa = np.sum(compute_rbf(Z, X), axis=1) * float(nz) / float(nx) 18 | else: 19 | raise ValueError('unknown kernel') 20 | 21 | K = matrix(K) 22 | kappa = matrix(kappa) 23 | G = matrix(np.r_[np.ones((1, nz)), -np.ones((1, nz)), np.eye(nz), -np.eye(nz)]) 24 | h = matrix(np.r_[nz * (1 + eps), nz * (eps - 1), B * np.ones((nz,)), np.zeros((nz,))]) 25 | 26 | sol = solvers.qp(K, -kappa, G, h) 27 | coef = np.array(sol['x']) 28 | return coef 29 | 30 | 31 | def compute_rbf(X, Z, sigma=1.0): 32 | K = np.zeros((X.shape[0], Z.shape[0]), dtype=float) 33 | for i, vx in enumerate(X): 34 | K[i, :] = np.exp(-np.sum((vx - Z) ** 2, axis=1) / (2.0 * sigma)) 35 | return K 36 | x = 11*np.random.random(200)- 6.0 37 | y = x**2 + 10*np.random.random(200) - 5 38 | Z = np.c_[x, y] 39 | 40 | x = 2*np.random.random(10) - 6.0 41 | y = x**2 + 10*np.random.random(10) - 5 42 | X = np.c_[x, y] 43 | 44 | 45 | 46 | from sklearn.model_selection import train_test_split 47 | from sklearn.linear_model import LogisticRegression 48 | from sklearn import metrics 49 | # 样本重标记 50 | lable_spy_a = np.zeros([200, 1]) 51 | lable_spy_s = np.ones([10, 1]) 52 | 53 | 54 | trans_data = np.concatenate((X, Z), axis=0) 55 | trans_label = np.concatenate((lable_spy_s,lable_spy_a), axis=0) 56 | 57 | X_train, X_test, y_train, y_test = train_test_split(trans_data, trans_label, test_size=0.33, random_state=42) 58 | clf = LogisticRegression(penalty='l1',class_weight='balanced') 59 | # gnb = BernoulliNB() 60 | clf.fit(X_train, y_train) 61 | print("LR的预测精度", metrics.confusion_matrix(y_test, clf.predict(X_test))) 62 | print("LR的预测精度", metrics.accuracy_score(y_test, clf.predict(X_test))) 63 | 64 | 65 | 66 | 67 | 68 | coef = clf.predict_proba(Z)[:, -1].tolist() 69 | 70 | 71 | # coef = kernel_mean_matching(X, Z, kern='rbf', B=10) 72 | # print(coef) 73 | # print(coef.shape) 74 | # print(Z.shape) 75 | # 76 | # plt.close() 77 | # plt.figure() 78 | # plt.scatter(Z[:,0], Z[:,1], color='black', marker='x') 79 | # plt.scatter(X[:,0], X[:,1], color='red') 80 | # plt.scatter(Z[:,0], Z[:,1], color='green', s=coef*10, alpha=0.5) 81 | # plt.show() 82 | # np.sum(coef > 1e-2) 83 | 84 | # 85 | # 86 | # print(coef) 87 | # print(Z.shape) 88 | # 89 | # plt.close() 90 | # plt.figure() 91 | # 92 | # w=clf.coef_ 93 | # p=clf.intercept_ 94 | # print(w) 95 | # print(p) 96 | # x = np.mat(np.arange(min(Z[:,0]),max(Z[:,0]), 0.1)) 97 | # y = (-p[0]- w[0,0] * x) / w[0,1] 98 | # 99 | # coef = np.asarray(coef) 100 | # plt.plot(x.transpose(), y.transpose()) 101 | # # 
plt.scatter(Z[:,0], Z[:,1], color='black', marker='x') 102 | # plt.scatter(X[:,0], X[:,1], color='red', marker='x') 103 | # plt.scatter(Z[:,0], Z[:,1], color='green',marker='o',s=coef*80, alpha=0.5) 104 | # plt.show() 105 | -------------------------------------------------------------------------------- /TrAdaboost2.py: -------------------------------------------------------------------------------- 1 | import sklearn.svm 2 | from sklearn.datasets import fetch_20newsgroups 3 | from dataQuality.kmm import * 4 | # ala=np.concatenate((trans_A, label_A.reshape(row_A,1)[:,-1:]), axis=1) 5 | # s=np.concatenate((trans_S, label_S.reshape(row_S,1)[:,-1:]), axis=1) 6 | # 初始化权重 7 | # coef = kernel_mean_matching(s,ala, 8 | # kern='rbf', B=10) 9 | # code by chenchiwei 10 | # -*- coding: UTF-8 -*- 11 | import numpy as np 12 | from sklearn import tree 13 | from scipy import sparse 14 | from sklearn import metrics 15 | from sklearn import svm 16 | # H 测试样本分类结果 17 | # TrainS 原训练样本 np数组 18 | # TrainA 辅助训练样本 19 | # LabelS 原训练样本标签 20 | # LabelA 辅助训练样本标签 21 | # Test 测试样本 22 | # N 迭代次数 23 | from KMM import kmmClassification 24 | def tradaboost(trans_S, trans_A, label_S, label_A, test,test_label, N): 25 | trans_data = sparse.vstack((trans_A, trans_S)) 26 | trans_label = np.concatenate((label_A, label_S), axis=0) 27 | 28 | row_A = trans_A.shape[0] 29 | row_S = trans_S.shape[0] 30 | row_T = test.shape[0] 31 | 32 | # print('目标源的大小',row_S,'辅助源的大小',row_A,'测试集的大小',row_T) 33 | test_data = sparse.vstack((trans_data, test)) 34 | 35 | 36 | # coef = kmmClassification.getBeta(trans_A,test.toarray(),49098) 37 | # weights_A = coef 38 | # weights_A = np.asarray(weights_A).reshape(row_A,1) 39 | # total=sum(weights_A[:,0]) 40 | # for j in range(row_A): 41 | # weights_A[j,0] = weights_A[j,0]/total 42 | # weights_S = np.ones([row_S, 1]) * np.mean(weights_A) 43 | weights_S = np.ones([row_S, 1])/row_S 44 | weights_A = np.ones([row_A, 1])/row_A 45 | # weights_S = np.ones([row_S, 1]) 46 | # weights_A = np.ones([row_A, 1]) 47 | weights = np.concatenate((weights_A, weights_S), axis=0) 48 | 49 | bata = 1 / (1 + np.sqrt(2.0 * np.log(row_A/ N))) 50 | 51 | #bata = 1/(1+np.sqrt(2.0*np.log(row_A)/N)); 52 | 53 | # 存储每次迭代的标签和bata值? 
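    # Background on the update rule used below: in Dai et al.'s TrAdaBoost the fixed
    # auxiliary-domain factor is beta = 1 / (1 + sqrt(2 * ln(n_A) / N)), i.e.
    # np.log(row_A) / N as in the commented-out line above, whereas the active line
    # uses np.log(row_A / N). Each round t then sets beta_t = e_t / (1 - e_t) from the
    # weighted error on the target (S) samples, multiplies the target weights by
    # beta_t ** (-|h(x) - y|) and the auxiliary weights by beta ** (|h(x) - y|), so
    # hard target samples gain weight while persistently misclassified auxiliary
    # samples are gradually discounted.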
54 | bata_T = np.zeros([1, N]) 55 | result_label = np.ones([row_A + row_S + row_T, N]) 56 | 57 | predict = np.zeros([row_T]) 58 | 59 | # trans_data = np.asarray(trans_data, order='C') 60 | # trans_label = np.asarray(trans_label, order='C') 61 | # test_data = np.asarray(test_data, order='C') 62 | 63 | # print(trans_data.shape) 64 | # print(test_data.shape) 65 | accuracy_scorelist=[] 66 | f1_scorelist=[] 67 | recall_scorelist=[] 68 | for i in range(N): 69 | P = calculate_P(weights, trans_label) 70 | 71 | result_label[:, i] = train_classify(trans_data, trans_label, 72 | test_data, P) 73 | 74 | error_rate = 0.0 75 | for j in range(row_A, row_A + row_S): 76 | error_rate += (weights[j] * abs(result_label[j, i] - trans_label[j])) 77 | error_rate = error_rate / sum(weights[row_A:]) 78 | 79 | 80 | #error_rate = calculate_error_rate(label_S, result_label[row_A:row_A + row_S, i], 81 | # weights[row_A:row_A + row_S, :]) 82 | #print ('Error rate:', error_rate) 83 | # if error_rate != 1: 84 | # bata_T[0, i] = error_rate / (1.0 - error_rate) 85 | # if error_rate >= 0.5 and error_rate != 1: 86 | # bata_T[0, i] = 0.45 / (0.51) 87 | # if error_rate == 1: 88 | # bata_T[0, i] = 0.4 89 | 90 | if error_rate >= 0.5: 91 | #error_rate = 0.5 92 | error_rate = 0.499; 93 | if error_rate == 0: 94 | #error_rate = 0.000001 95 | #error_rate=0.0001 96 | error_rate = 0.001 97 | 98 | bata_T[0, i] = error_rate / (1 - error_rate) 99 | # Ct = 2 * (1 - error_rate); 100 | # 调整源域样本权重 101 | for j in range(row_S): 102 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], 103 | (-np.abs(result_label[row_A + j, i] - trans_label[row_A+j]))) 104 | 105 | # 调整辅域样本权重 106 | for j in range(row_A): 107 | weights[j] = weights[j] * np.power(bata, np.abs(result_label[j, i] - trans_label[j])) 108 | 109 | 110 | ##每次迭代完成计算下在测试集合上的误差 111 | # predic_temp = np.zeros([row_T]) 112 | # iteration = i; 113 | # for i in range(row_T): 114 | # left = np.sum( 115 | # result_label[row_A + row_S + i, int(np.ceil(iteration / 2)):iteration] * np.log(1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 116 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 117 | # if left >= right: 118 | # predic_temp[i] = 1 119 | # else: 120 | # predic_temp[i] = 0 121 | # accuracy_scorelist.append(metrics.accuracy_score(test_label, predic_temp)) 122 | # recall_scorelist.append(metrics.recall_score(test_label, predic_temp)) 123 | # f1_scorelist.append(metrics.f1_score(test_label, predic_temp)) 124 | # print bata_T 125 | for i in range(row_T): 126 | # 跳过训练数据的标签 127 | # left = np.sum( 128 | # result_label[row_A + row_S + i, int(np.ceil(N / 2)):N] * np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 129 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 130 | left = np.sum( 131 | result_label[row_A + row_S + i, 0:N] * np.log(1 / bata_T[0, 0:N])) 132 | right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:N])) 133 | if left >= right: 134 | predict[i] = 1 135 | else: 136 | predict[i] = 0 137 | # print left, right, predict[i] 138 | # predict[i] = left - right; 139 | return predict, accuracy_scorelist, recall_scorelist, f1_scorelist 140 | 141 | 142 | def calculate_P(weights, label): 143 | total = np.sum(weights) 144 | return np.asarray(weights)/total 145 | 146 | from sklearn.linear_model import LogisticRegression 147 | 148 | def train_classify(trans_data, trans_label, test_data, P): 149 | clf = LogisticRegression() 150 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 151 | return clf.predict(test_data) 152 | 153 | 154 | # 
def calculate_error_rate(label_R, label_H, weight): 155 | # total = np.sum(weight) 156 | # #return np.sum((weight[:, 0] / total)* np.abs(label_R - label_H)) 157 | # return return_correct_rate(label_R,label_H) 158 | -------------------------------------------------------------------------------- /TrAdaboostreg.py: -------------------------------------------------------------------------------- 1 | # code by chenchiwei 2 | # -*- coding: UTF-8 -*- 3 | import numpy as np 4 | from sklearn import tree 5 | from sklearn import svm 6 | import math 7 | from sklearn.metrics import r2_score 8 | from sklearn.tree import DecisionTreeRegressor 9 | 10 | def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 11 | out = np.cumsum(arr, axis=axis, dtype=np.float64) 12 | return out 13 | 14 | 15 | # H 测试样本分类结果 16 | # TrainS 原训练样本 np数组 17 | # TrainA 辅助训练样本 18 | # LabelS 原训练样本标签 19 | # LabelA 辅助训练样本标签 20 | # Test 测试样本 21 | # N 迭代次数 22 | def tradaboost(trans_S, trans_A, label_S, label_A, test, y_target_test,N,islog): 23 | trans_data = np.concatenate((trans_A, trans_S), axis=0) 24 | trans_label = np.concatenate((label_A, label_S), axis=0) 25 | 26 | row_A = trans_A.shape[0] 27 | row_S = trans_S.shape[0] 28 | row_T = test.shape[0] 29 | 30 | test_data = np.concatenate((trans_data, test), axis=0) 31 | 32 | # 初始化权重 33 | weights_A = np.ones([row_A, 1])/row_A 34 | weights_S = np.ones([row_S, 1])/row_S 35 | weights = np.concatenate((weights_A, weights_S), axis=0) 36 | 37 | bata = 1 / (1 + np.sqrt(2 * np.log(row_A/N))) 38 | 39 | # 存储每次迭代的标签和bata值 40 | bata_T = np.zeros([1, N]) 41 | result_label = np.ones([row_A + row_S + row_T, N]) 42 | 43 | predict = np.zeros([row_T]) 44 | 45 | #print ('params initial finished.') 46 | 47 | 48 | for i in range(N): 49 | #将权重向量归一化 50 | P = calculate_P(weights) 51 | 52 | 53 | result_label[:, i] = train_classify(trans_data, trans_label, 54 | test_data, weights,test,y_target_test,islog) 55 | 56 | temp0 = np.abs(result_label[:row_A + row_S, i] - trans_label) 57 | error_max0 = temp0.max() 58 | temp = np.abs(result_label[row_A:row_A + row_S, i] - label_S) 59 | error_max = temp.max() 60 | if error_max0==0.0 or error_max==0.0: 61 | N=i; 62 | break 63 | temp2 = np.abs(result_label[:row_A, i] - label_A) 64 | error_max2 = temp2.max() 65 | error_rate = 0.0 66 | for j in range(row_A, row_A + row_S): 67 | error_rate += (weights[j] * ((abs(result_label[j, i] - trans_label[j])/error_max0))) 68 | error_rate = error_rate / sum(weights[row_A:]) 69 | if error_rate >= 0.5: 70 | error_rate = 0.499; 71 | if error_rate == 0: 72 | error_rate=0.001 73 | bata_T[0, i] = error_rate / (1 - error_rate) 74 | 75 | for j in range(row_S): 76 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], -( 77 | np.abs(result_label[row_A + j, i] - label_S[j]) / error_max0)) 78 | # 调整辅域样本权重 79 | for j in range(row_A): 80 | if islog: 81 | if (abs(result_label[j, i] - label_A[j]) >0):#0.02872 82 | weights[j] = weights[j] * np.power(bata, np.abs((result_label[j, i] - label_A[j])/error_max0)) 83 | else: 84 | weights[j] = weights[j] * np.power(bata, np.abs((result_label[j, i] - label_A[j]) / error_max0)) 85 | # bata_T[0,:]=bata_T[0,:]/np.sum(bata_T[0,:]) 86 | 87 | # 88 | predictions=result_label[row_A + row_S:,int(np.ceil(N / 2)):N] 89 | # Sort the predictions 90 | sorted_idx = np.argsort(predictions, axis=1) 91 | # Find index of median prediction for each sample 92 | bata_T = np.log(1/bata_T[0, int(np.ceil(N / 2)):N]) 93 | #bata_T = bata_T[0, int(np.ceil(N / 2)):N] 94 | bata_T[:] = bata_T[:] / np.sum(bata_T[:]) 95 | 
weight_cdf = stable_cumsum(bata_T[sorted_idx], axis=1) 96 | median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 97 | median_idx = median_or_above.argmax(axis=1) 98 | median_estimators = sorted_idx[np.arange(test.shape[0]), median_idx] 99 | # Return median predictions 100 | return predictions[np.arange(test.shape[0]), median_estimators] 101 | # 102 | # for i in range(row_T): 103 | # # 跳过训练数据的标签 104 | # # predict[i]=np.median( 105 | # # result_label[row_A + row_S + i, :] * np.log(1 / bata_T[0, :])) 106 | # # predict[i] = np.sum( 107 | # # result_label[row_A + row_S + i, :] * (1-bata_T[0, :])) 108 | # 109 | # predict[i] = weighted_median(result_label[row_A + row_S + i,int(np.ceil(N / 2)):N], 110 | # np.log(1 / bata_T[0,int(np.ceil(N / 2)):N])) 111 | # return predict 112 | 113 | def calculate_P(weights): 114 | total = np.sum(weights) 115 | return weights/total 116 | 117 | 118 | def train_classify(trans_data, trans_label, test_data, P,test,y_target_test,islog,): 119 | # if islog: 120 | # clf = svm.SVR(C=100) 121 | # else: 122 | clf = DecisionTreeRegressor(max_depth=3) 123 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 124 | return clf.predict(test_data) 125 | 126 | def weighted_median(values, weights): 127 | ''' compute the weighted median of values list. The 128 | weighted median is computed as follows: 129 | 1- sort both lists (values and weights) based on values. 130 | 2- select the 0.5 point from the weights and return the corresponding values as results 131 | e.g. values = [1, 3, 0] and weights=[0.1, 0.3, 0.6] assuming weights are probabilities. 132 | sorted values = [0, 1, 3] and corresponding sorted weights = [0.6, 0.1, 0.3] the 0.5 point on 133 | weight corresponds to the first item which is 0. so the weighted median is 0.''' 134 | 135 | #convert the weights into probabilities 136 | sum_weights = sum(weights) 137 | weights = np.array([(w*1.0)/sum_weights for w in weights]) 138 | #sort values and weights based on values 139 | values = np.array(values) 140 | sorted_indices = np.argsort(values) 141 | values_sorted = values[sorted_indices] 142 | weights_sorted = weights[sorted_indices] 143 | #select the median point 144 | it = np.nditer(weights_sorted, flags=['f_index']) 145 | accumulative_probability = 0 146 | median_index = -1 147 | while not it.finished: 148 | accumulative_probability += it[0] 149 | if accumulative_probability > 0.5: 150 | median_index = it.index 151 | return values_sorted[median_index] 152 | elif accumulative_probability == 0.5: 153 | median_index = it.index 154 | it.iternext() 155 | next_median_index = it.index 156 | return np.mean(values_sorted[[median_index, next_median_index]]) 157 | it.iternext() 158 | 159 | return values_sorted[median_index] 160 | 161 | from sklearn.ensemble import AdaBoostRegressor -------------------------------------------------------------------------------- /kmmClassification.py: -------------------------------------------------------------------------------- 1 | import math, numpy, sklearn.metrics.pairwise as sk 2 | from cvxopt import matrix, solvers 3 | import random, sys 4 | from sklearn import svm 5 | 6 | FixedBetaValue = 1.0 7 | 8 | """ 9 | Compute instance (importance) weights using Kernel Mean Matching. 10 | Returns a list of instance weights for training data. 
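In terms of the code below, kmm() solves the quadratic program
    minimize over beta:   (1/2) * beta' K beta + kappa' beta
where K is the RBF kernel over the training points and
kappa = -(n_tr / n_te) * K(train, test) * 1, i.e. it matches the beta-weighted
training mean to the test mean in kernel feature space, subject to
0 <= beta_i <= 1000 and n_tr * (1 - eps) <= sum(beta) <= n_tr * (1 + eps),
with eps = (sqrt(n_tr) - 1) / sqrt(n_tr).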
11 | """ 12 | def kmm(Xtrain, Xtest, sigma): 13 | n_tr = len(Xtrain) 14 | n_te = len(Xtest) 15 | 16 | # calculate Kernel 17 | print('Computing kernel for training data ...') 18 | # 0.001取得最好的结果0.01 19 | K_ns = sk.rbf_kernel(Xtrain, Xtrain) 20 | # make it symmetric 21 | K = 0.9 * (K_ns + K_ns.transpose()) 22 | 23 | # calculate kappa 24 | print('Computing kernel for kappa ...') 25 | kappa_r = sk.rbf_kernel(Xtrain, Xtest) 26 | ones = numpy.ones(shape=(n_te, 1)) 27 | kappa = numpy.dot(kappa_r, ones) 28 | kappa = -(float(n_tr) / float(n_te)) * kappa 29 | 30 | # calculate eps 31 | eps = (math.sqrt(n_tr) - 1) / math.sqrt(n_tr) 32 | 33 | # constraints 34 | A0 = numpy.ones(shape=(1, n_tr)) 35 | A1 = -numpy.ones(shape=(1, n_tr)) 36 | A = numpy.vstack([A0, A1, -numpy.eye(n_tr), numpy.eye(n_tr)]) 37 | b = numpy.array([[n_tr * (eps + 1), n_tr * (eps - 1)]]) 38 | b = numpy.vstack([b.T, -numpy.zeros(shape=(n_tr, 1)), numpy.ones(shape=(n_tr, 1)) * 1000]) 39 | 40 | print('Solving quadratic program for beta ...') 41 | P = matrix(K, tc='d') 42 | q = matrix(kappa, tc='d') 43 | G = matrix(A, tc='d') 44 | h = matrix(b, tc='d') 45 | beta = solvers.qp(P, q, G, h) 46 | return [i for i in beta['x']] 47 | 48 | 49 | """ 50 | Kernel width is the median of distances between instances of sparse data 51 | """ 52 | def computeKernelWidth(data): 53 | dist = [] 54 | for i in range(len(data)): 55 | for j in range(i + 1, len(data)): 56 | # s = self.__computeDistanceSq(data[i], data[j]) 57 | # dist.append(math.sqrt(s)) 58 | dist.append(numpy.sqrt(numpy.sum((numpy.array(data[i]) - numpy.array(data[j])) ** 2))) 59 | return numpy.median(numpy.array(dist)) 60 | 61 | 62 | def read_data_set(filename): 63 | with open(filename) as f: 64 | data = f.readlines() 65 | 66 | maxvar = 0 67 | classList = [] 68 | data_set = [] 69 | for i in data: 70 | d = {} 71 | if filename.endswith('.arff'): 72 | if '@' not in i: 73 | features = i.strip().split(',') 74 | class_name = features.pop() 75 | if class_name not in classList: 76 | classList.append(class_name) 77 | d[-1] = float(classList.index(class_name)) 78 | for j in range(len(features)): 79 | d[j] = float(features[j]) 80 | maxvar = len(features) 81 | else: 82 | continue 83 | data_set.append(d) 84 | return (data_set, classList, maxvar) 85 | 86 | 87 | def getFixedBeta(value, count): 88 | beta = [] 89 | for c in range(count): 90 | beta.append(value) 91 | return beta 92 | 93 | 94 | def getBeta(trainX, testX, maxvar): 95 | beta = [] 96 | # gammab = 0.001 97 | gammab = computeKernelWidth(trainX) 98 | print("Gammab:", gammab) 99 | 100 | beta = kmm(trainX, testX, gammab) 101 | print("{0} Beta: {1}".format(len(beta), beta)) 102 | 103 | return beta 104 | 105 | 106 | def checkAccuracy(result, testY): 107 | p = 0 108 | for i, v in enumerate(result): 109 | if v == testY[i]: 110 | p += 1 111 | acc = p * 100 / len(result) 112 | # print(result) 113 | print("ACC:{0}%, Total:{1}/{2} with positive {3}".format(acc, len(result), len(testY), p)) 114 | return acc 115 | 116 | 117 | def separateData(data, maxvar): 118 | dataY = [] 119 | dataX = [] 120 | 121 | for d in data: 122 | dataY.append(d[-1]) 123 | 124 | covar = [] 125 | for c in range(maxvar): 126 | if c in d: 127 | covar.append(d[c]) 128 | else: 129 | covar.append(0.0) 130 | dataX.append(covar) 131 | return (dataX, dataY) 132 | 133 | 134 | def buildModel(trainX, trainY, beta, testX, testY, svmParam, maxvar,testdata): 135 | # Tune parameters here... 
136 | #csf = svm.SVC(C=float(svmParam['c']), kernel='rbf', gamma=float(svmParam['g']), probability=True) 137 | 138 | train = separateData(testdata, maxvar) 139 | # H 测试样本分类结果 140 | # TrainS 原训练样本 np数组 141 | # TrainA 辅助训练样本 142 | # LabelS 原训练样本标签 143 | # LabelA 辅助训练样本标签 144 | # Test 测试样本 145 | # N 迭代次数 146 | beta = getBeta(train[0], trainX, maxvar) 147 | pred = tradaboost(trainX, train[0], trainY, train[1], testX, 4,beta) 148 | #csf.fit(trainX, trainY, sample_weight=beta) 149 | 150 | beta_fixed = getFixedBeta(FixedBetaValue, len(trainX)) 151 | csf2 = svm.SVC(C=float(svmParam['c']), kernel='rbf', gamma=float(svmParam['g']), probability=False) 152 | csf2.fit(trainX, trainY, sample_weight=beta_fixed) 153 | 154 | # predict and gather results 155 | #result = csf.predict(testX) 156 | acc = checkAccuracy(pred, testY) 157 | 158 | result2 = csf2.predict(testX) 159 | acc2 = checkAccuracy(result2, testY) 160 | 161 | return (acc, acc2) 162 | 163 | from TR.TrAdaboost2 import * 164 | def train(traindata, testdata, maxvar): 165 | svmParam = {'c': 131072, 'g': 0.0001} 166 | train = separateData(traindata[:250], maxvar) 167 | trainX = train[0] 168 | trainY = train[1] 169 | 170 | print("trainX"+str(len(trainX))) 171 | 172 | test = separateData(traindata[250:], maxvar) 173 | testX = test[0] 174 | testY = test[1] 175 | 176 | print("testX"+str(len(testX))) 177 | 178 | print(type(trainX)) 179 | beta = getBeta(trainX, testX, maxvar) 180 | 181 | # Model training 182 | result = buildModel(trainX, trainY, beta, testX, testY, svmParam, maxvar,testdata) 183 | return result 184 | 185 | #MAIN METHOD 186 | def main(): 187 | #reading train data file 188 | train_data_set,train_classList,train_maxVar=read_data_set("./apps_data_k100/datafile-qrsnc2a7k20.c0.d4.C23.N2000.t16.T4.D1.E1.F1.G1.H1.I1.B16.J8.K300.L0.05.M100.A1.V0.P0.G0.l0.0.b600-train.arff") 189 | # reading test data file 190 | test_data_set,test_classList,test_maxVar=read_data_set("./apps_data_k100/datafile-qrsnc2a7k20.c0.d4.C23.N2000.t16.T4.D1.E1.F1.G1.H1.I1.B16.J8.K300.L0.05.M100.A1.V0.P0.G0.l0.0.b600-test.arff") 191 | if(train_maxVar>=test_maxVar): 192 | mxVar=train_maxVar 193 | else: 194 | mxVar=test_maxVar 195 | #Gathering Accuracies 196 | res1,res2=train(train_data_set,test_data_set,mxVar) 197 | print("Accuracy without KMM:{0}%".format(res1)) 198 | print("Accuracy with KMM:{0}%".format(res2)) 199 | 200 | if __name__ == '__main__': 201 | main() -------------------------------------------------------------------------------- /TrAdaboostkmm.py: -------------------------------------------------------------------------------- 1 | import sklearn.svm 2 | from sklearn.datasets import fetch_20newsgroups 3 | from dataQuality.kmm import * 4 | # ala=np.concatenate((trans_A, label_A.reshape(row_A,1)[:,-1:]), axis=1) 5 | # s=np.concatenate((trans_S, label_S.reshape(row_S,1)[:,-1:]), axis=1) 6 | # 初始化权重 7 | # coef = kernel_mean_matching(s,ala, 8 | # kern='rbf', B=10) 9 | # code by chenchiwei 10 | # -*- coding: UTF-8 -*- 11 | import numpy as np 12 | from sklearn import tree 13 | from scipy import sparse 14 | from sklearn import metrics 15 | 16 | from sklearn import svm 17 | # H 测试样本分类结果 18 | # TrainS 原训练样本 np数组 19 | # TrainA 辅助训练样本 20 | # LabelS 原训练样本标签 21 | # LabelA 辅助训练样本标签 22 | # Test 测试样本 23 | # N 迭代次数 24 | from KMM import kmmClassification 25 | def tradaboost(trans_S, trans_A, label_S, label_A, test,test_label, N,eliminate=False): 26 | 27 | 28 | 29 | 30 | coef = kmmClassification.getBeta(trans_A,test.toarray(),49098) 31 | #排除一些低权重的样本 32 | if eliminate: 33 | 
percenttile=np.percentile(coef, 7) 34 | indexlist=[] 35 | for index,x in enumerate(coef): 36 | if(x= 0.5 and error_rate != 1: 102 | # bata_T[0, i] = 0.45 / (0.51) 103 | # if error_rate == 1: 104 | # bata_T[0, i] = 0.4 105 | if error_rate >= 0.5: 106 | # error_rate = 0.5 107 | error_rate = 0.499; 108 | if error_rate == 0: 109 | # error_rate = 0.000001 110 | # error_rate=0.0001 111 | error_rate = 0.001 112 | 113 | bata_T[0, i] = error_rate / (1 - error_rate) 114 | 115 | # 调整源域样本权重 116 | for j in range(row_S): 117 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], 118 | (-np.abs(result_label[row_A + j, i] - trans_label[row_A+j]))) 119 | 120 | # 调整辅域样本权重 121 | for j in range(row_A): 122 | weights[j] = weights[j] * np.power(bata, np.abs(result_label[j, i] - trans_label[j])) 123 | 124 | 125 | 126 | ##每次迭代完成计算下在测试集合上的误差 127 | # predic_temp=np.zeros([row_T]) 128 | # iteration=i+1; 129 | # for j in range(row_T): 130 | # left = np.sum( 131 | # result_label[row_A + row_S + j, 0:iteration] * np.log(1 / bata_T[0, 0:iteration])) 132 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:iteration])) 133 | # # left = np.sum( 134 | # # result_label[row_A + row_S + i, int(np.ceil(iteration / 2)):iteration] * np.log( 135 | # # 1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 136 | # # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(iteration / 2)):iteration])) 137 | # if left >= right: 138 | # predic_temp[j] = 1 139 | # else: 140 | # predic_temp[j] = 0 141 | # accuracy_scorelist.append(metrics.accuracy_score(test_label, predic_temp)) 142 | # recall_scorelist.append(metrics.recall_score(test_label, predic_temp)) 143 | # f1_scorelist.append(metrics.f1_score(test_label, predic_temp)) 144 | 145 | # print bata_T 146 | for i in range(row_T): 147 | # 跳过训练数据的标签 148 | # left = np.sum( 149 | # result_label[row_A + row_S + i, int(np.ceil(N / 2)):N] * np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 150 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 151 | left = np.sum( 152 | result_label[row_A + row_S + i, 0:N] * np.log(1 / bata_T[0, 0:N])) 153 | right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:N])) 154 | if left >= right: 155 | predict[i] = 1 156 | else: 157 | predict[i] = 0 158 | # print left, right, predict[i] 159 | # predict[i]=left-right; 160 | 161 | 162 | print(accuracy_scorelist) 163 | return predict,accuracy_scorelist,recall_scorelist,f1_scorelist 164 | 165 | 166 | def calculate_P(weights, label): 167 | total = np.sum(weights) 168 | return np.asarray(weights)/total 169 | 170 | from sklearn.linear_model import LogisticRegression 171 | 172 | def train_classify(trans_data, trans_label, test_data, P): 173 | clf = LogisticRegression() 174 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 175 | return clf.predict(test_data) 176 | 177 | 178 | # def calculate_error_rate(label_R, label_H, weight): 179 | # total = np.sum(weight) 180 | # #return np.sum((weight[:, 0] / total)* np.abs(label_R - label_H)) 181 | # return return_correct_rate(label_R,label_H) 182 | -------------------------------------------------------------------------------- /SPY.py: -------------------------------------------------------------------------------- 1 | import sklearn.svm 2 | from sklearn.datasets import fetch_20newsgroups 3 | from dataQuality.kmm import * 4 | # ala=np.concatenate((trans_A, label_A.reshape(row_A,1)[:,-1:]), axis=1) 5 | # s=np.concatenate((trans_S, label_S.reshape(row_S,1)[:,-1:]), axis=1) 6 | # 初始化权重 7 | # coef = kernel_mean_matching(s,ala, 8 | # kern='rbf', B=10) 9 | # code by 
chenchiwei 10 | # -*- coding: UTF-8 -*- 11 | import numpy as np 12 | from sklearn import tree 13 | from scipy import sparse 14 | from sklearn import metrics 15 | from sklearn.model_selection import train_test_split 16 | from sklearn import svm 17 | # H 测试样本分类结果 18 | # TrainS 原训练样本 np数组 19 | # TrainA 辅助训练样本 20 | # LabelS 原训练样本标签 21 | # LabelA 辅助训练样本标签 22 | # Test 测试样本 23 | # N 迭代次数 24 | from KMM import kmmClassification 25 | from sklearn.naive_bayes import BernoulliNB 26 | def tradaboost(trans_S, trans_A, label_S, label_A, test,test_label, N,eliminate=True): 27 | 28 | 29 | row_A = trans_A.shape[0] 30 | row_S = trans_S.shape[0] 31 | row_T = test.shape[0] 32 | # 样本重标记 33 | lable_spy_a = np.zeros([row_A, 1]) 34 | lable_spy_s = np.ones([row_S, 1]) 35 | 36 | trans_data = sparse.vstack((trans_A, trans_S)) 37 | trans_label=np.concatenate((lable_spy_a, lable_spy_s), axis=0) 38 | 39 | X_train, X_test, y_train, y_test = train_test_split(trans_data, trans_label, test_size = 0.33, random_state = 42) 40 | clf = LogisticRegression(penalty='l1',class_weight='balanced') 41 | clf = LogisticRegression(class_weight='balanced') 42 | #gnb = BernoulliNB() 43 | clf.fit(X_train, y_train) 44 | print("LR的预测精度",metrics.confusion_matrix(y_test, clf.predict(X_test))) 45 | print("LR的预测精度", metrics.accuracy_score(y_test, clf.predict(X_test))) 46 | # clf.predict(X_test) 47 | # predict=clf.predict_proba(X_test)[:,-1] 48 | # np.sort(predict) 49 | #排除一些低权重的样本 50 | # 排除一些低权重的样本 51 | # print(clf.predict_proba(trans_A)[:,-1]) 52 | 53 | 54 | weights_A = clf.predict_proba(trans_A)[:,-1].tolist() 55 | 56 | if eliminate: 57 | percenttile=np.percentile(weights_A, 32)#32 58 | indexlist=[] 59 | for index,x in enumerate(weights_A): 60 | if(x= 0.5 and error_rate != 1: 122 | # bata_T[0, i] = 0.45 / (0.51) 123 | # if error_rate == 1: 124 | # bata_T[0, i] = 0.4 125 | if error_rate >= 0.5: 126 | # error_rate = 0.5 127 | error_rate = 0.499; 128 | if error_rate == 0: 129 | # error_rate = 0.000001 130 | # error_rate=0.0001 131 | error_rate = 0.001 132 | 133 | bata_T[0, i] = error_rate / (1 - error_rate) 134 | # 调整源域样本权重 135 | for j in range(row_S): 136 | weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i], 137 | (-np.abs(result_label[row_A + j, i] - trans_label[row_A+j]))) 138 | 139 | # 调整辅域样本权重 140 | for j in range(row_A): 141 | weights[j] = weights[j] * np.power(bata, np.abs(result_label[j, i] - trans_label[j])) 142 | 143 | 144 | 145 | ##每次迭代完成计算下在测试集合上的误差 146 | # predic_temp=np.zeros([row_T]) 147 | # iteration=i; 148 | # for i in range(row_T): 149 | # left = np.sum( 150 | # result_label[row_A + row_S + i, 0:iteration] * np.log(1 / bata_T[0, 0:iteration])) 151 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:iteration])) 152 | # if left >= right: 153 | # predic_temp[i] = 1 154 | # else: 155 | # predic_temp[i] = 0 156 | # accuracy_scorelist.append(metrics.accuracy_score(test_label, predic_temp)) 157 | # recall_scorelist.append(metrics.recall_score(test_label, predic_temp)) 158 | # f1_scorelist.append(metrics.f1_score(test_label, predic_temp)) 159 | 160 | # print bata_T 161 | for i in range(row_T): 162 | # 跳过训练数据的标签 163 | # left = np.sum( 164 | # result_label[row_A + row_S + i, int(np.ceil(N / 2)):N] * np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 165 | # right = 0.5 * np.sum(np.log(1 / bata_T[0, int(np.ceil(N / 2)):N])) 166 | left = np.sum( 167 | result_label[row_A + row_S + i, 0:N] * np.log(1 / bata_T[0, 0:N])) 168 | right = 0.5 * np.sum(np.log(1 / bata_T[0, 0:N])) 169 | if left >= right: 170 | predict[i] = 1 171 | else: 172 | 
predict[i] = 0 173 | # print left, right, predict[i] 174 | # predict[i]=left-right; 175 | 176 | 177 | print(accuracy_scorelist) 178 | return predict,accuracy_scorelist,recall_scorelist,f1_scorelist 179 | 180 | 181 | def calculate_P(weights, label): 182 | total = np.sum(weights) 183 | return np.asarray(weights)/total 184 | 185 | from sklearn.linear_model import LogisticRegression 186 | 187 | def train_classify(trans_data, trans_label, test_data, P): 188 | clf = LogisticRegression() 189 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 190 | return clf.predict(test_data) 191 | 192 | 193 | # def calculate_error_rate(label_R, label_H, weight): 194 | # total = np.sum(weight) 195 | # #return np.sum((weight[:, 0] / total)* np.abs(label_R - label_H)) 196 | # return return_correct_rate(label_R,label_H) 197 | -------------------------------------------------------------------------------- /Mutisource.py: -------------------------------------------------------------------------------- 1 | # code by chenchiwei 2 | # -*- coding: UTF-8 -*- 3 | import numpy as np 4 | from sklearn import tree 5 | from sklearn import svm 6 | import math 7 | from sklearn.tree import DecisionTreeRegressor 8 | 9 | def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 10 | out = np.cumsum(arr, axis=axis, dtype=np.float64) 11 | return out 12 | 13 | 14 | # TrainS 原训练样本 np数组 15 | # TrainA 辅助训练样本 16 | # LabelS 原训练样本标签 17 | # LabelA 辅助训练样本标签 18 | # Test 测试样本 19 | # N 迭代次数 20 | 21 | # source_sum 源领域的数目 22 | # soucenum 源领域所有样本的数目 23 | 24 | def Mutisource_tradaboost(trans_S, trans_A_list, label_S, label_A_list, test, N,source_sum,soucenum): 25 | 26 | 27 | #首先计算bata 28 | bata = 1 / (1 + np.sqrt(2 * np.log(soucenum) / N)) 29 | row_S = trans_S.shape[0] 30 | row_T = test.shape[0] 31 | weights_S= np.ones([row_S, 1])/row_S 32 | weights_A=[] 33 | train=[] 34 | train_lable=[] 35 | test_data=[] 36 | result_label=[] 37 | # 存储每次迭代的标签和bata值 38 | bata_T = np.zeros([1, N]) 39 | result_labelsum = np.zeros([row_S + row_T, N]) 40 | for i in range(source_sum): 41 | row_A = trans_A_list[i].shape[0] 42 | weights_A.append(np.ones([row_A, 1])/row_A) 43 | train.append(np.concatenate((trans_A_list[i], trans_S), axis=0)) 44 | train_lable.append(np.concatenate((label_A_list[i], label_S), axis=0)) 45 | test_data.append( np.concatenate((train[i], test), axis=0)) 46 | result_label.append(np.ones([row_A + row_S + row_T, N])) 47 | #生成初始的权重 48 | for i in range(N): 49 | #将权重向量归一化 50 | error_list=[] 51 | max_error_list=[] 52 | for j in range(source_sum): 53 | row_A = trans_A_list[j].shape[0] 54 | weights = np.concatenate((weights_A[j], weights_S), axis=0) 55 | P = calculate_P(weights) 56 | result_label[j][:, i] = train_classify(train[j], train_lable[j], 57 | test_data[j], P) 58 | temp = np.abs(result_label[j][row_A:row_A + row_S, i] - train_lable[j][row_A:]) 59 | #temp = np.abs(result_label[j][:row_A + row_S, i] - train_lable[j]) 60 | error_max = temp.max() 61 | max_error_list.append(error_max) 62 | error_rate = 0.0 63 | for m in range(row_A, row_A + row_S): 64 | error_rate += (weights_S[m-row_A] * ((abs(result_label[j][m, i] - train_lable[j][m]) / error_max))) 65 | error_rate = error_rate / sum(weights_S) 66 | error_list.append(error_rate) 67 | g=[] 68 | #g=error_list; 69 | for j in range(source_sum): 70 | g.append(math.exp(1-error_list[j])/math.exp(error_list[j])) 71 | g = [x/sum(g) for x in g] 72 | 73 | for j in range(source_sum): 74 | row_A = trans_A_list[j].shape[0] 75 | result_labelsum[:, i]=result_labelsum[:, i]+g[j]*result_label[j][row_A:row_A + 
row_S+row_T, i] 76 | 77 | temp = np.abs(result_labelsum[:row_S, i] - label_S) 78 | error_max = temp.max() 79 | error_rate = 0.0 80 | for m in range(row_S): 81 | error_rate += (weights_S[m] * ((abs(result_labelsum[m, i] - label_S[m]) / error_max))) 82 | error_rate = error_rate / sum(weights_S) 83 | 84 | 85 | #更新样本权重 86 | if error_rate >= 0.5: 87 | error_rate = 0.499; 88 | if error_rate == 0: 89 | error_rate = 0.001 90 | 91 | bata_T[0, i] = error_rate / (1 - error_rate) 92 | for j in range(row_S): 93 | weights_S[j] = weights_S[j] * np.power(bata_T[0, i], 1-((abs(result_labelsum[j, i] - label_S[j]) /error_max))) 94 | for j in range(source_sum): 95 | row_A = trans_A_list[j].shape[0] 96 | temp = np.abs(result_label[j][:row_A, i] - label_A_list[j]) 97 | error_max = temp.max() 98 | for m in range(row_A): 99 | if (abs(result_label[j][m, i] - label_A_list[j][m]) > 0): 100 | weights_A[j][m]=weights_A[j][m] * np.power(bata, abs(result_label[j][m, i] - label_A_list[j][m]) / error_max) 101 | 102 | # bata_T[0, i] = (1/2)*math.log((1 - error_rate)/error_rate) 103 | # for j in range(row_S): 104 | # weights_S[j] = weights_S[j] * np.exp(bata_T[0, i]*((abs(result_labelsum[j, i] - label_S[j]) / error_max))) 105 | # for j in range(source_sum): 106 | # row_A = trans_A_list[j].shape[0] 107 | # temp = np.abs(result_label[j][:row_A, i] - label_A_list[j]) 108 | # error_max = temp.max() 109 | # for m in range(row_A): 110 | # if (abs(result_label[j][m, i] - label_A_list[j][m]) > 0.04): 111 | # weights_A[j][m] = weights_A[j][m] * np.exp(-bata*abs( 112 | # result_label[j][m, i] - label_A_list[j][m]) / error_max) 113 | 114 | # 115 | # predictions=result_labelsum[row_S:,0:N] 116 | # # Sort the predictions 117 | # sorted_idx = np.argsort(predictions, axis=1) 118 | # # Find index of median prediction for each sample 119 | # #bata_T = 1/bata_T[0, 0:N] 120 | # bata_T = np.log(1/bata_T[0, :N]) 121 | # bata_T[:] = bata_T[:] / np.sum(bata_T[:]) 122 | # weight_cdf = stable_cumsum(bata_T[sorted_idx], axis=1) 123 | # median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 124 | # median_idx = median_or_above.argmax(axis=1) 125 | # median_estimators = sorted_idx[np.arange(test.shape[0]), median_idx] 126 | # # Return median predictions 127 | # return predictions[np.arange(test.shape[0]), median_estimators] 128 | 129 | 130 | predictions = result_labelsum[row_S:, int(np.ceil(N / 2)):N] 131 | # Sort the predictions 132 | sorted_idx = np.argsort(predictions, axis=1) 133 | # Find index of median prediction for each sample 134 | bata_T = np.log(1 / bata_T[0, int(np.ceil(N / 2)):N]) 135 | #bata_T = bata_T[0, int(np.ceil(N / 2)):N] 136 | bata_T[:] = bata_T[:] / np.sum(bata_T[:]) 137 | weight_cdf = stable_cumsum(bata_T[sorted_idx], axis=1) 138 | median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 139 | median_idx = median_or_above.argmax(axis=1) 140 | median_estimators = sorted_idx[np.arange(test.shape[0]), median_idx] 141 | # Return median predictions 142 | return predictions[np.arange(test.shape[0]), median_estimators] 143 | 144 | 145 | 146 | def calculate_P(weights): 147 | total = np.sum(weights) 148 | return weights/total 149 | 150 | from sklearn import neighbors 151 | def train_classify(trans_data, trans_label, test_data, P): 152 | clf = DecisionTreeRegressor(max_depth=3) 153 | #clf = neighbors.KNeighborsRegressor() 154 | clf.fit(trans_data, trans_label, sample_weight=P[:, 0]) 155 | return clf.predict(test_data) 156 | 157 | def weighted_median(values, weights): 158 | ''' compute the weighted median of 
values list. The 159 | weighted median is computed as follows: 160 | 1- sort both lists (values and weights) based on values. 161 | 2- select the 0.5 point from the weights and return the corresponding values as results 162 | e.g. values = [1, 3, 0] and weights=[0.1, 0.3, 0.6] assuming weights are probabilities. 163 | sorted values = [0, 1, 3] and corresponding sorted weights = [0.6, 0.1, 0.3] the 0.5 point on 164 | weight corresponds to the first item which is 0. so the weighted median is 0.''' 165 | 166 | #convert the weights into probabilities 167 | sum_weights = sum(weights) 168 | weights = np.array([(w*1.0)/sum_weights for w in weights]) 169 | #sort values and weights based on values 170 | values = np.array(values) 171 | sorted_indices = np.argsort(values) 172 | values_sorted = values[sorted_indices] 173 | weights_sorted = weights[sorted_indices] 174 | #select the median point 175 | it = np.nditer(weights_sorted, flags=['f_index']) 176 | accumulative_probability = 0 177 | median_index = -1 178 | while not it.finished: 179 | accumulative_probability += it[0] 180 | if accumulative_probability > 0.5: 181 | median_index = it.index 182 | return values_sorted[median_index] 183 | elif accumulative_probability == 0.5: 184 | median_index = it.index 185 | it.iternext() 186 | next_median_index = it.index 187 | return np.mean(values_sorted[[median_index, next_median_index]]) 188 | it.iternext() 189 | 190 | return values_sorted[median_index] 191 | 192 | from sklearn.ensemble import AdaBoostRegressor -------------------------------------------------------------------------------- /TrReg.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import copy 4 | from sklearn.tree import DecisionTreeRegressor 5 | import matplotlib.pyplot as plt 6 | from sklearn.ensemble import AdaBoostRegressor 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.model_selection import KFold 9 | from sklearn.metrics import r2_score 10 | from TR.TrAdaboostreg import * 11 | from TR.TTemp import * 12 | from TR.Mutisource import * 13 | ##============================================================================= 14 | 15 | # Example 1 16 | ##============================================================================= 17 | 18 | # 1. define the data generating function 19 | def response(x, d, random_state,rng_temp): 20 | """ 21 | x is the input variable 22 | d controls the simularity of different tasks 23 | """ 24 | # a1 = np.random.normal(1, 0.1 * d) 25 | # a2 = np.random.normal(1, 0.1 * d) 26 | # b1 = np.random.normal(1, 0.1 * d) 27 | # b2 = np.random.normal(1, 0.1 * d) 28 | # c1 = np.random.normal(1, 0.05 * d) 29 | # c2 = np.random.normal(1, 0.05 * d) 30 | a1 = rng_temp.normal(1, 0.1 * d) 31 | a2 = rng_temp.normal(1, 0.1 * d) 32 | b1 = rng_temp.normal(1, 0.1 * d) 33 | b2 = rng_temp.normal(1, 0.1 * d) 34 | c1 = rng_temp.normal(1, 0.05 * d) 35 | c2 = rng_temp.normal(1, 0.05 * d) 36 | y = a1 * np.sin(b1 * x + c1).ravel() + a2 * np.sin(b2 * 6 * x + c2).ravel() + random_state.normal(0, 0.1, 37 | x.shape[0]) 38 | return y 39 | 40 | 41 | # ============================================================================== 42 | 43 | # 2. 
decide the degree of similarity of multiple data sources using d 44 | 45 | d = 2 46 | # ============================================================================== 47 | rng = np.random.RandomState(11) 48 | rng_temp = np.random.RandomState(11) 49 | # 3.1 create source data and target data 50 | n_source1 = 50 51 | #x_source1 = np.linspace(0, 6, n_source1)[:, np.newaxis] 52 | 53 | x_source1=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 54 | y_source1 = response(x_source1, 0.5, rng,rng_temp) 55 | n_source2 = 100 56 | #x_source2 = np.linspace(0, 6, n_source2)[:, np.newaxis] 57 | x_source2=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 58 | y_source2 = response(x_source2, 2.0, rng,rng_temp) 59 | n_source3 = 100 60 | #x_source3 = np.linspace(0, 6, n_source3)[:, np.newaxis] 61 | x_source3=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 62 | y_source3 = response(x_source3, 6.5, rng,rng_temp) 63 | n_source4 = 100 64 | #x_source4 = np.linspace(0, 6, n_source4)[:, np.newaxis] 65 | x_source4=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 66 | y_source4 = response(x_source4, 6.0, rng,rng_temp) 67 | n_source5 = 100 68 | #x_source5 = np.linspace(0, 6, n_source5)[:, np.newaxis] 69 | x_source5=6*rng_temp.random_sample(n_source1)[:, np.newaxis] 70 | y_source5 = response(x_source5, 5.0, rng,rng_temp) 71 | 72 | # 3.2 create target data (n_target_train and n_target_test are the sample size of train and test datasets) 73 | d=0.05 74 | rng_temp2 = np.random.RandomState(43) 75 | # a1 = np.random.normal(1, 0.1 * d) 76 | # a2 = np.random.normal(1, 0.1 * d) 77 | # b1 = np.random.normal(1, 0.1 * d) 78 | # b2 = np.random.normal(1, 0.1 * d) 79 | # c1 = np.random.normal(1, 0.05 * d) 80 | # c2 = np.random.normal(1, 0.05 * d) 81 | a1 = rng_temp2.normal(1, 0.1 * d) 82 | a2 = rng_temp2.normal(1, 0.1 * d) 83 | b1 = rng_temp2.normal(1, 0.1 * d) 84 | b2 = rng_temp2.normal(1, 0.1 * d) 85 | c1 = rng_temp2.normal(1, 0.05 * d) 86 | c2 = rng_temp2.normal(1, 0.05 * d) 87 | 88 | # target_train 89 | # ============================================================================== 90 | 91 | n_target_train = 40 92 | 93 | # ============================================================================== 94 | #x_target_train = np.linspace(0, 6, n_target_train)[:, np.newaxis] 95 | 96 | x_target_train=6*rng_temp2.random_sample(n_target_train)[:, np.newaxis] 97 | 98 | 99 | y_target_train = a1 * np.sin(b1 * x_target_train + c1).ravel() + a2 * np.sin( 100 | b2 * 6 * x_target_train + c2).ravel() + rng.normal(0, 0.1, x_target_train.shape[0]) 101 | 102 | # target_test 103 | n_target_test = 600 104 | #x_target_test = np.linspace(0, 6, n_target_test)[:, np.newaxis] 105 | 106 | x_target_test=6*rng_temp2.random_sample(n_target_test)[:, np.newaxis] 107 | y_target_test = a1 * np.sin(b1 * x_target_test + c1).ravel() + a2 * np.sin( 108 | b2 * 6 * x_target_test + c2).ravel() + rng.normal(0, 0.1, x_target_test.shape[0]) 109 | 110 | 111 | X = np.concatenate((x_source1, x_source2, x_source3, x_source4, x_source5)) 112 | y = np.concatenate((y_source1, y_source2, y_source3, y_source4, y_source5)) 113 | 114 | 115 | # ============================================================================== 116 | from sklearn import neighbors 117 | 118 | clf = DecisionTreeRegressor(max_depth=3) 119 | #clf = neighbors.KNeighborsRegressor() 120 | 121 | clf.fit(x_target_train,y_target_train) 122 | predict1=clf.predict(x_target_test) 123 | mse_twostageboost = mean_squared_error(y_target_test, predict1) 124 | print("MSE of tree:", mse_twostageboost) 125 | 
print("r2 of tree:", r2_score(y_target_test, predict1)) 126 | # ============================================================================== 127 | 128 | 129 | 130 | xlist=[x_source1,x_source2,x_source3,x_source4,x_source5] 131 | ylist=[y_source1, y_source2, y_source3, y_source4, y_source5] 132 | reslisttempx2 = [] 133 | reslisttemp2 = []; 134 | for i in range(4, 50, 1): 135 | predict = Mutisource_tradaboost( 136 | x_target_train, xlist, y_target_train, ylist, x_target_test, i,5,4*100+50) 137 | mse_twostageboost = mean_squared_error(y_target_test, predict) 138 | reslisttemp2.append(r2_score(y_target_test, predict)) 139 | reslisttempx2.append(i) 140 | print("r2 of tradaboost:", reslisttemp2) 141 | 142 | 143 | 144 | print("MSE of muti:", mse_twostageboost) 145 | print("r2 of muti:", r2_score(y_target_test, predict)) 146 | reslisttempx1 = [] 147 | reslisttemp1 = []; 148 | for i in range(4, 50, 1): 149 | predict = tradaboost( 150 | x_target_train, X, y_target_train, y, x_target_test, y_target_test, i, True) 151 | mse_twostageboost = mean_squared_error(y_target_test, predict) 152 | reslisttemp1.append(r2_score(y_target_test, predict)) 153 | reslisttempx1.append(i) 154 | print("r2 of tradaboost:", reslisttemp1) 155 | # 156 | # 157 | # reslisttempx2 = [] 158 | # reslisttemp2 = []; 159 | # for i in range(20, 300, 10): 160 | # predict = tradaboost( 161 | # x_target_train, X, y_target_train, y, x_target_test, y_target_test, i, False) 162 | # mse_twostageboost = mean_squared_error(y_target_test, predict) 163 | # reslisttemp2.append(r2_score(y_target_test, predict)) 164 | # reslisttempx2.append(i) 165 | # print("r2 of tradaboost:", reslisttemp2) 166 | # 167 | plt.plot(reslisttempx1, reslisttemp1, marker='*', linestyle='dashed', linewidth=1, label="tradaboost") 168 | plt.plot(reslisttempx2, reslisttemp2, marker='+', linestyle='dashed', linewidth=1, label="mutisource vfkmm-tradaboost") 169 | plt.plot(range(4, 50, 1), [0.70]*46, marker='_', linestyle='dashed', linewidth=1, label="baseline") 170 | 171 | plt.xlabel("Iterations") 172 | plt.ylabel("score") 173 | plt.legend(loc="lower right") 174 | plt.show() 175 | 176 | 177 | 178 | predict = tradaboost( 179 | x_target_train, X, y_target_train, y, x_target_test,y_target_test, 300,True) 180 | mse_twostageboost = mean_squared_error(y_target_test, predict) 181 | print("MSE of tradaboost:", mse_twostageboost) 182 | print("r2 of tradaboost:", r2_score(y_target_test, predict)) 183 | 184 | 185 | # 186 | # 187 | predict2 = tradaboost( 188 | x_target_train, X, y_target_train, y, x_target_test,y_target_test, 300,False) 189 | mse_twostageboost = mean_squared_error(y_target_test, predict2) 190 | print("MSE of tradaboost margin:", mse_twostageboost) 191 | print("r2 of tradaboost: margin", r2_score(y_target_test, predict2)) 192 | 193 | 194 | 195 | 196 | # 4.4 Plot the results 197 | plt.figure() 198 | plt.scatter(x_target_train, y_target_train, c="k", label="target_train") 199 | plt.plot(x_target_test, y_target_test, c="b", label="target_test", linewidth=0.5) 200 | plt.plot(x_target_test, predict1, c="r", label="AdaBoostRegressor", linewidth=2) 201 | # plt.plot(x_target_test, predict, c="g", label="VFKMM-TrAdaBoost without margin", linewidth=2) 202 | # plt.plot(x_target_test, predict2, c="y", label="VFKMM-TrAdaBoost", linewidth=2) 203 | plt.xlabel("x") 204 | plt.ylabel("y") 205 | plt.legend(loc="lower left") 206 | plt.title("mutisource VFKMM-TrAdaBoost Regressor") 207 | plt.legend() 208 | # plt.show() 209 | 
-------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import math 4 | from sklearn.datasets import make_gaussian_quantiles 5 | import sklearn.svm 6 | from sklearn.datasets import fetch_20newsgroups 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | from sklearn.naive_bayes import MultinomialNB 10 | from sklearn.pipeline import Pipeline 11 | from scipy import interp 12 | from scipy import sparse 13 | from sklearn.linear_model import LogisticRegression 14 | import TR.TrAdaboostkmm as kmmtr 15 | import TR.TrAdaboost2 as tr 16 | import TR.SPY as SPY 17 | 18 | 19 | import matplotlib as mpl 20 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 21 | from text import util 22 | from sklearn.metrics import roc_curve, auc 23 | 24 | from TR.classification_report import * 25 | import time 26 | 27 | 28 | def main(): 29 | 30 | 31 | 32 | # ''' 33 | # 定义一个二分类问题,分类rec和sci,但是目标领域和源领域的数据来源不同 34 | # 如何控制源领域和目标领域数据的数量 35 | # ''' 36 | 37 | 38 | # target_categories = ["rec.sport.hockey", "rec.motorcycles","sci.crypt", "sci.electronics"] 39 | # source_categories = ["rec.sport.baseball", "rec.autos","sci.med", "sci.space"] 40 | 41 | 42 | target_categories = ["rec.autos", "rec.sport.baseball","sci.med", "sci.space"] 43 | source_categories = ["rec.motorcycles", "rec.sport.hockey","sci.crypt", "sci.electronics"] 44 | 45 | #实验组1 46 | target_categories = ["rec.autos", "sci.med"] 47 | source_categories = ["rec.sport.hockey", "sci.electronics"] 48 | 49 | #实验组2 50 | target_categories = ["comp.graphics", "rec.autos"] 51 | source_categories = ["comp.os.ms-windows.misc", "rec.sport.hockey"] 52 | 53 | # target_categories = ["rec.sport.hockey", "rec.motorcycles"] 54 | # source_categories = ["sci.med", "sci.space"] 55 | 56 | target_categories = ["sci.crypt", "sci.space", "talk.politics.guns", "talk.politics.mideast"] 57 | source_categories = ["sci.electronics", "sci.med", "talk.politics.misc", "talk.religion.misc"] 58 | 59 | 60 | # 实验组1 61 | target_categories = ["rec.autos", "sci.med"] 62 | source_categories = ["rec.sport.hockey", "sci.electronics"] 63 | 64 | target = fetch_20newsgroups(subset='test',categories = target_categories, shuffle = True, random_state = 42) 65 | source= fetch_20newsgroups(subset='test',categories = source_categories, shuffle = True, random_state = 42) 66 | 67 | # source.data = source.data[0:1000] 68 | # source.target = source.target[0:1000] 69 | # 70 | target.data = target.data[0:400] 71 | target.target = target.target[0:400] 72 | 73 | print(target.target) 74 | print(target.target_names) 75 | print(source.target_names) 76 | 77 | print('目标源的大小', len(target.data), '辅助源的大小', len(source.data)) 78 | 79 | # # 80 | # target.target[target.target == 0] = 0 81 | # target.target[target.target == 1] = 0 82 | # target.target[target.target == 2] = 1 83 | # target.target[target.target == 3] = 1 84 | # # print(type(target.target)) 85 | # # print(target.target) 86 | # 87 | # source.target[source.target == 0] = 0 88 | # source.target[source.target == 1] = 0 89 | # source.target[source.target == 2] = 1 90 | # source.target[source.target == 3] = 1 91 | 92 | 93 | merge_target_source = np.concatenate((target.data, source.data), axis=0) 94 | merge_target_source_label = np.concatenate((target.target, source.target), axis=0) 95 | # print(set(merge_target_source_label)) 96 
| 97 | # refine emails - delete unwanted text form them 98 | util.refine_all_emails(merge_target_source) 99 | # feature Extractoin 100 | # BOW Bag Of Words 101 | TFIDF = util.bagOfWords(merge_target_source) 102 | #TFIDF = sklearn.feature_extraction.text.TfidfTransformer(use_idf=False).fit(TFIDF) 103 | #TFIDF = sklearn.feature_extraction.text.TfidfTransformer.transform(TFIDF) 104 | 105 | length=len(target.data) 106 | X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(TFIDF[:length], merge_target_source_label[:length], test_size=0.6,random_state = 0) 107 | 108 | #X_train_temp, X_test, y_train_temp, y_test=sklearn.cross_validation.train_test_split(X_test,y_test) 109 | 110 | print("测试集的大小",y_test.shape) 111 | TFIDF = np.array(TFIDF.toarray()) 112 | merge_target_source_label=np.array(merge_target_source_label) 113 | print((X_train.shape)) 114 | print((TFIDF.shape)) 115 | # build classifier 116 | # clf = sklearn.svm.LinearSVC() 117 | 118 | 119 | clf = LogisticRegression() 120 | 121 | 122 | # print("辅助数据集和目标数据集一起训练",split_test_classifier(clf, X, 123 | # np.concatenate((y_train[:,None], merge_target_source_label[length:,None]), axis=0) 124 | # ,X_test[0:200,:],y_test[0:200,None])) 125 | X=sparse.vstack((X_train[:,:], TFIDF[length:,:])) 126 | print("辅助数据集和目标数据集一起训练", split_test_classifier(clf, X, 127 | np.concatenate( 128 | (y_train[:, None], merge_target_source_label[length:, None]), axis=0) 129 | , X_test, y_test)) 130 | # X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(TFIDF, merge_target_source_label, test_size=0.4,random_state = 0) 131 | 132 | 133 | clf = LogisticRegression() 134 | # print("目标数据集单独训练",split_test_classifier(clf,X_train, y_train 135 | # , X_test[0:200,:], y_test[0:200,None])) 136 | print("目标数据集单独训练",split_test_classifier(clf,X_train, y_train 137 | , X_test, y_test)) 138 | #def fit(self, diff_train, diff_label, same_train, same_train_label, MAX_ITERATION=30): 139 | #model.fit(TFIDF[length:,:], merge_target_source_label[length:,None], X_train, y_train[:,None],MAX_ITERATION=100) 140 | 141 | # 142 | # predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = SPY.tradaboost(X_train, TFIDF[length:, :], 143 | # y_train[:, None], 144 | # merge_target_source_label[ 145 | # length:, None], 146 | # X_test, y_test, 55, True) 147 | # print_classification_report('LR-TRadaboost', predict1, y_test) 148 | 149 | start =time.clock() 150 | predict2, accuracy_scorelist2, recall_scorelist2, f1_scorelist2 = tr.tradaboost( 151 | X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, 85) 152 | print_classification_report('TRadaboost', predict2, y_test) 153 | end = time.clock() 154 | print('Running time: %s Seconds'%(end-start)) 155 | 156 | # 原生的tradaboost 157 | Predict = []; 158 | 159 | reslist = []; 160 | reslistx = []; 161 | # for i in range(5, 200, 10): 162 | # predict2, accuracy_scorelist2, recall_scorelist2, f1_scorelist2 = tr.tradaboost( 163 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, i) 164 | # reslist.append(return_correct_rate(predict2, y_test)) 165 | # reslistx.append(i) 166 | #Predict = predict2 167 | 168 | # 169 | # reslist=[0.775, 0.825, 0.875, 0.875, 0.8875, 0.8958333333333334, 0.8958333333333334, 0.8791666666666667, 0.8875, 170 | # 0.8833333333333333, 0.8833333333333333, 0.8916666666666667, 0.9166666666666666, 0.9208333333333333, 0.9125, 171 | # 0.9208333333333333, 0.9125, 0.9125, 0.9041666666666667, 0.9] 172 | 173 
| #print_classification_report('TRadaboost', Predict, y_test) 174 | print(reslist) 175 | 176 | 177 | # plt.plot(reslistx, reslist, marker='+', linestyle='dashed', linewidth=1,label="tradaboost") # plt.plot(range(5,31,5), accuracy_scorelist[4:30:5],marker='x', linestyle='dashed',linewidth=1,label="vfkmm without eliminate") 178 | # plt.xlabel("迭代次数") 179 | # plt.ylabel("score") 180 | # plt.legend(loc="lower right") 181 | # plt.show() 182 | 183 | 184 | start = time.clock() 185 | #kmm排除低权重的样本 186 | predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = kmmtr.tradaboost(X_train, TFIDF[length:, :], 187 | y_train[:, None], 188 | merge_target_source_label[length:, None], 189 | X_test, y_test,20, True) 190 | # 191 | # 192 | print_classification_report('KMM-TRadaboost',predict1,y_test) 193 | 194 | 195 | end = time.clock() 196 | print('Running time: %s Seconds' % (end - start)) 197 | 198 | SPY 199 | reslisttempx = [] 200 | reslisttemp = []; 201 | for i in range(5, 80, 2): 202 | # predict3, accuracy_scorelist, recall_scorelist, f1_scorelist = SPY.tradaboost( 203 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, i, 204 | # True) 205 | # reslisttemp.append(return_correct_rate(predict3, y_test)) 206 | reslisttempx.append(i) 207 | #Predict = predict3 208 | #print_classification_report('lr-TRadaboost', Predict, y_test) 209 | reslisttemp=[0.875, 0.8791666666666667, 0.8958333333333334, 0.9083333333333333, 0.9041666666666667, 0.8916666666666667, 0.9, 0.9, 0.9041666666666667, 0.9083333333333333, 0.9, 0.9, 0.9083333333333333, 0.9041666666666667, 0.9041666666666667, 0.9083333333333333, 0.9041666666666667, 0.9041666666666667, 0.9041666666666667, 0.9041666666666667, 0.9041666666666667, 0.9125, 0.9208333333333333, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.9291666666666667, 0.925, 0.925, 0.9208333333333333, 0.9208333333333333, 0.9166666666666666, 0.9166666666666666, 0.9083333333333333, 0.9041666666666667] 210 | 211 | print(reslisttemp) 212 | print(reslisttempx) 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | reslisttempx1 = [] 221 | reslisttemp1 = []; 222 | for i in range(5, 80, 2): 223 | predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = kmmtr.tradaboost(X_train, TFIDF[length:, :], 224 | y_train[:, None], 225 | merge_target_source_label[ 226 | length:, None], 227 | X_test, y_test, i, True) 228 | reslisttemp1.append(return_correct_rate(predict1, y_test)) 229 | reslisttempx1.append(i) 230 | Predict = predict1 231 | print_classification_report('KMM-TRadaboost', Predict, y_test) 232 | print(reslisttemp1) 233 | print(reslisttempx1) 234 | 235 | plt.plot(reslisttempx1, reslisttemp1, marker='o', linestyle='dashed', linewidth=1, label="vfkmm tradaboost") 236 | plt.plot(reslistx, reslist, marker='+', linestyle='dashed', linewidth=1,label="tradaboost") # plt.plot(range(5,31,5), accuracy_scorelist[4:30:5],marker='x', linestyle='dashed',linewidth=1,label="vfkmm without eliminate") 237 | plt.plot(reslisttempx, reslisttemp, marker='x', linestyle='dashed', linewidth=1, label="LR tradaboost") 238 | plt.xlabel("迭代次数") 239 | plt.ylabel("score") 240 | plt.legend(loc="lower right") 241 | plt.show() 242 | 243 | 244 | 245 | # # SPY 246 | # reslisttempx=[] 247 | # reslisttemp = []; 248 | # for i in range(5,100,2): 249 | # predict3, accuracy_scorelist, recall_scorelist, f1_scorelist = SPY.tradaboost( 250 | # X_train, TFIDF[length:, :], y_train[:, None], 
251 | # True)
252 | # reslisttemp.append(return_correct_rate(predict3, y_test))
253 | # reslisttempx.append(i)
254 | # Predict = predict3
255 | # print_classification_report('lr-TRadaboost', Predict, y_test)
256 | # print(reslisttemp)
257 | 
258 | # predict, accuracy_scorelist, recall_scorelist, f1_scorelist = SPY.tradaboost(
259 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, 62, True)
260 | # print_classification_report('SPY', predict, y_test)
261 | #
262 | # predict2, accuracy_scorelist2, recall_scorelist2, f1_scorelist2 = tr.tradaboost(
263 | # X_train, TFIDF[length:, :], y_train[:, None], merge_target_source_label[length:, None], X_test, y_test, 100)
264 | # print_classification_report('TRadaboost', predict2, y_test)
265 | 
266 | 
267 | 
268 | 
269 | 
270 | 
271 | 
272 | # # KMM: drop the low-weight samples
273 | # predict1, accuracy_scorelist1, recall_scorelist1, f1_scorelist1 = kmmtr.tradaboost(X_train, TFIDF[length:, :],
274 | # y_train[:, None],
275 | # merge_target_source_label[length:, None],
276 | # X_test, y_test, 65, True)
277 | # #
278 | # #
279 | # print_classification_report('KMM-TRadaboost', predict1, y_test)
280 | # X_test, i), y_test)
281 | # plot the ROC curves and compute the AUC; first rescale the returned score lists to [0, 1]
282 | # min_max_scaler = sklearn.preprocessing.MinMaxScaler()
283 | #
284 | #
285 | # predict = np.asarray(predict)
286 | # predict = predict.reshape(len(y_test), 1)
287 | # predict = min_max_scaler.fit_transform(predict)
288 | # predict = predict.tolist()
289 | # fpr1, tpr1, thresholds1 = roc_curve(y_test, predict)
290 | # roc_auc1 = auc(fpr1, tpr1)
291 | #
292 | # # predict1 = np.asarray(predict1)
293 | # # predict1 = predict1.reshape(len(y_test), 1)
294 | # # predict1 = min_max_scaler.fit_transform(predict1)
295 | # # predict1 = predict1.tolist()
296 | # mean_tpr = 0.0
297 | # mean_fpr = np.linspace(0, 1, 100)
298 | # all_tpr = []
299 | # fpr, tpr, thresholds = roc_curve(y_test, predict1)  # the positive class must be given explicitly via pos_label if it is not 1
300 | # mean_tpr += interp(mean_fpr, fpr, tpr)  # interpolate mean_tpr at the mean_fpr grid using scipy's interp()
301 | # mean_tpr[0] = 0.0
302 | # roc_auc = auc(fpr, tpr)
303 | # plt.plot(fpr, tpr, lw=1, label='vfkmm-tradaboost AUC = %0.2f' % roc_auc)
304 | # plt.plot(fpr1, tpr1, lw=1, label='tradaboost AUC = %0.2f' % roc_auc1)
305 | # plt.legend(loc='lower right')
306 | # plt.plot([0, 1], [0, 1], 'm--', c='#666666')
307 | # plt.show()
308 | 
309 | 
310 | 
311 | plt.plot(reslisttempx, reslisttemp, marker='x', linestyle='dashed', linewidth=1, label="LR tradaboost")
312 | plt.plot(reslisttempx1, reslisttemp1, marker='o', linestyle='dashed', linewidth=1, label="vfkmm tradaboost")
313 | plt.plot(reslistx, reslist, marker='+', linestyle='dashed', linewidth=1, label="tradaboost") # plt.plot(range(5,31,5), accuracy_scorelist[4:30:5], marker='x', linestyle='dashed', linewidth=1, label="vfkmm without eliminate")
314 | # plt.plot(range(5,31,5), accuracy_scorelist1[4:30:5], marker='o', linestyle='dashed', linewidth=1, label="vfkmm eliminate")
315 | plt.xlabel("number of iterations")
316 | plt.ylabel("score")
317 | plt.legend(loc="lower right")
318 | plt.show()
319 | #res = return_correct_rate(tradaboost(X_train, TFIDF[length:,:], y_train[:,None], merge_target_source_label[length:,None], X_test[0:200,:], 100), y_test[0:200,None])
320 | 
321 | 
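The commented-out block above outlines how the ROC curves of the two boosters were compared: rescale each returned score list to [0, 1] with MinMaxScaler, compute fpr/tpr with roc_curve, and plot both curves with their AUC. A self-contained sketch of that comparison on made-up scores (standing in for predict and predict1) could look like the following; the toy labels and score vectors are illustrative only.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import MinMaxScaler

# made-up binary labels and raw scores standing in for y_test, predict (tradaboost)
# and predict1 (vfkmm-tradaboost)
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
scores_a = np.array([0.2, 0.4, 0.35, 0.8, 0.1, 0.7, 0.65, 0.3])
scores_b = np.array([0.3, 0.2, 0.6, 0.9, 0.15, 0.8, 0.7, 0.25])

# rescale the scores to [0, 1], as in the commented code above
scaler = MinMaxScaler()
scores_a = scaler.fit_transform(scores_a.reshape(-1, 1)).ravel()
scores_b = scaler.fit_transform(scores_b.reshape(-1, 1)).ravel()

fpr_a, tpr_a, _ = roc_curve(y_true, scores_a)  # pos_label defaults to 1 here
fpr_b, tpr_b, _ = roc_curve(y_true, scores_b)
plt.plot(fpr_a, tpr_a, lw=1, label='tradaboost AUC = %0.2f' % auc(fpr_a, tpr_a))
plt.plot(fpr_b, tpr_b, lw=1, label='vfkmm-tradaboost AUC = %0.2f' % auc(fpr_b, tpr_b))
plt.plot([0, 1], [0, 1], linestyle='--', c='#666666')  # chance line
plt.legend(loc='lower right')
plt.show()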
322 | from sklearn import svm
323 | def naive_model_return_error(train, y, test, test_y):
324 | """implement a comparative method as a naive model"""
325 | #model = sklearn.linear_model.LogisticRegression(C=10000, penalty='l1', tol=0.0001)
326 | model = svm.SVC(C=131072, gamma=0.0001, kernel='rbf', probability=True)
327 | model.fit(train, y)
328 | preds = model.predict(test)
329 | c = 0
330 | for i in range(len(preds)):
331 | if preds[i] == test_y[i]:
332 | c += 1
333 | res = c / len(test_y)
334 | return res
335 | 
336 | 
337 | def return_correct_rate(preds, target):
338 | c = 0
339 | for i in range(len(preds)):
340 | if preds[i] == target[i]:
341 | c += 1
342 | res = c / len(target)
343 | #print("accuracy", np.mean(preds == target), 'recall', recall_score(preds, target), 'F1 score', f1_score(preds, target))
344 | return res
345 | 
346 | def precision_score(y_true, y_pred):
347 | return ((y_true == 1) * (y_pred == 1)).sum() / (y_pred == 1).sum()
348 | def recall_score(y_true, y_pred):
349 | return ((y_true == 1) * (y_pred == 1)).sum() / (y_true == 1).sum()
350 | def f1_score(y_true, y_pred):
351 | num = 2 * precision_score(y_true, y_pred) * recall_score(y_true, y_pred)
352 | deno = (precision_score(y_true, y_pred) + recall_score(y_true, y_pred))
353 | return num / deno
354 | def split_test_classifier(clf, X, y, X_test, y_test):
355 | 
356 | clf.fit(X, y)
357 | # predict
358 | y_predicted = clf.predict(X_test)
359 | # calculate precision
360 | print_classification_report('', y_predicted, y_test)
361 | print("accuracy", np.mean(y_predicted == y_test), 'recall', recall_score(y_test, y_predicted), 'F1 score', f1_score(y_test, y_predicted))
362 | return np.mean(y_predicted == y_test)
363 | 
364 | def plot_results(i, results_list, labels_list):
365 | colors_list = ['red', 'blue', 'black', 'green', 'cyan', 'yellow']
366 | 
367 | if not len(results_list) == len(labels_list):
368 | raise ValueError("results_list and labels_list must have the same length")
369 | 
370 | for (result, label, color) in zip(results_list, labels_list, colors_list):
371 | plt.plot(i, result, color=color, lw=2.0, label=label)
372 | plt.legend()
373 | plt.show()
374 | if __name__ == "__main__":
375 | main()
376 | 
377 | 
378 | 
--------------------------------------------------------------------------------
/TrAdaboost.R2.py:
--------------------------------------------------------------------------------
1 | """
2 | An example showing the usage of the TwoStageTrAdaBoostR2 algorithm.
3 | Example starts at line 396.
4 | """ 5 | 6 | import numpy as np 7 | import copy 8 | from sklearn.tree import DecisionTreeRegressor 9 | import matplotlib.pyplot as plt 10 | from sklearn.ensemble import AdaBoostRegressor 11 | from sklearn.metrics import mean_squared_error 12 | from sklearn.model_selection import KFold 13 | from sklearn.metrics import r2_score 14 | 15 | # import matplotlib as mpl 16 | # mpl.rcParams['font.sans-serif'] = [u'SimHei'] 17 | ##============================================================================= 18 | 19 | # copy the two classes from TwoStageTrAdaBoostR2 algorithm 20 | 21 | ##============================================================================= 22 | class Stage2_TrAdaBoostR2: 23 | def __init__(self, 24 | base_estimator=DecisionTreeRegressor(max_depth=4), 25 | sample_size=None, 26 | n_estimators=50, 27 | learning_rate=1., 28 | loss='linear', 29 | random_state=np.random.mtrand._rand, 30 | margin=True): 31 | self.base_estimator = base_estimator 32 | self.sample_size = sample_size 33 | self.n_estimators = n_estimators 34 | self.learning_rate = learning_rate 35 | self.loss = loss 36 | self.random_state = random_state 37 | self.margin=margin 38 | 39 | 40 | def fit(self, X, y, sample_weight=None): 41 | # Check parameters 42 | if self.learning_rate <= 0: 43 | raise ValueError("learning_rate must be greater than zero") 44 | 45 | if sample_weight is None: 46 | # Initialize weights to 1 / n_samples 47 | sample_weight = np.empty(X.shape[0], dtype=np.float64) 48 | sample_weight[:] = 1. / X.shape[0] 49 | else: 50 | # Normalize existing weights 51 | sample_weight = sample_weight / sample_weight.sum(dtype=np.float64) 52 | 53 | # Check that the sample weights sum is positive 54 | if sample_weight.sum() <= 0: 55 | raise ValueError( 56 | "Attempting to fit with a non-positive " 57 | "weighted number of samples.") 58 | 59 | if self.sample_size is None: 60 | raise ValueError("Additional input required: sample size of source and target is missing") 61 | elif np.array(self.sample_size).sum() != X.shape[0]: 62 | raise ValueError("Input error: the specified sample size does not equal to the input size") 63 | 64 | # Clear any previous fit results 65 | self.estimators_ = [] 66 | self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) 67 | self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) 68 | 69 | for iboost in range( 70 | self.n_estimators): # this for loop is sequential and does not support parallel(revison is needed if making parallel) 71 | # Boosting step 72 | sample_weight, estimator_weight, estimator_error = self._stage2_adaboostR2( 73 | iboost, 74 | X, y, 75 | sample_weight) 76 | # Early termination 77 | if sample_weight is None: 78 | break 79 | 80 | self.estimator_weights_[iboost] = estimator_weight 81 | self.estimator_errors_[iboost] = estimator_error 82 | 83 | # Stop if error is zero 84 | if estimator_error == 0: 85 | break 86 | 87 | sample_weight_sum = np.sum(sample_weight) 88 | 89 | # Stop if the sum of sample weights has become non-positive 90 | if sample_weight_sum <= 0: 91 | break 92 | 93 | if iboost < self.n_estimators - 1: 94 | # Normalize 95 | sample_weight /= sample_weight_sum 96 | return self 97 | 98 | def _stage2_adaboostR2(self, iboost, X, y, sample_weight): 99 | 100 | estimator = copy.deepcopy( 101 | self.base_estimator) # some estimators allow for specifying random_state estimator = base_estimator(random_state=random_state) 102 | 103 | ## using sampling method to account for sample_weight as discussed in Drucker's paper 104 | # Weighted sampling 
of the training set with replacement 105 | cdf = np.cumsum(sample_weight) 106 | cdf /= cdf[-1] 107 | uniform_samples = self.random_state.random_sample(X.shape[0]) 108 | bootstrap_idx = cdf.searchsorted(uniform_samples, side='right') 109 | # searchsorted returns a scalar 110 | bootstrap_idx = np.array(bootstrap_idx, copy=False) 111 | 112 | # Fit on the bootstrapped sample and obtain a prediction 113 | # for all samples in the training set 114 | estimator.fit(X[bootstrap_idx], y[bootstrap_idx]) 115 | y_predict = estimator.predict(X) 116 | 117 | self.estimators_.append(estimator) # add the fitted estimator 118 | 119 | 120 | error_vect = np.abs(y_predict - y) 121 | 122 | if(self.margin): 123 | error_vect[error_vect <0.05] = 0 124 | error_max = error_vect.max() 125 | 126 | if error_max != 0.: 127 | error_vect /= error_max 128 | 129 | if self.loss == 'square': 130 | error_vect **= 2 131 | elif self.loss == 'exponential': 132 | error_vect = 1. - np.exp(- error_vect) 133 | 134 | # Calculate the average loss 135 | estimator_error = (sample_weight * error_vect).sum() 136 | 137 | if estimator_error <= 0: 138 | # Stop if fit is perfect 139 | return sample_weight, 1., 0. 140 | 141 | elif estimator_error >= 0.5: 142 | # Discard current estimator only if it isn't the only one 143 | if len(self.estimators_) > 1: 144 | self.estimators_.pop(-1) 145 | return None, None, None 146 | 147 | beta = estimator_error / (1. - estimator_error) 148 | # avoid overflow of np.log(1. / beta) 149 | if beta < 1e-308: 150 | beta = 1e-308 151 | estimator_weight = self.learning_rate * np.log(1. / beta) 152 | 153 | # Boost weight using AdaBoost.R2 alg except the weight of the source data 154 | # the weight of the source data are remained 155 | source_weight_sum = np.sum(sample_weight[:-self.sample_size[-1]]) / np.sum(sample_weight) 156 | target_weight_sum = np.sum(sample_weight[-self.sample_size[-1]:]) / np.sum(sample_weight) 157 | 158 | if not iboost == self.n_estimators - 1: 159 | sample_weight[-self.sample_size[-1]:] *= np.power( 160 | beta,(error_vect[-self.sample_size[-1]:]) * self.learning_rate) 161 | #(1. - error_vect[-self.sample_size[-1]:]) * self.learning_rate) 162 | # make the sum weight of the source data not changing 163 | source_weight_sum_new = np.sum(sample_weight[:-self.sample_size[-1]]) / np.sum(sample_weight) 164 | target_weight_sum_new = np.sum(sample_weight[-self.sample_size[-1]:]) / np.sum(sample_weight) 165 | if source_weight_sum_new != 0. 
and target_weight_sum_new != 0.: 166 | sample_weight[:-self.sample_size[-1]] = sample_weight[:-self.sample_size[ 167 | -1]] * source_weight_sum / source_weight_sum_new 168 | sample_weight[-self.sample_size[-1]:] = sample_weight[-self.sample_size[ 169 | -1]:] * target_weight_sum / target_weight_sum_new 170 | 171 | return sample_weight, estimator_weight, estimator_error 172 | 173 | def predict(self, X): 174 | # Evaluate predictions of all estimators 175 | predictions = np.array([ 176 | est.predict(X) for est in self.estimators_[:len(self.estimators_)]]).T 177 | 178 | # Sort the predictions 179 | sorted_idx = np.argsort(predictions, axis=1) 180 | 181 | # Find index of median prediction for each sample 182 | weight_cdf = np.cumsum(self.estimator_weights_[sorted_idx], axis=1) 183 | median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] 184 | median_idx = median_or_above.argmax(axis=1) 185 | 186 | median_estimators = sorted_idx[np.arange(X.shape[0]), median_idx] 187 | 188 | # Return median predictions 189 | return predictions[np.arange(X.shape[0]), median_estimators] 190 | 191 | 192 | class TwoStageTrAdaBoostR2: 193 | def __init__(self, 194 | base_estimator=DecisionTreeRegressor(max_depth=4), 195 | sample_size=None, 196 | n_estimators=50, 197 | steps=10, 198 | fold=5, 199 | learning_rate=1., 200 | loss='linear', 201 | random_state=np.random.mtrand._rand, 202 | margin=True 203 | ): 204 | self.base_estimator = base_estimator 205 | self.sample_size = sample_size 206 | self.n_estimators = n_estimators 207 | self.steps = steps 208 | self.fold = fold 209 | self.learning_rate = learning_rate 210 | self.loss = loss 211 | self.random_state = random_state 212 | self.margin=margin 213 | 214 | def fit(self, X, y, sample_weight=None): 215 | # Check parameters 216 | if self.learning_rate <= 0: 217 | raise ValueError("learning_rate must be greater than zero") 218 | 219 | if sample_weight is None: 220 | # Initialize weights to 1 / n_samples 221 | sample_weight = np.empty(X.shape[0], dtype=np.float64) 222 | sample_weight[:] = 1. 
/ X.shape[0]
223 | else:
224 | # Normalize existing weights
225 | sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)
226 | 
227 | # Check that the sample weights sum is positive
228 | if sample_weight.sum() <= 0:
229 | raise ValueError(
230 | "Attempting to fit with a non-positive "
231 | "weighted number of samples.")
232 | 
233 | if self.sample_size is None:
234 | raise ValueError("Additional input required: sample size of source and target is missing")
235 | elif np.array(self.sample_size).sum() != X.shape[0]:
236 | raise ValueError("Input error: the specified sample size does not equal the input size")
237 | 
238 | X_source = X[:-self.sample_size[-1]]
239 | y_source = y[:-self.sample_size[-1]]
240 | X_target = X[-self.sample_size[-1]:]
241 | y_target = y[-self.sample_size[-1]:]
242 | 
243 | self.models_ = []
244 | self.errors_ = []
245 | for istep in range(self.steps):
246 | model = Stage2_TrAdaBoostR2(self.base_estimator,
247 | sample_size=self.sample_size,
248 | n_estimators=self.n_estimators,
249 | learning_rate=self.learning_rate, loss=self.loss,
250 | random_state=self.random_state, margin=self.margin)
251 | model.fit(X, y, sample_weight=sample_weight)
252 | self.models_.append(model)
253 | # cv training
254 | kf = KFold(n_splits=self.fold)
255 | error = []
256 | target_weight = sample_weight[-self.sample_size[-1]:]
257 | source_weight = sample_weight[:-self.sample_size[-1]]
258 | for train, test in kf.split(X_target):
259 | sample_size = [self.sample_size[0], len(train)]
260 | model = Stage2_TrAdaBoostR2(self.base_estimator,
261 | sample_size=sample_size,
262 | n_estimators=self.n_estimators,
263 | learning_rate=self.learning_rate, loss=self.loss,
264 | random_state=self.random_state, margin=self.margin)
265 | X_train = np.concatenate((X_source, X_target[train]))
266 | y_train = np.concatenate((y_source, y_target[train]))  # source labels plus the target CV-train fold labels
267 | X_test = X_target[test]
268 | y_test = y_target[test]
269 | # make sure the summed weight of the target data does not change with the CV split's sampling
270 | target_weight_train = target_weight[train] * np.sum(target_weight) / np.sum(target_weight[train])
271 | model.fit(X_train, y_train, sample_weight=np.concatenate((source_weight, target_weight_train)))
272 | y_predict = model.predict(X_test)
273 | error.append(mean_squared_error(y_predict, y_test))
274 | 
275 | self.errors_.append(np.array(error).mean())
276 | 
277 | sample_weight = self._twostage_adaboostR2(istep, X, y, sample_weight)
278 | 
279 | if sample_weight is None:
280 | break
281 | if np.array(error).mean() == 0:
282 | break
283 | 
284 | sample_weight_sum = np.sum(sample_weight)
285 | 
286 | # Stop if the sum of sample weights has become non-positive
287 | if sample_weight_sum <= 0:
288 | break
289 | 
290 | if istep < self.steps - 1:
291 | # Normalize
292 | sample_weight /= sample_weight_sum
293 | return self
294 | 
295 | def _twostage_adaboostR2(self, istep, X, y, sample_weight):
296 | 
297 | estimator = copy.deepcopy(
298 | self.base_estimator) # some estimators allow for specifying random_state estimator = base_estimator(random_state=random_state)
299 | 
300 | ## using sampling method to account for sample_weight as discussed in Drucker's paper
301 | # Weighted sampling of the training set with replacement
302 | cdf = np.cumsum(sample_weight)
303 | cdf /= cdf[-1]
304 | uniform_samples = self.random_state.random_sample(X.shape[0])
305 | bootstrap_idx = cdf.searchsorted(uniform_samples, side='right')
306 | # searchsorted returns a scalar
307 | bootstrap_idx = np.array(bootstrap_idx, copy=False)
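# Illustration of the weighted bootstrap above: turning sample_weight into a CDF and
# mapping uniform draws back through searchsorted picks index i with probability
# proportional to sample_weight[i]. For example, with sample_weight = [0.1, 0.2, 0.7]
# the CDF is [0.1, 0.3, 1.0]; a uniform draw of 0.25 lands between 0.1 and 0.3 and
# selects index 1, while any draw above 0.3 selects index 2, so heavily weighted
# samples are resampled more often. This is the resampling trick from Drucker's
# AdaBoost.R2 for base estimators that do not accept sample_weight directly.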
308 | 309 | # Fit on the bootstrapped sample and obtain a prediction 310 | # for all samples in the training set 311 | estimator.fit(X[bootstrap_idx], y[bootstrap_idx]) 312 | y_predict = estimator.predict(X) 313 | 314 | error_vect = np.abs(y_predict - y) 315 | error_max = error_vect.max() 316 | 317 | if error_max != 0.: 318 | error_vect /= error_max 319 | 320 | if self.loss == 'square': 321 | error_vect **= 2 322 | elif self.loss == 'exponential': 323 | error_vect = 1. - np.exp(- error_vect) 324 | 325 | # Update the weight vector 326 | beta = self._beta_binary_search(istep, sample_weight, error_vect, stp=1e-30) 327 | 328 | if not istep == self.steps - 1: 329 | sample_weight[:-self.sample_size[-1]] *= np.power( 330 | beta, 331 | (error_vect[:-self.sample_size[-1]]) * self.learning_rate) 332 | return sample_weight 333 | 334 | def _beta_binary_search(self, istep, sample_weight, error_vect, stp): 335 | # calculate the specified sum of weight for the target data 336 | n_target = self.sample_size[-1] 337 | n_source = np.array(self.sample_size).sum() - n_target 338 | theoretical_sum = n_target / (n_source + n_target) + istep / (self.steps - 1) * ( 339 | 1 - n_target / (n_source + n_target)) 340 | # for the last iteration step, beta is 0. 341 | if istep == self.steps - 1: 342 | beta = 0. 343 | return beta 344 | # binary search for beta 345 | L = 0. 346 | R = 1. 347 | beta = (L + R) / 2 348 | sample_weight_ = copy.deepcopy(sample_weight) 349 | sample_weight_[:-n_target] *= np.power( 350 | beta, 351 | (error_vect[:-n_target]) * self.learning_rate) 352 | sample_weight_ /= np.sum(sample_weight_, dtype=np.float64) 353 | updated_weight_sum = np.sum(sample_weight_[-n_target:], dtype=np.float64) 354 | 355 | while np.abs(updated_weight_sum - theoretical_sum) > 0.01: 356 | if updated_weight_sum < theoretical_sum: 357 | R = beta - stp 358 | if R > L: 359 | beta = (L + R) / 2 360 | sample_weight_ = copy.deepcopy(sample_weight) 361 | sample_weight_[:-n_target] *= np.power( 362 | beta, 363 | (error_vect[:-n_target]) * self.learning_rate) 364 | sample_weight_ /= np.sum(sample_weight_, dtype=np.float64) 365 | updated_weight_sum = np.sum(sample_weight_[-n_target:], dtype=np.float64) 366 | else: 367 | print("At step:", istep + 1) 368 | print("Binary search's goal not meeted! Value is set to be the available best!") 369 | print("Try reducing the search interval. Current stp interval:", stp) 370 | break 371 | 372 | elif updated_weight_sum > theoretical_sum: 373 | L = beta + stp 374 | if L < R: 375 | beta = (L + R) / 2 376 | sample_weight_ = copy.deepcopy(sample_weight) 377 | sample_weight_[:-n_target] *= np.power( 378 | beta, 379 | (error_vect[:-n_target]) * self.learning_rate) 380 | sample_weight_ /= np.sum(sample_weight_, dtype=np.float64) 381 | updated_weight_sum = np.sum(sample_weight_[-n_target:], dtype=np.float64) 382 | else: 383 | print("At step:", istep + 1) 384 | print("Binary search's goal not meeted! Value is set to be the available best!") 385 | print("Try reducing the search interval. 
Current stp interval:", stp) 386 | break 387 | return beta 388 | 389 | def predict(self, X): 390 | # select the model with the least CV error 391 | fmodel = self.models_[np.array(self.errors_).argmin()] 392 | predictions = fmodel.predict(X) 393 | return predictions 394 | 395 | ##============================================================================= 396 | 397 | 398 | # end copying the two classes 399 | 400 | ##============================================================================= 401 | 402 | # Example 1 403 | 404 | ##============================================================================= 405 | 406 | # 1. define the data generating function 407 | def response(x, d, random_state): 408 | """ 409 | x is the input variable 410 | d controls the simularity of different tasks 411 | """ 412 | a1 = np.random.normal(1, 0.1 * d) 413 | a2 = np.random.normal(1, 0.1 * d) 414 | b1 = np.random.normal(1, 0.1 * d) 415 | b2 = np.random.normal(1, 0.1 * d) 416 | c1 = np.random.normal(1, 0.05 * d) 417 | c2 = np.random.normal(1, 0.05 * d) 418 | y = a1 * np.sin(b1 * x + c1).ravel() + a2 * np.sin(b2 * 6 * x + c2).ravel() + random_state.normal(0, 0.1, 419 | x.shape[0]) 420 | return y 421 | 422 | 423 | # ============================================================================== 424 | 425 | # 2. decide the degree of similarity of multiple data sources using d 426 | 427 | d = 5 428 | # ============================================================================== 429 | rng = np.random.RandomState(1) 430 | 431 | # 3.1 create source data and target data 432 | n_source1 = 100 433 | x_source1 = np.linspace(0, 6, n_source1)[:, np.newaxis] 434 | y_source1 = response(x_source1, d, rng) 435 | n_source2 = 100 436 | x_source2 = np.linspace(0, 6, n_source2)[:, np.newaxis] 437 | y_source2 = response(x_source2, d, rng) 438 | n_source3 = 100 439 | x_source3 = np.linspace(0, 6, n_source3)[:, np.newaxis] 440 | y_source3 = response(x_source3, d, rng) 441 | n_source4 = 100 442 | x_source4 = np.linspace(0, 6, n_source4)[:, np.newaxis] 443 | y_source4 = response(x_source4, d, rng) 444 | n_source5 = 100 445 | x_source5 = np.linspace(0, 6, n_source5)[:, np.newaxis] 446 | y_source5 = response(x_source5, d, rng) 447 | 448 | # 3.2 create target data (n_target_train and n_target_test are the sample size of train and test datasets) 449 | a1 = np.random.normal(1, 0.1 * d) 450 | a2 = np.random.normal(1, 0.1 * d) 451 | b1 = np.random.normal(1, 0.1 * d) 452 | b2 = np.random.normal(1, 0.1 * d) 453 | c1 = np.random.normal(1, 0.05 * d) 454 | c2 = np.random.normal(1, 0.05 * d) 455 | 456 | # target_train 457 | # ============================================================================== 458 | 459 | n_target_train = 15 460 | 461 | # ============================================================================== 462 | x_target_train = np.linspace(0, 6, n_target_train)[:, np.newaxis] 463 | y_target_train = a1 * np.sin(b1 * x_target_train + c1).ravel() + a2 * np.sin( 464 | b2 * 6 * x_target_train + c2).ravel() + rng.normal(0, 0.1, x_target_train.shape[0]) 465 | 466 | # target_test 467 | n_target_test = 600 468 | x_target_test = np.linspace(0, 6, n_target_test)[:, np.newaxis] 469 | y_target_test = a1 * np.sin(b1 * x_target_test + c1).ravel() + a2 * np.sin( 470 | b2 * 6 * x_target_test + c2).ravel() + rng.normal(0, 0.1, x_target_test.shape[0]) 471 | 472 | # 3.3 plot the generated data 473 | plt.figure() 474 | plt.plot(x_source1, y_source1, c="r", label="source1", linewidth=1) 475 | plt.plot(x_source2, y_source2, c="y", label="source2", 
linewidth=1) 476 | plt.plot(x_source3, y_source3, c="g", label="source3", linewidth=1) 477 | plt.plot(x_source4, y_source4, c="c", label="source4", linewidth=1) 478 | plt.plot(x_source5, y_source5, c="m", label="source5", linewidth=1) 479 | plt.plot(x_target_test, y_target_test, c="b", label="target_test", linewidth=0.5) 480 | plt.scatter(x_target_train, y_target_train, c="k", label="target_train") 481 | plt.xlabel("x") 482 | plt.ylabel("y") 483 | plt.title("Multiple datasets") 484 | plt.legend() 485 | plt.show() 486 | 487 | # 4. transfer learning regressiong for the target_train data 488 | # 4.1 data combination and initial setting specification 489 | X = np.concatenate((x_source1, x_source2, x_source3, x_source4, x_source5, x_target_train)) 490 | y = np.concatenate((y_source1, y_source2, y_source3, y_source4, y_source5, y_target_train)) 491 | sample_size = [n_source1 + n_source2 + n_source3 + n_source4 + n_source5, n_target_train] 492 | 493 | # ============================================================================== 494 | 495 | n_estimators = 10 496 | steps = 100 497 | fold = 5 498 | random_state = np.random.RandomState(1) 499 | 500 | # ============================================================================== 501 | 502 | # 4.2 TwoStageAdaBoostR2 503 | regr_1 = TwoStageTrAdaBoostR2(DecisionTreeRegressor(max_depth=6), 504 | n_estimators=n_estimators, sample_size=sample_size, 505 | steps=steps, fold=fold, 506 | random_state=random_state) 507 | regr_1.fit(X, y) 508 | y_pred1 = regr_1.predict(x_target_test) 509 | 510 | # 4.3 As comparision, use AdaBoostR2 without transfer learning 511 | # ============================================================================== 512 | 513 | # ============================================================================== 514 | 515 | # 4.2 TwoStageAdaBoostR2 without margin 516 | regr_3 = TwoStageTrAdaBoostR2(DecisionTreeRegressor(max_depth=6), 517 | n_estimators=n_estimators, sample_size=sample_size, 518 | steps=steps, fold=fold, 519 | random_state=random_state,margin=False) 520 | regr_3.fit(X, y) 521 | y_pred3 = regr_3.predict(x_target_test) 522 | 523 | # 4.3 As comparision, use AdaBoostR2 without transfer learning 524 | # ============================================================================== 525 | regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), 526 | n_estimators=n_estimators) 527 | # ============================================================================== 528 | regr_2.fit(x_target_train, y_target_train) 529 | y_pred2 = regr_2.predict(x_target_test) 530 | 531 | 532 | 533 | # 4.5 Calculate mse 534 | mse_twostageboost = mean_squared_error(y_target_test, y_pred1) 535 | mse_adaboost = mean_squared_error(y_target_test, y_pred2) 536 | mse_twostageboost3 = mean_squared_error(y_target_test, y_pred3) 537 | print("MSE of regular AdaboostR2:", mse_adaboost) 538 | print("MSE of TwoStageTrAdaboostR2:", mse_twostageboost) 539 | print("MSE of TwoStageTrAdaboostR2 without margin:", mse_twostageboost3) 540 | # ============================================================================== 541 | print("r2 of regular AdaboostR2:", r2_score(y_target_test, y_pred2)) 542 | print("r2 of TwoStageTrAdaboostR2:", r2_score(y_target_test, y_pred1)) 543 | print("r2 of TwoStageTrAdaboostR2 without margin:", r2_score(y_target_test, y_pred3)) 544 | 545 | 546 | 547 | # 4.4 Plot the results 548 | # plt.figure() 549 | # plt.scatter(x_target_train, y_target_train, c="k", label="target_train") 550 | # plt.plot(x_target_test, y_target_test, c="b", 
label="target_test", linewidth=0.5) 551 | # plt.plot(x_target_test, y_pred1, c="r", label="VFKMM-TrAdaBoost", linewidth=2) 552 | # plt.plot(x_target_test, y_pred2, c="y", label="AdaBoostRegressor", linewidth=2) 553 | # plt.plot(x_target_test, y_pred3, c="g", label="VFKMM-TrAdaBoost without margin", linewidth=2) 554 | # plt.xlabel("x") 555 | # plt.ylabel("y") 556 | # plt.legend(loc="lower left") 557 | # plt.title("mutisource VFKMM-TrAdaBoost Regressor") 558 | # plt.legend() 559 | # plt.show() 560 | 561 | --------------------------------------------------------------------------------