├── Backend.py
├── Frontend.py
├── HRM.py
├── README.md
├── Selection_bias.py
└── __init__.py

/Backend.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import math
import torch.optim as optim
import numpy as np
from torch.autograd import grad


class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim=1):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim, bias=False)
        self.weight_init()

    def weight_init(self):
        torch.nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):
        return self.linear(x)


def pretty(vector):
    if type(vector) is list:
        vlist = vector
    elif type(vector) is np.ndarray:
        vlist = vector.reshape(-1).tolist()
    else:
        vlist = vector.view(-1).tolist()
    return "[" + ", ".join("{:+.4f}".format(vi) for vi in vlist) + "]"


# Feature selection part
class FeatureSelector(nn.Module):
    def __init__(self, input_dim, sigma):
        super(FeatureSelector, self).__init__()
        self.mu = torch.nn.Parameter(0.00 * torch.randn(input_dim), requires_grad=True)
        self.noise = torch.randn(self.mu.size())
        self.sigma = sigma
        self.input_dim = input_dim

    def renew(self):
        self.mu = torch.nn.Parameter(0.00 * torch.randn(self.input_dim), requires_grad=True)
        self.noise = torch.randn(self.mu.size())

    def forward(self, prev_x):
        # Noise is only injected during training (self.training is 0 in eval mode).
        z = self.mu + self.sigma * self.noise.normal_() * self.training
        stochastic_gate = self.hard_sigmoid(z)
        new_x = prev_x * stochastic_gate
        return new_x

    def hard_sigmoid(self, x):
        return torch.clamp(x + 0.5, 0.0, 1.0)

    def regularizer(self, x):
        # Standard Gaussian CDF.
        return 0.5 * (1 + torch.erf(x / math.sqrt(2)))

    def _apply(self, fn):
        super(FeatureSelector, self)._apply(fn)
        self.noise = fn(self.noise)
        return self
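
# A note on the gates (the FeatureSelector follows the stochastic-gates code of
# Yamada et al., reference [2] in the README): each covariate d is multiplied by
#     gate_d = clamp(mu_d + sigma * eps_d + 0.5, 0, 1),  eps_d ~ N(0, 1),
# so gate_d is nonzero exactly when mu_d + sigma * eps_d + 0.5 > 0, which happens
# with probability Phi((mu_d + 0.5) / sigma) -- precisely what regularizer() is
# evaluated on in MpModel below. Summing that probability over d approximates the
# expected number of open gates, which the backend pushes towards `hard_sum`.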

class MpModel:
    def __init__(self, input_dim, output_dim, sigma=1.0, lam=0.1, alpha=0.5, hard_sum=1.0, penalty='Ours'):
        self.backmodel = LinearRegression(input_dim, output_dim)
        self.loss = nn.MSELoss()
        self.featureSelector = FeatureSelector(input_dim, sigma)
        self.reg = self.featureSelector.regularizer
        self.lam = lam
        self.mu = self.featureSelector.mu
        self.sigma = self.featureSelector.sigma
        self.alpha = alpha
        self.optimizer = optim.Adam([{'params': self.backmodel.parameters(), 'lr': 1e-3},
                                     {'params': self.mu, 'lr': 3e-4}])
        self.penalty = penalty
        self.hard_sum = hard_sum
        self.input_dim = input_dim
        self.accumulate_mip_penalty = torch.tensor(np.zeros(input_dim, dtype=np.float32))

    def renew(self):
        self.featureSelector.renew()
        self.mu = self.featureSelector.mu
        self.backmodel.weight_init()
        self.optimizer = optim.Adam([{'params': self.backmodel.parameters(), 'lr': 1e-3},
                                     {'params': self.mu, 'lr': 3e-4}])

    def combine_envs(self, envs):
        X = []
        y = []
        for env in envs:
            X.append(env[0])
            y.append(env[1])
        X = torch.cat(X, dim=0)
        y = torch.cat(y, dim=0)
        return X.reshape(-1, X.shape[1]), y.reshape(-1, 1)

    def pretrain(self, envs, pretrain_epoch=100):
        # Warm-start the regression weights on the pooled data before the
        # alternating optimization begins.
        pre_optimizer = optim.Adam([{'params': self.backmodel.parameters(), 'lr': 1e-3}])
        X, y = self.combine_envs(envs)

        for i in range(pretrain_epoch):
            pre_optimizer.zero_grad()
            pred = self.backmodel(X)
            loss = self.loss(pred, y.reshape(pred.shape))
            loss.backward()
            pre_optimizer.step()

    def single_forward(self, x, regularizer_flag=False):
        output_x = self.featureSelector(x)
        if regularizer_flag:
            # Detach the gated input so the penalty reaches mu only through the
            # explicit (mu + 0.5) factor in single_iter_mip.
            x = output_x.clone().detach()
        else:
            x = output_x
        return self.backmodel(x)
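
    # The invariance penalty computed below uses the per-environment gradient of
    # the loss w.r.t. the regression weights as the signal: grad_list holds one
    # gradient vector per environment, penalty accumulates the squared deviation
    # of each from the mean gradient, and the result is weighted per-coordinate
    # by the gate values (mu + 0.5) before being added to the objective.
    # Coordinates whose optimal weights disagree across environments therefore
    # get their gates pushed shut, while lam * (sum of open-gate probabilities
    # - hard_sum)^2 keeps roughly hard_sum gates open.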
    def single_iter_mip(self, envs):
        assert type(envs) == list
        num_envs = len(envs)
        loss_avg = 0.0
        grad_avg = 0.0
        grad_list = []
        for [x, y] in envs:
            pred = self.single_forward(x)
            loss = self.loss(pred, y.reshape(pred.shape))
            loss_avg += loss / num_envs

        for [x, y] in envs:
            pred = self.single_forward(x, True)
            loss = self.loss(pred, y.reshape(pred.shape))
            grad_single = grad(loss, self.backmodel.parameters(), create_graph=True)[0].reshape(-1)
            grad_avg += grad_single / num_envs
            grad_list.append(grad_single)

        penalty = torch.tensor(np.zeros(self.input_dim, dtype=np.float32))
        for gradient in grad_list:
            penalty += (gradient - grad_avg) ** 2

        penalty_detach = torch.sum(penalty.reshape(self.mu.shape) * (self.mu + 0.5))
        reg = torch.sum(self.reg((self.mu + 0.5) / self.sigma))
        reg = (reg - self.hard_sum) ** 2
        total_loss = loss_avg + self.alpha * penalty_detach
        total_loss = total_loss + self.lam * reg
        return total_loss, penalty_detach, self.reg((self.mu + 0.5) / self.sigma)

    def get_gates(self):
        return pretty(self.mu + 0.5)

    def get_paras(self):
        return pretty(self.backmodel.linear.weight)

    def train(self, envs, epochs):
        self.renew()
        self.pretrain(envs, 3000)
        for epoch in range(1, epochs + 1):
            self.optimizer.zero_grad()
            loss, penalty, reg = self.single_iter_mip(envs)
            loss.backward()
            self.optimizer.step()
            if epoch % epochs == 0:
                # Log once, at the final epoch.
                print("Epoch %d | Loss = %.4f | Gates %s | Theta = %s" %
                      (epoch, loss, self.get_gates(), pretty(self.backmodel.linear.weight)))
        return self.mu + 0.5, reg
--------------------------------------------------------------------------------
/Frontend.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn


def pretty(vector):
    if type(vector) is list:
        vlist = vector
    elif type(vector) is np.ndarray:
        vlist = vector.reshape(-1).tolist()
    else:
        vlist = vector.view(-1).tolist()
    return "[" + ", ".join("{:+.4f}".format(vi) for vi in vlist) + "]"


class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim=1):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim, bias=True)
        self.weight_init()

    def weight_init(self):
        torch.nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):
        return self.linear(x)


class WeightedLasso:
    def __init__(self, X, y, weight, lam):
        self.model = LinearRegression(X.shape[1], 1)
        self.X = X
        self.y = y
        self.weight = weight.reshape(-1, 1)
        self.loss = nn.MSELoss()
        self.lam = lam
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)

    def train(self):
        self.model.weight_init()
        epochs = 3000

        for epoch in range(epochs):
            self.optimizer.zero_grad()
            pred = self.model(self.X)
            loss = self.loss(pred, self.y) + \
                self.lam * torch.mean(torch.abs(self.weight * self.model.linear.weight.reshape(self.weight.shape)))
            # retain_graph is needed because `weight` may carry the backend's
            # autograd graph across iterations.
            loss.backward(retain_graph=True)
            self.optimizer.step()
        return self.model.linear.weight.clone().cpu().detach(), self.model.linear.bias.clone().cpu().detach()
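
# McModel is the frontend M_c: a k-regressions clustering loop. Starting from a
# random (or reused) assignment of samples to `num_classes` latent environments,
# it alternates between (i) fitting one WeightedLasso regression per environment,
# where the per-coefficient lasso weights are the backend's current selection
# probabilities -- so coefficients the backend already deems stable are shrunk,
# and the cluster structure is driven mainly by the variant covariates -- and
# (ii) reassigning every sample to the environment whose regression gives it the
# smallest absolute residual, until fewer than `delta_threshold` samples move.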
class McModel:
    def __init__(self, num_classes, X, y):
        self.num_classes = num_classes
        self.X = X
        self.y = y.reshape(-1, 1)
        self.center = None
        self.bias = None
        self.domain = None
        self.weights = None

    def ols(self):
        for i in range(self.num_classes):
            index = torch.where(self.domain == i)[0]
            tempx = (self.X[index, :]).reshape(-1, self.X.shape[1])
            tempy = (self.y[index, :]).reshape(-1, 1)
            clf = WeightedLasso(tempx, tempy, self.weights, 1.0)
            self.center[i, :], self.bias[i] = clf.train()

    def cluster(self, weight, past_domains, reuse=False):
        self.center = torch.tensor(np.zeros((self.num_classes, self.X.shape[1]), dtype=np.float32))
        self.bias = torch.tensor(np.zeros(self.num_classes, dtype=np.float32))

        if past_domains is None or not reuse:
            self.domain = torch.tensor(np.random.randint(0, self.num_classes, self.X.shape[0]))
        else:
            self.domain = past_domains
        assert self.domain.shape[0] == self.X.shape[0]
        self.weights = weight

        n_iter = 0
        end_flag = False
        delta_threshold = 250

        while not end_flag:
            n_iter += 1
            self.ols()
            ols_error = []
            for i in range(self.num_classes):
                coef = self.center[i].reshape(-1, 1)
                error = torch.abs(torch.mm(self.X, coef) + self.bias[i] - self.y)
                assert error.shape == (self.X.shape[0], 1)
                ols_error.append(error)
            ols_error = torch.stack(ols_error, dim=0).reshape(self.num_classes, self.X.shape[0])
            new_domain = torch.argmin(ols_error, dim=0)
            assert new_domain.shape[0] == self.X.shape[0]
            # delta counts how many samples changed environment this round.
            diff = self.domain.reshape(-1, 1) - new_domain.reshape(-1, 1)
            diff[diff != 0] = 1
            delta = torch.sum(diff)
            if n_iter % 10 == 9:
                print("Iter %d | Delta = %d" % (n_iter, delta))
            if delta <= delta_threshold:
                end_flag = True
            self.domain = new_domain

        environments = []
        for i in range(self.num_classes):
            index = torch.where(self.domain == i)[0]
            tempx = (self.X[index, :]).reshape(-1, self.X.shape[1])
            tempy = (self.y[index, :]).reshape(-1, 1)
            environments.append([tempx, tempy])
        return environments, self.domain


def combine_envs(envs):
    X = []
    y = []
    for env in envs:
        X.append(env[0])
        y.append(env[1])
    X = torch.cat(X, dim=0)
    y = torch.cat(y, dim=0)
    return X.reshape(-1, X.shape[1]), y.reshape(-1, 1)
--------------------------------------------------------------------------------
/HRM.py:
--------------------------------------------------------------------------------
from Backend import MpModel, pretty
from Frontend import McModel
from Selection_bias import Multi_env_selection_bias, generate_test, modified_Multi_env_selection_bias
import torch
import numpy as np
import os
from sklearn.linear_model import LinearRegression


class HRM:
    def __init__(self, front_params, back_params, X, y, test_X=None, test_y=None):
        self.X = X
        self.y = y
        self.test_X = [test_X]
        self.test_y = [test_y]
        self.frontend = McModel(front_params['num_clusters'], self.X, self.y)
        self.backend = MpModel(input_dim=back_params['input_dim'],
                               output_dim=back_params['output_dim'],
                               sigma=back_params['sigma'],
                               lam=back_params['lam'],
                               alpha=back_params['alpha'],
                               hard_sum=back_params['hard_sum'])
        self.domains = None
        self.weight = torch.tensor(np.zeros(self.X.shape[1], dtype=np.float32))

    def solve(self, iters):
        self.density_result = None
        density_record = []
        flag = False
        for i in range(iters):
            # Alternate between the frontend (environment inference) and the
            # backend (feature selection + invariant learning), feeding the
            # backend's selection probabilities back into the clustering and
            # annealing the penalty strengths.
            environments, self.domains = self.frontend.cluster(self.weight, self.domains, flag)
            weight, density = self.backend.train(environments, epochs=6000)
            density_record.append(density)
            self.density_result = density
            self.weight = density
            self.backend.lam *= 1.05
            self.backend.alpha *= 1.05
            print('Selection Ratio is %s' % pretty(self.weight))

        with open('./save.txt', 'a+') as f:
            print('Density results:')
            for i in range(len(density_record)):
                print("Iter %d Density %s" % (i, pretty(density_record[i])))
                f.writelines(pretty(density_record[i]) + '\n')
        return self.weight

    def test(self, test_envs):
        test_accs = []
        self.backend.backmodel.eval()
        self.backend.featureSelector.eval()
        for i in range(len(test_envs)):
            pred = self.backend.single_forward(test_envs[i][0])
            error = torch.sqrt(torch.mean((pred.reshape(test_envs[i][1].shape) - test_envs[i][1]) ** 2))
            test_accs.append(error.data)

        print(pretty(test_accs))
        self.backend.backmodel.train()
        self.backend.featureSelector.train()
        return test_accs


def combine_envs(envs):
    X = []
    y = []
    for env in envs:
        X.append(env[0])
        y.append(env[1])
    X = torch.cat(X, dim=0)
    y = torch.cat(y, dim=0)
    return X.reshape(-1, X.shape[1]), y.reshape(-1, 1)


def seed_torch(seed=2018):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
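
# Post-hoc evaluation used in the experiment below: after HRM produces a
# selection probability per covariate, the covariates above `overall_threshold`
# are kept and a plain least-squares model is refit on them; its RMSE is then
# measured on test environments with unseen bias strengths.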
class EmpiricalRiskMinimizer(object):
    def __init__(self, X, y, mask):
        x_all = X.numpy()
        y_all = y.numpy()
        self.mask = mask
        x_all = x_all[:, self.mask]
        w = LinearRegression(fit_intercept=False).fit(x_all, y_all).coef_
        self.w = torch.Tensor(w)

    def solution(self):
        return self.w

    def test(self, X, y):
        X = X.numpy()
        X = X[:, self.mask]
        y = y.numpy()
        err = np.mean((X.dot(self.w.T) - y) ** 2.).item()
        return np.sqrt(err)


if __name__ == "__main__":
    all_weights = torch.tensor(np.zeros(10, dtype=np.float32))
    average = 0.0
    std = 0.0
    seeds = 10
    average_error_list = torch.Tensor(np.zeros(10, dtype=np.float32))
    for seed in range(0, seeds):
        seed_torch(seed)
        print("---------------seed = %d------------------" % seed)
        environments, _ = Multi_env_selection_bias()
        X, y = combine_envs(environments)

        # params
        front_params = {}
        front_params['num_clusters'] = 3

        back_params = {}
        back_params['input_dim'] = X.shape[1]
        back_params['output_dim'] = 1
        back_params['sigma'] = 0.1
        back_params['lam'] = 0.1
        back_params['alpha'] = 1000.0
        back_params['hard_sum'] = 10
        back_params['overall_threshold'] = 0.20
        whole_iters = 5

        # train and test
        model = HRM(front_params, back_params, X, y)
        result_weight = model.solve(whole_iters)
        all_weights += result_weight

        mask = torch.where(result_weight > back_params['overall_threshold'])[0]
        evaluate_model = EmpiricalRiskMinimizer(X, y, mask)
        testing_envs = generate_test()

        testing_errors = []
        for [X, y] in testing_envs:
            testing_errors.append(evaluate_model.test(X, y))

        testing_errors = torch.Tensor(testing_errors)
        print(testing_errors)
        average += torch.mean(testing_errors) / seeds
        std += torch.std(testing_errors) / seeds
        average_error_list += testing_errors / seeds
        print(average_error_list)

    print(average)
    print(std)
    print(all_weights)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Heterogeneous Risk Minimization
> Jiashuo Liu

This repository contains the code for our ICML 2021 paper **Heterogeneous Risk Minimization** [1], including the implementation of the HRM algorithm and the selection-bias simulation data.

Specifically, the repository contains the following files:
* `Selection_bias.py`: the implementation of our selection-bias simulation data. Details of the functions included:
    * `data_generation`: the basic data generation function, corresponding to equation (18) in the paper.
    * `modified_selection_bias`: when the dimension of $V_b$ is high, the original `data_generation` function becomes quite inefficient, so we provide an equivalent way to generate the data.
    * `Multi_env_selection_bias` & `modified_Multi_env_selection_bias`: generate multi-environment training data. (The data are pooled together before being fed to the algorithm.)
* `Frontend.py`: the implementation of the $\mathcal{M}_c$ model, which we implement as a clustering method.
* `Backend.py`: the implementation of the $\mathcal{M}_p$ model, which contains two parts: feature selection and invariant learning. Details of the classes included:
    * `FeatureSelector`: a feature selection module, for which we use the code from [2].
    * `MpModel`: the whole backend module.

Besides, the framework has many hyper-parameters, which differ across tasks and require careful tuning. Note that although we provide the hyper-parameters used in our selection-bias experiment, the results may not be exactly the same as ours, possibly due to randomness or other factors. During the experiments, we found several important factors and some intuitive ways to tune them (a usage sketch follows the list):

* `alpha`: this differs a lot among tasks, ranging from 1e-1 to 1e3, and users may have to tune it carefully.
* `hard_sum`: in fact, this factor reflects the number of ground-truth stable covariates. Since the exact number is unknown, we propose simply setting it to the total number of input covariates and adjusting the parameter `lam` instead.
* `overall_threshold`: once HRM outputs a selection probability per covariate, we use this threshold to discard the inferred unstable covariates. For tasks where the probability gaps between covariates are large, we simply discard the covariates whose probabilities fall below the threshold (set to 0.20 in the simulation data). For tasks where the gaps are small, we skip this step and use the continuous probabilities in testing.
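
As a concrete starting point, here is a minimal usage sketch assembled from the `__main__` block of `HRM.py` (the hyper-parameter values are the ones used for the selection-bias experiment; adjust them per the notes above):

```python
from HRM import HRM, combine_envs, EmpiricalRiskMinimizer
from Selection_bias import Multi_env_selection_bias, generate_test
import torch

# Generate pooled multi-environment training data.
environments, _ = Multi_env_selection_bias()
X, y = combine_envs(environments)

front_params = {'num_clusters': 3}
back_params = {'input_dim': X.shape[1], 'output_dim': 1, 'sigma': 0.1,
               'lam': 0.1, 'alpha': 1000.0, 'hard_sum': 10}

model = HRM(front_params, back_params, X, y)
weight = model.solve(5)                    # selection probability per covariate
mask = torch.where(weight > 0.20)[0]       # overall_threshold = 0.20
erm = EmpiricalRiskMinimizer(X, y, mask)   # refit OLS on the kept covariates
print([erm.test(Xt, yt) for [Xt, yt] in generate_test()])
```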

Further, we view the proposed HRM as a general framework that combines several techniques, including clustering, feature selection and invariant learning. Therefore, the components in our framework can be replaced by other methods. For example, in practice, the regularizer for invariant learning can be replaced by other multi-environment invariant learning methods (though the theoretical properties might be affected). Our proposed algorithm also has several drawbacks:

* The convergence of the frontend module cannot be guaranteed, and we notice that in some cases the next iteration does not improve the current results, or even hurts them.
* Hyper-parameters differ considerably across tasks.
* In this paper, we only conduct experiments in linear settings; more complicated models have not been tested yet (we may add them later).

P.S.: I am really unsatisfied with the style of my code, and a better version is under development. For questions, feel free to contact liujiashuo77@gmail.com.

[1] Jiashuo Liu, Zheyuan Hu, Peng Cui, Bo Li, and Zheyan Shen. "Heterogeneous Risk Minimization." ICML (2021).
[2] Yamada, Y., Lindenbaum, O., Negahban, S., and Kluger, Y. "Feature Selection Using Stochastic Gates." ICML (2020).
--------------------------------------------------------------------------------
/Selection_bias.py:
--------------------------------------------------------------------------------
import numpy as np
from numpy.random import seed
seed(1)
import torch
import math
import random
random.seed(1)


def sign(x):
    if x > 0:
        return 1
    if x < 0:
        return -1
    return 0


def data_generation(n1, n2, ps, pvb, pv, r):
    S = np.random.normal(0, 1, [n1, ps])
    V = np.random.normal(0, 1, [n1, pvb + pv])

    Z = np.random.normal(0, 1, [n1, ps + 1])
    for i in range(ps):
        S[:, i:i + 1] = 0.8 * Z[:, i:i + 1] + 0.2 * Z[:, i + 1:i + 2]

    beta = np.zeros((ps, 1))
    for i in range(ps):
        beta[i] = (-1) ** i * (i % 3 + 1) * 1.0 / 2

    noise = np.random.normal(0, 0.3, [n1, 1])

    Y = np.dot(S, beta) + noise + 1 * S[:, 0:1] * S[:, 1:2] * S[:, 2:3]
    Y_compare = np.dot(S, beta) + 1 * S[:, 0:1] * S[:, 1:2] * S[:, 2:3]

    index_pre = np.ones([n1, 1], dtype=bool)
    for i in range(pvb):
        D = np.abs(V[:, pv + i:pv + i + 1] * sign(r) - Y_compare)
        pro = np.power(np.abs(r), -D * 5)
        selection_bias = np.random.random([n1, 1])
        index_pre = index_pre & (selection_bias < pro)
    index = np.where(index_pre == True)
    S_re = S[index[0], :]
    V_re = V[index[0], :]
    Y_re = Y[index[0]]
    n, p = S_re.shape
    index_s = np.random.permutation(n)

    X_re = np.hstack((S_re, V_re))
    beta_X = np.vstack((beta, np.zeros((pv + pvb, 1))))

    return torch.Tensor(X_re[index_s[0:n2], :]), torch.Tensor(Y_re[index_s[0:n2], :]), beta_X
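
# The selection mechanism above implements equation (18) of the paper: every
# sample is kept with probability |r|^(-5 * D), where D = |sign(r) * V_b - f(S)|
# and f(S) = S @ beta + S_1 * S_2 * S_3 is the noiseless outcome. |r| > 1 sets
# the bias strength (larger |r| ties V_b more tightly to Y in the kept sample),
# and the sign of r flips the direction of the spurious correlation; training
# mixes r = 1.5 and r = -1.1 (Multi_env_selection_bias), while generate_test
# sweeps r from -3 to 3.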

def modified_selection_bias(ps, pv, n, r):
    S = np.random.normal(0, 1, [n, ps])
    Z = np.random.normal(0, 1, [n, ps + 1])
    for i in range(ps):
        S[:, i:i + 1] = 0.8 * Z[:, i:i + 1] + 0.2 * Z[:, i + 1:i + 2]

    beta = np.zeros((ps, 1))
    for i in range(ps):
        beta[i] = (-1) ** i * (i % 3 + 1) * 1.0 / 3

    noise = np.random.normal(0, 0.3, [n, 1])

    Y = np.dot(S, beta) + noise + 1 * S[:, 0:1] * S[:, 1:2] * S[:, 2:3]
    Y_compare = np.dot(S, beta) + 1 * S[:, 0:1] * S[:, 1:2] * S[:, 2:3]

    if r > 0:
        center = Y_compare
    else:
        center = -Y_compare

    r = abs(r)
    sigma = math.sqrt(1 / math.log2(r))

    V = np.zeros((center.shape[0], pv), dtype=np.float32)
    for i in range(center.shape[0]):
        V[i, :] = np.random.multivariate_normal(center[i] * (np.zeros(pv) + 1.0), sigma * np.eye(pv), 1)

    X = np.concatenate((S, V), axis=1)
    X = torch.Tensor(X)
    Y = torch.Tensor(Y)
    return X, Y


def modified_Multi_env_selection_bias():
    trainX = None
    trainy = None
    env = []
    n_list = [1900, 100, 100]
    r_list = [1.9, -1.1, -1.1]
    ps = 5
    pv = 5
    for e in range(len(n_list)):
        if trainy is None:
            trainX, trainy = modified_selection_bias(ps, pv, n_list[e], r_list[e])
            env.append([trainX, trainy])
        else:
            tempx, tempy = modified_selection_bias(ps, pv, n_list[e], r_list[e])
            trainX = np.concatenate([trainX, tempx], axis=0)
            trainy = np.concatenate([trainy, tempy], axis=0)
            env.append([tempx, tempy])

    return env, 0


def Multi_env_selection_bias():
    n1 = 100000
    p = 10
    ps = int(p * 0.5)
    pvb = int(p * 0.1)
    pv = p - ps - pvb

    r = 1.5
    r_list = [-1.1]
    num_list = [100]
    environments = []
    n2 = 1900
    trainx, trainy, real_beta = data_generation(n1, n2, ps, pvb, pv, r)
    environments.append([trainx, trainy])

    for idx, r in enumerate(r_list):
        x_bias, y_bias, real_beta = data_generation(n1, num_list[idx], ps, pvb, pv, r)
        environments.append([x_bias, y_bias])
    print(environments[0][0].shape, environments[0][1].shape, environments[1][0].shape)
    return environments, real_beta


def generate_test():
    n1 = 100000
    p = 10
    ps = int(p * 0.5)
    pvb = int(p * 0.1)
    pv = p - ps - pvb

    r_list = [-3, -2.7, -2.3, -2.0, -1.7, 1.7, 2.0, 2.3, 2.7, 3.0]
    testing = []

    for r in r_list:
        n2 = 2000
        trainx, trainy, real_beta = data_generation(n1, n2, ps, pvb, pv, r)
        testing.append([trainx, trainy])

    return testing


if __name__ == "__main__":
    Multi_env_selection_bias()
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LJSthu/HRM/c7b64dccb8f2a1cdba0a072a3f34d3c8309dce6f/__init__.py
--------------------------------------------------------------------------------