├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── distance_matching.py ├── functions.py ├── personalized_regression_figure1.png └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Data Files 104 | *.tsv 105 | *.npy 106 | *.npz 107 | *.json 108 | *.txt 109 | *.csv 110 | *Data/ 111 | *.zip -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Ben Lengerich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Personalized Regression 2 | 3 | The goal of *Personalized Regression* is to push the limits of machine learning for heterogeneous samples. Instead of assuming that a single model is appropriate for all samples, we go to the other extreme and fit different model parameters for each sample. At this extreme, we are required to devise ways to share statistical power between models. After estimating these sample-specific models, we have a matrix of model parameters which we may analyze. 4 | 5 | ![Personalized Regression Example](https://github.com/blengerich/Personalized_Regression/blob/master/personalized_regression_figure1.png) 6 | 7 | 8 | # Using this code 9 | 10 | This repository includes code for both personalized logistic and personalized linear regression, and it can be extended to personalize any predictive model. The main file is `distance_matching.py`, which is designed to take in a black-box predictive model and the corresponding subgradient updates as Python functions. Examples of these functions for linear and logistic regression are included in the file `functions.py`. In addition, `DistanceMatching` objects require feature-specific distance metrics for covariates as Python functions. Examples of these distance metrics are also provided in `functions.py`. A minimal usage sketch is given in the Example section below. 11 | 12 | 13 | ## NeurIPS 2019 Paper 14 | The most recent paper on this project is our 2019 NeurIPS paper: ["Learning Sample-Specific Models with Low-Rank Personalized Regression"](http://papers.nips.cc/paper/8616-learning-sample-specific-models-with-low-rank-personalized-regression). A snapshot of code relating to that paper is available at: [github.com/blengerich/Personalized_Regression_NeurIPS19](https://github.com/blengerich/Personalized_Regression_NeurIPS19). 15 | 16 | 17 | ## ISMB 2018 Paper 18 | These ideas were first developed in our 2018 ISMB paper: ["Personalized Regression Enables Sample-Specific Pan-Cancer Analysis"](https://academic.oup.com/bioinformatics/article/34/13/i178/5045771). 19 | A snapshot of the code relating to that paper is available at: [github.com/blengerich/Personalized_Regression_ISMB18](https://github.com/blengerich/Personalized_Regression_ISMB18). 20 | 21 | For more information about these experiments, please see the [slides for our ISMB 2018 presentation](//www.cs.cmu.edu/~blengeri/downloads/slides/personalized_regression_ismb_2018.pdf). 
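## Example

The following is a minimal, illustrative sketch of the `DistanceMatching` API, not a snippet from the papers: the loss, regularizer, and covariate-distance functions come from `functions.py`, while the synthetic data, the hyperparameter values (`gamma`, `latent_dim`, learning rate, iteration counts, etc.), and the output paths are arbitrary choices for demonstration only. It assumes the script is run from the repository root so that `distance_matching.py` and `functions.py` are importable.

```python
import os
import numpy as np

import functions
from distance_matching import DistanceMatching

rng = np.random.RandomState(0)
N, P, K = 60, 3, 2  # samples, predictive features, covariates

# Synthetic heterogeneous data: the first coefficient drifts with the first covariate.
U = np.column_stack([rng.uniform(0, 1, N),   # continuous covariate
                     rng.randint(0, 2, N)])  # categorical covariate
true_beta = np.array([[1.0 + u[0], -1.0, 0.5] for u in U])
X = rng.normal(size=(N, P))
Y = np.array([x.dot(b) for x, b in zip(X, true_beta)]) + 0.1 * rng.normal(size=N)

# One distance metric per covariate column (see functions.py).
dU = [functions.abs_diff, functions.discrete_diff]

# Initialize every sample at the pooled least-squares solution.
pooled = np.linalg.lstsq(X, Y, rcond=None)[0]
init_beta = np.tile(pooled, (N, 1)) + 0.01 * rng.normal(size=(N, P))
init_phi_u = np.ones(K) / K  # initial covariate weights (re-normalized during optimization)

dm = DistanceMatching(
    init_beta=init_beta,
    f=functions.linear_loss, f_prime=functions.linear_loss_prime,
    gamma=1.0, latent_dim=2, n_neighbors=10,
    update_ztree_every=5, calc_dist_errors_every=1, calc_closest_every=5,
    # Regularize each sample's coefficients toward its initialization.
    rho_beta=lambda beta_i, i: 1e-3 * functions.lasso_penalty(beta_i, init_beta[i]),
    rho_beta_prime=lambda beta_i, i: 1e-3 * functions.lasso_derivative(beta_i, init_beta[i]),
    init_phi_u=init_phi_u,
    psi_u=lambda phi: functions.l2_penalty(phi, init_phi_u),
    psi_u_prime=lambda phi: functions.l2_prime(phi, init_phi_u),
    intercept=False, n_threads=0)

# The optimizer saves a scatter plot of the coefficient estimates at every iteration,
# so this directory (and a working matplotlib backend) is assumed to exist.
os.makedirs("results/optimization", exist_ok=True)

Z, Q = dm.fit(X, Y, U, dU,
              init_lr=1e-3, max_iters=15, verbosity=5,
              log_file="distance_matching_demo.log")

# Per-sample coefficient estimates: low-rank factors plus the mean of the initialization.
beta_hat = Z.dot(Q) + init_beta.mean(axis=0)
print(beta_hat.shape)  # (60, 3)
```

`fit` returns the low-rank factors `Z` (one row per sample) and `Q`; analyses of sample-specific parameters can be performed either on `Z` directly or on the reconstructed `beta_hat`.
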
22 | 23 | 24 | ## Citing 25 | 26 | If you use the code or machine learning ideas in this repository, please cite the most recent paper: 27 | ``` 28 | @inproceedings{lengerich2019learning, 29 | title = {Learning Sample-Specific Models with Low-Rank Personalized Regression}, 30 | author = {Lengerich, Benjamin J and Aragam, Bryon and Xing, Eric P}, 31 | booktitle={Advances in Neural Information Processing Systems}, 32 | pages={}, 33 | year={2019} 34 | } 35 | ``` 36 | 37 | 38 | If you are specifically interested in the use of personalized regression for cancer analysis, please cite the ISMB 2018 paper: 39 | ``` 40 | @article{lengerich2018personalized, 41 | author = {Lengerich, Benjamin J and Aragam, Bryon and Xing, Eric P}, 42 | title = {Personalized regression enables sample-specific pan-cancer analysis}, 43 | journal = {Bioinformatics}, 44 | volume = {34}, 45 | number = {13}, 46 | pages = {i178-i186}, 47 | year = {2018}, 48 | doi = {10.1093/bioinformatics/bty250}, 49 | URL = {http://dx.doi.org/10.1093/bioinformatics/bty250}, 50 | eprint = {/oup/backfile/content_public/journal/bioinformatics/34/13/10.1093_bioinformatics_bty250/1/bty250.pdf} 51 | } 52 | ``` 53 | 54 | ## Contact 55 | Please contact blengeri@cs.cmu.edu, naragam@cs.cmu.edu or epxing@cs.cmu.edu with any questions. Pull requests are always welcome. 56 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blengerich/Personalized_Regression/dbd47797e9f6fece473125572c643e8568e222e1/__init__.py -------------------------------------------------------------------------------- /distance_matching.py: -------------------------------------------------------------------------------- 1 | # Personalized Regression (logistic and linear) via distance matching 2 | #from numba import jit 3 | import numpy as np 4 | np.set_printoptions(precision=4) 5 | import time 6 | 7 | from utils import * 8 | from scipy.spatial import KDTree # To store Z matrix and find closest neighbors. 9 | #from sklearn.metrics.pairwise import cosine_similarity 10 | from sklearn.decomposition import PCA 11 | from sklearn.preprocessing import normalize 12 | import concurrent 13 | import concurrent.futures 14 | from numba import jit 15 | 16 | import warnings 17 | warnings.filterwarnings("error") 18 | 19 | 20 | default_log_file='logs/distance_matching_{}.log'.format( 21 | time.strftime("%Y_%m_%d-%H_%M_%S")) 22 | 23 | 24 | class DistanceMatching(): 25 | 26 | def __init__(self, init_beta, 27 | f, f_prime, 28 | gamma, latent_dim, n_neighbors, 29 | update_ztree_every, calc_dist_errors_every, calc_closest_every, 30 | rho_beta, rho_beta_prime, 31 | init_phi_u, psi_u, psi_u_prime, 32 | intercept, log_dir="./", n_threads=1): 33 | # Define functions on initialization. 
34 | self.init_beta = init_beta 35 | self.f = f 36 | self.f_prime = f_prime 37 | self.gamma = gamma 38 | self.latent_dim = latent_dim 39 | self.n_neighbors = n_neighbors 40 | self.update_ztree_every = update_ztree_every 41 | self.calc_dist_errors_every = calc_dist_errors_every 42 | self.calc_closest_every = calc_closest_every 43 | self.rho_beta = rho_beta 44 | self.rho_beta_prime = rho_beta_prime 45 | 46 | self.init_phi_u = init_phi_u 47 | self.psi_u = psi_u 48 | self.psi_u_prime = psi_u_prime 49 | 50 | self.intercept = intercept 51 | self.log_dir = log_dir 52 | self.n_threads = n_threads 53 | if self.n_threads > 0: 54 | self.executor = concurrent.futures.ThreadPoolExecutor( 55 | max_workers=self.n_threads) 56 | self.map = lambda x,y: list(self.executor.map(x,y)) 57 | else: 58 | self.pool = None 59 | self.map = lambda x,y: list(map(x, y)) 60 | 61 | 62 | def _check_shapes(self, X, Y, U, dU=None): 63 | """ Does some basic checks on the shapes on the parameters. """ 64 | N = X.shape[0] 65 | assert(U.shape[0] == N) 66 | #P = X.shape[1] 67 | K = U.shape[1] 68 | if dU: 69 | assert(len(dU) == K) 70 | return N, K 71 | 72 | 73 | def make_covariate_distances(self, U, dU, K, N, should_normalize=True): 74 | # Make fixed distance matrix for co-variates 75 | t = time.time() 76 | print("Making Co-Variate Distance Matrix of Size {}x{}x{}".format(N, N, K)) 77 | D = np.zeros((N, N, K)) 78 | #with concurrent.futures.ThreadPoolExecutor(max_workers=self.n_threads) as executor: 79 | get_dist = lambda i, j: np.array([ 80 | dU[k](U[i, k], U[j, k]) for k in range(K)], dtype="float32") 81 | for i in range(1, N): 82 | print("{}\t/{}".format(i, N), end='\r') 83 | D[i, 0:i, :] = self.map(lambda j: get_dist(i, j), range(i)) 84 | for i in range(1, N): 85 | for j in range(i): 86 | D[j, i, :] = D[i, j, :] # could cut memory in half by only storing lists. 87 | print("Finished making unnormalized version.") 88 | if should_normalize: 89 | normalized = np.array([normalize(D[:, :, k]) for k in range(K)]) 90 | # Now the first axis references k. Move it to the back. 91 | normalized = np.swapaxes(normalized, 0, 1) 92 | D = np.swapaxes(normalized, 1, 2) 93 | print("Finished normalizing.") 94 | print("Took {:.3f} seconds.".format(time.time() - t)) 95 | return D 96 | 97 | 98 | def make_covariate_distance_function(self, U, dU, K): 99 | # Allows lazy calculations of covariate distances, but cannot normalize. 100 | func = lambda i,j: np.array([dU[k](U[i,k], U[j,k]) for k in range(K)]) 101 | return func 102 | 103 | import matplotlib.pyplot as plt 104 | def _update_beta_hat(self, iteration=0, plot=False): 105 | if self.multitask: 106 | self.beta_hat = np.tensordot(self.Z, self.Q, axes=1) 107 | else: 108 | self.beta_hat = self.Z.dot(self.Q) + self.beta_mean 109 | if plot: 110 | fig = plt.figure() 111 | plt.scatter(self.beta_hat[:, 0], self.beta_hat[:, 1], marker='+', alpha=0.5) 112 | plt.savefig("results/optimization/{}.png".format(iteration)) 113 | plt.show() 114 | 115 | def _calc_losses(self, iteration): 116 | self._update_beta_hat(iteration, True) 117 | loss1 = np.sum([ 118 | self.f(self.X[i], self.Y[i], self.beta_hat[i]) for i in range(self.N)]) 119 | loss2 = 0.5*self.gamma*np.mean( 120 | np.mean(np.square(self.dist_errors), axis=1)) 121 | loss3 = np.mean([self.rho_beta(self.beta_hat[i], i) for i in range(self.N)]) 122 | #loss4 = self.psi_beta(phi_beta) 123 | loss4 = 0. 
124 | loss5 = self.psi_u(self.phi_u) 125 | loss6 = 0.#self.psi_nu(self.nu) 126 | return [loss1, loss2, loss3, loss4, loss5, loss6] 127 | 128 | 129 | def _init_data_vars(self, data): 130 | self.X = data["X"] 131 | self.Y = data["Y"] 132 | self.U = data["U"] 133 | self.delta_U = data["delta_U"] 134 | self.N, self.K = self._check_shapes(self.X, self.Y, self.U) 135 | self.P = self.X.shape[1] 136 | if len(self.init_beta.shape) > 2: 137 | self.multitask = True 138 | self.T = self.init_beta.shape[2] 139 | else: 140 | self.multitask = False 141 | 142 | self.beta_hat = self.init_beta.copy() 143 | # Initialize B, Z by PCA of beta_hat 144 | pca = PCA(n_components=self.latent_dim, whiten=False) 145 | if self.multitask: 146 | self.Z = np.zeros((self.N, self.latent_dim)) 147 | self.Q = np.zeros((self.latent_dim, self.P, self.T)) 148 | for t in range(self.T): 149 | self.Z += pca.fit_transform(self.beta_hat[:, :, t]) 150 | self.Q[:, :, t] = pca.components_ 151 | self.Z /= self.T 152 | else: 153 | self.beta_mean = np.mean(self.beta_hat, axis=0) 154 | print(self.beta_mean.shape) 155 | self.Z = pca.fit_transform(self.beta_hat) 156 | self.Q = pca.components_ 157 | print(self.beta_hat) 158 | print(self.Z.dot(self.Q)) 159 | print(pca.explained_variance_ratio_) 160 | """ 161 | self.Z = np.random.multivariate_normal(np.zeros(self.latent_dim), 162 | np.eye(self.latent_dim), self.N) 163 | self.Q = np.random.uniform(0, 1, size=(self.latent_dim, self.P)) 164 | """ 165 | """ 166 | try: 167 | z_norms = np.linalg.norm(self.Z, axis=0, ord=1) # normalize features 168 | self.Z /= np.clip(np.tile(z_norms, (self.N, 1)), 0.01, 100) 169 | if self.multitask: 170 | for t in range(self.T): 171 | self.Q[:, :, t] *= np.tile(z_norms, (self.P, 1)).T 172 | else: 173 | self.Q *= np.tile(z_norms, (self.P, 1)).T 174 | except RuntimeWarning: 175 | self.Z = np.random.normal(0, 0.01, size=self.Z.shape) 176 | self.Q = np.random.normal(0, 0.01, size=self.Q.shape) 177 | """ 178 | 179 | #print(self.Z, self.Q) 180 | #self.Z = np.random.normal(0, 0.01, size=(self.N, self.latent_dim)) 181 | #self.B = np.random.normal(0, 0.01, size=(self.latent_dim, self.P))#np.linalg.lstsq(self.Z, self.beta_hat)[0] 182 | #self.nu = 1.#self.init_nu # scalar, no need to copy 183 | self.phi_u = self.init_phi_u.copy() 184 | self.beta_prev = self.beta_hat.copy() 185 | self.phi_u_prev = self.phi_u.copy() 186 | 187 | def _make_z_tree(self): 188 | self.z_tree = KDTree(self.Z, leafsize=self.kd_leafsize) 189 | 190 | def _init_opt_vars(self, opt_params): 191 | self.patience = opt_params["init_patience"] 192 | self.lr = opt_params["init_lr"] 193 | self.lr_decay = opt_params["lr_decay"] 194 | self.max_iters = opt_params["max_iters"] 195 | self.tol = opt_params["tol"] 196 | self.prev_loss = np.inf 197 | self.distances_over_time = [] 198 | self.losses_over_time = [] 199 | 200 | if opt_params.get("neighbors", None) is not None: 201 | print("Neighborhoods supplied. Will use those.") 202 | self._find_closest_neighbors = lambda: opt_params["neighbors"] 203 | elif opt_params.get("calc_neighbors", False): 204 | print("No neighborhoods supplied. Will calculate nearest neighbors.") 205 | self.kd_leafsize = opt_params["kd_leafsize"] 206 | self._make_z_tree() 207 | self._find_closest_neighbors = lambda: self.z_tree.query( 208 | self.Z, k=self.n_neighbors, eps=1.1)[1] 209 | else: 210 | print("No neighborhoods supplied. 
Will use random neighbors.") 211 | self._find_closest_neighbors = lambda: np.tile( 212 | np.random.choice(self.N, size=(self.n_neighbors)), (self.N, 1)) 213 | 214 | self.closest = self._find_closest_neighbors() 215 | 216 | def _maybe_update_ztree(self, iteration): 217 | try: 218 | if iteration % self.update_ztree_every == 0: 219 | self._make_z_tree() 220 | except AttributeError: 221 | # TODO: handle real neighborhoods. 222 | self._find_closest_neighbors = lambda: np.tile( 223 | np.random.choice(self.N, size=(self.n_neighbors)), (self.N, 1)) 224 | return 225 | 226 | def _maybe_update_neighbors(self, iteration): 227 | if iteration % self.calc_closest_every == 0: 228 | self.closest = self._find_closest_neighbors() 229 | 230 | def _maybe_update_errors(self, iteration): 231 | if (iteration-1) % self.calc_dist_errors_every == 0: 232 | self.dist_errors = np.array( 233 | self.map(lambda i: self.calc_dist_errors(i), range(self.N))) 234 | 235 | def _calc_loss(self, iteration): 236 | losses = self._calc_losses(iteration) 237 | self.loss = np.sum(losses) 238 | self.losses_over_time.append(losses) 239 | return losses 240 | 241 | def _maybe_record_distances(self): 242 | if self.record_distances: 243 | distances = np.square(self.dist_errors) 244 | self.distances_over_time.append(np.mean(distances)) 245 | 246 | def _maybe_log_status(self, iteration, losses): 247 | if iteration % self.verbosity == 0: 248 | log_string = "Iteration: {:d} Total Loss:{:.3f} Pred:{:.3f} ".format(iteration, np.sum(losses), losses[0]) 249 | log_string += "Dist:{:.3f} l1:{:.3f} Phi_beta:{:.3f} ".format(losses[1], losses[2], losses[3]) 250 | log_string += "Phi_u:{:.3f}, Beta_Scale:{:.3f} Patience: {:d}".format(losses[4], losses[5], self.patience) 251 | """ 252 | if self.calc_com: 253 | com = np.linalg.norm(np.mean(beta_hat, axis=0) - self.init_beta[0, :], ord=2) 254 | mad = np.mean(np.array([ 255 | np.abs(beta_hat[i] - self.init_beta[0, :]) for i in range(N)]), axis=0) 256 | mad = np.linalg.norm(mad, ord=2) # Easier to read the logs if this is a single number, instead of per-feature. 
257 | log_string += "\nCOM Divergence:{}\nMAD:{}".format(com, mad) 258 | """ 259 | print(log_string, file=self.log) 260 | #print("phi_beta:{}\tphi_u:{}".format(phi_beta, phi_u), file=log) 261 | #print("beta_scale:{:.3f}".format(beta_scale), file=log) 262 | """ 263 | print("Cosine Similarity between D_B and D_U:{:.3f}".format( 264 | np.mean(np.array([np.array([cosine_similarity([delta_beta(i,j)], [D[i, j]])[0] for j in range(N)]) 265 | for i in range(N)])))) 266 | """ 267 | 268 | def _should_quit(self, iteration): 269 | if self.loss > 1e8: 270 | print("Diverged at iteration: {}".format(iteration)) 271 | return True 272 | if self.loss > self.prev_loss - self.tol: 273 | self.patience -= 1 274 | if self.patience <= 0: 275 | print("Reached local minimum at iteration {:d}.".format(iteration)) 276 | self.Q = self.Q_prev 277 | self.Z = self.Z_prev 278 | self._update_beta_hat() 279 | #self.nu = self.nu_prev 280 | self.phi_u = self.phi_u_prev 281 | return True 282 | 283 | def _update_opt_vars(self): 284 | self.lr *= self.lr_decay 285 | self.Q_prev = self.Q.copy() 286 | self.Z_prev = self.Z.copy() 287 | #self.nu_prev = self.nu 288 | self.phi_u_prev = self.phi_u.copy() 289 | self.prev_loss = self.loss 290 | 291 | 292 | def _reset_grads(self): 293 | self.grad_Z = np.zeros_like(self.Z) 294 | self.grad_Q = np.zeros_like(self.Q) 295 | self.grad_phi_u = np.zeros_like(self.phi_u) 296 | #self.grad_nu = np.zeros_like(self.nu) 297 | 298 | 299 | def _calc_prediction_grads(self): 300 | # Calculate Prediction Gradients 301 | grad_beta = np.array([ 302 | (self.f_prime(self.X[i], self.Y[i], self.beta_hat[i]) + \ 303 | self.rho_beta_prime(self.beta_hat[i], i) 304 | )*1./(0.1+np.linalg.norm(self.beta_hat[i]-self.init_beta[i], ord=2)) 305 | for i in range(self.N)]) 306 | #print(self.Z, self.Q) 307 | if self.multitask: 308 | # grad_beta (N x P1 x P2) 309 | # self.Z (N x K) 310 | # self.Q (K x P1 x P2) 311 | self.grad_Q += np.tensordot(self.Z.T, grad_beta, axes=1) # K x P1 x P2 312 | self.grad_Z += np.tensordot(grad_beta, self.Q.swapaxes(0, 2).swapaxes(0, 1), axes=2) # N x K 313 | else: 314 | self.grad_Q += self.Z.T.dot(grad_beta) # K x P1 x P2 315 | self.grad_Z += grad_beta.dot(self.Q.T) # N x K 316 | 317 | 318 | def _calc_z_grad(self, i, de_i, closest_i): 319 | grad = (self.gamma / self.N) * 2 * np.mean([ 320 | de_i[idx] * (self.Z[i] - self.Z[j]) for idx, j in enumerate(closest_i)], axis=0) 321 | return np.clip(grad, -1e0, 1e0) 322 | 323 | def _calc_phi_u_grad(self, i, de_i, closest_i): 324 | return (self.gamma / self.N)* 2 * np.mean([ 325 | de_i[idx] * (-self.delta_U[i, j]) for idx, j in enumerate(closest_i)], axis=0) 326 | 327 | """ 328 | def _calc_nu_grad(self, i, de_i, closest_i): 329 | return (self.gamma / self.N)* 2 * np.mean([ 330 | de_i[idx] * np.linalg.norm(self.Z[i] - self.Z[j], ord=2) 331 | for idx, j in enumerate(closest_i)], axis=0) 332 | """ 333 | 334 | 335 | def _calc_personalized_grads(self): 336 | """ Calculates the gradients for the DMR term.""" 337 | def _calc_one(i): # Should help caching behavior. 338 | de_i = self.dist_errors[i] #np.sign(dist_errors[i]) # Derivative of squared term. 
339 | closest_i = self.closest[i] 340 | self.grad_Z[i] += self._calc_z_grad(i, de_i, closest_i) 341 | self.grad_phi_u += self._calc_phi_u_grad(i, de_i, closest_i) 342 | #self.grad_nu += self._calc_nu_grad(i, de_i, closest_i) 343 | 344 | self.map(_calc_one, list(range(self.N))) 345 | self.grad_phi_u += self.psi_u_prime(self.phi_u) 346 | self.grad_Z += self.Z 347 | self.grad_Q += self.Q 348 | #self.grad_nu += self.psi_nu_prime(self.nu) 349 | 350 | 351 | def _calc_gradients(self): 352 | self._reset_grads() 353 | self._calc_prediction_grads() 354 | self._calc_personalized_grads() 355 | 356 | 357 | def _update_vars(self): 358 | if np.random.uniform() > 0.5: 359 | self.Z -= self.lr*self.grad_Z 360 | else: 361 | self.Q -= self.lr*self.grad_Q 362 | #phi_beta = soft_normalize(phi_beta - lr*grad_phi_beta) 363 | self.phi_u = soft_normalize(self.phi_u - self.lr*self.grad_phi_u) 364 | #self.nu = self.nu#np.max([1e-5, self.nu - self.lr*self.grad_nu]) 365 | self._update_beta_hat() 366 | 367 | 368 | def _single_restart(self, data, opt_params, log_params): 369 | #(X, Y, U, N, K, beta_hat, nu, phi_beta, phi_u, beta_prev, phi_beta_prev, phi_u, phi_u_prev) 370 | self._init_data_vars(data) 371 | self._init_opt_vars(opt_params) 372 | self.log = log_params["log"] 373 | self.verbosity = log_params["verbosity"] 374 | 375 | self.delta_z = lambda i, j: np.linalg.norm(self.Z[i] - self.Z[j], ord=2) 376 | self.dist_helper = lambda i, j: self.delta_z(i, j) - self.delta_U[i, j].dot(self.phi_u) 377 | self.calc_dist_errors = lambda i: np.array([ 378 | self.dist_helper(i, j) for j in self.closest[i]]) 379 | 380 | t = time.time() 381 | for iteration in range(1, self.max_iters+1): 382 | print("Iteration:{} of Max {}. Last Iteration Took {:.3f} seconds.".format( 383 | iteration, self.max_iters, time.time() - t), end='\r') 384 | t = time.time() 385 | self._maybe_update_ztree(iteration-1) 386 | self._maybe_update_neighbors(iteration-1) 387 | self._maybe_update_errors(iteration-1) 388 | losses = self._calc_loss(iteration-1) 389 | self._maybe_record_distances() 390 | self._maybe_log_status(iteration-1, losses) 391 | if self._should_quit(iteration): 392 | break 393 | self._update_opt_vars() 394 | self._calc_gradients() 395 | self._update_vars() 396 | self.log.flush() 397 | 398 | return #beta_hat, phi_beta, beta_scale, phi_u, loss, distances_over_time, losses_over_time 399 | 400 | def _maybe_make_delta_U(self, delta_U): 401 | if delta_U is None: 402 | print("Making Distances...") 403 | t = time.time() 404 | delta_U = self.make_covariate_distances( 405 | self.U, self.dU, self.K, self.N, should_normalize=True) 406 | print("Finished Making Distances. Took {:.3f} seconds.".format(time.time() - t)) 407 | return delta_U 408 | 409 | 410 | def fit(self, X, Y, U, dU, delta_U=None, neighborhoods=None, 411 | init_lr=1e-3, lr_decay=1-1e-6, n_restarts=1, 412 | init_patience=10, max_iters=20000, tol=1e-3, 413 | verbosity=100, log_file=None, hierarchical=False, 414 | kd_leafsize=None, calc_neighbors=True, 415 | record_distances=False): 416 | """ Logistic Regression with Distance Matching. 
417 | 418 | Parameters: 419 | 420 | X : Data 421 | Y : Data 422 | """ 423 | N, K = self._check_shapes(X, Y, U, dU) 424 | self.U = U 425 | self.dU = dU 426 | self.K = K 427 | self.N = N 428 | delta_U = self._maybe_make_delta_U(delta_U) 429 | if kd_leafsize is None: 430 | kd_leafsize = self.n_neighbors 431 | 432 | self.best_loss = np.inf 433 | self.record_distances = record_distances 434 | if log_file is None: 435 | log_file ='{}/logs/distance_matching_{}.log'.format( 436 | self.log_dir, time.strftime("%Y_%m_%d-%H_%M_%S")) 437 | 438 | with open(log_file, 'a') as log: 439 | if hierarchical: 440 | self.f_single = self.f 441 | self.f_prime_single = self.f_prime 442 | self.f = lambda x, y, beta_hat: np.sum([ 443 | self.f_single(x[i], y[i], beta_hat) for i in range(len(x))]) 444 | self.f_prime = lambda x, y, beta_hat: np.sum([ 445 | self.f_prime_single(x[i], y[i], beta_hat) for i in range(len(x))]) 446 | # Condense X and Y into just two samples for each level. 447 | 448 | # Make hierarchy 449 | from sklearn.cluster import AgglomerativeClustering 450 | ag = AgglomerativeClustering(n_clusters=2, 451 | compute_full_tree=False, affinity='precomputed', linkage='average') 452 | distance_mat = np.sum(delta_U, axis=2) 453 | print("Distance mat shape: {}".format(distance_mat.shape)) 454 | """np.array([np.array([ 455 | delta_U[i, j].dot(np.ones_like(delta_U[i, j])) for j in range(len(delta_U))]) for i in range(len(delta_U))])""" 456 | 457 | def find_mean(U_mat): 458 | from scipy.stats import mode 459 | mean_vec = np.zeros((U_mat.shape[1])) 460 | for j in range(U_mat.shape[1]): 461 | try: 462 | mean_vec[j] = np.mean(U_mat[:, j]) 463 | except ValueError: 464 | mean_vec[j] = mode(U_mat[:, j]) 465 | return mean_vec 466 | 467 | def helper(idx, parent_beta, depth): 468 | print("Depth={}".format(depth)) 469 | print(parent_beta.shape) 470 | labels = ag.fit_predict(distance_mat[idx][:, idx]) 471 | X_idx = X[idx] 472 | Y_idx = Y[idx] 473 | U_idx = U[idx] 474 | #X_clustered = np.zeros((2, max(len()))) 475 | X_clustered = np.array([X_idx[labels == 0, :], X_idx[labels == 1, :]]) 476 | Y_clustered = np.array([Y_idx[labels == 0, :], Y_idx[labels == 1, :]]) 477 | U_clustered = np.vstack((find_mean(U_idx[labels == 0, :]), 478 | find_mean(U_idx[labels == 1, :]))) 479 | print(X_clustered) 480 | print(U_clustered) 481 | my_dist = np.array([dU[k](U_clustered[0][k], U_clustered[1][k]) for k in range(K)]) 482 | delta_U_clustered = np.array([ 483 | np.array([np.zeros_like(my_dist), my_dist]), 484 | np.array([my_dist, np.zeros_like(my_dist)])]) 485 | print(delta_U_clustered.shape) 486 | t = time.time() 487 | self.init_beta = parent_beta#np.vstack((parent_beta, parent_beta)) 488 | (beta_hat, phi_beta, beta_scale, 489 | phi_u, loss, distances_over_time, 490 | losses_over_time) = self._single_restart( 491 | X_clustered, Y_clustered, U_clustered, 492 | delta_U_clustered, None, init_lr, lr_decay, 493 | init_patience, max_iters, tol, verbosity, log) 494 | idx0 = [] 495 | idx1 = [] 496 | for i, index in enumerate(idx): 497 | if labels[i] == 0: 498 | idx0.append(index) 499 | else: 500 | idx1 501 | for i in idx0: 502 | self.beta_hat[i] = beta_hat[0].copy() 503 | for i in idx1: 504 | self.beta_hat[i] = beta_hat[1].copy() 505 | if len(idx0) > 1: 506 | helper(idx0, beta_hat[0], depth+1) 507 | if len(idx1) > 1: 508 | helper(idx1, beta_hat[1], depth+1) 509 | 510 | print("Took {:.3f} seconds.".format(time.time() - t)) 511 | 512 | self.beta_hat = np.tile(self.init_beta, (X.shape[0])) 513 | print("beta_hat shape:{}".format(self.beta_hat.shape)) 
514 | helper(list(range(N)), self.beta_hat, 0) 515 | else: 516 | for restart in range(n_restarts): 517 | t = time.time() 518 | print("Restart {} of {}".format(restart+1, n_restarts)) 519 | self._single_restart({'X': X, 'Y': Y, 'U': U, 'delta_U': delta_U}, 520 | {'init_patience': init_patience, 'max_iters': max_iters, 521 | 'init_lr': init_lr, 'lr_decay': lr_decay, 'kd_leafsize': kd_leafsize, 522 | 'neighbors': neighborhoods, 'tol': tol, 'calc_neighbors': calc_neighbors}, 523 | {'verbosity': verbosity, 'log': log}) 524 | print("Took {:.3f} seconds.".format(time.time() - t)) 525 | if self.loss < self.best_loss: 526 | print("** New best solution **") 527 | self.best_loss = self.loss 528 | self.best_Z = self.Z.copy() 529 | self.best_Q = self.Q.copy() 530 | self._update_beta_hat(); self.best_beta_hat = self.beta_hat.copy() 531 | #self.best_beta_hat = self.Z.dot(self.best_Q) 532 | #self.best_nu = self.nu 533 | self.best_phi_u = self.phi_u.copy() 534 | self.best_distances_over_time = self.distances_over_time.copy() 535 | self.best_losses_over_time = self.losses_over_time.copy() 536 | 537 | return self.best_Z, self.best_Q#, self.best_nu, self.best_phi_u, self.losses_over_time 538 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def logistic_loss(x, y, beta): 4 | return np.log(1 + np.exp(x.dot(beta))) - y*x.dot(beta) 5 | 6 | 7 | def logistic_loss_multitask(x, y, beta): 8 | return np.sum([logistic_loss(x, y[i], beta[:, i]) for i in range(len(y))]) 9 | 10 | def logistic_loss_prime(x, y, beta): 11 | return x * ( np.exp(x.dot(beta)) / (1 + np.exp(x.dot(beta))) - y) 12 | 13 | def logistic_loss_prime_multitask(x, y, beta): 14 | return np.array([ 15 | logistic_loss_prime(x, y[i], beta[:, i]) for i in range(len(y))]).T 16 | 17 | def linear_loss(x, y, beta): 18 | return 0.5*(y-x.dot(beta))**2 19 | 20 | def linear_loss_prime(x, y, beta): 21 | return (-x)*(y-x.dot(beta)) 22 | 23 | def linear_loss_multitask(x, y, beta): 24 | return np.sum([ 25 | linear_loss(x, y[i], beta[:, i]) for i in range(len(y))]) 26 | 27 | def linear_loss_prime_multitask(x, y, beta): 28 | return np.array([ 29 | linear_loss_prime(x, y[i], beta[:, i]) for i in range(len(y))]).T 30 | 31 | def lasso_penalty(beta, target): 32 | return np.linalg.norm(beta-target, ord=1) 33 | 34 | def lasso_derivative(beta, target): 35 | return np.sign(beta - target) 36 | 37 | def l2_penalty(x, target): 38 | return 0.5*np.linalg.norm(x - target, ord=2)**2 # squared L2 penalty, so that l2_prime is its gradient 39 | 40 | def l2_prime(x, target): 41 | return x - target 42 | 43 | 44 | 45 | 46 | bad_vals = ["None", None, "not reported"] 47 | def either_bad(x, y): 48 | if x in bad_vals or y in bad_vals: 49 | #print("Bad value: {},{}".format(x,y)) 50 | return True 51 | else: 52 | #print("Values Fine") 53 | return False 54 | 55 | def abs_diff(x, y): 56 | return np.abs(float(x) - float(y)) 57 | 58 | def discrete_diff(x, y): 59 | return float(x != y) 60 | 61 | 62 | def safe_wrapper(x, y, f): 63 | if either_bad(x, y): 64 | return 0. 
65 | else: 66 | #print("Trying {}".format(f)) 67 | return f(x,y) 68 | -------------------------------------------------------------------------------- /personalized_regression_figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blengerich/Personalized_Regression/dbd47797e9f6fece473125572c643e8568e222e1/personalized_regression_figure1.png -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | import time 4 | from numba import jit 5 | 6 | 7 | def load_delta_U(fname): 8 | # Assumes npz 9 | npz_arr = np.load(fname) 10 | delta_U = npz_arr['arr_0'] 11 | print("Successfully Loaded covariate distances from {}".format(fname)) 12 | return delta_U 13 | 14 | 15 | def create_delta_U(dml, U, dU, K, N, fname, normalize=True): 16 | # Assumes fname is .npz 17 | print("Calculating Pairwise Co-Variate Distances...") 18 | t = time.time() 19 | delta_U = dml.make_covariate_distances(U, dU, K, N, normalize) 20 | #delta_U = dml.make_covariate_distance_function(U, dU, K) 21 | print("Finished. Took {:.3f} seconds.".format(time.time() - t)) 22 | if fname is not None: 23 | print("Saving Pairwise Co-Variate Distances to {}".format(fname)) 24 | np.savez_compressed(fname, delta_U) 25 | return delta_U 26 | 27 | """ 28 | # TODO: Deprecated? 29 | def calc_prediction_error(Y, beta_hat, X, N): 30 | return 0.5*np.mean(np.square(Y - np.array([X[i].dot(beta_hat[i]) for i in range(N)]))) 31 | 32 | def calc_prediction_error_logistic(Y, beta_hat, X): 33 | return 0.5*np.mean([ 34 | np.log(np.exp(X[i].dot(beta_hat[i])) + 1) - Y[i]*X[i].dot(beta_hat[i]) 35 | for i in range(len(X))]) 36 | """ 37 | 38 | def print_errors(calc_pred_err, estimations, err_name, fname="results.txt"): 39 | with open(fname, 'a') as out_file: 40 | print("="*20 + " {} Error ".format(err_name) + "="*20) 41 | print("="*20 + " {} Error ".format(err_name) + "="*20, file=out_file) 42 | pred_errs = [] 43 | for (beta_hat, estimator_name) in estimations: 44 | err = calc_pred_err(beta_hat) 45 | pred_errs.append((err, estimator_name)) 46 | print("{}:{:.4f}".format(estimator_name, err)) 47 | print("{}:{:.4f}".format(estimator_name, err), file=out_file) 48 | return pred_errs 49 | 50 | 51 | def plot_learned_betas(true_beta, estimations, U): 52 | fig = plt.figure() 53 | 54 | # Assumes the first value in each row of U is a category 55 | colors = ['blue', 'green', 'cyan', 'orange', 'red'] 56 | true_color = 'black' 57 | true_marker = '*' 58 | markers = ['+', 'o', '.', 'x', 'v'] 59 | 60 | labels = set(U[:, 0]) 61 | for i, label in enumerate(labels): 62 | ax = fig.add_subplot(len(labels)/2+1, 2, i+1) 63 | ax.set_title("Type={}".format(label)) 64 | handles = [] 65 | descriptions = [] 66 | 67 | selection = U[:, 0] == label 68 | handle = ax.scatter( 69 | true_beta[selection, 0], 70 | true_beta[selection, 1], 71 | color=true_color, marker='*') 72 | handles.append(handle) 73 | descriptions.append('True Beta') 74 | for j, (estimation, estimator_name) in enumerate(estimations): 75 | #if 'Mixture' not in estimator_name: 76 | # continue 77 | #print(estimation) 78 | handle = ax.scatter( 79 | estimation[selection, 0]+np.random.normal(0, .02, np.sum(selection)), 80 | estimation[selection, 1]+np.random.normal(0, .02, np.sum(selection)), 81 | color=colors[j], marker='+') 82 | handles.append(handle) 83 | 
descriptions.append(estimator_name) 84 | 85 | ax = fig.add_subplot(len(labels)/2+1, 2, i+2) 86 | plt.legend(handles, descriptions, loc='upper center', bbox_to_anchor=(0.5, 1.05), 87 | ncol=2, fancybox=True, shadow=True) 88 | 89 | plt.show() 90 | 91 | 92 | #def softmax(x): 93 | @jit(nopython=True) 94 | def soft_normalize(x): 95 | """Compute softmax values for each sets of scores in x.""" 96 | exps = np.exp(x) 97 | return exps / np.sum(exps) 98 | 99 | """ 100 | def soft_normalize(ar, thresh=1e-3): 101 | # Makes the values in the array sum to 1, and no value is smaller than thresh. 102 | ar = np.maximum(thresh, ar) 103 | ar /= np.sum(ar) 104 | return ar 105 | """ 106 | 107 | def float_or_zero(x): 108 | try: 109 | return float(x) 110 | except ValueError: 111 | return 0. 112 | 113 | 114 | # TODO: Should do mean imputation, not 0. 115 | def to_one_hot(U, should_change): 116 | if should_change[0]: 117 | one_hot = to_one_hot_one_feature(U[:, 0]) 118 | else: 119 | one_hot = np.array([float_or_zero(U[i, 0]) for i in range(len(U))]) 120 | one_hot = np.expand_dims(one_hot, 1) 121 | #print("One Hot First Feature Shape:{}".format(one_hot.shape)) 122 | for j in range(1, U.shape[1]): 123 | if should_change[j]: 124 | #print("Changing {}".format(j)) 125 | one_hot_feature = to_one_hot_one_feature(U[:, j]) 126 | one_hot = np.hstack((one_hot, one_hot_feature)) 127 | else: 128 | continuous_feature = np.array([float_or_zero(U[i, j]) for i in range(len(U))]) 129 | continuous_feature = np.expand_dims(continuous_feature, 1) 130 | one_hot = np.hstack((one_hot, continuous_feature)) 131 | #print(one_hot.shape) 132 | return one_hot 133 | 134 | 135 | def to_one_hot_one_feature(U): 136 | """ Assumes U has a single feature. 137 | Returns matrix of size U.shape[0], number_unique + 1 138 | """ 139 | as_set = set(U) 140 | set_as_list = list(as_set) 141 | one_hot = np.zeros((U.shape[0], len(as_set))) 142 | for i in range(U.shape[0]): 143 | one_hot[i, set_as_list.index(U[i])] = 1 144 | return one_hot 145 | 146 | def to_color_map_one_feature(U): 147 | bad_vals = set(["None", None, "not reported"]) 148 | as_set = set(U) - bad_vals 149 | if "Breast" in as_set: 150 | print("Using pre-defined list") 151 | set_as_list = ["Pancreas", "Skin", "Thyroid", "Prostate", "Eye", "Kidney", "Uterus", "Liver", "Bladder", 152 | "Colorectal", "Esophagus", "Head and Neck", "Lymph Nodes", "Bile Duct", "Stomach", "Breast", "Brain", "Lung", "Ovary"] 153 | set_as_list.reverse() 154 | else: 155 | set_as_list = list(as_set) 156 | one_hot = np.zeros((U.shape[0])) 157 | for i in range(U.shape[0]): 158 | try: 159 | one_hot[i] = set_as_list.index(U[i]) 160 | except: 161 | one_hot[i] = -1 162 | return one_hot, set_as_list --------------------------------------------------------------------------------
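
A short usage note on the helpers above: building the pairwise covariate-distance tensor is O(N^2 K), so `utils.py` provides `create_delta_U` and `load_delta_U` to compute it once, cache it as an `.npz` file, and pass it back into `DistanceMatching.fit` via the `delta_U` argument. The sketch below is illustrative only; it continues the variables (`dm`, `X`, `Y`, `U`, `dU`, `K`, `N`) from the README example above, and the file name is an arbitrary choice.

```python
# Illustrative continuation of the README example: cache the covariate distances
# so that repeated fits (or other sessions) can skip recomputation.
from utils import create_delta_U, load_delta_U

delta_U = create_delta_U(dm, U, dU, K, N, fname="delta_U.npz", normalize=True)

# ...later, or in another session:
delta_U = load_delta_U("delta_U.npz")
Z, Q = dm.fit(X, Y, U, dU, delta_U=delta_U)
```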