├── README.md
├── ReadMe-AcademicUserAgreement.txt
└── VAE_Model.py

/README.md:
--------------------------------------------------------------------------------
# DeepProfile

We present the DeepProfile framework, which learns a variational autoencoder (VAE) network from thousands of publicly available gene expression samples and uses this network to encode a low-dimensional representation (LDR) to predict complex disease phenotypes. To our knowledge, DeepProfile is the first attempt to use deep learning to extract a feature representation from a vast quantity of unlabeled (i.e., lacking phenotype information) expression samples that are not incorporated into the prediction problem. We use DeepProfile to predict acute myeloid leukemia patients' in vitro responses to 160 chemotherapy drugs. We show that, when compared to the original features (i.e., expression levels) and LDRs from two commonly used dimensionality reduction methods, DeepProfile: (1) better predicts complex phenotypes, (2) better captures known functional gene groups, and (3) better reconstructs the input data. We show that DeepProfile is generalizable to other diseases and phenotypes by using it to predict ovarian cancer patients' tumor invasion patterns and breast cancer patients' disease subtypes.

Short paper is available: https://www.biorxiv.org/content/early/2018/03/08/278739
--------------------------------------------------------------------------------

/ReadMe-AcademicUserAgreement.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/suinleelab/DeepProfile/8c481e0e9ad33e8878a08efb406addf41631ecc1/ReadMe-AcademicUserAgreement.txt
--------------------------------------------------------------------------------

/VAE_Model.py:
--------------------------------------------------------------------------------
#***********************************************************************
# Author: Ayse Dincer
# Date: 23 May 2018
# Keras implementation of VAE for DeepProfile.
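# Usage (a sketch inferred from the sys.argv parsing below; argument names are placeholders):
#   python VAE_Model.py <input_file> <output_prefix> <hidden1_dim> <hidden2_dim> <latent_dim> <test_size> <epochs> <fold_count>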
# Paper is available: https://www.biorxiv.org/content/early/2018/03/08/278739
# Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
#***********************************************************************

import os
import numpy as np
import pandas as pd
import math
import csv
import sys

from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras

# Reparameterization trick: draw z = mean + sigma * epsilon with epsilon ~ N(0, I),
# so sampling stays differentiable with respect to the encoder parameters
def sampling(args):

    z_mean, z_log_var = args

    epsilon = K.random_normal(shape=K.shape(z_mean), mean=0., stddev=1.0)

    z = z_mean + K.exp(z_log_var / 2) * epsilon
    return z

# VAE loss: reconstruction error plus beta-weighted KL divergence from the
# standard normal prior; beta is used symbolically (rather than through
# K.get_value, which would freeze its compile-time value into the graph)
# so that the WarmUpCallback below can anneal it during training
def vae_loss(x_input, x_decoded):
    reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
    kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return K.mean(reconstruction_loss + (beta * kl_loss))

# Reconstruction loss, tracked as a separate metric
def reconstruction_loss(x_input, x_decoded):
    return metrics.mse(x_input, x_decoded)

# KL loss, tracked as a separate metric
def kl_loss(x_input, x_decoded):
    return -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)

# KL warm-up: increase beta by kappa after each epoch until it reaches 1
# (with kappa = 0, beta stays fixed at its initial value)
class WarmUpCallback(Callback):
    def __init__(self, beta, kappa):
        self.beta = beta
        self.kappa = kappa

    # Behavior on each epoch
    def on_epoch_end(self, epoch, logs={}):
        if K.get_value(self.beta) <= 1:
            K.set_value(self.beta, K.get_value(self.beta) + self.kappa)

# Read input file
input_filename = sys.argv[1]
output_filename = sys.argv[2]
input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE...")
print(input_df.shape)
print(input_df.head(5))
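# A note on the expected input, inferred from the parsing above and the
# row-wise train/test split below: a tab-separated matrix whose first column
# holds sample identifiers (used as the index) and whose remaining columns
# hold per-gene expression values. Column and sample names here are
# placeholders, e.g.
#
#   sample_id    GENE1    GENE2    ...
#   S1            0.12    -1.30    ...
#   S2            0.98     0.45    ...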
# Set hyperparameters
original_dim = input_df.shape[1]
intermediate1_dim = int(sys.argv[3])
intermediate2_dim = int(sys.argv[4])
latent_dim = int(sys.argv[5])

batch_size = 50
learning_rate = 0.0005
beta = K.variable(1)
kappa = 0

test_data_size = int(sys.argv[6])
epochs = int(sys.argv[7])
fold_count = int(sys.argv[8])

# Separate data into training and test sets: the last test_data_size rows are held out
input_df_training = input_df.iloc[:-1 * test_data_size, :]
input_df_test = input_df.iloc[-1 * test_data_size:, :]

print("INPUT DF")
print(input_df_training.shape)
print(input_df_training.index)
print("TEST DF")
print(input_df_test.shape)
print(input_df_test.index)

# Define encoder: two batch-normalized ReLU layers, then linear layers for z_mean and z_log_var
x = Input(shape=(original_dim, ))

net = Dense(intermediate1_dim)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

net4 = Dense(intermediate2_dim)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation('relu')(net5)

z_mean = Dense(latent_dim)(net6)
z_log_var = Dense(latent_dim)(net6)

# Sample from mean and var
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# Define decoder: mirror the encoder, with a linear output layer
decoder_h = Dense(intermediate2_dim, activation='relu')
decoder_h2 = Dense(intermediate1_dim, activation='relu')
decoder_mean = Dense(original_dim)

h_decoded = decoder_h(z)
h_decoded2 = decoder_h2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

# VAE model
vae = Model(x, x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
vae.compile(optimizer=adam, loss=vae_loss, metrics=[reconstruction_loss, kl_loss])
vae.summary()

# Train on the training data only, monitoring loss on the held-out test data
history = vae.fit(np.array(input_df_training), np.array(input_df_training),
                  shuffle=True,
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=2,
                  validation_data=(np.array(input_df_test), np.array(input_df_test)),
                  callbacks=[WarmUpCallback(beta, kappa)])

# Plot total loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('VAE Model Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig = plt.gcf()
fig.set_size_inches(14.5, 8.5)
plt.show()

# Plot reconstruction error per epoch
plt.plot(history.history['reconstruction_loss'])
plt.plot(history.history['val_reconstruction_loss'])
plt.title('VAE Model Reconstruction Error')
plt.ylabel('reconstruction error')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig = plt.gcf()
fig.set_size_inches(14.5, 8.5)
plt.show()


# Define encoder model mapping inputs to the latent means
encoder = Model(x, z_mean)

# Save the encoder architecture and weights
from keras.models import model_from_json

model_json = encoder.to_json()
with open("encoder" + str(fold_count) + ".json", "w") as json_file:
    json_file.write(model_json)

encoder.save_weights("encoder" + str(fold_count) + ".h5")
print("Saved model to disk")


# Define standalone decoder model mapping latent vectors back to expression space
decoder_input = Input(shape=(latent_dim, ))
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = decoder_h2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)
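# A hypothetical sketch (commented out, not executed here) of how the saved
# encoder could later be reloaded with the standard Keras JSON + weights
# pattern; loaded_encoder is a placeholder name:
#
#   with open("encoder" + str(fold_count) + ".json", "r") as json_file:
#       loaded_encoder = model_from_json(json_file.read())
#   loaded_encoder.load_weights("encoder" + str(fold_count) + ".h5")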
# Encode the test data into the latent representation and save the output
test_encoded = encoder.predict(np.array(input_df_test), batch_size=batch_size)
test_encoded_df = pd.DataFrame(test_encoded, index=input_df_test.index)
test_encoded_df.to_csv(output_filename + str(fold_count) + ".tsv", sep='\t', quoting=csv.QUOTE_NONE)

# Measure how well the model reconstructs the input data
test_reconstructed = decoder.predict(np.array(test_encoded_df))
test_reconstructed_df = pd.DataFrame(test_reconstructed, index=input_df_test.index, columns=input_df_test.columns)

recons_error = mean_squared_error(np.array(input_df_test), np.array(test_reconstructed_df))

print("TEST RECONSTRUCTION ERROR: " + str(recons_error))
--------------------------------------------------------------------------------
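Example invocation (a hypothetical sketch; the file name, output prefix, and
dimensions below are placeholders illustrating the eight positional arguments
parsed from sys.argv in VAE_Model.py):

    python VAE_Model.py expression_data.tsv encoded_output_ 250 100 50 100 50 0

This would train a VAE with 250- and 100-unit hidden layers and a
50-dimensional latent space, hold out the last 100 samples as test data, train
for 50 epochs, and write the encoded test data to encoded_output_0.tsv.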