├── README.md
├── ReadMe-AcademicUserAgreement.txt
└── VAE_Model.py

/README.md:
--------------------------------------------------------------------------------
# DeepProfile

We present the DeepProfile framework, which learns a variational autoencoder (VAE) network from thousands of publicly available gene expression samples and uses this network to encode a low-dimensional representation (LDR) to predict complex disease phenotypes. To our knowledge, DeepProfile is the first attempt to use deep learning to extract a feature representation from a vast quantity of unlabeled (i.e., lacking phenotype information) expression samples that are not incorporated into the prediction problem. We use DeepProfile to predict acute myeloid leukemia patients' in vitro responses to 160 chemotherapy drugs. We show that, when compared to the original features (i.e., expression levels) and LDRs from two commonly used dimensionality reduction methods, DeepProfile: (1) better predicts complex phenotypes, (2) better captures known functional gene groups, and (3) better reconstructs the input data. We show that DeepProfile is generalizable to other diseases and phenotypes by using it to predict ovarian cancer patients' tumor invasion patterns and breast cancer patients' disease subtypes.

Short paper is available: https://www.biorxiv.org/content/early/2018/03/08/278739
--------------------------------------------------------------------------------

/ReadMe-AcademicUserAgreement.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/suinleelab/DeepProfile/8c481e0e9ad33e8878a08efb406addf41631ecc1/ReadMe-AcademicUserAgreement.txt
--------------------------------------------------------------------------------

/VAE_Model.py:
--------------------------------------------------------------------------------
#***********************************************************************
# Author: Ayse Dincer
# Date: 23 May 2018
# Keras implementation of VAE for DeepProfile.
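# Usage (a sketch inferred from the sys.argv parsing below; argument names are placeholders):
#   python VAE_Model.py <input_file> <output_prefix> <hidden1_dim> <hidden2_dim> <latent_dim> <test_size> <epochs> <fold_count>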
# Paper is available: https://www.biorxiv.org/content/early/2018/03/08/278739
# Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
#***********************************************************************

import os
import numpy as np
import pandas as pd
import math
import csv
import sys

from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras

# Reparameterization trick: draw z = mean + sigma * epsilon with epsilon ~ N(0, I),
# so sampling stays differentiable with respect to the encoder parameters
def sampling(args):

    z_mean, z_log_var = args

    epsilon = K.random_normal(shape=K.shape(z_mean), mean=0., stddev=1.0)

    z = z_mean + K.exp(z_log_var / 2) * epsilon
    return z

# VAE loss: reconstruction error plus beta-weighted KL divergence from the
# standard normal prior; beta is used symbolically (rather than through
# K.get_value, which would freeze its compile-time value into the graph)
# so that the WarmUpCallback below can anneal it during training
def vae_loss(x_input, x_decoded):
    reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
    kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return K.mean(reconstruction_loss + (beta * kl_loss))

# Reconstruction loss, tracked as a separate metric
def reconstruction_loss(x_input, x_decoded):
    return metrics.mse(x_input, x_decoded)

# KL loss, tracked as a separate metric
def kl_loss(x_input, x_decoded):
    return -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)

# KL warm-up: increase beta by kappa after each epoch until it reaches 1
# (with kappa = 0, beta stays fixed at its initial value)
class WarmUpCallback(Callback):
    def __init__(self, beta, kappa):
        self.beta = beta
        self.kappa = kappa

    # Behavior on each epoch
    def on_epoch_end(self, epoch, logs={}):
        if K.get_value(self.beta) <= 1:
            K.set_value(self.beta, K.get_value(self.beta) + self.kappa)

# Read input file
input_filename = sys.argv[1]
output_filename = sys.argv[2]
input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE...")
print(input_df.shape)
print(input_df.head(5))
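# A note on the expected input, inferred from the parsing above and the
# row-wise train/test split below: a tab-separated matrix whose first column
# holds sample identifiers (used as the index) and whose remaining columns
# hold per-gene expression values. Column and sample names here are
# placeholders, e.g.
#
#   sample_id    GENE1    GENE2    ...
#   S1            0.12    -1.30    ...
#   S2            0.98     0.45    ...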
# Set hyperparameters
original_dim = input_df.shape[1]
intermediate1_dim = int(sys.argv[3])
intermediate2_dim = int(sys.argv[4])
latent_dim = int(sys.argv[5])

batch_size = 50
learning_rate = 0.0005
beta = K.variable(1)
kappa = 0

test_data_size = int(sys.argv[6])
epochs = int(sys.argv[7])
fold_count = int(sys.argv[8])

# Separate data into training and test sets: the last test_data_size rows are held out
input_df_training = input_df.iloc[:-1 * test_data_size, :]
input_df_test = input_df.iloc[-1 * test_data_size:, :]

print("INPUT DF")
print(input_df_training.shape)
print(input_df_training.index)
print("TEST DF")
print(input_df_test.shape)
print(input_df_test.index)

# Define encoder: two batch-normalized ReLU layers, then linear layers for z_mean and z_log_var
x = Input(shape=(original_dim, ))

net = Dense(intermediate1_dim)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

net4 = Dense(intermediate2_dim)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation('relu')(net5)

z_mean = Dense(latent_dim)(net6)
z_log_var = Dense(latent_dim)(net6)

# Sample from mean and var
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# Define decoder: mirror the encoder, with a linear output layer
decoder_h = Dense(intermediate2_dim, activation='relu')
decoder_h2 = Dense(intermediate1_dim, activation='relu')
decoder_mean = Dense(original_dim)

h_decoded = decoder_h(z)
h_decoded2 = decoder_h2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

# VAE model
vae = Model(x, x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
vae.compile(optimizer=adam, loss=vae_loss, metrics=[reconstruction_loss, kl_loss])
vae.summary()

# Train on the training data only, monitoring loss on the held-out test data
history = vae.fit(np.array(input_df_training), np.array(input_df_training),
                  shuffle=True,
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=2,
                  validation_data=(np.array(input_df_test), np.array(input_df_test)),
                  callbacks=[WarmUpCallback(beta, kappa)])

# Plot total loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('VAE Model Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig = plt.gcf()
fig.set_size_inches(14.5, 8.5)
plt.show()

# Plot reconstruction error per epoch
plt.plot(history.history['reconstruction_loss'])
plt.plot(history.history['val_reconstruction_loss'])
plt.title('VAE Model Reconstruction Error')
plt.ylabel('reconstruction error')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig = plt.gcf()
fig.set_size_inches(14.5, 8.5)
plt.show()


# Define encoder model mapping inputs to the latent means
encoder = Model(x, z_mean)

# Save the encoder architecture and weights
from keras.models import model_from_json

model_json = encoder.to_json()
with open("encoder" + str(fold_count) + ".json", "w") as json_file:
    json_file.write(model_json)

encoder.save_weights("encoder" + str(fold_count) + ".h5")
print("Saved model to disk")


# Define standalone decoder model mapping latent vectors back to expression space
decoder_input = Input(shape=(latent_dim, ))
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = decoder_h2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)
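# A hypothetical sketch (commented out, not executed here) of how the saved
# encoder could later be reloaded with the standard Keras JSON + weights
# pattern; loaded_encoder is a placeholder name:
#
#   with open("encoder" + str(fold_count) + ".json", "r") as json_file:
#       loaded_encoder = model_from_json(json_file.read())
#   loaded_encoder.load_weights("encoder" + str(fold_count) + ".h5")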
# Encode the test data into the latent representation and save the output
test_encoded = encoder.predict(np.array(input_df_test), batch_size=batch_size)
test_encoded_df = pd.DataFrame(test_encoded, index=input_df_test.index)
test_encoded_df.to_csv(output_filename + str(fold_count) + ".tsv", sep='\t', quoting=csv.QUOTE_NONE)

# Measure how well the model reconstructs the input data
test_reconstructed = decoder.predict(np.array(test_encoded_df))
test_reconstructed_df = pd.DataFrame(test_reconstructed, index=input_df_test.index, columns=input_df_test.columns)

recons_error = mean_squared_error(np.array(input_df_test), np.array(test_reconstructed_df))

print("TEST RECONSTRUCTION ERROR: " + str(recons_error))
--------------------------------------------------------------------------------
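Example invocation (a hypothetical sketch; the file name, output prefix, and
dimensions below are placeholders illustrating the eight positional arguments
parsed from sys.argv in VAE_Model.py):

    python VAE_Model.py expression_data.tsv encoded_output_ 250 100 50 100 50 0

This would train a VAE with 250- and 100-unit hidden layers and a
50-dimensional latent space, hold out the last 100 samples as test data, train
for 50 epochs, and write the encoded test data to encoded_output_0.tsv.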