├── COMPETITOR_TRAININGS ├── AE_2Layers_Model.py ├── Create_ICA_Data.py ├── Create_PCA_Data.py ├── Create_RP_Data.py ├── DAE_2Layers_Model.py ├── Example_Run_All.py ├── Get_AE_IG_Attributions.py ├── Get_DAE_IG_Attributions.py ├── IntegratedGradients.py ├── Train_AE_Models.py └── Train_DAE_Models.py ├── LICENSE ├── MODEL_TRAININGS ├── Create_DeepProfile_Ensemble_Weights.py ├── Create_DeepProfile_Training_Embeddings.py ├── Create_Ensemble_Labels.py ├── Create_PCs_for_DeepLearning_Models.py ├── Example_Run_All.py ├── Get_VAE_IG_Attributions.py ├── IntegratedGradients.py ├── Run_VAE_Models.py ├── Select_Latent_Dimension_with_Gmeans.ipynb ├── VAE_3Layers_Model.py └── gmeans.py ├── NORMAL_TISSUE_ANALYSIS ├── Create_DeepProfile_GTEX_Embeddings.py ├── Create_Gtex_Rnaseq_PCs.py ├── Encode_GTEX_Data_with_VAE.py ├── Example_Run_All.py ├── Gtex_Tissue_Name_Mappings.ipynb ├── Normal_Tissue_Classifier.py └── Preprocess_Gtex_Rnaseq_Expressions.py ├── PATHWAY_ANALYSIS ├── Create_Pathway_Matrices.py ├── Fishers_Test.py ├── PATHWAY_COVERAGE_ANALYSIS │ ├── Plot_of_Average_Pathway_Coverages.ipynb │ ├── Plot_of_Node_Level_Pathway_Annotations.ipynb │ ├── Plot_of_Pathway_Coverage_Distributions.ipynb │ ├── Plot_of_Pathway_Detection_Comparison_VAEs_vs_DeepProfile.ipynb │ └── Plot_of_Pathway_Percent_Comparison_VAEs_vs_DeepProfile.ipynb └── Run_Multiple_Fishers_Test.py ├── README.md └── TCGA_SURVIVAL_PREDICTION ├── COMPARING_RNASEQ_and_MICROARRAY ├── Create_DeepProfile_TCGA_Microarray_Embeddings.py ├── Create_TCGA_Microarray_PCs.py ├── Encode_TCGA_Microarray_Data_with_VAE.py ├── Preprocess_TCGA_Microarray_Expression.py └── Rnaseq_and_Microarray_Embedding_Correlation_Plots.ipynb ├── CREATE_EMBEDDINGS ├── Create_All_VAE_Embeddings.py ├── Create_DeepProfile_TCGA_Embeddings.py ├── Create_TCGA_Rnaseq_PCs.py ├── Encode_TCGA_Data_with_AE.py ├── Encode_TCGA_Data_with_DAE.py ├── Encode_TCGA_Data_with_ICA.py ├── Encode_TCGA_Data_with_PCA.py ├── Encode_TCGA_Data_with_RP.py ├── 
###############################
#AE model
#
#Trains a 2-layer autoencoder on the PCA-reduced expression matrix of one
#cancer type and saves the encoded training data plus encoder/decoder models.
#
#Usage: AE_2Layers_Model.py <cancer_type> <fold>
#
#Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
###############################

import os
import numpy as np
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras
import csv
import sys

#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the mean squared error between the input and its reconstruction."""
    return metrics.mse(x_input, x_decoded)

# Set hyperparameters
# NOTE: original_dim depends on the input file, so it is set further down,
# right after the input data has been read.
intermediate1_dim = 750
latent_dim = 150
cancer_type = sys.argv[1]
fold = int(sys.argv[2])

#SET RANDOM SEEDS (a different seed per fold)
from numpy.random import seed
seed(123456 * fold)
from tensorflow import set_random_seed
set_random_seed(123456 * fold)

init_mode = 'glorot_uniform'
batch_size = 100
epochs = 50
learning_rate = 0.0005
dropout = 0.1

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/AE_FILES/'

#Read input file
input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
output_filename = cancer_type + '_DATA_TOP2_INTERSECTION_GENES_encoded_'

input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE", input_df.shape)
print(input_df.head(5))
input_df_training = input_df

#Input width of the network = number of columns of the input table.
#This must come after the file is read: computing it earlier raised a
#NameError because input_df did not exist yet.
original_dim = input_df.shape[1]

#Define encoder
x = Input(shape=(original_dim, ))

net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

d1 = Dropout(dropout)(net3)
core = Dense(latent_dim, kernel_initializer=init_mode)(d1)

#Define decoder
#The layer objects are kept in variables so the same (trained) layers can be
#re-applied below to build a stand-alone decoder model.
decoder_h = Dense(intermediate1_dim, activation='relu', kernel_initializer=init_mode)
d2 = Dropout(dropout)
decoder_mean = Dense(original_dim, kernel_initializer=init_mode)

h_decoded = decoder_h(core)
h_decoded2 = d2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

#AE model
ae = Model(x, x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
ae.compile(optimizer=adam, loss = reconstruction_loss)
ae.summary()


#Train model (input and target are the same matrix)
history = ae.fit(np.array(input_df_training), np.array(input_df_training),
                 shuffle=True,
                 epochs=epochs,
                 batch_size=batch_size,
                 verbose = 2)

# DEFINE ENCODER
encoder = Model(x, core)

#DEFINE DECODER
decoder_input = Input(shape=(latent_dim, ))
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = d2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)


training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
training_encoded_df = pd.DataFrame(training_encoded, index = input_df_training.index)

# How well does the model reconstruct the input data
training_reconstructed = decoder.predict(np.array(training_encoded_df))
training_reconstructed_df = pd.DataFrame(training_reconstructed, index = input_df_training.index, columns = input_df_training.columns)

recons_error = mean_squared_error(np.array(input_df_training), np.array(training_reconstructed_df))

print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))

#Save encoded training data
training_encoded_df.to_csv(output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv", sep = '\t')


#SAVE ENCODER MODEL (architecture as JSON, weights as HDF5)
from keras.models import model_from_json

model_json = encoder.to_json()
with open(output_folder + "AE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

encoder.save_weights(output_folder + "AE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


#SAVE DECODER MODEL
model_json = decoder.to_json()
with open(output_folder + "AE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

decoder.save_weights(output_folder + "AE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


#Record training R2 (one score per row of the input table, then averaged)
from sklearn.metrics import r2_score

training_r2_vals = np.zeros(input_df_training.shape[0])
for i in range(input_df_training.shape[0]):
    training_r2 = r2_score(input_df_training.values[i, :], training_reconstructed_df.values[i, :])
    training_r2_vals[i] = training_r2

print("TRAINING R2 " + str(np.mean(training_r2_vals)))
###############################
#Script for training ICA models
#
#Usage: Create_ICA_Data.py <cancer_type>
###############################

import numpy as np
import pandas as pd
import math
import csv
from sklearn.decomposition import FastICA
import sys

#Cancer type comes from the first command-line argument
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/ICA_FILES/'

#Number of independent components to learn
L = 150
print("Number of latent nodes " + str(L))

data_df = pd.read_table(
    input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv',
    sep = '\t',
    index_col=0,
)
print("Training data ", data_df.shape)

#Plain numeric matrix for scikit-learn, passed through np.nan_to_num
training_data = np.nan_to_num(data_df.values)

#Fit 10 ICA models; each run uses its own random seed and is saved as
#fold 1..10
for run in range(10):
    ica = FastICA(n_components = L, random_state = 12345 * run, max_iter = 100000)
    print(ica)

    ica.fit(training_data)

    components = ica.components_
    print(components.shape)

    #Save the learned components, one row per column of the input table
    component_df = pd.DataFrame(components.T, index = data_df.columns)
    component_path = output_folder + cancer_type + '_DATA_TOP2_JOINED_ICA_COMPONENTS_' + str(L) + 'L_fold' + str(run + 1) + '.tsv'
    component_df.to_csv(component_path, sep = '\t')
###############################
#Script for training PCA models
#
#Usage: Create_PCA_Data.py <cancer_type>
###############################

import numpy as np
import pandas as pd
import csv
from sklearn.decomposition import PCA
import sys

#Cancer type comes from the first command-line argument
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PCA_FILES/'

def createData(cancer_type):
    """Fit a 150-component PCA on the training table and save the components.

    Reads ``<cancer_type>_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv`` from
    the module-level ``input_folder`` and writes the transposed component
    matrix (indexed by the input table's columns) to ``output_folder``.
    """
    n_latent = 150
    print("Number of latent nodes " + str(n_latent))

    expression_path = input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'
    expression_df = pd.read_table(expression_path, sep = '\t', index_col=0)
    print("Training data ", expression_df.shape)

    #Numeric matrix passed through np.nan_to_num before fitting
    matrix = np.nan_to_num(expression_df.values)

    #Fit PCA model
    model = PCA(n_components = n_latent)
    model.fit(matrix)
    loadings = model.components_
    print("PCA components ", loadings.shape)

    #Save the learned components
    output_path = output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_COMPONENTS_' + str(n_latent) + 'L.tsv'
    loadings_df = pd.DataFrame(loadings.T, index = expression_df.columns)
    loadings_df.to_csv(output_path, sep = '\t')


createData(cancer_type)
###############################
#Script for training Random Projection models
#
#Usage: Create_RP_Data.py <cancer_type>
###############################

import numpy as np
import pandas as pd
import csv
from sklearn.random_projection import GaussianRandomProjection
import sys

#Cancer type comes from the first command-line argument
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/RP_FILES/'

def createData(cancer_type):
    """Fit 10 Gaussian random projections and save each component matrix.

    Reads ``<cancer_type>_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv`` from
    the module-level ``input_folder``. Each of the 10 runs uses its own
    random seed and is written to ``output_folder`` as fold 1..10, with the
    component matrix transposed and indexed by the input table's columns.
    """
    n_latent = 150
    print("Number of latent nodes " + str(n_latent))

    expression_path = input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'
    expression_df = pd.read_table(expression_path, sep = '\t', index_col=0)
    print("Training data ", expression_df.shape)

    #Numeric matrix passed through np.nan_to_num before fitting
    matrix = np.nan_to_num(expression_df.values)

    #Fit 10 different RP models with different random seeds
    for run in range(10):
        projector = GaussianRandomProjection(n_components = n_latent, random_state = run * 12345)
        projector.fit(matrix)

        projection = projector.components_
        print(projection.shape)

        #Save the learned components
        output_path = output_folder + cancer_type + '_DATA_TOP2_JOINED_RP_COMPONENTS_fold' + str(run + 1) + '.tsv'
        pd.DataFrame(projection.T, index = expression_df.columns).to_csv(output_path, sep = '\t')


createData(cancer_type)
###############################
#Author: Ayse Dincer
#DAE model
#
#Trains a 2-layer denoising autoencoder on the PCA-reduced expression matrix
#of one cancer type and saves the encoded training data plus encoder/decoder
#models.
#
#Usage: DAE_2Layers_Model.py <cancer_type> <fold>
#
#Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
###############################

import os
import numpy as np
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras
import csv

import sys

#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the mean squared error between the input and its reconstruction."""
    return metrics.mse(x_input, x_decoded)

#Read cancer type from the first command-line argument
cancer_type = sys.argv[1]

# Set hyperparameters
intermediate1_dim = 750
latent_dim = 150
#NOTE(review): cancer_type is assigned a second time here; redundant but harmless
cancer_type = sys.argv[1]
fold = int(sys.argv[2])

init_mode = 'glorot_uniform'
batch_size = 100
epochs = 50
learning_rate = 0.0005
dropout = 0.1

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/DAE_FILES/'

#SET RANDOM SEEDS (a different seed per fold; must happen before the noise
#below is drawn and before any layer is created)
from numpy.random import seed
seed(123456 * fold)
from tensorflow import set_random_seed
set_random_seed(123456 * fold)

#Read input file
input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
output_filename = cancer_type + '_DATA_TOP2_INTERSECTION_GENES_encoded_'

input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE", input_df.shape)
print(input_df.head(5))
input_df_training = input_df

#Input width of the network = number of columns of the input table
original_dim = input_df.shape[1]

#Define noisy inputs: standard-normal noise added to every entry
noise = np.random.normal(loc=0, scale = 1, size=input_df_training.shape)
input_df_noisy = input_df_training.values + noise

#Define encoder
x = Input(shape=(original_dim, ))

net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

d1 = Dropout(dropout)(net3)
core = Dense(latent_dim, kernel_initializer=init_mode)(d1)

#Define decoder
#The layer objects are kept in variables so the same (trained) layers can be
#re-applied below to build a stand-alone decoder model.
decoder_h = Dense(intermediate1_dim, activation='relu', kernel_initializer=init_mode)
d2 = Dropout(dropout)
decoder_mean = Dense(original_dim, kernel_initializer=init_mode)

h_decoded = decoder_h(core)
h_decoded2 = d2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

#DAE model
dae = Model(x, x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
dae.compile(optimizer=adam, loss = reconstruction_loss)
dae.summary()


#Train the model: noisy matrix as input, clean matrix as target
history = dae.fit(np.array(input_df_noisy), np.array(input_df_training),
                  shuffle=True,
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose = 2)

# DEFINE ENCODER
encoder = Model(x, core)

#DEFINE DECODER
decoder_input = Input(shape=(latent_dim, ))
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = d2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)


#Encode the clean (noise-free) training data
training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
training_encoded_df = pd.DataFrame(training_encoded, index = input_df_training.index)

# How well does the model reconstruct the input data
training_reconstructed = decoder.predict(np.array(training_encoded_df))
training_reconstructed_df = pd.DataFrame(training_reconstructed, index = input_df_training.index, columns = input_df_training.columns)

recons_error = mean_squared_error(np.array(input_df_training), np.array(training_reconstructed_df))

print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))

#Save encoded training data
training_encoded_df.to_csv(output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv", sep = '\t')


#SAVE ENCODER MODEL (architecture as JSON, weights as HDF5)
from keras.models import model_from_json

model_json = encoder.to_json()
with open(output_folder + "DAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

encoder.save_weights(output_folder + "DAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


#SAVE DECODER MODEL
model_json = decoder.to_json()
with open(output_folder + "DAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

decoder.save_weights(output_folder + "DAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


#Record training R2 (one score per row of the input table, then averaged)
from sklearn.metrics import r2_score

training_r2_vals = np.zeros(input_df_training.shape[0])
for i in range(input_df_training.shape[0]):
    training_r2 = r2_score(input_df_training.values[i, :], training_reconstructed_df.values[i, :])
    training_r2_vals[i] = training_r2

print("TRAINING R2 " + str(np.mean(training_r2_vals)))
###############################
#Script for running integrated gradients to get gene-level attributions of each node
#
#Usage: Get_AE_IG_Attributions.py <cancer_type> <model_run>
###############################

import os
import numpy as np
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras
import csv
from keras.models import model_from_json
import sys

from IntegratedGradients import integrated_gradients

#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the mean squared error between the input and its reconstruction."""
    return metrics.mse(x_input, x_decoded)

#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Read all user inputs
cancer = sys.argv[1]
vae_run = int(sys.argv[2])
dimension = 150

#Both folders are derived from `cancer`; the previous code referenced an
#undefined name `cancer_type` here.
input_folder = '../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer + '/AE_FILES/'

#Load PCA weights
pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col = 0)
print("PCA COMPONENTS ", pca_df.shape)
pca_components = pca_df.values

#Attributions are computed for the single model run given on the command line
print("MODEL " + str(vae_run))

#Load model (architecture from JSON, weights from HDF5)
encoder_path = input_folder + 'AE_FILES/AE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run)
with open(encoder_path + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()

encoder = model_from_json(loaded_model_json)
encoder.load_weights(encoder_path + '.h5')
print("Loaded model from disk")

#Read input data
input_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv', index_col=0)
print("INPUT FILE ", input_df.shape)

#Define hyperparameters
input_df_training = input_df
batch_size = 50
learning_rate = 0.0005

#The loaded encoder is compiled only so that it carries an optimizer:
#integrated_gradients builds its gradient tensors through
#model.optimizer.get_gradients. No training happens in this script.
adam = optimizers.Adam(lr=learning_rate)
encoder.compile(optimizer=adam, loss = reconstruction_loss)
encoder.summary()

#Encode training data using the model
training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
print("ENCODED TRAINING DATA ", training_encoded.shape)

#Measure weights and save absolute value of importance, averaged over samples
ig = integrated_gradients(encoder)

#Hoisted out of the loops: neither changes between samples or nodes
training_values = input_df_training.values
pca_components_t = pca_components.T

overall_weights = np.zeros((pca_components.shape[0], dimension))

for latent in range(dimension):
    print("Node " + str(latent + 1))
    weights = np.zeros((pca_components.shape[0], input_df_training.shape[0]))

    for i, sample in enumerate(training_values):
        #Attribution of this latent node with respect to the PCA-space input,
        #mapped back through the PCA components
        #(presumably to gene space - rows of pca_df; confirm against the PCA script)
        vals = ig.explain(sample, latent)
        weights[:, i] = np.matmul(vals, pca_components_t)

    #Take absolute values avg over all samples
    overall_weights[:, latent] = np.mean(np.abs(weights), axis = 1)

ig_df = pd.DataFrame(overall_weights, index = pca_df.index)
print("EXPLANATIONS DF ", ig_df.shape)

ig_df.to_csv(output_folder + cancer + '_DATA_AE_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv', sep='\t', quoting = csv.QUOTE_NONE)
print(ig_df.shape)
###############################
#Script for running integrated gradients to get gene-level attributions of each node
#
#Usage: Get_DAE_IG_Attributions.py <cancer_type> <model_run>
###############################

import os
import numpy as np
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras
import csv
from keras.models import model_from_json
import sys

from IntegratedGradients import integrated_gradients

#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Read all user inputs
cancer = sys.argv[1]
vae_run = int(sys.argv[2])
dimension = 150

input_folder = '../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer + '/DAE_FILES/'

#Load PCA weights
pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col = 0)
print("PCA COMPONENTS ", pca_df.shape)
pca_components = pca_df.values

#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the mean squared error between the input and its reconstruction."""
    return metrics.mse(x_input, x_decoded)

#Attributions are computed for the single model run given on the command line
print("MODEL " + str(vae_run))

#Load model (architecture from JSON, weights from HDF5); the context manager
#closes the file even if reading fails
encoder_path = input_folder + 'DAE_FILES/DAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run)
with open(encoder_path + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()

encoder = model_from_json(loaded_model_json)
encoder.load_weights(encoder_path + '.h5')
print("Loaded model from disk")

#Read input data
input_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv', index_col=0)
print("INPUT FILE ", input_df.shape)

#Define hyperparameters
input_df_training = input_df
batch_size = 50
learning_rate = 0.0005

#The loaded encoder is compiled only so that it carries an optimizer:
#integrated_gradients builds its gradient tensors through
#model.optimizer.get_gradients. No training happens in this script.
adam = optimizers.Adam(lr=learning_rate)
encoder.compile(optimizer=adam, loss = reconstruction_loss)
encoder.summary()

#Encode training data using the model
training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
print("ENCODED TRAINING DATA ", training_encoded.shape)

#Measure weights and save absolute value of importance, averaged over samples
ig = integrated_gradients(encoder)

#Hoisted out of the loops: neither changes between samples or nodes
training_values = input_df_training.values
pca_components_t = pca_components.T

overall_weights = np.zeros((pca_components.shape[0], dimension))

for latent in range(dimension):
    print("Node " + str(latent + 1))
    weights = np.zeros((pca_components.shape[0], input_df_training.shape[0]))

    for i, sample in enumerate(training_values):
        #Attribution of this latent node with respect to the PCA-space input,
        #mapped back through the PCA components
        #(presumably to gene space - rows of pca_df; confirm against the PCA script)
        vals = ig.explain(sample, latent)
        weights[:, i] = np.matmul(vals, pca_components_t)

    #Take absolute values avg over all samples
    overall_weights[:, latent] = np.mean(np.abs(weights), axis = 1)

ig_df = pd.DataFrame(overall_weights, index = pca_df.index)
print("EXPLANATIONS DF ", ig_df.shape)

ig_df.to_csv(output_folder + cancer + '_DATA_DAE_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv', sep='\t', quoting = csv.QUOTE_NONE)
print(ig_df.shape)
################################################################
# Implemented by Naozumi Hiranuma (hiranumn@uw.edu)            #
#                                                              #
# Keras-compatible implementation of Integrated Gradients      #
# proposed in "Axiomatic attribution for deep neuron networks" #
# (https://arxiv.org/abs/1703.01365).                          #
#                                                              #
# Keywords: Shapley values, interpretable machine learning     #
################################################################

class integrated_gradients:
    """Integrated-gradients explainer for a compiled Keras model.

    Integrated gradients approximates Shapley values by integrating partial
    gradients with respect to input features from reference input to the
    actual input. This class implements the paper "Axiomatic attribution
    for deep neuron networks".

    The model must already be compiled: gradient tensors are obtained
    through ``model.optimizer.get_gradients``.
    """

    def __init__(self, model, outchannels=None, verbose=1):
        """Build one gradient function per output channel to be explained.

        Args:
            model: Keras ``Model`` or ``Sequential`` that you wish to explain.
            outchannels: Output channel indices to explain, useful when the
                model is multi-tasking. ``None`` or an empty sequence means
                "all channels of the model output".
            verbose: Truthy to print progress while building the functions.

        Raises:
            TypeError: If ``model`` is neither a ``Sequential`` nor a ``Model``.
            RuntimeError: If gradient functions are requested on a backend
                other than tensorflow or theano.
        """
        #get backend info (either tensorflow or theano)
        backend = K.backend()
        self.backend = backend

        #load model supports keras.Model and keras.Sequential
        if isinstance(model, Sequential):
            # Use the wrapped functional model when the Sequential exposes
            # one; otherwise fall back to the Sequential object itself.
            # NOTE(review): whether `.model` exists depends on the Keras
            # version - confirm for the version in use.
            self.model = getattr(model, "model", model)
        elif isinstance(model, Model):
            self.model = model
        else:
            # __init__ must not return a value; raise a clear error instead.
            raise TypeError("Invalid input model")

        #load input tensors
        self.input_tensors = list(self.model.inputs)
        # The learning phase flag is a bool tensor (0 = test, 1 = train)
        # to be passed as input to any Keras function that uses
        # a different behavior at train time and test time.
        self.input_tensors.append(K.learning_phase())

        #If output channels are specified, use them.
        #Otherwise evaluate all outputs.
        self.outchannels = [] if outchannels is None else outchannels
        if len(self.outchannels) == 0:
            if verbose: print("Evaluated output channel (0-based index): All")
            if backend == "tensorflow":
                self.outchannels = range(self.model.output.shape[1]._value)
            elif backend == "theano":
                self.outchannels = range(self.model.output._keras_shape[1])
        else:
            if verbose:
                print("Evaluated output channels (0-based index):")
                print(','.join([str(i) for i in self.outchannels]))

        #Build gradient functions for desired output channels.
        self.get_gradients = {}
        if verbose: print("Building gradient functions")

        # Evaluate over all requested channels.
        total = len(self.outchannels)
        for position, c in enumerate(self.outchannels):
            # Get tensor that calculates gradient
            if backend == "tensorflow":
                gradients = self.model.optimizer.get_gradients(self.model.output[:, c], self.model.input)
            elif backend == "theano":
                gradients = self.model.optimizer.get_gradients(self.model.output[:, c].sum(), self.model.input)
            else:
                raise RuntimeError("Unsupported Keras backend: " + str(backend))

            # Build computational graph that computes the tensors given inputs
            self.get_gradients[c] = K.function(inputs=self.input_tensors, outputs=gradients)

            # This takes a lot of time for a big model with many tasks.
            # So lets print the progress. Progress is based on the position
            # in the channel list, so it is also correct for a custom
            # (non-contiguous) list of channels.
            if verbose:
                sys.stdout.write('\r')
                sys.stdout.write("Progress: "+str(int((position+1)*1.0/total*1000)*1.0/10)+"%")
                sys.stdout.flush()
        # Done
        if verbose: print("\nDone.")

    def explain(self, sample, outc=0, reference=False, num_steps=50, verbose=0):
        """Return integrated-gradient attributions of ``sample`` for one channel.

        Args:
            sample: A numpy array (single-input model) or a list of numpy
                arrays (one per input stream).
            outc: Output channel to explain; must be one of the channels the
                explainer was built for.
            reference: Reference values (defaulted to 0s). ``False`` or
                ``None`` selects the default. For a list ``sample`` it must
                be a list of the same length.
            num_steps: Number of steps from reference values to the actual
                sample (defaulted to 50).
            verbose: Truthy to print which channel is being explained.

        Returns:
            A numpy array of attributions for an array ``sample``, or a list
            of such arrays for a list ``sample``.

        Raises:
            TypeError: If ``sample`` is neither a list nor a numpy array.
        """
        # Each element for each input stream.
        samples = []
        numsteps = []
        step_sizes = []

        # Identity checks: `reference != False` is ambiguous for arrays.
        has_reference = reference is not False and reference is not None

        # If multiple inputs are present, feed them as list of np arrays.
        if isinstance(sample, list):
            #If reference is present, reference and sample size need to be equal.
            if has_reference:
                assert len(sample) == len(reference)
            for i, stream in enumerate(sample):
                stream_reference = reference[i] if has_reference else False
                _output = integrated_gradients.linearly_interpolate(stream, stream_reference, num_steps)
                samples.append(_output[0])
                numsteps.append(_output[1])
                step_sizes.append(_output[2])

        # Or you can feed just a single numpy array.
        elif isinstance(sample, np.ndarray):
            _output = integrated_gradients.linearly_interpolate(sample, reference, num_steps)
            samples.append(_output[0])
            numsteps.append(_output[1])
            step_sizes.append(_output[2])

        else:
            raise TypeError("sample must be a numpy array or a list of numpy arrays")

        # Desired channel must be in the list of outputchannels
        assert outc in self.outchannels
        if verbose: print("Explaning the "+str(outc)+"th output.")

        # Inputs are the interpolated samples followed by the learning-phase
        # flag (0 = test).
        _input = list(samples)
        _input.append(0)

        gradients = self.get_gradients[outc](_input)
        if K.backend() == "theano" and len(self.model.inputs) == 1:
            gradients = [gradients]

        # Sum the gradients over the interpolation steps and scale by the
        # per-step input difference.
        explanation = [
            np.multiply(np.sum(gradients[i], axis=0), step_sizes[i])
            for i in range(len(gradients))
        ]

        # Format the return values according to the input sample.
        if isinstance(sample, list):
            return explanation
        return explanation[0]

    @staticmethod
    def linearly_interpolate(sample, reference=False, num_steps=50):
        """Return evenly spaced points on the line from ``reference`` to ``sample``.

        Args:
            sample: Numpy array of a sample.
            reference: Reference values with the same shape as ``sample``;
                ``False`` or ``None`` means all zeros.
            num_steps: Number of interpolation steps.

        Returns:
            A tuple ``(points, num_steps, step_size)``. ``points[s]`` equals
            ``reference + (sample - reference) * s / num_steps`` for
            ``s = 0 .. num_steps - 1`` (the sample itself is not included),
            and ``step_size`` is ``(sample - reference) / num_steps``.
        """
        # Use default reference values if reference is not specified
        if reference is False or reference is None:
            reference = np.zeros(sample.shape)

        # Reference and sample shape needs to match exactly
        assert sample.shape == reference.shape

        # Calculate stepwise difference from reference to the actual sample.
        delta = sample - reference
        ret = np.zeros((num_steps,) + tuple(sample.shape))
        for s in range(num_steps):
            ret[s] = reference + delta * (s * 1.0 / num_steps)

        return ret, num_steps, delta * (1.0 / num_steps)
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MODEL_TRAININGS/Create_DeepProfile_Ensemble_Weights.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #This script is for creating gene attribution matrices for DeepProfile 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | import sys 9 | 10 | #Read user input 11 | cancer_type = sys.argv[1] 12 | 13 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 14 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 15 | 16 | #Read all VAE model gene attributions 17 | L = 150 18 | data_df = pd.read_table(input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(100) + 'L_fold' + str(1) + '.tsv', index_col = 0) 19 | print(data_df.shape) 20 | basic_length = data_df.shape[0] 21 | 22 | weight_list = [] 23 | dims = [5, 10, 25, 50, 75, 100] 24 | run_count = 100 25 | for dim in dims: 26 | VAE_weights = np.zeros((run_count * dim, basic_length)) 27 | for i in range(run_count): 28 | data_df = pd.read_table(input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dim) + 'L_fold' + str(i) + '.tsv', index_col = 0) 29 | data_df = data_df.T 30 | #print(data_df.shape) 31 | start = dim * i 32 | end = dim * (i + 1) 33 | VAE_weights[start:end, :] = data_df.values 
34 | weight_list.append(VAE_weights) 35 | 36 | #Read the ensemble labels 37 | labels_df = pd.read_table(input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None) 38 | labels = labels_df.values 39 | print("Ensemble labels ", len(labels)) 40 | 41 | #Concatenate all the gene attributions 42 | joined_weights = np.concatenate(weight_list) 43 | print("Joined weights ", joined_weights.shape) 44 | 45 | #Create ensemble weights 46 | ensemble_weights = np.zeros((L, joined_weights.shape[1])) 47 | for label in range(L): 48 | indices = np.where(labels == label)[0] 49 | average_weights = np.mean(joined_weights[indices, :], axis = 0) 50 | ensemble_weights[label, :] = average_weights 51 | 52 | print("Ensemble weights ", ensemble_weights.shape) 53 | 54 | #Record ensemble weights 55 | ensemble_weights_df = pd.DataFrame(ensemble_weights, index = np.arange(L), columns = data_df.columns) 56 | ensemble_weights_df.to_csv(output_folder + cancer_type + '_DeepProfile_Ensemble_Gene_Importance_Weights_' + str(L) + 'L.tsv', sep = '\t') -------------------------------------------------------------------------------- /MODEL_TRAININGS/Create_DeepProfile_Training_Embeddings.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #This script is for creating training embeddings 3 | ############################### 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import csv 8 | import sys 9 | 10 | #Read user input 11 | cancer_type = sys.argv[1] 12 | 13 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 14 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 15 | 16 | #Read all training embeddings 17 | dims = [5, 10, 25, 50, 75, 100] 18 | data_list = [] 19 | for dim in dims: 20 | run = 100 21 | for i in range(run): 22 | data_df = pd.read_table(input_folder + 'VAE_FILES/' + cancer_type + '_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', 
index_col = 0) 23 | print(data_df.shape) 24 | data_list.append(data_df.values) 25 | 26 | joined_data = np.concatenate(data_list, axis=1) 27 | print("Joined training embeddings" , joined_data.shape) 28 | 29 | #Read the ensemble labels 30 | L = 150 31 | labels_df = pd.read_table(input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None) 32 | labels = labels_df.values 33 | print("Ensemble labels ", len(labels)) 34 | 35 | #Create ensemble training embeddings 36 | ensemble_embeddings = np.zeros((joined_data.shape[0], L)) 37 | for label in range(L): 38 | indices = np.where(labels == label)[0] 39 | average_values = np.mean(joined_data[:, indices], axis = 1) 40 | ensemble_embeddings[:, label] = average_values 41 | 42 | print("Training ensemble embedding ", ensemble_embeddings.shape) 43 | 44 | #Save the training embedding 45 | ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L)) 46 | ensemble_embeddings_df.to_csv(output_folder + cancer_type + '_DeepProfile_Training_Embedding_' + str(L) + 'L.tsv', sep = '\t') -------------------------------------------------------------------------------- /MODEL_TRAININGS/Create_Ensemble_Labels.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #This script is for learning ensemble labels for VAE models 3 | ############################### 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import csv 8 | import sys 9 | from sklearn.cluster import KMeans 10 | 11 | #Read user inputs 12 | cancer_type = sys.argv[1] 13 | final_dim = int(sys.argv[2]) 14 | print("FINAL DIM " + str(final_dim)) 15 | 16 | #Read all training embeddings 17 | dims = [5, 10, 25, 50, 75, 100] 18 | data_list = [] 19 | 20 | for dim in dims: 21 | run = 100 22 | for i in range(run): 23 | print(i) 24 | data_df = pd.read_table('../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/' + cancer_type + 
'_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0) 25 | print(data_df.shape) 26 | data_list.append(data_df.values) 27 | 28 | joined_df = np.concatenate(data_list, axis=1) 29 | print("Joined training embeddings" , joined_df.shape) 30 | 31 | #Apply kmeans clustering to this data 32 | X = joined_df 33 | 34 | kmeans = KMeans(n_clusters= final_dim, random_state=123).fit(X.transpose()) 35 | print("K-means labels ", kmeans.labels_) 36 | 37 | #Save labels 38 | np.savetxt('../ALL_CANCER_FILES/' + cancer_type + '/' + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(final_dim) + 'L.txt' , kmeans.labels_, delimiter=',') 39 | -------------------------------------------------------------------------------- /MODEL_TRAININGS/Create_PCs_for_DeepLearning_Models.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #This script is for PCA transforming the input data to pass to deep learning models 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | from sklearn.decomposition import PCA 9 | import sys 10 | 11 | #Read cancer type input 12 | cancer_type = sys.argv[1] 13 | #Read number of components 14 | component_count = int(sys.argv[2]) 15 | 16 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 17 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 18 | 19 | #Method for creating PCs 20 | def createPCs(cancer_type): 21 | 22 | print("************************* " + cancer_type) 23 | 24 | #Read training data 25 | data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0) 26 | print("Training data ", data_df.shape) 27 | training_data = data_df.values 28 | training_data = np.nan_to_num(training_data) 29 | 30 | #Transform training data to top principal components 31 | pca = PCA(n_components = component_count) 32 | 
pca.fit(training_data) 33 | components = pca.components_ 34 | print("PCA components ", components.shape) 35 | 36 | #Save the learned components 37 | component_df = pd.DataFrame(components.T, index = data_df.columns) 38 | component_df.to_csv(output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_' + str(component_count) + 'L_COMPONENTS.tsv', sep = '\t') 39 | 40 | #Save the encoded data 41 | encoded_data = pca.transform(training_data) 42 | print("PCA encoded data ", encoded_data.shape) 43 | encoded_df = pd.DataFrame(encoded_data, index = data_df.index) 44 | encoded_df.to_csv(output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_' + str(component_count) + 'L.tsv', sep = '\t') 45 | 46 | createPCs(cancer_type) -------------------------------------------------------------------------------- /MODEL_TRAININGS/Example_Run_All.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Example for training model for a cancer type 3 | ############################### 4 | import sys 5 | 6 | ##STEP 1: Creating PCs 7 | get_ipython().magic(u"run -i Create_PCs_for_DeepLearning_Models.py BRCA 1000") 8 | 9 | ##STEP 2: Training VAE models 10 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 5 0 100") 11 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 10 0 100") 12 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 25 0 100") 13 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 50 0 100") 14 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 75 0 100") 15 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 100 0 100") 16 | 17 | ##STEP 3: Running IG for VAE models 18 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 5 0 100") 19 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 10 0 100") 20 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 25 0 100") 21 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 50 0 100") 22 | get_ipython().magic(u"run -i 
Get_VAE_IG_Attributions.py BRCA 75 0 100") 23 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 100 0 100") 24 | 25 | ##STEP 4: Learning ensemble labels 26 | get_ipython().magic(u"run -i Create_Ensemble_Labels.py BRCA 150") 27 | 28 | ##STEP 5: Creating DeepProfile ensemble training embedding 29 | get_ipython().magic(u"run -i Create_DeepProfile_Training_Embeddings.py BRCA") 30 | 31 | ##STEP 6: Creating DeepProfile ensemble gene attribution matrices 32 | get_ipython().magic(u"run -i Create_DeepProfile_Ensemble_Weights.py BRCA") 33 | 34 | -------------------------------------------------------------------------------- /MODEL_TRAININGS/Get_VAE_IG_Attributions.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for running integrated gradients to get gene-level attributions of each node 3 | ############################### 4 | 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | import math 9 | from sklearn.metrics import mean_squared_error 10 | import tensorflow as tf 11 | from keras.layers import Input, Dense, Lambda, Layer, Activation 12 | from keras.layers.normalization import BatchNormalization 13 | from keras.models import Model 14 | from keras import backend as K 15 | from keras import metrics, optimizers 16 | from keras.callbacks import Callback 17 | import keras 18 | import csv 19 | from keras.models import model_from_json 20 | import sys 21 | 22 | #Prevent tensorflow from using all the memory 23 | config = tf.ConfigProto() 24 | config.gpu_options.allow_growth=True 25 | sess = tf.Session(config=config) 26 | 27 | #Read all user inputs 28 | cancer = sys.argv[1] 29 | dimension = int(sys.argv[2]) 30 | start = int(sys.argv[3]) 31 | end = int(sys.argv[4]) 32 | 33 | print("CANCER " + str(cancer)) 34 | print("DIM " + str(dimension)) 35 | print("START " + str(start)) 36 | print("END " + str(end)) 37 | 38 | input_folder = '../ALL_CANCER_FILES/' + cancer + '/' 39 | 
output_folder = '../ALL_CANCER_FILES/' + cancer + '/VAE_WEIGHTS/' 40 | 41 | #Load PCA weights 42 | pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col = 0) 43 | print("PCA COMPONENTS ", pca_df.shape) 44 | pca_components = pca_df.values 45 | 46 | #Read input data 47 | input_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv', index_col=0) 48 | print("INPUT FILE ", input_df.shape) 49 | 50 | #VAE loss definition 51 | def vae_loss(x_input, x_decoded): 52 | reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded) 53 | kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 54 | return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss)) 55 | 56 | #Save the weight for each run 57 | for vae_run in range(start, end): 58 | 59 | print("MODEL " + str(vae_run)) 60 | 61 | #Load model 62 | json_file = open(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.json', 'r') 63 | loaded_model_json = json_file.read() 64 | json_file.close() 65 | encoder = model_from_json(loaded_model_json) 66 | 67 | #Load weights 68 | encoder.load_weights(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.h5') 69 | print("Loaded model from disk") 70 | 71 | #Define hyperparameters 72 | input_df_training = input_df 73 | original_dim = input_df_training.shape[1] 74 | intermediate1_dim = 100 75 | intermediate2_dim = 25 76 | latent_dim = dimension 77 | 78 | batch_size = 50 79 | epochs = 50 80 | learning_rate = 0.0005 81 | beta = K.variable(1) 82 | kappa = 0 83 | 84 | #Encoder network 85 | x = Input(shape=(original_dim, )) 86 | 87 | net = Dense(intermediate1_dim)(x) 88 | net2 = BatchNormalization()(net) 89 | net3 = Activation('relu')(net2) 90 | 91 | net4 = Dense(intermediate2_dim)(net3) 92 | net5 = BatchNormalization()(net4) 93 | net6 = Activation('relu')(net5) 94 | 95 | z_mean = 
Dense(latent_dim)(net6) 96 | z_log_var = Dense(latent_dim)(net6) 97 | 98 | adam = optimizers.Adam(lr=learning_rate) 99 | encoder.compile(optimizer=adam, loss = vae_loss) 100 | encoder.summary() 101 | 102 | #Encode training data using the model 103 | training_encoded = encoder.predict(input_df_training, batch_size = batch_size) 104 | print("ENCODED TRAINING DATA ", training_encoded.shape) 105 | 106 | 107 | #Measure weights and save absolute value of importance, averaged over samples 108 | from IntegratedGradients import * 109 | 110 | ig = integrated_gradients(encoder) 111 | 112 | overall_weights = np.zeros((pca_components.shape[0], dimension)) 113 | 114 | #Go over each node 115 | for latent in range(dimension): 116 | print("Node " + str(latent + 1)) 117 | weights = np.zeros((pca_components.shape[0], input_df_training.shape[0])) 118 | 119 | #Go over each sample 120 | for i in range(input_df_training.shape[0]): 121 | #print("Sample " + str(i + 1)) 122 | vals = ig.explain(input_df_training.values[i, :], latent) 123 | new_vals = np.matmul(vals, pca_components.T) 124 | weights[:, i] = new_vals 125 | 126 | #Take absolute values avg over all samples 127 | overall_weights[:, latent] = np.mean(np.abs(weights), axis = 1) 128 | 129 | ig_df = pd.DataFrame(overall_weights, index = pca_df.index) 130 | print("EXPLANATIONS DF ", ig_df.shape) 131 | 132 | ig_df.to_csv(output_folder + cancer + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv', sep='\t', quoting = csv.QUOTE_NONE) 133 | 134 | -------------------------------------------------------------------------------- /MODEL_TRAININGS/IntegratedGradients.py: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # Implemented by Naozumi Hiranuma (hiranumn@uw.edu) # 3 | # # 4 | # Keras-compatible implmentation of Integrated Gradients # 5 | # proposed in "Axiomatic attribution for deep neuron 
networks" # 6 | # (https://arxiv.org/abs/1703.01365). # 7 | # # 8 | # Keywords: Shapley values, interpretable machine learning # 9 | ################################################################ 10 | 11 | from __future__ import division, print_function 12 | import numpy as np 13 | from time import sleep 14 | import sys 15 | import keras.backend as K 16 | 17 | from keras.models import Model, Sequential 18 | 19 | ''' 20 | Integrated gradients approximates Shapley values by integrating partial 21 | gradients with respect to input features from reference input to the 22 | actual input. The following class implements the paper "Axiomatic attribution 23 | for deep neuron networks". 24 | ''' 25 | class integrated_gradients: 26 | # model: Keras model that you wish to explain. 27 | # outchannels: In case the model are multi tasking, you can specify which output you want explain . 28 | def __init__(self, model, outchannels=[], verbose=1): 29 | 30 | #get backend info (either tensorflow or theano) 31 | self.backend = K.backend() 32 | 33 | #load model supports keras.Model and keras.Sequential 34 | if isinstance(model, Sequential): 35 | self.model = model.model 36 | elif isinstance(model, Model): 37 | self.model = model 38 | else: 39 | print("Invalid input model") 40 | return -1 41 | 42 | #load input tensors 43 | self.input_tensors = [] 44 | for i in self.model.inputs: 45 | self.input_tensors.append(i) 46 | # The learning phase flag is a bool tensor (0 = test, 1 = train) 47 | # to be passed as input to any Keras function that uses 48 | # a different behavior at train time and test time. 49 | self.input_tensors.append(K.learning_phase()) 50 | 51 | #If outputchanels are specified, use it. 52 | #Otherwise evalueate all outputs. 
53 | self.outchannels = outchannels 54 | if len(self.outchannels) == 0: 55 | if verbose: print("Evaluated output channel (0-based index): All") 56 | if K.backend() == "tensorflow": 57 | self.outchannels = range(self.model.output.shape[1]._value) 58 | elif K.backend() == "theano": 59 | self.outchannels = range(self.model.output._keras_shape[1]) 60 | else: 61 | if verbose: 62 | print("Evaluated output channels (0-based index):") 63 | print(','.join([str(i) for i in self.outchannels])) 64 | 65 | #Build gradient functions for desired output channels. 66 | self.get_gradients = {} 67 | if verbose: print("Building gradient functions") 68 | 69 | # Evaluate over all requested channels. 70 | for c in self.outchannels: 71 | # Get tensor that calculates gradient 72 | if K.backend() == "tensorflow": 73 | gradients = self.model.optimizer.get_gradients(self.model.output[:, c], self.model.input) 74 | if K.backend() == "theano": 75 | gradients = self.model.optimizer.get_gradients(self.model.output[:, c].sum(), self.model.input) 76 | 77 | # Build computational graph that computes the tensors given inputs 78 | self.get_gradients[c] = K.function(inputs=self.input_tensors, outputs=gradients) 79 | 80 | # This takes a lot of time for a big model with many tasks. 81 | # So lets print the progress. 82 | if verbose: 83 | sys.stdout.write('\r') 84 | sys.stdout.write("Progress: "+str(int((c+1)*1.0/len(self.outchannels)*1000)*1.0/10)+"%") 85 | sys.stdout.flush() 86 | # Done 87 | if verbose: print("\nDone.") 88 | 89 | 90 | ''' 91 | Input: sample to explain, channel to explain 92 | Optional inputs: 93 | - reference: reference values (defaulted to 0s). 94 | - steps: # steps from reference values to the actual sample (defualted to 50). 95 | Output: list of numpy arrays to integrated over. 96 | ''' 97 | def explain(self, sample, outc=0, reference=False, num_steps=50, verbose=0): 98 | 99 | # Each element for each input stream. 
100 | samples = [] 101 | numsteps = [] 102 | step_sizes = [] 103 | 104 | # If multiple inputs are present, feed them as list of np arrays. 105 | if isinstance(sample, list): 106 | #If reference is present, reference and sample size need to be equal. 107 | if reference != False: 108 | assert len(sample) == len(reference) 109 | for i in range(len(sample)): 110 | if reference == False: 111 | _output = integrated_gradients.linearly_interpolate(sample[i], False, num_steps) 112 | else: 113 | _output = integrated_gradients.linearly_interpolate(sample[i], reference[i], num_steps) 114 | samples.append(_output[0]) 115 | numsteps.append(_output[1]) 116 | step_sizes.append(_output[2]) 117 | 118 | # Or you can feed just a single numpy arrray. 119 | elif isinstance(sample, np.ndarray): 120 | _output = integrated_gradients.linearly_interpolate(sample, reference, num_steps) 121 | samples.append(_output[0]) 122 | numsteps.append(_output[1]) 123 | step_sizes.append(_output[2]) 124 | 125 | # Desired channel must be in the list of outputchannels 126 | assert outc in self.outchannels 127 | if verbose: print("Explaning the "+str(self.outchannels[outc])+"th output.") 128 | 129 | # For tensorflow backend 130 | _input = [] 131 | for s in samples: 132 | _input.append(s) 133 | _input.append(0) 134 | 135 | if K.backend() == "tensorflow": 136 | gradients = self.get_gradients[outc](_input) 137 | elif K.backend() == "theano": 138 | gradients = self.get_gradients[outc](_input) 139 | if len(self.model.inputs) == 1: 140 | gradients = [gradients] 141 | 142 | explanation = [] 143 | for i in range(len(gradients)): 144 | _temp = np.sum(gradients[i], axis=0) 145 | explanation.append(np.multiply(_temp, step_sizes[i])) 146 | 147 | # Format the return values according to the input sample. 
148 | if isinstance(sample, list): 149 | return explanation 150 | elif isinstance(sample, np.ndarray): 151 | return explanation[0] 152 | return -1 153 | 154 | 155 | ''' 156 | Input: numpy array of a sample 157 | Optional inputs: 158 | - reference: reference values (defaulted to 0s). 159 | - steps: # steps from reference values to the actual sample. 160 | Output: list of numpy arrays to integrate over. 161 | ''' 162 | @staticmethod 163 | def linearly_interpolate(sample, reference=False, num_steps=50): 164 | # Use default reference values if reference is not specified 165 | if reference is False: reference = np.zeros(sample.shape); 166 | 167 | # Reference and sample shape needs to match exactly 168 | assert sample.shape == reference.shape 169 | 170 | # Calcuated stepwise difference from reference to the actual sample. 171 | ret = np.zeros(tuple([num_steps] +[i for i in sample.shape])) 172 | for s in range(num_steps): 173 | ret[s] = reference+(sample-reference)*(s*1.0/num_steps) 174 | 175 | return ret, num_steps, (sample-reference)*(1.0/num_steps) 176 | -------------------------------------------------------------------------------- /MODEL_TRAININGS/Run_VAE_Models.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for training VAE models 3 | ############################### 4 | import sys 5 | 6 | cancer_type = sys.argv[1] 7 | latent = int(sys.argv[2]) 8 | start = int(sys.argv[3]) 9 | end = int(sys.argv[4]) 10 | 11 | if latent == 5: 12 | dim1 = 100 13 | dim2 = 25 14 | if latent == 10: 15 | dim1 = 250 16 | dim2 = 50 17 | if latent == 25: 18 | dim1 = 250 19 | dim2 = 100 20 | if latent == 50: 21 | dim1 = 250 22 | dim2 = 100 23 | if latent == 75: 24 | dim1 = 250 25 | dim2 = 100 26 | if latent == 100: 27 | dim1 = 250 28 | dim2 = 100 29 | 30 | for run in range(start, end): 31 | get_ipython().magic(u"run -i 'VAE_3Layers_Model.py' '" + cancer_type + "' " + str(dim1) + " " + str(dim2) + " " + 
str(latent) + " " + str(run)) 32 | -------------------------------------------------------------------------------- /MODEL_TRAININGS/Select_Latent_Dimension_with_Gmeans.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "###############################\n", 10 | "#g-means training to select ensemble latent dimension size\n", 11 | "\n", 12 | "###############################\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "import csv\n", 17 | "from sklearn.decomposition import PCA\n", 18 | "import sklearn.preprocessing\n", 19 | "\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import csv\n", 23 | "import sys" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "************************* BRCA\n", 36 | "Joined_df (11963, 26500)\n", 37 | "(26500, 11963)\n" 38 | ] 39 | }, 40 | { 41 | "name": "stderr", 42 | "output_type": "stream", 43 | "text": [ 44 | "/homes/gws/abdincer/.local/lib/python3.6/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. 
Use the functions in the public API at pandas.testing instead.\n", 45 | " import pandas.util.testing as tm\n" 46 | ] 47 | }, 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "Selected dimension 228\n", 53 | "************************* COLON\n", 54 | "Joined_df (5616, 26500)\n", 55 | "(26500, 5616)\n", 56 | "Selected dimension 195\n", 57 | "************************* LUNG\n", 58 | "Joined_df (4869, 26500)\n", 59 | "(26500, 4869)\n", 60 | "Selected dimension 166\n", 61 | "************************* AML\n", 62 | "Joined_df (6534, 26500)\n", 63 | "(26500, 6534)\n", 64 | "Selected dimension 57\n", 65 | "************************* BRAIN\n", 66 | "Joined_df (4282, 26500)\n", 67 | "(26500, 4282)\n", 68 | "Selected dimension 192\n", 69 | "************************* SKIN\n", 70 | "Joined_df (1240, 26500)\n", 71 | "(26500, 1240)\n", 72 | "Selected dimension 165\n", 73 | "************************* SARCOMA\n", 74 | "Joined_df (2330, 26500)\n", 75 | "(26500, 2330)\n", 76 | "Selected dimension 162\n", 77 | "************************* LIVER\n", 78 | "Joined_df (1937, 26500)\n", 79 | "(26500, 1937)\n", 80 | "Selected dimension 168\n", 81 | "************************* KIDNEY\n", 82 | "Joined_df (2293, 26500)\n", 83 | "(26500, 2293)\n", 84 | "Selected dimension 123\n", 85 | "************************* OV\n", 86 | "Joined_df (2714, 26500)\n", 87 | "(26500, 2714)\n", 88 | "Selected dimension 178\n", 89 | "************************* PROSTATE\n", 90 | "Joined_df (1195, 26500)\n", 91 | "(26500, 1195)\n", 92 | "Selected dimension 163\n", 93 | "************************* CERVICAL\n", 94 | "Joined_df (443, 26500)\n", 95 | "(26500, 443)\n", 96 | "Selected dimension 142\n", 97 | "************************* BLADDER\n", 98 | "Joined_df (371, 26500)\n", 99 | "(26500, 371)\n", 100 | "Selected dimension 136\n", 101 | "************************* STOMACH\n", 102 | "Joined_df (1742, 26500)\n", 103 | "(26500, 1742)\n", 104 | "Selected dimension 137\n", 105 | 
"************************* THYROID\n", 106 | "Joined_df (776, 26500)\n", 107 | "(26500, 776)\n", 108 | "Selected dimension 160\n", 109 | "************************* UTERINE\n", 110 | "Joined_df (661, 26500)\n", 111 | "(26500, 661)\n", 112 | "Selected dimension 156\n", 113 | "************************* HEAD_NECK\n", 114 | "Joined_df (643, 26500)\n", 115 | "(26500, 643)\n", 116 | "Selected dimension 156\n", 117 | "************************* PANCREAS\n", 118 | "Joined_df (602, 26500)\n", 119 | "(26500, 602)\n", 120 | "Selected dimension 145\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "cancer_types = ['BRCA', 'COLON', 'LUNG', 'AML',\n", 126 | " 'BRAIN', 'SKIN', 'SARCOMA', 'LIVER', \n", 127 | " 'KIDNEY', 'OV','PROSTATE', 'CERVICAL', \n", 128 | " 'BLADDER', 'STOMACH', 'THYROID', 'UTERINE', \n", 129 | " 'HEAD_NECK', 'PANCREAS']\n", 130 | " \n", 131 | "L_values = []\n", 132 | "for cancer_type in cancer_types:\n", 133 | " print(\"************************* \" + cancer_type)\n", 134 | " input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/' \n", 135 | "\n", 136 | " #all encodings for one dimension\n", 137 | " dims = [5, 10, 25, 50, 75, 100]\n", 138 | "\n", 139 | " data_list = []\n", 140 | "\n", 141 | " for dim in dims:\n", 142 | " run = 100\n", 143 | " for i in range(run):\n", 144 | " #print(i + 1)\n", 145 | " data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0) \n", 146 | " #print(data_df.shape)\n", 147 | " data_list.append(data_df.values)\n", 148 | "\n", 149 | "\n", 150 | " joined_df = np.concatenate(data_list, axis=1)\n", 151 | " print(\"Joined_df \", joined_df.shape)\n", 152 | "\n", 153 | " #Apply kmeans clustering to this data\n", 154 | " from sklearn.cluster import KMeans\n", 155 | " import numpy as np\n", 156 | " X = joined_df.T\n", 157 | " print(X.shape)\n", 158 | " \n", 159 | " from gmeans import *\n", 160 | " gmeans = GMeans(strictness=3, random_state = 
12345)\n", 161 | " gmeans.fit(X)\n", 162 | " gmeans.labels_\n", 163 | " selected_L = len(np.unique(gmeans.labels_))\n", 164 | " print(\"Selected dimension \", selected_L)\n", 165 | " \n", 166 | " L_values.append(selected_L)\n", 167 | "\n" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 3, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "[228, 195, 166, 57, 192, 165, 162, 168, 123, 178, 163, 142, 136, 137, 160, 156, 156, 145]\n", 180 | "157.16666666666666\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "print(L_values)\n", 186 | "print(np.mean(L_values))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.8" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 4 218 | } 219 | -------------------------------------------------------------------------------- /MODEL_TRAININGS/VAE_3Layers_Model.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #VAE model 3 | 4 | #Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py 5 | ############################### 6 | 7 | import os 8 | import numpy as np 9 | import pandas as pd 10 | import math 11 | from sklearn.metrics import mean_squared_error 12 | import matplotlib.pyplot as plt 13 | import tensorflow as tf 14 | from keras.layers import Input, Dense, Lambda, 
# Method for reparameterization trick to make model differentiable
def sampling(args):
    """Sample a latent vector from (z_mean, z_log_var) via the reparameterization trick.

    ``args`` is the two-element sequence handed over by the Keras Lambda
    layer: the mean tensor followed by the log-variance tensor.
    """
    # Function with args required for Keras Lambda function
    z_mean, z_log_var = args

    # Draw epsilon of the same shape from a standard normal distribution
    epsilon = K.random_normal(shape=K.shape(z_mean), mean=0., stddev=1.0)

    # exp(log_var / 2) is the standard deviation, so the sample stays
    # differentiable with respect to z_mean and z_log_var
    return z_mean + K.exp(z_log_var / 2) * epsilon


#Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Return the mean of (original_dim * MSE) + beta * KL divergence.

    Reads the module-level names ``original_dim``, ``z_mean``, ``z_log_var``
    and ``beta``, which the training script defines before compiling.
    """
    # Local names deliberately differ from the module-level metric functions
    # reconstruction_loss / kl_loss so they are not shadowed in here.
    recon_term = original_dim * metrics.mse(x_input, x_decoded)
    kl_term = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)

    # Multiply by the beta variable itself. K.get_value(beta) would bake the
    # value beta has while the loss graph is built into a constant, so later
    # K.set_value(beta, ...) calls from WarmUpCallback would have no effect.
    return K.mean(recon_term + (beta * kl_term))


#Method for calculating the reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the per-sample mean squared error (used as a Keras metric)."""
    return metrics.mse(x_input, x_decoded)


#Method for calculating the KL-divergence loss
def kl_loss(x_input, x_decoded):
    """Return the per-sample KL divergence term (used as a Keras metric).

    Both arguments are ignored; they exist because Keras calls metrics with
    (y_true, y_pred). The value comes from the module-level z_mean / z_log_var.
    """
    return - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)


class WarmUpCallback(Callback):
    """Raise the KL weight ``beta`` by ``kappa`` after each epoch, up to 1."""

    def __init__(self, beta, kappa):
        super(WarmUpCallback, self).__init__()
        self.beta = beta
        self.kappa = kappa

    # Behavior on each epoch
    def on_epoch_end(self, epoch, logs=None):
        current = K.get_value(self.beta)
        # Stop at 1 instead of overshooting to 1 + kappa on the last step.
        if current < 1:
            K.set_value(self.beta, min(current + self.kappa, 1.0))
file 72 | cancer_type = sys.argv[1] 73 | 74 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 75 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/' 76 | 77 | input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv' 78 | output_filename = cancer_type + '_DATA_TOP2_JOINED_encoded_' 79 | 80 | input_df = pd.read_table(input_filename, index_col=0) 81 | print("INPUT FILE", input_df.shape) 82 | print(input_df.head(5)) 83 | 84 | # Set hyperparameters 85 | original_dim = input_df.shape[1] 86 | intermediate1_dim = int(sys.argv[2]) 87 | intermediate2_dim = int(sys.argv[3]) 88 | latent_dim = int(sys.argv[4]) 89 | fold = int(sys.argv[5]) 90 | 91 | #SET RANDOM SEEDS 92 | from numpy.random import seed 93 | seed(123456 * fold) 94 | from tensorflow import set_random_seed 95 | set_random_seed(123456 * fold) 96 | 97 | 98 | init_mode = 'glorot_uniform' 99 | batch_size = 50 100 | epochs = 50 101 | learning_rate = 0.0005 102 | beta = K.variable(1) 103 | kappa = 0 104 | 105 | input_df_training = input_df 106 | 107 | #Define encoder 108 | x = Input(shape=(original_dim, )) 109 | 110 | net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x) 111 | net2 = BatchNormalization()(net) 112 | net3 = Activation('relu')(net2) 113 | 114 | net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3) 115 | net5 = BatchNormalization()(net4) 116 | net6 = Activation('relu')(net5) 117 | 118 | z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6) 119 | z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6) 120 | 121 | # Sample from mean and var 122 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 123 | 124 | #Define decoder 125 | decoder_h = Dense(intermediate2_dim, activation='relu', kernel_initializer=init_mode) 126 | decoder_h2 = Dense(intermediate1_dim, activation='relu', kernel_initializer=init_mode) 127 | decoder_mean = Dense(original_dim, kernel_initializer=init_mode) 128 | 129 | h_decoded = 
decoder_h(z) 130 | h_decoded2 = decoder_h2(h_decoded) 131 | x_decoded_mean = decoder_mean(h_decoded2) 132 | 133 | #VAE model 134 | vae = Model(x, x_decoded_mean) 135 | 136 | adam = optimizers.Adam(lr=learning_rate) 137 | vae.compile(optimizer=adam, loss = vae_loss, metrics = [reconstruction_loss, kl_loss]) 138 | vae.summary() 139 | 140 | #Train model 141 | history = vae.fit(np.array(input_df_training), np.array(input_df_training), 142 | shuffle=True, 143 | epochs=epochs, 144 | batch_size=batch_size, 145 | verbose = 2, 146 | callbacks=[WarmUpCallback(beta, kappa)]) 147 | 148 | # DEFINE ENCODER 149 | encoder = Model(x, z_mean) 150 | 151 | #DEFINE DECODER 152 | decoder_input = Input(shape=(latent_dim, )) 153 | _h_decoded = decoder_h(decoder_input) 154 | _h_decoded2 = decoder_h2(_h_decoded) 155 | _x_decoded_mean = decoder_mean(_h_decoded2) 156 | decoder = Model(decoder_input, _x_decoded_mean) 157 | 158 | 159 | training_encoded = encoder.predict(input_df_training, batch_size = batch_size) 160 | training_encoded_df = pd.DataFrame(training_encoded, index = input_df_training.index) 161 | 162 | # How well does the model reconstruct the input data 163 | training_reconstructed = decoder.predict(np.array(training_encoded_df)) 164 | training_reconstructed_df = pd.DataFrame(training_reconstructed, index = input_df_training.index, columns = input_df_training.columns) 165 | 166 | recons_error = mean_squared_error(np.array(input_df_training), np.array(training_reconstructed_df)) 167 | 168 | print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error)) 169 | 170 | #Save encoded test data 171 | training_encoded_df.to_csv(output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv", sep = '\t') 172 | 173 | 174 | #SAVE ENCODER MODEL 175 | from keras.models import model_from_json 176 | 177 | model_json = encoder.to_json() 178 | with open(output_folder + "VAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file: 179 | 
class GMeans(object):

    """Recursive 2-means clustering that stops splitting a cluster once its
    projection onto the axis between the two centers passes an
    Anderson-Darling normality test.

    strictness = how strict should the anderson-darling test for normality be
        0: not at all strict
        4: very strict

    After ``fit``, ``labels_`` holds one integer cluster id per input row and
    ``stopping_criteria`` lists why each branch of the recursion stopped
    ('max_depth', 'gaussian' or 'min_obs').
    """

    def __init__(self, min_obs=1, max_depth=10, random_state=None, strictness=4):

        super(GMeans, self).__init__()

        self.max_depth = max_depth

        self.min_obs = min_obs

        self.random_state = random_state

        if strictness not in range(5):
            raise ValueError("strictness parameter must be integer from 0 to 4")
        self.strictness = strictness

        self.stopping_criteria = []

        # Source of cluster ids; 0 is reserved for the unsplit root.
        self._label_counter = 0

    def _new_label(self):
        """Return a cluster id that has not been handed out during this fit."""
        self._label_counter += 1
        return self._label_counter

    def _gaussianCheck(self, vector):
        """
        check whether a given input vector follows a gaussian distribution
            H0: vector is distributed gaussian
            H1: vector is not distributed gaussian

        Returns True when the Anderson-Darling statistic does not exceed the
        critical value selected by ``self.strictness``.
        """
        result = anderson(vector)
        return bool(result.statistic <= result.critical_values[self.strictness])

    def _recursiveClustering(self, data, depth, index):
        """
        recursively run kmeans with k=2 on your data until a max_depth is reached or we have
            gaussian clusters

        ``index`` is a two-column integer array: column 0 is the row position
        in the fitted data, column 1 the cluster id currently assigned to it.
        Whenever a branch stops, ``index`` is written into ``self.data_index``.
        """
        depth += 1
        if depth == self.max_depth:
            self.data_index[index[:, 0]] = index
            self.stopping_criteria.append('max_depth')
            return

        km = MiniBatchKMeans(n_clusters=2, random_state=self.random_state)
        km.fit(data)

        # Project every point onto the vector joining the two centers and
        # test that 1-D projection for normality.
        centers = km.cluster_centers_
        v = centers[0] - centers[1]
        x_prime = scale(data.dot(v) / (v.dot(v)))

        if self._gaussianCheck(x_prime):
            self.data_index[index[:, 0]] = index
            self.stopping_criteria.append('gaussian')
            return

        # Decide whether the split is acceptable BEFORE descending into any
        # child. The parent keeps its label if any child is too small, so
        # recursing into the first child only to overwrite its result would
        # waste work and leave spurious entries in stopping_criteria.
        child_masks = [km.labels_ == k for k in sorted(set(km.labels_))]
        if any(mask.sum() <= self.min_obs for mask in child_masks):
            self.data_index[index[:, 0]] = index
            self.stopping_criteria.append('min_obs')
            return

        for mask in child_masks:
            # Boolean indexing copies, so the parent's index stays untouched.
            child_index = index[mask]
            # Counter-based ids are unique and reproducible; the previous
            # np.random.randint draw ignored random_state.
            child_index[:, 1] = self._new_label()
            self._recursiveClustering(data=data[mask], depth=depth, index=child_index)

    def fit(self, data):
        """
        fit the recursive clustering model to the data

        Sets ``labels_`` (one cluster id per row of ``data``) and returns
        ``self`` so calls can be chained.
        """
        self.data = data

        # Start every fit from a clean state so refitting does not mix results.
        self.stopping_criteria = []
        self._label_counter = 0

        n_rows = data.shape[0]
        data_index = np.column_stack((np.arange(n_rows), np.zeros(n_rows, dtype=int)))
        self.data_index = data_index

        self._recursiveClustering(data=data, depth=0, index=data_index)

        self.labels_ = self.data_index[:, 1]
        return self
def createData(cancer_type):
    """Project the GTEX expression matrix of ``cancer_type`` onto 1000 PCs.

    The PCA is fitted on the cancer training expression matrix (NaNs replaced
    by 0), the GTEX matrix is aligned to the training columns, mean-imputed,
    transformed, and written to HEALTHY_TISSUE_FILES as a TSV.
    """
    input_folder ='../ALL_CANCER_FILES/' + cancer_type + '/'

    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training expression dataframe ", data_df.shape)

    training_data = np.nan_to_num(data_df.values)

    #Train PCA models
    pca = PCA(n_components = 1000)
    pca.fit(training_data)
    components = pca.components_
    print("PCA Components ", components.shape)

    #Read GTEX expression dataframe
    test_df = pd.read_table(input_folder + 'HEALTHY_TISSUE_FILES/' + 'GTEX_' + cancer_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t', index_col=0)
    print("Gtex expression dataframe ", test_df.shape)

    #Get genes available in training dataset
    # Keep exactly the training columns, in training order; genes missing from
    # GTEX become NaN columns. This replaces concatenating the whole training
    # matrix with the GTEX matrix just to realign columns.
    test_df = test_df.reindex(columns=data_df.columns)

    print("Gtex expression dataframe ", test_df.shape)

    #Encode test data using trained PCA model
    # Column-mean imputation; columns that are entirely NaN fall back to 0.
    test_df = test_df.fillna(test_df.mean().fillna(0))
    test_data = test_df.values

    #Save the encoded data
    encoded_data = pca.transform(test_data)
    encoded_df = pd.DataFrame(encoded_data, index = test_df.index)
    print("GTEX PCA data ", encoded_df.shape)
    # head() must be called; printing encoded_df.head shows the bound method.
    print("GTEX PCA data ", encoded_df.head())
    encoded_df.to_csv(input_folder + '/HEALTHY_TISSUE_FILES/GTEX_' + cancer_type + '_DATA_1K_PCs.tsv', sep = '\t')
#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Return the mean of (original_dim * MSE) + beta * KL divergence.

    Reads the module-level names ``original_dim``, ``z_mean``, ``z_log_var``
    and ``beta`` defined by the placeholder model below. Nothing in this
    script calls it.
    """
    recon_term = original_dim * metrics.mse(x_input, x_decoded)
    kl_term = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    # Use the beta variable itself; K.get_value(beta) would freeze the weight
    # as a constant at graph-construction time.
    return K.mean(recon_term + (beta * kl_term))

#Read user inputs
import sys
cancer = sys.argv[1]
dimension = int(sys.argv[2])
start = int(sys.argv[3])
end = int(sys.argv[4])

print("CANCER NAME: " + cancer)
data_folder = '../ALL_CANCER_FILES/' + cancer + '/'

#Read GTEX expression
input_df_test = pd.read_table(data_folder + 'HEALTHY_TISSUE_FILES/GTEX_' + cancer + '_DATA_1K_PCs.tsv', index_col = 0)
print("GTEX expression dataframe ", input_df_test.shape)

#Define placeholder VAE model
# None of these values depend on the fold, so they are built once here rather
# than inside the loop, where every iteration added another copy of the layers.
original_dim = input_df_test.shape[1]
intermediate1_dim = 100
intermediate2_dim = 25
latent_dim = dimension

batch_size = 50
epochs = 50
learning_rate = 0.0005
beta = K.variable(1)
kappa = 0
init_mode = 'glorot_uniform'

x = Input(shape=(original_dim, ))

net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation('relu')(net5)

z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)

adam = optimizers.Adam(lr=learning_rate)

#Encode expression data with each VAE model
for fold in range(start, end):
    print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load VAE models
    # The context manager closes the JSON file even if reading fails.
    with open(data_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(data_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.h5')
    print("Loaded model from disk")

    # Encode test data using the VAE model
    test_encoded = encoder.predict(input_df_test, batch_size = batch_size)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    test_encoded_df.to_csv(data_folder + 'HEALTHY_TISSUE_FILES/' + 'GTEX_' + cancer + '_RNASeq_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
Encode_GTEX_Data_with_VAE.py BRCA 10 0 100") 12 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 25 0 100") 13 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 50 0 100") 14 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 75 0 100") 15 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 100 0 100") 16 | 17 | get_ipython().magic(u"run -i Create_DeepProfile_GTEX_Embeddings.py BRCA") 18 | 19 | #Train healthy tissue classifiers 20 | get_ipython().magic(u"run -i Normal_Tissue_Classifier.py BRCA") -------------------------------------------------------------------------------- /NORMAL_TISSUE_ANALYSIS/Normal_Tissue_Classifier.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for training classifiers for separating healthy and cancer tissue embeddings 3 | ############################### 4 | 5 | import pandas as pd 6 | import seaborn as sb 7 | import numpy as np 8 | import pickle 9 | import random 10 | from tqdm import * 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.model_selection import GridSearchCV 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.utils import resample 15 | from sklearn.preprocessing import StandardScaler 16 | 17 | 18 | def trainClassifier(cancer_type): 19 | 20 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 21 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/HEALTHY_TISSUE_FILES/' 22 | 23 | #Read cancer embedding 24 | cancer_data = pd.read_csv(input_folder + cancer_type + '_DeepProfile_Training_Embedding_150L.tsv',sep='\t',index_col=0) 25 | print("Cancer embedding ", cancer_data.shape) 26 | 27 | #Read GTEX embedding 28 | healthy_data = pd.read_csv(input_folder + 'HEALTHY_TISSUE_FILES/' + cancer_type + '_DeepProfile_GTEX_Healthy_Tissue_Embedding_150L.tsv',sep='\t',index_col=0) 29 | print("GTEX embedding ", healthy_data.shape) 30 | 31 | 
#Method for creating tissue-specific GTEX
def save_tissue_expression(cancer):
    """Write the log-scaled, standardized GTEX expression matrix for ``cancer``.

    Reads the sample list for the tissue, selects the matching columns of the
    module-level ``MAIN_df`` (using the module-level ``all_samples``),
    transposes to samples x genes, mean-imputes, takes the natural log,
    standardizes each gene and saves the result as a TSV in
    HEALTHY_TISSUE_FILES.
    """
    data_folder = '../ALL_CANCER_FILES/' + cancer + '/HEALTHY_TISSUE_FILES/'

    #Read sample names of tissue-specific samples
    # One name per line. Read it directly: pandas no longer accepts sep='\n'.
    with open(data_folder + 'GTEX_' + cancer + '_SAMPLES.txt') as sample_file:
        sample_lines = [line.rstrip('\r\n') for line in sample_file]
    sample_lines = [line for line in sample_lines if line]
    # NOTE(review): the first non-blank line is dropped because the previous
    # read_table call treated it as a header row - confirm the sample files
    # really start with a header and not with a sample id.
    cancer_specific_samples = np.asarray(sample_lines[1:], dtype=object)
    print("Samples ", cancer_specific_samples)

    #Find list of matching samples
    matching_samples = np.intersect1d(cancer_specific_samples, all_samples)
    print("MATCHING SAMPLES COUNT ", len(matching_samples))

    #Get the expression for these patients
    selected_df = MAIN_df[matching_samples]
    gene_names = MAIN_df['Description'].values
    # Transpose so rows are samples and columns are genes.
    cancer_df = pd.DataFrame(selected_df.values.T, index = selected_df.columns, columns = gene_names)
    print("Samples ", cancer_df.shape)
    print('Range ', (np.max(cancer_df.values) - np.min(cancer_df.values) ))

    #Mean impute the missing values
    cancer_df = cancer_df.fillna(cancer_df.mean().fillna(0))

    #Log scale the data and make 0-mean univariate
    scaled_expression_values = np.log(cancer_df.values)
    # log(0) is -inf; map those entries to 0. np.NINF no longer exists in
    # NumPy 2.0, so compare against -np.inf.
    scaled_expression_values[scaled_expression_values == -np.inf] = 0
    normalized_data = sklearn.preprocessing.scale(scaled_expression_values)

    column_means = np.mean(normalized_data, axis = 0)
    column_stds = np.std(normalized_data, axis = 0)
    print("Mean values ", column_means)
    print("Mean values ", len(column_means))
    print("Std values ", column_stds)
    print("Std values ", len(column_stds))

    #Save the final expression matrix
    cancer_df = pd.DataFrame(normalized_data, index = cancer_df.index, columns = cancer_df.columns)
    print("Final dataframe ", cancer_df.shape)
    print("Final dataframe ", cancer_df.head())
    print('Final dataframe range: ', (np.max(cancer_df.values) - np.min(cancer_df.values) ))

    cancer_df.to_csv(data_folder + 'GTEX_' + cancer + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
def create_pathway_matrix(cancer_type, pathway_file):
    """Build and save the binary gene-by-pathway membership matrix.

    Reads the training expression data of ``cancer_type`` to obtain the gene
    list, reads the requested MSigDB ``.gmt`` collection, marks with 1 every
    (gene, pathway) pair where the gene belongs to the pathway, and writes the
    matrix plus the gene symbol list to the module-level ``output_folder``.

    Args:
        cancer_type: Cancer type name used in the input file name.
        pathway_file: Key of the MSigDB collection ('C2', 'H', 'C4_CGN',
            'C4_CM', 'C5_BP', 'C5_CC', 'C5_MF', 'C6' or 'C7').

    Raises:
        ValueError: If ``pathway_file`` is not a known collection key (the
            original code failed later with an unbound ``filename``).

    Relies on the module-level ``input_folder`` and ``output_folder``.
    """
    gmt_files = {
        'C2': 'MSIGDB_PATHWAYS/c2.v6.2.symbols.gmt',
        'H': 'MSIGDB_PATHWAYS/h.all.v6.2.symbols.gmt',
        'C4_CGN': 'MSIGDB_PATHWAYS/c4.cgn.v6.2.symbols.gmt',
        'C4_CM': 'MSIGDB_PATHWAYS/c4.cm.v6.2.symbols.gmt',
        'C5_BP': 'MSIGDB_PATHWAYS/c5.bp.v6.2.symbols.gmt',
        'C5_CC': 'MSIGDB_PATHWAYS/c5.cc.v6.2.symbols.gmt',
        'C5_MF': 'MSIGDB_PATHWAYS/c5.mf.v6.2.symbols.gmt',
        'C6': 'MSIGDB_PATHWAYS/c6.all.v6.2.symbols.gmt',
        'C7': 'MSIGDB_PATHWAYS/c7.all.v6.2.symbols.gmt',
    }
    if pathway_file not in gmt_files:
        raise ValueError("Unknown pathway collection '" + str(pathway_file) +
                         "'; expected one of " + ", ".join(sorted(gmt_files)))
    filename = gmt_files[pathway_file]

    #1) Read input data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', index_col=0)
    print("Input data ", data_df.shape)
    gene_names = data_df.columns

    #2) Read pathway data (one pathway per line)
    with open(filename) as f:
        content = [x.strip() for x in f]

    pathway_count = len(content)
    print("Pathway count " + str(pathway_count))

    #Map every gene symbol to its first row, matching the original
    #np.where(...)[0][0] lookup but in O(1) per gene.
    gene_rows = {}
    for row, gene in enumerate(gene_names):
        gene_rows.setdefault(gene, row)

    #np.int was only an alias of the builtin int and was removed from NumPy
    pathway = np.zeros((len(gene_names), pathway_count), dtype = int)
    pathway_names = []
    pathway_lens = []

    for i, line in enumerate(content):
        #Tab-separated fields; genes start at the third field
        data = line.split("\t")
        genes = data[2:]
        pathway_names.append(data[0])
        pathway_lens.append(len(genes))

        #Mark the genes of this pathway that are present in the training data
        for gene in genes:
            row = gene_rows.get(gene)
            if row is not None:
                pathway[row, i] = 1

    #3) Save matrix
    new_df = pd.DataFrame(pathway, index = gene_names, columns = pathway_names)
    print("Pathway matrix ", new_df.shape)
    print("Average pathway length ", np.mean(pathway_lens))
    #Note: despite the label, this second line prints the full list of lengths
    print("Average pathway length ", pathway_lens)
    new_df.to_csv(output_folder + 'PATHWAY_' + pathway_file + '_MATRIX_INTERSECTION_GENES.tsv', sep='\t', quoting = csv.QUOTE_NONE)

    #Also record gene symbols
    with open(output_folder + 'Gene_Symbols.txt', 'w') as f:
        for item in gene_names:
            f.write("%s\n" % item)
###############################
#Script for running fisher's test for pathway enrichment analysis
#
#Usage: Fishers_Test.py CANCER PATHWAY_TYPE METHOD START END [DIMENSION]
#For every run in [START, END) the script loads the gene weights of METHOD,
#tests the top N genes of every node against every pathway with Fisher's
#exact test, and records uncorrected and FDR-corrected p-values.
###############################

#Hoisted: the original re-imported scipy.stats inside the innermost loop
import scipy.stats as stats

#Read user inputs
cancer_type = sys.argv[1]
pathway_type = sys.argv[2]
method = sys.argv[3]
start = int(sys.argv[4])
end = int(sys.argv[5])
if len(sys.argv) > 6:
    dimension = int(sys.argv[6])
    L = dimension
else:
    dimension = None
    L = 150

#The VAE branch builds its file names from the dimension
if method == 'VAE' and dimension is None:
    raise ValueError("Method 'VAE' requires the latent dimension as the 6th argument")

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PATHWAY_FILES/'

pathway_matrix = pd.read_table(input_folder + 'PATHWAY_FILES/PATHWAY_' + pathway_type + '_MATRIX_INTERSECTION_GENES.tsv', index_col = 0)
print(pathway_matrix.shape)
pathway_df = pathway_matrix
pathway_matrix = pathway_matrix.values

#Number of top-weighted genes tested per node, by pathway collection.
#NOTE(review): the original comment called these "average number of
#pathways"; presumably the average pathway size of the collection - confirm.
top_gene_counts = {
    'C2': 51,
    'C4_CM': 113,
    'C4_CGN': 99,
    'C6': 166,
    'C5_BP': 114,
    'C5_CC': 151,
    'C5_MF': 106,
    'H': 146,
}
if pathway_type not in top_gene_counts:
    raise ValueError("No top-gene count defined for pathway type '" + pathway_type + "'")
N = top_gene_counts[pathway_type]

#Pathway membership and sizes do not depend on the run or the node
membership = (pathway_matrix == 1)
pathway_sizes = membership.sum(axis = 0)
total_genes = pathway_matrix.shape[0]

#Run test for each run
for run in range(start, end):
    #Load the node x gene weight matrix of the requested method
    if method == 'PCA':
        data_df = pd.read_table(input_folder + 'PCA_FILES/' + cancer_type + '_DATA_TOP2_JOINED_PCA_COMPONENTS_150L.tsv', index_col = 0)
        ensemble_weights = np.abs(data_df.values.T)
    elif method == 'ICA':
        data_df = pd.read_table(input_folder + 'ICA_FILES/' + cancer_type + '_DATA_TOP2_JOINED_ICA_COMPONENTS_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
        ensemble_weights = np.abs(data_df.values.T)
    elif method == 'RP':
        data_df = pd.read_table(input_folder + 'RP_FILES/' + cancer_type + '_DATA_TOP2_JOINED_RP_COMPONENTS_fold' + str(run + 1) + '.tsv', index_col = 0)
        ensemble_weights = np.abs(data_df.values.T)
    elif method == 'AE':
        data_df = pd.read_table(input_folder + 'AE_FILES/' + cancer_type + '_DATA_AE_Weights_TRAINING_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
        ensemble_weights = data_df.values.T
    elif method == 'DAE':
        data_df = pd.read_table(input_folder + 'DAE_FILES/' + cancer_type + '_DATA_DAE_Weights_TRAINING_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
        ensemble_weights = data_df.values.T
    elif method == 'DeepProfile':
        data_df = pd.read_table(input_folder + cancer_type + '_DeepProfile_Ensemble_Gene_Importance_Weights_150L.tsv', index_col = 0)
        ensemble_weights = data_df.values
    elif method == 'VAE':
        #Note: the VAE weights are read for fold `run` (not run + 1)
        data_df = pd.read_table(input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(run) + '.tsv', index_col = 0)
        ensemble_weights = data_df.values.T
    else:
        raise ValueError("Unknown method '" + method + "'")
    print(data_df.shape)
    print(ensemble_weights.shape)

    #Gene positions of the weights are compared with pathway matrix rows,
    #so both must describe the same gene list
    if ensemble_weights.shape[1] != total_genes:
        raise ValueError("Weight matrix has " + str(ensemble_weights.shape[1]) +
                         " genes but the pathway matrix has " + str(total_genes))

    #Apply fisher test
    node_count = ensemble_weights.shape[0]
    p_vals = np.zeros((node_count, pathway_matrix.shape[1]))

    print("Running for top ", N, " genes")

    for i in range(node_count):
        print(i)

        #Top N genes of this node (computed once per node, not per pathway)
        gene_indices = ensemble_weights[i, :].argsort()[-N:][::-1]
        top_count = len(gene_indices)

        #Number of top genes inside each pathway, for all pathways at once
        hits = membership[gene_indices, :].sum(axis = 0)

        for j in range(p_vals.shape[1]):
            in_pathway_firstN = hits[j]
            out_pathway_firstN = top_count - in_pathway_firstN
            in_pathway_other = pathway_sizes[j] - in_pathway_firstN
            #Genes neither in the pathway nor among the top genes. The
            #original did not subtract the top genes here, so its table
            #summed to (number of genes + N) instead of the number of genes.
            out_pathway_other = total_genes - top_count - in_pathway_other

            #Create contingency matrix
            matrix = np.array([[in_pathway_firstN, in_pathway_other],
                               [out_pathway_firstN, out_pathway_other]])

            oddsratio, pvalue = stats.fisher_exact(matrix)
            p_vals[i, j] = pvalue

    #Output names: VAE results also carry the latent dimension
    if method == 'VAE':
        run_tag = method + '_' + str(dimension) + 'L_' + str(run + 1)
    else:
        run_tag = method + '_' + str(run + 1)

    #1-based node labels, sized from the data instead of the assumed L
    node_index = np.arange(node_count) + 1

    #Record uncorrected p-values
    p_vals_df = pd.DataFrame(p_vals, index = node_index, columns = pathway_df.columns)
    p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_UNCORRECTED_PVALS_' + pathway_type + '_' + run_tag + '.tsv', sep = '\t')

    #Record corrected p-values (Benjamini-Hochberg across nodes, per pathway)
    new_p_values = np.zeros((p_vals.shape[0], p_vals.shape[1]))
    for i in range(pathway_matrix.shape[1]):
        corrected_pval = sm.stats.multipletests( p_vals[:, i], alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)[1]
        new_p_values[:, i] = corrected_pval

    #Pathways significant for at least one node
    significant_columns = np.where(new_p_values < 0.05)[1]
    unique_count = len(np.unique(significant_columns))
    print("UNIQUE PATHWAY COUNT: " + str(unique_count))

    p_vals_df = pd.DataFrame(new_p_values, index = node_index, columns = pathway_df.columns)
    p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_FDR_CORRECTED_PVALS_' + pathway_type + '_' + run_tag + '.tsv', sep = '\t')

    #Average number of significant pathways per node (was hard-coded / 150)
    print("AVERAGE PATHWAY COUNT: ", len(significant_columns) / float(node_count))
###############################
#Script for running multiple FETs
#
#Usage (inside IPython, where get_ipython is available):
#    %run Run_Multiple_Fishers_Test.py CANCER PATHWAY
###############################

import sys

cancer_type = sys.argv[1]
pathway = sys.argv[2]

#(method, start run, end run) passed to Fishers_Test.py.
#Fishers_Test.py reads fold `run + 1` for ICA/RP/AE/DAE, so 0..10 covers
#folds 1-10 and -1..9 covers folds 0-9.
fisher_runs = [
    ("DeepProfile", 0, 1),
    ("PCA", 0, 1),
    ("ICA", 0, 10),
    ("RP", 0, 10),
    ("AE", -1, 9),
    ("DAE", -1, 9),
]

#run_line_magic replaces the deprecated InteractiveShell.magic call
shell = get_ipython()
for method, start, end in fisher_runs:
    shell.run_line_magic("run", "-i 'Fishers_Test.py' '" + cancer_type + "' " + pathway + " " + method + " " + str(start) + " " + str(end))
8 | 9 | The script **Example_Run_All.py** includes all commands for training DeepProfile model for one cancer type. 10 | 11 | **STEP 1: Creating PCs for each data** 12 | 13 | Create a folder **./ALL_CANCER_FILES/CANCER/** then download the data and save in that folder. 14 | **Create_PCs_for_DeepLearning_Models.py** takes a cancer type and component_count as input and applies PCA on the training data to train deep learning models. 15 | 16 | **STEP 2: Training VAE models** 17 | 18 | **VAE_3Layers_Model.py** is the Keras implementation of VAE model. 19 | **Run_VAE_Models.py** takes the cancer type, number of hidden nodes, and start-end folds to train VAE models for the given cancer type. 20 | 21 | **STEP 3: Running IG for VAE models** 22 | 23 | **IntegratedGradients.py** is the Keras implementation for Integrated Gradients feature attribution method. 24 | **Get_VAE_IG_Attributions.py** is the script for running IG and get gene-level explanations for each of the nodes. It takes the cancer type, number of hidden nodes, and start-end folds to get explanations for the VAE models for the given cancer type. 25 | 26 | **STEP 4: Learning ensemble labels** 27 | 28 | **Create_Ensemble_Labels.py** is the script for running k-means clustering to learn ensemble weights. It takes the cancer type and number of final latent nodes as the input and saves the ensemble labels. 29 | **Select_Latent_Dimension_with_Gmeans** is the notebook for running g-means clustering to select the ensemble latent dimension size. 30 | 31 | **STEP 5: Creating DeepProfile ensemble training embedding** 32 | 33 | **Create_DeepProfile_Training_Embeddings.py** is the script for joining all the training data VAE embeddings and ensembling them using the learned ensemble labels. It takes the cancer type as the input and creates training DeepProfile ensemble embedding. 
34 | 35 | **STEP 6: Creating DeepProfile ensemble gene attribution matrices** 36 | 37 | **Create_DeepProfile_Ensemble_Weights.py** is the script for joining all the VAE gene attributions and ensembling them using the learned ensemble labels. It takes the cancer type as the input and creates DeepProfile gene attribution matrix. 38 | 39 | 40 | 41 | ### PART 2: TRAINING COMPETITOR MODELS 42 | 43 | The script **Example_Run_All.py** includes all commands for training competitor models for one cancer type. 44 | 45 | In **COMPETITOR_TRAININGS**, all the scripts for comparing DeepProfile to other methods is included 46 | 47 | **STEP 1: Training PCA Models** 48 | 49 | **Create_PCA_Data.py** takes a cancer type and creates PCA components for the training data. 50 | 51 | **STEP 2: Training ICA Models** 52 | 53 | **Create_ICA_Data.py** takes a cancer type and creates ICA components for the training data, repeating 10 times. 54 | 55 | **STEP 3: Training RP Models** 56 | 57 | **Create_RP_Data.py** takes a cancer type and creates RP components for the training data, repeating 10 times. 58 | 59 | **STEP 4: Training AE Models** 60 | 61 | **AE_2Layers_Model.py** is the Keras implementation of AE model. 62 | **Train_AE_Models.py** takes a cancer type as input and trains 10 AE models with different random seeds. 63 | **Get_AE_IG_Attributions.py** is the script for running IG and get gene-level explanations for each of the nodes. It takes the cancer type and fold to get explanations for the AE models for the given cancer type. 64 | 65 | **STEP 5: Training DAE Models** 66 | 67 | **DAE_2Layers_Model.py** is the Keras implementation of DAE model. 68 | **Train_DAE_Models.py** takes a cancer type as input and trains 10 DAE models with different random seeds. 69 | **Get_DAE_IG_Attributions.py** is the script for running IG and get gene-level explanations for each of the nodes. It takes the cancer type and fold to get explanations for the DAE models for the given cancer type. 
70 | 71 | 72 | 73 | ### PART 3: TCGA SURVIVAL PREDICTIONS 74 | 75 | In **TCGA_SURVIVAL_PREDICTION** folder, all files and scripts are included for predicting TCGA expression survival. 76 | 77 | In folder **TCGA_DATA**, **TCGA_CLINICAL_DATA** folder includes clinical data for TCGA samples. **TCGA_MICROARRAY** folder includes microarray expression data and **TCGA_RNASEQ** folder includes RNA-Seq expression data. 78 | 79 | **STEP 1: Preprocessing Data** 80 | 81 | **CREATE_EMBEDDINGS** folder includes all scripts to generate TCGA RNA-Seq embeddings. 82 | 83 | The script **Example_Run_All.py** includes all commands for generating TCGA expression embeddings for one cancer type. 84 | 85 | **Preprocess_TCGA_Rnaseq_Expression.py** script takes the cancer type and TCGA cancer type as input and preprocesses the expression data to train models. 86 | 87 | **Create_TCGA_Rnaseq_PCs.py** script again takes the cancer type and TCGA cancer type as input and applies PCA to preprocessed expression to record top PCs to train deep learning models. 88 | 89 | **STEP 2: Encoding Expression with DeepProfile** 90 | 91 | **Encode_TCGA_Data_with_VAE.py** takes the preprocessed PCAed expression and encodes it using the already trained VAE models. The script takes cancer type, TCGA type, VAE dimension, start and end runs to encode the expression. 92 | 93 | **Create_All_VAE_Embeddings.py** takes cancer type and TCGA type as input and encodes TCGA expression with all trained VAE models. 94 | 95 | **Create_DeepProfile_TCGA_Embeddings.py** takes the cancer type and TCGA type as input and generates the DeepProfile embedding. The script loads in all the VAE embeddings and ensemble labels to generate an ensemble DeepProfile embedding. 96 | 97 | **STEP 3: Encoding Expression with Competitor Models** 98 | 99 | **Encode_TCGA_Data_with_PCA.py** takes the cancer type and TCGA type as input and generates the PCA embedding for TCGA RNA-Seq expressions. 
100 | 101 | **Encode_TCGA_Data_with_ICA.py** takes the cancer type and TCGA type as input and generates the ICA embedding for TCGA RNA-Seq expressions. 102 | 103 | **Encode_TCGA_Data_with_RP.py** takes the cancer type and TCGA type as input and generates the RP embedding for TCGA RNA-Seq expressions. 104 | 105 | **Encode_TCGA_Data_with_AE.py** takes the cancer type and TCGA type as input and generates the AE embedding for TCGA RNA-Seq expressions. 106 | 107 | **Encode_TCGA_Data_with_DAE.py** takes the cancer type and TCGA type as input and generates the DAE embedding for TCGA RNA-Seq expressions. 108 | 109 | **STEP 4: Generating Survival DataFrames** 110 | 111 | Folder **CREATE_SURVIVAL_DATAFRAMES** includes all scripts for generating survival data frames. 112 | 113 | **Create_TCGA_Survival_Dataframes.py** takes the cancer type and TCGA type as input and extracts the necessary fields from clinical data to define the survival dataframe. 114 | 115 | **Create_Joined_Survival_Dataframes.py** takes the cancer type and TCGA type as input and combines the DeepProfile RNA-Seq embeddings with survival data frames. 116 | 117 | **Create_Joined_Survival_Dataframes_Cancer_Types.py** combines data frames for cancer subtypes under the main cancer type. 118 | 119 | **STEP 5: Predicting Survival** 120 | 121 | Folder **PREDICT_SURVIVAL** contains scripts for predicting survival. 122 | 123 | **Predict_Survival.py** trains lasso regression models with subsampling taking the TCGA RNA-Seq embeddings as the input. 124 | 125 | **Predict_Survival_Subtypes_Joined.py** trains lasso regression models with subsampling taking the TCGA RNA-Seq embeddings as the input while joining multiple TCGA cancer types when there are multiple TCGA cancer subtypes corresponding to one major cancer type we have. 126 | 127 | **Run_Models.py** trains all prediction models for all models and cancer types. 
128 | 129 | **Plots_of_Survival_Prediction.ipynb** and **Plots_of_Survival_Prediction_VAEs.ipynb** are notebooks for generating plots comparing the survival predictions of models. 130 | 131 | **STEP 6: Comparing RNA-Seq and microarray DeepProfile embeddings** 132 | 133 | **COMPARING_RNASEQ_and_MICROARRAY** folder includes all scripts to generate TCGA microarray embeddings and to compare the embeddings with RNA-Seq embeddings. 134 | 135 | **Preprocess_TCGA_Microarray_Expression.py** script takes the cancer type and TCGA cancer type as input and preprocesses the expression data to train models. 136 | 137 | **Create_TCGA_Microarray_PCs.py** script again takes the cancer type and TCGA cancer type as input and applies PCA to preprocessed expression to record top PCs to train deep learning models. 138 | 139 | **Encode_TCGA_Microarray_Data_with_VAE.py** takes the preprocessed PCAed expression and encodes it using the already trained VAE models. The script takes cancer type, TCGA type, VAE dimension, start and end runs to encode the expression. 140 | 141 | **Create_DeepProfile_TCGA_Microarray_Embeddings.py** takes the cancer type and TCGA type as input and generates the DeepProfile embedding. The script loads in all the VAE embeddings and ensemble labels to generate an ensemble DeepProfile embedding. 142 | 143 | **Rnaseq_and_Microarray_Embedding_Correlation_Plots** notebook calculates correlation between RNA-Seq and microarray embeddings and generates plots. 144 | 145 | 146 | 147 | ### PART 4: PATHWAY ENRICHMENT TESTS 148 | 149 | In **PATHWAY_ANALYSIS** folder, the scripts and files for pathway analysis are included. 150 | 151 | In **MSIGDB_PATHWAYS** folder, the files for Molecular Signature Database pathways are included. 152 | 153 | **STEP 1: Running enrichment tests** 154 | 155 | **Create_Pathway_Matrices.py** is the script for creating binary pathway matrices for the genes that are present in the training datasets. 
It takes a cancer type and pathway type as input and creates a binary matrix of pathway overlaps. 156 | 157 | **Fishers_Test.py** is the script for running Fisher's test. It takes the cancer type, pathway type, the method name, and the range of runs and records uncorrected and FDR-corrected p-values. 158 | 159 | **Run_Multiple_Fishers_Test.py** is the script for running multiple tests consecutively. It takes the cancer type and pathway name as input and carries out enrichment tests for all methods. 160 | 161 | **STEP 2: Comparing pathway coverages** 162 | 163 | **PATHWAY_COVERAGE_ANALYSIS** folder includes all scripts for comparing pathway coverage of models. 164 | 165 | **Plot_of_Average_Pathway_Coverages** generates plots of average pathway coverage to compare DeepProfile and other dimensionality reduction methods. 166 | 167 | **Plot_of_Pathway_Coverage_Distributions** generates plots of distribution of pathway count of each node of DeepProfile and other dimensionality reduction methods. 168 | 169 | **Plot_of_Node_Level_Pathway_Annotations** generates plots of percent of nodes annotated by at least one pathway across multiple thresholds. 170 | 171 | **Plot_of_Pathway_Detection_Comparison_VAEs_vs_DeepProfile** creates plots for comparing pathways captured by DeepProfile vs individual VAE models. 172 | 173 | **Plot_of_Pathway_Percent_Comparison_VAEs_vs_DeepProfile** creates plots for comparing pathways captured by DeepProfile vs individual VAE models based on percentages. 174 | 175 | 176 | 177 | ### PART 5: NORMAL TISSUE ANALYSIS 178 | 179 | In **NORMAL_TISSUE_ANALYSIS** folder, the scripts for normal tissue analysis are included. 180 | 181 | The script **Example_Run_All.py** includes all commands for generating normal tissue expression embeddings for one cancer type. 182 | 183 | **Gtex_Tissue_Name_Mappings** is the notebook for mapping GTEX tissue names to cancer types we have. The GTEX expression data includes samples from many different tissues. 
We extract the GTEX sample names for each cancer type we have. 184 | 185 | **Preprocess_Gtex_Rnaseq_Expressions.py** is the script for creating preprocessed GTEX gene expression. It takes the cancer type as input and preprocesses the GTEX RNA-Seq expression using the same preprocessing steps applied to our training data. 186 | 187 | **Create_Gtex_Rnaseq_PCs.py** is the script for taking top PCs of the GTEX expression profiles to train DeepProfile model. It takes the cancer type as input and records the top PCs of GTEX expression. 188 | 189 | **Encode_GTEX_Data_with_VAE.py** is the script for encoding GTEX expression using trained VAE models. The inputs to the model are the cancer type, the number of latent nodes, and start and end runs. 190 | 191 | **Create_DeepProfile_GTEX_Embeddings.py** is the script for creating DeepProfile embedding using generated VAE embeddings. It takes the cancer type as input and records the final DeepProfile embedding for GTEX normal tissue samples. 192 | 193 | **Normal_Tissue_Classifier.py** is the script for training the classifier to separate normal vs cancer tissue embeddings. It takes the cancer type as input and records the bootstrapped classifier weights. 
###############################
#Script for creating TCGA microarray DeepProfile embeddings
#
#Usage: Create_DeepProfile_TCGA_Microarray_Embeddings.py CANCER TCGA_TYPE
#Joins the VAE encodings of all dimensions and runs side by side and averages
#the joined nodes that share an ensemble label into one DeepProfile node.
###############################

import pandas as pd
import numpy as np
import csv
import sys

#Read cancer type from user
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]

#VAE latent sizes and number of runs per size
dims = [5, 10, 25, 50, 75, 100]
run = 100

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'

#Read all VAE embeddings, ordered by dimension and then by run
embedding_prefix = input_folder+ 'TCGA_FILES/TCGA_' + tcga_type + '_MICROARRAY_Expression_VAE_encoded_'
data_list = []
for dim in dims:
    for i in range(run):
        embedding_file = embedding_prefix + str(dim) + 'L_' + str(i) + '.tsv'
        data_df = pd.read_table(embedding_file, index_col = 0)
        print("TCGA VAE embedding ", data_df.shape)
        data_list.append(data_df.values)

#Concatenate all embeddings column-wise (samples x all VAE nodes)
joined_data = np.concatenate(data_list, axis=1)
print("Joined VAE embedding ",joined_data.shape)

#Read DeepProfile ensemble labels (one label per joined VAE node)
L = 150
labels_file = input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt'
labels = pd.read_table(labels_file, header= None).values
print("DeepProfile ensemble labels ", len(labels))

#Create ensemble embedding: each ensemble node is the mean of its members
ensemble_embeddings = np.zeros((joined_data.shape[0], L))
for label in range(L):
    member_nodes = np.nonzero(labels == label)[0]
    ensemble_embeddings[:, label] = joined_data[:, member_nodes].mean(axis = 1)

#Record the ensemble embeddings, indexed by the samples of the last file read
print(ensemble_embeddings.shape)
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L))
ensemble_embeddings_df.to_csv(output_folder + tcga_type + '_DeepProfile_TCGA_MICROARRAY_Embedding_' + str(L) + 'L.tsv', sep = '\t')
#Define method for preprocessing data
def create_data(cancer_type, tcga_type, n_components = 500):
    """Project preprocessed TCGA microarray expression onto training-data PCs.

    Fits a PCA on the training expression matrix of ``cancer_type``, encodes
    the preprocessed TCGA microarray expression of ``tcga_type`` with it and
    saves the encoded samples as a TSV file in the cancer's ``TCGA_FILES``
    folder.

    Args:
        cancer_type: Cancer type name used for the folders and training file.
        tcga_type: TCGA cohort name used for the TCGA input/output files.
        n_components: Number of principal components to keep. It is also
            written into the output file name (``..._PCA_<n>L.tsv``). The
            default of 500 reproduces the original behaviour.
            NOTE(review): Encode_TCGA_Microarray_Data_with_VAE.py reads a
            ``_PCA_1000L.tsv`` file - confirm which component count the VAE
            models expect.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Apply PCA to training data (NaNs are replaced by 0 before fitting)
    training_data = np.nan_to_num(data_df.values)

    pca = PCA(n_components = n_components)
    pca.fit(training_data)
    components = pca.components_
    print("PCA components ", components.shape)

    #Read TCGA microarray expression
    tcga_df = pd.read_table(input_folder + 'TCGA_FILES/TCGA_' + tcga_type + '_PREPROCESSED_MICROARRAY_EXPRESSION.tsv', index_col= 0)
    print("TCGA expression ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))

    #Encode test data using trained PCA model
    test_data = tcga_df.values
    encoded_data = pca.transform(test_data)
    print("Encoded TCGA data ", encoded_data.shape)

    #Record expression data
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    encoded_df.to_csv(output_folder + 'TCGA_MICROARRAY_' + tcga_type + '_PCA_' + str(n_components) + 'L.tsv', sep = '\t')
###############################
#Script for encoding TCGA microarray expression using VAE models
#
#Usage: Encode_TCGA_Microarray_Data_with_VAE.py CANCER TCGA_NAME DIMENSION START END
#For every fold in [START, END) the saved encoder is loaded and used to
#encode the PCA-transformed TCGA microarray expression.
###############################

import os
import numpy as np
import pandas as pd

import math
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras

import csv
import sys
from keras.models import model_from_json
from sklearn import preprocessing

#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Method for defining the VAE loss
#Not called in this script; it reads the module-level original_dim, z_mean,
#z_log_var and beta defined with the placeholder model below.
def vae_loss(x_input, x_decoded):
    reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss))


#Read user inputs
cancer = sys.argv[1]
tcga_name = sys.argv[2]
dimension = int(sys.argv[3])
start = int(sys.argv[4])
end = int(sys.argv[5])

print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/'

#Read input data
#NOTE(review): Create_TCGA_Microarray_PCs.py writes a '_PCA_500L.tsv' file by
#default, while a '_PCA_1000L.tsv' file is read here - confirm the file name.
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_MICROARRAY_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)
print("TCGA expression dataframe ", input_df_test.shape)

#Define placeholder VAE model
#These definitions do not depend on the fold, so they are built once here
#instead of once per fold as in the original loop. They are not used for the
#encoding itself, which relies only on the loaded encoder.
original_dim = input_df_test.shape[1]
intermediate1_dim = 100
intermediate2_dim = 25
latent_dim = dimension

batch_size = 50
epochs = 50
learning_rate = 0.0005
beta = K.variable(1)
kappa = 0
init_mode = 'glorot_uniform'

x = Input(shape=(original_dim, ))

net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation('relu')(net5)

z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)

adam = optimizers.Adam(lr=learning_rate)

#Encode the TCGA expression with the VAE model of every fold
for fold in range(start, end):
    print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load VAE models: architecture from JSON, weights from HDF5
    model_prefix = input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)
    with open(model_prefix + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(model_prefix + '.h5')
    print("Loaded model from disk")

    #Encode test data using the VAE model
    test_encoded = encoder.predict(input_df_test, batch_size = batch_size)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_MICROARRAY_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
batch_size = batch_size) 100 | test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index) 101 | test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_MICROARRAY_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t') 102 | 103 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Preprocess_TCGA_Microarray_Expression.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for preprocessing TCGA microarray expression 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | import sklearn.preprocessing 9 | 10 | #Define method for preprocessing data 11 | def create_data(cancer_type, tcga_type): 12 | 13 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 14 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/' 15 | 16 | #Read TCGA microarray expression (median expression table) 17 | tcga_df = pd.read_csv('../TCGA_DATA/TCGA_MICROARRAY/' + tcga_type + '.medianexp.txt', sep = '\t', index_col= 0) 18 | tcga_df = tcga_df.transpose() 19 | tcga_df = tcga_df.iloc[:, 1:] 20 | tcga_df = tcga_df.astype(float) 21 | 22 | tcga_df = tcga_df.fillna(tcga_df.mean().fillna(0)) 23 | print("TCGA expression ", tcga_df.shape) 24 | print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) )) 25 | print("TCGA expression mean ", np.mean(tcga_df.values, axis = 0)) 26 | print("TCGA expression mean ", len(np.mean(tcga_df.values, axis = 0))) 27 | print("TCGA expression std ", np.std(tcga_df.values, axis = 0)) 28 | print("TCGA expression std ", len(np.std(tcga_df.values, axis = 0))) 29 | 30 | new_index = [s[:15] for s in tcga_df.index] 31 | tcga_df = pd.DataFrame(tcga_df.values, index = new_index, columns = tcga_df.columns) 32 | print(tcga_df) 33 | 34 | #Eliminate normal samples 35 | print("Eliminating normal samples..") 36 | 
sample_codes = [s[-2:] for s in tcga_df.index] 37 | print("Sample codes ", np.unique(sample_codes)) 38 | normal_codes = [s[-2] for s in tcga_df.index] 39 | cancer_samples = np.where(np.asarray(normal_codes) == '0')[0] 40 | print("Total number of samples ", len(tcga_df.index)) 41 | print("Total number of cancer samples ", len(cancer_samples)) 42 | tcga_df = tcga_df.iloc[cancer_samples, :] 43 | print("TCGA expression ", tcga_df.shape) 44 | print("TCGA expression cancer samples ", tcga_df.index) 45 | 46 | #Read training data 47 | data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0) 48 | print("Training data ", data_df.shape) 49 | 50 | #Get only training genes from the expression data 51 | joined_df = pd.concat([data_df, tcga_df], join = 'outer') 52 | joined_df = joined_df[data_df.columns] 53 | joined_df = joined_df.iloc[-1 * tcga_df.shape[0]:, :] 54 | joined_df = joined_df.fillna(joined_df.mean().fillna(0)) 55 | print("TCGA expression ", joined_df.shape) 56 | 57 | #Standardize each column to zero mean and unit variance 58 | normalized_data = sklearn.preprocessing.scale(joined_df.values) 59 | print("TCGA expression mean ", np.mean(normalized_data, axis = 0)) 60 | print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0))) 61 | print("TCGA expression std ", np.std(normalized_data, axis = 0)) 62 | print("TCGA expression std ", len(np.std(normalized_data, axis = 0))) 63 | 64 | #Record joined dataframe 65 | joined_df = pd.DataFrame(normalized_data, index = joined_df.index, columns = joined_df.columns) 66 | print("Final dataframe ", joined_df.shape) 67 | print('RANGE: ', (np.max(joined_df.values) - np.min(joined_df.values) )) 68 | 69 | #Record expression data 70 | joined_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_MICROARRAY_EXPRESSION.tsv', sep = '\t') 71 | print(joined_df) 72 | 73 | import sys 74 | cancer_type = sys.argv[1] 75 | tcga_type = sys.argv[2] 76 | 
create_data(cancer_type, tcga_type) 77 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_All_VAE_Embeddings.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | cancer_type = sys.argv[1] 4 | tcga_type = sys.argv[2] 5 | 6 | dims = [5, 10, 25, 50, 75, 100] 7 | for dim in dims: 8 | get_ipython().magic(u"run -i 'Encode_TCGA_Data_with_VAE.py' '" + cancer_type + "' " + tcga_type + " " + str(dim) + " " + str(0) + " " + str(100)) -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_DeepProfile_TCGA_Embeddings.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for creating TCGA RNA-Seq DeepProfile embeddings 3 | ############################### 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import csv 8 | import sys 9 | 10 | #Read cancer type from user 11 | cancer_type = sys.argv[1] 12 | tcga_type = sys.argv[2] 13 | 14 | #Read all VAE embeddings 15 | dims = [5, 10, 25, 50, 75, 100] 16 | run = 100 17 | 18 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 19 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/' 20 | 21 | data_list = [] 22 | for dim in dims: 23 | for i in range(run): 24 | data_df = pd.read_table(input_folder + 'TCGA_FILES/TCGA_' + tcga_type + '_RNASeq_Expression_VAE_encoded_' + str(dim) + 'L_' + str(i) + '.tsv', index_col = 0) 25 | print("TCGA VAE embedding ", data_df.shape) 26 | data_list.append(data_df.values) 27 | 28 | #Concatenate all embeddings 29 | joined_data = np.concatenate(data_list, axis=1) 30 | print("Joined VAE embedding ",joined_data.shape) 31 | 32 | #Read DeepProfile ensemble labels 33 | L = 150 34 | labels_df = pd.read_table( input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 
'L.txt', header= None) 35 | labels = labels_df.values 36 | print("DeepProfile ensemble labels ", len(labels)) 37 | 38 | #Create ensemble embedding 39 | ensemble_embeddings = np.zeros((joined_data.shape[0], L)) 40 | for label in range(L): 41 | indices = np.where(labels == label)[0] 42 | average_values = np.mean(joined_data[:, indices], axis = 1) 43 | ensemble_embeddings[:, label] = average_values 44 | 45 | #Record the ensemble embeddings 46 | print(ensemble_embeddings.shape) 47 | ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L)) 48 | ensemble_embeddings_df.to_csv(output_folder + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_' + str(L) + 'L.tsv', sep = '\t') -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_TCGA_Rnaseq_PCs.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for recording top PCs of TCGA RNA-Seq data 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | from sklearn.decomposition import PCA 9 | import sklearn.preprocessing 10 | 11 | #Define method for preprocessing data 12 | def create_data(cancer_type, tcga_type): 13 | 14 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 15 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/' 16 | 17 | #Read training data 18 | data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0) 19 | print("Training data ", data_df.shape) 20 | 21 | #Apply PCA to training data 22 | training_data = data_df.values 23 | training_data = np.nan_to_num(training_data) 24 | 25 | pca = PCA(n_components = 1000) 26 | pca.fit(training_data) 27 | components = pca.components_ 28 | print("PCA components ", components.shape) 29 | 30 | #Read TCGA RNA-Seq expression 31 | 
tcga_df = pd.read_table(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0) 32 | print("TCGA expression ", tcga_df.shape) 33 | print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) )) 34 | 35 | #Encode test data using trained PCA model 36 | test_data = tcga_df.values 37 | encoded_data = pca.transform(test_data) 38 | print("Encoded TCGA data ", encoded_data.shape) 39 | 40 | #Record expression data 41 | encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index) 42 | encoded_df.to_csv(output_folder + 'TCGA_RNASEQ_' + tcga_type + '_PCA_1000L.tsv', sep = '\t') 43 | 44 | 45 | import sys 46 | cancer_type = sys.argv[1] 47 | tcga_type = sys.argv[2] 48 | create_data(cancer_type, tcga_type) 49 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_AE.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for creating TCGA RNA-Seq AE embeddings 3 | ############################### 4 | 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import math 10 | from sklearn.metrics import mean_squared_error 11 | import matplotlib.pyplot as plt 12 | from keras.models import model_from_json 13 | from sklearn import preprocessing 14 | 15 | import tensorflow as tf 16 | from keras.layers import Input, Dense, Lambda, Layer, Activation 17 | from keras.layers.normalization import BatchNormalization 18 | from keras.models import Model 19 | from keras import backend as K 20 | from keras import metrics, optimizers 21 | from keras.callbacks import Callback 22 | import keras 23 | 24 | import csv 25 | import sys 26 | 27 | #Prevent tensorflow from using all the memory 28 | config = tf.ConfigProto() 29 | config.gpu_options.allow_growth=True 30 | sess = tf.Session(config=config) 31 | 32 | #Define reconstruction loss 33 | def reconstruction_loss(x_input, 
x_decoded): 34 | return metrics.mse(x_input, x_decoded) 35 | 36 | #Read user inputs 37 | import sys 38 | cancer = sys.argv[1] 39 | tcga_name = sys.argv[2] 40 | print("CANCER NAME: " + cancer) 41 | print("TEST NAME: " + tcga_name) 42 | 43 | input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' 44 | output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/' 45 | 46 | start = 0 47 | end = 10 48 | dimension = 150 49 | 50 | #Read TCGA RNA-Seq input data 51 | input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0) 52 | print("RNA-Seq expression dataframe ", input_df_test.shape) 53 | 54 | #Encode test data with all 10 AE models 55 | for fold in range(start, end): 56 | print("AE model with " + str(dimension) + " nodes and fold " + str(fold)) 57 | 58 | #Load AE models 59 | json_file = open(input_folder + 'AE_FILES/AE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.json', 'r') 60 | loaded_model_json = json_file.read() 61 | json_file.close() 62 | encoder = model_from_json(loaded_model_json) 63 | 64 | encoder.load_weights(input_folder + 'AE_FILES/AE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.h5') 65 | print("Loaded model from disk") 66 | 67 | # Encode test data using the AE model 68 | test_encoded = encoder.predict(input_df_test) 69 | test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index) 70 | print("Encoded data ", test_encoded_df.shape) 71 | test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_AE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t') 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_DAE.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for creating TCGA RNA-Seq DAE embeddings 3 | 
############################### 4 | 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import math 10 | from sklearn.metrics import mean_squared_error 11 | import matplotlib.pyplot as plt 12 | from keras.models import model_from_json 13 | from sklearn import preprocessing 14 | 15 | import tensorflow as tf 16 | from keras.layers import Input, Dense, Lambda, Layer, Activation 17 | from keras.layers.normalization import BatchNormalization 18 | from keras.models import Model 19 | from keras import backend as K 20 | from keras import metrics, optimizers 21 | from keras.callbacks import Callback 22 | import keras 23 | 24 | import csv 25 | import sys 26 | 27 | #Prevent tensorflow from using all the memory 28 | config = tf.ConfigProto() 29 | config.gpu_options.allow_growth=True 30 | sess = tf.Session(config=config) 31 | 32 | #Define reconstruction loss 33 | def reconstruction_loss(x_input, x_decoded): 34 | return metrics.mse(x_input, x_decoded) 35 | 36 | #Read user inputs 37 | import sys 38 | cancer = sys.argv[1] 39 | tcga_name = sys.argv[2] 40 | print("CANCER NAME: " + cancer) 41 | print("TEST NAME: " + tcga_name) 42 | 43 | input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' 44 | output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/' 45 | 46 | start = 0 47 | end = 10 48 | dimension = 150 49 | 50 | #Read TCGA RNA-Seq input data 51 | input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0) 52 | print("RNA-Seq expression dataframe ", input_df_test.shape) 53 | 54 | #Encode test data with all 10 DAE models 55 | for fold in range(start, end): 56 | print("DAE model with " + str(dimension) + " nodes and fold " + str(fold)) 57 | 58 | #Load DAE models 59 | json_file = open(input_folder + 'DAE_FILES/DAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.json', 'r') 60 | loaded_model_json = json_file.read() 61 | json_file.close() 62 | encoder = 
model_from_json(loaded_model_json) 63 | 64 | encoder.load_weights(input_folder + 'DAE_FILES/DAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.h5') 65 | print("Loaded model from disk") 66 | 67 | # Encode test data using the DAE model 68 | test_encoded = encoder.predict(input_df_test) 69 | test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index) 70 | print("Encoded data ", test_encoded_df.shape) 71 | test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_DAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t') 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_ICA.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for creating TCGA RNA-Seq ICA embeddings 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | from sklearn.decomposition import FastICA 9 | import sklearn.preprocessing 10 | from scipy.stats.mstats import winsorize 11 | 12 | #Read cancer type and TCGA type 13 | import sys 14 | cancer_type = sys.argv[1] 15 | tcga_type = sys.argv[2] 16 | print("CANCER NAME: " + cancer_type) 17 | print("TEST NAME: " + tcga_type) 18 | 19 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 20 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/' 21 | 22 | #Read training data 23 | data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0) 24 | print("Training data ", data_df.shape) 25 | training_data = data_df.values 26 | training_data = np.nan_to_num(training_data) 27 | 28 | #Read TCGA RNA-Seq expression data 29 | tcga_df = pd.read_table(input_folder + 'TCGA_FILES/TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0) 30 | print("TCGA data 
", tcga_df.shape) 31 | test_data = tcga_df.values 32 | 33 | #Train all ICA models 34 | for run in range(10): 35 | #Train model 36 | ica = FastICA(n_components = 150, random_state = 12345 * run, tol=0.001, max_iter = 100000) 37 | print(ica) 38 | ica.fit(training_data) 39 | components = ica.components_ 40 | print("ICA components ", components.shape) 41 | 42 | #Encode RNA-Seq data 43 | encoded_data = ica.transform(test_data) 44 | print("Encoded TCGA data ", encoded_data.shape) 45 | encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index) 46 | encoded_df.to_csv(output_folder + 'TCGA_RNASEQ_' + tcga_type + '_ICA_150L_' + str(run + 1) + '.tsv', sep = '\t') 47 | 48 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_PCA.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for creating TCGA RNA-Seq PCA embeddings 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | from sklearn.decomposition import PCA 9 | import sklearn.preprocessing 10 | from scipy.stats.mstats import winsorize 11 | import sys 12 | 13 | #Read cancer type and TCGA type 14 | cancer_type = sys.argv[1] 15 | tcga_type = sys.argv[2] 16 | print("CANCER NAME: " + cancer_type) 17 | print("TEST NAME: " + tcga_type) 18 | 19 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 20 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/' 21 | 22 | #Read training data 23 | data_df = pd.read_table( input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0) 24 | print("Training data ", data_df.shape) 25 | training_data = data_df.values 26 | training_data = np.nan_to_num(training_data) 27 | 28 | #Train PCA model 29 | pca = PCA(n_components = 150) 30 | pca.fit(training_data) 31 | components = pca.components_ 32 | 
print("PCA components ", components.shape) 33 | 34 | #Read TCGA RNA-Seq expression data 35 | tcga_df = pd.read_table(output_folder+ '/TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0) 36 | print("TCGA data ", tcga_df.shape) 37 | 38 | #Encode TCGA data with PCA model 39 | test_data = tcga_df.values 40 | encoded_data = pca.transform(test_data) 41 | print("Encoded TCGA data ", encoded_data.shape) 42 | 43 | encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index) 44 | encoded_df.to_csv(output_folder + '/TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv', sep = '\t') 45 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_RP.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for creating TCGA RNA-Seq RP embeddings 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | from sklearn.random_projection import GaussianRandomProjection 9 | import sklearn.preprocessing 10 | from scipy.stats.mstats import winsorize 11 | 12 | #Read cancer type and TCGA type 13 | import sys 14 | cancer_type = sys.argv[1] 15 | tcga_type = sys.argv[2] 16 | print("CANCER NAME: " + cancer_type) 17 | print("TEST NAME: " + tcga_type) 18 | 19 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 20 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/' 21 | 22 | #Read training data 23 | data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0) 24 | print("Training data ", data_df.shape) 25 | training_data = data_df.values 26 | training_data = np.nan_to_num(training_data) 27 | 28 | #Read TCGA RNA-Seq expression data 29 | tcga_df = pd.read_table(input_folder + 'TCGA_FILES/TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0) 30 | print("TCGA 
data ", tcga_df.shape) 31 | test_data = tcga_df.values 32 | 33 | #Train all models 34 | for run in range(10): 35 | #Train model 36 | transformer = GaussianRandomProjection(n_components = 150, random_state = run * 12345) 37 | transformer.fit(training_data) 38 | components = transformer.components_ 39 | print("RP components ", components.shape) 40 | 41 | #Save the encoded data 42 | encoded_data = transformer.transform(test_data) 43 | print("Encoded TCGA data ", encoded_data.shape) 44 | encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index) 45 | encoded_df.to_csv(output_folder + 'TCGA_RNASEQ_' + tcga_type + '_RP_150L_' + str(run + 1) + '.tsv', sep = '\t') 46 | 47 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_VAE.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for encoding TCGA RNA-Seq expression using VAE models 3 | ############################### 4 | 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import math 10 | from sklearn.metrics import mean_squared_error 11 | import matplotlib.pyplot as plt 12 | 13 | import tensorflow as tf 14 | from keras.layers import Input, Dense, Lambda, Layer, Activation 15 | from keras.layers.normalization import BatchNormalization 16 | from keras.models import Model 17 | from keras import backend as K 18 | from keras import metrics, optimizers 19 | from keras.callbacks import Callback 20 | import keras 21 | 22 | import csv 23 | import sys 24 | from keras.models import model_from_json 25 | from sklearn import preprocessing 26 | 27 | #Prevent tensorflow from using all the memory 28 | config = tf.ConfigProto() 29 | config.gpu_options.allow_growth=True 30 | sess = tf.Session(config=config) 31 | 32 | #Method for defining the VAE loss 33 | def vae_loss(x_input, x_decoded): 34 | reconstruction_loss = original_dim * 
metrics.mse(x_input, x_decoded) 35 | kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 36 | return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss)) 37 | 38 | 39 | #Read user inputs 40 | import sys 41 | cancer = sys.argv[1] 42 | tcga_name = sys.argv[2] 43 | dimension = int(sys.argv[3]) 44 | start = int(sys.argv[4]) 45 | end = int(sys.argv[5]) 46 | 47 | print("CANCER NAME: " + cancer) 48 | print("TEST NAME: " + tcga_name) 49 | 50 | input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' 51 | output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/' 52 | 53 | #Read input data 54 | input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0) 55 | print("TCGA expression dataframe ", input_df_test.shape) 56 | 57 | #Encode the TCGA RNA-Seq data with the VAE encoder of every fold in [start, end) 58 | for fold in range(start, end): 59 | print("VAE model with " + str(dimension) + " nodes and fold " + str(fold)) 60 | 61 | #Load VAE models 62 | json_file = open(input_folder + '/VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.json', 'r') 63 | loaded_model_json = json_file.read() 64 | json_file.close() 65 | encoder = model_from_json(loaded_model_json) 66 | 67 | encoder.load_weights(input_folder + '/VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.h5') 68 | print("Loaded model from disk") 69 | 70 | #Encode test data using the VAE model 71 | test_encoded = encoder.predict(input_df_test) 72 | test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index) 73 | test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t') 74 | 75 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Example_Run_All.py: -------------------------------------------------------------------------------- 1 | 
############################### 2 | #Example for training TCGA models for a cancer type 3 | ############################### 4 | import sys 5 | 6 | ##STEP 1: Preprocessing Data 7 | get_ipython().magic(u"run -i Preprocess_TCGA_Rnaseq_Expression.py BRCA BRCA") 8 | get_ipython().magic(u"run -i Create_TCGA_Rnaseq_PCs.py BRCA BRCA") 9 | 10 | ##STEP 2: Encoding Expression with DeepProfile 11 | get_ipython().magic(u"run -i Create_All_VAE_Embeddings.py BRCA BRCA") 12 | get_ipython().magic(u"run -i Create_DeepProfile_TCGA_Embeddings.py BRCA BRCA") 13 | 14 | ##STEP 3: Encoding Expression with Competitor Models 15 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_PCA.py BRCA BRCA") 16 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_ICA.py BRCA BRCA") 17 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_RP.py BRCA BRCA") 18 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_AE.py BRCA BRCA") 19 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_DAE.py BRCA BRCA") -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for preprocessing TCGA RNA-Seq expression 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import sklearn.preprocessing 8 | 9 | #Define method for preprocessing data 10 | def create_data(cancer_type, tcga_type): 11 | 12 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 13 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/' 14 | 15 | #Read TCGA RNA-Seq expression 16 | print("Reading expression...") 17 | tcga_df = pd.read_table('../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt', sep = '\t', index_col= 0) 18 | tcga_df = tcga_df.transpose() 19 | print("TCGA expression ", tcga_df.shape) 20 | print("TCGA 
expression ", tcga_df.head()) 21 | 22 | #Map to gene names and eliminate unknown genes 23 | print("Correcting gene names...") 24 | gene_names = tcga_df.columns 25 | gene_names = [n[:n.index('|')] for n in gene_names] 26 | tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = gene_names) 27 | 28 | #Eliminate unknown genes 29 | tcga_df = tcga_df.iloc[:, tcga_df.columns.values != '?'] 30 | tcga_df = tcga_df.loc[:,~tcga_df.columns.duplicated()] 31 | print("TCGA expression ", tcga_df.shape) 32 | print("TCGA expression ", tcga_df) 33 | 34 | #Eliminate normal samples 35 | print("Eliminating normal samples..") 36 | sample_codes = [s[-2:] for s in tcga_df.index] 37 | print("Sample codes ", np.unique(sample_codes)) 38 | normal_codes = [s[-2] for s in tcga_df.index] 39 | cancer_samples = np.where(np.asarray(normal_codes) == '0')[0] 40 | print("Total number of samples ", len(tcga_df.index)) 41 | print("Total number of cancer samples ", len(cancer_samples)) 42 | tcga_df = tcga_df.iloc[cancer_samples, :] 43 | print("TCGA expression ", tcga_df.shape) 44 | print("TCGA expression cancer samples ", tcga_df.index) 45 | 46 | #Read training data 47 | print("Combining with training data...") 48 | data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0) 49 | print("Training data ", data_df.shape) 50 | 51 | #Get only training genes from the expression data 52 | joined_df = pd.concat([data_df, tcga_df], join = 'outer') 53 | joined_df = joined_df[data_df.columns] 54 | joined_df = joined_df.iloc[-1 * tcga_df.shape[0]:, :] 55 | joined_df = joined_df.fillna(joined_df.mean().fillna(0)) 56 | print("TCGA expression ", joined_df.shape) 57 | print("TCGA expression ", joined_df.head()) 58 | 59 | #Standardize each column to zero mean and unit variance 60 | print("Standardizing the data...") 61 | scaled_expression_values = joined_df.values 62 | normalized_data = sklearn.preprocessing.scale(scaled_expression_values) 63 | 
print("TCGA expression mean ", np.mean(normalized_data, axis = 0)) 64 | print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0))) 65 | print("TCGA expression std ", np.std(normalized_data, axis = 0)) 66 | print("TCGA expression std ", len(np.std(normalized_data, axis = 0))) 67 | 68 | #Record joined dataframe 69 | joined_df = pd.DataFrame(normalized_data, index = joined_df.index, columns = joined_df.columns) 70 | print("Final dataframe ", joined_df.shape) 71 | print('RANGE: ', (np.max(joined_df.values) - np.min(joined_df.values) )) 72 | 73 | #Record expression data 74 | joined_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t') 75 | 76 | 77 | import sys 78 | cancer_type = sys.argv[1] 79 | tcga_type = sys.argv[2] 80 | create_data(cancer_type, tcga_type) 81 | -------------------------------------------------------------------------------- /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression_All_Genes.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for preprocessing TCGA RNA-Seq expression 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import sklearn.preprocessing 8 | 9 | #Define method for preprocessing data 10 | def create_data(cancer_type, tcga_type): 11 | 12 | input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' 13 | output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/' 14 | 15 | #Read TCGA RNA-Seq expression 16 | print("Reading expression...") 17 | tcga_df = pd.read_table('../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt', sep = '\t', index_col= 0) 18 | tcga_df = tcga_df.transpose() 19 | print("TCGA expression ", tcga_df.shape) 20 | print("TCGA expression ", tcga_df.head()) 21 | 22 | #Map to gene names and eliminate unknown genes 23 | print("Correcting gene names...") 24 | gene_names = 
tcga_df.columns 25 | gene_names = [n[:n.index('|')] for n in gene_names] 26 | tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = gene_names) 27 | 28 | #Eliminate unknown genes 29 | tcga_df = tcga_df.iloc[:, tcga_df.columns.values != '?'] 30 | tcga_df = tcga_df.loc[:,~tcga_df.columns.duplicated()] 31 | print("TCGA expression ", tcga_df.shape) 32 | print("TCGA expression ", tcga_df) 33 | 34 | #Eliminate normal samples 35 | print("Eliminating normal samples..") 36 | sample_codes = [s[-2:] for s in tcga_df.index] 37 | print("Sample codes ", np.unique(sample_codes)) 38 | normal_codes = [s[-2] for s in tcga_df.index] 39 | cancer_samples = np.where(np.asarray(normal_codes) == '0')[0] 40 | print("Total number of samples ", len(tcga_df.index)) 41 | print("Total number of cancer samples ", len(cancer_samples)) 42 | tcga_df = tcga_df.iloc[cancer_samples, :] 43 | print("TCGA expression ", tcga_df.shape) 44 | print("TCGA expression cancer samples ", tcga_df.index) 45 | 46 | #Standardize each column to zero mean and unit variance 47 | print("Standardizing the data...") 48 | scaled_expression_values = tcga_df.values 49 | normalized_data = sklearn.preprocessing.scale(scaled_expression_values) 50 | print("TCGA expression mean ", np.mean(normalized_data, axis = 0)) 51 | print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0))) 52 | print("TCGA expression std ", np.std(normalized_data, axis = 0)) 53 | print("TCGA expression std ", len(np.std(normalized_data, axis = 0))) 54 | 55 | #Record joined dataframe 56 | tcga_df = pd.DataFrame(normalized_data, index = tcga_df.index, columns = tcga_df.columns) 57 | tcga_df = tcga_df.fillna(tcga_df.mean().fillna(0)) 58 | print("Final dataframe ", tcga_df.shape) 59 | print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) )) 60 | 61 | #Record expression data 62 | tcga_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_ALL_GENES_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t') 63 | print(tcga_df) 64 | 65 | import sys 
#Run the preprocessing only when this file is executed as a script
#(argv[1] -> cancer_type, argv[2] -> tcga_type)
if __name__ == "__main__":
    cancer_type = sys.argv[1]
    tcga_type = sys.argv[2]
    create_data(cancer_type, tcga_type)

# --------------------------------------------------------------------------------
# /TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression_All_Genes_Uncorrected.py:
# --------------------------------------------------------------------------------
###############################
#Script for preprocessing TCGA RNA-Seq expression
###############################

import sys

import numpy as np
import pandas as pd
import sklearn.preprocessing


#Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Export the TCGA RNA-Seq expression of one cohort without standardization.

    Reads ``<tcga_type>.uncv2.mRNAseq_RSEM_normalized_log2.txt``, transposes it,
    shortens every column name to the part before ``'|'``, removes columns
    named ``'?'`` and repeated column names (first occurrence kept), keeps only
    rows whose ID has ``'0'`` as its second-to-last character, fills missing
    values with the column mean (0 when the whole column is missing) and writes
    the result as a tab-separated file under the cancer's ``TCGA_FILES`` folder.

    Args:
        cancer_type: Name of the folder below ``ALL_CANCER_FILES``.
        tcga_type: TCGA cohort code used in the input and output file names.
    """
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read TCGA RNA-Seq expression
    print("Reading expression...")
    tcga_df = pd.read_table('../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt', sep = '\t', index_col= 0)
    tcga_df = tcga_df.transpose()
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df.head())

    #Map to gene names and eliminate unknown genes
    print("Correcting gene names...")
    #split() keeps names that carry no '|' intact instead of raising ValueError
    gene_names = [name.split('|', 1)[0] for name in tcga_df.columns]
    tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = gene_names)

    #Eliminate unknown genes
    tcga_df = tcga_df.iloc[:, tcga_df.columns.values != '?']
    tcga_df = tcga_df.loc[:, ~tcga_df.columns.duplicated()]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df)

    #Eliminate normal samples
    print("Eliminating normal samples..")
    sample_codes = [sample[-2:] for sample in tcga_df.index]
    print("Sample codes ", np.unique(sample_codes))
    #A sample is kept when the first digit of its two-character code is '0'
    first_code_digits = np.asarray([sample[-2] for sample in tcga_df.index])
    cancer_samples = np.where(first_code_digits == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)

    #This "uncorrected" variant deliberately keeps the raw values: the message
    #is the same as in the standardizing script, but no scaling is applied here
    print("Standardizing the data...")
    normalized_data = tcga_df.values
    column_means = np.mean(normalized_data, axis = 0)
    column_stds = np.std(normalized_data, axis = 0)
    print("TCGA expression mean ", column_means)
    print("TCGA expression mean ", len(column_means))
    print("TCGA expression std ", column_stds)
    print("TCGA expression std ", len(column_stds))

    #Record joined dataframe
    tcga_df = pd.DataFrame(normalized_data, index = tcga_df.index, columns = tcga_df.columns)
    #Missing values become the gene mean, or 0 when the gene has no value at all
    tcga_df = tcga_df.fillna(tcga_df.mean().fillna(0))
    print("Final dataframe ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))

    #Record expression data
    tcga_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_ALL_GENES_NOT_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
    print(tcga_df)


if __name__ == "__main__":
    cancer_type = sys.argv[1]
    tcga_type = sys.argv[2]
    create_data(cancer_type, tcga_type)

# --------------------------------------------------------------------------------
# /TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_Joined_Survival_Dataframes.py:
# --------------------------------------------------------------------------------
###############################
#Script for creating joined TCGA survival dataframes and DeepProfile embeddings
###############################

import pandas as pd
import numpy as np
import sys


def createJoinedDf(tcga_type, cancer_type):
    """Join the survival table of a cohort with its DeepProfile embedding.

    The survival table (comma-separated, columns ``Status`` and
    ``Survival_in_days``) is cleaned of rows containing NaN or a non-positive
    survival time and its columns are renamed to ``fustat`` / ``futime``.
    Survival IDs are upper-cased, embedding IDs are cut to their first 12
    characters, and both tables are inner-joined on those IDs. When an ID
    occurs several times only its first row (after sorting) is kept. The result
    is written as ``DeepProfile_Embedding_and_<tcga_type>_Survival_df.tsv``.

    Args:
        tcga_type: TCGA cohort code used in the file names.
        cancer_type: Name of the folder below ``ALL_CANCER_FILES``.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read survival dataframe
    surv_df = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = ',')
    surv_df = surv_df.astype(float)
    print("Survival dataframe ", surv_df.shape)

    #Drop nan samples and samples without a positive survival time
    rows_with_nan = np.where(np.isnan(surv_df.values))[0]
    rows_non_positive = np.where(surv_df['Survival_in_days'].values <= 0)[0]
    rows_to_drop = np.unique(np.concatenate((rows_with_nan, rows_non_positive)))
    surv_df = surv_df.drop(surv_df.index[rows_to_drop])
    surv_df = pd.DataFrame(surv_df.values, index = surv_df.index, columns = ['fustat', 'futime'])
    print("Survival dataframe ", surv_df.shape)

    #Read DeepProfile embedding
    data_df = pd.read_table(input_folder + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv', index_col = 0)
    print("DeepProfile embedding ", data_df.shape)

    #Match sample indices
    print("Surv samples ", surv_df.index)
    print("Data samples ", data_df.index)

    surv_df.index = [sample.upper() for sample in surv_df.index]

    #Name the nodes after the actual embedding width instead of assuming 150
    node_names = ['Node ' + str(i) for i in range(1, data_df.shape[1] + 1)]
    patient_ids = [sample[:12] for sample in data_df.index]
    data_df = pd.DataFrame(data_df.values, index = patient_ids, columns = node_names)

    print("Surv samples ", surv_df.index)
    print("Data samples ", data_df.index)

    #Create joined dataframe (inner join keeps samples available in both)
    joined_df = data_df.merge(surv_df, left_index=True, right_index=True)
    joined_df = joined_df.sort_index()
    joined_df = joined_df.loc[~joined_df.index.duplicated(keep='first')]
    print("Joined dataframe ", joined_df.shape)
    print(joined_df)
    joined_df.to_csv(output_folder + '/DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', sep = '\t')


#Read cancer types
if __name__ == "__main__":
    cancer_type = sys.argv[1]
    tcga_type = sys.argv[2]

    createJoinedDf(tcga_type, cancer_type)

# --------------------------------------------------------------------------------
# /TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_Joined_Survival_Dataframes_Cancer_Types.py:
# --------------------------------------------------------------------------------
###############################
#Script for creating joined cancer types TCGA survival dataframes
###############################

import pandas as pd
import numpy as np


#Method for combining datasets
def create_Data(cancer):
    """Stack the joined embedding/survival tables of all subtypes of a cancer.

    The subtypes are looked up in the module-level ``cancer_types`` /
    ``test_cases`` lists. Every per-subtype
    ``DeepProfile_Embedding_and_<subtype>_Survival_df.tsv`` file is read and
    the rows are concatenated into
    ``DeepProfile_Embedding_and_<cancer>_Survival_df.tsv``.

    Args:
        cancer: Entry of ``cancer_types``; also the folder name.

    Raises:
        ValueError: If ``cancer`` is not listed in ``cancer_types``.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'

    subtypes = test_cases[cancer_types.index(cancer)]
    df_list = []
    for test in subtypes:
        print("TCGA type ", test)
        surv_df = pd.read_table(input_folder + '/DeepProfile_Embedding_and_' + test + '_Survival_df.tsv', sep = '\t', index_col = 0)
        print("Survival dataframe ", surv_df.shape)
        df_list.append(surv_df)

    #Combine dataframes
    joined_df = pd.concat(df_list)
    print("Joined survival dataframe ", joined_df.shape)
    joined_df.to_csv(output_folder + '/DeepProfile_Embedding_and_' + cancer + '_Survival_df.tsv', sep = '\t')


#Cancer types and, at the same position, the TCGA subtypes they consist of
cancer_types = ['LUNG']
test_cases = [ ['LUAD', 'LUSC']]

if __name__ == "__main__":
    for cancer in cancer_types:
        print("Cancer type ", cancer)
        create_Data(cancer)

#Method for combining survival dataframes
def create_Data(cancer):
    """Stack the raw survival tables of all subtypes of a cancer.

    The subtypes are looked up in the module-level ``cancer_types`` /
    ``test_cases`` lists. Every per-subtype ``TCGA_<subtype>_Survival_df.tsv``
    (comma-separated) is read and the rows are concatenated into
    ``TCGA_<cancer>_Survival_df.tsv``.

    Args:
        cancer: Entry of ``cancer_types``; also the folder name.

    Raises:
        ValueError: If ``cancer`` is not listed in ``cancer_types``.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'

    subtypes = test_cases[cancer_types.index(cancer)]
    df_list = []
    for test in subtypes:
        print("TCGA type ", test)
        surv_df = pd.read_table(input_folder + 'TCGA_' + test + '_Survival_df.tsv', sep = ',', index_col = 0)
        print("Survival dataframe ", surv_df.shape)
        df_list.append(surv_df)

    #Combine dataframes
    joined_df = pd.concat(df_list)
    print("Joined survival dataframe ", joined_df.shape)
    #NOTE(review): the inputs above are comma-separated but this output is
    #tab-separated under the same naming scheme - confirm that every reader of
    #TCGA_<cancer>_Survival_df.tsv expects tabs before changing either side
    joined_df.to_csv(output_folder + 'TCGA_' + cancer + '_Survival_df.tsv', sep = '\t')


#Cancer types and, at the same position, the TCGA subtypes they consist of
cancer_types = ['LUNG']
test_cases = [ ['LUAD', 'LUSC']]

if __name__ == "__main__":
    for cancer in cancer_types:
        print("Cancer type ", cancer)
        create_Data(cancer)

# --------------------------------------------------------------------------------
# /TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_TCGA_Survival_Dataframes.py:
# --------------------------------------------------------------------------------
###############################
#Script for creating TCGA survival dataframes
###############################

import sys

import numpy as np
import pandas as pd
import math


#Method for defining the survival dataframe
def createSurvivalDF(cancer_type, tcga_type):
    """Build the survival table of one TCGA cohort from its clinical file.

    Reads ``<tcga_type>.clin.merged.picked.txt``, transposes it and derives two
    columns per row: ``Status`` (False when ``vital_status`` is 0, True
    otherwise) and ``Survival_in_days`` (``days_to_last_followup`` for status
    0, ``days_to_death`` otherwise). The table is written with the default
    comma delimiter to ``TCGA_<tcga_type>_Survival_df.tsv``.

    Args:
        cancer_type: Name of the folder below ``ALL_CANCER_FILES``.
        tcga_type: TCGA cohort code used in the file names.

    Raises:
        ValueError: If a ``vital_status`` entry cannot be converted to int
            (for example a missing value).
    """
    input_folder = '../TCGA_DATA/TCGA_CLINICAL_DATA/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read clinical data
    survival_df = pd.read_table( input_folder + tcga_type + '.clin.merged.picked.txt', index_col = 0)
    survival_df = survival_df.transpose()
    print("TCGA clinical dataframe ", survival_df.shape)
    print("TCGA clinical dataframe ", survival_df.columns)

    #Extract vital status, days to death, and days to follow up
    vital_df = survival_df['vital_status']
    dead_df = survival_df['days_to_death']
    alive_df = survival_df['days_to_last_followup']

    #Create joined arrays
    vital_status_array = []
    days_status_array = []
    for status, days_dead, days_alive in zip(vital_df.values, dead_df.values, alive_df.values):
        is_dead = int(status) != 0
        vital_status_array.append(is_dead)
        days_status_array.append(days_dead if is_dead else days_alive)

    #Create joined dataframe
    vital_status_df = pd.DataFrame(vital_status_array, index = survival_df.index, columns = ['Status'])
    days_status_df = pd.DataFrame(days_status_array, index = survival_df.index, columns = ['Survival_in_days'])
    joined_df = pd.concat([vital_status_df, days_status_df], axis = 1)
    print("TCGA survival dataframe ", joined_df)
    #Comma-delimited on purpose despite the .tsv suffix (to_csv default)
    joined_df.to_csv(output_folder + '/TCGA_' + tcga_type + '_Survival_df.tsv')


if __name__ == "__main__":
    cancer_type = sys.argv[1]
    tcga_type = sys.argv[2]

    createSurvivalDF(cancer_type, tcga_type)

# --------------------------------------------------------------------------------
# /TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival.py:
# --------------------------------------------------------------------------------
###############################
#Script for predicting survival status of patients
###############################

import os
import sys

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
import random

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc

#Threshold separating the two outcome groups
FIVE_YEARS_IN_DAYS = 5 * 365


#Define method for training models
def trait_classification_accuracy(X, Y):
    """Evaluate an L1 logistic regression with 20-fold cross validation.

    In every fold the features are standardized with statistics of the
    training part only, ``C`` is tuned by an inner 5-fold grid search scored
    by ROC-AUC, and the held-out predictions are stored. Accuracy and ROC-AUC
    are computed once over the pooled held-out predictions.

    Args:
        X: 2-D numpy array, one row per sample.
        Y: 1-D numpy array of binary labels aligned with the rows of ``X``.

    Returns:
        ``[accuracy, roc_auc]``.
    """
    #Do cross validation
    kfold = KFold(20, shuffle = True, random_state = 123456)

    predictions = np.zeros(X.shape[0])
    probabilities = np.zeros(X.shape[0])

    #Tune parameters
    tuned_parameters = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train = Y[train_index]

        #Normalize with training statistics only, so the test fold stays unseen
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_std = scaler.transform(X_train)
        X_test_std = scaler.transform(X_test)

        model = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000,
                                   solver = 'liblinear')
        clf = GridSearchCV(model, tuned_parameters, cv = 5, scoring='roc_auc', n_jobs = -1)
        clf.fit(X_std, Y_train)

        #Record predictions and probabilities
        predictions[test_index] = clf.predict(X_test_std)
        probabilities[test_index] = clf.predict_proba(X_test_std)[:, 1]

    #Calculate accuracy and ROC-AUC
    accuracy = accuracy_score(Y, predictions)
    score = roc_auc_score(Y, probabilities)

    return [accuracy, score]


#Define method for predicting survival
def predict_survival(cancer_type, tcga_type, method, run_index, seed):
    """Run one balanced-subsample survival prediction for an embedding method.

    Loads the embedding produced by ``method``, aligns it with the module-level
    label series ``Y_df`` on the first 12 characters of the sample IDs, draws a
    random subsample of the larger class so that both classes have the same
    size, and evaluates it with ``trait_classification_accuracy``.

    Relies on the module-level names ``input_folder``, ``Y_df`` and, for
    ``method == 'VAE'``, ``VAE_dim`` that the script section defines.

    Args:
        cancer_type: Main cancer type (kept for interface compatibility).
        tcga_type: TCGA cohort code used in the embedding file name.
        method: One of 'PCA', 'ICA', 'RP', 'AE', 'DAE', 'VAE', 'DeepProfile'.
        run_index: Index of the embedding run used in the file name.
        seed: Multiplier of the random seed of the subsampling.

    Returns:
        ``[accuracy, roc_auc]``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    #Locate the embedding file of the requested method
    if method == 'PCA':
        file_name = 'TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv'
    elif method in ('ICA', 'RP'):
        file_name = 'TCGA_RNASEQ_' + tcga_type + '_' + method + '_150L_' + str(run_index) + '.tsv'
    elif method in ('AE', 'DAE'):
        file_name = 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_150L_' + str(run_index) + '.tsv'
    elif method == 'VAE':
        file_name = 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_' + VAE_dim + 'L_' + str(run_index) + '.tsv'
    elif method == 'DeepProfile':
        file_name = tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv'
    else:
        #Previously an unknown method surfaced as an unbound-variable error
        raise ValueError("Unknown method: " + str(method))

    X_df = pd.read_table(input_folder + file_name, index_col = 0)

    #Read expression data
    print("Expression data ", X_df.shape)
    print("Expression data ", X_df.index)

    #Now, replace X_df index to match with Y_df index
    X_df.index = [sample[:12] for sample in X_df.index]
    #Several samples can collapse to the same 12-character ID; keep one row
    #per ID, otherwise the rows of X and the labels of Y stop lining up
    X_df = X_df[~X_df.index.duplicated(keep='first')]

    #Take intersecting samples in datasets
    intersecting_samples = np.intersect1d(X_df.index, Y_df.index)
    subX_df = X_df.loc[intersecting_samples]
    subY_df = Y_df.loc[intersecting_samples]

    print("X dataframe ", subX_df.shape)
    print("Y dataframe ", subY_df.shape)

    print("X dataframe ", subX_df.index)
    print("Y dataframe ", subY_df.index)

    sample_indices = [np.where(subY_df.values == 0)[0], np.where(subY_df.values == 1)[0]]
    sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
    print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])

    #Now select the class with highest number of samples and subsample
    low_class = int(np.argmin(sample_counts))
    #argmax would also return 0 when both classes have the same size, which
    #silently dropped class 1; the other class is always the complement
    high_class = 1 - low_class
    print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
    random.seed(12345 * seed)
    random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
    selected_indices = np.sort(sample_indices[high_class][random_indices])

    subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
    subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])
    subX_df = subX_df.sort_index()
    subY_df = subY_df.sort_index()

    results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))

    return results


def _select_five_year_labels(surv_df):
    """Return the ``fustat`` labels of the patients usable for 5-year prediction.

    A patient is kept when ``fustat == 1`` and ``futime`` is below five years,
    or when ``futime`` is above five years. Both conditions are evaluated on
    the full table, so the selection refers to the right rows.
    """
    futime = surv_df['futime'].values
    dead_within = (surv_df['fustat'].values == 1) & (futime < FIVE_YEARS_IN_DAYS)
    #NOTE(review): this also keeps patients with fustat == 1 whose futime is
    #above five years, and they keep label 1 - confirm this is intended
    followed_beyond = futime > FIVE_YEARS_IN_DAYS

    #Select all dead patients, only if they died within 5 years
    print("Dead within 5 year ", surv_df[dead_within][['fustat', 'futime']])
    print("Dead within 5 year ", np.max(surv_df[dead_within]['futime']))

    #Select all alive patients, only if they lived more than 5 years
    print("Alive after 5 year ", surv_df[followed_beyond][['fustat', 'futime']])
    print("Alive after 5 year ", np.min(surv_df[followed_beyond]['futime']))

    return surv_df['fustat'][dead_within | followed_beyond].dropna()


#Read user inputs
#(kept at module level because the functions above read these names as globals)
if __name__ == "__main__":
    cancer_type = sys.argv[1] #main cancer type
    tcga_type = sys.argv[2] #TCGA type
    method = sys.argv[3] #name of the method
    run_index = int(sys.argv[4]) #run index
    if len(sys.argv) > 5:
        VAE_dim = sys.argv[5]

    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
    output_folder = 'Prediction_Results/'

    Y_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = '\t')
    print("Survival dataframe ", Y_df.shape)

    print("ALIVE..")
    print( Y_df[Y_df['fustat'] == 0]['futime'])
    print( np.mean(Y_df[Y_df['fustat'] == 0]['futime'].values))

    Y_df = _select_five_year_labels(Y_df)
    print("Survival dataframe \n ", Y_df)

    class0_count = len(np.where(Y_df.values == 0)[0])
    class1_count = len(np.where(Y_df.values == 1)[0])

    all_accuracies = []
    all_aucs = []

    for sampling_index in range(50):
        result = predict_survival(cancer_type, tcga_type, method, run_index, sampling_index)
        print("Accuracy: ", result[0])
        print("ROC-AUC: ", result[1])
        all_accuracies.append(result[0])
        all_aucs.append(result[1])

    print("FINAL RESULTS...")
    print("MEAN ACCURACY ", np.mean(np.asarray(all_accuracies)))
    print("MEAN ROC-AUC ", np.mean(np.asarray(all_aucs)))

    #Save results to a file
    os.makedirs(output_folder + cancer_type, exist_ok=True)
    result_prefix = output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + tcga_type + '_' + method + '_'
    if method == 'VAE':
        result_prefix += VAE_dim + 'L_'
    result_prefix += str(run_index)

    np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
    np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')

# --------------------------------------------------------------------------------
# /TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival_Raw_Data.py:
# --------------------------------------------------------------------------------
###############################
#Predicting survival status of patients using raw gene data
###############################

import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
import random

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc

#Threshold separating the two outcome groups
FIVE_YEARS_IN_DAYS = 5 * 365

#Main cancer types and, at the same position, their TCGA cohort codes
#NOTE(review): this list says 'UTERINE' where the full loop list kept in the
#script section says 'UCEC' - confirm the folder name before enabling it
ALL_CANCER_TYPES = ['BRCA', 'AML',
                    'COLON',
                    'BRAIN', 'OV',
                    'SARCOMA', 'KIDNEY',
                    'LIVER', 'STOMACH',
                    'SKIN', 'UTERINE',
                    'HEAD_NECK', 'PANCREAS',
                    'CERVICAL', 'BLADDER', 'LUNG']

ALL_TCGA_TYPES = ['BRCA', 'LAML',
                  'COADREAD',
                  'GBMLGG', 'OV',
                  'SARC', 'KIPAN',
                  'LIHC', 'STAD',
                  'SKCM', 'UCEC',
                  'HNSC', 'PAAD',
                  'CESC', 'BLCA', 'LUNG']


#Define method for training models
def trait_classification_accuracy(X, Y):
    """Evaluate an L1 logistic regression with 20-fold cross validation.

    In every fold the features are standardized with statistics of the
    training part only, ``C`` is tuned by an inner 5-fold grid search scored
    by ROC-AUC, and the held-out predictions are stored. Accuracy and ROC-AUC
    are computed once over the pooled held-out predictions.

    Args:
        X: 2-D numpy array, one row per sample.
        Y: 1-D numpy array of binary labels aligned with the rows of ``X``.

    Returns:
        ``[accuracy, roc_auc]``.
    """
    #Do cross validation
    kfold = KFold(20, shuffle = True, random_state = 123456)

    predictions = np.zeros(X.shape[0])
    probabilities = np.zeros(X.shape[0])

    #Tune parameters
    tuned_parameters = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train = Y[train_index]

        #Normalize with training statistics only, so the test fold stays unseen
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_std = scaler.transform(X_train)
        X_test_std = scaler.transform(X_test)

        model = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000,
                                   solver = 'liblinear')
        clf = GridSearchCV(model, tuned_parameters, cv = 5, scoring='roc_auc', n_jobs = -1)
        clf.fit(X_std, Y_train)

        #Record predictions and probabilities
        predictions[test_index] = clf.predict(X_test_std)
        probabilities[test_index] = clf.predict_proba(X_test_std)[:, 1]

    #Calculate accuracy and ROC-AUC
    accuracy = accuracy_score(Y, predictions)
    score = roc_auc_score(Y, probabilities)

    return [accuracy, score]


#Define method for predicting survival
def predict_survival(X_inp,Y_inp,cancer_type, tcga_type, seed):
    """Run one balanced-subsample survival prediction on the given data.

    Draws a random subsample of the larger class so that both classes have
    the same size and evaluates it with ``trait_classification_accuracy``.

    Args:
        X_inp: DataFrame of features, one row per patient.
        Y_inp: Series of binary labels whose rows are aligned with ``X_inp``.
        cancer_type: Unused; kept for interface compatibility.
        tcga_type: Unused; kept for interface compatibility.
        seed: Multiplier of the random seed of the subsampling.

    Returns:
        ``[accuracy, roc_auc]``.
    """
    subX_df = X_inp
    subY_df = Y_inp

    print("X dataframe ", subX_df.shape)
    print("Y dataframe ", subY_df.shape)

    print("X dataframe ", subX_df.index)
    print("Y dataframe ", subY_df.index)

    sample_indices = [np.where(subY_df.values == 0)[0], np.where(subY_df.values == 1)[0]]
    sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
    print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])

    #Now select the class with highest number of samples and subsample
    low_class = int(np.argmin(sample_counts))
    #argmax would also return 0 when both classes have the same size, which
    #silently dropped class 1; the other class is always the complement
    high_class = 1 - low_class
    print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
    random.seed(12345 * seed)
    random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
    selected_indices = np.sort(sample_indices[high_class][random_indices])

    subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
    subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])
    subX_df = subX_df.sort_index()
    subY_df = subY_df.sort_index()

    results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))

    return results


def _select_five_year_labels(surv_df):
    """Return the ``fustat`` labels of the patients usable for 5-year prediction.

    A patient is kept when ``fustat == 1`` and ``futime`` is below five years,
    or when ``futime`` is above five years. Both conditions are evaluated on
    the full table, so the selection refers to the right rows.
    """
    futime = surv_df['futime'].values
    dead_within = (surv_df['fustat'].values == 1) & (futime < FIVE_YEARS_IN_DAYS)
    #NOTE(review): this also keeps patients with fustat == 1 whose futime is
    #above five years, and they keep label 1 - confirm this is intended
    followed_beyond = futime > FIVE_YEARS_IN_DAYS

    #Select all dead patients, only if they died within 5 years
    print("Dead within 5 year ", surv_df[dead_within][['fustat', 'futime']])
    print("Dead within 5 year ", np.max(surv_df[dead_within]['futime']))

    #Select all alive patients, only if they lived more than 5 years
    print("Alive after 5 year ", surv_df[followed_beyond][['fustat', 'futime']])
    print("Alive after 5 year ", np.min(surv_df[followed_beyond]['futime']))

    return surv_df['fustat'][dead_within | followed_beyond].dropna()


#Read user inputs
if __name__ == "__main__":
    run_index = 0
    method = 'RAW'
    output_folder = 'Prediction_Results/'

    #Full list of cancer types, currently only the last five are processed:
    # 'BRCA', 'AML', 'COLON', 'BRAIN', 'OV', 'SARCOMA', 'KIDNEY', 'LIVER',
    # 'STOMACH', 'SKIN', 'UCEC', 'HEAD_NECK', 'PANCREAS', 'CERVICAL',
    # 'BLADDER', 'LUNG'
    for cancer_type in ['HEAD_NECK', 'PANCREAS',
                        'CERVICAL', 'BLADDER', 'LUNG']:

        #LUNG is the union of two cohorts, every other cancer maps to one
        if cancer_type == 'LUNG':
            tcga_types = ['LUAD', 'LUSC']
        else:
            tcga_types = [ALL_TCGA_TYPES[ALL_CANCER_TYPES.index(cancer_type)]]

        input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'

        #Read (and for several cohorts stack) the survival dataframes
        df_list = []
        for tcga_type in tcga_types:
            cohort_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv',
                                      index_col = 0, sep = '\t')
            print("Survival dataframe ", cohort_df.shape)
            df_list.append(cohort_df)
        Y_df = pd.concat(df_list, axis = 0)

        print("ALIVE..")
        print( Y_df[Y_df['fustat'] == 0]['futime'])
        print( np.mean(Y_df[Y_df['fustat'] == 0]['futime'].values))

        Y_df = _select_five_year_labels(Y_df)
        print("Survival dataframe \n ", Y_df)

        #Read the expression of every cohort; previously only the cohort left
        #over in the loop variable was read, so LUNG used LUSC expression only
        raw_data_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
        expression_list = [pd.read_table(raw_data_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0)
                           for tcga_type in tcga_types]
        #join='inner' keeps the genes shared by all cohorts (no effect for one)
        tcga_df = pd.concat(expression_list, axis = 0, join = 'inner')

        #Now, replace X_df index to match with Y_df index
        tcga_df.index = [sample[:12] for sample in tcga_df.index]
        tcga_df = tcga_df[~tcga_df.index.duplicated(keep='first')]

        #Keep only labeled patients that have expression data, in label order
        Y_df = Y_df[Y_df.index.isin(tcga_df.index)]
        tcga_labeled_df = tcga_df.loc[Y_df.index, :]

        class0_count = len(np.where(Y_df.values == 0)[0])
        class1_count = len(np.where(Y_df.values == 1)[0])

        all_accuracies = []
        all_aucs = []

        for sampling_index in range(50):
            result = predict_survival(tcga_labeled_df, Y_df, cancer_type, '+'.join(tcga_types), sampling_index)
            print("Accuracy: ", result[0])
            print("ROC-AUC: ", result[1])
            all_accuracies.append(result[0])
            all_aucs.append(result[1])

        #Save results to a file
        os.makedirs(output_folder + cancer_type, exist_ok=True)
        result_prefix = output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_' + str(run_index)
        np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
        np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')

# --------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival_Subtypes_Joined.py: -------------------------------------------------------------------------------- 1 | ############################### 2 | #Script for predicting survival status of patients 3 | ############################### 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy import stats 8 | from sklearn import metrics 9 | import random 10 | 11 | from sklearn.metrics import roc_auc_score 12 | from sklearn.model_selection import GridSearchCV 13 | from sklearn.model_selection import LeaveOneOut 14 | from sklearn.model_selection import KFold 15 | from sklearn import linear_model 16 | from sklearn.linear_model import LogisticRegression 17 | from sklearn.metrics import average_precision_score 18 | from sklearn.metrics import accuracy_score 19 | from sklearn.preprocessing import StandardScaler 20 | from sklearn.metrics import roc_curve, auc 21 | 22 | #Define method for training models 23 | def trait_classification_accuracy(X, Y): 24 | 25 | #Do cross validation 26 | loo = KFold(20, shuffle = True, random_state = 123456) 27 | 28 | predictions = np.zeros(X.shape[0]) 29 | probabilities = np.zeros(X.shape[0]) 30 | 31 | for train_index, test_index in loo.split(X): 32 | X_train, X_test = X[train_index], X[test_index] 33 | Y_train, Y_test = Y[train_index], Y[test_index] 34 | 35 | #Normalize training data 36 | scaler = StandardScaler() 37 | scaler.fit(X_train) 38 | 39 | X_std = scaler.transform(X_train) 40 | X_test_std = scaler.transform(X_test) 41 | 42 | #Tune parameters 43 | tuned_parameters = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}] 44 | 45 | model = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000, 46 | solver = 'liblinear') 47 | clf = GridSearchCV(model, tuned_parameters, cv = 5, scoring='roc_auc', n_jobs = -1) 48 | clf.fit(X_std, Y_train) 49 | 50 | #Record predictions and probabilities 51 | predicted_Y = clf.predict(X_test_std) 52 | predictions[test_index] = 
predicted_Y 53 | 54 | probs = clf.predict_proba(X_test_std) 55 | 56 | probabilities[test_index] = probs[:, 1] 57 | 58 | 59 | #Calculate accuracy and ROC-AUC 60 | accuracy = accuracy_score(Y, predictions) 61 | score = roc_auc_score(Y, probabilities) 62 | 63 | return [accuracy, score] 64 | 65 | 66 | #Define method for predicting survival 67 | def predict_survival(cancer_type, tcga_types, method, run_index, seed): 68 | print(tcga_types) 69 | accuracies = [] 70 | aucs = [] 71 | 72 | df_list = [] 73 | for tcga_type in tcga_types: 74 | if method == 'PCA': 75 | X_df = pd.read_table(input_folder + 'TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv', index_col = 0) 76 | 77 | if method == 'ICA' or method == 'RP': 78 | X_df = pd.read_table(input_folder + 'TCGA_RNASEQ_' + tcga_type + '_' + method + '_150L_' + str(run_index) + '.tsv', index_col = 0) 79 | 80 | if method == 'AE' or method == 'DAE': 81 | X_df = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_150L_' + str(run_index) + '.tsv', index_col = 0) 82 | 83 | if method == 'VAE': 84 | X_df = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_' + VAE_dim + 'L_' + str(run_index) + '.tsv', index_col = 0) 85 | 86 | if method == 'DeepProfile': 87 | X_df = pd.read_table(input_folder + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv', index_col = 0, sep = '\t') 88 | 89 | 90 | df_list.append(X_df) 91 | 92 | X_df = pd.concat(df_list, axis = 0) 93 | print("Expression data joined ", X_df.shape) 94 | 95 | #Now, replace X_df index to match with Y_df index 96 | mapper = lambda t: t[:12] 97 | vfunc = np.vectorize(mapper) 98 | newX_index = vfunc( X_df.index) 99 | 100 | X_df = pd.DataFrame(X_df.values, index = newX_index, columns = X_df.columns) 101 | 102 | #Take intersecting samples in datasets 103 | X_samples = X_df.index 104 | Y_samples = Y_df.index 105 | intersecting_samples = np.intersect1d(X_samples, Y_samples) 106 | 107 | subX_df = 
#Driver script: builds the 5-year survival labels for a cancer type made of
#several TCGA subtypes, runs predict_survival for 50 subsamplings and saves
#the per-run accuracies and ROC-AUCs.

#Read user inputs
import sys

cancer_type = sys.argv[1] #main cancer type
method = sys.argv[2] #name of the method
run_index = int(sys.argv[3]) #run index
if len(sys.argv) > 4:
    VAE_dim = sys.argv[4] #used below in the VAE output file names
elif method == 'VAE':
    #Fail before the 50 sampling runs instead of hitting a NameError at save time
    raise ValueError("method 'VAE' requires the VAE dimension as the 4th argument")

#TCGA subtypes that are joined for the main cancer type
if cancer_type == 'LUNG':
    tcga_types = ['LUAD', 'LUSC']
else:
    #Without this, tcga_types would be undefined (or stale under %run -i)
    raise ValueError("No TCGA subtypes defined for cancer type " + cancer_type)

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
output_folder = 'Prediction_Results/'

#Join data for cancer subtypes
df_list = []
for tcga_type in tcga_types:
    Y_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = '\t')
    print("Survival dataframe ", Y_df.shape)
    df_list.append(Y_df)

Y_df = pd.concat(df_list, axis = 0)
print("JOINED survival dataframe ", Y_df.shape)

print("ALIVE..")
print( Y_df[Y_df['fustat'] == 0]['futime'])
print( np.mean(Y_df[Y_df['fustat'] == 0]['futime'].values))

five_years = 5 * 365

#Select all dead patients, only if they died within 5 years.
#The positions are computed against Y_df itself (not against the filtered
#Y_df_dead), because they are applied to Y_df with .iloc further down.
is_dead = (Y_df['fustat'] == 1).values
Y_df_dead = Y_df[is_dead]
indices_dead = np.where(is_dead & (Y_df['futime'] < five_years).values)[0]
print("Dead within 5 year ", Y_df.iloc[indices_dead][['fustat', 'futime']])
print("Dead within 5 year ", np.max(Y_df.iloc[indices_dead]['futime']))

#Select all patients whose follow-up time is longer than 5 years
#NOTE(review): this selection does not look at fustat, so patients who died
#after 5 years are kept with label 1 -- confirm this is intended
indices_alive = np.where(Y_df['futime'] > five_years)[0]
print("Alive after 5 year ", Y_df.iloc[indices_alive][['fustat', 'futime']])
print("Alive after 5 year ", np.min(Y_df.iloc[indices_alive]['futime']))

#Keep the union of both groups, in the original row order
indices = list(indices_dead) + list(indices_alive)
indices = np.unique(indices)
Y_df = Y_df['fustat']
Y_df = Y_df.iloc[indices]
Y_df = Y_df.dropna()
print("Survival dataframe \n ", Y_df)

class0_count = len(np.where(Y_df.values == 0)[0])
class1_count = len(np.where(Y_df.values == 1)[0])

all_accuracies = []
all_aucs = []

#Repeat the prediction for 50 different subsamplings
for sampling_index in range(50):
    result = predict_survival(cancer_type, tcga_types, method, run_index, sampling_index)
    print("Accuracy: ", result[0])
    print("ROC-AUC: ", result[1])
    all_accuracies.append(result[0])
    all_aucs.append(result[1])

print("FINAL RESULTS...")
print("MEAN ACCURACY ", np.mean(np.asarray(all_accuracies)))
print("MEAN ROC-AUC ", np.mean(np.asarray(all_aucs)))

#Save results to a file; VAE results carry the latent dimension in the name
result_prefix = output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_'
if method == 'VAE':
    result_prefix += VAE_dim + 'L_'
result_prefix += str(run_index)

np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
#Driver script: launches the survival prediction script for every cancer type
#and every embedding method. It must be run from IPython, because the
#prediction scripts are started with the %run magic.

import sys

run = int(sys.argv[1]) #run index passed on to every prediction script

cancer_types = ['BRCA', 'AML',
                'COLON',
                'BRAIN', 'OV',
                'SARCOMA', 'KIDNEY',
                'LIVER', 'STOMACH',
                'SKIN', 'UCEC',
                'HEAD_NECK', 'PANCREAS',
                'CERVICAL', 'BLADDER', 'LUNG']

tcga_types = ['BRCA', 'LAML',
              'COADREAD',
              'GBMLGG', 'OV',
              'SARC', 'KIPAN',
              'LIHC', 'STAD',
              'SKCM', 'UTERINE',
              'HNSC', 'PAAD',
              'CESC', 'BLCA', 'LUNG']

#One entry per launched model, in launch order:
#(method name, offset added to the run index, extra trailing argument)
#NOTE(review): ICA and RP are launched with run + 1 while every other method
#uses run; presumably their saved runs are numbered from 1 -- confirm
_METHOD_RUNS = (
    ('PCA', 0, ''),
    ('DeepProfile', 0, ''),
    ('ICA', 1, ''),
    ('RP', 1, ''),
    ('AE', 0, ''),
    ('DAE', 0, ''),
    ('VAE', 0, ' 5'),
    ('VAE', 0, ' 10'),
    ('VAE', 0, ' 25'),
    ('VAE', 0, ' 50'),
    ('VAE', 0, ' 75'),
    ('VAE', 0, ' 100'),
)


def _run_all_methods(script_and_args, run_index):
    """Run one prediction script once per entry of _METHOD_RUNS.

    script_and_args: script file name followed by its leading arguments.
    run_index: base run index; the per-method offset is added to it.
    """
    shell = get_ipython()
    for method, offset, extra in _METHOD_RUNS:
        #run_line_magic replaces the deprecated shell.magic("run ...") call
        shell.run_line_magic('run', '-i ' + script_and_args + ' ' + method + ' ' + str(run_index + offset) + extra)


for cancer_type, tcga_type in zip(cancer_types, tcga_types):
    print("------------")
    print(cancer_type)
    print(tcga_type)

    if cancer_type == 'LUNG':
        #LUNG is handled by the subtypes-joined script, which takes no TCGA type
        _run_all_methods('Predict_Survival_Subtypes_Joined.py ' + cancer_type, run)
    else:
        _run_all_methods('Predict_Survival.py ' + cancer_type + ' ' + tcga_type, run)