├── COMPETITOR_TRAININGS
    ├── AE_2Layers_Model.py
    ├── Create_ICA_Data.py
    ├── Create_PCA_Data.py
    ├── Create_RP_Data.py
    ├── DAE_2Layers_Model.py
    ├── Example_Run_All.py
    ├── Get_AE_IG_Attributions.py
    ├── Get_DAE_IG_Attributions.py
    ├── IntegratedGradients.py
    ├── Train_AE_Models.py
    └── Train_DAE_Models.py
├── LICENSE
├── MODEL_TRAININGS
    ├── Create_DeepProfile_Ensemble_Weights.py
    ├── Create_DeepProfile_Training_Embeddings.py
    ├── Create_Ensemble_Labels.py
    ├── Create_PCs_for_DeepLearning_Models.py
    ├── Example_Run_All.py
    ├── Get_VAE_IG_Attributions.py
    ├── IntegratedGradients.py
    ├── Run_VAE_Models.py
    ├── Select_Latent_Dimension_with_Gmeans.ipynb
    ├── VAE_3Layers_Model.py
    └── gmeans.py
├── NORMAL_TISSUE_ANALYSIS
    ├── Create_DeepProfile_GTEX_Embeddings.py
    ├── Create_Gtex_Rnaseq_PCs.py
    ├── Encode_GTEX_Data_with_VAE.py
    ├── Example_Run_All.py
    ├── Gtex_Tissue_Name_Mappings.ipynb
    ├── Normal_Tissue_Classifier.py
    └── Preprocess_Gtex_Rnaseq_Expressions.py
├── PATHWAY_ANALYSIS
    ├── Create_Pathway_Matrices.py
    ├── Fishers_Test.py
    ├── PATHWAY_COVERAGE_ANALYSIS
    │   ├── Plot_of_Average_Pathway_Coverages.ipynb
    │   ├── Plot_of_Node_Level_Pathway_Annotations.ipynb
    │   ├── Plot_of_Pathway_Coverage_Distributions.ipynb
    │   ├── Plot_of_Pathway_Detection_Comparison_VAEs_vs_DeepProfile.ipynb
    │   └── Plot_of_Pathway_Percent_Comparison_VAEs_vs_DeepProfile.ipynb
    └── Run_Multiple_Fishers_Test.py
├── README.md
└── TCGA_SURVIVAL_PREDICTION
    ├── COMPARING_RNASEQ_and_MICROARRAY
        ├── Create_DeepProfile_TCGA_Microarray_Embeddings.py
        ├── Create_TCGA_Microarray_PCs.py
        ├── Encode_TCGA_Microarray_Data_with_VAE.py
        ├── Preprocess_TCGA_Microarray_Expression.py
        └── Rnaseq_and_Microarray_Embedding_Correlation_Plots.ipynb
    ├── CREATE_EMBEDDINGS
        ├── Create_All_VAE_Embeddings.py
        ├── Create_DeepProfile_TCGA_Embeddings.py
        ├── Create_TCGA_Rnaseq_PCs.py
        ├── Encode_TCGA_Data_with_AE.py
        ├── Encode_TCGA_Data_with_DAE.py
        ├── Encode_TCGA_Data_with_ICA.py
        ├── Encode_TCGA_Data_with_PCA.py
        ├── Encode_TCGA_Data_with_RP.py
        ├── Encode_TCGA_Data_with_VAE.py
        ├── Example_Run_All.py
        ├── Preprocess_TCGA_Rnaseq_Expression.py
        ├── Preprocess_TCGA_Rnaseq_Expression_All_Genes.py
        └── Preprocess_TCGA_Rnaseq_Expression_All_Genes_Uncorrected.py
    ├── CREATE_SURVIVAL_DATAFRAMES
        ├── Create_Joined_Survival_Dataframes.py
        ├── Create_Joined_Survival_Dataframes_Cancer_Types.py
        └── Create_TCGA_Survival_Dataframes.py
    └── PREDICT_SURVIVAL
        ├── Plots_of_Survival_Prediction.ipynb
        ├── Plots_of_Survival_Prediction_VAEs.ipynb
        ├── Predict_Survival.py
        ├── Predict_Survival_Raw_Data.py
        ├── Predict_Survival_Subtypes_Joined.py
        └── Run_Models.py


/COMPETITOR_TRAININGS/AE_2Layers_Model.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #AE model
  3 | 
  4 | #Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
  5 | ###############################
  6 | 
  7 | import os
  8 | import numpy as np
  9 | import pandas as pd
 10 | import math 
 11 | from sklearn.metrics import mean_squared_error
 12 | import matplotlib.pyplot as plt
 13 | import tensorflow as tf
 14 | from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
 15 | from keras.layers.normalization import BatchNormalization
 16 | from keras.models import Model
 17 | from keras import backend as K
 18 | from keras import metrics, optimizers
 19 | from keras.callbacks import Callback
 20 | import keras
 21 | import csv
 22 | import sys
 23 | 
 24 | #Prevent tensorflow from using all the memory
 25 | config = tf.ConfigProto()
 26 | config.gpu_options.allow_growth=True
 27 | sess = tf.Session(config=config)
 28 | 
 29 | #Define reconstruction loss
 30 | def reconstruction_loss(x_input, x_decoded):
 31 |     return metrics.mse(x_input, x_decoded)
 32 | 
 33 | # Set hyperparameters
 34 | original_dim = input_df.shape[1]
 35 | intermediate1_dim = 750
 36 | latent_dim = 150
 37 | cancer_type = sys.argv[1]
 38 | fold = int(sys.argv[2])
 39 | 
 40 | #SET RANDOM SEEDS
 41 | from numpy.random import seed
 42 | seed(123456 * fold)
 43 | from tensorflow import set_random_seed
 44 | set_random_seed(123456 * fold)
 45 | 
 46 | init_mode = 'glorot_uniform'
 47 | batch_size = 100
 48 | epochs = 50
 49 | learning_rate = 0.0005
 50 | dropout = 0.1
 51 | 
 52 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
 53 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/AE_FILES/'
 54 | 
 55 | #Read input file
 56 | input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
 57 | output_filename = cancer_type + '_DATA_TOP2_INTERSECTION_GENES_encoded_'
 58 | 
 59 | input_df = pd.read_table(input_filename, index_col=0)
 60 | print("INPUT FILE", input_df.shape)
 61 | print(input_df.head(5))
 62 | input_df_training = input_df
 63 | 
 64 | #Define encoder: Dense -> BatchNormalization -> ReLU -> Dropout -> Dense (latent layer has no activation)
 65 | x = Input(shape=(original_dim, ))
 66 | 
 67 | net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
 68 | net2 = BatchNormalization()(net)
 69 | net3 = Activation('relu')(net2)
 70 | 
 71 | d1 = Dropout(dropout)(net3)
 72 | core = Dense(latent_dim, kernel_initializer=init_mode)(d1)
 73 | 
 74 | #Define decoder layers as standalone objects so the same layers can be re-applied to a separate decoder input after training
 75 | decoder_h = Dense(intermediate1_dim, activation='relu', kernel_initializer=init_mode)
 76 | d2 = Dropout(dropout)
 77 | decoder_mean = Dense(original_dim, kernel_initializer=init_mode)
 78 | 
 79 | h_decoded = decoder_h(core)
 80 | h_decoded2 = d2(h_decoded)
 81 | x_decoded_mean = decoder_mean(h_decoded2)
 82 | 
 83 | #AE model: maps the input x to its reconstruction x_decoded_mean, trained with MSE loss
 84 | ae = Model(x, x_decoded_mean)
 85 | 
 86 | adam = optimizers.Adam(lr=learning_rate)
 87 | ae.compile(optimizer=adam, loss = reconstruction_loss)
 88 | ae.summary()
 89 | 
 90 | 
 91 | #Train model to reconstruct its own input (the same array is passed as input and as target)
 92 | history  = ae.fit(np.array(input_df_training), np.array(input_df_training),
 93 |                shuffle=True,
 94 |                epochs=epochs,
 95 |                batch_size=batch_size,
 96 |                verbose = 2)
 97 | 
 98 | # DEFINE ENCODER
 99 | encoder = Model(x, core)
100 | 
101 | #DEFINE DECODER
102 | decoder_input = Input(shape=(latent_dim, )) 
103 | _h_decoded = decoder_h(decoder_input)
104 | _h_decoded2 = d2(_h_decoded)
105 | _x_decoded_mean = decoder_mean(_h_decoded2)
106 | decoder = Model(decoder_input, _x_decoded_mean)
107 | 
108 | 
109 | training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
110 | training_encoded_df = pd.DataFrame(training_encoded, index = input_df_training.index)
111 | 
112 | # How well does the model reconstruct the input data
113 | training_reconstructed = decoder.predict(np.array(training_encoded_df))
114 | training_reconstructed_df = pd.DataFrame(training_reconstructed, index = input_df_training.index, columns = input_df_training.columns)
115 | 
116 | recons_error = mean_squared_error(np.array(input_df_training), np.array(training_reconstructed_df))
117 | 
118 | print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))
119 | 
120 | #Save encoded training data
121 | training_encoded_df.to_csv(output_folder +  output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv", sep = '\t')
122 | 
123 | 
124 | #SAVE ENCODER MODEL
125 | from keras.models import model_from_json
126 | 
127 | model_json = encoder.to_json()
128 | with open(output_folder + "AE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
129 |     json_file.write(model_json)
130 | 
131 | encoder.save_weights(output_folder + "AE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
132 | print("Saved model to disk")
133 | 
134 | 
135 | model_json = decoder.to_json()
136 | with open(output_folder + "AE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
137 |     json_file.write(model_json)
138 | 
139 | decoder.save_weights(output_folder + "AE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
140 | print("Saved model to disk")
141 | 
142 | 
143 | #Record training R2 (computed per sample, then averaged)
144 | from sklearn.metrics import r2_score
145 | 
146 | training_r2_vals = np.zeros(input_df_training.shape[0])
147 | for i in range(input_df_training.shape[0]):
148 |     training_r2 = r2_score(input_df_training.values[i, :], training_reconstructed_df.values[i, :])
149 |     training_r2_vals[i] = training_r2
150 | 
151 | print("TRAINING R2 " + str(np.mean(training_r2_vals)))
152 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Create_ICA_Data.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for training ICA models
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import math
 8 | import csv
 9 | from sklearn.decomposition import FastICA
10 | import sys
11 | 
12 | #Read cancer type
13 | cancer_type = sys.argv[1]
14 | 
15 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
16 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/ICA_FILES/'
17 | 
18 | L = 150
19 | print("Number of latent nodes " + str(L))
20 |     
21 | data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
22 | print("Training data ", data_df.shape)
23 | 
24 | training_data = data_df.values
25 | training_data = np.nan_to_num(training_data)
26 | 
27 | #Fit 10 different ICA models with different random seeds
28 | for run in range(10):
29 |     ica = FastICA(n_components = L, random_state = 12345 * run, max_iter = 100000)
30 |     print(ica)
31 | 
32 |     ica.fit(training_data) 
33 | 
34 |     components = ica.components_
35 |     print(components.shape)
36 | 
37 |     #Save the learned components
38 |     component_df = pd.DataFrame(components.T, index = data_df.columns)
39 |     component_df.to_csv(output_folder + cancer_type + '_DATA_TOP2_JOINED_ICA_COMPONENTS_' + str(L) + 'L_fold' + str(run + 1) + '.tsv', sep = '\t')
40 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Create_PCA_Data.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for training PCA models
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import PCA
 9 | import sys
10 | 
11 | #Read cancer type
12 | cancer_type = sys.argv[1]
13 | 
14 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
15 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PCA_FILES/'
16 | 
17 | #Method for defining PCs for training data
18 | def createData(cancer_type):
19 |     
20 |     L = 150
21 |     print("Number of latent nodes " + str(L))
22 |     
23 |     data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
24 |     print("Training data ", data_df.shape)
25 | 
26 |     training_data = data_df.values
27 |     training_data = np.nan_to_num(training_data)
28 | 
29 |     #Fit PCA model
30 |     pca = PCA(n_components = L)
31 |     pca.fit(training_data)
32 |     components = pca.components_
33 |     print("PCA components ", components.shape)
34 | 
35 |     #Save the learned components
36 |     component_df = pd.DataFrame(components.T, index = data_df.columns)
37 |     component_df.to_csv(output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_COMPONENTS_' + str(L) + 'L.tsv', sep = '\t')
38 | 
39 | 
40 | createData(cancer_type)


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Create_RP_Data.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for training Random Projection models
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.random_projection import GaussianRandomProjection
 9 | import sys
10 | 
11 | #Read cancer type
12 | cancer_type = sys.argv[1]
13 | 
14 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
15 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/RP_FILES/'
16 | 
17 | #Method for fitting random projections (RP) on the training data
18 | def createData(cancer_type):
19 |     
20 |     L = 150
21 |     print("Number of latent nodes " + str(L))
22 |     
23 |     data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
24 |     print("Training data ", data_df.shape)
25 | 
26 |     training_data = data_df.values
27 |     training_data = np.nan_to_num(training_data)
28 | 
29 |     #Fit 10 different RP models with different random seeds
30 |     for run in range(10):
31 |         transformer = GaussianRandomProjection(n_components = L, random_state = run * 12345)
32 |         transformer.fit(training_data)
33 |         
34 |         components = transformer.components_
35 |         print(components.shape)
36 | 
37 |         #Save the learned components
38 |         component_df = pd.DataFrame(components.T, index = data_df.columns)
39 |         component_df.to_csv(output_folder + cancer_type + '_DATA_TOP2_JOINED_RP_COMPONENTS_fold' + str(run + 1) + '.tsv', sep = '\t')
40 |         
41 | 
42 | createData(cancer_type)


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/DAE_2Layers_Model.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Author: Ayse Dincer
  3 | #DAE model
  4 | 
  5 | #Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
  6 | ###############################
  7 | 
  8 | import os
  9 | import numpy as np
 10 | import pandas as pd
 11 | import math 
 12 | from sklearn.metrics import mean_squared_error
 13 | import matplotlib.pyplot as plt
 14 | 
 15 | import tensorflow as tf
 16 | from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
 17 | from keras.layers.normalization import BatchNormalization
 18 | from keras.models import Model
 19 | from keras import backend as K
 20 | from keras import metrics, optimizers
 21 | from keras.callbacks import Callback
 22 | import keras
 23 | import csv
 24 | 
 25 | import sys
 26 | 
 27 | #Prevent tensorflow from using all the memory
 28 | config = tf.ConfigProto()
 29 | config.gpu_options.allow_growth=True
 30 | sess = tf.Session(config=config)
 31 | 
 32 | #Define reconstruction loss
 33 | def reconstruction_loss(x_input, x_decoded):
 34 |     return metrics.mse(x_input, x_decoded)
 35 | 
 36 | #Read input file
 37 | cancer_type = sys.argv[1]
 38 | 
 39 | # Set hyperparameters
 40 | intermediate1_dim = 750
 41 | latent_dim = 150
 42 | cancer_type = sys.argv[1]
 43 | fold = int(sys.argv[2])
 44 | 
 45 | init_mode = 'glorot_uniform'
 46 | batch_size = 100
 47 | epochs = 50
 48 | learning_rate = 0.0005
 49 | dropout = 0.1
 50 | 
 51 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
 52 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/DAE_FILES/'
 53 | 
 54 | #SET RANDOM SEEDS
 55 | from numpy.random import seed
 56 | seed(123456 * fold)
 57 | from tensorflow import set_random_seed
 58 | set_random_seed(123456 * fold)
 59 | 
 60 | input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
 61 | output_filename = cancer_type + '_DATA_TOP2_INTERSECTION_GENES_encoded_'
 62 | 
 63 | input_df = pd.read_table(input_filename, index_col=0)
 64 | print("INPUT FILE", input_df.shape)
 65 | print(input_df.head(5))
 66 | input_df_training = input_df
 67 | 
 68 | original_dim = input_df.shape[1]
 69 | 
 70 | #Define noisy inputs: Gaussian noise (mean 0, std 1) is drawn once, before training, and added to every entry
 71 | noise = np.random.normal(loc=0, scale = 1, size=input_df_training.shape)
 72 | input_df_noisy = input_df_training.values + noise
 73 |    
 74 | #Define encoder: Dense -> BatchNormalization -> ReLU -> Dropout -> Dense (latent layer has no activation)
 75 | x = Input(shape=(original_dim, ))
 76 | 
 77 | net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
 78 | net2 = BatchNormalization()(net)
 79 | net3 = Activation('relu')(net2)
 80 | 
 81 | d1 = Dropout(dropout)(net3)
 82 | core = Dense(latent_dim, kernel_initializer=init_mode)(d1)
 83 | 
 84 | #Define decoder layers as standalone objects so the same layers can be re-applied to a separate decoder input after training
 85 | decoder_h = Dense(intermediate1_dim, activation='relu', kernel_initializer=init_mode)
 86 | d2 = Dropout(dropout)
 87 | decoder_mean = Dense(original_dim, kernel_initializer=init_mode)
 88 | 
 89 | h_decoded = decoder_h(core)
 90 | h_decoded2 = d2(h_decoded)
 91 | x_decoded_mean = decoder_mean(h_decoded2)
 92 | 
 93 | #DAE model: maps the input x to its reconstruction x_decoded_mean, trained with MSE loss
 94 | dae = Model(x, x_decoded_mean)
 95 | 
 96 | adam = optimizers.Adam(lr=learning_rate)
 97 | dae.compile(optimizer=adam, loss = reconstruction_loss)
 98 | dae.summary()
 99 | 
100 | 
101 | #Train to reconstruct the clean data from the noisy inputs (noisy array is the input, clean array is the target)
102 | history  = dae.fit(np.array(input_df_noisy), np.array(input_df_training),
103 |                shuffle=True,
104 |                epochs=epochs,
105 |                batch_size=batch_size,
106 |                verbose = 2)
107 | 
108 | # DEFINE ENCODER
109 | encoder = Model(x, core)
110 | 
111 | #DEFINE DECODER
112 | decoder_input = Input(shape=(latent_dim, )) 
113 | _h_decoded = decoder_h(decoder_input)
114 | _h_decoded2 = d2(_h_decoded)
115 | _x_decoded_mean = decoder_mean(_h_decoded2)
116 | decoder = Model(decoder_input, _x_decoded_mean)
117 | 
118 | 
119 | training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
120 | training_encoded_df = pd.DataFrame(training_encoded, index = input_df_training.index)
121 | 
122 | # How well does the model reconstruct the input data
123 | training_reconstructed = decoder.predict(np.array(training_encoded_df))
124 | training_reconstructed_df = pd.DataFrame(training_reconstructed, index = input_df_training.index, columns = input_df_training.columns)
125 | 
126 | recons_error = mean_squared_error(np.array(input_df_training), np.array(training_reconstructed_df))
127 | 
128 | print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))
129 | 
130 | #Save encoded training data
131 | training_encoded_df.to_csv(output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv", sep = '\t')
132 | 
133 | 
134 | #SAVE ENCODER MODEL
135 | from keras.models import model_from_json
136 | 
137 | model_json = encoder.to_json()
138 | with open(output_folder + "DAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
139 |     json_file.write(model_json)
140 | 
141 | encoder.save_weights(output_folder + "DAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
142 | print("Saved model to disk")
143 | 
144 | 
145 | model_json = decoder.to_json()
146 | with open(output_folder + "DAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
147 |     json_file.write(model_json)
148 | 
149 | decoder.save_weights(output_folder + "DAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
150 | print("Saved model to disk")
151 | 
152 | 
153 | #Record training R2
154 | from sklearn.metrics import r2_score
155 | 
156 | training_r2_vals = np.zeros(input_df_training.shape[0])
157 | for i in range(input_df_training.shape[0]):
158 |     training_r2 = r2_score(input_df_training.values[i, :], training_reconstructed_df.values[i, :])
159 |     training_r2_vals[i] = training_r2
160 | 
161 | print("TRAINING R2 " + str(np.mean(training_r2_vals)))
162 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Example_Run_All.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Example for training competitor models for a cancer type
 3 | ###############################
 4 | 
 5 | get_ipython().magic(u"run -i Create_PCA_Data.py BRCA")
 6 | 
 7 | get_ipython().magic(u"run -i Create_ICA_Data.py BRCA")
 8 | 
 9 | get_ipython().magic(u"run -i Create_RP_Data.py BRCA")
10 | 
11 | get_ipython().magic(u"run -i Train_AE_Models.py BRCA")
12 | get_ipython().magic(u"run -i Get_AE_IG_Attributions.py BRCA 0")
13 | 
14 | get_ipython().magic(u"run -i Train_DAE_Models.py BRCA")
15 | get_ipython().magic(u"run -i Get_DAE_IG_Attributions.py BRCA 0")
16 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Get_AE_IG_Attributions.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Script for running integrated gradients to get gene-level attributions of each node
  3 | ###############################
  4 | 
  5 | import os
  6 | import numpy as np
  7 | import pandas as pd
  8 | import math 
  9 | from sklearn.metrics import mean_squared_error
 10 | import tensorflow as tf
 11 | from keras.layers import Input, Dense, Lambda, Layer, Activation
 12 | from keras.layers.normalization import BatchNormalization
 13 | from keras.models import Model
 14 | from keras import backend as K
 15 | from keras import metrics, optimizers
 16 | from keras.callbacks import Callback
 17 | import keras
 18 | import csv
 19 | from keras.models import model_from_json
 20 | import sys
 21 | 
 22 | #Define reconstruction loss
 23 | def reconstruction_loss(x_input, x_decoded):
 24 |     return metrics.mse(x_input, x_decoded)
 25 | 
 26 | #Prevent tensorflow from using all the memory
 27 | config = tf.ConfigProto()
 28 | config.gpu_options.allow_growth=True
 29 | sess = tf.Session(config=config)
 30 | 
 31 | #Read all user inputs
 32 | cancer = sys.argv[1]
 33 | vae_run = int(sys.argv[2])
 34 | dimension = 150
 35 | 
 36 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
 37 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/AE_FILES/'
 38 | 
 39 | #Load PCA weights
 40 | pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col = 0)
 41 | print("PCA COMPONENTS ",  pca_df.shape)
 42 | pca_components = pca_df.values
 43 | 
 44 | 
 45 | #Save the weight for each 100 runs
 46 | print("MODEL " + str(vae_run))
 47 | 
 48 | #Load model
 49 | json_file = open(input_folder 'AE_FILES/AE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.json', 'r')
 50 | loaded_model_json = json_file.read()
 51 | json_file.close()
 52 | 
 53 | encoder = model_from_json(loaded_model_json)
 54 | encoder.load_weights(input_folder + 'AE_FILES/AE_' + cancer  + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.h5')
 55 | print("Loaded model from disk")
 56 | 
 57 | #Read input data
 58 | input_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv', index_col=0)
 59 | print("INPUT FILE ", input_df.shape)
 60 | 
 61 | #Define hyperparameters. NOTE(review): only input_df_training, batch_size and learning_rate appear to be used further down in this script
 62 | input_df_training = input_df
 63 | original_dim = input_df_training.shape[1]
 64 | intermediate1_dim = 100
 65 | intermediate2_dim = 25
 66 | latent_dim = dimension
 67 | 
 68 | batch_size = 50
 69 | epochs = 50
 70 | learning_rate = 0.0005
 71 | beta = K.variable(1)
 72 | kappa = 0
 73 | 
 74 | #Define encoder. NOTE(review): these layers are not connected to the `encoder` loaded from disk and are not referenced again in this script
 75 | x = Input(shape=(original_dim, ))
 76 | 
 77 | net = Dense(intermediate1_dim)(x)
 78 | net2 = BatchNormalization()(net)
 79 | net3 = Activation('relu')(net2)
 80 | 
 81 | net4 = Dense(intermediate2_dim)(net3)
 82 | net5 = BatchNormalization()(net4)
 83 | net6 = Activation('relu')(net5)
 84 | #Compile the loaded encoder so it carries an optimizer (IntegratedGradients.py calls model.optimizer.get_gradients)
 85 | adam = optimizers.Adam(lr=learning_rate)
 86 | encoder.compile(optimizer=adam, loss = reconstruction_loss)
 87 | encoder.summary()
 88 | 
 89 | #Encode training data using the model
 90 | training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
 91 | print("ENCODED TRAINING DATA ", training_encoded.shape)
 92 | 
 93 | #Measure weights and save absolute value of importance, averaged over samples
 94 | from IntegratedGradients import *
 95 | 
 96 | ig = integrated_gradients(encoder)
 97 | 
 98 | overall_weights = np.zeros((pca_components.shape[0], dimension))
 99 | 
100 | for latent in range(dimension):
101 |     print("Node " + str(latent + 1))
102 |     weights = np.zeros((pca_components.shape[0], input_df_training.shape[0]))
103 | 
104 |     for i in range(input_df_training.shape[0]):
105 |         vals = ig.explain(input_df_training.values[i, :], latent)    
106 |         new_vals = np.matmul(vals, pca_components.T)
107 |         weights[:, i] = new_vals
108 |         
109 |     #Take absolute values avg over all samples 
110 |     overall_weights[:, latent] = np.mean(np.abs(weights), axis = 1)
111 | 
112 | ig_df = pd.DataFrame(overall_weights, index = pca_df.index)
113 | print("EXPLANATIONS DF ", ig_df.shape)
114 | 
115 | ig_df.to_csv(output_folder + cancer + '_DATA_AE_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv', sep='\t', quoting = csv.QUOTE_NONE)
116 | print(ig_df.shape)
117 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Get_DAE_IG_Attributions.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Script for running integrated gradients to get gene-level attributions of each node
  3 | ###############################
  4 | 
  5 | import os
  6 | import numpy as np
  7 | import pandas as pd
  8 | import math 
  9 | from sklearn.metrics import mean_squared_error
 10 | import tensorflow as tf
 11 | from keras.layers import Input, Dense, Lambda, Layer, Activation
 12 | from keras.layers.normalization import BatchNormalization
 13 | from keras.models import Model
 14 | from keras import backend as K
 15 | from keras import metrics, optimizers
 16 | from keras.callbacks import Callback
 17 | import keras
 18 | import csv
 19 | from keras.models import model_from_json
 20 | import sys
 21 | 
 22 | #Prevent tensorflow from using all the memory
 23 | config = tf.ConfigProto()
 24 | config.gpu_options.allow_growth=True
 25 | sess = tf.Session(config=config)
 26 | 
 27 | #Read all user inputs
 28 | cancer = sys.argv[1]
 29 | vae_run = int(sys.argv[2])
 30 | dimension = 150
 31 | 
 32 | input_folder = '../ALL_CANCER_FILES/' + cancer + '/'
 33 | output_folder = '../ALL_CANCER_FILES/' + cancer + '/DAE_FILES/'
 34 | 
 35 | #Load PCA weights
 36 | pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col = 0)
 37 | print("PCA COMPONENTS ",  pca_df.shape)
 38 | pca_components = pca_df.values
 39 | 
 40 | #Define reconstruction loss
 41 | def reconstruction_loss(x_input, x_decoded):
 42 |     return metrics.mse(x_input, x_decoded)
 43 | 
 44 | #Report which trained model run is being explained
 45 | print("MODEL " + str(vae_run))
 46 | 
 47 | #Load model
 48 | json_file = open(input_folder + 'DAE_FILES/DAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.json', 'r')
 49 | loaded_model_json = json_file.read()
 50 | json_file.close()
 51 | 
 52 | encoder = model_from_json(loaded_model_json)
 53 | encoder.load_weights(input_folder + 'DAE_FILES/DAE_' + cancer  + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.h5')
 54 | print("Loaded model from disk")
 55 | 
 56 | #Read input data
 57 | input_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv', index_col=0)
 58 | print("INPUT FILE ", input_df.shape)
 59 | 
 60 | #Define hyperparameters. NOTE(review): only input_df_training, batch_size and learning_rate appear to be used further down in this script
 61 | input_df_training = input_df
 62 | original_dim = input_df_training.shape[1]
 63 | intermediate1_dim = 100
 64 | intermediate2_dim = 25
 65 | latent_dim = dimension
 66 | 
 67 | batch_size = 50
 68 | epochs = 50
 69 | learning_rate = 0.0005
 70 | beta = K.variable(1)
 71 | kappa = 0
 72 | 
 73 | #Define encoder. NOTE(review): these layers are not connected to the `encoder` loaded from disk and are not referenced again in this script
 74 | x = Input(shape=(original_dim, ))
 75 | 
 76 | net = Dense(intermediate1_dim)(x)
 77 | net2 = BatchNormalization()(net)
 78 | net3 = Activation('relu')(net2)
 79 | 
 80 | net4 = Dense(intermediate2_dim)(net3)
 81 | net5 = BatchNormalization()(net4)
 82 | net6 = Activation('relu')(net5)
 83 | #Compile the loaded encoder so it carries an optimizer (IntegratedGradients.py calls model.optimizer.get_gradients)
 84 | adam = optimizers.Adam(lr=learning_rate)
 85 | encoder.compile(optimizer=adam, loss = reconstruction_loss)
 86 | encoder.summary()
 87 | 
 88 | #Encode training data using the model
 89 | training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
 90 | print("ENCODED TRAINING DATA ", training_encoded.shape)
 91 | 
 92 | #Measure weights and save absolute value of importance, averaged over samples
 93 | from IntegratedGradients import *
 94 | 
 95 | ig = integrated_gradients(encoder)
 96 | 
 97 | overall_weights = np.zeros((pca_components.shape[0], dimension))
 98 | 
 99 | for latent in range(dimension):
100 |     print("Node " + str(latent + 1))
101 |     weights = np.zeros((pca_components.shape[0], input_df_training.shape[0]))
102 | 
103 |     for i in range(input_df_training.shape[0]):
104 |         vals = ig.explain(input_df_training.values[i, :], latent)    
105 |         new_vals = np.matmul(vals, pca_components.T)
106 |         weights[:, i] = new_vals
107 |         
108 |     #Take absolute values avg over all samples 
109 |     overall_weights[:, latent] = np.mean(np.abs(weights), axis = 1)
110 | 
111 | ig_df = pd.DataFrame(overall_weights, index = pca_df.index)
112 | print("EXPLANATIONS DF ", ig_df.shape)
113 | 
114 | ig_df.to_csv(output_folder + cancer + '_DATA_DAE_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv', sep='\t', quoting = csv.QUOTE_NONE)
115 | print(ig_df.shape)
116 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/IntegratedGradients.py:
--------------------------------------------------------------------------------
  1 | ################################################################
  2 | # Implemented by Naozumi Hiranuma (hiranumn@uw.edu)            #
  3 | #                                                              #
  4 | # Keras-compatible implementation of Integrated Gradients      #
  5 | # proposed in "Axiomatic Attribution for Deep Networks"        #
  6 | # (https://arxiv.org/abs/1703.01365).                          #
  7 | #                                                              #
  8 | # Keywords: Shapley values, interpretable machine learning     #
  9 | ################################################################
 10 | 
 11 | from __future__ import division, print_function
 12 | import numpy as np
 13 | from time import sleep
 14 | import sys
 15 | import keras.backend as K
 16 | 
 17 | from keras.models import Model, Sequential
 18 | 
 19 | '''
 20 | Integrated gradients approximates Shapley values by integrating partial
 21 | gradients with respect to input features from reference input to the
 22 | actual input. The following class implements the paper "Axiomatic attribution
 23 | for deep neuron networks".
 24 | '''
 25 | class integrated_gradients:
 26 |     # model: Keras model that you wish to explain.
 27 |     # outchannels: In case the model are multi tasking, you can specify which output you want explain .
 28 |     def __init__(self, model, outchannels=[], verbose=1):
 29 |     
 30 |         #get backend info (either tensorflow or theano)
 31 |         self.backend = K.backend()
 32 |         
 33 |         #load model supports keras.Model and keras.Sequential
 34 |         if isinstance(model, Sequential):
 35 |             self.model = model.model
 36 |         elif isinstance(model, Model):
 37 |             self.model = model
 38 |         else:
 39 |             print("Invalid input model")
 40 |             return -1
 41 |         
 42 |         #load input tensors
 43 |         self.input_tensors = []
 44 |         for i in self.model.inputs:
 45 |             self.input_tensors.append(i)
 46 |         # The learning phase flag is a bool tensor (0 = test, 1 = train)
 47 |         # to be passed as input to any Keras function that uses 
 48 |         # a different behavior at train time and test time.
 49 |         self.input_tensors.append(K.learning_phase())
 50 |         
 51 |         #If outputchanels are specified, use it.
 52 |         #Otherwise evalueate all outputs.
 53 |         self.outchannels = outchannels
 54 |         if len(self.outchannels) == 0: 
 55 |             if verbose: print("Evaluated output channel (0-based index): All")
 56 |             if K.backend() == "tensorflow":
 57 |                 self.outchannels = range(self.model.output.shape[1]._value)
 58 |             elif K.backend() == "theano":
 59 |                 self.outchannels = range(self.model.output._keras_shape[1])
 60 |         else:
 61 |             if verbose: 
 62 |                 print("Evaluated output channels (0-based index):")
 63 |                 print(','.join([str(i) for i in self.outchannels]))
 64 |                 
 65 |         #Build gradient functions for desired output channels.
 66 |         self.get_gradients = {}
 67 |         if verbose: print("Building gradient functions")
 68 |         
 69 |         # Evaluate over all requested channels.
 70 |         for c in self.outchannels:
 71 |             # Get tensor that calculates gradient
 72 |             if K.backend() == "tensorflow":
 73 |                 gradients = self.model.optimizer.get_gradients(self.model.output[:, c], self.model.input)
 74 |             if K.backend() == "theano":
 75 |                 gradients = self.model.optimizer.get_gradients(self.model.output[:, c].sum(), self.model.input)
 76 |                 
 77 |             # Build computational graph that computes the tensors given inputs
 78 |             self.get_gradients[c] = K.function(inputs=self.input_tensors, outputs=gradients)
 79 |             
 80 |             # This takes a lot of time for a big model with many tasks.
 81 |             # So lets print the progress.
 82 |             if verbose:
 83 |                 sys.stdout.write('\r')
 84 |                 sys.stdout.write("Progress: "+str(int((c+1)*1.0/len(self.outchannels)*1000)*1.0/10)+"%")
 85 |                 sys.stdout.flush()
 86 |         # Done
 87 |         if verbose: print("\nDone.")
 88 |             
 89 |                 
 90 |     '''
 91 |     Input: sample to explain, channel to explain
 92 |     Optional inputs:
 93 |         - reference: reference values (defaulted to 0s).
 94 |         - steps: # steps from reference values to the actual sample (defualted to 50).
 95 |     Output: list of numpy arrays to integrated over.
 96 |     '''
 97 |     def explain(self, sample, outc=0, reference=False, num_steps=50, verbose=0):
 98 |         
 99 |         # Each element for each input stream.
100 |         samples = []
101 |         numsteps = []
102 |         step_sizes = []
103 |         
104 |         # If multiple inputs are present, feed them as list of np arrays. 
105 |         if isinstance(sample, list):
106 |             #If reference is present, reference and sample size need to be equal.
107 |             if reference != False: 
108 |                 assert len(sample) == len(reference)
109 |             for i in range(len(sample)):
110 |                 if reference == False:
111 |                     _output = integrated_gradients.linearly_interpolate(sample[i], False, num_steps)
112 |                 else:
113 |                     _output = integrated_gradients.linearly_interpolate(sample[i], reference[i], num_steps)
114 |                 samples.append(_output[0])
115 |                 numsteps.append(_output[1])
116 |                 step_sizes.append(_output[2])
117 |         
118 |         # Or you can feed just a single numpy arrray. 
119 |         elif isinstance(sample, np.ndarray):
120 |             _output = integrated_gradients.linearly_interpolate(sample, reference, num_steps)
121 |             samples.append(_output[0])
122 |             numsteps.append(_output[1])
123 |             step_sizes.append(_output[2])
124 |             
125 |         # Desired channel must be in the list of outputchannels
126 |         assert outc in self.outchannels
127 |         if verbose: print("Explaning the "+str(self.outchannels[outc])+"th output.")
128 |             
129 |         # For tensorflow backend
130 |         _input = []
131 |         for s in samples:
132 |             _input.append(s)
133 |         _input.append(0)
134 |         
135 |         if K.backend() == "tensorflow": 
136 |             gradients = self.get_gradients[outc](_input)
137 |         elif K.backend() == "theano":
138 |             gradients = self.get_gradients[outc](_input)
139 |             if len(self.model.inputs) == 1:
140 |                 gradients = [gradients]
141 |         
142 |         explanation = []
143 |         for i in range(len(gradients)):
144 |             _temp = np.sum(gradients[i], axis=0)
145 |             explanation.append(np.multiply(_temp, step_sizes[i]))
146 |            
147 |         # Format the return values according to the input sample.
148 |         if isinstance(sample, list):
149 |             return explanation
150 |         elif isinstance(sample, np.ndarray):
151 |             return explanation[0]
152 |         return -1
153 | 
154 |     
155 |     '''
156 |     Input: numpy array of a sample
157 |     Optional inputs:
158 |         - reference: reference values (defaulted to 0s).
159 |         - steps: # steps from reference values to the actual sample.
160 |     Output: list of numpy arrays to integrate over.
161 |     '''
162 |     @staticmethod
163 |     def linearly_interpolate(sample, reference=False, num_steps=50):
164 |         # Use default reference values if reference is not specified
165 |         if reference is False: reference = np.zeros(sample.shape);
166 | 
167 |         # Reference and sample shape needs to match exactly
168 |         assert sample.shape == reference.shape
169 | 
170 |         # Calcuated stepwise difference from reference to the actual sample.
171 |         ret = np.zeros(tuple([num_steps] +[i for i in sample.shape]))
172 |         for s in range(num_steps):
173 |             ret[s] = reference+(sample-reference)*(s*1.0/num_steps)
174 | 
175 |         return ret, num_steps, (sample-reference)*(1.0/num_steps)
176 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Train_AE_Models.py:
--------------------------------------------------------------------------------
###############################
#Script for training AE models
###############################
import sys

#The cancer type is the first command-line argument
cancer_type = sys.argv[1]

#Launch the AE model script ten times; the run index is handed over
#as the second argument of the launched script
for run in range(10):
    command = u"run -i 'AE_2Layers_Model.py' " + " ".join([cancer_type, str(run)])
    get_ipython().magic(command)
9 | 


--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Train_DAE_Models.py:
--------------------------------------------------------------------------------
###############################
#Script for training DAE models
###############################
import sys

#The cancer type is the first command-line argument
cancer_type = sys.argv[1]

#Launch the DAE model script ten times; the run index is handed over
#as the second argument of the launched script
for run in range(10):
    command = u"run -i 'DAE_2Layers_Model.py' " + " ".join([cancer_type, str(run)])
    get_ipython().magic(command)
9 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Lee Lab @ UW Allen School
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_DeepProfile_Ensemble_Weights.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #This script is for creating gene attribution matrices for DeepProfile
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | import sys
 9 | 
10 | #Read user input
11 | cancer_type = sys.argv[1]
12 | 
13 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
14 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 
15 | 
16 | #Read all VAE model gene attributions
17 | L = 150
18 | data_df = pd.read_table(input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(100) + 'L_fold' + str(1) + '.tsv', index_col = 0)
19 | print(data_df.shape)
20 | basic_length = data_df.shape[0]
21 | 
22 | weight_list = []
23 | dims = [5, 10, 25, 50, 75, 100]
24 | run_count = 100
25 | for dim in dims:
26 |     VAE_weights = np.zeros((run_count * dim, basic_length))
27 |     for i in range(run_count):
28 |         data_df = pd.read_table(input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dim) + 'L_fold' + str(i) + '.tsv', index_col = 0)
29 |         data_df = data_df.T
30 |         #print(data_df.shape)
31 |         start = dim * i
32 |         end = dim * (i + 1)
33 |         VAE_weights[start:end, :] = data_df.values
34 |     weight_list.append(VAE_weights)
35 | 
36 | #Read the ensemble labels
37 | labels_df = pd.read_table(input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None)
38 | labels = labels_df.values
39 | print("Ensemble labels ", len(labels))
40 | 
41 | #Concatenate all the gene attributions
42 | joined_weights = np.concatenate(weight_list)
43 | print("Joined weights ", joined_weights.shape)
44 | 
45 | #Create ensemble weights
46 | ensemble_weights = np.zeros((L, joined_weights.shape[1]))
47 | for label in range(L):
48 |     indices = np.where(labels == label)[0]
49 |     average_weights = np.mean(joined_weights[indices, :], axis = 0)
50 |     ensemble_weights[label, :] = average_weights
51 | 
52 | print("Ensemble weights ", ensemble_weights.shape)
53 | 
54 | #Record ensemble weights
55 | ensemble_weights_df = pd.DataFrame(ensemble_weights, index = np.arange(L), columns = data_df.columns)
56 | ensemble_weights_df.to_csv(output_folder + cancer_type + '_DeepProfile_Ensemble_Gene_Importance_Weights_' + str(L) + 'L.tsv', sep = '\t')


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_DeepProfile_Training_Embeddings.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #This script is for creating training embeddings
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | import csv
 8 | import sys
 9 | 
10 | #Read user input
11 | cancer_type = sys.argv[1]
12 | 
13 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
14 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/' 
15 | 
16 | #Read all training embeddings
17 | dims  = [5, 10, 25, 50, 75, 100]
18 | data_list = []
19 | for dim in dims:
20 |     run = 100
21 |     for i in range(run):
22 |         data_df = pd.read_table(input_folder + 'VAE_FILES/' + cancer_type + '_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0)      
23 |         print(data_df.shape)
24 |         data_list.append(data_df.values)
25 | 
26 | joined_data = np.concatenate(data_list, axis=1)
27 | print("Joined training embeddings" , joined_data.shape)
28 | 
29 | #Read the ensemble labels
30 | L = 150
31 | labels_df = pd.read_table(input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None)
32 | labels = labels_df.values
33 | print("Ensemble labels ", len(labels))
34 | 
35 | #Create ensemble training embeddings
36 | ensemble_embeddings = np.zeros((joined_data.shape[0], L))
37 | for label in range(L):
38 |     indices = np.where(labels == label)[0]
39 |     average_values = np.mean(joined_data[:, indices], axis = 1)
40 |     ensemble_embeddings[:, label] = average_values
41 | 
42 | print("Training ensemble embedding ", ensemble_embeddings.shape)
43 | 
44 | #Save the training embedding
45 | ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L))
46 | ensemble_embeddings_df.to_csv(output_folder + cancer_type + '_DeepProfile_Training_Embedding_' + str(L) + 'L.tsv', sep = '\t')


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_Ensemble_Labels.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #This script is for learning ensemble labels for VAE models
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | import csv
 8 | import sys
 9 | from sklearn.cluster import KMeans
10 | 
11 | #Read user inputs
12 | cancer_type = sys.argv[1]
13 | final_dim = int(sys.argv[2])
14 | print("FINAL DIM " + str(final_dim))
15 | 
16 | #Read all training embeddings
17 | dims  = [5, 10, 25, 50, 75, 100]
18 | data_list = []
19 | 
20 | for dim in dims:
21 |     run = 100
22 |     for i in range(run):
23 |         print(i)
24 |         data_df = pd.read_table('../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/' + cancer_type + '_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0)      
25 |         print(data_df.shape)
26 |         data_list.append(data_df.values)
27 | 
28 | joined_df = np.concatenate(data_list, axis=1)
29 | print("Joined training embeddings" , joined_df.shape)
30 | 
31 | #Apply kmeans clustering to this data
32 | X = joined_df
33 | 
34 | kmeans = KMeans(n_clusters= final_dim, random_state=123).fit(X.transpose())
35 | print("K-means labels ", kmeans.labels_)
36 | 
37 | #Save labels
38 | np.savetxt('../ALL_CANCER_FILES/' + cancer_type + '/' + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(final_dim) + 'L.txt' , kmeans.labels_, delimiter=',')
39 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_PCs_for_DeepLearning_Models.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #This script is for PCA transforming the input data to pass to deep learning models
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import PCA
 9 | import sys
10 | 
11 | #Read cancer type input
12 | cancer_type = sys.argv[1]
13 | #Read number of components
14 | component_count = int(sys.argv[2])
15 | 
16 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
17 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
18 | 
19 | #Method for creating PCs
20 | def createPCs(cancer_type):
21 |     
22 |     print("************************* " + cancer_type)
23 |     
24 |     #Read training data
25 |     data_df = pd.read_table(input_folder  + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
26 |     print("Training data ", data_df.shape)
27 |     training_data = data_df.values
28 |     training_data = np.nan_to_num(training_data)
29 | 
30 |     #Transform training data to top principal components
31 |     pca = PCA(n_components = component_count)
32 |     pca.fit(training_data)
33 |     components = pca.components_
34 |     print("PCA components ", components.shape)
35 | 
36 |     #Save the learned components
37 |     component_df = pd.DataFrame(components.T, index = data_df.columns)
38 |     component_df.to_csv(output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_' + str(component_count) + 'L_COMPONENTS.tsv', sep = '\t')
39 | 
40 |     #Save the encoded data
41 |     encoded_data = pca.transform(training_data)
42 |     print("PCA encoded data ", encoded_data.shape)
43 |     encoded_df = pd.DataFrame(encoded_data, index = data_df.index)
44 |     encoded_df.to_csv(output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_' + str(component_count) + 'L.tsv', sep = '\t')
45 | 
46 | createPCs(cancer_type)


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Example_Run_All.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Example for training model for a cancer type
 3 | ###############################
 4 | import sys
 5 | 
 6 | ##STEP 1: Creating PCs
 7 | get_ipython().magic(u"run -i Create_PCs_for_DeepLearning_Models.py BRCA 1000")
 8 | 
 9 | ##STEP 2: Training VAE models
10 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 5 0 100")
11 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 10 0 100")
12 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 25 0 100")
13 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 50 0 100")
14 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 75 0 100")
15 | get_ipython().magic(u"run -i Run_VAE_Models.py BRCA 100 0 100")
16 | 
17 | ##STEP 3: Running IG for VAE models
18 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 5 0 100")
19 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 10 0 100")
20 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 25 0 100")
21 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 50 0 100")
22 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 75 0 100")
23 | get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA 100 0 100")
24 | 
25 | ##STEP 4: Learning ensemble labels 
26 | get_ipython().magic(u"run -i Create_Ensemble_Labels.py BRCA 150")
27 | 
28 | ##STEP 5: Creating DeepProfile ensemble training embedding
29 | get_ipython().magic(u"run -i Create_DeepProfile_Training_Embeddings.py BRCA")
30 | 
31 | ##STEP 6: Creating DeepProfile ensemble gene attribution matrices
32 | get_ipython().magic(u"run -i Create_DeepProfile_Ensemble_Weights.py BRCA")
33 | 
34 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Get_VAE_IG_Attributions.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Script for running integrated gradients to get gene-level attributions of each node
  3 | ###############################
  4 | 
  5 | import os
  6 | import numpy as np
  7 | import pandas as pd
  8 | import math 
  9 | from sklearn.metrics import mean_squared_error
 10 | import tensorflow as tf
 11 | from keras.layers import Input, Dense, Lambda, Layer, Activation
 12 | from keras.layers.normalization import BatchNormalization
 13 | from keras.models import Model
 14 | from keras import backend as K
 15 | from keras import metrics, optimizers
 16 | from keras.callbacks import Callback
 17 | import keras
 18 | import csv
 19 | from keras.models import model_from_json
 20 | import sys
 21 | 
 22 | #Prevent tensorflow from using all the memory
 23 | config = tf.ConfigProto()
 24 | config.gpu_options.allow_growth=True
 25 | sess = tf.Session(config=config)
 26 | 
 27 | #Read all user inputs
 28 | cancer = sys.argv[1]
 29 | dimension = int(sys.argv[2])
 30 | start = int(sys.argv[3])
 31 | end = int(sys.argv[4])
 32 | 
 33 | print("CANCER " + str(cancer))
 34 | print("DIM " + str(dimension))
 35 | print("START " + str(start)) 
 36 | print("END " + str(end)) 
 37 | 
 38 | input_folder = '../ALL_CANCER_FILES/' + cancer + '/' 
 39 | output_folder = '../ALL_CANCER_FILES/' + cancer + '/VAE_WEIGHTS/' 
 40 | 
 41 | #Load PCA weights
 42 | pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col = 0)
 43 | print("PCA COMPONENTS ",  pca_df.shape)
 44 | pca_components = pca_df.values
 45 | 
 46 |  #Read input data
 47 | input_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv', index_col=0)
 48 | print("INPUT FILE ", input_df.shape)
 49 | 
 50 | #VAE loss definition
 51 | def vae_loss(x_input, x_decoded):
 52 |     reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
 53 |     kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
 54 |     return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss))
 55 | 
 56 | #Save the weight for each run
 57 | for vae_run in range(start, end):
 58 |     
 59 |     print("MODEL " + str(vae_run))
 60 |     
 61 |     #Load model
 62 |     json_file = open(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.json', 'r')
 63 |     loaded_model_json = json_file.read()
 64 |     json_file.close()
 65 |     encoder = model_from_json(loaded_model_json)
 66 |     
 67 |     #Load weights
 68 |     encoder.load_weights(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.h5')
 69 |     print("Loaded model from disk")
 70 | 
 71 |     #Define hyperparameters
 72 |     input_df_training = input_df
 73 |     original_dim = input_df_training.shape[1]
 74 |     intermediate1_dim = 100
 75 |     intermediate2_dim = 25
 76 |     latent_dim = dimension
 77 | 
 78 |     batch_size = 50
 79 |     epochs = 50
 80 |     learning_rate = 0.0005
 81 |     beta = K.variable(1)
 82 |     kappa = 0
 83 | 
 84 |     #Encoder network
 85 |     x = Input(shape=(original_dim, ))
 86 | 
 87 |     net = Dense(intermediate1_dim)(x)
 88 |     net2 = BatchNormalization()(net)
 89 |     net3 = Activation('relu')(net2)
 90 | 
 91 |     net4 = Dense(intermediate2_dim)(net3)
 92 |     net5 = BatchNormalization()(net4)
 93 |     net6 = Activation('relu')(net5)
 94 | 
 95 |     z_mean = Dense(latent_dim)(net6)
 96 |     z_log_var = Dense(latent_dim)(net6)
 97 | 
 98 |     adam = optimizers.Adam(lr=learning_rate)
 99 |     encoder.compile(optimizer=adam, loss = vae_loss)
100 |     encoder.summary()
101 | 
102 |     #Encode training data using the model
103 |     training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
104 |     print("ENCODED TRAINING DATA ", training_encoded.shape)
105 | 
106 | 
107 |     #Measure weights and save absolute value of importance, averaged over samples
108 |     from IntegratedGradients import *
109 | 
110 |     ig = integrated_gradients(encoder)
111 | 
112 |     overall_weights = np.zeros((pca_components.shape[0], dimension))
113 | 
114 |     #Go over each node
115 |     for latent in range(dimension):
116 |         print("Node " + str(latent + 1))
117 |         weights = np.zeros((pca_components.shape[0], input_df_training.shape[0]))
118 |         
119 |         #Go over each sample
120 |         for i in range(input_df_training.shape[0]):
121 |             #print("Sample " + str(i + 1))
122 |             vals = ig.explain(input_df_training.values[i, :], latent)    
123 |             new_vals = np.matmul(vals, pca_components.T)
124 |             weights[:, i] = new_vals
125 |             
126 |         #Take absolute values avg over all samples 
127 |         overall_weights[:, latent] = np.mean(np.abs(weights), axis = 1)
128 | 
129 |     ig_df = pd.DataFrame(overall_weights, index = pca_df.index)
130 |     print("EXPLANATIONS DF ", ig_df.shape)
131 |     
132 |     ig_df.to_csv(output_folder + cancer + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv', sep='\t', quoting = csv.QUOTE_NONE)
133 |     
134 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/IntegratedGradients.py:
--------------------------------------------------------------------------------
################################################################
# Implemented by Naozumi Hiranuma (hiranumn@uw.edu)            #
#                                                              #
# Keras-compatible implementation of Integrated Gradients      #
# proposed in "Axiomatic Attribution for Deep Networks"        #
# (https://arxiv.org/abs/1703.01365).                          #
#                                                              #
# Keywords: Shapley values, interpretable machine learning     #
################################################################
 10 | 
 11 | from __future__ import division, print_function
 12 | import numpy as np
 13 | from time import sleep
 14 | import sys
 15 | import keras.backend as K
 16 | 
 17 | from keras.models import Model, Sequential
 18 | 
 19 | '''
 20 | Integrated gradients approximates Shapley values by integrating partial
 21 | gradients with respect to input features from reference input to the
 22 | actual input. The following class implements the paper "Axiomatic attribution
 23 | for deep neuron networks".
 24 | '''
 25 | class integrated_gradients:
 26 |     # model: Keras model that you wish to explain.
 27 |     # outchannels: In case the model are multi tasking, you can specify which output you want explain .
 28 |     def __init__(self, model, outchannels=[], verbose=1):
 29 |     
 30 |         #get backend info (either tensorflow or theano)
 31 |         self.backend = K.backend()
 32 |         
 33 |         #load model supports keras.Model and keras.Sequential
 34 |         if isinstance(model, Sequential):
 35 |             self.model = model.model
 36 |         elif isinstance(model, Model):
 37 |             self.model = model
 38 |         else:
 39 |             print("Invalid input model")
 40 |             return -1
 41 |         
 42 |         #load input tensors
 43 |         self.input_tensors = []
 44 |         for i in self.model.inputs:
 45 |             self.input_tensors.append(i)
 46 |         # The learning phase flag is a bool tensor (0 = test, 1 = train)
 47 |         # to be passed as input to any Keras function that uses 
 48 |         # a different behavior at train time and test time.
 49 |         self.input_tensors.append(K.learning_phase())
 50 |         
 51 |         #If outputchanels are specified, use it.
 52 |         #Otherwise evalueate all outputs.
 53 |         self.outchannels = outchannels
 54 |         if len(self.outchannels) == 0: 
 55 |             if verbose: print("Evaluated output channel (0-based index): All")
 56 |             if K.backend() == "tensorflow":
 57 |                 self.outchannels = range(self.model.output.shape[1]._value)
 58 |             elif K.backend() == "theano":
 59 |                 self.outchannels = range(self.model.output._keras_shape[1])
 60 |         else:
 61 |             if verbose: 
 62 |                 print("Evaluated output channels (0-based index):")
 63 |                 print(','.join([str(i) for i in self.outchannels]))
 64 |                 
 65 |         #Build gradient functions for desired output channels.
 66 |         self.get_gradients = {}
 67 |         if verbose: print("Building gradient functions")
 68 |         
 69 |         # Evaluate over all requested channels.
 70 |         for c in self.outchannels:
 71 |             # Get tensor that calculates gradient
 72 |             if K.backend() == "tensorflow":
 73 |                 gradients = self.model.optimizer.get_gradients(self.model.output[:, c], self.model.input)
 74 |             if K.backend() == "theano":
 75 |                 gradients = self.model.optimizer.get_gradients(self.model.output[:, c].sum(), self.model.input)
 76 |                 
 77 |             # Build computational graph that computes the tensors given inputs
 78 |             self.get_gradients[c] = K.function(inputs=self.input_tensors, outputs=gradients)
 79 |             
 80 |             # This takes a lot of time for a big model with many tasks.
 81 |             # So lets print the progress.
 82 |             if verbose:
 83 |                 sys.stdout.write('\r')
 84 |                 sys.stdout.write("Progress: "+str(int((c+1)*1.0/len(self.outchannels)*1000)*1.0/10)+"%")
 85 |                 sys.stdout.flush()
 86 |         # Done
 87 |         if verbose: print("\nDone.")
 88 |             
 89 |                 
 90 |     '''
 91 |     Input: sample to explain, channel to explain
 92 |     Optional inputs:
 93 |         - reference: reference values (defaulted to 0s).
 94 |         - steps: # steps from reference values to the actual sample (defualted to 50).
 95 |     Output: list of numpy arrays to integrated over.
 96 |     '''
 97 |     def explain(self, sample, outc=0, reference=False, num_steps=50, verbose=0):
 98 |         
 99 |         # Each element for each input stream.
100 |         samples = []
101 |         numsteps = []
102 |         step_sizes = []
103 |         
104 |         # If multiple inputs are present, feed them as list of np arrays. 
105 |         if isinstance(sample, list):
106 |             #If reference is present, reference and sample size need to be equal.
107 |             if reference != False: 
108 |                 assert len(sample) == len(reference)
109 |             for i in range(len(sample)):
110 |                 if reference == False:
111 |                     _output = integrated_gradients.linearly_interpolate(sample[i], False, num_steps)
112 |                 else:
113 |                     _output = integrated_gradients.linearly_interpolate(sample[i], reference[i], num_steps)
114 |                 samples.append(_output[0])
115 |                 numsteps.append(_output[1])
116 |                 step_sizes.append(_output[2])
117 |         
118 |         # Or you can feed just a single numpy arrray. 
119 |         elif isinstance(sample, np.ndarray):
120 |             _output = integrated_gradients.linearly_interpolate(sample, reference, num_steps)
121 |             samples.append(_output[0])
122 |             numsteps.append(_output[1])
123 |             step_sizes.append(_output[2])
124 |             
125 |         # Desired channel must be in the list of outputchannels
126 |         assert outc in self.outchannels
127 |         if verbose: print("Explaning the "+str(self.outchannels[outc])+"th output.")
128 |             
129 |         # For tensorflow backend
130 |         _input = []
131 |         for s in samples:
132 |             _input.append(s)
133 |         _input.append(0)
134 |         
135 |         if K.backend() == "tensorflow": 
136 |             gradients = self.get_gradients[outc](_input)
137 |         elif K.backend() == "theano":
138 |             gradients = self.get_gradients[outc](_input)
139 |             if len(self.model.inputs) == 1:
140 |                 gradients = [gradients]
141 |         
142 |         explanation = []
143 |         for i in range(len(gradients)):
144 |             _temp = np.sum(gradients[i], axis=0)
145 |             explanation.append(np.multiply(_temp, step_sizes[i]))
146 |            
147 |         # Format the return values according to the input sample.
148 |         if isinstance(sample, list):
149 |             return explanation
150 |         elif isinstance(sample, np.ndarray):
151 |             return explanation[0]
152 |         return -1
153 | 
154 |     
155 |     '''
156 |     Input: numpy array of a sample
157 |     Optional inputs:
158 |         - reference: reference values (defaulted to 0s).
159 |         - steps: # steps from reference values to the actual sample.
160 |     Output: list of numpy arrays to integrate over.
161 |     '''
162 |     @staticmethod
163 |     def linearly_interpolate(sample, reference=False, num_steps=50):
164 |         # Use default reference values if reference is not specified
165 |         if reference is False: reference = np.zeros(sample.shape);
166 | 
167 |         # Reference and sample shape needs to match exactly
168 |         assert sample.shape == reference.shape
169 | 
170 |         # Calcuated stepwise difference from reference to the actual sample.
171 |         ret = np.zeros(tuple([num_steps] +[i for i in sample.shape]))
172 |         for s in range(num_steps):
173 |             ret[s] = reference+(sample-reference)*(s*1.0/num_steps)
174 | 
175 |         return ret, num_steps, (sample-reference)*(1.0/num_steps)
176 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Run_VAE_Models.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for training VAE models
 3 | ###############################
 4 | import sys
 5 | 
 6 | cancer_type = sys.argv[1]
 7 | latent = int(sys.argv[2])
 8 | start = int(sys.argv[3])
 9 | end = int(sys.argv[4])
10 | 
11 | if latent == 5:  
12 |     dim1 = 100
13 |     dim2 = 25
14 | if latent == 10:   
15 |     dim1 = 250
16 |     dim2 = 50
17 | if latent == 25:  
18 |     dim1 = 250
19 |     dim2 = 100
20 | if latent == 50:  
21 |     dim1 = 250
22 |     dim2 = 100
23 | if latent == 75:  
24 |     dim1 = 250
25 |     dim2 = 100
26 | if latent == 100:   
27 |     dim1 = 250
28 |     dim2 = 100
29 | 
30 | for run in range(start, end):
31 |     get_ipython().magic(u"run -i 'VAE_3Layers_Model.py' '" +  cancer_type + "' " + str(dim1) + " " + str(dim2) + " " + str(latent) + " " + str(run))
32 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/Select_Latent_Dimension_with_Gmeans.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "###############################\n",
 10 |     "#g-means training to select ensemble latent dimension size\n",
 11 |     "\n",
 12 |     "###############################\n",
 13 |     "\n",
 14 |     "import numpy as np\n",
 15 |     "import pandas as pd\n",
 16 |     "import csv\n",
 17 |     "from sklearn.decomposition import PCA\n",
 18 |     "import sklearn.preprocessing\n",
 19 |     "\n",
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "import csv\n",
 23 |     "import sys"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 2,
 29 |    "metadata": {},
 30 |    "outputs": [
 31 |     {
 32 |      "name": "stdout",
 33 |      "output_type": "stream",
 34 |      "text": [
 35 |       "************************* BRCA\n",
 36 |       "Joined_df  (11963, 26500)\n",
 37 |       "(26500, 11963)\n"
 38 |      ]
 39 |     },
 40 |     {
 41 |      "name": "stderr",
 42 |      "output_type": "stream",
 43 |      "text": [
 44 |       "/homes/gws/abdincer/.local/lib/python3.6/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
 45 |       "  import pandas.util.testing as tm\n"
 46 |      ]
 47 |     },
 48 |     {
 49 |      "name": "stdout",
 50 |      "output_type": "stream",
 51 |      "text": [
 52 |       "Selected dimension  228\n",
 53 |       "************************* COLON\n",
 54 |       "Joined_df  (5616, 26500)\n",
 55 |       "(26500, 5616)\n",
 56 |       "Selected dimension  195\n",
 57 |       "************************* LUNG\n",
 58 |       "Joined_df  (4869, 26500)\n",
 59 |       "(26500, 4869)\n",
 60 |       "Selected dimension  166\n",
 61 |       "************************* AML\n",
 62 |       "Joined_df  (6534, 26500)\n",
 63 |       "(26500, 6534)\n",
 64 |       "Selected dimension  57\n",
 65 |       "************************* BRAIN\n",
 66 |       "Joined_df  (4282, 26500)\n",
 67 |       "(26500, 4282)\n",
 68 |       "Selected dimension  192\n",
 69 |       "************************* SKIN\n",
 70 |       "Joined_df  (1240, 26500)\n",
 71 |       "(26500, 1240)\n",
 72 |       "Selected dimension  165\n",
 73 |       "************************* SARCOMA\n",
 74 |       "Joined_df  (2330, 26500)\n",
 75 |       "(26500, 2330)\n",
 76 |       "Selected dimension  162\n",
 77 |       "************************* LIVER\n",
 78 |       "Joined_df  (1937, 26500)\n",
 79 |       "(26500, 1937)\n",
 80 |       "Selected dimension  168\n",
 81 |       "************************* KIDNEY\n",
 82 |       "Joined_df  (2293, 26500)\n",
 83 |       "(26500, 2293)\n",
 84 |       "Selected dimension  123\n",
 85 |       "************************* OV\n",
 86 |       "Joined_df  (2714, 26500)\n",
 87 |       "(26500, 2714)\n",
 88 |       "Selected dimension  178\n",
 89 |       "************************* PROSTATE\n",
 90 |       "Joined_df  (1195, 26500)\n",
 91 |       "(26500, 1195)\n",
 92 |       "Selected dimension  163\n",
 93 |       "************************* CERVICAL\n",
 94 |       "Joined_df  (443, 26500)\n",
 95 |       "(26500, 443)\n",
 96 |       "Selected dimension  142\n",
 97 |       "************************* BLADDER\n",
 98 |       "Joined_df  (371, 26500)\n",
 99 |       "(26500, 371)\n",
100 |       "Selected dimension  136\n",
101 |       "************************* STOMACH\n",
102 |       "Joined_df  (1742, 26500)\n",
103 |       "(26500, 1742)\n",
104 |       "Selected dimension  137\n",
105 |       "************************* THYROID\n",
106 |       "Joined_df  (776, 26500)\n",
107 |       "(26500, 776)\n",
108 |       "Selected dimension  160\n",
109 |       "************************* UTERINE\n",
110 |       "Joined_df  (661, 26500)\n",
111 |       "(26500, 661)\n",
112 |       "Selected dimension  156\n",
113 |       "************************* HEAD_NECK\n",
114 |       "Joined_df  (643, 26500)\n",
115 |       "(26500, 643)\n",
116 |       "Selected dimension  156\n",
117 |       "************************* PANCREAS\n",
118 |       "Joined_df  (602, 26500)\n",
119 |       "(26500, 602)\n",
120 |       "Selected dimension  145\n"
121 |      ]
122 |     }
123 |    ],
124 |    "source": [
125 |     "cancer_types = ['BRCA', 'COLON', 'LUNG', 'AML',\n",
126 |     "                'BRAIN', 'SKIN', 'SARCOMA', 'LIVER', \n",
127 |     "                'KIDNEY', 'OV','PROSTATE', 'CERVICAL', \n",
128 |     "                'BLADDER', 'STOMACH', 'THYROID', 'UTERINE', \n",
129 |     "                'HEAD_NECK', 'PANCREAS']\n",
130 |     "    \n",
131 |     "L_values = []\n",
132 |     "for cancer_type in cancer_types:\n",
133 |     "    print(\"************************* \" + cancer_type)\n",
134 |     "    input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/'  \n",
135 |     "\n",
136 |     "    #all encodings for one dimension\n",
137 |     "    dims  = [5, 10, 25, 50, 75, 100]\n",
138 |     "\n",
139 |     "    data_list = []\n",
140 |     "\n",
141 |     "    for dim in dims:\n",
142 |     "        run = 100\n",
143 |     "        for i in range(run):\n",
144 |     "            #print(i + 1)\n",
145 |     "            data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0)      \n",
146 |     "            #print(data_df.shape)\n",
147 |     "            data_list.append(data_df.values)\n",
148 |     "\n",
149 |     "\n",
150 |     "    joined_df = np.concatenate(data_list, axis=1)\n",
151 |     "    print(\"Joined_df \", joined_df.shape)\n",
152 |     "\n",
153 |     "    #Apply kmeans clustering to this data\n",
154 |     "    from sklearn.cluster import KMeans\n",
155 |     "    import numpy as np\n",
156 |     "    X = joined_df.T\n",
157 |     "    print(X.shape)\n",
158 |     "    \n",
159 |     "    from gmeans import *\n",
160 |     "    gmeans = GMeans(strictness=3, random_state = 12345)\n",
161 |     "    gmeans.fit(X)\n",
162 |     "    gmeans.labels_\n",
163 |     "    selected_L = len(np.unique(gmeans.labels_))\n",
164 |     "    print(\"Selected dimension \", selected_L)\n",
165 |     "    \n",
166 |     "    L_values.append(selected_L)\n",
167 |     "\n"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 3,
173 |    "metadata": {},
174 |    "outputs": [
175 |     {
176 |      "name": "stdout",
177 |      "output_type": "stream",
178 |      "text": [
179 |       "[228, 195, 166, 57, 192, 165, 162, 168, 123, 178, 163, 142, 136, 137, 160, 156, 156, 145]\n",
180 |       "157.16666666666666\n"
181 |      ]
182 |     }
183 |    ],
184 |    "source": [
185 |     "print(L_values)\n",
186 |     "print(np.mean(L_values))"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": []
195 |   }
196 |  ],
197 |  "metadata": {
198 |   "kernelspec": {
199 |    "display_name": "Python 3",
200 |    "language": "python",
201 |    "name": "python3"
202 |   },
203 |   "language_info": {
204 |    "codemirror_mode": {
205 |     "name": "ipython",
206 |     "version": 3
207 |    },
208 |    "file_extension": ".py",
209 |    "mimetype": "text/x-python",
210 |    "name": "python",
211 |    "nbconvert_exporter": "python",
212 |    "pygments_lexer": "ipython3",
213 |    "version": "3.6.8"
214 |   }
215 |  },
216 |  "nbformat": 4,
217 |  "nbformat_minor": 4
218 | }
219 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/VAE_3Layers_Model.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #VAE model
  3 | 
  4 | #Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
  5 | ###############################
  6 | 
  7 | import os
  8 | import numpy as np
  9 | import pandas as pd
 10 | import math 
 11 | from sklearn.metrics import mean_squared_error
 12 | import matplotlib.pyplot as plt
 13 | import tensorflow as tf
 14 | from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
 15 | from keras.layers.normalization import BatchNormalization
 16 | from keras.models import Model
 17 | from keras import backend as K
 18 | from keras import metrics, optimizers
 19 | from keras.callbacks import Callback
 20 | import keras
 21 | import csv
 22 | import sys
 23 | 
 24 | #Prevent tensorflow from using all the memory
 25 | config = tf.ConfigProto()
 26 | config.gpu_options.allow_growth=True
 27 | sess = tf.Session(config=config)
 28 | 
 29 | 
 30 | # Method for reparameterization trick to make model differentiable
 31 | def sampling(args):
 32 |     
 33 |     # Function with args required for Keras Lambda function
 34 |     z_mean, z_log_var = args
 35 | 
 36 |     # Draw epsilon of the same shape from a standard normal distribution
 37 |     epsilon = K.random_normal(shape=K.shape(z_mean), mean=0., stddev=1.0)
 38 |     
 39 |     # The latent vector is non-deterministic and differentiable
 40 |     # in respect to z_mean and z_log_var
 41 |     z = z_mean + K.exp(z_log_var / 2) * epsilon
 42 |     return z
 43 | 
 44 | #Method for defining the VAE loss
 45 | def vae_loss(x_input, x_decoded):
 46 |     
 47 |     reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
 48 |     kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
 49 |     
 50 |     return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss))
 51 | 
 52 | #Method for calculating the reconstruction loss
 53 | def reconstruction_loss(x_input, x_decoded):
 54 |     
 55 |     return metrics.mse(x_input, x_decoded)
 56 | 
 57 | #Method for calculating the KL-divergence loss
 58 | def kl_loss(x_input, x_decoded):
 59 |     return - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
 60 | 
 61 | class WarmUpCallback(Callback):
 62 |     def __init__(self, beta, kappa):
 63 |         self.beta = beta
 64 |         self.kappa = kappa
 65 |     
 66 |     # Behavior on each epoch
 67 |     def on_epoch_end(self, epoch, logs={}):
 68 |         if K.get_value(self.beta) <= 1:
 69 |             K.set_value(self.beta, K.get_value(self.beta) + self.kappa)
 70 | 
# ---------------------------------------------------------------------------
# Training script body.
# Command-line arguments: 1 = cancer type, 2 = first hidden layer width,
# 3 = second hidden layer width, 4 = latent dimension, 5 = fold/run index.
# ---------------------------------------------------------------------------

#Read input file
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/'

input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
output_filename = cancer_type + '_DATA_TOP2_JOINED_encoded_'

# First column of the TSV is used as the row index.
input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE", input_df.shape)
print(input_df.head(5))

# Set hyperparameters
# The input width is taken from the data; the layer sizes come from argv.
original_dim = input_df.shape[1]
intermediate1_dim = int(sys.argv[2])
intermediate2_dim = int(sys.argv[3])
latent_dim = int(sys.argv[4])
fold = int(sys.argv[5])

#SET RANDOM SEEDS
# NumPy and TensorFlow are both seeded with 123456 * fold, so each fold index
# gets its own seed (fold 0 gives seed 0).
from numpy.random import seed
seed(123456 * fold)
from tensorflow import set_random_seed
set_random_seed(123456 * fold)


init_mode = 'glorot_uniform'
batch_size = 50
epochs = 50
learning_rate = 0.0005
# beta is the KL weight variable handed to WarmUpCallback; kappa is the amount
# that callback adds per epoch. With kappa = 0, beta is left at 1.
beta = K.variable(1)
kappa = 0

# The whole input is used for training; this script makes no held-out split.
input_df_training = input_df

#Define encoder
x = Input(shape=(original_dim, ))

# Two hidden blocks, each Dense -> BatchNormalization -> ReLU.
net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation('relu')(net5)

# Two linear heads on the last hidden block: latent mean and log-variance.
z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)

# Sample from mean and var
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

#Define decoder
# The layers are created once as objects so that the same weights can be
# reused further down to build the stand-alone decoder model.
decoder_h = Dense(intermediate2_dim, activation='relu', kernel_initializer=init_mode)
decoder_h2 = Dense(intermediate1_dim, activation='relu', kernel_initializer=init_mode)
decoder_mean = Dense(original_dim, kernel_initializer=init_mode)

h_decoded = decoder_h(z)
h_decoded2 = decoder_h2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

#VAE model
vae = Model(x, x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
# reconstruction_loss and kl_loss are reported as metrics next to the loss.
vae.compile(optimizer=adam, loss = vae_loss, metrics = [reconstruction_loss, kl_loss])
vae.summary()

#Train model
# Autoencoder training: the input doubles as the target.
history  = vae.fit(np.array(input_df_training), np.array(input_df_training),
               shuffle=True,
               epochs=epochs,
               batch_size=batch_size,
               verbose = 2,
               callbacks=[WarmUpCallback(beta, kappa)])

# DEFINE ENCODER
# The encoder outputs z_mean (not the sampled z), so encodings are
# deterministic for a given input.
encoder = Model(x, z_mean)

#DEFINE DECODER
# Stand-alone decoder that reuses the trained decoder layers.
decoder_input = Input(shape=(latent_dim, )) 
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = decoder_h2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)


# Encode the training data, keeping the original row index.
training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
training_encoded_df = pd.DataFrame(training_encoded, index = input_df_training.index)

# How well does the model reconstruct the input data
# Reconstructions are decoded from the z_mean encodings computed above.
training_reconstructed = decoder.predict(np.array(training_encoded_df))
training_reconstructed_df = pd.DataFrame(training_reconstructed, index = input_df_training.index, columns = input_df_training.columns)

recons_error = mean_squared_error(np.array(input_df_training), np.array(training_reconstructed_df))

print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))

#Save encoded training data
training_encoded_df.to_csv(output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv", sep = '\t')


#SAVE ENCODER MODEL
# Architecture is written as JSON and weights as HDF5, with matching file
# names. NOTE(review): model_from_json is imported here but not used in the
# rest of this script.
from keras.models import model_from_json

model_json = encoder.to_json()
with open(output_folder + "VAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

encoder.save_weights(output_folder + "VAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


#SAVE DECODER MODEL
model_json = decoder.to_json()
with open(output_folder + "VAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

decoder.save_weights(output_folder + "VAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


#Calculate training r squared
from sklearn.metrics import r2_score

# R^2 is computed separately for every row (input row vs. its
# reconstruction); the mean over rows is reported.
training_r2_vals = np.zeros(input_df_training.shape[0])
for i in range(input_df_training.shape[0]):
    training_r2 = r2_score(input_df_training.values[i, :], training_reconstructed_df.values[i, :])
    training_r2_vals[i] = training_r2

print("TRAINING R2 " + str(np.mean(training_r2_vals)))
202 | 


--------------------------------------------------------------------------------
/MODEL_TRAININGS/gmeans.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Code is  from https://github.com/flylo/g-means
  3 | ###############################
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from matplotlib import pyplot as plt
  8 | import seaborn as sbn
  9 | 
 10 | from sklearn.cluster import MiniBatchKMeans
 11 | from sklearn.preprocessing import scale
 12 | 
 13 | from sklearn import datasets
 14 | 
 15 | from scipy.stats import anderson
 16 | 
 17 | from pdb import set_trace
 18 | 
 19 | 
 20 | class GMeans(object):
 21 | 	
 22 | 	"""strictness = how strict should the anderson-darling test for normality be
 23 | 			0: not at all strict
 24 | 			4: very strict
 25 | 	"""
 26 | 
 27 | 	def __init__(self, min_obs=1, max_depth=10, random_state=None, strictness=4):
 28 | 
 29 | 		super(GMeans, self).__init__()
 30 | 
 31 | 		self.max_depth = max_depth
 32 | 		
 33 | 		self.min_obs = min_obs
 34 | 
 35 | 		self.random_state = random_state
 36 | 
 37 | 		if strictness not in range(5):
 38 | 			raise ValueError("strictness parameter must be integer from 0 to 4")
 39 | 		self.strictness = strictness
 40 | 
 41 | 		self.stopping_criteria = []
 42 | 		
 43 | 	def _gaussianCheck(self, vector):
 44 | 		"""
 45 | 		check whether a given input vector follows a gaussian distribution
 46 | 		H0: vector is distributed gaussian
 47 | 		H1: vector is not distributed gaussian
 48 | 		"""
 49 | 		output = anderson(vector)
 50 | 
 51 | 		if output[0] <= output[1][self.strictness]:
 52 | 			return True
 53 | 		else:
 54 | 			return False
 55 | 		
 56 | 	
 57 | 	def _recursiveClustering(self, data, depth, index):
 58 | 		"""
 59 | 		recursively run kmeans with k=2 on your data until a max_depth is reached or we have
 60 | 			gaussian clusters
 61 | 		"""
 62 | 		depth += 1
 63 | 		if depth == self.max_depth:
 64 | 			self.data_index[index[:, 0]] = index
 65 | 			self.stopping_criteria.append('max_depth')
 66 | 			return
 67 | 			
 68 | 		km = MiniBatchKMeans(n_clusters=2, random_state=self.random_state)
 69 | 		km.fit(data)
 70 | 		
 71 | 		centers = km.cluster_centers_
 72 | 		v = centers[0] - centers[1]
 73 | 		x_prime = scale(data.dot(v) / (v.dot(v)))
 74 | 		gaussian = self._gaussianCheck(x_prime)
 75 | 		
 76 | 		# print gaussian
 77 | 
 78 | 		if gaussian == True:
 79 | 			self.data_index[index[:, 0]] = index
 80 | 			self.stopping_criteria.append('gaussian')
 81 | 			return
 82 | 
 83 | 		labels = set(km.labels_)
 84 | 		for k in labels:
 85 | 			current_data = data[km.labels_ == k]
 86 | 
 87 | 			if current_data.shape[0] <= self.min_obs:
 88 | 				self.data_index[index[:, 0]] = index
 89 | 				self.stopping_criteria.append('min_obs')
 90 | 				return
 91 | 			
 92 | 
 93 | 			current_index = index[km.labels_==k]
 94 | 			current_index[:, 1] = np.random.randint(0,100000000000)
 95 | 			self._recursiveClustering(data=current_data, depth=depth, index=current_index)
 96 | 
 97 | 		# set_trace()
 98 | 	
 99 | 
100 | 	def fit(self, data):
101 | 		"""
102 | 		fit the recursive clustering model to the data
103 | 		"""
104 | 		self.data = data
105 | 		
106 | 		data_index = np.array([(i, False) for i in range(data.shape[0])])
107 | 		self.data_index = data_index
108 | 
109 | 		self._recursiveClustering(data=data, depth=0, index=data_index)
110 | 
111 | 		self.labels_ = self.data_index[:, 1]
112 | 		
113 | 		
if __name__ == '__main__':
	# Demo: cluster four synthetic 2-D blobs with GMeans and with a plain
	# 4-cluster MiniBatchKMeans, then plot both labelings for comparison.
	blobs = datasets.make_blobs(
		n_samples=10000, n_features=2, centers=4, cluster_std=1.0)[0]

	gmeans = GMeans(strictness=4, random_state=1010)
	gmeans.fit(blobs)

	plot_data = pd.DataFrame(blobs[:, 0:2], columns=['x', 'y'])
	plot_data['labels_gmeans'] = gmeans.labels_

	km = MiniBatchKMeans(n_clusters=4)
	km.fit(blobs)
	plot_data['labels_km'] = km.labels_

	# One scatter plot per labeling, no regression line.
	for hue_column in ('labels_gmeans', 'labels_km'):
		sbn.lmplot(x='x', y='y', data=plot_data, hue=hue_column, fit_reg=False)
	plt.show()

	# Drop into the debugger after the plots are closed so that the results
	# can be inspected interactively.
	set_trace()
140 | 
141 | 
142 | 


--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Create_DeepProfile_GTEX_Embeddings.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating GTEX DeepProfile embeddings
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | import csv
 8 | import sys
 9 | 
#Read cancer type from user
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/HEALTHY_TISSUE_FILES/'

#Latent sizes, and number of runs per size, whose GTEX encodings are combined
dims  = [5, 10, 25, 50, 75, 100]
run = 100

#Part of the file name shared by every per-model GTEX encoding
encoding_prefix = input_folder + 'HEALTHY_TISSUE_FILES/GTEX_' + cancer_type + '_RNASeq_Expression_VAE_encoded_'

#Read all VAE embeddings
data_list = []
for dim in dims:
    dim_prefix = encoding_prefix + str(dim) + 'L_'
    for i in range(run):
        data_df = pd.read_table(dim_prefix + str(i) + '.tsv', index_col = 0)
        print("GTEX VAE embedding ", data_df.shape)
        data_list.append(data_df.values)

#Concatenate all embeddings column-wise (same rows, all latent nodes side by side)
joined_data = np.concatenate(data_list, axis=1)
print("Joined VAE embedding ",joined_data.shape)

#Read DeepProfile ensemble labels (one label per row of the labels file)
L = 150
labels_filename = input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt'
labels_df = pd.read_table(labels_filename, header= None)
labels = labels_df.values
print("DeepProfile ensemble labels ", len(labels))

#Create ensemble embedding: column `label` is the mean of all joined
#columns that carry that ensemble label
ensemble_embeddings = np.zeros((joined_data.shape[0], L))
for label in range(L):
    indices = np.where(labels == label)[0]
    average_values = np.mean(joined_data[:, indices], axis = 1)
    ensemble_embeddings[:, label] = average_values


#Record the ensemble embeddings, indexed like the last encoding file read
print("DeepProfile ensemble embedding ", ensemble_embeddings.shape)
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L))
output_path = output_folder + cancer_type + '_DeepProfile_GTEX_Healthy_Tissue_Embedding_' + str(L) + 'L.tsv'
ensemble_embeddings_df.to_csv(output_path, sep = '\t')


--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Create_Gtex_Rnaseq_PCs.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating PCs for expression matrices
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import PCA
 9 | import sklearn.preprocessing
10 | import statsmodels.api as sm
11 | from sklearn.preprocessing import scale
12 | 
def createData(cancer_type):
    """Project GTEX expression onto 1000 PCs fitted on the training data.

    Reads the batch-corrected training expression matrix and the preprocessed
    GTEX expression matrix for ``cancer_type``, fits a 1000-component PCA on
    the training matrix, aligns the GTEX columns to the training columns,
    transforms the GTEX rows and writes the result to
    ``HEALTHY_TISSUE_FILES/GTEX_<cancer_type>_DATA_1K_PCs.tsv``.

    Args:
        cancer_type: name of the sub-folder of ``../ALL_CANCER_FILES/`` and
            prefix of the data files.

    Returns:
        None. The encoded data is written to disk.
    """
    input_folder ='../ALL_CANCER_FILES/' + cancer_type + '/'

    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training expression dataframe ", data_df.shape)

    # Missing training values are replaced before fitting.
    training_data = np.nan_to_num(data_df.values)

    #Train PCA models
    pca = PCA(n_components = 1000)
    pca.fit(training_data)
    components = pca.components_
    print("PCA Components ", components.shape)

    #Read GTEX expression dataframe
    test_df = pd.read_table(input_folder + 'HEALTHY_TISSUE_FILES/' + 'GTEX_' + cancer_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t', index_col=0)
    print("Gtex expression dataframe ", test_df.shape)

    #Get genes available in training dataset
    # Keep exactly the training columns, in training order; columns missing
    # from GTEX become NaN. reindex avoids concatenating the whole training
    # matrix just to align columns, and stays correct for an empty GTEX
    # frame, where the former iloc[-0:] slice returned every row.
    test_df = test_df.reindex(columns=data_df.columns)

    print("Gtex expression dataframe ", test_df.shape)

    #Encode test data using trained PCA model
    # Fill gaps with the column mean; columns that are entirely missing fall
    # back to 0.
    test_df = test_df.fillna(test_df.mean().fillna(0))
    test_data = test_df.values

    #Save the encoded data
    encoded_data = pca.transform(test_data)
    encoded_df = pd.DataFrame(encoded_data, index = test_df.index)
    print("GTEX PCA data ", encoded_df.shape)
    # head must be called; printing the bound method was a slip.
    print("GTEX PCA data ", encoded_df.head())
    encoded_df.to_csv(input_folder + '/HEALTHY_TISSUE_FILES/GTEX_' + cancer_type + '_DATA_1K_PCs.tsv', sep = '\t')
52 | 
53 | import sys
54 | 
# Script entry: the cancer type is the first command-line argument.
cancer_type = sys.argv[1]
createData(cancer_type)
57 | 


--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Encode_GTEX_Data_with_VAE.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for encoding GTEX expression using VAE models
 3 | ###############################
 4 | 
 5 | import os
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | import math 
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 | 
13 | import tensorflow as tf
14 | from keras.layers import Input, Dense, Lambda, Layer, Activation
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.models import Model
17 | from keras import backend as K
18 | from keras import metrics, optimizers
19 | from keras.callbacks import Callback
20 | import keras
21 | 
22 | import csv
23 | import sys
24 | from keras.models import model_from_json
25 | from sklearn import preprocessing
26 | 
27 | #Prevent tensorflow from using all the memory
28 | config = tf.ConfigProto()
29 | config.gpu_options.allow_growth=True
30 | sess = tf.Session(config=config)
31 | 
32 | #Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Total VAE loss: scaled reconstruction error plus beta-weighted KL term.

    Reads the module-level names ``original_dim``, ``z_mean``, ``z_log_var``
    and ``beta``, which must exist before this function is called.

    Args:
        x_input: tensor of original inputs.
        x_decoded: tensor of reconstructions.

    Returns:
        Scalar tensor: mean over the batch of
        ``original_dim * mse + beta * KL``.
    """
    reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    # Use the backend variable itself: K.get_value(beta) would freeze beta at
    # the value it had while the loss graph was being built.
    return K.mean(reconstruction_loss + (beta * kl_loss))
37 | 
38 | #Read user inputs
39 | import sys
# Command-line arguments: 1 = cancer type, 2 = latent dimension of the VAE
# models to load, 3 = first fold (inclusive), 4 = last fold (exclusive).
cancer = sys.argv[1]
dimension = int(sys.argv[2])
start = int(sys.argv[3])
end = int(sys.argv[4])

print("CANCER NAME: " + cancer)
data_folder = '../ALL_CANCER_FILES/' + cancer + '/'

#Read GTEX data (the '_DATA_1K_PCs' file; first column is the row index)
input_df_test = pd.read_table(data_folder + 'HEALTHY_TISSUE_FILES/GTEX_' + cancer + '_DATA_1K_PCs.tsv', index_col = 0)    
print("GTEX expression dataframe ", input_df_test.shape)

#Define placeholder VAE model
# None of this depends on the fold, so it is built once instead of once per
# fold; rebuilding it in the loop only added new layers to the backend graph
# on every iteration. The encoding below uses the encoder loaded from disk.
# The placeholder tensors only provide the module-level names that vae_loss
# refers to.
original_dim = input_df_test.shape[1]
intermediate1_dim = 100
intermediate2_dim = 25
latent_dim = dimension

batch_size = 50
epochs = 50
learning_rate = 0.0005
beta = K.variable(1)
kappa = 0
init_mode = 'glorot_uniform'

x = Input(shape=(original_dim, ))

net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation('relu')(net5)

z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)

adam = optimizers.Adam(lr=learning_rate)

#Encode expression data with each VAE model
for fold in range(start, end):
    print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load VAE models
    # Architecture (.json) and weights (.h5) share one file-name stem.
    encoder_stem = data_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)

    # The context manager closes the file even if reading fails.
    with open(encoder_stem + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(encoder_stem + '.h5')
    print("Loaded model from disk")

    # Encode test data using the VAE model
    test_encoded = encoder.predict(input_df_test, batch_size = batch_size)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    test_encoded_df.to_csv(data_folder + 'HEALTHY_TISSUE_FILES/' + 'GTEX_' + cancer + '_RNASeq_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
98 | 
99 | 


--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Example_Run_All.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Example for generating healthy tissue embeddings for a cancer type
 3 | ###############################
 4 | 
 5 | #Preprocess data
 6 | get_ipython().magic(u"run -i Preprocess_Gtex_Rnaseq_Expressions.py BRCA")
 7 | get_ipython().magic(u"run -i Create_Gtex_Rnaseq_PCs.py BRCA")
 8 | 
 9 | #Create DeepProfile embeddings
10 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 5 0 100")
11 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 10 0 100")
12 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 25 0 100")
13 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 50 0 100")
14 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 75 0 100")
15 | get_ipython().magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA 100 0 100")
16 | 
17 | get_ipython().magic(u"run -i Create_DeepProfile_GTEX_Embeddings.py BRCA")
18 | 
19 | #Train healthy tissue classifiers
20 | get_ipython().magic(u"run -i Normal_Tissue_Classifier.py BRCA")


--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Normal_Tissue_Classifier.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for training classifiers for separating healthy and cancer tissue embeddings
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import seaborn as sb
 7 | import numpy as np
 8 | import pickle
 9 | import random
10 | from tqdm import *
11 | from sklearn.linear_model import LogisticRegression
12 | from sklearn.model_selection import GridSearchCV
13 | from sklearn.model_selection import train_test_split
14 | from sklearn.utils import resample
15 | from sklearn.preprocessing import StandardScaler
16 | 
17 | 
def trainClassifier(cancer_type, n_bootstraps = 500):
    """Train bootstrapped L2 logistic regressions separating cancer and GTEX embeddings.

    Reads the 150L DeepProfile training embedding of ``cancer_type`` and the
    matching GTEX healthy tissue embedding, stacks them row-wise, and fits one
    liblinear L2 logistic regression per bootstrap resample. The list of
    ``clf.coef_`` arrays (one per resample) is pickled to
    ``HEALTHY_TISSUE_FILES/bootstrap_<cancer_type>_weights.p``.

    Parameters
    ----------
    cancer_type : str
        Name of the cancer folder under ``../ALL_CANCER_FILES/``.
    n_bootstraps : int, optional
        Number of bootstrap resamples / models to train (default 500).

    Returns
    -------
    None
    """

    input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/HEALTHY_TISSUE_FILES/'

    #Read cancer embedding
    cancer_data = pd.read_csv(input_folder + cancer_type + '_DeepProfile_Training_Embedding_150L.tsv',sep='\t',index_col=0)
    print("Cancer embedding ", cancer_data.shape)

    #Read GTEX embedding
    healthy_data = pd.read_csv(input_folder + 'HEALTHY_TISSUE_FILES/' + cancer_type + '_DeepProfile_GTEX_Healthy_Tissue_Embedding_150L.tsv',sep='\t',index_col=0)
    print("GTEX embedding ", healthy_data.shape)

    #Combine datasets (cancer rows first, GTEX rows after)
    FULL_FRAME = pd.concat([cancer_data, healthy_data],axis=0)

    #Define the class labels.
    #NOTE: despite the variable name, the label is True for the leading cancer rows
    #and False for the GTEX (healthy) rows, so positive weights point towards cancer.
    healthy_label = [x < cancer_data.shape[0] for x in range(FULL_FRAME.shape[0])]

    #Train n_bootstraps L2 models with bootstrapping; the seed depends on the
    #iteration so that every resample is different but reproducible
    bootstrap_weights = []
    for i in tqdm(range(n_bootstraps)):
        X_re,y_re = resample(FULL_FRAME,healthy_label, random_state = 1234 * i)
        clf = LogisticRegression(penalty = 'l2', solver = 'liblinear')
        clf.fit(X_re,y_re)

        bootstrap_weights.append(clf.coef_)

    #Save the results; the context manager guarantees the file is flushed and closed
    with open(output_folder + 'bootstrap_' + cancer_type + '_weights.p','wb') as weights_file:
        pickle.dump(bootstrap_weights, weights_file)
48 | 
49 | import sys
50 | 
#Script entry point: the cancer type is the first command-line argument
cancer_type = sys.argv[1]
trainClassifier(cancer_type)
53 | 


--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Preprocess_Gtex_Rnaseq_Expressions.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating expression matrices for GTEX healthy samples
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import PCA
 9 | import sklearn.preprocessing
10 | import statsmodels.api as sm
11 | from sklearn.preprocessing import scale
12 | 
#Read the GTEX expression file covering all samples; the first column becomes the index
#and the remaining column labels are collected in all_samples
#NOTE(review): raw GCT files usually start with two metadata lines that are not skipped here -
#presumably the file was trimmed beforehand, confirm
MAIN_df = pd.read_table('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', sep = '\t', index_col=0)
print("Gtex expression dataframe ", MAIN_df.shape)
all_samples = np.asarray(MAIN_df.columns)
17 | 
18 | #Method for creating tissue-specific GTEX 
def save_tissue_expression(cancer):
    """Create and save the preprocessed GTEX expression matrix for one cancer type.

    Uses the module-level ``MAIN_df`` (full GTEX table) and ``all_samples``.
    The samples listed in ``GTEX_<cancer>_SAMPLES.txt`` that are present in the
    GTEX table are selected, the matrix is transposed to samples x genes
    (columns named by the ``Description`` column), missing values are mean
    imputed, values are natural-log transformed (log(0) is set to 0) and each
    gene is standardised. The result is written to
    ``GTEX_<cancer>_PREPROCESSED_RNASEQ_EXPRESSION.tsv`` in the cancer's
    ``HEALTHY_TISSUE_FILES`` folder.

    Parameters
    ----------
    cancer : str
        Name of the cancer folder under ``../ALL_CANCER_FILES/``.

    Returns
    -------
    None
    """

    data_folder = '../ALL_CANCER_FILES/' + cancer + '/HEALTHY_TISSUE_FILES/'

    #Read sample names of tissue-specific samples (one name per line).
    #The default tab separator is used because current pandas rejects sep='\n'.
    #NOTE(review): the first line of the file is consumed as the header row -
    #confirm the sample files start with a header line
    index_df = pd.read_table(data_folder + 'GTEX_' + cancer + '_SAMPLES.txt', index_col=0)
    cancer_specific_samples = np.asarray(index_df.index)
    print("Samples ", cancer_specific_samples)

    #Find list of matching samples
    matching_samples = np.intersect1d(cancer_specific_samples, all_samples)
    print("MATCHING SAMPLES COUNT ", len(matching_samples))

    #Get the expression for these patients as a samples x genes frame
    cancer_df = MAIN_df[matching_samples]
    gene_names = MAIN_df['Description'].values
    cancer_df = pd.DataFrame(cancer_df.values.T, index = cancer_df.columns, columns = gene_names)
    print("Samples ", cancer_df.shape)
    print('Range ', (np.max(cancer_df.values) - np.min(cancer_df.values) ))

    #Mean impute the missing values (genes that are entirely missing become 0)
    cancer_df = cancer_df.fillna(cancer_df.mean().fillna(0))

    #Log scale the data and make 0-mean univariate.
    #log(0) yields -inf by design, so the divide warning is silenced and the
    #-inf entries are replaced by 0 (np.NINF no longer exists in NumPy 2.0)
    with np.errstate(divide = 'ignore'):
        scaled_expression_values = np.log(cancer_df.values)
    scaled_expression_values[np.isneginf(scaled_expression_values)] = 0
    normalized_data = sklearn.preprocessing.scale(scaled_expression_values)
    gene_means = np.mean(normalized_data, axis = 0)
    gene_stds = np.std(normalized_data, axis = 0)
    print("Mean values ", gene_means)
    print("Mean values ", len(gene_means))
    print("Std values ", gene_stds)
    print("Std values ", len(gene_stds))

    #Save the final expression matrix
    cancer_df = pd.DataFrame(normalized_data, index = cancer_df.index, columns = cancer_df.columns)
    print("Final dataframe ", cancer_df.shape)
    print("Final dataframe ", cancer_df.head())
    print('Final dataframe range: ', (np.max(cancer_df.values) - np.min(cancer_df.values) ))

    cancer_df.to_csv(data_folder + 'GTEX_' + cancer + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
58 | 
59 | import sys
60 | 
#Script entry point: the cancer type is the first command-line argument
cancer_type = sys.argv[1]
save_tissue_expression(cancer_type)


--------------------------------------------------------------------------------
/PATHWAY_ANALYSIS/Create_Pathway_Matrices.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating pathway matrices for cancer type
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | import sys
 9 | 
#Read cancer name and pathway collection code (e.g. 'C2', 'H') from the command line
cancer_type = sys.argv[1]
pathway_name = sys.argv[2]

#Folder holding the cancer's input data and the PATHWAY_FILES subfolder the results are written to;
#both are read as globals by create_pathway_matrix
input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PATHWAY_FILES/'
16 | 
def create_pathway_matrix(cancer_type, pathway_file):
    """Build and save the binary gene x pathway membership matrix for a cancer type.

    The genes are the columns of the cancer's training data; the pathways are
    the lines of the selected MSigDB v6.2 GMT file (first field: pathway name,
    fields from the third on: gene symbols). Entry ``[g, p]`` is 1 when gene
    ``g`` is listed in pathway ``p``. The matrix is written to
    ``PATHWAY_<pathway_file>_MATRIX_INTERSECTION_GENES.tsv`` and the gene
    symbols to ``Gene_Symbols.txt``, both in the module-level ``output_folder``.
    The training data is read from the module-level ``input_folder``.

    Parameters
    ----------
    cancer_type : str
        Cancer name used to build the training data file name.
    pathway_file : str
        Pathway collection code: one of 'C2', 'H', 'C4_CGN', 'C4_CM',
        'C5_BP', 'C5_CC', 'C5_MF', 'C6', 'C7'.

    Raises
    ------
    ValueError
        If ``pathway_file`` is not a supported collection code.
    """

    #Supported pathway collections and their GMT files
    gmt_files = {
        'C2': 'c2.v6.2.symbols.gmt',
        'H': 'h.all.v6.2.symbols.gmt',
        'C4_CGN': 'c4.cgn.v6.2.symbols.gmt',
        'C4_CM': 'c4.cm.v6.2.symbols.gmt',
        'C5_BP': 'c5.bp.v6.2.symbols.gmt',
        'C5_CC': 'c5.cc.v6.2.symbols.gmt',
        'C5_MF': 'c5.mf.v6.2.symbols.gmt',
        'C6': 'c6.all.v6.2.symbols.gmt',
        'C7': 'c7.all.v6.2.symbols.gmt',
    }
    #Fail early with a clear message instead of an unbound 'filename' later on
    if pathway_file not in gmt_files:
        raise ValueError("Unknown pathway collection '" + str(pathway_file) + "'; expected one of " + ", ".join(sorted(gmt_files)))
    filename = 'MSIGDB_PATHWAYS/' + gmt_files[pathway_file]

    #1) Read input data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', index_col=0)
    print("Input data ", data_df.shape)
    gene_names = data_df.columns

    #Row of the first occurrence of every gene symbol, for constant-time lookups
    gene_to_row = {}
    for row, gene in enumerate(gene_names):
        gene_to_row.setdefault(gene, row)

    #2) Read pathway data (one pathway per line)
    with open(filename) as f:
        content = [line.strip() for line in f]

    pathway_count = len(content)
    print("Pathway count " + str(pathway_count))

    #Builtin int replaces the np.int alias that was removed from NumPy
    pathway = np.zeros((len(gene_names), pathway_count), dtype = int)
    pathway_names = []
    pathway_lens = []

    for col, line in enumerate(content):
        fields = line.split("\t")
        pathway_names.append(fields[0])

        #The second field is skipped; the gene symbols start at the third field
        genes = fields[2:]
        pathway_lens.append(len(genes))

        #Mark every pathway gene that is present in the training data
        for gene in genes:
            row = gene_to_row.get(gene)
            if row is not None:
                pathway[row, col] = 1

    #3) Save matrix
    new_df = pd.DataFrame(pathway, index = gene_names, columns = pathway_names)
    print("Pathway matrix ", new_df.shape)
    print("Average pathway length ", np.mean(pathway_lens))
    print("Average pathway length ", pathway_lens)
    new_df.to_csv(output_folder + 'PATHWAY_' + pathway_file + '_MATRIX_INTERSECTION_GENES.tsv', sep='\t', quoting = csv.QUOTE_NONE)

    #Also record gene symbols, one per line
    with open(output_folder + 'Gene_Symbols.txt', 'w') as f:
        f.writelines("%s\n" % item for item in gene_names)
83 |     
84 | 
85 | create_pathway_matrix(cancer_type, pathway_name)


--------------------------------------------------------------------------------
/PATHWAY_ANALYSIS/Fishers_Test.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Script for running fisher's test for pathway enrichment analysis
  3 | ###############################
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import csv
  8 | import statsmodels.api as sm
  9 | import sys
 10 | 
 11 | #Read user inputs
 12 | cancer_type = sys.argv[1]
 13 | pathway_type = sys.argv[2]
 14 | method = sys.argv[3]
 15 | start = int(sys.argv[4])
 16 | end = int(sys.argv[5])
 17 | if len(sys.argv) > 6:
 18 |     dimension = int(sys.argv[6])
 19 |     L = dimension
 20 | else:
 21 |     L = 150
 22 | 
 23 | input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
 24 | output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PATHWAY_FILES/'
 25 | 
 26 | pathway_matrix = pd.read_table(input_folder + 'PATHWAY_FILES/PATHWAY_' + pathway_type + '_MATRIX_INTERSECTION_GENES.tsv', index_col = 0)
 27 | print(pathway_matrix.shape)
 28 | pathway_df = pathway_matrix
 29 | pathway_matrix = pathway_matrix.values
 30 | 
 31 | if pathway_type == 'C2':
 32 |     N = 51 #average number of pathways
 33 | if pathway_type == 'C4_CM':
 34 |     N = 113 #average number of pathways
 35 | if pathway_type == 'C4_CGN':
 36 |     N = 99 #average number of pathways
 37 | if pathway_type == 'C6':
 38 |     N = 166 #average number of pathways
 39 | if pathway_type == 'C5_BP':
 40 |     N = 114 #average number of pathways
 41 | if pathway_type == 'C5_CC':
 42 |     N = 151 #average number of pathways
 43 | if pathway_type == 'C5_MF':
 44 |     N = 106 #average number of pathways
 45 | if pathway_type == 'H':
 46 |     N = 146 #average number of pathways
 47 | 
 48 | #Run test for each run
 49 | for run in range(start, end):
 50 |     if method == 'PCA':
 51 |         data_df = pd.read_table(input_folder + 'PCA_FILES/' + cancer_type + '_DATA_TOP2_JOINED_PCA_COMPONENTS_150L.tsv', index_col = 0)
 52 |         print(data_df.shape)
 53 | 
 54 |         ensemble_weights = np.abs(data_df.values.T)
 55 |         print(ensemble_weights.shape)
 56 |         
 57 |     if method == 'ICA':
 58 |         data_df = pd.read_table(input_folder + 'ICA_FILES/' + cancer_type + '_DATA_TOP2_JOINED_ICA_COMPONENTS_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
 59 |         print(data_df.shape)
 60 | 
 61 |         ensemble_weights = np.abs(data_df.values.T)
 62 |         print(ensemble_weights.shape)
 63 | 
 64 |     if method == 'RP':
 65 |         data_df = pd.read_table(input_folder + 'RP_FILES/' + cancer_type + '_DATA_TOP2_JOINED_RP_COMPONENTS_fold' + str(run + 1) + '.tsv', index_col = 0)
 66 |         print(data_df.shape)
 67 | 
 68 |         ensemble_weights = np.abs(data_df.values.T)
 69 |         print(ensemble_weights.shape)
 70 |         
 71 |     if method == 'AE':
 72 |         data_df = pd.read_table(input_folder + 'AE_FILES/' + cancer_type + '_DATA_AE_Weights_TRAINING_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
 73 |         print(data_df.shape)
 74 | 
 75 |         ensemble_weights = data_df.values.T
 76 |         print(ensemble_weights.shape)
 77 |         
 78 |     if method == 'DAE':
 79 |         data_df = pd.read_table(input_folder + 'DAE_FILES/' + cancer_type + '_DATA_DAE_Weights_TRAINING_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
 80 |         print(data_df.shape)
 81 | 
 82 |         ensemble_weights = data_df.values.T
 83 |         print(ensemble_weights.shape)
 84 |     
 85 |     if method == 'DeepProfile':
 86 |         data_df = pd.read_table(input_folder + cancer_type + '_DeepProfile_Ensemble_Gene_Importance_Weights_150L.tsv', index_col = 0)
 87 |         print(data_df.shape)
 88 | 
 89 |         ensemble_weights = data_df.values
 90 |         print(ensemble_weights.shape)
 91 |         
 92 |     if method == 'VAE':
 93 |         data_df = pd.read_table(input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(run) + '.tsv', index_col = 0)
 94 |         print(data_df.shape)
 95 | 
 96 |         ensemble_weights = data_df.values.T
 97 |         print(ensemble_weights.shape)
 98 |         
 99 | 
100 |     #Apply fisher test
101 |     p_vals = np.zeros((ensemble_weights.shape[0], pathway_matrix.shape[1]))
102 | 
103 |     print("Running for top ", N, " genes")
104 |     
105 |     for i in range(p_vals.shape[0]):
106 |         print(i)
107 |         for j in range(p_vals.shape[1]):
108 | 
109 |             #Create contingency matrix
110 |             matrix = np.zeros((2, 2))
111 | 
112 |             pathway_indices = np.where(pathway_matrix[:, j] == 1)[0]
113 |             #print(pathway_df.index[pathway_indices])
114 | 
115 |             gene_indices = ensemble_weights[i, :].argsort()[-N:][::-1]
116 |             #print(len(gene_indices))
117 |             #print(pathway_df.index[gene_indices])
118 | 
119 |             in_pathway_firstN = len(np.intersect1d(pathway_indices ,gene_indices))
120 |             #print(pathway_df.index[np.intersect1d(pathway_indices ,gene_indices)])
121 | 
122 |             out_pathway_firstN = N - in_pathway_firstN
123 |             #print(out_pathway_firstN)
124 | 
125 |             in_pathway_other = len(pathway_indices) - in_pathway_firstN
126 |             #print(in_pathway_other)
127 | 
128 |             out_pathway_other = pathway_matrix.shape[0] - in_pathway_other
129 |             #print(out_pathway_other)
130 | 
131 |             matrix[0, 0] = in_pathway_firstN
132 |             matrix[0, 1] = in_pathway_other
133 |             matrix[1, 0] = out_pathway_firstN
134 |             matrix[1, 1] = out_pathway_other
135 | 
136 |             import scipy.stats as stats
137 |             oddsratio, pvalue = stats.fisher_exact(matrix)
138 | 
139 |             p_vals[i, j] = pvalue
140 | 
141 | 
142 |     #Record uncorrected p-values
143 |     if method == 'VAE':
144 |         p_vals_df = pd.DataFrame(p_vals, index = np.arange(L) + 1, columns = pathway_df.columns)
145 |         p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_UNCORRECTED_PVALS_' + pathway_type + '_' + method + '_' + str(dimension) + 'L_' + str(run + 1) + '.tsv', sep = '\t')
146 |     else:
147 |         p_vals_df = pd.DataFrame(p_vals, index = np.arange(L) + 1, columns = pathway_df.columns)
148 |         p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_UNCORRECTED_PVALS_' + pathway_type + '_' + method + '_' + str(run + 1) + '.tsv', sep = '\t')
149 | 
150 |     new_p_values = np.zeros(((p_vals.shape[0], p_vals.shape[1])))
151 | 
152 |     #Record corrected p-values
153 |     for i in range(pathway_matrix.shape[1]):
154 |         corrected_pval = sm.stats.multipletests( p_vals[:, i], alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)[1]
155 |         new_p_values[:, i]  = corrected_pval                
156 | 
157 |     x = np.where([new_p_values < 0.05])[2]
158 |     unique_count = len(np.unique(x))
159 |     print("UNIQUE PATHWAY COUNT: " + str(unique_count))
160 | 
161 |     p_vals_df = pd.DataFrame(new_p_values, index = np.arange(L) + 1, columns = pathway_df.columns)
162 |     #print(p_vals_df)
163 |     if method == 'VAE':
164 |         p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_FDR_CORRECTED_PVALS_' + pathway_type + '_' + method + '_' + str(dimension) + 'L_' + str(run + 1) + '.tsv', sep = '\t')
165 |     else:
166 |         p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_FDR_CORRECTED_PVALS_' + pathway_type + '_' + method + '_' + str(run + 1) + '.tsv', sep = '\t')
167 | 
168 |     x = np.where([p_vals_df.values < 0.05])[2]
169 |     unique_count = len((x))
170 |     print("AVERAGE PATHWAY COUNT: ", unique_count / 150)
171 |     
172 | 


--------------------------------------------------------------------------------
/PATHWAY_ANALYSIS/Run_Multiple_Fishers_Test.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for running multiple FETs
 3 | ###############################
 4 | 
 5 | import sys
 6 | 
 7 | cancer_type = sys.argv[1]
 8 | pathway = sys.argv[2]
 9 | 
10 | get_ipython().magic(u"run -i 'Fishers_Test.py' '" +  cancer_type + "' " + pathway + " " + "DeepProfile" + " " + str(0) + " " + str(1))
11 | get_ipython().magic(u"run -i 'Fishers_Test.py' '" +  cancer_type + "' " + pathway + " " + "PCA" + " " + str(0) + " " + str(1))
12 | get_ipython().magic(u"run -i 'Fishers_Test.py' '" +  cancer_type + "' " + pathway + " " + "ICA" + " " + str(0) + " " + str(10))
13 | get_ipython().magic(u"run -i 'Fishers_Test.py' '" +  cancer_type + "' " + pathway + " " + "RP" + " " + str(0) + " " + str(10))
14 | get_ipython().magic(u"run -i 'Fishers_Test.py' '" +  cancer_type + "' " + pathway + " " + "AE" + " " + str(-1) + " " + str(9))
15 | get_ipython().magic(u"run -i 'Fishers_Test.py' '" +  cancer_type + "' " + pathway + " " + "DAE" + " " + str(-1) + " " + str(9))
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # DeepProfile
  2 | 
  3 | Repository with scripts for all model training and analysis for paper "A deep profile of gene expression across 18 human cancers"
  4 | 
  5 | All fully pre-processed input data for training the models can be found on our Figshare Data repository. For each cancer, the basic data we used is **'CANCER_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'** where CANCER is the name of the cancer type. This data is GEO datasets collected from top 2 platforms, intersecting genes taken, and batch correction applied.
  6 | 
  7 | The folder **MODEL_TRAININGS** includes all scripts and notebooks for training VAE models and obtaining attributions.
  8 | 
  9 | The script **Example_Run_All.py** includes all commands for training DeepProfile model for one cancer type. 
 10 | 
 11 | <font color=blue>**STEP 1: Creating PCs for each data**</font>
 12 | 
 13 | Create a folder **./ALL_CANCER_FILES/CANCER/** then download the data and save in that folder.
 14 | **Create_PCs_for_DeepLearning_Models.py** takes a cancer type and component_count as input and applies PCA on the training data to train deep learning models.
 15 | 
 16 | <font color=blue>**STEP 2: Training VAE models**</font>
 17 | 
 18 | **VAE_3Layers_Model.py** is the Keras implementation of VAE model.
 19 | **Run_VAE_Models.py** takes the cancer type, number of hidden nodes, and start-end folds to train VAE models for the given cancer type.
 20 | 
 21 | <font color=blue>**STEP 3: Running IG for VAE models**</font>
 22 | 
 23 | **IntegratedGradients.py** is the Keras implementation for Integrated Gradients feature attribution method.
 24 | **Get_VAE_IG_Attributions.py** is the script for running IG and get gene-level explanations for each of the nodes. It takes the cancer type, number of hidden nodes, and start-end folds to get explanations for the VAE models for the given cancer type.
 25 | 
 26 | <font color=blue>**STEP 4: Learning ensemble labels**</font>
 27 | 
 28 | **Create_Ensemble_Labels.py** is the script for running k-means clustering to learn ensemble weights. It takes the cancer type and number of final latent nodes as the input and saves the ensemble labels. 
 29 | **Select_Latent_Dimension_with_Gmeans** is the notebook for running g-means clustering to select  the ensemble latent dimension size.
 30 | 
 31 | <font color=blue>**STEP 5: Creating DeepProfile ensemble training embedding**</font>
 32 | 
 33 | **Create_DeepProfile_Training_Embeddings.py** is the script for joining all the training data VAE embeddings and ensembling them using the learned ensemble labels. It takes the cancer type as the input and creates training DeepProfile ensemble embedding. 
 34 | 
 35 | <font color=blue>**STEP 6: Creating DeepProfile ensemble gene attribution matrices**</font>
 36 | 
 37 | **Create_DeepProfile_Ensemble_Weights.py** is the script for joining all the VAE gene attributions and ensembling them using the learned ensemble labels. It takes the cancer type as the input and creates DeepProfile gene attribution matrix. 
 38 | 
 39 | 
 40 | 
 41 | ### <font color=red>  PART 2: TRAINING COMPETITOR MODELS </font>
 42 | 
 43 | The script **Example_Run_All.py** includes all commands for training competitor models for one cancer type. 
 44 | 
 45 | In **COMPETITOR_TRAININGS**, all the scripts for comparing DeepProfile to other methods are included.
 46 | 
 47 | <font color=blue>**STEP 1: Training PCA Models**</font>
 48 | 
 49 | **Create_PCA_Data.py** takes a cancer type and creates PCA components for the training data.
 50 | 
 51 | <font color=blue>**STEP 2: Training ICA Models**</font>
 52 | 
 53 | **Create_ICA_Data.py** takes a cancer type and creates ICA components for the training data, repeating 10 times.
 54 | 
 55 | <font color=blue>**STEP 3: Training RP Models**</font>
 56 | 
 57 | **Create_RP_Data.py** takes a cancer type and creates RP components for the training data, repeating 10 times.
 58 | 
 59 | <font color=blue>**STEP 4: Training AE Models**</font>
 60 | 
 61 | **AE_2Layers_Model.py** is the Keras implementation of AE model.
 62 | **Train_AE_Models.py** takes a cancer type as input and trains 10 AE models with different random seeds.
 63 | **Get_AE_IG_Attributions.py** is the script for running IG and get gene-level explanations for each of the nodes. It takes the cancer type and fold to get explanations for the AE models for the given cancer type.
 64 | 
 65 | <font color=blue>**STEP 5: Training DAE Models**</font>
 66 | 
 67 | **DAE_2Layers_Model.py** is the Keras implementation of DAE model.
 68 | **Train_DAE_Models.py** takes a cancer type as input and trains 10 DAE models with different random seeds.
 69 | **Get_DAE_IG_Attributions.py** is the script for running IG and get gene-level explanations for each of the nodes. It takes the cancer type and fold to get explanations for the DAE models for the given cancer type.
 70 | 
 71 | 
 72 | 
 73 | ### <font color=red>  PART 3: TCGA SURVIVAL PREDICTIONS </font>
 74 | 
 75 | In **TCGA_SURVIVAL_PREDICTION** folder, all files and scripts are included for predicting TCGA expression survival.
 76 | 
 77 | In folder **TCGA_DATA**, **TCGA_CLINICAL_DATA** folder includes clinical data for TCGA samples. **TCGA_MICROARRAY** folder includes microarray expression data and **TCGA_RNASEQ** folder includes RNA-Seq expression data.
 78 | 
 79 | <font color=blue>**STEP 1: Preprocessing Data**</font>
 80 | 
 81 | **CREATE_EMBEDDINGS** folder includes all scripts to generate TCGA RNA-Seq embeddings.
 82 | 
 83 | The script **Example_Run_All.py** includes all commands for generating TCGA expression embeddings for one cancer type. 
 84 | 
 85 | **Preprocess_TCGA_Rnaseq_Expression.py** script takes the cancer type and TCGA cancer type as input and preprocesses the expression data to train models.
 86 | 
 87 | **Create_TCGA_Rnaseq_PCs.py** script again takes the cancer type and TCGA cancer type as input and applies PCA to preprocessed expression to record top PCs to train deep learning models.
 88 | 
 89 | <font color=blue>**STEP 2: Encoding Expression with DeepProfile**</font>
 90 | 
 91 | **Encode_TCGA_Data_with_VAE.py** takes the preprocessed PCAed expression and encodes it using the already trained VAE models. The script takes cancer type, TCGA type, VAE dimension, start and end runs to encode the expression.
 92 | 
 93 | **Create_All_VAE_Embeddings.py** takes cancer type and TCGA type as input and encodes TCGA expression with all trained VAE models.
 94 | 
 95 | **Create_DeepProfile_TCGA_Embeddings.py** takes the cancer type and TCGA type as input and generates the DeepProfile embedding. The script loads in all the VAE embeddings and ensemble labels to generate an ensemble DeepProfile embedding.
 96 | 
 97 | <font color=blue>**STEP 3: Encoding Expression with Competitor Models**</font>
 98 | 
 99 | **Encode_TCGA_Data_with_PCA.py** takes the cancer type and TCGA type as input and generates PCA embedding for TCGA RNA-Seq expressions.
100 | 
101 | **Encode_TCGA_Data_with_ICA.py** takes the cancer type and TCGA type as input and generates ICA embedding for TCGA RNA-Seq expressions.
102 | 
103 | **Encode_TCGA_Data_with_RP.py** takes the cancer type and TCGA type as input and generates RP embedding for TCGA RNA-Seq expressions.
104 | 
105 | **Encode_TCGA_Data_with_AE.py** takes the cancer type and TCGA type as input and generates AE embedding for TCGA RNA-Seq expressions.
106 | 
107 | **Encode_TCGA_Data_with_DAE.py** takes the cancer type and TCGA type as input and generates DAE embedding for TCGA RNA-Seq expressions.
108 | 
109 | <font color=blue>**STEP 4: Generating Survival DataFrames**</font>
110 | 
111 | Folder **CREATE_SURVIVAL_DATAFRAMES** includes all scripts for generating survival data frames. 
112 | 
113 | **Create_TCGA_Survival_Dataframes.py** takes the cancer type and TCGA type as input and extracts the necessary fields from clinical data to define the survival dataframe.
114 | 
115 | **Create_Joined_Survival_Dataframes.py** takes the cancer type and TCGA type as input and combines the DeepProfile RNA-Seq embeddings with survival data frames.
116 | 
117 | **Create_Joined_Survival_Dataframes_Cancer_Types.py** combines data frames for cancer subtypes under the main cancer type. 
118 | 
119 | <font color=blue>**STEP 5: Predicting Survival**</font>
120 | 
121 | Folder **PREDICT_SURVIVAL** contains scripts for predicting survival.
122 | 
123 | **Predict_Survival.py** trains lasso regression models with subsampling taking the TCGA RNA-Seq embeddings as the input. 
124 | 
125 | **Predict_Survival_Subtypes_Joined.py** trains lasso regression models with subsampling taking the TCGA RNA-Seq embeddings as the input while joining multiple TCGA cancer types when there are multiple TCGA cancer subtypes corresponding to one major cancer type we have.
126 | 
127 | **Run_Models.py** trains all prediction models for all models and cancer types.
128 | 
129 | **Plots_of_Survival_Prediction.ipynb** and **Plots_of_Survival_Prediction_VAEs.ipynb** are notebooks for generating plots of comparing survival predictions of models. 
130 | 
131 | <font color=blue>**STEP 6: Comparing RNA-Seq and microarray DeepProfile embeddings**</font>
132 | 
133 | **COMPARING_RNASEQ_and_MICROARRAY** folder includes all scripts to generate TCGA microarray embeddings and to compare the embeddings with RNA-Seq embeddings.
134 | 
135 | **Preprocess_TCGA_Microarray_Expression.py** script takes the cancer type and TCGA cancer type as input and preprocesses the expression data to train models.
136 | 
137 | **Create_TCGA_Microarray_PCs.py** script again takes the cancer type and TCGA cancer type as input and applies PCA to preprocessed expression to record top PCs to train deep learning models.
138 | 
139 | **Encode_TCGA_Microarray_Data_with_VAE.py** takes the preprocessed PCAed expression and encodes it using the already trained VAE models. The script takes cancer type, TCGA type, VAE dimension, start and end runs to encode the expression.
140 | 
141 | **Create_DeepProfile_TCGA_Microarray_Embeddings.py** takes the cancer type and TCGA type as input and generates the DeepProfile embedding. The script loads in all the VAE embeddings and ensemble labels to generate an ensemble DeepProfile embedding.
142 | 
143 | **Rnaseq_and_Microarray_Embedding_Correlation_Plots** notebook calculates correlation between RNA-Seq and microarray embeddings and generates plots.
144 | 
145 | 
146 | 
147 | ### <font color=red>  PART 4: PATHWAY ENRICHMENT TESTS </font>
148 | 
149 | In **PATHWAY_ANALYSIS** folder, the scripts and files for pathway analysis are included.
150 | 
151 | In **MSIGDB_PATHWAYS** folder, the files for Molecular Signature Database pathways are included.
152 | 
153 | <font color=blue>**STEP 1: Running enrichment tests**</font>
154 | 
155 | **Create_Pathway_Matrices.py** is the script for creating binary pathway matrices for the genes that are present in the training datasets. It takes a cancer type and pathway type as input and creates a binary matrix of pathway overlaps.
156 | 
157 | **Fishers_Test.py** is the script for running Fisher's test. It takes the cancer type, pathway type, the method name, and the range of runs and records uncorrected and FDR-corrected p-values. 
158 | 
159 | **Run_Multiple_Fishers_Test.py** is the script for running multiple tests consecutively. It takes the cancer type and pathway name as input and carries enrichment tests for all methods. 
160 | 
161 | <font color=blue>**STEP 2: Comparing pathway coverages**</font>
162 | 
163 | **PATHWAY_COVERAGE_ANALYSIS** folder includes all scripts for comparing pathway coverage of models.
164 | 
165 | **Plot_of_Average_Pathway_Coverages** generates plots of average pathway coverage to compare DeepProfile and other dimensionality reduction methods.
166 | 
167 | **Plot_of_Pathway_Coverage_Distributions** generates plots of distribution of pathway count of each node of DeepProfile and other dimensionality reduction methods.
168 | 
169 | **Plot_of_Node_Level_Pathway_Annotations** generates plots of percent of nodes annotated by at least one pathway across multiple thresholds.
170 | 
171 | **Plot_of_Pathway_Detection_Comparison_VAEs_vs_DeepProfile** creates plots for comparing pathways captured by DeepProfile vs individual VAE models.
172 | 
173 | **Plot_of_Pathway_Percent_Comparison_VAEs_vs_DeepProfile**  creates plots for comparing pathways captured by DeepProfile vs individual VAE models based on percentages.
174 | 
175 | 
176 | 
177 | ### <font color=red>  PART 5: NORMAL TISSUE ANALYSIS </font>
178 | 
179 | In **NORMAL_TISSUE_ANALYSIS** folder, the scripts for normal tissue analysis are included.
180 | 
181 | The script **Example_Run_All.py** includes all commands for generating normal tissue expression embeddings for one cancer type. 
182 | 
183 | **Gtex_Tissue_Name_Mappings** is the notebook for mapping GTEX tissue names to cancer types we have. The GTEX expression data includes samples from many different tissues. We extract the GTEX sample names for each cancer type we have.
184 | 
185 | **Preprocess_Gtex_Rnaseq_Expressions.py** is the script for creating preprocessed GTEX gene expression. It takes the cancer type as input and preprocesses the GTEX RNA-Seq expression using the same preprocessing steps applied to our training data.
186 | 
187 | **Create_Gtex_Rnaseq_PCs.py** is the script for taking top PCs of the GTEX expression profiles to train DeepProfile model. It takes the cancer type as input and records the top PCs of GTEX expression.
188 | 
189 | **Encode_GTEX_Data_with_VAE.py** is the script for encoding GTEX expression using trained VAE models. The inputs to the model are the cancer type, the number of latent nodes, and start and end runs. 
190 | 
191 | **Create_DeepProfile_GTEX_Embeddings.py** is the script for creating DeepProfile embedding using generated VAE embeddings. It takes the cancer type as input and records the final DeepProfile embedding for GTEX normal tissue samples.
192 | 
193 | **Normal_Tissue_Classifier.py** is the script for training the classifier to separate normal vs cancer tissue embeddings. It takes the cancer type as input and records the bootstrapped classifier weights.
194 | 
195 | 
196 | 
197 | 
198 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Create_DeepProfile_TCGA_Microarray_Embeddings.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA microarray DeepProfile embeddings
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | import csv
 8 | import sys
 9 | 
#Read cancer type and TCGA cohort name from the command line
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]

#Latent sizes of the VAE models and number of runs read per size
dims  = [5, 10, 25, 50, 75, 100]
run = 100

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/' 

#Read all VAE embeddings. Every file must list the same samples in the same
#order: the matrices are concatenated column-wise below, so a differing row
#order would silently mix values that belong to different samples.
data_list = []
sample_index = None
for dim in dims:
    for i in range(run):
        embedding_file = input_folder+ 'TCGA_FILES/TCGA_' + tcga_type + '_MICROARRAY_Expression_VAE_encoded_' + str(dim) + 'L_' + str(i) + '.tsv'
        data_df = pd.read_table(embedding_file, index_col = 0)
        print("TCGA VAE embedding ", data_df.shape)
        if sample_index is None:
            sample_index = data_df.index
        elif not data_df.index.equals(sample_index):
            raise ValueError("Samples in " + embedding_file + " differ from those in the first VAE embedding file")
        data_list.append(data_df.values)

#Concatenate all embeddings     
joined_data = np.concatenate(data_list, axis=1)
print("Joined VAE embedding ",joined_data.shape)

#Read DeepProfile ensemble labels; row k of the label file is used as the
#label of column k of the joined embedding matrix
L = 150
labels_df = pd.read_table(input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None)
labels = labels_df.values
print("DeepProfile ensemble labels ", len(labels))
if len(labels) != joined_data.shape[1]:
    raise ValueError("Expected one ensemble label per VAE node: got " + str(len(labels)) + " labels for " + str(joined_data.shape[1]) + " nodes")

#Create ensemble embedding: column `label` is the per-sample mean over all
#VAE nodes assigned to that label
ensemble_embeddings = np.zeros((joined_data.shape[0], L))
for label in range(L):
    indices = np.where(labels == label)[0]
    average_values = np.mean(joined_data[:, indices], axis = 1)
    ensemble_embeddings[:, label] = average_values

#Record the ensemble embeddings
print(ensemble_embeddings.shape)
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = sample_index, columns = np.arange(L))
ensemble_embeddings_df.to_csv(output_folder + tcga_type + '_DeepProfile_TCGA_MICROARRAY_Embedding_' + str(L) + 'L.tsv', sep = '\t')


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Create_TCGA_Microarray_PCs.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for recording top PCs of TCGA microarray data
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import PCA
 9 | import sklearn.preprocessing
10 | 
#Define method for preprocessing data
def create_data(cancer_type, tcga_type, n_components = 1000):
    """Project preprocessed TCGA microarray expression onto training-data PCs.

    Fits a PCA on the training expression matrix of `cancer_type`, applies it
    to the preprocessed TCGA microarray matrix of `tcga_type` and writes the
    projection to
    TCGA_FILES/TCGA_MICROARRAY_<tcga_type>_PCA_<n_components>L.tsv.

    cancer_type  -- name of the cancer folder under ../../ALL_CANCER_FILES/
    tcga_type    -- TCGA cohort name used in the input and output file names
    n_components -- number of principal components to keep. The default is
                    1000 because Encode_TCGA_Microarray_Data_with_VAE.py
                    reads the '_PCA_1000L.tsv' file written here.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
    
    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Apply PCA to training data (NaNs are replaced by np.nan_to_num first)
    training_data = data_df.values
    training_data = np.nan_to_num(training_data)
    
    pca = PCA(n_components = n_components)
    pca.fit(training_data)
    components = pca.components_
    print("PCA components ", components.shape)
           
    #Read preprocessed TCGA microarray expression
    tcga_df = pd.read_table(input_folder + 'TCGA_FILES/TCGA_' + tcga_type + '_PREPROCESSED_MICROARRAY_EXPRESSION.tsv', index_col= 0)
    print("TCGA expression ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))
    
    #Encode test data using trained PCA model
    test_data = tcga_df.values
    encoded_data = pca.transform(test_data)
    print("Encoded TCGA data ", encoded_data.shape)
        
    #Record the projected data; the file name carries the number of PCs
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    encoded_df.to_csv(output_folder + 'TCGA_MICROARRAY_' + tcga_type + '_PCA_' + str(n_components) + 'L.tsv', sep = '\t')
43 | 
44 |     
import sys

#Run the pipeline only when this file is executed as a script (plain python
#or IPython's %run), so that importing the module does not read sys.argv or
#touch the file system.
if __name__ == '__main__':
    cancer_type = sys.argv[1]
    tcga_type = sys.argv[2]
    create_data(cancer_type, tcga_type)
49 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Encode_TCGA_Microarray_Data_with_VAE.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Script for encoding TCGA microarray expression using VAE models
  3 | ###############################
  4 | 
  5 | import os
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | import math 
 10 | from sklearn.metrics import mean_squared_error
 11 | import matplotlib.pyplot as plt
 12 | 
 13 | import tensorflow as tf
 14 | from keras.layers import Input, Dense, Lambda, Layer, Activation
 15 | from keras.layers.normalization import BatchNormalization
 16 | from keras.models import Model
 17 | from keras import backend as K
 18 | from keras import metrics, optimizers
 19 | from keras.callbacks import Callback
 20 | import keras
 21 | 
 22 | import csv
 23 | import sys
 24 | from keras.models import model_from_json
 25 | from sklearn import preprocessing
 26 | 
 27 | #Prevent tensorflow from using all the memory
 28 | config = tf.ConfigProto()
 29 | config.gpu_options.allow_growth=True
 30 | sess = tf.Session(config=config)
 31 | 
 32 | #Method for defining the VAE loss
 33 | def vae_loss(x_input, x_decoded):
 34 |     reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
 35 |     kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
 36 |     return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss))
 37 | 
 38 | 
 39 | #Read user inputs
 40 | import sys
 41 | cancer = sys.argv[1]
 42 | tcga_name = sys.argv[2]
 43 | dimension = int(sys.argv[3])
 44 | start = int(sys.argv[4])
 45 | end = int(sys.argv[5])
 46 | 
 47 | print("CANCER NAME: " + cancer)
 48 | print("TEST NAME: " + tcga_name)
 49 | 
 50 | input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
 51 | output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/' 
 52 | 
 53 | #Read input data
 54 | input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_MICROARRAY_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)    
 55 | print("TCGA expression dataframe ", input_df_test.shape)
 56 |     
 57 | #Read GTEX expression
 58 | for fold in range(start, end):
 59 |     print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))
 60 | 
 61 |     #Load VAE models
 62 |     json_file = open( input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.json', 'r')   
 63 |     loaded_model_json = json_file.read()
 64 |     json_file.close()
 65 |     encoder = model_from_json(loaded_model_json)
 66 | 
 67 |     encoder.load_weights(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.h5')
 68 |     print("Loaded model from disk")
 69 | 
 70 |     #Define placeholder VAE model
 71 |     original_dim = input_df_test.shape[1]
 72 |     intermediate1_dim = 100
 73 |     intermediate2_dim = 25
 74 |     latent_dim = dimension
 75 | 
 76 |     batch_size = 50
 77 |     epochs = 50
 78 |     learning_rate = 0.0005
 79 |     beta = K.variable(1)
 80 |     kappa = 0
 81 |     init_mode = 'glorot_uniform'
 82 | 
 83 |     x = Input(shape=(original_dim, ))
 84 | 
 85 |     net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
 86 |     net2 = BatchNormalization()(net)
 87 |     net3 = Activation('relu')(net2)
 88 | 
 89 |     net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
 90 |     net5 = BatchNormalization()(net4)
 91 |     net6 = Activation('relu')(net5)
 92 | 
 93 |     z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
 94 |     z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)
 95 | 
 96 |     adam = optimizers.Adam(lr=learning_rate)
 97 | 
 98 |     #Encode test data using the VAE model
 99 |     test_encoded = encoder.predict(input_df_test, batch_size = batch_size)
100 |     test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
101 |     test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_MICROARRAY_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
102 | 
103 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Preprocess_TCGA_Microarray_Expression.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for preprocessing TCGA microarray expression
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | import sklearn.preprocessing
 9 | 
#Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Preprocess the TCGA microarray expression matrix of one cohort.

    Reads ../TCGA_DATA/TCGA_MICROARRAY/<tcga_type>.medianexp.txt, keeps the
    samples whose 15-character barcode has '0' as its second-to-last
    character, restricts the genes to the columns of the training data of
    `cancer_type`, standardizes every gene and writes the result to
    TCGA_FILES/TCGA_<tcga_type>_PREPROCESSED_MICROARRAY_EXPRESSION.tsv.

    cancer_type -- name of the cancer folder under ../../ALL_CANCER_FILES/
    tcga_type   -- TCGA cohort name used in the input and output file names
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
    
    #Read TCGA microarray expression and put samples in rows
    tcga_df = pd.read_csv('../TCGA_DATA/TCGA_MICROARRAY/' + tcga_type + '.medianexp.txt', sep = '\t', index_col= 0)
    tcga_df = tcga_df.transpose()
    #Drop the first column of the transposed table before the float cast
    #(presumably a non-numeric annotation row of the source file -- TODO confirm)
    tcga_df = tcga_df.iloc[:, 1:]
    tcga_df = tcga_df.astype(float)
    
    #Fill missing values with the gene mean, or 0 when a gene has no values
    tcga_df = tcga_df.fillna(tcga_df.mean().fillna(0))
    print("TCGA expression ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))
    print("TCGA expression mean ", np.mean(tcga_df.values, axis = 0))
    print("TCGA expression mean ", len(np.mean(tcga_df.values, axis = 0)))
    print("TCGA expression std ", np.std(tcga_df.values, axis = 0))
    print("TCGA expression std ", len(np.std(tcga_df.values, axis = 0)))
    
    #Truncate sample names to their first 15 characters
    new_index = [s[:15] for s in tcga_df.index]
    tcga_df = pd.DataFrame(tcga_df.values, index = new_index, columns = tcga_df.columns)
    print(tcga_df)
    
    #Eliminate normal samples: keep samples whose second-to-last name
    #character is '0'
    print("Eliminating normal samples..")
    sample_codes = [s[-2:] for s in  tcga_df.index]
    print("Sample codes ", np.unique(sample_codes))
    normal_codes = [s[-2] for s in  tcga_df.index]
    cancer_samples = np.where(np.asarray(normal_codes) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)
    
    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Get only training genes from the expression data, in training order.
    #Reindexing the columns avoids stacking the whole training matrix on top
    #of the TCGA samples just to align genes. Genes missing from the
    #microarray become all-NaN columns and end up as 0 after the fill.
    joined_df = tcga_df.reindex(columns = data_df.columns)
    joined_df = joined_df.fillna(joined_df.mean().fillna(0))
    print("TCGA expression ", joined_df.shape)
    
    #Standardize data to make 0 mean univariate
    normalized_data = sklearn.preprocessing.scale(joined_df.values)
    print("TCGA expression mean ", np.mean(normalized_data, axis = 0))
    print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0)))
    print("TCGA expression std ", np.std(normalized_data, axis = 0))
    print("TCGA expression std ", len(np.std(normalized_data, axis = 0)))
    
    #Record joined dataframe
    joined_df = pd.DataFrame(normalized_data, index = joined_df.index, columns = joined_df.columns)
    print("Final dataframe ", joined_df.shape)
    print('RANGE: ', (np.max(joined_df.values) - np.min(joined_df.values) ))
    
    #Record expression data
    joined_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_MICROARRAY_EXPRESSION.tsv', sep = '\t')
    print(joined_df)
72 |     
import sys

#Run the preprocessing only when this file is executed as a script (plain
#python or IPython's %run), so that importing the module does not read
#sys.argv or touch the file system.
if __name__ == '__main__':
    cancer_type = sys.argv[1]
    tcga_type = sys.argv[2]
    create_data(cancer_type, tcga_type)
77 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_All_VAE_Embeddings.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
#Read cancer type and TCGA cohort name from the command line
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]

#Run the VAE encoding script once per latent dimension, with start run 0 and
#end run 100. This file must itself be executed inside IPython:
#get_ipython() is only defined there.
dims = [5, 10, 25, 50, 75, 100]
for dim in dims:
    run_args = "-i 'Encode_TCGA_Data_with_VAE.py' '" +  cancer_type + "' " + tcga_type + " " + str(dim) + " " + str(0) + " " + str(100)
    #run_line_magic replaces the deprecated get_ipython().magic("run ...")
    get_ipython().run_line_magic("run", run_args)


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_DeepProfile_TCGA_Embeddings.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA RNA-Seq DeepProfile embeddings
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | import csv
 8 | import sys
 9 | 
#Read cancer type and TCGA cohort name from the command line
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]

#Latent sizes of the VAE models and number of runs read per size
dims  = [5, 10, 25, 50, 75, 100]
run = 100

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/' 

#Read all VAE embeddings. Every file must list the same samples in the same
#order: the matrices are concatenated column-wise below, so a differing row
#order would silently mix values that belong to different samples.
data_list = []
sample_index = None
for dim in dims:
    for i in range(run):
        embedding_file = input_folder + 'TCGA_FILES/TCGA_' + tcga_type + '_RNASeq_Expression_VAE_encoded_' + str(dim) + 'L_' + str(i) + '.tsv'
        data_df = pd.read_table(embedding_file, index_col = 0)
        print("TCGA VAE embedding ", data_df.shape)
        if sample_index is None:
            sample_index = data_df.index
        elif not data_df.index.equals(sample_index):
            raise ValueError("Samples in " + embedding_file + " differ from those in the first VAE embedding file")
        data_list.append(data_df.values)

#Concatenate all embeddings     
joined_data = np.concatenate(data_list, axis=1)
print("Joined VAE embedding ",joined_data.shape)

#Read DeepProfile ensemble labels; row k of the label file is used as the
#label of column k of the joined embedding matrix
L = 150
labels_df = pd.read_table( input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None)
labels = labels_df.values
print("DeepProfile ensemble labels ", len(labels))
if len(labels) != joined_data.shape[1]:
    raise ValueError("Expected one ensemble label per VAE node: got " + str(len(labels)) + " labels for " + str(joined_data.shape[1]) + " nodes")

#Create ensemble embedding: column `label` is the per-sample mean over all
#VAE nodes assigned to that label
ensemble_embeddings = np.zeros((joined_data.shape[0], L))
for label in range(L):
    indices = np.where(labels == label)[0]
    average_values = np.mean(joined_data[:, indices], axis = 1)
    ensemble_embeddings[:, label] = average_values

#Record the ensemble embeddings
print(ensemble_embeddings.shape)
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = sample_index, columns = np.arange(L))
ensemble_embeddings_df.to_csv(output_folder + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_' + str(L) + 'L.tsv', sep = '\t')


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_TCGA_Rnaseq_PCs.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for recording top PCs of TCGA RNA-Seq data
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import PCA
 9 | import sklearn.preprocessing
10 | 
#Define method for projecting TCGA RNA-Seq data onto training PCs
def create_data(cancer_type, tcga_type):
    """Project preprocessed TCGA RNA-Seq expression onto 1000 training PCs.

    Fits a 1000-component PCA on the training expression matrix of
    `cancer_type`, applies it to the preprocessed RNA-Seq matrix of
    `tcga_type` and writes the projection to
    TCGA_FILES/TCGA_RNASEQ_<tcga_type>_PCA_1000L.tsv.
    """
    base_dir = '../../ALL_CANCER_FILES/{}/'.format(cancer_type)
    tcga_dir = base_dir + 'TCGA_FILES/'

    #Training expression matrix
    training_df = pd.read_table(base_dir + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", training_df.shape)

    #Fit PCA on the training matrix after replacing NaNs with np.nan_to_num
    pca_model = PCA(n_components = 1000)
    pca_model.fit(np.nan_to_num(training_df.values))
    print("PCA components ", pca_model.components_.shape)

    #Preprocessed TCGA RNA-Seq expression
    expression_df = pd.read_table(tcga_dir + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0)
    print("TCGA expression ", expression_df.shape)
    value_range = np.max(expression_df.values) - np.min(expression_df.values)
    print('RANGE: ', value_range)

    #Apply the fitted PCA to the TCGA samples
    projected = pca_model.transform(expression_df.values)
    print("Encoded TCGA data ", projected.shape)

    #Write the projection, keeping the TCGA sample names as index
    projected_df = pd.DataFrame(projected, index = expression_df.index)
    projected_df.to_csv(tcga_dir + 'TCGA_RNASEQ_' + tcga_type + '_PCA_1000L.tsv', sep = '\t')
43 | 
44 |     
import sys

#Run the projection only when this file is executed as a script (plain
#python or IPython's %run), so that importing the module does not read
#sys.argv or touch the file system.
if __name__ == '__main__':
    cancer_type = sys.argv[1]
    tcga_type = sys.argv[2]
    create_data(cancer_type, tcga_type)
49 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_AE.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA RNA-Seq AE embeddings
 3 | ###############################
 4 | 
 5 | import os
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | import math 
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 | from keras.models import model_from_json
13 | from sklearn import preprocessing
14 | 
15 | import tensorflow as tf
16 | from keras.layers import Input, Dense, Lambda, Layer, Activation
17 | from keras.layers.normalization import BatchNormalization
18 | from keras.models import Model
19 | from keras import backend as K
20 | from keras import metrics, optimizers
21 | from keras.callbacks import Callback
22 | import keras
23 | 
24 | import csv
25 | import sys
26 | 
#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
#Register the configured session with Keras. Nothing else in this script
#references `sess`, and without this call Keras opens its own session with a
#default config the first time a model needs one.
K.set_session(sess)
31 | 
#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the Keras mean squared error between input and reconstruction."""
    mse_value = metrics.mse(x_input, x_decoded)
    return mse_value
35 | 
#Read user inputs: cancer folder and TCGA cohort name
import sys
cancer = sys.argv[1]
tcga_name = sys.argv[2]
print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/' 

#AE runs start..end-1 with `dimension` latent nodes are used
start = 0
end = 10
dimension = 150

#Read TCGA RNA-Seq input data
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)      
print("RNA-Seq expression dataframe ", input_df_test.shape)

#Encode test data with all 10 AE models
for fold in range(start, end):
    print("AE model with " + str(dimension) + " nodes and fold " + str(fold))
    
    #Load AE models: architecture from the .json file, weights from the .h5
    #file. The `with` block closes the file even if reading fails.
    model_prefix = input_folder + 'AE_FILES/AE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)
    with open(model_prefix + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)
    
    encoder.load_weights(model_prefix + '.h5')
    print("Loaded model from disk")

    # Encode test data using the AE model
    test_encoded = encoder.predict(input_df_test)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    print("Encoded data ", test_encoded_df.shape)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_AE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
72 | 
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_DAE.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA RNA-Seq DAE embeddings
 3 | ###############################
 4 | 
 5 | import os
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | import math 
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 | from keras.models import model_from_json
13 | from sklearn import preprocessing
14 | 
15 | import tensorflow as tf
16 | from keras.layers import Input, Dense, Lambda, Layer, Activation
17 | from keras.layers.normalization import BatchNormalization
18 | from keras.models import Model
19 | from keras import backend as K
20 | from keras import metrics, optimizers
21 | from keras.callbacks import Callback
22 | import keras
23 | 
24 | import csv
25 | import sys
26 | 
#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
#Register the configured session with Keras. Nothing else in this script
#references `sess`, and without this call Keras opens its own session with a
#default config the first time a model needs one.
K.set_session(sess)
31 | 
#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Compute Keras' mean squared error of the decoded output against the input."""
    squared_error = metrics.mse(x_input, x_decoded)
    return squared_error
35 | 
#Read user inputs: cancer folder and TCGA cohort name
import sys
cancer = sys.argv[1]
tcga_name = sys.argv[2]
print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/' 

#DAE runs start..end-1 with `dimension` latent nodes are used
start = 0
end = 10
dimension = 150

#Read TCGA RNA-Seq input data
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)      
print("RNA-Seq expression dataframe ", input_df_test.shape)

#Encode test data with all 10 DAE models
for fold in range(start, end):
    print("DAE model with " + str(dimension) + " nodes and fold " + str(fold))
    
    #Load DAE models: architecture from the .json file, weights from the .h5
    #file. The `with` block closes the file even if reading fails.
    model_prefix = input_folder + 'DAE_FILES/DAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)
    with open(model_prefix + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)
    
    encoder.load_weights(model_prefix + '.h5')
    print("Loaded model from disk")

    # Encode test data using the DAE model
    test_encoded = encoder.predict(input_df_test)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    print("Encoded data ", test_encoded_df.shape)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_DAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
72 | 
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_ICA.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA RNA-Seq ICA embeddings
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import FastICA
 9 | import sklearn.preprocessing
10 | from scipy.stats.mstats import winsorize
11 | 
#Command-line inputs: cancer folder name and TCGA cohort name
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
print("CANCER NAME: " + cancer_type)
print("TEST NAME: " + tcga_type)

input_folder = '../../ALL_CANCER_FILES/{}/'.format(cancer_type)
output_folder = input_folder + 'TCGA_FILES/'

#Training expression matrix; NaNs are replaced with np.nan_to_num
data_df = pd.read_table('{}{}_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'.format(input_folder, cancer_type), sep = '\t', index_col=0)
print("Training data ", data_df.shape)
training_data = np.nan_to_num(data_df.values)

#Preprocessed TCGA RNA-Seq expression to be encoded
tcga_df = pd.read_table('{}TCGA_{}_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.format(output_folder, tcga_type), index_col= 0)
print("TCGA data ", tcga_df.shape)
test_data = tcga_df.values

#Fit 10 ICA models that differ only in their random seed and encode the
#TCGA data with each of them; output files are numbered 1..10
for run in range(10):
    seed = 12345 * run
    ica = FastICA(n_components = 150, random_state = seed, tol=0.001, max_iter = 100000)
    print(ica)
    components = ica.fit(training_data).components_
    print("ICA components ", components.shape)

    #Project the TCGA samples and write them with their sample names
    encoded_data = ica.transform(test_data)
    print("Encoded TCGA data ", encoded_data.shape)
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    output_file = '{}TCGA_RNASEQ_{}_ICA_150L_{}.tsv'.format(output_folder, tcga_type, run + 1)
    encoded_df.to_csv(output_file, sep = '\t')
47 |     
48 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_PCA.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA RNA-Seq PCA embeddings
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.decomposition import PCA
 9 | import sklearn.preprocessing
10 | from scipy.stats.mstats import winsorize
11 | import sys
12 | 
#Read cancer type and TCGA type
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]
print("CANCER NAME: " + cancer_type)
print("TEST NAME: " + tcga_type)

#Both folders end with '/', so file names are appended without a leading
#separator
input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/' 

#Read training data
data_df = pd.read_table( input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
print("Training data ", data_df.shape)
training_data = data_df.values
training_data = np.nan_to_num(training_data)

#Train PCA model
pca = PCA(n_components = 150)
pca.fit(training_data)
components = pca.components_
print("PCA components ", components.shape)

#Read TCGA RNA-Seq expression data
tcga_df = pd.read_table(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0)
print("TCGA data ", tcga_df.shape)

#Encode TCGA data with PCA model
test_data = tcga_df.values
encoded_data = pca.transform(test_data)
print("Encoded TCGA data ", encoded_data.shape)

#Record the encoded data, keeping the TCGA sample names as index
encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
encoded_df.to_csv(output_folder + 'TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv', sep = '\t')
45 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_RP.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA RNA-Seq RP embeddings
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import csv
 8 | from sklearn.random_projection import GaussianRandomProjection
 9 | import sklearn.preprocessing
10 | from scipy.stats.mstats import winsorize
11 | 
#Command-line inputs: cancer folder name and TCGA cohort name
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
print("CANCER NAME: " + cancer_type)
print("TEST NAME: " + tcga_type)

input_folder = '../../ALL_CANCER_FILES/{}/'.format(cancer_type)
output_folder = input_folder + 'TCGA_FILES/'

#Training expression matrix; NaNs are replaced with np.nan_to_num
data_df = pd.read_table('{}{}_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'.format(input_folder, cancer_type), sep = '\t', index_col=0)
print("Training data ", data_df.shape)
training_data = np.nan_to_num(data_df.values)

#Preprocessed TCGA RNA-Seq expression to be encoded
tcga_df = pd.read_table('{}TCGA_{}_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.format(output_folder, tcga_type), index_col= 0)
print("TCGA data ", tcga_df.shape)
test_data = tcga_df.values

#Fit 10 Gaussian random projections that differ only in their random seed
#and encode the TCGA data with each of them; output files are numbered 1..10
for run in range(10):
    seed = run * 12345
    transformer = GaussianRandomProjection(n_components = 150, random_state = seed)
    components = transformer.fit(training_data).components_
    print("RP components ", components.shape)

    #Project the TCGA samples and write them with their sample names
    encoded_data = transformer.transform(test_data)
    print("Encoded TCGA data ", encoded_data.shape)
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    output_file = '{}TCGA_RNASEQ_{}_RP_150L_{}.tsv'.format(output_folder, tcga_type, run + 1)
    encoded_df.to_csv(output_file, sep = '\t')
46 |     
47 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_VAE.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for encoding TCGA RNA-Seq expression using VAE models
 3 | ###############################
 4 | 
 5 | import os
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | import math 
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 | 
13 | import tensorflow as tf
14 | from keras.layers import Input, Dense, Lambda, Layer, Activation
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.models import Model
17 | from keras import backend as K
18 | from keras import metrics, optimizers
19 | from keras.callbacks import Callback
20 | import keras
21 | 
22 | import csv
23 | import sys
24 | from keras.models import model_from_json
25 | from sklearn import preprocessing
26 | 
#Prevent tensorflow from using all the memory: enable the allow_growth GPU option
#on the session configuration before the session is created
config = tf.ConfigProto()
gpu_options = config.gpu_options
gpu_options.allow_growth = True
sess = tf.Session(config = config)
31 | 
32 | #Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Return the mean of the reconstruction term plus the beta-weighted KL term.

    x_input   -- tensor of original inputs
    x_decoded -- tensor of reconstructed inputs

    NOTE(review): original_dim, z_mean, z_log_var and beta are read from module
    scope but are not defined anywhere in this script, and the function is never
    called here - presumably carried over from the training script. Confirm
    before relying on it.
    """
    #Reconstruction term: MSE scaled by original_dim
    mse_term = metrics.mse(x_input, x_decoded) * original_dim
    #KL term built from the latent mean and log-variance tensors
    kl_term = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis = -1)
    #beta is read as a plain value at the time the loss is built
    beta_value = K.get_value(beta)
    return K.mean(mse_term + (beta_value * kl_term))
37 | 
38 | 
#Read user inputs
import sys
cancer = sys.argv[1]          #main cancer type; selects the ALL_CANCER_FILES sub-folder
tcga_name = sys.argv[2]       #TCGA cohort name used in the input/output file names
dimension = int(sys.argv[3])  #latent dimension of the VAE models to load
start = int(sys.argv[4])      #first fold index (inclusive)
end = int(sys.argv[5])        #last fold index (exclusive, passed straight to range)

print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/' 

#Read input data (the 1000-component PCA representation of the TCGA cohort)
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)    
print("TCGA expression dataframe ", input_df_test.shape)
    
#Encode the TCGA data with every VAE model in the requested fold range
for fold in range(start, end):
    print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Architecture (.json) and weights (.h5) share the same path prefix
    model_prefix = input_folder + '/VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)

    #Load VAE models; the context manager closes the file even if reading fails
    with open(model_prefix + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(model_prefix + '.h5')
    print("Loaded model from disk")

    #Encode test data using the VAE model
    test_encoded = encoder.predict(input_df_test)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
74 | 
75 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Example_Run_All.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Example for training TCGA models for a cancer type
 3 | ###############################
 4 | import sys
 5 | 
 6 | ##STEP 1: Preprocessing Data
 7 | get_ipython().magic(u"run -i Preprocess_TCGA_Rnaseq_Expression.py BRCA BRCA")
 8 | get_ipython().magic(u"run -i Create_TCGA_Rnaseq_PCs.py BRCA BRCA")
 9 | 
10 | ##STEP 2: Encoding Expression with DeepProfile
11 | get_ipython().magic(u"run -i Create_All_VAE_Embeddings.py BRCA BRCA")
12 | get_ipython().magic(u"run -i Create_DeepProfile_TCGA_Embeddings.py BRCA BRCA")
13 | 
14 | ##STEP 3: Encoding Expression with Competitor Models
15 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_PCA.py BRCA BRCA")
16 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_ICA.py BRCA BRCA")
17 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_RP.py BRCA BRCA")
18 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_AE.py BRCA BRCA")
19 | get_ipython().magic(u"run -i Encode_TCGA_Data_with_DAE.py BRCA BRCA")


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for preprocessing TCGA RNA-Seq expression
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import sklearn.preprocessing
 8 | 
 9 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Preprocess one TCGA RNA-Seq cohort so that it matches the training gene set.

    Reads the cohort's log2 RSEM expression table, cleans the gene names, keeps
    only samples whose second-to-last barcode character is '0', restricts the
    columns to the genes of the training matrix, imputes missing values with the
    per-gene mean (0 for genes absent from TCGA), standardizes every gene and
    writes the result as a tab-separated file.

    cancer_type -- main cancer type; selects the ALL_CANCER_FILES sub-folder
    tcga_type   -- TCGA cohort name used in the input and output file names
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
    
    #Read TCGA RNA-Seq expression (genes in rows in the file, so transpose to samples x genes)
    print("Reading expression...")
    tcga_df = pd.read_table('../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt', sep = '\t', index_col= 0)
    tcga_df = tcga_df.transpose()
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df.head())
    
    #Map to gene names: keep the part before '|'. partition() leaves a name
    #without a separator unchanged instead of raising like str.index() would.
    print("Correcting gene names...")
    gene_names = [n.partition('|')[0] for n in tcga_df.columns]
    tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = gene_names)
 
    #Eliminate unknown genes ('?') and keep the first column of each duplicated name
    tcga_df = tcga_df.iloc[:, tcga_df.columns.values != '?']
    tcga_df = tcga_df.loc[:,~tcga_df.columns.duplicated()]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df)
   
    #Eliminate normal samples
    #NOTE(review): presumably the last two barcode characters are the TCGA
    #sample-type code, with a leading '0' meaning tumor - confirm
    print("Eliminating normal samples..")
    sample_codes = [s[-2:] for s in  tcga_df.index]
    print("Sample codes ", np.unique(sample_codes))
    normal_codes = [s[-2] for s in  tcga_df.index]
    cancer_samples = np.where(np.asarray(normal_codes) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)
    
    #Read training data (only its gene columns are needed below)
    print("Combining with training data...")
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Get only training genes from the expression data. reindex() orders the
    #columns like the training matrix and adds NaN columns for genes TCGA lacks.
    #It avoids concatenating the full training matrix, and unlike
    #iloc[-n:] it cannot return training rows when no tumor sample is left.
    joined_df = tcga_df.reindex(columns = data_df.columns)
    #Impute with the per-gene mean; genes that are entirely missing become 0
    joined_df = joined_df.fillna(joined_df.mean().fillna(0))
    print("TCGA expression ", joined_df.shape)
    print("TCGA expression ", joined_df.head())
    
    #Standardize data to make 0 mean univariate
    print("Standardizing the data...")
    scaled_expression_values = joined_df.values
    normalized_data = sklearn.preprocessing.scale(scaled_expression_values)
    print("TCGA expression mean ", np.mean(normalized_data, axis = 0))
    print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0)))
    print("TCGA expression std ", np.std(normalized_data, axis = 0))
    print("TCGA expression std ", len(np.std(normalized_data, axis = 0)))
    
    #Rebuild the dataframe around the standardized values
    joined_df = pd.DataFrame(normalized_data, index = joined_df.index, columns = joined_df.columns)
    print("Final dataframe ", joined_df.shape)
    print('RANGE: ', (np.max(joined_df.values) - np.min(joined_df.values) ))
    
    #Record expression data
    joined_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
75 | 
76 |     
import sys

#Command-line arguments: main cancer type, then TCGA cohort name
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type = cancer_type, tcga_type = tcga_type)
81 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression_All_Genes.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for preprocessing TCGA RNA-Seq expression
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import sklearn.preprocessing
 8 | 
 9 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Preprocess one TCGA RNA-Seq cohort keeping all of its genes.

    Reads the cohort's log2 RSEM expression table, cleans the gene names, keeps
    only samples whose second-to-last barcode character is '0', standardizes
    every gene, fills remaining missing values with the per-gene mean (0 when a
    gene has no mean) and writes the table as a tab-separated file.

    cancer_type -- main cancer type; selects the output sub-folder
    tcga_type   -- TCGA cohort name used in the input and output file names
    """
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Load the expression table and flip it to samples x genes
    print("Reading expression...")
    expression_path = '../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt'
    tcga_df = pd.read_table(expression_path, sep = '\t', index_col= 0).transpose()
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df.head())

    #Gene labels: keep only the text in front of the '|' separator
    print("Correcting gene names...")
    symbols = []
    for raw_name in tcga_df.columns:
        symbols.append(raw_name[:raw_name.index('|')])
    tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = symbols)

    #Drop genes labelled '?' and keep the first column of every repeated name
    is_known = tcga_df.columns.values != '?'
    tcga_df = tcga_df.iloc[:, is_known]
    is_first = ~tcga_df.columns.duplicated()
    tcga_df = tcga_df.loc[:, is_first]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df)

    #Keep the samples whose second-to-last barcode character is '0'
    print("Eliminating normal samples..")
    print("Sample codes ", np.unique([name[-2:] for name in tcga_df.index]))
    code_digit = [name[-2] for name in tcga_df.index]
    cancer_samples = np.where(np.asarray(code_digit) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)

    #Standardize every gene column and report the per-gene statistics
    print("Standardizing the data...")
    normalized_data = sklearn.preprocessing.scale(tcga_df.values)
    gene_means = np.mean(normalized_data, axis = 0)
    gene_stds = np.std(normalized_data, axis = 0)
    print("TCGA expression mean ", gene_means)
    print("TCGA expression mean ", len(gene_means))
    print("TCGA expression std ", gene_stds)
    print("TCGA expression std ", len(gene_stds))

    #Rebuild the dataframe around the standardized values, then impute what is still missing
    tcga_df = pd.DataFrame(normalized_data, index = tcga_df.index, columns = tcga_df.columns)
    column_fill = tcga_df.mean().fillna(0)
    tcga_df = tcga_df.fillna(column_fill)
    print("Final dataframe ", tcga_df.shape)
    value_range = np.max(tcga_df.values) - np.min(tcga_df.values)
    print('RANGE: ', value_range)

    #Write the preprocessed expression table
    tcga_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_ALL_GENES_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
    print(tcga_df)
64 |     
import sys

#Command-line arguments: main cancer type, then TCGA cohort name
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type = cancer_type, tcga_type = tcga_type)
69 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression_All_Genes_Uncorrected.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for preprocessing TCGA RNA-Seq expression
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import sklearn.preprocessing
 8 | 
 9 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Export one TCGA RNA-Seq cohort with all genes and without standardization.

    Reads the cohort's log2 RSEM expression table, cleans the gene names, keeps
    only samples whose second-to-last barcode character is '0', fills missing
    values with the per-gene mean (0 when a gene has no mean) and writes the
    table as a tab-separated file. The expression values are NOT rescaled,
    although the progress message still says "Standardizing".

    cancer_type -- main cancer type; selects the output sub-folder
    tcga_type   -- TCGA cohort name used in the input and output file names
    """
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Load the expression table and flip it to samples x genes
    print("Reading expression...")
    expression_path = '../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt'
    tcga_df = pd.read_table(expression_path, sep = '\t', index_col= 0).transpose()
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df.head())

    #Gene labels: keep only the text in front of the '|' separator
    print("Correcting gene names...")
    symbols = []
    for raw_name in tcga_df.columns:
        symbols.append(raw_name[:raw_name.index('|')])
    tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = symbols)

    #Drop genes labelled '?' and keep the first column of every repeated name
    is_known = tcga_df.columns.values != '?'
    tcga_df = tcga_df.iloc[:, is_known]
    is_first = ~tcga_df.columns.duplicated()
    tcga_df = tcga_df.loc[:, is_first]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df)

    #Keep the samples whose second-to-last barcode character is '0'
    print("Eliminating normal samples..")
    print("Sample codes ", np.unique([name[-2:] for name in tcga_df.index]))
    code_digit = [name[-2] for name in tcga_df.index]
    cancer_samples = np.where(np.asarray(code_digit) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)

    #The raw values are used as they are; only their per-gene statistics are reported
    print("Standardizing the data...")
    normalized_data = tcga_df.values
    gene_means = np.mean(normalized_data, axis = 0)
    gene_stds = np.std(normalized_data, axis = 0)
    print("TCGA expression mean ", gene_means)
    print("TCGA expression mean ", len(gene_means))
    print("TCGA expression std ", gene_stds)
    print("TCGA expression std ", len(gene_stds))

    #Rebuild the dataframe and impute the missing values
    tcga_df = pd.DataFrame(normalized_data, index = tcga_df.index, columns = tcga_df.columns)
    column_fill = tcga_df.mean().fillna(0)
    tcga_df = tcga_df.fillna(column_fill)
    print("Final dataframe ", tcga_df.shape)
    value_range = np.max(tcga_df.values) - np.min(tcga_df.values)
    print('RANGE: ', value_range)

    #Write the expression table
    tcga_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_ALL_GENES_NOT_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
    print(tcga_df)
64 |     
import sys

#Command-line arguments: main cancer type, then TCGA cohort name
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type = cancer_type, tcga_type = tcga_type)
69 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_Joined_Survival_Dataframes.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating joined TCGA survival dataframes and DeepProfile embeddings
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | import sys
 8 | 
 9 | def createJoinedDf(tcga_type, cancer_type):
10 |     
11 |     input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
12 |     output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
13 |     
14 |     #Read survival dataframe
15 |     surv_df = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = ',')
16 |     surv_df = surv_df.astype(float)
17 |     print("Survival dataframe ", surv_df.shape)
18 |     
19 |     #Drop nan samples
20 |     indices_to_drop1 = np.where(np.isnan(surv_df.values))[0]
21 |     indices_to_drop2 = np.where(surv_df['Survival_in_days'].values <= 0)[0]
22 |     indices_to_drop = np.unique(np.concatenate((indices_to_drop1, indices_to_drop2)))
23 |     surv_df = surv_df.drop(surv_df.index[indices_to_drop])
24 |     surv_df = pd.DataFrame(surv_df.values, index = surv_df.index, columns = ['fustat', 'futime'])
25 |     print("Survival dataframe ", surv_df.shape)
26 |     
27 |     #Read DeepProfile embedding
28 |     data_df = pd.read_table(input_folder  + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv', index_col = 0)
29 |     print("DeepProfile embedding ", data_df.shape)
30 |     
31 |     #Match sample indices
32 |     surv_df_sample_names = surv_df.index
33 |     data_df_sample_names = data_df.index
34 |     print("Surv samples ", surv_df_sample_names)
35 |     print("Data samples ", data_df_sample_names)
36 |     
37 |     new_indices = [s.upper() for s in surv_df.index]
38 |     surv_df = pd.DataFrame(surv_df.values, index = new_indices, columns = surv_df.columns)
39 |     
40 |     new_columns = ['Node ' + str(i) for i in range(1, 151)]
41 |     new_indices = [s[:12] for s in data_df.index]
42 |     data_df = pd.DataFrame(data_df.values, index = new_indices, columns = new_columns)
43 |     
44 |     surv_df_sample_names = surv_df.index
45 |     data_df_sample_names = data_df.index
46 |     print("Surv samples ", surv_df_sample_names)
47 |     print("Data samples ", data_df_sample_names)
48 |     
49 |     #Take the samples available in both datasets
50 | #     intersect_indices = np.intersect1d(data_df.index, surv_df.index)
51 | #     print("Common indices ", intersect_indices)
52 |     
53 |     #Create joined dataframe
54 |     joined_df = data_df.merge(surv_df, left_index=True, right_index=True)
55 |     joined_df = joined_df.sort_index()
56 |     joined_df = joined_df.loc[~joined_df.index.duplicated(keep='first')]
57 |     print("Joined dataframe ", joined_df.shape)
58 |     print(joined_df)
59 |     joined_df.to_csv(output_folder + '/DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', sep = '\t')
60 |     
#Command-line arguments: main cancer type first, TCGA cohort name second
cancer_type, tcga_type = sys.argv[1], sys.argv[2]

createJoinedDf(tcga_type = tcga_type, cancer_type = cancer_type)
66 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_Joined_Survival_Dataframes_Cancer_Types.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating joined cancer types TCGA survival dataframes 
 3 | ###############################
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | 
 8 | #Method for combining datasets
 9 | def create_Data(cancer):
10 |     
11 |     input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'
12 |     output_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'
13 |     
14 |     c = np.where(np.asarray(cancer_types) == cancer)[0][0]
15 |     df_list = []
16 |     for test in test_cases[c]:
17 |         print("TCGA type ", test)
18 |         surv_df = pd.read_table(input_folder + '/DeepProfile_Embedding_and_' + test + '_Survival_df.tsv', sep = '\t', index_col = 0)
19 |         print("Survival dataframe ", surv_df.shape)
20 |         df_list.append(surv_df)
21 | 
22 |     #Combine dataframes
23 |     joined_df = pd.concat(df_list)
24 |     print("Joined survival dataframe ", joined_df.shape)
25 |     joined_df.to_csv(output_folder + '/DeepProfile_Embedding_and_' + cancer + '_Survival_df.tsv', sep = '\t')
26 | 
#Cancer types to build, and for each of them the TCGA cohorts it is made of
cancer_types = ['LUNG']
test_cases = [ ['LUAD', 'LUSC']]

for cancer_name in cancer_types:
    print("Cancer type ", cancer_name)
    create_Data(cancer_name)
33 | 
34 | #Method for combining survival dataframes
def create_Data(cancer):
    """Stack the survival tables of all cohorts of one cancer type.

    cancer -- cancer type name; must appear in the module-level cancer_types list.
              The entry of test_cases at the same position lists its cohorts.

    Reads one 'TCGA_<cohort>_Survival_df.tsv' per cohort from the cancer type's
    TCGA_FILES folder and writes their row-wise concatenation to
    'TCGA_<cancer>_Survival_df.tsv' in the same folder. This definition replaces
    the function of the same name defined earlier in this script.

    NOTE(review): the cohort files are read as comma-separated but the combined
    file is written tab-separated under the same naming pattern - confirm that
    whatever reads the combined file expects tabs.
    """
    #Input and output live in the same folder
    tcga_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'

    #Position of this cancer type in cancer_types selects its cohort list
    position = np.where(np.asarray(cancer_types) == cancer)[0][0]

    frames = []
    for cohort in test_cases[position]:
        print("TCGA type ", cohort)
        cohort_path = tcga_folder + 'TCGA_' + cohort + '_Survival_df.tsv'
        cohort_df = pd.read_table(cohort_path, sep = ',', index_col = 0)
        print("Survival dataframe ", cohort_df.shape)
        frames.append(cohort_df)

    #Stack the cohort tables on top of each other
    joined_df = pd.concat(frames)
    print("Joined survival dataframe ", joined_df.shape)
    joined_df.to_csv(tcga_folder + 'TCGA_' + cancer + '_Survival_df.tsv', sep = '\t')
52 | 
#Cancer types to build, and for each of them the TCGA cohorts it is made of
cancer_types = ['LUNG']
test_cases = [ ['LUAD', 'LUSC']]

for cancer_name in cancer_types:
    print("Cancer type ", cancer_name)
    create_Data(cancer_name)
59 | 
60 |    


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_TCGA_Survival_Dataframes.py:
--------------------------------------------------------------------------------
 1 | ###############################
 2 | #Script for creating TCGA survival dataframes
 3 | ###############################
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import math
 8 | 
 9 | #Method for defining the survival dataframe
def createSurvivalDF(cancer_type, tcga_type):
    """Build the survival table (status and time) of one TCGA cohort.

    cancer_type -- main cancer type; selects the output sub-folder
    tcga_type   -- TCGA cohort name used in the input and output file names

    Reads '<tcga_type>.clin.merged.picked.txt', transposes it to one row per
    sample and writes a comma-separated file with two columns: 'Status' (False
    when vital_status is 0, True otherwise) and 'Survival_in_days'
    (days_to_last_followup when Status is False, days_to_death when it is True).
    """
    clinical_path = '../TCGA_DATA/TCGA_CLINICAL_DATA/' + tcga_type + '.clin.merged.picked.txt'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Load the clinical table and flip it to samples x attributes
    survival_df = pd.read_table(clinical_path, index_col = 0).transpose()
    print("TCGA clinical dataframe ", survival_df.shape)
    print("TCGA clinical dataframe ", survival_df.columns)

    #The three clinical attributes the survival table is built from
    vital_values = survival_df['vital_status'].values
    death_days = survival_df['days_to_death'].values
    followup_days = survival_df['days_to_last_followup'].values

    #Status is True for every sample whose vital status is not 0
    vital_status_array = [int(flag) != 0 for flag in vital_values]
    #Time comes from days_to_death when Status is True, from the last follow-up otherwise
    days_status_array = [
        to_death if is_dead else to_followup
        for is_dead, to_death, to_followup in zip(vital_status_array, death_days, followup_days)
    ]

    #Assemble the two columns side by side on the sample index
    columns = [
        pd.DataFrame(vital_status_array, index = survival_df.index, columns = ['Status']),
        pd.DataFrame(days_status_array, index = survival_df.index, columns = ['Survival_in_days']),
    ]
    joined_df = pd.concat(columns, axis = 1)
    print("TCGA survival dataframe ", joined_df)
    joined_df.to_csv(output_folder + '/TCGA_' + tcga_type + '_Survival_df.tsv')
44 |     
45 |     
import sys

#Command-line arguments: main cancer type, then TCGA cohort name
cancer_type, tcga_type = sys.argv[1], sys.argv[2]

createSurvivalDF(cancer_type = cancer_type, tcga_type = tcga_type)
51 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Script for predicting survival status of patients
  3 | ###############################
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from scipy import stats
  8 | from sklearn import metrics
  9 | import random
 10 | 
 11 | from sklearn.metrics import roc_auc_score
 12 | from sklearn.model_selection import GridSearchCV
 13 | from sklearn.model_selection import LeaveOneOut
 14 | from sklearn.model_selection import KFold
 15 | from sklearn import linear_model
 16 | from sklearn.linear_model import LogisticRegression
 17 | from sklearn.metrics import average_precision_score
 18 | from sklearn.metrics import accuracy_score
 19 | from sklearn.preprocessing import StandardScaler
 20 | from sklearn.metrics import roc_curve, auc
 21 | 
 22 | #Define method for training models
 23 | def trait_classification_accuracy(X, Y):
 24 |     
 25 |     #Do cross validation
 26 |     loo = KFold(20, shuffle = True, random_state = 123456)
 27 |     
 28 |     predictions = np.zeros(X.shape[0])
 29 |     probabilities = np.zeros(X.shape[0])
 30 |     
 31 |     for train_index, test_index in loo.split(X):
 32 |         X_train, X_test = X[train_index], X[test_index]
 33 |         Y_train, Y_test = Y[train_index], Y[test_index]
 34 | 
 35 |         #Normalize training data
 36 |         scaler = StandardScaler()
 37 |         scaler.fit(X_train)
 38 | 
 39 |         X_std = scaler.transform(X_train)
 40 |         X_test_std = scaler.transform(X_test)
 41 |      
 42 |         # #Tune parameters
 43 |         tuned_parameters = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]
 44 |         
 45 |         model = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000, 
 46 |                                    solver = 'liblinear')
 47 |         clf = GridSearchCV(model, tuned_parameters, cv = 5, scoring='roc_auc', n_jobs = -1)
 48 |         clf.fit(X_std, Y_train)
 49 |         
 50 |         #Record predictions and probabilities
 51 |         predicted_Y = clf.predict(X_test_std)
 52 |         predictions[test_index] = predicted_Y
 53 |         
 54 |         probs = clf.predict_proba(X_test_std)
 55 | 
 56 |         probabilities[test_index] = probs[:, 1]
 57 |         
 58 | 
 59 |     #Calculate accuracy and ROC-AUC
 60 |     accuracy = accuracy_score(Y, predictions)
 61 |     score = roc_auc_score(Y, probabilities)
 62 |     
 63 |     return [accuracy, score]
 64 | 
 65 | #Define method for predicting survival
 66 | def predict_survival(cancer_type, tcga_type, method, run_index, seed):
 67 |     
 68 |     accuracies = []
 69 |     aucs = []
 70 |     
 71 |     if method == 'PCA':
 72 |         X_df  = pd.read_table(input_folder + 'TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv', index_col = 0)
 73 | 
 74 |     if method == 'ICA' or method == 'RP':
 75 |         X_df  = pd.read_table(input_folder + 'TCGA_RNASEQ_' + tcga_type + '_' + method + '_150L_' + str(run_index) + '.tsv', index_col = 0)
 76 | 
 77 |     if method == 'AE' or method == 'DAE':
 78 |         X_df  = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_150L_' + str(run_index) + '.tsv', index_col = 0)
 79 |      
 80 |     if method == 'VAE':
 81 |         X_df  = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_' + VAE_dim + 'L_' + str(run_index) + '.tsv', index_col = 0)
 82 |         
 83 |     if method == 'DeepProfile':
 84 |         X_df  = pd.read_table(input_folder + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv', index_col = 0, sep = '\t')
 85 | 
 86 |     #Read expression data
 87 |     print("Expression data ", X_df.shape)
 88 |     print("Expression data ", X_df.index)
 89 |     
 90 |     #Now, replace X_df index to match with Y_df index
 91 |     mapper = lambda t: t[:12]
 92 |     vfunc = np.vectorize(mapper)
 93 |     newX_index = vfunc( X_df.index)
 94 | 
 95 |     X_df = pd.DataFrame(X_df.values, index = newX_index, columns = X_df.columns)
 96 | 
 97 |     #Take intersecting samples in datasets
 98 |     X_samples = X_df.index
 99 |     Y_samples = Y_df.index
100 |     intersecting_samples = np.intersect1d(X_samples, Y_samples)
101 | 
102 |     subX_df = X_df.T[intersecting_samples].T
103 |     subY_df = Y_df.T[intersecting_samples].T
104 | 
105 |     print("X dataframe ", subX_df.shape)
106 |     print("Y dataframe ", subY_df.shape)
107 | 
108 |     print("X dataframe ", subX_df.index)
109 |     print("Y dataframe ", subY_df.index)
110 | 
111 |     sample_indices = [np.where(subY_df.values == False)[0], np.where(subY_df.values == True)[0]]
112 |     sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
113 |     print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])
114 | 
115 |     #Now select the class with highest number of samples and subsample
116 |     low_class = np.argmin(sample_counts)
117 |     high_class = np.argmax(sample_counts)
118 |     print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
119 |     random.seed(12345 * seed)
120 |     random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
121 |     selected_indices = np.sort(sample_indices[high_class][random_indices])
122 | 
123 |     subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
124 |     subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])                           
125 |     subX_df = subX_df.sort_index()
126 |     subY_df = subY_df.sort_index()
127 | 
128 |     results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))
129 | 
130 |     return results
131 | 
#Read user inputs
import os
import sys
cancer_type = sys.argv[1] #main cancer type
tcga_type = sys.argv[2] #TCGA type
method = sys.argv[3] #name of the method
run_index = int(sys.argv[4]) #run index
if len(sys.argv) > 5:
    VAE_dim = sys.argv[5] #latent dimension, only needed for method 'VAE'
    
input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
output_folder = 'Prediction_Results/'

#Create the result folder up front so that saving cannot fail after all runs are done
os.makedirs(output_folder + cancer_type, exist_ok = True)

Y_df  = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = '\t')
print("Survival dataframe ", Y_df.shape)

print("ALIVE..")
print( Y_df[Y_df['fustat']  == 0]['futime'])
print( np.mean(Y_df[Y_df['fustat']  == 0]['futime'].values))

#Select all dead patients, only if they died within 5 years.
#The positions are computed on the full Y_df because they are applied to Y_df
#with iloc below; positions taken inside the dead-only subset would point at
#the wrong rows.
is_dead_within_5y = ((Y_df['fustat'] == 1) & (Y_df['futime'] < 5 * 365)).values
indices_dead  = np.where(is_dead_within_5y)[0]
print("Dead within 5 year ", Y_df.iloc[indices_dead][['fustat', 'futime']])
print("Dead within 5 year  ", np.max(Y_df.iloc[indices_dead]['futime']))

#Select all patients with more than 5 years of recorded time
#NOTE(review): this also keeps patients with fustat == 1 and futime > 5 years,
#who then stay labelled 1 - confirm that this is intended
indices_alive  = np.where(Y_df['futime'] > 5 * 365)[0]
print("Alive after 5 year ", Y_df.iloc[indices_alive][['fustat', 'futime']])
print("Alive after 5 year ", np.min(Y_df.iloc[indices_alive]['futime']))

#Keep only the status label of the selected patients, in their original row order
indices = list(indices_dead) + list(indices_alive)
indices = np.unique(indices)
Y_df = Y_df['fustat']
Y_df = Y_df.iloc[indices]
Y_df = Y_df.dropna()
print("Survival dataframe \n ", Y_df)

all_accuracies = [] 
all_aucs = [] 

#Repeat the evaluation with 50 different balanced subsamples
for sampling_index in range(50):
    result = predict_survival(cancer_type, tcga_type, method, run_index, sampling_index)
    print("Accuracy: ", result[0]) 
    print("ROC-AUC: ", result[1]) 
    all_accuracies.append(result[0])
    all_aucs.append(result[1])

print("FINAL RESULTS...")
print("MEAN ACCURACY ", np.mean(np.asarray(all_accuracies)))
print("MEAN ROC-AUC ", np.mean(np.asarray(all_aucs)))

#Save results to a file; VAE results additionally carry the latent dimension in their name
if method == 'VAE':
    result_tag = tcga_type + '_' + method + '_' + VAE_dim + 'L_' + str(run_index)
else:
    result_tag = tcga_type + '_' + method + '_' + str(run_index)
result_prefix = output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + result_tag
np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
196 | 
197 | 
198 |     
199 |     
200 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival_Raw_Data.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Predicting survival status of patients using raw gene data
  3 | ###############################
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from scipy import stats
  8 | from sklearn import metrics
  9 | import random
 10 | 
 11 | from sklearn.metrics import roc_auc_score
 12 | from sklearn.model_selection import GridSearchCV
 13 | from sklearn.model_selection import LeaveOneOut
 14 | from sklearn.model_selection import KFold
 15 | from sklearn import linear_model
 16 | from sklearn.linear_model import LogisticRegression
 17 | from sklearn.metrics import average_precision_score
 18 | from sklearn.metrics import accuracy_score
 19 | from sklearn.preprocessing import StandardScaler
 20 | from sklearn.metrics import roc_curve, auc
 21 | 
 22 | #Define method for training models
 23 | def trait_classification_accuracy(X, Y):
 24 |     
 25 |     #Do cross validation
 26 |     loo = KFold(20, shuffle = True, random_state = 123456)
 27 |     
 28 |     predictions = np.zeros(X.shape[0])
 29 |     probabilities = np.zeros(X.shape[0])
 30 |     
 31 |     for train_index, test_index in loo.split(X):
 32 |         X_train, X_test = X[train_index], X[test_index]
 33 |         Y_train, Y_test = Y[train_index], Y[test_index]
 34 | 
 35 |         #Normalize training data
 36 |         scaler = StandardScaler()
 37 |         scaler.fit(X_train)
 38 | 
 39 |         X_std = scaler.transform(X_train)
 40 |         X_test_std = scaler.transform(X_test)
 41 |      
 42 |         # #Tune parameters
 43 |         tuned_parameters = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]
 44 |         
 45 |         model = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000, 
 46 |                                    solver = 'liblinear')
 47 |         clf = GridSearchCV(model, tuned_parameters, cv = 5, scoring='roc_auc', n_jobs = -1)
 48 |         clf.fit(X_std, Y_train)
 49 |         
 50 |         #Record predictions and probabilities
 51 |         predicted_Y = clf.predict(X_test_std)
 52 |         predictions[test_index] = predicted_Y
 53 |         
 54 |         probs = clf.predict_proba(X_test_std)
 55 | 
 56 |         probabilities[test_index] = probs[:, 1]
 57 |         
 58 | 
 59 |     #Calculate accuracy and ROC-AUC
 60 |     accuracy = accuracy_score(Y, predictions)
 61 |     score = roc_auc_score(Y, probabilities)
 62 |     
 63 |     return [accuracy, score]
 64 | 
 65 | #Define method for predicting survival
 66 | def predict_survival(X_inp,Y_inp,cancer_type, tcga_type, seed):
 67 |     
 68 |     accuracies = []
 69 |     aucs = []
 70 |     
 71 |     subX_df = X_inp
 72 |     subY_df = Y_inp
 73 |     
 74 |     print("X dataframe ", subX_df.shape)
 75 |     print("Y dataframe ", subY_df.shape)
 76 | 
 77 |     print("X dataframe ", subX_df.index)
 78 |     print("Y dataframe ", subY_df.index)
 79 | 
 80 |     sample_indices = [np.where(subY_df.values == False)[0], np.where(subY_df.values == True)[0]]
 81 |     sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
 82 |     print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])
 83 | 
 84 |     #Now select the class with highest number of samples and subsample
 85 |     low_class = np.argmin(sample_counts)
 86 |     high_class = np.argmax(sample_counts)
 87 |     print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
 88 |     random.seed(12345 * seed)
 89 |     random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
 90 |     selected_indices = np.sort(sample_indices[high_class][random_indices])
 91 | 
 92 |     subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
 93 |     subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])                           
 94 |     subX_df = subX_df.sort_index()
 95 |     subY_df = subY_df.sort_index()
 96 | 
 97 |     results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))
 98 | 
 99 |     return results
100 | 
#Read user inputs
import os

run_index = 0
method = 'RAW'
five_years = 5 * 365 #threshold that futime is compared against

#Project folder names and, at the same position, the TCGA cohort code used in
#the file names. LUNG has no single cohort file: it is the union of LUAD and
#LUSC and is handled separately below.
cancer_types = ['BRCA', 'AML',
                'COLON',
                'BRAIN', 'OV',
                'SARCOMA', 'KIDNEY',
                'LIVER', 'STOMACH',
                'SKIN', 'UTERINE',
                'HEAD_NECK', 'PANCREAS',
                'CERVICAL', 'BLADDER', 'LUNG']

tcga_codes = ['BRCA', 'LAML',
              'COADREAD',
              'GBMLGG', 'OV',
              'SARC', 'KIPAN',
              'LIHC', 'STAD',
              'SKCM', 'UCEC',
              'HNSC', 'PAAD',
              'CESC', 'BLCA', 'LUNG']

#Only a subset is run here; iterate over cancer_types to run every cancer.
for cancer_type in ['HEAD_NECK', 'PANCREAS',
                'CERVICAL', 'BLADDER', 'LUNG']:

    if cancer_type == 'LUNG':
        tcga_types = ['LUAD', 'LUSC']
    else:
        tcga_types = [tcga_codes[cancer_types.index(cancer_type)]]

    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
    output_folder = 'Prediction_Results/'
    #Create the result folder up front so 50 runs are not lost at save time
    os.makedirs(output_folder + cancer_type, exist_ok = True)

    #Read (and, for several subtypes, join) the survival tables
    df_list = []
    for tcga_type in tcga_types:
        sub_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv',
                               index_col = 0, sep = '\t')
        print("Survival dataframe ", sub_df.shape)
        df_list.append(sub_df)
    Y_df = pd.concat(df_list, axis = 0)

    print("ALIVE..")
    print( Y_df[Y_df['fustat']  == 0]['futime'])
    print( np.mean(Y_df[Y_df['fustat']  == 0]['futime'].values))

    #Row masks are computed on the full table. The previous version computed
    #the "dead" positions inside the dead-only sub-table and then applied
    #them to the full table, which selected unrelated rows.
    fustat = Y_df['fustat'].values
    futime = Y_df['futime'].values

    #Select all dead patients, only if they died within 5 years
    dead_mask = (fustat == 1) & (futime < five_years)
    print("Dead within 5 year ", Y_df[dead_mask][['fustat', 'futime']])
    print("Dead within 5 year  ", np.max(Y_df[dead_mask]['futime']))

    #Select all patients followed for more than 5 years
    #NOTE(review): these rows keep their recorded fustat, so a patient who
    #died after the threshold is still labelled 1 - confirm this is intended.
    alive_mask = futime > five_years
    print("Alive after 5 year ", Y_df[alive_mask][['fustat', 'futime']])
    print("Alive after 5 year ", np.min(Y_df[alive_mask]['futime']))

    Y_df = Y_df.loc[dead_mask | alive_mask, 'fustat']
    Y_df = Y_df.dropna()
    #One label per patient, so that X and Y can be matched row by row
    Y_df = Y_df[~Y_df.index.duplicated(keep='first')]
    print("Survival dataframe \n ", Y_df)

    #Read the expression of every subtype; for LUNG this joins LUAD and LUSC
    #instead of loading only the last subtype.
    #NOTE(review): assumes one PREPROCESSED_RNASEQ_EXPRESSION file per subtype
    #exists in this folder - confirm for LUAD.
    raw_data_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
    expression_list = [pd.read_table(raw_data_folder + 'TCGA_' + subtype + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0)
                       for subtype in tcga_types]
    #join='inner' keeps only columns present in every subtype (no NaN columns)
    tcga_df = pd.concat(expression_list, axis = 0, join = 'inner')

    #Now, replace X_df index to match with Y_df index (first 12 characters)
    tcga_df.index = tcga_df.index.str[:12]
    tcga_df = tcga_df[~tcga_df.index.duplicated(keep='first')]

    #Keep patients that have both a label and expression, in the same order
    shared_samples = Y_df.index[Y_df.index.isin(tcga_df.index)]
    Y_df = Y_df.loc[shared_samples]
    tcga_labeled_df = tcga_df.loc[shared_samples, :]

    class0_count = len(np.where(Y_df.values == 0)[0])
    class1_count = len(np.where(Y_df.values == 1)[0])

    all_accuracies = []
    all_aucs = []

    #50 differently seeded balanced subsamples
    for sampling_index in range(50):
        result = predict_survival(tcga_labeled_df, Y_df, cancer_type, tcga_type, sampling_index)
        print("Accuracy: ", result[0])
        print("ROC-AUC: ", result[1])
        all_accuracies.append(result[0])
        all_aucs.append(result[1])

    #Save results to a file
    result_prefix = output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_' + str(run_index)
    np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
    np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
203 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival_Subtypes_Joined.py:
--------------------------------------------------------------------------------
  1 | ###############################
  2 | #Script for predicting survival status of patients
  3 | ###############################
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from scipy import stats
  8 | from sklearn import metrics
  9 | import random
 10 | 
 11 | from sklearn.metrics import roc_auc_score
 12 | from sklearn.model_selection import GridSearchCV
 13 | from sklearn.model_selection import LeaveOneOut
 14 | from sklearn.model_selection import KFold
 15 | from sklearn import linear_model
 16 | from sklearn.linear_model import LogisticRegression
 17 | from sklearn.metrics import average_precision_score
 18 | from sklearn.metrics import accuracy_score
 19 | from sklearn.preprocessing import StandardScaler
 20 | from sklearn.metrics import roc_curve, auc
 21 | 
 22 | #Define method for training models
 23 | def trait_classification_accuracy(X, Y):
 24 |     
 25 |     #Do cross validation
 26 |     loo = KFold(20, shuffle = True, random_state = 123456)
 27 |     
 28 |     predictions = np.zeros(X.shape[0])
 29 |     probabilities = np.zeros(X.shape[0])
 30 |     
 31 |     for train_index, test_index in loo.split(X):
 32 |         X_train, X_test = X[train_index], X[test_index]
 33 |         Y_train, Y_test = Y[train_index], Y[test_index]
 34 | 
 35 |         #Normalize training data
 36 |         scaler = StandardScaler()
 37 |         scaler.fit(X_train)
 38 | 
 39 |         X_std = scaler.transform(X_train)
 40 |         X_test_std = scaler.transform(X_test)
 41 |      
 42 |         #Tune parameters
 43 |         tuned_parameters = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]
 44 |         
 45 |         model = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000, 
 46 |                                    solver = 'liblinear')
 47 |         clf = GridSearchCV(model, tuned_parameters, cv = 5, scoring='roc_auc', n_jobs = -1)
 48 |         clf.fit(X_std, Y_train)
 49 |         
 50 |         #Record predictions and probabilities
 51 |         predicted_Y = clf.predict(X_test_std)
 52 |         predictions[test_index] = predicted_Y
 53 |         
 54 |         probs = clf.predict_proba(X_test_std)
 55 | 
 56 |         probabilities[test_index] = probs[:, 1]
 57 |         
 58 | 
 59 |     #Calculate accuracy and ROC-AUC
 60 |     accuracy = accuracy_score(Y, predictions)
 61 |     score = roc_auc_score(Y, probabilities)
 62 |     
 63 |     return [accuracy, score]
 64 | 
 65 | 
 66 | #Define method for predicting survival
 67 | def predict_survival(cancer_type, tcga_types, method, run_index, seed):
 68 |     print(tcga_types)
 69 |     accuracies = []
 70 |     aucs = []
 71 |     
 72 |     df_list = []
 73 |     for tcga_type in tcga_types:
 74 |         if method == 'PCA':
 75 |             X_df  = pd.read_table(input_folder + 'TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv', index_col = 0)
 76 | 
 77 |         if method == 'ICA' or method == 'RP':
 78 |             X_df  = pd.read_table(input_folder + 'TCGA_RNASEQ_' + tcga_type + '_' + method + '_150L_' + str(run_index) + '.tsv', index_col = 0)
 79 | 
 80 |         if method == 'AE' or method == 'DAE':
 81 |             X_df  = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_150L_' + str(run_index) + '.tsv', index_col = 0)
 82 |          
 83 |         if method == 'VAE':
 84 |             X_df  = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_' + VAE_dim + 'L_' + str(run_index) + '.tsv', index_col = 0)
 85 |             
 86 |         if method == 'DeepProfile':
 87 |             X_df  = pd.read_table(input_folder + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv', index_col = 0, sep = '\t')
 88 | 
 89 | 
 90 |         df_list.append(X_df)
 91 | 
 92 |     X_df = pd.concat(df_list, axis = 0)
 93 |     print("Expression data joined ", X_df.shape)
 94 | 
 95 |     #Now, replace X_df index to match with Y_df index
 96 |     mapper = lambda t: t[:12]
 97 |     vfunc = np.vectorize(mapper)
 98 |     newX_index = vfunc( X_df.index)
 99 | 
100 |     X_df = pd.DataFrame(X_df.values, index = newX_index, columns = X_df.columns)
101 | 
102 |     #Take intersecting samples in datasets
103 |     X_samples = X_df.index
104 |     Y_samples = Y_df.index
105 |     intersecting_samples = np.intersect1d(X_samples, Y_samples)
106 | 
107 |     subX_df = X_df.T[intersecting_samples].T
108 |     subY_df = Y_df.T[intersecting_samples].T
109 | 
110 |     print("X dataframe ", subX_df.shape)
111 |     print("Y dataframe ", subY_df.shape)
112 | 
113 |     print("X dataframe ", subX_df.index)
114 |     print("Y dataframe ", subY_df.index)
115 | 
116 |     sample_indices = [np.where(subY_df.values == False)[0], np.where(subY_df.values == True)[0]]
117 |     sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
118 |     print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])
119 | 
120 |     #Now select the class with highest number of samples and subsample
121 |     low_class = np.argmin(sample_counts)
122 |     high_class = np.argmax(sample_counts)
123 |     print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
124 |     random.seed(12345 * seed)
125 |     random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
126 |     selected_indices = np.sort(sample_indices[high_class][random_indices])
127 | 
128 |     subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
129 |     subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])                           
130 |     subX_df = subX_df.sort_index()
131 |     subY_df = subY_df.sort_index()
132 | 
133 |     results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))
134 |     return results
135 | 
#Read user inputs
import os
import sys
cancer_type = sys.argv[1] #main cancer type
method = sys.argv[2] #name of the method
run_index = int(sys.argv[3]) #run index
if len(sys.argv) > 4:
    VAE_dim = sys.argv[4] #latent dimension, only used by the VAE method

#Subtypes that are joined for each main cancer type
if cancer_type == 'LUNG':
    tcga_types = ['LUAD', 'LUSC']
else:
    #Without this, tcga_types is undefined (or stale when run with "run -i")
    raise ValueError("No subtypes defined for cancer type: " + cancer_type)

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
output_folder = 'Prediction_Results/'
#Create the result folder up front so 50 runs are not lost at save time
os.makedirs(output_folder + cancer_type, exist_ok = True)

five_years = 5 * 365 #threshold that futime is compared against

#Join data for cancer subtypes
df_list = []
for tcga_type in tcga_types:
    sub_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = '\t')
    print("Survival dataframe ", sub_df.shape)
    df_list.append(sub_df)

Y_df = pd.concat(df_list, axis = 0)
print("JOINED survival dataframe ", Y_df.shape)

print("ALIVE..")
print( Y_df[Y_df['fustat']  == 0]['futime'])
print( np.mean(Y_df[Y_df['fustat']  == 0]['futime'].values))

#Row masks are computed on the full table. The previous version computed the
#"dead" positions inside the dead-only sub-table and then applied them to the
#full table, which selected unrelated rows.
fustat = Y_df['fustat'].values
futime = Y_df['futime'].values

#Select all dead patients, only if they died within 5 years
dead_mask = (fustat == 1) & (futime < five_years)
print("Dead within 5 year ", Y_df[dead_mask][['fustat', 'futime']])
print("Dead within 5 year  ", np.max(Y_df[dead_mask]['futime']))

#Select all patients followed for more than 5 years
#NOTE(review): these rows keep their recorded fustat, so a patient who died
#after the threshold is still labelled 1 - confirm this is intended.
alive_mask = futime > five_years
print("Alive after 5 year ", Y_df[alive_mask][['fustat', 'futime']])
print("Alive after 5 year ", np.min(Y_df[alive_mask]['futime']))

#Keep only the label column of the selected patients
Y_df = Y_df.loc[dead_mask | alive_mask, 'fustat']
Y_df = Y_df.dropna()
print("Survival dataframe \n ", Y_df)

class0_count = len(np.where(Y_df.values == 0)[0])
class1_count = len(np.where(Y_df.values == 1)[0])

all_accuracies = []
all_aucs = []

#50 differently seeded balanced subsamples
for sampling_index in range(50):
    result = predict_survival(cancer_type, tcga_types, method, run_index, sampling_index)
    print("Accuracy: ", result[0])
    print("ROC-AUC: ", result[1])
    all_accuracies.append(result[0])
    all_aucs.append(result[1])

print("FINAL RESULTS...")
print("MEAN ACCURACY ", np.mean(np.asarray(all_accuracies)))
print("MEAN ROC-AUC ", np.mean(np.asarray(all_aucs)))

#Save results to a file; VAE results also carry the latent dimension
result_prefix = output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_'
if method == 'VAE':
    result_prefix += VAE_dim + 'L_'
result_prefix += str(run_index)

np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
207 |     
208 | 
209 |     
210 |     
211 | 


--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Run_Models.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | run = int(sys.argv[1])
 3 | 
 4 | cancer_types = ['BRCA', 'AML', 
 5 |                 'COLON', 
 6 |                 'BRAIN', 'OV', 
 7 |                 'SARCOMA', 'KIDNEY', 
 8 |                 'LIVER', 'STOMACH', 
 9 |                 'SKIN', 'UCEC', 
10 |                 'HEAD_NECK', 'PANCREAS',
11 |                 'CERVICAL', 'BLADDER', 'LUNG']
12 | 
13 | tcga_types = ['BRCA', 'LAML', 
14 |                 'COADREAD', 
15 |                 'GBMLGG', 'OV', 
16 |                 'SARC', 'KIPAN', 
17 |                 'LIHC', 'STAD', 
18 |                 'SKCM', 'UTERINE',
19 |                 'HNSC', 'PAAD',
20 |                 'CESC', 'BLCA', 'LUNG']
21 | 
22 | for c in range(len(cancer_types)):
23 |     cancer_type = cancer_types[c]
24 |     tcga_type = tcga_types[c]
25 |     print("------------")
26 |     print(cancer_type)
27 |     print(tcga_type)
28 |     
29 |     if cancer_type == 'LUNG':
30 |      
31 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "PCA " + str(run))
32 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "DeepProfile " + str(run))
33 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "ICA " + str(run + 1))
34 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "RP " + str(run + 1))
35 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "AE " + str(run))
36 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "DAE " + str(run))
37 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "VAE " + str(run) + " 5")
38 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "VAE " + str(run) + " 10")
39 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "VAE " + str(run) + " 25")
40 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "VAE " + str(run) + " 50")
41 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "VAE " + str(run) + " 75")
42 |         get_ipython().magic(u"run -i     Predict_Survival_Subtypes_Joined.py " +  cancer_type + " " + "VAE " + str(run) + " 100")
43 | 
44 |     else:
45 |         
46 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "PCA " + str(run))
47 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "DeepProfile " + str(run))
48 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "ICA " + str(run + 1))
49 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "RP " + str(run + 1))
50 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "AE " + str(run))
51 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "DAE " + str(run))
52 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "VAE " + str(run) + " 5")
53 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "VAE " + str(run) + " 10")
54 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "VAE " + str(run) + " 25")
55 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "VAE " + str(run) + " 50")
56 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "VAE " + str(run) + " 75")
57 |         get_ipython().magic(u"run -i     Predict_Survival.py " +  cancer_type + " "  +  tcga_type + " " + "VAE " + str(run) + " 100")
58 | 


--------------------------------------------------------------------------------