├── COMPETITOR_TRAININGS
├── AE_2Layers_Model.py
├── Create_ICA_Data.py
├── Create_PCA_Data.py
├── Create_RP_Data.py
├── DAE_2Layers_Model.py
├── Example_Run_All.py
├── Get_AE_IG_Attributions.py
├── Get_DAE_IG_Attributions.py
├── IntegratedGradients.py
├── Train_AE_Models.py
└── Train_DAE_Models.py
├── LICENSE
├── MODEL_TRAININGS
├── Create_DeepProfile_Ensemble_Weights.py
├── Create_DeepProfile_Training_Embeddings.py
├── Create_Ensemble_Labels.py
├── Create_PCs_for_DeepLearning_Models.py
├── Example_Run_All.py
├── Get_VAE_IG_Attributions.py
├── IntegratedGradients.py
├── Run_VAE_Models.py
├── Select_Latent_Dimension_with_Gmeans.ipynb
├── VAE_3Layers_Model.py
└── gmeans.py
├── NORMAL_TISSUE_ANALYSIS
├── Create_DeepProfile_GTEX_Embeddings.py
├── Create_Gtex_Rnaseq_PCs.py
├── Encode_GTEX_Data_with_VAE.py
├── Example_Run_All.py
├── Gtex_Tissue_Name_Mappings.ipynb
├── Normal_Tissue_Classifier.py
└── Preprocess_Gtex_Rnaseq_Expressions.py
├── PATHWAY_ANALYSIS
├── Create_Pathway_Matrices.py
├── Fishers_Test.py
├── PATHWAY_COVERAGE_ANALYSIS
│ ├── Plot_of_Average_Pathway_Coverages.ipynb
│ ├── Plot_of_Node_Level_Pathway_Annotations.ipynb
│ ├── Plot_of_Pathway_Coverage_Distributions.ipynb
│ ├── Plot_of_Pathway_Detection_Comparison_VAEs_vs_DeepProfile.ipynb
│ └── Plot_of_Pathway_Percent_Comparison_VAEs_vs_DeepProfile.ipynb
└── Run_Multiple_Fishers_Test.py
├── README.md
└── TCGA_SURVIVAL_PREDICTION
├── COMPARING_RNASEQ_and_MICROARRAY
├── Create_DeepProfile_TCGA_Microarray_Embeddings.py
├── Create_TCGA_Microarray_PCs.py
├── Encode_TCGA_Microarray_Data_with_VAE.py
├── Preprocess_TCGA_Microarray_Expression.py
└── Rnaseq_and_Microarray_Embedding_Correlation_Plots.ipynb
├── CREATE_EMBEDDINGS
├── Create_All_VAE_Embeddings.py
├── Create_DeepProfile_TCGA_Embeddings.py
├── Create_TCGA_Rnaseq_PCs.py
├── Encode_TCGA_Data_with_AE.py
├── Encode_TCGA_Data_with_DAE.py
├── Encode_TCGA_Data_with_ICA.py
├── Encode_TCGA_Data_with_PCA.py
├── Encode_TCGA_Data_with_RP.py
├── Encode_TCGA_Data_with_VAE.py
├── Example_Run_All.py
├── Preprocess_TCGA_Rnaseq_Expression.py
├── Preprocess_TCGA_Rnaseq_Expression_All_Genes.py
└── Preprocess_TCGA_Rnaseq_Expression_All_Genes_Uncorrected.py
├── CREATE_SURVIVAL_DATAFRAMES
├── Create_Joined_Survival_Dataframes.py
├── Create_Joined_Survival_Dataframes_Cancer_Types.py
└── Create_TCGA_Survival_Dataframes.py
└── PREDICT_SURVIVAL
├── Plots_of_Survival_Prediction.ipynb
├── Plots_of_Survival_Prediction_VAEs.ipynb
├── Predict_Survival.py
├── Predict_Survival_Raw_Data.py
├── Predict_Survival_Subtypes_Joined.py
└── Run_Models.py
/COMPETITOR_TRAININGS/AE_2Layers_Model.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #AE model
3 |
4 | #Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
5 | ###############################
6 |
7 | import os
8 | import numpy as np
9 | import pandas as pd
10 | import math
11 | from sklearn.metrics import mean_squared_error
12 | import matplotlib.pyplot as plt
13 | import tensorflow as tf
14 | from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.models import Model
17 | from keras import backend as K
18 | from keras import metrics, optimizers
19 | from keras.callbacks import Callback
20 | import keras
21 | import csv
22 | import sys
23 |
# Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)


# Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the Keras mean squared error between input and reconstruction."""
    return metrics.mse(x_input, x_decoded)


# Command-line arguments: <cancer_type> <fold>
cancer_type = sys.argv[1]
fold = int(sys.argv[2])

# Set hyperparameters
intermediate1_dim = 750
latent_dim = 150

# SET RANDOM SEEDS (both numpy and tensorflow are seeded from the fold index)
from numpy.random import seed
seed(123456 * fold)
from tensorflow import set_random_seed
set_random_seed(123456 * fold)

init_mode = 'glorot_uniform'
batch_size = 100
epochs = 50
learning_rate = 0.0005
dropout = 0.1

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/AE_FILES/'

# Read input file
input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
output_filename = cancer_type + '_DATA_TOP2_INTERSECTION_GENES_encoded_'

input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE", input_df.shape)
print(input_df.head(5))
input_df_training = input_df

# The input width is taken from the loaded data, so it has to be computed
# only after input_df exists (computing it earlier raises NameError).
original_dim = input_df.shape[1]
63 |
# Encoder: input -> Dense -> BatchNorm -> ReLU -> Dropout -> latent Dense
x = Input(shape=(original_dim,))

net = Dense(units=intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation(activation='relu')(net2)

d1 = Dropout(rate=dropout)(net3)
core = Dense(units=latent_dim, kernel_initializer=init_mode)(d1)

# Decoder layers are kept as standalone objects so that the same trained
# layers can be re-applied to a separate latent input after training.
decoder_h = Dense(
    units=intermediate1_dim,
    activation='relu',
    kernel_initializer=init_mode,
)
d2 = Dropout(rate=dropout)
decoder_mean = Dense(units=original_dim, kernel_initializer=init_mode)

# Wire the decoder layers onto the latent representation
h_decoded = decoder_h(core)
h_decoded2 = d2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

# Full autoencoder: input -> reconstruction, trained with Adam on the MSE loss
ae = Model(inputs=x, outputs=x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
ae.compile(loss=reconstruction_loss, optimizer=adam)
ae.summary()
89 |
90 |
# Train model: an autoencoder's target is its own input
training_matrix = np.array(input_df_training)
history = ae.fit(
    training_matrix,
    training_matrix,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    verbose=2,
)

# DEFINE ENCODER: input -> latent representation (reuses the trained layers)
encoder = Model(x, core)

# DEFINE DECODER: a latent-sized input fed through the trained decoder layers
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = d2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)

# Encode the training samples
training_encoded = encoder.predict(input_df_training, batch_size=batch_size)
training_encoded_df = pd.DataFrame(training_encoded, index=input_df_training.index)

# How well does the model reconstruct the input data
training_reconstructed = decoder.predict(np.array(training_encoded_df))
training_reconstructed_df = pd.DataFrame(
    training_reconstructed,
    index=input_df_training.index,
    columns=input_df_training.columns,
)

recons_error = mean_squared_error(
    np.array(input_df_training),
    np.array(training_reconstructed_df),
)
print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))

# Save the encoded training data
encoded_path = output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv"
training_encoded_df.to_csv(encoded_path, sep='\t')

# SAVE ENCODER AND DECODER MODELS: architecture as .json, weights as .h5
from keras.models import model_from_json

for part_tag, part_model in (("_encoder_", encoder), ("_decoder_", decoder)):
    model_path = output_folder + "AE_" + cancer_type + part_tag + str(latent_dim) + "L_" + str(fold)

    model_json = part_model.to_json()
    with open(model_path + ".json", "w") as json_file:
        json_file.write(model_json)

    part_model.save_weights(model_path + ".h5")
    print("Saved model to disk")

# Record training R2: one score per sample, then report the mean
from sklearn.metrics import r2_score

training_r2_vals = np.array(
    [
        r2_score(actual_row, reconstructed_row)
        for actual_row, reconstructed_row in zip(
            input_df_training.values, training_reconstructed_df.values
        )
    ],
    dtype=float,
)

print("TRAINING R2 " + str(np.mean(training_r2_vals)))
152 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Create_ICA_Data.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for training ICA models
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import math
8 | import csv
9 | from sklearn.decomposition import FastICA
10 | import sys
11 |
# The cancer type is the only command-line argument
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/ICA_FILES/'

# Number of independent components to extract
L = 150
print("Number of latent nodes " + str(L))

expression_path = input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'
data_df = pd.read_table(expression_path, sep='\t', index_col=0)
print("Training data ", data_df.shape)

# np.nan_to_num replaces NaN entries before the matrix is handed to FastICA
training_data = np.nan_to_num(data_df.values)

# Fit 10 different ICA models with different random seeds
for run in range(10):
    ica = FastICA(n_components=L, random_state=12345 * run, max_iter=100000)
    print(ica)

    ica.fit(training_data)

    components = ica.components_
    print(components.shape)

    # Save the learned components, one row per input column; files are numbered from fold1
    component_path = output_folder + cancer_type + '_DATA_TOP2_JOINED_ICA_COMPONENTS_' + str(L) + 'L_fold' + str(run + 1) + '.tsv'
    component_df = pd.DataFrame(components.T, index=data_df.columns)
    component_df.to_csv(component_path, sep='\t')
40 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Create_PCA_Data.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for training PCA models
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import PCA
9 | import sys
10 |
# The cancer type is the only command-line argument
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PCA_FILES/'


def createData(cancer_type):
    """Fit a 150-component PCA on a cancer type's expression matrix and save it.

    Reads the batch-corrected expression table from the module-level
    ``input_folder`` and writes the transposed component matrix (indexed by
    the input table's columns) to the module-level ``output_folder``.

    Args:
        cancer_type: Cancer type identifier used to build the file names.
    """
    L = 150
    print("Number of latent nodes " + str(L))

    source_path = input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'
    expression_df = pd.read_table(source_path, sep='\t', index_col=0)
    print("Training data ", expression_df.shape)

    # np.nan_to_num replaces NaN entries before fitting
    expression_matrix = np.nan_to_num(expression_df.values)

    # Fit PCA model
    pca = PCA(n_components=L)
    pca.fit(expression_matrix)
    loadings = pca.components_
    print("PCA components ", loadings.shape)

    # Save the learned components, one row per input column
    target_path = output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_COMPONENTS_' + str(L) + 'L.tsv'
    loadings_df = pd.DataFrame(loadings.T, index=expression_df.columns)
    loadings_df.to_csv(target_path, sep='\t')


createData(cancer_type)
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Create_RP_Data.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for training Random Projection models
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.random_projection import GaussianRandomProjection
9 | import sys
10 |
# The cancer type is the only command-line argument
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/RP_FILES/'


def createData(cancer_type):
    """Fit 10 Gaussian random projections for a cancer type and save each one.

    Reads the batch-corrected expression table from the module-level
    ``input_folder``. For every run a 150-component
    ``GaussianRandomProjection`` is fitted with a run-specific random state,
    and its transposed component matrix (indexed by the input table's
    columns) is written to the module-level ``output_folder``.

    Args:
        cancer_type: Cancer type identifier used to build the file names.
    """
    L = 150
    print("Number of latent nodes " + str(L))

    source_path = input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'
    expression_df = pd.read_table(source_path, sep='\t', index_col=0)
    print("Training data ", expression_df.shape)

    # np.nan_to_num replaces NaN entries before fitting
    expression_matrix = np.nan_to_num(expression_df.values)

    # Fit 10 different RP models with different random seeds
    for run_index in range(10):
        projector = GaussianRandomProjection(n_components=L, random_state=run_index * 12345)
        projector.fit(expression_matrix)

        projection = projector.components_
        print(projection.shape)

        # Save the learned components; output files are numbered from fold1
        target_path = output_folder + cancer_type + '_DATA_TOP2_JOINED_RP_COMPONENTS_fold' + str(run_index + 1) + '.tsv'
        projection_df = pd.DataFrame(projection.T, index=expression_df.columns)
        projection_df.to_csv(target_path, sep='\t')


createData(cancer_type)
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/DAE_2Layers_Model.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Author: Ayse Dincer
3 | #DAE model
4 |
5 | #Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
6 | ###############################
7 |
8 | import os
9 | import numpy as np
10 | import pandas as pd
11 | import math
12 | from sklearn.metrics import mean_squared_error
13 | import matplotlib.pyplot as plt
14 |
15 | import tensorflow as tf
16 | from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
17 | from keras.layers.normalization import BatchNormalization
18 | from keras.models import Model
19 | from keras import backend as K
20 | from keras import metrics, optimizers
21 | from keras.callbacks import Callback
22 | import keras
23 | import csv
24 |
25 | import sys
26 |
# Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)


# Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the Keras mean squared error between target and reconstruction."""
    mse = metrics.mse(x_input, x_decoded)
    return mse


# Command-line arguments: <cancer_type> <fold>
cancer_type = sys.argv[1]
fold = int(sys.argv[2])

# Set hyperparameters
intermediate1_dim = 750
latent_dim = 150

init_mode = 'glorot_uniform'
batch_size = 100
epochs = 50
learning_rate = 0.0005
dropout = 0.1

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/DAE_FILES/'

# SET RANDOM SEEDS (both numpy and tensorflow are seeded from the fold index)
from numpy.random import seed
seed(123456 * fold)
from tensorflow import set_random_seed
set_random_seed(123456 * fold)

# Read input file
input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
output_filename = cancer_type + '_DATA_TOP2_INTERSECTION_GENES_encoded_'

input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE", input_df.shape)
print(input_df.head(5))
input_df_training = input_df

original_dim = input_df.shape[1]

# Define noisy inputs: standard-normal noise added to every entry.
# This draw must stay after the numpy seed call above.
noise = np.random.normal(0, 1, input_df_training.shape)
input_df_noisy = input_df_training.values + noise
73 |
# Encoder: input -> Dense -> BatchNorm -> ReLU -> Dropout -> latent Dense
x = Input(shape=(original_dim,))

net = Dense(units=intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation(activation='relu')(net2)

d1 = Dropout(rate=dropout)(net3)
core = Dense(units=latent_dim, kernel_initializer=init_mode)(d1)

# Decoder layers are kept as standalone objects so that the same trained
# layers can be re-applied to a separate latent input after training.
decoder_h = Dense(
    units=intermediate1_dim,
    activation='relu',
    kernel_initializer=init_mode,
)
d2 = Dropout(rate=dropout)
decoder_mean = Dense(units=original_dim, kernel_initializer=init_mode)

# Wire the decoder layers onto the latent representation
h_decoded = decoder_h(core)
h_decoded2 = d2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

# Full denoising autoencoder, trained with Adam on the MSE loss
dae = Model(inputs=x, outputs=x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
dae.compile(loss=reconstruction_loss, optimizer=adam)
dae.summary()
99 |
100 |
# Train from only training data: noisy samples in, clean samples as target
history = dae.fit(
    np.array(input_df_noisy),
    np.array(input_df_training),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    verbose=2,
)

# DEFINE ENCODER: input -> latent representation (reuses the trained layers)
encoder = Model(x, core)

# DEFINE DECODER: a latent-sized input fed through the trained decoder layers
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = d2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)

# Encode the (clean) training samples
training_encoded = encoder.predict(input_df_training, batch_size=batch_size)
training_encoded_df = pd.DataFrame(training_encoded, index=input_df_training.index)

# How well does the model reconstruct the input data
training_reconstructed = decoder.predict(np.array(training_encoded_df))
training_reconstructed_df = pd.DataFrame(
    training_reconstructed,
    index=input_df_training.index,
    columns=input_df_training.columns,
)

recons_error = mean_squared_error(
    np.array(input_df_training),
    np.array(training_reconstructed_df),
)
print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))

# Save the encoded training data
encoded_path = output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv"
training_encoded_df.to_csv(encoded_path, sep='\t')

# SAVE ENCODER AND DECODER MODELS: architecture as .json, weights as .h5
from keras.models import model_from_json

for part_tag, part_model in (("_encoder_", encoder), ("_decoder_", decoder)):
    model_path = output_folder + "DAE_" + cancer_type + part_tag + str(latent_dim) + "L_" + str(fold)

    model_json = part_model.to_json()
    with open(model_path + ".json", "w") as json_file:
        json_file.write(model_json)

    part_model.save_weights(model_path + ".h5")
    print("Saved model to disk")

# Record training R2: one score per sample, then report the mean
from sklearn.metrics import r2_score

training_r2_vals = np.array(
    [
        r2_score(actual_row, reconstructed_row)
        for actual_row, reconstructed_row in zip(
            input_df_training.values, training_reconstructed_df.values
        )
    ],
    dtype=float,
)

print("TRAINING R2 " + str(np.mean(training_r2_vals)))
162 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Example_Run_All.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Example for training competitor models for a cancer type
3 | ###############################
4 |
# IPython "run -i" commands, executed in order for the BRCA cancer type:
# the three component scripts first, then AE training and attributions,
# then DAE training and attributions.
competitor_commands = (
    u"run -i Create_PCA_Data.py BRCA",
    u"run -i Create_ICA_Data.py BRCA",
    u"run -i Create_RP_Data.py BRCA",
    u"run -i Train_AE_Models.py BRCA",
    u"run -i Get_AE_IG_Attributions.py BRCA 0",
    u"run -i Train_DAE_Models.py BRCA",
    u"run -i Get_DAE_IG_Attributions.py BRCA 0",
)

for command in competitor_commands:
    get_ipython().magic(command)
16 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Get_AE_IG_Attributions.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for running integrated gradients to get gene-level attributions of each node
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 | import math
9 | from sklearn.metrics import mean_squared_error
10 | import tensorflow as tf
11 | from keras.layers import Input, Dense, Lambda, Layer, Activation
12 | from keras.layers.normalization import BatchNormalization
13 | from keras.models import Model
14 | from keras import backend as K
15 | from keras import metrics, optimizers
16 | from keras.callbacks import Callback
17 | import keras
18 | import csv
19 | from keras.models import model_from_json
20 | import sys
21 |
# Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the Keras mean squared error between input and reconstruction."""
    return metrics.mse(x_input, x_decoded)


# Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Read all user inputs: <cancer> <model run index>
cancer = sys.argv[1]
vae_run = int(sys.argv[2])
dimension = 150

# The cancer type is stored in `cancer` in this script; the folders must be
# built from that name (there is no `cancer_type` variable here).
input_folder = '../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer + '/AE_FILES/'

# Load PCA weights
pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col=0)
print("PCA COMPONENTS ", pca_df.shape)
pca_components = pca_df.values

# Report which trained model is being explained
print("MODEL " + str(vae_run))

# Load model: architecture from the .json file, weights from the matching .h5
encoder_path = input_folder + 'AE_FILES/AE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run)
with open(encoder_path + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()

encoder = model_from_json(loaded_model_json)
encoder.load_weights(encoder_path + '.h5')
print("Loaded model from disk")
56 |
# Read input data
input_path = input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
input_df = pd.read_table(input_path, index_col=0)
print("INPUT FILE ", input_df.shape)

# Define hyperparameters
input_df_training = input_df
original_dim = input_df_training.shape[1]
intermediate1_dim = 100
intermediate2_dim = 25
latent_dim = dimension

batch_size = 50
epochs = 50
learning_rate = 0.0005
beta = K.variable(1)
kappa = 0

# Define encoder
# NOTE(review): the layers built here are never attached to `encoder`, which
# was loaded from disk earlier in the script -- confirm whether they are needed.
x = Input(shape=(original_dim,))

net = Dense(units=intermediate1_dim)(x)
net2 = BatchNormalization()(net)
net3 = Activation(activation='relu')(net2)

net4 = Dense(units=intermediate2_dim)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation(activation='relu')(net5)

# Compile the loaded encoder so that it has an optimizer attached
adam = optimizers.Adam(lr=learning_rate)
encoder.compile(loss=reconstruction_loss, optimizer=adam)
encoder.summary()

# Encode training data using the model
training_encoded = encoder.predict(input_df_training, batch_size=batch_size)
print("ENCODED TRAINING DATA ", training_encoded.shape)
92 |
# Measure weights and save absolute value of importance, averaged over samples
from IntegratedGradients import *

ig = integrated_gradients(encoder)

sample_matrix = input_df_training.values
num_genes = pca_components.shape[0]
num_samples = input_df_training.shape[0]
# Transposed PCA components, used to project attributions back from PCA space
pca_components_t = pca_components.T

overall_weights = np.zeros((num_genes, dimension))

for latent in range(dimension):
    print("Node " + str(latent + 1))
    weights = np.zeros((num_genes, num_samples))

    for i, sample_row in enumerate(sample_matrix):
        # Attribution of this latent node to each input feature of the sample
        vals = ig.explain(sample_row, latent)
        new_vals = np.matmul(vals, pca_components_t)
        weights[:, i] = new_vals

    # Take absolute values avg over all samples
    overall_weights[:, latent] = np.mean(np.abs(weights), axis=1)

ig_df = pd.DataFrame(overall_weights, index=pca_df.index)
print("EXPLANATIONS DF ", ig_df.shape)

weights_path = output_folder + cancer + '_DATA_AE_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv'
ig_df.to_csv(weights_path, sep='\t', quoting=csv.QUOTE_NONE)
print(ig_df.shape)
117 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Get_DAE_IG_Attributions.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for running integrated gradients to get gene-level attributions of each node
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 | import math
9 | from sklearn.metrics import mean_squared_error
10 | import tensorflow as tf
11 | from keras.layers import Input, Dense, Lambda, Layer, Activation
12 | from keras.layers.normalization import BatchNormalization
13 | from keras.models import Model
14 | from keras import backend as K
15 | from keras import metrics, optimizers
16 | from keras.callbacks import Callback
17 | import keras
18 | import csv
19 | from keras.models import model_from_json
20 | import sys
21 |
# Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Read all user inputs: <cancer> <model run index>
cancer = sys.argv[1]
vae_run = int(sys.argv[2])
dimension = 150

input_folder = '../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer + '/DAE_FILES/'

# Load PCA weights
pca_path = input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv'
pca_df = pd.read_table(pca_path, index_col=0)
print("PCA COMPONENTS ", pca_df.shape)
pca_components = pca_df.values


# Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return the Keras mean squared error between input and reconstruction."""
    mse = metrics.mse(x_input, x_decoded)
    return mse


# Report which trained model is being explained
print("MODEL " + str(vae_run))

# Load model: architecture from the .json file, weights from the matching .h5
encoder_path = input_folder + 'DAE_FILES/DAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run)
with open(encoder_path + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()

encoder = model_from_json(loaded_model_json)
encoder.load_weights(encoder_path + '.h5')
print("Loaded model from disk")
55 |
# Read input data
input_path = input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
input_df = pd.read_table(input_path, index_col=0)
print("INPUT FILE ", input_df.shape)

# Define hyperparameters
input_df_training = input_df
original_dim = input_df_training.shape[1]
intermediate1_dim = 100
intermediate2_dim = 25
latent_dim = dimension

batch_size = 50
epochs = 50
learning_rate = 0.0005
beta = K.variable(1)
kappa = 0

# Define encoder
# NOTE(review): the layers built here are never attached to `encoder`, which
# was loaded from disk earlier in the script -- confirm whether they are needed.
x = Input(shape=(original_dim,))

net = Dense(units=intermediate1_dim)(x)
net2 = BatchNormalization()(net)
net3 = Activation(activation='relu')(net2)

net4 = Dense(units=intermediate2_dim)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation(activation='relu')(net5)

# Compile the loaded encoder so that it has an optimizer attached
adam = optimizers.Adam(lr=learning_rate)
encoder.compile(loss=reconstruction_loss, optimizer=adam)
encoder.summary()

# Encode training data using the model
training_encoded = encoder.predict(input_df_training, batch_size=batch_size)
print("ENCODED TRAINING DATA ", training_encoded.shape)
91 |
# Measure weights and save absolute value of importance, averaged over samples
from IntegratedGradients import *

ig = integrated_gradients(encoder)

sample_matrix = input_df_training.values
num_genes = pca_components.shape[0]
num_samples = input_df_training.shape[0]
# Transposed PCA components, used to project attributions back from PCA space
pca_components_t = pca_components.T

overall_weights = np.zeros((num_genes, dimension))

for latent in range(dimension):
    print("Node " + str(latent + 1))
    weights = np.zeros((num_genes, num_samples))

    for i, sample_row in enumerate(sample_matrix):
        # Attribution of this latent node to each input feature of the sample
        vals = ig.explain(sample_row, latent)
        new_vals = np.matmul(vals, pca_components_t)
        weights[:, i] = new_vals

    # Take absolute values avg over all samples
    overall_weights[:, latent] = np.mean(np.abs(weights), axis=1)

ig_df = pd.DataFrame(overall_weights, index=pca_df.index)
print("EXPLANATIONS DF ", ig_df.shape)

weights_path = output_folder + cancer + '_DATA_DAE_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv'
ig_df.to_csv(weights_path, sep='\t', quoting=csv.QUOTE_NONE)
print(ig_df.shape)
116 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/IntegratedGradients.py:
--------------------------------------------------------------------------------
1 | ################################################################
2 | # Implemented by Naozumi Hiranuma (hiranumn@uw.edu) #
3 | # #
4 | # Keras-compatible implementation of Integrated Gradients #
5 | # proposed in "Axiomatic Attribution for Deep Networks" #
6 | # (https://arxiv.org/abs/1703.01365). #
7 | # #
8 | # Keywords: Shapley values, interpretable machine learning #
9 | ################################################################
10 |
11 | from __future__ import division, print_function
12 | import numpy as np
13 | from time import sleep
14 | import sys
15 | import keras.backend as K
16 |
17 | from keras.models import Model, Sequential
18 |
19 | '''
20 | Integrated gradients approximates Shapley values by integrating partial
21 | gradients with respect to input features from reference input to the
22 | actual input. The following class implements the paper "Axiomatic Attribution
23 | for Deep Networks".
24 | '''
25 | class integrated_gradients:
26 | # model: Keras model that you wish to explain.
27 | # outchannels: If the model is multi-task, you can specify which outputs you want to explain.
def __init__(self, model, outchannels=None, verbose=1):
    """Build one gradient function per output channel of a Keras model.

    Args:
        model: ``keras.models.Model`` or ``keras.models.Sequential`` to
            explain. Gradients are obtained through ``model.optimizer``,
            so the model needs an optimizer attached (i.e. be compiled).
        outchannels: Indices of the output channels to explain. ``None``
            or an empty sequence selects every channel of the model output.
        verbose: Truthy to print progress messages.

    Raises:
        TypeError: If ``model`` is neither a ``Sequential`` nor a ``Model``.
    """
    # get backend info (either tensorflow or theano)
    self.backend = K.backend()

    # load model supports keras.Model and keras.Sequential
    if isinstance(model, Sequential):
        self.model = model.model
    elif isinstance(model, Model):
        self.model = model
    else:
        # Returning a value from __init__ is itself a TypeError in Python,
        # so raise that type directly with a message naming the problem.
        raise TypeError("Invalid input model")

    # load input tensors
    self.input_tensors = list(self.model.inputs)
    # The learning phase flag is a bool tensor (0 = test, 1 = train)
    # to be passed as input to any Keras function that uses
    # a different behavior at train time and test time.
    self.input_tensors.append(K.learning_phase())

    # If output channels are specified, use them.
    # Otherwise evaluate all outputs.
    self.outchannels = [] if outchannels is None else outchannels
    if len(self.outchannels) == 0:
        if verbose:
            print("Evaluated output channel (0-based index): All")
        if self.backend == "tensorflow":
            self.outchannels = range(self.model.output.shape[1]._value)
        elif self.backend == "theano":
            self.outchannels = range(self.model.output._keras_shape[1])
    else:
        if verbose:
            print("Evaluated output channels (0-based index):")
            print(','.join([str(i) for i in self.outchannels]))

    # Build gradient functions for desired output channels.
    self.get_gradients = {}
    if verbose:
        print("Building gradient functions")

    # Evaluate over all requested channels.
    num_channels = len(self.outchannels)
    for position, c in enumerate(self.outchannels, 1):
        # Get tensor that calculates gradient
        if self.backend == "tensorflow":
            gradients = self.model.optimizer.get_gradients(self.model.output[:, c], self.model.input)
        elif self.backend == "theano":
            gradients = self.model.optimizer.get_gradients(self.model.output[:, c].sum(), self.model.input)

        # Build computational graph that computes the tensors given inputs
        self.get_gradients[c] = K.function(inputs=self.input_tensors, outputs=gradients)

        # This takes a lot of time for a big model with many tasks.
        # So lets print the progress. The percentage is based on how many
        # channels have been processed, not on the channel index, so it is
        # also correct when only a subset of channels was requested.
        if verbose:
            sys.stdout.write('\r')
            sys.stdout.write("Progress: "+str(int(position*1.0/num_channels*1000)*1.0/10)+"%")
            sys.stdout.flush()
    # Done
    if verbose:
        print("\nDone.")
88 |
89 |
90 | '''
91 | Input: sample to explain, channel to explain
92 | Optional inputs:
93 | - reference: reference values (defaulted to 0s).
94 | - num_steps: # steps from reference values to the actual sample (defaulted to 50).
95 | Output: attribution array for the sample (a list of arrays, one per input, when the sample is a list).
96 | '''
def explain(self, sample, outc=0, reference=False, num_steps=50, verbose=0):
    """Compute integrated-gradients attributions of one output channel.

    Args:
        sample: a numpy array (single-input model) or a list of numpy
            arrays, one per model input.
        outc: output channel to explain; must be one of ``self.outchannels``.
        reference: baseline with the same structure as ``sample``.
            ``False`` (the default) selects an all-zeros baseline.
        num_steps: number of interpolation points between the baseline and
            the sample.
        verbose: when truthy, print which channel is being explained.

    Returns:
        A list of attribution arrays (one per input) when ``sample`` is a
        list, otherwise a single attribution array.

    Raises:
        TypeError: if ``sample`` is neither a list nor a numpy array.
        AssertionError: if ``outc`` is not in ``self.outchannels`` or the
            reference does not match the sample.
        RuntimeError: if the active Keras backend is neither tensorflow
            nor theano.
    """
    # One entry per input stream.
    samples = []
    step_sizes = []

    # Identity test instead of ``reference != False``: comparing a numpy
    # array with ``!=`` yields an array whose truth value is ambiguous.
    has_reference = reference is not False

    if isinstance(sample, list):
        # Multiple inputs: reference and sample must have one entry per input.
        if has_reference:
            assert len(sample) == len(reference)
        for i, stream in enumerate(sample):
            stream_reference = reference[i] if has_reference else False
            _output = integrated_gradients.linearly_interpolate(stream, stream_reference, num_steps)
            samples.append(_output[0])
            step_sizes.append(_output[2])
    elif isinstance(sample, np.ndarray):
        # Single input given as one numpy array.
        _output = integrated_gradients.linearly_interpolate(sample, reference, num_steps)
        samples.append(_output[0])
        step_sizes.append(_output[2])
    else:
        # Fail early instead of feeding an empty input list to the backend.
        raise TypeError("sample must be a numpy array or a list of numpy arrays")

    # Desired channel must be in the list of output channels.
    assert outc in self.outchannels
    # ``outc`` is a channel value (the key of self.get_gradients), not a
    # position in self.outchannels, so it is printed directly.
    if verbose: print("Explaning the "+str(outc)+"th output.")

    # Interpolated inputs followed by the learning-phase flag (0 = test).
    _input = list(samples)
    _input.append(0)

    backend = K.backend()
    if backend == "tensorflow":
        gradients = self.get_gradients[outc](_input)
    elif backend == "theano":
        gradients = self.get_gradients[outc](_input)
        if len(self.model.inputs) == 1:
            gradients = [gradients]
    else:
        raise RuntimeError("Unsupported Keras backend: " + str(backend))

    # Sum the gradients along the interpolation path and scale by the step.
    explanation = []
    for i in range(len(gradients)):
        _temp = np.sum(gradients[i], axis=0)
        explanation.append(np.multiply(_temp, step_sizes[i]))

    # Format the return value according to the input sample.
    if isinstance(sample, list):
        return explanation
    return explanation[0]
153 |
154 |
155 | '''
156 | Input: numpy array of a sample
157 | Optional inputs:
158 | - reference: reference values (defaulted to 0s).
159 | - steps: # steps from reference values to the actual sample.
160 | Output: list of numpy arrays to integrate over.
161 | '''
162 | @staticmethod
163 | def linearly_interpolate(sample, reference=False, num_steps=50):
164 | # Use default reference values if reference is not specified
165 | if reference is False: reference = np.zeros(sample.shape);
166 |
167 | # Reference and sample shape needs to match exactly
168 | assert sample.shape == reference.shape
169 |
170 | # Calcuated stepwise difference from reference to the actual sample.
171 | ret = np.zeros(tuple([num_steps] +[i for i in sample.shape]))
172 | for s in range(num_steps):
173 | ret[s] = reference+(sample-reference)*(s*1.0/num_steps)
174 |
175 | return ret, num_steps, (sample-reference)*(1.0/num_steps)
176 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Train_AE_Models.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for training AE models
3 | ###############################
4 | import sys
# The cancer type is the only command-line argument.
cancer_type = sys.argv[1]

# Launch AE_2Layers_Model.py ten times, passing the cancer type and the run
# index (0-9) as its arguments.
for run in range(10):
    get_ipython().magic(u"run -i 'AE_2Layers_Model.py' " + " ".join((cancer_type, str(run))))
9 |
--------------------------------------------------------------------------------
/COMPETITOR_TRAININGS/Train_DAE_Models.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for training DAE models
3 | ###############################
4 | import sys
# The cancer type is the only command-line argument.
cancer_type = sys.argv[1]

# Launch DAE_2Layers_Model.py ten times, passing the cancer type and the run
# index (0-9) as its arguments.
for run in range(10):
    get_ipython().magic(u"run -i 'DAE_2Layers_Model.py' " + " ".join((cancer_type, str(run))))
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Lee Lab @ UW Allen School
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_DeepProfile_Ensemble_Weights.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #This script is for creating gene attribution matrices for DeepProfile
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | import sys
9 |
# The cancer type is the only command-line argument.
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'

# Ensemble size, latent sizes of the individual VAE models and runs per size.
L = 150
dims = [5, 10, 25, 50, 75, 100]
run_count = 100

# Common prefix of every per-model gene attribution file.
weights_prefix = input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_'

# Read one attribution file up front; its row count sizes the buffers below.
data_df = pd.read_table(weights_prefix + str(100) + 'L_fold' + str(1) + '.tsv', index_col = 0)
print(data_df.shape)
basic_length = data_df.shape[0]

# For every latent size, stack the transposed attributions of all runs so
# that each latent node becomes one row.
weight_list = []
for dim in dims:
    VAE_weights = np.zeros((run_count * dim, basic_length))
    for i in range(run_count):
        data_df = pd.read_table(weights_prefix + str(dim) + 'L_fold' + str(i) + '.tsv', index_col = 0).T
        VAE_weights[dim * i:dim * (i + 1), :] = data_df.values
    weight_list.append(VAE_weights)

# Read the ensemble labels
labels = pd.read_table(input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None).values
print("Ensemble labels ", len(labels))

# Concatenate all the gene attributions
joined_weights = np.concatenate(weight_list)
print("Joined weights ", joined_weights.shape)

# Each ensemble node is the mean attribution of the rows carrying its label.
ensemble_weights = np.zeros((L, joined_weights.shape[1]))
for label in range(L):
    member_rows = np.where(labels == label)[0]
    ensemble_weights[label, :] = np.mean(joined_weights[member_rows, :], axis = 0)

print("Ensemble weights ", ensemble_weights.shape)

# Record the ensemble weights; the columns come from the last file read above.
ensemble_weights_df = pd.DataFrame(ensemble_weights, index = np.arange(L), columns = data_df.columns)
ensemble_weights_df.to_csv(output_folder + cancer_type + '_DeepProfile_Ensemble_Gene_Importance_Weights_' + str(L) + 'L.tsv', sep = '\t')
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_DeepProfile_Training_Embeddings.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #This script is for creating training embeddings
3 | ###############################
4 |
5 | import pandas as pd
6 | import numpy as np
7 | import csv
8 | import sys
9 |
# The cancer type is the only command-line argument.
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'

# Collect the training embedding of every VAE model (all latent sizes, all runs).
dims = [5, 10, 25, 50, 75, 100]
run = 100
embedding_prefix = input_folder + 'VAE_FILES/' + cancer_type + '_DATA_TOP2_JOINED_encoded_'
data_list = []
for dim in dims:
    for i in range(run):
        data_df = pd.read_table(embedding_prefix + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0)
        print(data_df.shape)
        data_list.append(data_df.values)

# Put the embeddings side by side: one column per latent node.
joined_data = np.concatenate(data_list, axis=1)
print("Joined training embeddings" , joined_data.shape)

# Read the ensemble labels
L = 150
labels = pd.read_table(input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt', header= None).values
print("Ensemble labels ", len(labels))

# Each ensemble dimension is the mean of the columns carrying its label.
ensemble_embeddings = np.zeros((joined_data.shape[0], L))
for label in range(L):
    member_columns = np.where(labels == label)[0]
    ensemble_embeddings[:, label] = np.mean(joined_data[:, member_columns], axis = 1)

print("Training ensemble embedding ", ensemble_embeddings.shape)

# Save the training embedding; the row index comes from the last file read above.
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L))
ensemble_embeddings_df.to_csv(output_folder + cancer_type + '_DeepProfile_Training_Embedding_' + str(L) + 'L.tsv', sep = '\t')
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_Ensemble_Labels.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #This script is for learning ensemble labels for VAE models
3 | ###############################
4 |
5 | import pandas as pd
6 | import numpy as np
7 | import csv
8 | import sys
9 | from sklearn.cluster import KMeans
10 |
# Command-line arguments: cancer type and the number of ensemble clusters.
cancer_type = sys.argv[1]
final_dim = int(sys.argv[2])
print("FINAL DIM " + str(final_dim))

# Folder holding every file of this cancer type.
cancer_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'

# Collect the training embedding of every VAE model (all latent sizes, all runs).
dims = [5, 10, 25, 50, 75, 100]
run = 100
data_list = []
for dim in dims:
    for i in range(run):
        print(i)
        data_df = pd.read_table(cancer_folder + 'VAE_FILES/' + cancer_type + '_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0)
        print(data_df.shape)
        data_list.append(data_df.values)

# Put the embeddings side by side: one column per latent node.
joined_df = np.concatenate(data_list, axis=1)
print("Joined training embeddings" , joined_df.shape)

# Cluster the latent nodes (the columns, hence the transpose) with k-means.
X = joined_df
kmeans = KMeans(n_clusters= final_dim, random_state=123)
kmeans = kmeans.fit(X.transpose())
print("K-means labels ", kmeans.labels_)

# Save one cluster label per latent node.
np.savetxt(cancer_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(final_dim) + 'L.txt' , kmeans.labels_, delimiter=',')
39 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Create_PCs_for_DeepLearning_Models.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #This script is for PCA transforming the input data to pass to deep learning models
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import PCA
9 | import sys
10 |
# Command-line arguments: cancer type, then the number of principal
# components to keep.
cancer_type = sys.argv[1]
component_count = int(sys.argv[2])

# Input and output files live in the same per-cancer folder.
input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
18 |
19 | #Method for creating PCs
def createPCs(cancer_type):
    """Fit PCA on the training data of ``cancer_type`` and save the results.

    Reads the training matrix from the module-level ``input_folder``, fits a
    PCA with the module-level ``component_count`` components, and writes two
    TSV files to ``output_folder``: the learned components (indexed by the
    input columns) and the PCA-encoded data (indexed by the input rows).
    """
    print("************************* " + cancer_type)

    # Load the training matrix and replace NaNs before fitting.
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)
    training_data = np.nan_to_num(data_df.values)

    # Learn the top principal components.
    pca = PCA(n_components = component_count)
    pca.fit(training_data)
    components = pca.components_
    print("PCA components ", components.shape)

    # Both output files share this prefix.
    output_prefix = output_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_' + str(component_count)

    # Save the components, transposed so that rows follow the input columns.
    pd.DataFrame(components.T, index = data_df.columns).to_csv(output_prefix + 'L_COMPONENTS.tsv', sep = '\t')

    # Project the training data onto the components and save it.
    encoded_data = pca.transform(training_data)
    print("PCA encoded data ", encoded_data.shape)
    pd.DataFrame(encoded_data, index = data_df.index).to_csv(output_prefix + 'L.tsv', sep = '\t')
45 |
46 | createPCs(cancer_type)
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Example_Run_All.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Example for training model for a cancer type
3 | ###############################
4 | import sys
5 |
##STEP 1: Creating PCs
get_ipython().magic(u"run -i Create_PCs_for_DeepLearning_Models.py BRCA 1000")

##STEP 2: Training VAE models (runs 0-99 for every latent size)
for latent_dim in (5, 10, 25, 50, 75, 100):
    get_ipython().magic(u"run -i Run_VAE_Models.py BRCA " + str(latent_dim) + " 0 100")

##STEP 3: Running IG for VAE models (same latent sizes and run range)
for latent_dim in (5, 10, 25, 50, 75, 100):
    get_ipython().magic(u"run -i Get_VAE_IG_Attributions.py BRCA " + str(latent_dim) + " 0 100")

##STEP 4: Learning ensemble labels
get_ipython().magic(u"run -i Create_Ensemble_Labels.py BRCA 150")

##STEP 5: Creating DeepProfile ensemble training embedding
get_ipython().magic(u"run -i Create_DeepProfile_Training_Embeddings.py BRCA")

##STEP 6: Creating DeepProfile ensemble gene attribution matrices
get_ipython().magic(u"run -i Create_DeepProfile_Ensemble_Weights.py BRCA")
33 |
34 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Get_VAE_IG_Attributions.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for running integrated gradients to get gene-level attributions of each node
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 | import math
9 | from sklearn.metrics import mean_squared_error
10 | import tensorflow as tf
11 | from keras.layers import Input, Dense, Lambda, Layer, Activation
12 | from keras.layers.normalization import BatchNormalization
13 | from keras.models import Model
14 | from keras import backend as K
15 | from keras import metrics, optimizers
16 | from keras.callbacks import Callback
17 | import keras
18 | import csv
19 | from keras.models import model_from_json
20 | import sys
21 |
# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving all of it up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Command-line arguments: cancer type, latent dimension, and the half-open
# range [start, end) of model runs to process.
cancer = sys.argv[1]
dimension = int(sys.argv[2])
start = int(sys.argv[3])
end = int(sys.argv[4])

for tag, value in (("CANCER ", cancer), ("DIM ", dimension), ("START ", start), ("END ", end)):
    print(tag + str(value))

input_folder = '../ALL_CANCER_FILES/' + cancer + '/'
output_folder = input_folder + 'VAE_WEIGHTS/'

# PCA components, used later to map attributions from PC space back to the
# rows of this table.
pca_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L_COMPONENTS.tsv', index_col = 0)
print("PCA COMPONENTS ", pca_df.shape)
pca_components = pca_df.values

# PCA-encoded input data that is fed to the encoders.
input_df = pd.read_table(input_folder + cancer + '_DATA_TOP2_JOINED_PCA_1000L.tsv', index_col=0)
print("INPUT FILE ", input_df.shape)
49 |
50 | #VAE loss definition
def vae_loss(x_input, x_decoded):
    """Return the VAE loss: scaled reconstruction MSE plus beta-weighted KL term.

    Relies on the module-level names ``original_dim``, ``z_mean``,
    ``z_log_var`` and ``beta`` being defined when the loss is built.
    """
    # Reconstruction error, scaled by the input dimensionality.
    reconstruction_term = original_dim * metrics.mse(x_input, x_decoded)
    # KL term computed from the latent mean and log-variance tensors.
    kl_term = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    # beta is read once, as a plain value, when the loss is constructed.
    kl_weight = K.get_value(beta)
    return K.mean(reconstruction_term + (kl_weight * kl_term))
55 |
#Save the weight for each run
# For every model run in [start, end): load the saved encoder, compute
# integrated-gradients attributions for each latent node and sample, map them
# through the PCA components, and write the per-node mean absolute attribution
# to a TSV file.
for vae_run in range(start, end):

    print("MODEL " + str(vae_run))

    #Load model
    # The encoder architecture is stored as JSON, its weights in a matching .h5 file.
    json_file = open(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    encoder = model_from_json(loaded_model_json)

    #Load weights
    encoder.load_weights(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(vae_run) + '.h5')
    print("Loaded model from disk")

    #Define hyperparameters
    # epochs and kappa are assigned here but not used anywhere in this loop.
    input_df_training = input_df
    original_dim = input_df_training.shape[1]
    intermediate1_dim = 100
    intermediate2_dim = 25
    latent_dim = dimension

    batch_size = 50
    epochs = 50
    learning_rate = 0.0005
    beta = K.variable(1)
    kappa = 0

    #Encoder network
    # These layers are built on a fresh Input and are not connected to the
    # loaded `encoder`; they define the globals z_mean / z_log_var that
    # vae_loss refers to.
    x = Input(shape=(original_dim, ))

    net = Dense(intermediate1_dim)(x)
    net2 = BatchNormalization()(net)
    net3 = Activation('relu')(net2)

    net4 = Dense(intermediate2_dim)(net3)
    net5 = BatchNormalization()(net4)
    net6 = Activation('relu')(net5)

    z_mean = Dense(latent_dim)(net6)
    z_log_var = Dense(latent_dim)(net6)

    # Compiling attaches an optimizer to the loaded encoder; no training
    # happens in this script.
    # NOTE(review): presumably needed because integrated_gradients obtains
    # gradients through model.optimizer -- confirm against IntegratedGradients.py.
    adam = optimizers.Adam(lr=learning_rate)
    encoder.compile(optimizer=adam, loss = vae_loss)
    encoder.summary()

    #Encode training data using the model
    training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
    print("ENCODED TRAINING DATA ", training_encoded.shape)


    #Measure weights and save absolute value of importance, averaged over samples
    from IntegratedGradients import *

    ig = integrated_gradients(encoder)

    # One row per row of the PCA component table, one column per latent node.
    overall_weights = np.zeros((pca_components.shape[0], dimension))

    #Go over each node
    for latent in range(dimension):
        print("Node " + str(latent + 1))
        # Per-sample attributions for this node, one column per sample.
        weights = np.zeros((pca_components.shape[0], input_df_training.shape[0]))

        #Go over each sample
        for i in range(input_df_training.shape[0]):
            #print("Sample " + str(i + 1))
            # Attribution of output `latent` with respect to the PCA-space input ...
            vals = ig.explain(input_df_training.values[i, :], latent)
            # ... mapped back through the PCA components.
            new_vals = np.matmul(vals, pca_components.T)
            weights[:, i] = new_vals

        #Take absolute values avg over all samples
        overall_weights[:, latent] = np.mean(np.abs(weights), axis = 1)

    ig_df = pd.DataFrame(overall_weights, index = pca_df.index)
    print("EXPLANATIONS DF ", ig_df.shape)

    ig_df.to_csv(output_folder + cancer + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(vae_run) + '.tsv', sep='\t', quoting = csv.QUOTE_NONE)
133 |
134 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/IntegratedGradients.py:
--------------------------------------------------------------------------------
1 | ################################################################
2 | # Implemented by Naozumi Hiranuma (hiranumn@uw.edu) #
3 | # #
4 | # Keras-compatible implmentation of Integrated Gradients #
5 | # proposed in "Axiomatic attribution for deep neuron networks" #
6 | # (https://arxiv.org/abs/1703.01365). #
7 | # #
8 | # Keywords: Shapley values, interpretable machine learning #
9 | ################################################################
10 |
11 | from __future__ import division, print_function
12 | import numpy as np
13 | from time import sleep
14 | import sys
15 | import keras.backend as K
16 |
17 | from keras.models import Model, Sequential
18 |
19 | '''
20 | Integrated gradients approximates Shapley values by integrating partial
21 | gradients with respect to input features from reference input to the
22 | actual input. The following class implements the paper "Axiomatic attribution
23 | for deep neuron networks".
24 | '''
class integrated_gradients:
    """Integrated-gradients attribution for Keras models.

    Attributions are obtained by summing the gradients of one output channel
    along a straight path from a reference input to the actual input
    ("Axiomatic Attribution for Deep Networks",
    https://arxiv.org/abs/1703.01365).
    """

    def __init__(self, model, outchannels=[], verbose=1):
        """Build one gradient function per requested output channel.

        Args:
            model: Keras ``Model`` or ``Sequential`` to explain. Gradients are
                requested through ``model.optimizer``, so the model must
                already have an optimizer attached.
            outchannels: output channels to explain; when empty, every
                channel of the model output is used. The default list is
                never mutated.
            verbose: when truthy, print progress information.

        Raises:
            TypeError: if ``model`` is neither a ``Sequential`` nor a ``Model``.
            RuntimeError: if the backend is neither tensorflow nor theano
                and at least one channel has to be built.
        """
        # Backend info (either tensorflow or theano).
        self.backend = K.backend()

        # Supports keras.Model and keras.Sequential.
        if isinstance(model, Sequential):
            self.model = model.model
        elif isinstance(model, Model):
            self.model = model
        else:
            # __init__ must not return a value: the former ``return -1`` made
            # Python raise an opaque TypeError, so raise a clear one instead.
            raise TypeError("Invalid input model")

        # Input tensors of the model, followed by the learning-phase flag.
        self.input_tensors = list(self.model.inputs)
        # The learning phase flag is a bool tensor (0 = test, 1 = train)
        # to be passed as input to any Keras function that uses
        # a different behavior at train time and test time.
        self.input_tensors.append(K.learning_phase())

        # If output channels are specified, use them.
        # Otherwise evaluate all outputs.
        self.outchannels = outchannels
        if len(self.outchannels) == 0:
            if verbose: print("Evaluated output channel (0-based index): All")
            if self.backend == "tensorflow":
                self.outchannels = range(self.model.output.shape[1]._value)
            elif self.backend == "theano":
                self.outchannels = range(self.model.output._keras_shape[1])
        else:
            if verbose:
                print("Evaluated output channels (0-based index):")
                print(','.join([str(i) for i in self.outchannels]))

        # Build gradient functions for the desired output channels,
        # keyed by channel value.
        self.get_gradients = {}
        if verbose: print("Building gradient functions")

        total = len(self.outchannels)
        for done, c in enumerate(self.outchannels, start=1):
            # Tensor that calculates the gradient of channel c w.r.t. the input.
            if self.backend == "tensorflow":
                gradients = self.model.optimizer.get_gradients(self.model.output[:, c], self.model.input)
            elif self.backend == "theano":
                gradients = self.model.optimizer.get_gradients(self.model.output[:, c].sum(), self.model.input)
            else:
                raise RuntimeError("Unsupported Keras backend: " + str(self.backend))

            # Build computational graph that computes the tensors given inputs.
            self.get_gradients[c] = K.function(inputs=self.input_tensors, outputs=gradients)

            # This takes a lot of time for a big model with many tasks,
            # so print the progress. It is based on the number of channels
            # built so far, not on the channel value.
            if verbose:
                sys.stdout.write('\r')
                sys.stdout.write("Progress: "+str(int(done*1.0/total*1000)*1.0/10)+"%")
                sys.stdout.flush()
        # Done
        if verbose: print("\nDone.")

    def explain(self, sample, outc=0, reference=False, num_steps=50, verbose=0):
        """Compute integrated-gradients attributions of one output channel.

        Args:
            sample: a numpy array (single-input model) or a list of numpy
                arrays, one per model input.
            outc: output channel to explain; must be one of
                ``self.outchannels``.
            reference: baseline with the same structure as ``sample``.
                ``False`` (the default) selects an all-zeros baseline.
            num_steps: number of interpolation points between the baseline
                and the sample (defaults to 50).
            verbose: when truthy, print which channel is being explained.

        Returns:
            A list of attribution arrays (one per input) when ``sample`` is
            a list, otherwise a single attribution array.

        Raises:
            TypeError: if ``sample`` is neither a list nor a numpy array.
            AssertionError: if ``outc`` is not in ``self.outchannels`` or the
                reference does not match the sample.
        """
        # One entry per input stream.
        samples = []
        step_sizes = []

        # Identity test instead of ``reference != False``: comparing a numpy
        # array with ``!=`` yields an array whose truth value is ambiguous.
        has_reference = reference is not False

        if isinstance(sample, list):
            # Multiple inputs: reference and sample need one entry per input.
            if has_reference:
                assert len(sample) == len(reference)
            for i, stream in enumerate(sample):
                stream_reference = reference[i] if has_reference else False
                _output = integrated_gradients.linearly_interpolate(stream, stream_reference, num_steps)
                samples.append(_output[0])
                step_sizes.append(_output[2])
        elif isinstance(sample, np.ndarray):
            # Single input given as one numpy array.
            _output = integrated_gradients.linearly_interpolate(sample, reference, num_steps)
            samples.append(_output[0])
            step_sizes.append(_output[2])
        else:
            # Fail early instead of feeding an empty input list to the backend.
            raise TypeError("sample must be a numpy array or a list of numpy arrays")

        # Desired channel must be in the list of output channels.
        assert outc in self.outchannels
        # ``outc`` is a channel value (the key of self.get_gradients), not a
        # position in self.outchannels, so it is printed directly.
        if verbose: print("Explaning the "+str(outc)+"th output.")

        # Interpolated inputs followed by the learning-phase flag (0 = test).
        _input = list(samples)
        _input.append(0)

        gradients = self.get_gradients[outc](_input)
        # With theano and a single model input the result is wrapped in a
        # list so that it can be handled like the multi-input case.
        if self.backend == "theano" and len(self.model.inputs) == 1:
            gradients = [gradients]

        # Sum the gradients along the path and scale by the step size.
        explanation = []
        for i in range(len(gradients)):
            _temp = np.sum(gradients[i], axis=0)
            explanation.append(np.multiply(_temp, step_sizes[i]))

        # Format the return value according to the input sample.
        if isinstance(sample, list):
            return explanation
        return explanation[0]

    @staticmethod
    def linearly_interpolate(sample, reference=False, num_steps=50):
        """Build the straight-line path from ``reference`` to ``sample``.

        Args:
            sample: numpy array to explain.
            reference: baseline of the same shape; ``False`` means all zeros.
            num_steps: number of points on the path.

        Returns:
            A tuple ``(path, num_steps, step)`` where ``path[s]`` equals
            ``reference + (sample - reference) * s / num_steps`` for
            ``s = 0 .. num_steps - 1`` and ``step`` is
            ``(sample - reference) / num_steps``.
        """
        # Use default reference values if reference is not specified.
        if reference is False:
            reference = np.zeros(sample.shape)

        # Reference and sample shape need to match exactly.
        assert sample.shape == reference.shape

        # Stepwise interpolation from the reference to the actual sample.
        ret = np.zeros((num_steps,) + tuple(sample.shape))
        for s in range(num_steps):
            ret[s] = reference+(sample-reference)*(s*1.0/num_steps)

        return ret, num_steps, (sample-reference)*(1.0/num_steps)
176 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Run_VAE_Models.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for training VAE models
3 | ###############################
4 | import sys
5 |
# Command-line arguments: cancer type, latent dimension, and the half-open
# range [start, end) of run indices to train.
cancer_type = sys.argv[1]
latent = int(sys.argv[2])
start = int(sys.argv[3])
end = int(sys.argv[4])

# Sizes of the two hidden layers for every supported latent dimension.
hidden_layer_dims = {
    5: (100, 25),
    10: (250, 50),
    25: (250, 100),
    50: (250, 100),
    75: (250, 100),
    100: (250, 100),
}

# Previously an unsupported latent size left dim1/dim2 undefined and only
# failed with a NameError inside the loop; report it clearly up front.
if latent not in hidden_layer_dims:
    raise ValueError("Unsupported latent dimension " + str(latent) + "; expected one of " + str(sorted(hidden_layer_dims)))
dim1, dim2 = hidden_layer_dims[latent]

# Train one VAE per run index. The variable names used to build the command
# are kept unchanged because `run -i` executes the model script in the
# interactive namespace.
for run in range(start, end):
    get_ipython().magic(u"run -i 'VAE_3Layers_Model.py' '" + cancer_type + "' " + str(dim1) + " " + str(dim2) + " " + str(latent) + " " + str(run))
32 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/Select_Latent_Dimension_with_Gmeans.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "###############################\n",
10 | "#g-means training to select ensemble latent dimension size\n",
11 | "\n",
12 | "###############################\n",
13 | "\n",
14 | "import numpy as np\n",
15 | "import pandas as pd\n",
16 | "import csv\n",
17 | "from sklearn.decomposition import PCA\n",
18 | "import sklearn.preprocessing\n",
19 | "\n",
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "import csv\n",
23 | "import sys"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stdout",
33 | "output_type": "stream",
34 | "text": [
35 | "************************* BRCA\n",
36 | "Joined_df (11963, 26500)\n",
37 | "(26500, 11963)\n"
38 | ]
39 | },
40 | {
41 | "name": "stderr",
42 | "output_type": "stream",
43 | "text": [
44 | "/homes/gws/abdincer/.local/lib/python3.6/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
45 | " import pandas.util.testing as tm\n"
46 | ]
47 | },
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Selected dimension 228\n",
53 | "************************* COLON\n",
54 | "Joined_df (5616, 26500)\n",
55 | "(26500, 5616)\n",
56 | "Selected dimension 195\n",
57 | "************************* LUNG\n",
58 | "Joined_df (4869, 26500)\n",
59 | "(26500, 4869)\n",
60 | "Selected dimension 166\n",
61 | "************************* AML\n",
62 | "Joined_df (6534, 26500)\n",
63 | "(26500, 6534)\n",
64 | "Selected dimension 57\n",
65 | "************************* BRAIN\n",
66 | "Joined_df (4282, 26500)\n",
67 | "(26500, 4282)\n",
68 | "Selected dimension 192\n",
69 | "************************* SKIN\n",
70 | "Joined_df (1240, 26500)\n",
71 | "(26500, 1240)\n",
72 | "Selected dimension 165\n",
73 | "************************* SARCOMA\n",
74 | "Joined_df (2330, 26500)\n",
75 | "(26500, 2330)\n",
76 | "Selected dimension 162\n",
77 | "************************* LIVER\n",
78 | "Joined_df (1937, 26500)\n",
79 | "(26500, 1937)\n",
80 | "Selected dimension 168\n",
81 | "************************* KIDNEY\n",
82 | "Joined_df (2293, 26500)\n",
83 | "(26500, 2293)\n",
84 | "Selected dimension 123\n",
85 | "************************* OV\n",
86 | "Joined_df (2714, 26500)\n",
87 | "(26500, 2714)\n",
88 | "Selected dimension 178\n",
89 | "************************* PROSTATE\n",
90 | "Joined_df (1195, 26500)\n",
91 | "(26500, 1195)\n",
92 | "Selected dimension 163\n",
93 | "************************* CERVICAL\n",
94 | "Joined_df (443, 26500)\n",
95 | "(26500, 443)\n",
96 | "Selected dimension 142\n",
97 | "************************* BLADDER\n",
98 | "Joined_df (371, 26500)\n",
99 | "(26500, 371)\n",
100 | "Selected dimension 136\n",
101 | "************************* STOMACH\n",
102 | "Joined_df (1742, 26500)\n",
103 | "(26500, 1742)\n",
104 | "Selected dimension 137\n",
105 | "************************* THYROID\n",
106 | "Joined_df (776, 26500)\n",
107 | "(26500, 776)\n",
108 | "Selected dimension 160\n",
109 | "************************* UTERINE\n",
110 | "Joined_df (661, 26500)\n",
111 | "(26500, 661)\n",
112 | "Selected dimension 156\n",
113 | "************************* HEAD_NECK\n",
114 | "Joined_df (643, 26500)\n",
115 | "(26500, 643)\n",
116 | "Selected dimension 156\n",
117 | "************************* PANCREAS\n",
118 | "Joined_df (602, 26500)\n",
119 | "(26500, 602)\n",
120 | "Selected dimension 145\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "cancer_types = ['BRCA', 'COLON', 'LUNG', 'AML',\n",
126 | " 'BRAIN', 'SKIN', 'SARCOMA', 'LIVER', \n",
127 | " 'KIDNEY', 'OV','PROSTATE', 'CERVICAL', \n",
128 | " 'BLADDER', 'STOMACH', 'THYROID', 'UTERINE', \n",
129 | " 'HEAD_NECK', 'PANCREAS']\n",
130 | " \n",
131 | "L_values = []\n",
132 | "for cancer_type in cancer_types:\n",
133 | " print(\"************************* \" + cancer_type)\n",
134 | " input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/' \n",
135 | "\n",
136 | " #all encodings for one dimension\n",
137 | " dims = [5, 10, 25, 50, 75, 100]\n",
138 | "\n",
139 | " data_list = []\n",
140 | "\n",
141 | " for dim in dims:\n",
142 | " run = 100\n",
143 | " for i in range(run):\n",
144 | " #print(i + 1)\n",
145 | " data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_encoded_' + str(dim) + 'L_TRAINING_fold' + str(i) + '.tsv', index_col = 0) \n",
146 | " #print(data_df.shape)\n",
147 | " data_list.append(data_df.values)\n",
148 | "\n",
149 | "\n",
150 | " joined_df = np.concatenate(data_list, axis=1)\n",
151 | " print(\"Joined_df \", joined_df.shape)\n",
152 | "\n",
153 | " #Apply kmeans clustering to this data\n",
154 | " from sklearn.cluster import KMeans\n",
155 | " import numpy as np\n",
156 | " X = joined_df.T\n",
157 | " print(X.shape)\n",
158 | " \n",
159 | " from gmeans import *\n",
160 | " gmeans = GMeans(strictness=3, random_state = 12345)\n",
161 | " gmeans.fit(X)\n",
162 | " gmeans.labels_\n",
163 | " selected_L = len(np.unique(gmeans.labels_))\n",
164 | " print(\"Selected dimension \", selected_L)\n",
165 | " \n",
166 | " L_values.append(selected_L)\n",
167 | "\n"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 3,
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "[228, 195, 166, 57, 192, 165, 162, 168, 123, 178, 163, 142, 136, 137, 160, 156, 156, 145]\n",
180 | "157.16666666666666\n"
181 | ]
182 | }
183 | ],
184 | "source": [
185 | "print(L_values)\n",
186 | "print(np.mean(L_values))"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": []
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "Python 3",
200 | "language": "python",
201 | "name": "python3"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.6.8"
214 | }
215 | },
216 | "nbformat": 4,
217 | "nbformat_minor": 4
218 | }
219 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/VAE_3Layers_Model.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #VAE model
3 |
4 | #Code is modified from https://github.com/keras-team/keras/blob/master/examples/variational_autoencoder.py
5 | ###############################
6 |
7 | import os
8 | import numpy as np
9 | import pandas as pd
10 | import math
11 | from sklearn.metrics import mean_squared_error
12 | import matplotlib.pyplot as plt
13 | import tensorflow as tf
14 | from keras.layers import Input, Dense, Lambda, Layer, Activation, Dropout
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.models import Model
17 | from keras import backend as K
18 | from keras import metrics, optimizers
19 | from keras.callbacks import Callback
20 | import keras
21 | import csv
22 | import sys
23 |
#Prevent tensorflow from using all the memory
# With allow_growth enabled the session requests GPU memory on demand
# instead of reserving it all at start-up.
# NOTE(review): tf.ConfigProto / tf.Session are TensorFlow 1.x APIs, and the
# session created here is not passed to Keras anywhere in this script --
# confirm whether K.set_session(sess) was intended.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
28 |
29 |
30 | # Method for reparameterization trick to make model differentiable
def sampling(args):
    """Draw a latent sample with the reparameterization trick.

    ``args`` is the ``[z_mean, z_log_var]`` pair supplied by the Keras
    ``Lambda`` layer. The returned tensor is ``mean + std * noise`` where
    ``noise`` is standard normal, so the randomness is isolated in ``noise``
    and the result stays differentiable with respect to both inputs.
    """
    mu, log_variance = args

    # Standard-normal noise shaped like the mean tensor
    noise = K.random_normal(shape=K.shape(mu), mean=0., stddev=1.0)

    # exp(log_var / 2) is the standard deviation
    std_dev = K.exp(log_variance / 2)

    return mu + std_dev * noise
43 |
44 | #Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Return the scalar VAE training loss for a batch.

    The loss is the batch mean of ``original_dim * mse(x_input, x_decoded)``
    plus ``beta`` times the KL divergence of the latent distribution
    (module-level ``z_mean`` / ``z_log_var``) from a standard normal.

    Relies on the module-level ``original_dim``, ``z_mean``, ``z_log_var``
    and ``beta`` defined further down in this script.
    """
    recon_term = original_dim * metrics.mse(x_input, x_decoded)
    kl_term = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)

    # Multiply by the beta *variable* itself. K.get_value(beta) would read the
    # current number once, while the loss graph is being built, and bake it in
    # as a constant -- later K.set_value calls from WarmUpCallback would then
    # never reach the loss.
    return K.mean(recon_term + (beta * kl_term))
51 |
52 | #Method for calculating the reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Metric: mean squared error between the input and its reconstruction.

    Unlike the reconstruction term inside ``vae_loss`` this value is not
    scaled by ``original_dim``.
    """
    mse_value = metrics.mse(x_input, x_decoded)
    return mse_value
56 |
57 | #Method for calculating the KL-divergence loss
def kl_loss(x_input, x_decoded):
    """Metric: KL divergence of the latent distribution from a standard normal.

    Both arguments are ignored; they exist only because Keras calls metrics
    with ``(y_true, y_pred)``. The value is computed from the module-level
    ``z_mean`` and ``z_log_var`` tensors.
    """
    per_dimension = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    return - 0.5 * K.sum(per_dimension, axis=-1)
60 |
class WarmUpCallback(Callback):
    """Keras callback that raises the KL weight ``beta`` after each epoch.

    beta:  Keras backend variable holding the current KL weight.
    kappa: amount added to ``beta`` at the end of every epoch.
    """

    def __init__(self, beta, kappa):
        # Let the base class set up its own attributes before adding ours.
        super(WarmUpCallback, self).__init__()
        self.beta = beta
        self.kappa = kappa

    # Behavior on each epoch
    def on_epoch_end(self, epoch, logs=None):
        # ``logs`` is unused; None replaces the former shared mutable default.
        # beta is incremented while it is still <= 1, so the final value can
        # end up slightly above 1 (by less than kappa).
        if K.get_value(self.beta) <= 1:
            K.set_value(self.beta, K.get_value(self.beta) + self.kappa)
70 |
#Read input file
# The cancer type is the first command-line argument; it selects the
# per-cancer folder under ../ALL_CANCER_FILES/.
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/VAE_FILES/'

input_filename = input_folder + cancer_type + '_DATA_TOP2_JOINED_PCA_1000L.tsv'
# Prefix of the encoded-data file; latent size and fold are appended on save
output_filename = cancer_type + '_DATA_TOP2_JOINED_encoded_'

input_df = pd.read_table(input_filename, index_col=0)
print("INPUT FILE", input_df.shape)
print(input_df.head(5))

# Set hyperparameters
# Arguments 2-5: widths of the two hidden layers, the latent dimension and
# the fold (run) index. The input width is taken from the data itself.
original_dim = input_df.shape[1]
intermediate1_dim = int(sys.argv[2])
intermediate2_dim = int(sys.argv[3])
latent_dim = int(sys.argv[4])
fold = int(sys.argv[5])

#SET RANDOM SEEDS
# NumPy and TensorFlow seeds are both derived from the fold index.
# NOTE(review): fold 0 gives seed 0 for both -- confirm this is intended.
from numpy.random import seed
seed(123456 * fold)
from tensorflow import set_random_seed
set_random_seed(123456 * fold)


init_mode = 'glorot_uniform'
batch_size = 50
epochs = 50
learning_rate = 0.0005
# beta weights the KL term in vae_loss; kappa is the per-epoch increment
# applied by WarmUpCallback. With kappa = 0 beta stays at its initial value 1.
beta = K.variable(1)
kappa = 0

# The complete input matrix is used for training (no held-out split here)
input_df_training = input_df
106 |
#Define encoder
x = Input(shape=(original_dim, ))

# First hidden block: Dense -> BatchNormalization -> ReLU
net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
net2 = BatchNormalization()(net)
net3 = Activation('relu')(net2)

# Second hidden block, same layout
net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
net5 = BatchNormalization()(net4)
net6 = Activation('relu')(net5)

# Two linear heads on the last hidden block: mean and log-variance of the
# latent distribution
z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)

# Sample from mean and var
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

#Define decoder
# The decoder layers are kept as objects so the same (trained) layers can be
# applied again to a separate latent input when the stand-alone decoder model
# is built after training.
decoder_h = Dense(intermediate2_dim, activation='relu', kernel_initializer=init_mode)
decoder_h2 = Dense(intermediate1_dim, activation='relu', kernel_initializer=init_mode)
decoder_mean = Dense(original_dim, kernel_initializer=init_mode)

h_decoded = decoder_h(z)
h_decoded2 = decoder_h2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

#VAE model
vae = Model(x, x_decoded_mean)

adam = optimizers.Adam(lr=learning_rate)
# reconstruction_loss and kl_loss are reported as metrics alongside the
# combined vae_loss that is actually optimized
vae.compile(optimizer=adam, loss = vae_loss, metrics = [reconstruction_loss, kl_loss])
vae.summary()

#Train model
# Autoencoder training: the input matrix is also the target
history = vae.fit(np.array(input_df_training), np.array(input_df_training),
               shuffle=True,
               epochs=epochs,
               batch_size=batch_size,
               verbose = 2,
               callbacks=[WarmUpCallback(beta, kappa)])
147 |
# DEFINE ENCODER
# The encoder maps the input to z_mean, i.e. the embedding is the latent mean
# rather than a random sample
encoder = Model(x, z_mean)

#DEFINE DECODER
# Re-apply the trained decoder layers to a fresh latent input
decoder_input = Input(shape=(latent_dim, ))
_h_decoded = decoder_h(decoder_input)
_h_decoded2 = decoder_h2(_h_decoded)
_x_decoded_mean = decoder_mean(_h_decoded2)
decoder = Model(decoder_input, _x_decoded_mean)


training_encoded = encoder.predict(input_df_training, batch_size = batch_size)
training_encoded_df = pd.DataFrame(training_encoded, index = input_df_training.index)

# How well does the model reconstruct the input data
training_reconstructed = decoder.predict(np.array(training_encoded_df))
training_reconstructed_df = pd.DataFrame(training_reconstructed, index = input_df_training.index, columns = input_df_training.columns)

recons_error = mean_squared_error(np.array(input_df_training), np.array(training_reconstructed_df))

print("TRAINING RECONSTRUCTION ERROR: " + str(recons_error))

#Save encoded training data
# File name pattern: <cancer>_DATA_TOP2_JOINED_encoded_<latent>L_TRAINING_fold<fold>.tsv
training_encoded_df.to_csv(output_folder + output_filename + str(latent_dim) + "L_TRAINING_fold" + str(fold) + ".tsv", sep = '\t')


#SAVE ENCODER MODEL
# Architecture goes to a .json file and weights to a matching .h5 file
from keras.models import model_from_json

model_json = encoder.to_json()
with open(output_folder + "VAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

encoder.save_weights(output_folder + "VAE_" + cancer_type + "_encoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


# Same two-file layout for the decoder
model_json = decoder.to_json()
with open(output_folder + "VAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".json", "w") as json_file:
    json_file.write(model_json)

decoder.save_weights(output_folder + "VAE_" + cancer_type + "_decoder_" + str(latent_dim) + "L_"+ str(fold) + ".h5")
print("Saved model to disk")


#Calculate training r squared
from sklearn.metrics import r2_score

# One R^2 value per row (sample) of the training matrix, computed across that
# row's features; the mean over all rows is reported
training_r2_vals = np.zeros(input_df_training.shape[0])
for i in range(input_df_training.shape[0]):
    training_r2 = r2_score(input_df_training.values[i, :], training_reconstructed_df.values[i, :])
    training_r2_vals[i] = training_r2

print("TRAINING R2 " + str(np.mean(training_r2_vals)))
202 |
--------------------------------------------------------------------------------
/MODEL_TRAININGS/gmeans.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Code is from https://github.com/flylo/g-means
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | from matplotlib import pyplot as plt
8 | import seaborn as sbn
9 |
10 | from sklearn.cluster import MiniBatchKMeans
11 | from sklearn.preprocessing import scale
12 |
13 | from sklearn import datasets
14 |
15 | from scipy.stats import anderson
16 |
17 | from pdb import set_trace
18 |
19 |
class GMeans(object):

    """Recursive 2-means clustering that stops splitting once a cluster
    looks Gaussian (G-means).

    strictness = how strict should the anderson-darling test for normality be
            0: not at all strict
            4: very strict

    ``strictness`` is used as an index into the critical values returned by
    ``scipy.stats.anderson``; per scipy's documented ordering a larger index
    selects a larger critical value.

    After ``fit`` the cluster id of every row is available in ``labels_``.
    Ids are small consecutive integers handed out in split order; they are
    only meaningful as identifiers. ``stopping_criteria`` records why each
    recursion branch stopped ('max_depth', 'gaussian' or 'min_obs').
    """

    def __init__(self, min_obs=1, max_depth=10, random_state=None, strictness=4):

        super(GMeans, self).__init__()

        self.max_depth = max_depth

        self.min_obs = min_obs

        self.random_state = random_state

        if strictness not in range(5):
            raise ValueError("strictness parameter must be integer from 0 to 4")
        self.strictness = strictness

        self.stopping_criteria = []

        # Next unused cluster id; 0 is the id of the unsplit root cluster.
        self._next_label = 1

    def _newLabel(self):
        """Return a cluster id that has not been used in the current fit."""
        label = self._next_label
        self._next_label += 1
        return label

    def _gaussianCheck(self, vector):
        """
        check whether a given input vector follows a gaussian distribution
            H0: vector is distributed gaussian
            H1: vector is not distributed gaussian

        Returns True when the Anderson-Darling statistic does not exceed the
        critical value selected by ``self.strictness`` (H0 is not rejected).
        """
        output = anderson(vector)
        statistic, critical_values = output[0], output[1]
        return bool(statistic <= critical_values[self.strictness])

    def _recursiveClustering(self, data, depth, index):
        """
        recursively run kmeans with k=2 on your data until a max_depth is reached or we have
        gaussian clusters

        data:  rows still belonging to the current cluster.
        depth: recursion depth of the caller (incremented on entry).
        index: two-column integer array for those rows -- column 0 is the row
               position in the fitted data, column 1 the current cluster id.
        """
        depth += 1
        if depth == self.max_depth:
            self.data_index[index[:, 0]] = index
            self.stopping_criteria.append('max_depth')
            return

        km = MiniBatchKMeans(n_clusters=2, random_state=self.random_state)
        km.fit(data)

        # Project the data onto the line joining the two centres and test the
        # standardized 1-d projection for normality.
        centers = km.cluster_centers_
        v = centers[0] - centers[1]
        x_prime = scale(data.dot(v) / (v.dot(v)))

        if self._gaussianCheck(x_prime):
            self.data_index[index[:, 0]] = index
            self.stopping_criteria.append('gaussian')
            return

        for k in set(km.labels_):
            members = km.labels_ == k
            current_data = data[members]

            if current_data.shape[0] <= self.min_obs:
                # A child is too small: keep the parent cluster unsplit. The
                # parent's index is written back for all of its rows, which
                # also reverts ids given to a sibling handled earlier in this
                # loop.
                self.data_index[index[:, 0]] = index
                self.stopping_criteria.append('min_obs')
                return

            # Boolean indexing copies, so the parent's index stays untouched.
            current_index = index[members]
            # A per-fit counter replaces np.random.randint(0, 100000000000):
            # ids are now reproducible, cannot collide, and do not depend on
            # the platform's default integer width.
            current_index[:, 1] = self._newLabel()
            self._recursiveClustering(data=current_data, depth=depth, index=current_index)

    def fit(self, data):
        """
        fit the recursive clustering model to the data

        ``data`` must support ``.shape``, ``.dot`` and boolean-mask row
        selection (e.g. a 2-d numpy array). Sets ``labels_``.
        """
        self.data = data

        # Restart id numbering so repeated fits give the same ids.
        self._next_label = 1

        # Column 0: row position; column 1: cluster id (False -> 0 = root).
        data_index = np.array([(i, False) for i in range(data.shape[0])])
        self.data_index = data_index

        self._recursiveClustering(data=data, depth=0, index=data_index)

        self.labels_ = self.data_index[:, 1]
112 |
113 |
if __name__ == '__main__':
    # Demo: cluster synthetic 2-d blobs with G-means and with a fixed-k
    # MiniBatchKMeans, then plot both labelings for visual comparison.
    blobs = datasets.make_blobs(n_samples=10000,
                                n_features=2,
                                centers=4,
                                cluster_std=1.0)[0]

    gmeans = GMeans(random_state=1010,
                    strictness=4)
    gmeans.fit(blobs)

    plot_data = pd.DataFrame(blobs[:, 0:2])
    plot_data.columns = ['x', 'y']
    plot_data['labels_gmeans'] = gmeans.labels_

    km = MiniBatchKMeans(n_clusters=4)
    km.fit(blobs)
    plot_data['labels_km'] = km.labels_

    sbn.lmplot(x='x', y='y', data=plot_data, hue='labels_gmeans', fit_reg=False)
    sbn.lmplot(x='x', y='y', data=plot_data, hue='labels_km', fit_reg=False)
    plt.show()
    # The trailing set_trace() breakpoint was removed so the demo exits
    # normally instead of dropping into the debugger.
140 |
141 |
142 |
--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Create_DeepProfile_GTEX_Embeddings.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating GTEX DeepProfile embeddings
3 | ###############################
4 |
5 | import pandas as pd
6 | import numpy as np
7 | import csv
8 | import sys
9 |
#Read cancer type from user
cancer_type = sys.argv[1]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/HEALTHY_TISSUE_FILES/'

#Read all VAE embeddings: one file per (latent dimension, run) pair
dims = [5, 10, 25, 50, 75, 100]
run = 100

embedding_prefix = input_folder + 'HEALTHY_TISSUE_FILES/GTEX_' + cancer_type + '_RNASeq_Expression_VAE_encoded_'

data_list = []
for dim in dims:
    for i in range(run):
        embedding_path = embedding_prefix + '{}L_{}.tsv'.format(dim, i)
        data_df = pd.read_table(embedding_path, index_col = 0)
        print("GTEX VAE embedding ", data_df.shape)
        data_list.append(data_df.values)

#Concatenate all embeddings column-wise (rows are kept as they are)
joined_data = np.concatenate(data_list, axis=1)
print("Joined VAE embedding ",joined_data.shape)

#Read DeepProfile ensemble labels
L = 150
labels_path = input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt'
labels = pd.read_table(labels_path, header= None).values
print("DeepProfile ensemble labels ", len(labels))

#Create ensemble embedding: column `label` is the mean of all joined
#columns whose ensemble label equals `label`
sample_count = joined_data.shape[0]
ensemble_embeddings = np.zeros((sample_count, L))
for label in range(L):
    member_columns = np.nonzero(labels == label)[0]
    ensemble_embeddings[:, label] = joined_data[:, member_columns].mean(axis = 1)


#Record the ensemble embeddings
#The row index is taken from the last VAE embedding file that was read
print("DeepProfile ensemble embedding ", ensemble_embeddings.shape)
output_path = output_folder + cancer_type + '_DeepProfile_GTEX_Healthy_Tissue_Embedding_' + str(L) + 'L.tsv'
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L))
ensemble_embeddings_df.to_csv(output_path, sep = '\t')
--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Create_Gtex_Rnaseq_PCs.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating PCs for expression matrices
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import PCA
9 | import sklearn.preprocessing
10 | import statsmodels.api as sm
11 | from sklearn.preprocessing import scale
12 |
def createData(cancer_type):
    """Project GTEX expression onto PCs learned from the training data.

    Fits a 1000-component PCA on the cancer training expression matrix,
    aligns the GTEX expression matrix to the training genes, transforms it
    with the fitted PCA and writes the result to
    ``HEALTHY_TISSUE_FILES/GTEX_<cancer_type>_DATA_1K_PCs.tsv``.

    cancer_type: name of the folder under ``../ALL_CANCER_FILES/``.
    """
    input_folder ='../ALL_CANCER_FILES/' + cancer_type + '/'

    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training expression dataframe ", data_df.shape)

    training_data = data_df.values
    training_data = np.nan_to_num(training_data)

    #Train PCA models
    pca = PCA(n_components = 1000)
    pca.fit(training_data)
    components = pca.components_
    print("PCA Components ", components.shape)

    #Read GTEX expression dataframe
    test_df = pd.read_table(input_folder + 'HEALTHY_TISSUE_FILES/' + 'GTEX_' + cancer_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t', index_col=0)
    print("Gtex expression dataframe ", test_df.shape)

    #Get genes available in training dataset
    # The outer concat followed by selecting data_df.columns keeps exactly the
    # training genes in training order, with NaN where GTEX lacks a gene; the
    # last test_df.shape[0] rows of the stacked frame are the GTEX samples.
    joined_df = pd.concat([data_df, test_df], sort=False, join = 'outer')
    joined_df = joined_df[data_df.columns]
    joined_df = joined_df.iloc[-1 * test_df.shape[0]:, :]
    test_df = joined_df

    print("Gtex expression dataframe ", test_df.shape)

    #Encode test data using trained PCA model
    # Missing values get the column mean; a column with no values at all gets 0
    test_df = test_df.fillna(test_df.mean().fillna(0))
    test_data = test_df.values

    #Save the encoded data
    encoded_data = pca.transform(test_data)
    encoded_df = pd.DataFrame(encoded_data, index = test_df.index)
    print("GTEX PCA data ", encoded_df.shape)
    # head() must be called; printing the bare attribute showed the bound
    # method instead of the first rows.
    print("GTEX PCA data ", encoded_df.head())
    encoded_df.to_csv(input_folder + '/HEALTHY_TISSUE_FILES/GTEX_' + cancer_type + '_DATA_1K_PCs.tsv', sep = '\t')


import sys

# Script entry: the cancer type is the first command-line argument
cancer_type = sys.argv[1]
createData(cancer_type)
57 |
--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Encode_GTEX_Data_with_VAE.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for encoding GTEX expression using VAE models
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import math
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 |
13 | import tensorflow as tf
14 | from keras.layers import Input, Dense, Lambda, Layer, Activation
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.models import Model
17 | from keras import backend as K
18 | from keras import metrics, optimizers
19 | from keras.callbacks import Callback
20 | import keras
21 |
22 | import csv
23 | import sys
24 | from keras.models import model_from_json
25 | from sklearn import preprocessing
26 |
#Prevent tensorflow from using all the memory
# With allow_growth enabled the session requests GPU memory on demand
# instead of reserving it all at start-up.
# NOTE(review): tf.ConfigProto / tf.Session are TensorFlow 1.x APIs, and the
# session created here is not passed to Keras anywhere in this script --
# confirm whether K.set_session(sess) was intended.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
31 |
32 | #Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Return the scalar VAE loss for a batch.

    Batch mean of ``original_dim * mse`` plus ``beta`` times the KL
    divergence computed from the module-level ``z_mean`` / ``z_log_var``.
    NOTE(review): nothing in this script calls this function; it appears to
    be kept only to mirror the training script.
    """
    recon_term = original_dim * metrics.mse(x_input, x_decoded)
    kl_term = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    # Use the beta variable itself: K.get_value(beta) would freeze the
    # current number into the graph as a constant.
    return K.mean(recon_term + (beta * kl_term))
37 |
#Read user inputs
# Arguments: cancer type, latent dimension of the VAE models to load, and the
# half-open range [start, end) of fold indices to process.
import sys
cancer = sys.argv[1]
dimension = int(sys.argv[2])
start = int(sys.argv[3])
end = int(sys.argv[4])

print("CANCER NAME: " + cancer)
data_folder = '../ALL_CANCER_FILES/' + cancer + '/'

#Read GTEX expression
input_df_test = pd.read_table(data_folder + 'HEALTHY_TISSUE_FILES/GTEX_' + cancer + '_DATA_1K_PCs.tsv', index_col = 0)
print("GTEX expression dataframe ", input_df_test.shape)

#Encode expression data with each VAE model
for fold in range(start, end):
    print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load VAE models
    # The context manager closes the JSON file even if reading it fails
    with open(data_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(data_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.h5')
    print("Loaded model from disk")

    #Define placeholder VAE model
    # NOTE(review): the layers built below are not connected to the loaded
    # encoder and are not used for the prediction; only batch_size is read
    # further down. They are rebuilt on every fold -- confirm whether this
    # section can be dropped.
    original_dim = input_df_test.shape[1]
    intermediate1_dim = 100
    intermediate2_dim = 25
    latent_dim = dimension

    batch_size = 50
    epochs = 50
    learning_rate = 0.0005
    beta = K.variable(1)
    kappa = 0
    init_mode = 'glorot_uniform'

    x = Input(shape=(original_dim, ))

    net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
    net2 = BatchNormalization()(net)
    net3 = Activation('relu')(net2)

    net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
    net5 = BatchNormalization()(net4)
    net6 = Activation('relu')(net5)

    z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
    z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)


    adam = optimizers.Adam(lr=learning_rate)

    # Encode test data using the VAE model
    test_encoded = encoder.predict(input_df_test, batch_size = batch_size)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    test_encoded_df.to_csv(data_folder + 'HEALTHY_TISSUE_FILES/' + 'GTEX_' + cancer + '_RNASeq_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
98 |
99 |
--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Example_Run_All.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Example for generating healthy tissue embeddings for a cancer type
3 | ###############################
4 |
# Every step is executed through IPython's %run -i magic, so this file must
# be run from inside an IPython session.
_ipy = get_ipython()

#Preprocess data
_ipy.magic(u"run -i Preprocess_Gtex_Rnaseq_Expressions.py BRCA")
_ipy.magic(u"run -i Create_Gtex_Rnaseq_PCs.py BRCA")

#Create DeepProfile embeddings
#Encode with folds 0-99 of the VAE models for each latent dimension
for _vae_dim in (5, 10, 25, 50, 75, 100):
    _ipy.magic(u"run -i Encode_GTEX_Data_with_VAE.py BRCA " + str(_vae_dim) + u" 0 100")

_ipy.magic(u"run -i Create_DeepProfile_GTEX_Embeddings.py BRCA")

#Train healthy tissue classifiers
_ipy.magic(u"run -i Normal_Tissue_Classifier.py BRCA")
--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Normal_Tissue_Classifier.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for training classifiers for separating healthy and cancer tissue embeddings
3 | ###############################
4 |
5 | import pandas as pd
6 | import seaborn as sb
7 | import numpy as np
8 | import pickle
9 | import random
10 | from tqdm import *
11 | from sklearn.linear_model import LogisticRegression
12 | from sklearn.model_selection import GridSearchCV
13 | from sklearn.model_selection import train_test_split
14 | from sklearn.utils import resample
15 | from sklearn.preprocessing import StandardScaler
16 |
17 |
def trainClassifier(cancer_type):
    """Train bootstrapped logistic-regression classifiers on embeddings.

    Stacks the cancer DeepProfile embedding on top of the GTEX healthy-tissue
    embedding, fits 500 L2-regularized logistic regressions on bootstrap
    resamples, and pickles the list of coefficient arrays to
    ``HEALTHY_TISSUE_FILES/bootstrap_<cancer_type>_weights.p``.

    cancer_type: name of the folder under ``../ALL_CANCER_FILES/``.
    """
    input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/HEALTHY_TISSUE_FILES/'

    #Read cancer embedding
    cancer_data = pd.read_csv(input_folder + cancer_type + '_DeepProfile_Training_Embedding_150L.tsv',sep='\t',index_col=0)
    print("Cancer embedding ", cancer_data.shape)

    #Read GTEX embedding
    healthy_data = pd.read_csv(input_folder + 'HEALTHY_TISSUE_FILES/' + cancer_type + '_DeepProfile_GTEX_Healthy_Tissue_Embedding_150L.tsv',sep='\t',index_col=0)
    print("GTEX embedding ", healthy_data.shape)

    #Combine datasets
    FULL_FRAME = pd.concat([cancer_data, healthy_data],axis=0)

    #Define tissue labels
    # Despite the variable name, the label is True for the cancer rows (which
    # come first in FULL_FRAME) and False for the GTEX rows.
    healthy_label = [x < cancer_data.shape[0] for x in range(FULL_FRAME.shape[0])]

    #Train 500 L2 models with bootstrapping
    bootstrap_weights = []
    for i in tqdm(range(500)):
        # A different, reproducible resample per iteration
        X_re,y_re = resample(FULL_FRAME,healthy_label, random_state = 1234 * i)
        clf = LogisticRegression(penalty = 'l2', solver = 'liblinear')
        clf.fit(X_re,y_re)

        bootstrap_weights.append(clf.coef_)

    #Save the results
    # The context manager guarantees the pickle is flushed and closed
    with open(output_folder + 'bootstrap_' + cancer_type + '_weights.p','wb') as weights_file:
        pickle.dump(bootstrap_weights, weights_file)


import sys

# Script entry: the cancer type is the first command-line argument
cancer_type = sys.argv[1]
trainClassifier(cancer_type)
53 |
--------------------------------------------------------------------------------
/NORMAL_TISSUE_ANALYSIS/Preprocess_Gtex_Rnaseq_Expressions.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating expression matrices for GTEX healthy samples
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import PCA
9 | import sklearn.preprocessing
10 | import statsmodels.api as sm
11 | from sklearn.preprocessing import scale
12 |
#Read all GTEX expression file
# The file is read from the current working directory as a tab-separated
# table whose first column becomes the index. The remaining column names are
# used below as sample identifiers, and a 'Description' column is read later
# for gene names.
# NOTE(review): standard .gct files start with two extra header lines --
# presumably they were removed from this file beforehand; confirm.
MAIN_df = pd.read_table('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', sep = '\t', index_col=0)
print("Gtex expression dataframe ", MAIN_df.shape)
all_samples = np.asarray(MAIN_df.columns)
17 |
18 | #Method for creating tissue-specific GTEX
def save_tissue_expression(cancer):
    """Write the preprocessed GTEX expression matrix for one tissue.

    Selects the GTEX samples listed in ``GTEX_<cancer>_SAMPLES.txt`` from the
    module-level ``MAIN_df``, transposes to samples x genes, mean-imputes
    missing values, log-transforms, standardizes every gene and saves the
    result as ``GTEX_<cancer>_PREPROCESSED_RNASEQ_EXPRESSION.tsv``.

    cancer: name of the folder under ``../ALL_CANCER_FILES/``.
    """
    data_folder = '../ALL_CANCER_FILES/' + cancer + '/HEALTHY_TISSUE_FILES/'

    #Read sample names of tissue-specific samples
    # NOTE(review): read_table treats the first line of the file as a header,
    # so that line does not end up in the index -- confirm the file has one.
    index_df = pd.read_table(data_folder + 'GTEX_' + cancer + '_SAMPLES.txt', sep = '\n', index_col=0)
    cancer_specific_samples = np.asarray(index_df.index)
    print("Samples ", cancer_specific_samples)

    #Find list of matching samples
    matching_samples = np.intersect1d(cancer_specific_samples, all_samples)
    print("MATCHING SAMPLES COUNT ", len(matching_samples))

    #Get the expression for these patients
    # Transpose so rows are samples and columns are gene names
    cancer_df = MAIN_df[matching_samples]
    gene_names = MAIN_df['Description'].values
    cancer_df = pd.DataFrame(cancer_df.values.T, index = cancer_df.columns, columns = gene_names)
    print("Samples ", cancer_df.shape)
    print('Range ', (np.max(cancer_df.values) - np.min(cancer_df.values) ))

    #Mean impute the missing values
    cancer_df = cancer_df.fillna(cancer_df.mean().fillna(0))

    #Log scale the data and make 0-mean univariate
    # log(0) yields -inf, which is expected here and replaced by 0 right
    # after, so the divide-by-zero warning is silenced.
    with np.errstate(divide='ignore'):
        scaled_expression_values = np.log(cancer_df.values)
    # np.isneginf replaces the comparison with np.NINF, an alias that was
    # removed in NumPy 2.0.
    scaled_expression_values[np.isneginf(scaled_expression_values)] = 0
    normalized_data = sklearn.preprocessing.scale(scaled_expression_values)
    print("Mean values ", np.mean(normalized_data, axis = 0))
    print("Mean values ", len(np.mean(normalized_data, axis = 0)))
    print("Std values ", np.std(normalized_data, axis = 0))
    print("Std values ", len(np.std(normalized_data, axis = 0)))

    #Save the final expression matrix
    cancer_df = pd.DataFrame(normalized_data, index = cancer_df.index, columns = cancer_df.columns)
    print("Final dataframe ", cancer_df.shape)
    print("Final dataframe ", cancer_df.head())
    print('Final dataframe range: ', (np.max(cancer_df.values) - np.min(cancer_df.values) ))

    cancer_df.to_csv(data_folder + 'GTEX_' + cancer + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')


import sys

# Script entry: the cancer type is the first command-line argument
cancer_type = sys.argv[1]
save_tissue_expression(cancer_type)
--------------------------------------------------------------------------------
/PATHWAY_ANALYSIS/Create_Pathway_Matrices.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating pathway matrices for cancer type
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | import sys
9 |
#Read cancer name and pathway file
# Argument 1 is the cancer type (folder name under ../ALL_CANCER_FILES/);
# argument 2 is the pathway collection key that create_pathway_matrix maps to
# an MSigDB .gmt file (e.g. 'C2', 'H', 'C5_BP').
cancer_type = sys.argv[1]
pathway_name = sys.argv[2]

# Both folders are read as module-level globals inside create_pathway_matrix
input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PATHWAY_FILES/'
16 |
def create_pathway_matrix(cancer_type, pathway_file):
    """Build and save the gene x pathway membership matrix.

    Reads the gene names (columns) of the cancer expression matrix and the
    MSigDB .gmt file selected by ``pathway_file``, then writes a 0/1 matrix
    with one row per gene and one column per pathway to
    ``PATHWAY_<pathway_file>_MATRIX_INTERSECTION_GENES.tsv`` and the gene
    names to ``Gene_Symbols.txt``. Uses the module-level ``input_folder`` and
    ``output_folder``.

    cancer_type:  name of the folder under ``../ALL_CANCER_FILES/``.
    pathway_file: one of 'C2', 'H', 'C4_CGN', 'C4_CM', 'C5_BP', 'C5_CC',
                  'C5_MF', 'C6', 'C7'.

    Raises ValueError for an unknown ``pathway_file`` (previously this
    surfaced later as an unbound-variable error).
    """
    gmt_files = {
        'C2': 'MSIGDB_PATHWAYS/c2.v6.2.symbols.gmt',
        'H': 'MSIGDB_PATHWAYS/h.all.v6.2.symbols.gmt',
        'C4_CGN': 'MSIGDB_PATHWAYS/c4.cgn.v6.2.symbols.gmt',
        'C4_CM': 'MSIGDB_PATHWAYS/c4.cm.v6.2.symbols.gmt',
        'C5_BP': 'MSIGDB_PATHWAYS/c5.bp.v6.2.symbols.gmt',
        'C5_CC': 'MSIGDB_PATHWAYS/c5.cc.v6.2.symbols.gmt',
        'C5_MF': 'MSIGDB_PATHWAYS/c5.mf.v6.2.symbols.gmt',
        'C6': 'MSIGDB_PATHWAYS/c6.all.v6.2.symbols.gmt',
        'C7': 'MSIGDB_PATHWAYS/c7.all.v6.2.symbols.gmt',
    }
    # Fail fast, before the (large) expression matrix is read
    if pathway_file not in gmt_files:
        raise ValueError("Unknown pathway collection: " + str(pathway_file))
    filename = gmt_files[pathway_file]

    #1) Read input data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', index_col=0)
    print("Input data ", data_df.shape)
    gene_names = data_df.columns

    #2) Read pathway data
    with open(filename) as f:
        content = f.readlines()
    content = [x.strip() for x in content]

    pathway_count = len(content)
    print("Pathway count " + str(pathway_count))

    # Builtin int replaces np.int, an alias removed in NumPy 1.24
    pathway = np.zeros((len(gene_names), pathway_count), dtype = int)
    pathway_names = []
    pathway_lens = []

    # Row of the first occurrence of every gene name; one dict lookup per
    # pathway gene replaces a full scan of gene_names.
    gene_to_row = {}
    for row, gene in enumerate(gene_names):
        gene_to_row.setdefault(gene, row)

    for i, line in enumerate(content):
        # Fields are tab-separated: the first is used as the pathway name,
        # the second is skipped, the rest are gene symbols
        data = line.split("\t")
        genes = data[2:]
        pathway_names.append(data[0])

        pathway_lens.append(len(genes))

        #Loop through all genes
        for gene in genes:
            row = gene_to_row.get(gene)
            if row is not None:
                pathway[row, i] = 1

    #3) Save matrix
    new_df = pd.DataFrame(pathway, index = gene_names, columns = pathway_names)
    print("Pathway matrix ", new_df.shape)
    print("Average pathway length ", np.mean(pathway_lens))
    print("Average pathway length ", pathway_lens)
    new_df.to_csv(output_folder + 'PATHWAY_' + pathway_file + '_MATRIX_INTERSECTION_GENES.tsv', sep='\t', quoting = csv.QUOTE_NONE)


    #Also record gene symbols
    with open(output_folder + 'Gene_Symbols.txt', 'w') as f:
        for item in gene_names:
            f.write("%s\n" % item)


create_pathway_matrix(cancer_type, pathway_name)
--------------------------------------------------------------------------------
/PATHWAY_ANALYSIS/Fishers_Test.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for running fisher's test for pathway enrichment analysis
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | import statsmodels.api as sm
9 | import sys
10 |
#Read user inputs:
#  argv[1] cancer type, argv[2] pathway collection key, argv[3] method name,
#  argv[4]/argv[5] start (inclusive) / end (exclusive) of the run range,
#  argv[6] optional latent dimension (required for method 'VAE')
cancer_type = sys.argv[1]
pathway_type = sys.argv[2]
method = sys.argv[3]
start = int(sys.argv[4])
end = int(sys.argv[5])
if len(sys.argv) > 6:
    dimension = int(sys.argv[6])
    L = dimension
else:
    dimension = None
    L = 150

#Imported once here instead of on every iteration of the innermost loop
import scipy.stats as stats

#Number of top-ranked genes per node that enter the test, per collection
#NOTE(review): the original comments call these "average number of pathways";
#they look like average pathway lengths of each collection - confirm
TOP_GENE_COUNTS = {
    'C2': 51,
    'C4_CM': 113,
    'C4_CGN': 99,
    'C6': 166,
    'C5_BP': 114,
    'C5_CC': 151,
    'C5_MF': 106,
    'H': 146,
}
KNOWN_METHODS = ('PCA', 'ICA', 'RP', 'AE', 'DAE', 'DeepProfile', 'VAE')

#Fail early with a clear message; otherwise an unknown value surfaces later as
#a NameError, or silently reuses a stale N / weights when run with "%run -i"
if pathway_type not in TOP_GENE_COUNTS:
    raise ValueError("Unsupported pathway type '" + pathway_type + "'; expected one of " + ", ".join(sorted(TOP_GENE_COUNTS)))
if method not in KNOWN_METHODS:
    raise ValueError("Unsupported method '" + method + "'; expected one of " + ", ".join(KNOWN_METHODS))
if method == 'VAE' and dimension is None:
    raise ValueError("Method 'VAE' needs the latent dimension as the 6th argument")
N = TOP_GENE_COUNTS[pathway_type]

input_folder = '../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = '../ALL_CANCER_FILES/' + cancer_type + '/PATHWAY_FILES/'

#Binary gene x pathway matrix
pathway_matrix = pd.read_table(input_folder + 'PATHWAY_FILES/PATHWAY_' + pathway_type + '_MATRIX_INTERSECTION_GENES.tsv', index_col = 0)
print(pathway_matrix.shape)
pathway_df = pathway_matrix
pathway_matrix = pathway_matrix.values

#Row indices of the member genes of every pathway; identical for all nodes and runs
pathway_members = [np.where(pathway_matrix[:, j] == 1)[0] for j in range(pathway_matrix.shape[1])]

#Run test for each run
for run in range(start, end):

    #Load the node x gene weight matrix of the requested method
    if method == 'PCA':
        data_df = pd.read_table(input_folder + 'PCA_FILES/' + cancer_type + '_DATA_TOP2_JOINED_PCA_COMPONENTS_150L.tsv', index_col = 0)
        print(data_df.shape)
        ensemble_weights = np.abs(data_df.values.T)
    elif method == 'ICA':
        data_df = pd.read_table(input_folder + 'ICA_FILES/' + cancer_type + '_DATA_TOP2_JOINED_ICA_COMPONENTS_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
        print(data_df.shape)
        ensemble_weights = np.abs(data_df.values.T)
    elif method == 'RP':
        data_df = pd.read_table(input_folder + 'RP_FILES/' + cancer_type + '_DATA_TOP2_JOINED_RP_COMPONENTS_fold' + str(run + 1) + '.tsv', index_col = 0)
        print(data_df.shape)
        ensemble_weights = np.abs(data_df.values.T)
    elif method == 'AE':
        data_df = pd.read_table(input_folder + 'AE_FILES/' + cancer_type + '_DATA_AE_Weights_TRAINING_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
        print(data_df.shape)
        ensemble_weights = data_df.values.T
    elif method == 'DAE':
        data_df = pd.read_table(input_folder + 'DAE_FILES/' + cancer_type + '_DATA_DAE_Weights_TRAINING_150L_fold' + str(run + 1) + '.tsv', index_col = 0)
        print(data_df.shape)
        ensemble_weights = data_df.values.T
    elif method == 'DeepProfile':
        data_df = pd.read_table(input_folder + cancer_type + '_DeepProfile_Ensemble_Gene_Importance_Weights_150L.tsv', index_col = 0)
        print(data_df.shape)
        #Used as read (no transpose), unlike the other methods
        ensemble_weights = data_df.values
    else:
        #VAE: the input file is indexed by run, the output files below by run + 1
        data_df = pd.read_table(input_folder + 'VAE_WEIGHTS/' + cancer_type + '_DATA_VAE_Cluster_Weights_TRAINING_' + str(dimension) + 'L_fold' + str(run) + '.tsv', index_col = 0)
        print(data_df.shape)
        ensemble_weights = data_df.values.T
    print(ensemble_weights.shape)

    #Apply fisher test
    node_count = ensemble_weights.shape[0]
    gene_count, pathway_count = pathway_matrix.shape
    p_vals = np.zeros((node_count, pathway_count))

    print("Running for top ", N, " genes")

    for i in range(node_count):
        print(i)

        #Indices of the N highest-weighted genes of node i (same for every pathway)
        gene_indices = ensemble_weights[i, :].argsort()[-N:][::-1]
        top_count = len(gene_indices)

        for j in range(pathway_count):
            pathway_indices = pathway_members[j]

            #2x2 contingency table; the four cells partition all genes
            in_pathway_firstN = len(np.intersect1d(pathway_indices, gene_indices))
            out_pathway_firstN = top_count - in_pathway_firstN
            in_pathway_other = len(pathway_indices) - in_pathway_firstN
            #Genes that are neither among the top genes nor in the pathway.
            #The earlier formula (all genes - in_pathway_other) counted the
            #top genes a second time in this cell.
            out_pathway_other = gene_count - top_count - in_pathway_other

            matrix = np.zeros((2, 2))
            matrix[0, 0] = in_pathway_firstN
            matrix[0, 1] = in_pathway_other
            matrix[1, 0] = out_pathway_firstN
            matrix[1, 1] = out_pathway_other

            oddsratio, pvalue = stats.fisher_exact(matrix)
            p_vals[i, j] = pvalue

    #Suffix shared by both output files; VAE outputs also carry the dimension
    if method == 'VAE':
        run_tag = method + '_' + str(dimension) + 'L_' + str(run + 1)
    else:
        run_tag = method + '_' + str(run + 1)

    #Record uncorrected p-values
    p_vals_df = pd.DataFrame(p_vals, index = np.arange(L) + 1, columns = pathway_df.columns)
    p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_UNCORRECTED_PVALS_' + pathway_type + '_' + run_tag + '.tsv', sep = '\t')

    #Record corrected p-values (Benjamini-Hochberg across the nodes, per pathway)
    new_p_values = np.zeros((p_vals.shape[0], p_vals.shape[1]))
    for i in range(pathway_count):
        corrected_pval = sm.stats.multipletests( p_vals[:, i], alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)[1]
        new_p_values[:, i] = corrected_pval

    #Number of distinct pathways significant for at least one node
    x = np.where(new_p_values < 0.05)[1]
    unique_count = len(np.unique(x))
    print("UNIQUE PATHWAY COUNT: " + str(unique_count))

    p_vals_df = pd.DataFrame(new_p_values, index = np.arange(L) + 1, columns = pathway_df.columns)
    p_vals_df.to_csv(output_folder + cancer_type + '_FISHER_FDR_CORRECTED_PVALS_' + pathway_type + '_' + run_tag + '.tsv', sep = '\t')

    #Significant (node, pathway) pairs averaged over the nodes actually tested
    #(previously always divided by 150, whatever the number of nodes)
    significant_count = len(np.where(p_vals_df.values < 0.05)[1])
    print("AVERAGE PATHWAY COUNT: ", significant_count / node_count)
171 |
172 |
--------------------------------------------------------------------------------
/PATHWAY_ANALYSIS/Run_Multiple_Fishers_Test.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for running multiple FETs
3 | ###############################
4 |
5 | import sys
6 |
#Read user inputs: cancer type and pathway collection key
cancer_type = sys.argv[1]
pathway = sys.argv[2]

#(method, start, end) passed to Fishers_Test.py, which loops over
#range(start, end). AE and DAE start at -1 as in the original commands.
FISHER_TEST_RUNS = [
    ("DeepProfile", 0, 1),
    ("PCA", 0, 1),
    ("ICA", 0, 10),
    ("RP", 0, 10),
    ("AE", -1, 9),
    ("DAE", -1, 9),
]

#Loop names differ from the globals Fishers_Test.py assigns (method, start,
#end), because "%run -i" executes it in the interactive namespace
for fet_method, first_run, last_run in FISHER_TEST_RUNS:
    #run_line_magic replaces the deprecated get_ipython().magic(); the
    #argument string after "run" is the same as before
    get_ipython().run_line_magic("run", "-i 'Fishers_Test.py' '" + cancer_type + "' " + pathway + " " + fet_method + " " + str(first_run) + " " + str(last_run))
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepProfile
2 |
3 | Repository with scripts for all model training and analysis for paper "A deep profile of gene expression across 18 human cancers"
4 |
5 | All fully pre-processed input data for training the models can be found on our Figshare Data repository. For each cancer, the basic data we used is **'CANCER_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'** where CANCER is the name of the cancer type. This data consists of GEO datasets collected from the top 2 platforms, restricted to the genes shared between them, with batch correction applied.
6 |
7 | The folder **MODEL_TRAININGS** includes all scripts and notebooks for training VAE models and obtaining attributions.
8 |
9 | The script **Example_Run_All.py** includes all commands for training DeepProfile model for one cancer type.
10 |
11 | **STEP 1: Creating PCs for each data**
12 |
13 | Create a folder **./ALL_CANCER_FILES/CANCER/** then download the data and save in that folder.
14 | **Create_PCs_for_DeepLearning_Models.py** takes a cancer type and component_count as input and applies PCA on the training data to train deep learning models.
15 |
16 | **STEP 2: Training VAE models**
17 |
18 | **VAE_3Layers_Model.py** is the Keras implementation of VAE model.
19 | **Run_VAE_Models.py** takes the cancer type, number of hidden nodes, and start-end folds to train VAE models for the given cancer type.
20 |
21 | **STEP 3: Running IG for VAE models**
22 |
23 | **IntegratedGradients.py** is the Keras implementation for Integrated Gradients feature attribution method.
24 | **Get_VAE_IG_Attributions.py** is the script for running IG to get gene-level explanations for each of the nodes. It takes the cancer type, number of hidden nodes, and start-end folds to get explanations for the VAE models for the given cancer type.
25 |
26 | **STEP 4: Learning ensemble labels**
27 |
28 | **Create_Ensemble_Labels.py** is the script for running k-means clustering to learn ensemble weights. It takes the cancer type and number of final latent nodes as the input and saves the ensemble labels.
29 | **Select_Latent_Dimension_with_Gmeans** is the notebook for running g-means clustering to select the ensemble latent dimension size.
30 |
31 | **STEP 5: Creating DeepProfile ensemble training embedding**
32 |
33 | **Create_DeepProfile_Training_Embeddings.py** is the script for joining all the training data VAE embeddings and ensembling them using the learned ensemble labels. It takes the cancer type as the input and creates training DeepProfile ensemble embedding.
34 |
35 | **STEP 6: Creating DeepProfile ensemble gene attribution matrices**
36 |
37 | **Create_DeepProfile_Ensemble_Weights.py** is the script for joining all the VAE gene attributions and ensembling them using the learned ensemble labels. It takes the cancer type as the input and creates DeepProfile gene attribution matrix.
38 |
39 |
40 |
41 | ### PART 2: TRAINING COMPETITOR MODELS
42 |
43 | The script **Example_Run_All.py** includes all commands for training competitor models for one cancer type.
44 |
45 | In **COMPETITOR_TRAININGS**, all the scripts for comparing DeepProfile to other methods are included.
46 |
47 | **STEP 1: Training PCA Models**
48 |
49 | **Create_PCA_Data.py** takes a cancer type and creates PCA components for the training data.
50 |
51 | **STEP 2: Training ICA Models**
52 |
53 | **Create_ICA_Data.py** takes a cancer type and creates ICA components for the training data, repeating 10 times.
54 |
55 | **STEP 3: Training RP Models**
56 |
57 | **Create_RP_Data.py** takes a cancer type and creates RP components for the training data, repeating 10 times.
58 |
59 | **STEP 4: Training AE Models**
60 |
61 | **AE_2Layers_Model.py** is the Keras implementation of AE model.
62 | **Train_AE_Models.py** takes a cancer type as input and trains 10 AE models with different random seeds.
63 | **Get_AE_IG_Attributions.py** is the script for running IG to get gene-level explanations for each of the nodes. It takes the cancer type and fold to get explanations for the AE models for the given cancer type.
64 |
65 | **STEP 5: Training DAE Models**
66 |
67 | **DAE_2Layers_Model.py** is the Keras implementation of DAE model.
68 | **Train_DAE_Models.py** takes a cancer type as input and trains 10 DAE models with different random seeds.
69 | **Get_DAE_IG_Attributions.py** is the script for running IG to get gene-level explanations for each of the nodes. It takes the cancer type and fold to get explanations for the DAE models for the given cancer type.
70 |
71 |
72 |
73 | ### PART 3: TCGA SURVIVAL PREDICTIONS
74 |
75 | In **TCGA_SURVIVAL_PREDICTION** folder, all files and scripts are included for predicting TCGA expression survival.
76 |
77 | In folder **TCGA_DATA**, **TCGA_CLINICAL_DATA** folder includes clinical data for TCGA samples. **TCGA_MICROARRAY** folder includes microarray expression data and **TCGA_RNASEQ** folder includes RNA-Seq expression data.
78 |
79 | **STEP 1: Preprocessing Data**
80 |
81 | **CREATE_EMBEDDINGS** folder includes all scripts to generate TCGA RNA-Seq embeddings.
82 |
83 | The script **Example_Run_All.py** includes all commands for generating TCGA expression embeddings for one cancer type.
84 |
85 | **Preprocess_TCGA_Rnaseq_Expression.py** script takes the cancer type and TCGA cancer type as input and preprocesses the expression data to train models.
86 |
87 | **Create_TCGA_Rnaseq_PCs.py** script again takes the cancer type and TCGA cancer type as input and applies PCA to preprocessed expression to record top PCs to train deep learning models.
88 |
89 | **STEP 2: Encoding Expression with DeepProfile**
90 |
91 | **Encode_TCGA_Data_with_VAE.py** takes the preprocessed PCAed expression and encodes it using the already trained VAE models. The script takes cancer type, TCGA type, VAE dimension, start and end runs to encode the expression.
92 |
93 | **Create_All_VAE_Embeddings.py** takes cancer type and TCGA type as input and encodes TCGA expression with all trained VAE models.
94 |
95 | **Create_DeepProfile_TCGA_Embeddings.py** takes the cancer type and TCGA type as input and generates the DeepProfile embedding. The script loads in all the VAE embeddings and ensemble labels to generate an ensemble DeepProfile embedding.
96 |
97 | **STEP 3: Encoding Expression with Competitor Models**
98 |
99 | **Encode_TCGA_Data_with_PCA.py** takes the cancer type and TCGA type as input and generates the PCA embedding for TCGA RNA-Seq expressions.
100 |
101 | **Encode_TCGA_Data_with_ICA.py** takes the cancer type and TCGA type as input and generates the ICA embedding for TCGA RNA-Seq expressions.
102 |
103 | **Encode_TCGA_Data_with_RP.py** takes the cancer type and TCGA type as input and generates the RP embedding for TCGA RNA-Seq expressions.
104 |
105 | **Encode_TCGA_Data_with_AE.py** takes the cancer type and TCGA type as input and generates the AE embedding for TCGA RNA-Seq expressions.
106 |
107 | **Encode_TCGA_Data_with_DAE.py** takes the cancer type and TCGA type as input and generates the DAE embedding for TCGA RNA-Seq expressions.
108 |
109 | **STEP 4: Generating Survival DataFrames**
110 |
111 | Folder **CREATE_SURVIVAL_DATAFRAMES** includes all scripts for generating survival data frames.
112 |
113 | **Create_TCGA_Survival_Dataframes.py** takes the cancer type and TCGA type as input and extracts the necessary fields from clinical data to define the survival dataframe.
114 |
115 | **Create_Joined_Survival_Dataframes.py** takes the cancer type and TCGA type as input and combines the DeepProfile RNA-Seq embeddings with survival data frames.
116 |
117 | **Create_Joined_Survival_Dataframes_Cancer_Types.py** combines data frames for cancer subtypes under the main cancer type.
118 |
119 | **STEP 5: Predicting Survival**
120 |
121 | Folder **PREDICT_SURVIVAL** contains scripts for predicting survival.
122 |
123 | **Predict_Survival.py** trains lasso regression models with subsampling taking the TCGA RNA-Seq embeddings as the input.
124 |
125 | **Predict_Survival_Subtypes_Joined.py** trains lasso regression models with subsampling taking the TCGA RNA-Seq embeddings as the input while joining multiple TCGA cancer types when there are multiple TCGA cancer subtypes corresponding to one major cancer type we have.
126 |
127 | **Run_Models.py** trains all prediction models for all models and cancer types.
128 |
129 | **Plots_of_Survival_Prediction.ipynb** and **Plots_of_Survival_Prediction_VAEs.ipynb** are notebooks for generating plots of comparing survival predictions of models.
130 |
131 | **STEP 6: Comparing RNA-Seq and microarray DeepProfile embeddings**
132 |
133 | **COMPARING_RNASEQ_and_MICROARRAY** folder includes all scripts to generate TCGA microarray embeddings and to compare the embeddings with RNA-Seq embeddings.
134 |
135 | **Preprocess_TCGA_Microarray_Expression.py** script takes the cancer type and TCGA cancer type as input and preprocesses the microarray expression data to train models.
136 |
137 | **Create_TCGA_Microarray_PCs.py** script again takes the cancer type and TCGA cancer type as input and applies PCA to preprocessed expression to record top PCs to train deep learning models.
138 |
139 | **Encode_TCGA_Microarray_Data_with_VAE.py** takes the preprocessed PCAed expression and encodes it using the already trained VAE models. The script takes cancer type, TCGA type, VAE dimension, start and end runs to encode the expression.
140 |
141 | **Create_DeepProfile_TCGA_Microarray_Embeddings.py** takes the cancer type and TCGA type as input and generates the DeepProfile embedding. The script loads in all the VAE embeddings and ensemble labels to generate an ensemble DeepProfile embedding.
142 |
143 | **Rnaseq_and_Microarray_Embedding_Correlation_Plots** notebook calculates correlation between RNA-Seq and microarray embeddings and generates plots.
144 |
145 |
146 |
147 | ### PART 4: PATHWAY ENRICHMENT TESTS
148 |
149 | In **PATHWAY_ANALYSIS** folder, the scripts and files for pathway analysis are included.
150 |
151 | In **MSIGDB_PATHWAYS** folder, the files for Molecular Signature Database pathways are included.
152 |
153 | **STEP 1: Running enrichment tests**
154 |
155 | **Create_Pathway_Matrices.py** is the script for creating binary pathway matrices for the genes that are present in the training datasets. It takes a cancer type and pathway type as input and creates a binary matrix of pathway overlaps.
156 |
157 | **Fishers_Test.py** is the script for running Fisher's test. It takes the cancer type, pathway type, the method name, and the range of runs and records uncorrected and FDR-corrected p-values.
158 |
159 | **Run_Multiple_Fishers_Test.py** is the script for running multiple tests consecutively. It takes the cancer type and pathway name as input and carries enrichment tests for all methods.
160 |
161 | **STEP 2: Comparing pathway coverages**
162 |
163 | **PATHWAY_COVERAGE_ANALYSIS** folder includes all scripts for comparing pathway coverage of models.
164 |
165 | **Plot_of_Average_Pathway_Coverages** generates plots of average pathway coverage to compare DeepProfile and other dimensionality reduction methods.
166 |
167 | **Plot_of_Pathway_Coverage_Distributions** generates plots of distribution of pathway count of each node of DeepProfile and other dimensionality reduction methods.
168 |
169 | **Plot_of_Node_Level_Pathway_Annotations** generates plots of percent of nodes annotated by at least one pathway across multiple thresholds.
170 |
171 | **Plot_of_Pathway_Detection_Comparison_VAEs_vs_DeepProfile** creates plots for comparing pathways captured by DeepProfile vs individual VAE models.
172 |
173 | **Plot_of_Pathway_Percent_Comparison_VAEs_vs_DeepProfile** creates plots for comparing pathways captured by DeepProfile vs individual VAE models based on percentages.
174 |
175 |
176 |
177 | ### PART 5: NORMAL TISSUE ANALYSIS
178 |
179 | In **NORMAL_TISSUE_ANALYSIS** folder, the scripts for normal tissue analysis are included.
180 |
181 | The script **Example_Run_All.py** includes all commands for generating normal tissue expression embeddings for one cancer type.
182 |
183 | **Gtex_Tissue_Name_Mappings** is the notebook for mapping GTEX tissue names to cancer types we have. The GTEX expression data includes samples from many different tissues. We extract the GTEX sample names for each cancer type we have.
184 |
185 | **Preprocess_Gtex_Rnaseq_Expressions.py** is the script for creating preprocessed GTEX gene expression. It takes the cancer type as input and preprocesses the GTEX RNA-Seq expression using the same preprocessing steps applied to our training data.
186 |
187 | **Create_Gtex_Rnaseq_PCs.py** is the script for taking top PCs of the GTEX expression profiles to train DeepProfile model. It takes the cancer type as input and records the top PCs of GTEX expression.
188 |
189 | **Encode_GTEX_Data_with_VAE.py** is the script for encoding GTEX expression using trained VAE models. The inputs to the model are the cancer type, the number of latent nodes, and start and end runs.
190 |
191 | **Create_DeepProfile_GTEX_Embeddings.py** is the script for creating DeepProfile embedding using generated VAE embeddings. It takes the cancer type as input and records the final DeepProfile embedding for GTEX normal tissue samples.
192 |
193 | **Normal_Tissue_Classifier.py** is the script for training the classifier to separate normal vs cancer tissue embeddings. It takes the cancer type as input and records the bootstrapped classifier weights.
194 |
195 |
196 |
197 |
198 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Create_DeepProfile_TCGA_Microarray_Embeddings.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA microarray DeepProfile embeddings
3 | ###############################
4 |
5 | import pandas as pd
6 | import numpy as np
7 | import csv
8 | import sys
9 |
#Command-line arguments: cancer type and TCGA type
cancer_type, tcga_type = sys.argv[1], sys.argv[2]

#Latent sizes of the VAE embeddings to load, and how many runs per size
dims = [5, 10, 25, 50, 75, 100]
run = 100

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
output_folder = input_folder + 'TCGA_FILES/'

#Load every VAE embedding of the TCGA microarray samples
data_list = []
for dim in dims:
    for i in range(run):
        embedding_path = input_folder+ 'TCGA_FILES/TCGA_' + tcga_type + '_MICROARRAY_Expression_VAE_encoded_' + str(dim) + 'L_' + str(i) + '.tsv'
        data_df = pd.read_table(embedding_path, index_col = 0)
        print("TCGA VAE embedding ", data_df.shape)
        data_list.append(data_df.values)

#Stack the embeddings side by side: one row per sample, one column per VAE node
joined_data = np.concatenate(data_list, axis=1)
print("Joined VAE embedding ",joined_data.shape)

#Load the ensemble label of every VAE node
L = 150
labels_path = input_folder + cancer_type + '_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_' + str(L) + 'L.txt'
labels_df = pd.read_table(labels_path, header= None)
labels = labels_df.values
print("DeepProfile ensemble labels ", len(labels))

#Each ensemble node is the mean of the VAE nodes that carry its label
ensemble_embeddings = np.zeros((joined_data.shape[0], L))
for label in range(L):
    indices = np.where(labels == label)[0]
    ensemble_embeddings[:, label] = joined_data[:, indices].mean(axis = 1)

#Save the ensemble embedding, indexed by the samples of the last embedding read
print(ensemble_embeddings.shape)
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L))
embedding_file = output_folder + tcga_type + '_DeepProfile_TCGA_MICROARRAY_Embedding_' + str(L) + 'L.tsv'
ensemble_embeddings_df.to_csv(embedding_file, sep = '\t')
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Create_TCGA_Microarray_PCs.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for recording top PCs of TCGA microarray data
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import PCA
9 | import sklearn.preprocessing
10 |
11 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type, n_components=500):
    """Project TCGA microarray expression onto the PCs of the training data.

    Fits a PCA on the training expression table of ``cancer_type``, uses it
    to transform the preprocessed TCGA microarray expression of
    ``tcga_type`` and writes the result to
    ``TCGA_FILES/TCGA_MICROARRAY_<tcga_type>_PCA_<n_components>L.tsv``.

    NOTE(review): Encode_TCGA_Microarray_Data_with_VAE.py reads
    ``..._PCA_1000L.tsv`` while the default here produces ``..._PCA_500L.tsv``.
    Pass ``n_components=1000`` to produce the file that script opens; which
    value matches the trained VAE models should be confirmed.

    Args:
        cancer_type: cancer name used to locate the training data folder.
        tcga_type: TCGA cohort name used in the input and output file names.
        n_components: number of principal components to keep; also used in
            the output file name. Defaults to 500, the previous fixed value.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Apply PCA to training data (NaNs are replaced by 0 first)
    training_data = data_df.values
    training_data = np.nan_to_num(training_data)

    pca = PCA(n_components = n_components)
    pca.fit(training_data)
    components = pca.components_
    print("PCA components ", components.shape)

    #Read TCGA microarray expression
    tcga_df = pd.read_table(input_folder + 'TCGA_FILES/TCGA_' + tcga_type + '_PREPROCESSED_MICROARRAY_EXPRESSION.tsv', index_col= 0)
    print("TCGA expression ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))

    #Encode test data using trained PCA model
    test_data = tcga_df.values
    encoded_data = pca.transform(test_data)
    print("Encoded TCGA data ", encoded_data.shape)

    #Record the encoded data; the file name carries the component count
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    encoded_df.to_csv(output_folder + 'TCGA_MICROARRAY_' + tcga_type + '_PCA_' + str(n_components) + 'L.tsv', sep = '\t')
43 |
44 |
45 | import sys
46 | cancer_type = sys.argv[1]
47 | tcga_type = sys.argv[2]
48 | create_data(cancer_type, tcga_type)
49 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Encode_TCGA_Microarray_Data_with_VAE.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for encoding TCGA microarray expression using VAE models
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import math
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 |
13 | import tensorflow as tf
14 | from keras.layers import Input, Dense, Lambda, Layer, Activation
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.models import Model
17 | from keras import backend as K
18 | from keras import metrics, optimizers
19 | from keras.callbacks import Callback
20 | import keras
21 |
22 | import csv
23 | import sys
24 | from keras.models import model_from_json
25 | from sklearn import preprocessing
26 |
#Prevent tensorflow from using all the memory
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Return the VAE loss: scaled reconstruction MSE plus beta-weighted KL term.

    Reads the module-level names ``original_dim``, ``z_mean``, ``z_log_var``
    and ``beta`` that are assigned inside the fold loop below. Nothing in
    this script calls it.
    """
    reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss))


#Read user inputs: cancer type, TCGA type, VAE latent dimension and the
#start (inclusive) / end (exclusive) fold of the models to apply
cancer = sys.argv[1]
tcga_name = sys.argv[2]
dimension = int(sys.argv[3])
start = int(sys.argv[4])
end = int(sys.argv[5])

print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/'

#Read input data (PCA-transformed TCGA microarray expression)
#NOTE(review): Create_TCGA_Microarray_PCs.py writes '..._PCA_500L.tsv' while
#this script opens '..._PCA_1000L.tsv' - confirm which component count is intended
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_MICROARRAY_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)
print("TCGA expression dataframe ", input_df_test.shape)

#Encode the TCGA microarray data with the trained VAE encoder of every fold
for fold in range(start, end):
    print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load VAE models; the with-block closes the file even if reading fails
    with open( input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(input_folder + 'VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold) + '.h5')
    print("Loaded model from disk")

    #Define placeholder VAE model
    #NOTE(review): of the names defined in this section only batch_size is
    #used by the encoding step below; the layers are built but never applied
    original_dim = input_df_test.shape[1]
    intermediate1_dim = 100
    intermediate2_dim = 25
    latent_dim = dimension

    batch_size = 50
    epochs = 50
    learning_rate = 0.0005
    beta = K.variable(1)
    kappa = 0
    init_mode = 'glorot_uniform'

    x = Input(shape=(original_dim, ))

    net = Dense(intermediate1_dim, kernel_initializer=init_mode)(x)
    net2 = BatchNormalization()(net)
    net3 = Activation('relu')(net2)

    net4 = Dense(intermediate2_dim, kernel_initializer=init_mode)(net3)
    net5 = BatchNormalization()(net4)
    net6 = Activation('relu')(net5)

    z_mean = Dense(latent_dim, kernel_initializer=init_mode)(net6)
    z_log_var = Dense(latent_dim, kernel_initializer=init_mode)(net6)

    adam = optimizers.Adam(lr=learning_rate)

    #Encode test data using the VAE model and save one file per fold
    test_encoded = encoder.predict(input_df_test, batch_size = batch_size)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_MICROARRAY_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
102 |
103 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/COMPARING_RNASEQ_and_MICROARRAY/Preprocess_TCGA_Microarray_Expression.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for preprocessing TCGA microarray expression
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | import sklearn.preprocessing
9 |
10 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Preprocess TCGA microarray expression for one cancer type and save it as a TSV.

    The function reads '<tcga_type>.medianexp.txt', transposes it, imputes
    missing values with per-column means, keeps the samples whose
    second-to-last name character is '0', aligns the columns to the columns of
    the training matrix, standardizes every column and writes the result.

    Args:
        cancer_type: Folder name under '../../ALL_CANCER_FILES/' that holds the
            training matrix and receives the output.
        tcga_type: TCGA cohort name used in the input and output file names.

    Returns:
        None. The result is written to
        '<output_folder>TCGA_<tcga_type>_PREPROCESSED_MICROARRAY_EXPRESSION.tsv'.
    """
    import os  #Local import: os is not part of this file's import block

    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read TCGA microarray expression and put the samples in rows
    tcga_df = pd.read_csv('../TCGA_DATA/TCGA_MICROARRAY/' + tcga_type + '.medianexp.txt', sep = '\t', index_col= 0)
    tcga_df = tcga_df.transpose()
    #Drop the first column of the transposed table
    #NOTE(review): presumably a non-numeric annotation row of the raw file -- confirm
    tcga_df = tcga_df.iloc[:, 1:]
    tcga_df = tcga_df.astype(float)

    #Impute missing values with the column mean (0 when the column mean is itself missing)
    tcga_df = tcga_df.fillna(tcga_df.mean().fillna(0))
    print("TCGA expression ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))
    print("TCGA expression mean ", np.mean(tcga_df.values, axis = 0))
    print("TCGA expression mean ", len(np.mean(tcga_df.values, axis = 0)))
    print("TCGA expression std ", np.std(tcga_df.values, axis = 0))
    print("TCGA expression std ", len(np.std(tcga_df.values, axis = 0)))

    #Keep only the first 15 characters of every sample name
    new_index = [s[:15] for s in tcga_df.index]
    tcga_df = pd.DataFrame(tcga_df.values, index = new_index, columns = tcga_df.columns)
    print(tcga_df)

    #Eliminate normal samples: keep names whose second-to-last character is '0'
    print("Eliminating normal samples..")
    sample_codes = [s[-2:] for s in tcga_df.index]
    print("Sample codes ", np.unique(sample_codes))
    normal_codes = [s[-2] for s in tcga_df.index]
    cancer_samples = np.where(np.asarray(normal_codes) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)

    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Get only training genes from the expression data:
    #stack both tables, keep the training columns in training order,
    #then keep the trailing rows, which are the TCGA samples
    joined_df = pd.concat([data_df, tcga_df], join = 'outer')
    joined_df = joined_df[data_df.columns]
    joined_df = joined_df.iloc[-1 * tcga_df.shape[0]:, :]
    #Training columns absent from the TCGA table are all-NaN here and end up as 0
    joined_df = joined_df.fillna(joined_df.mean().fillna(0))
    print("TCGA expression ", joined_df.shape)

    #Standardize every column (sklearn.preprocessing.scale)
    normalized_data = sklearn.preprocessing.scale(joined_df.values)
    print("TCGA expression mean ", np.mean(normalized_data, axis = 0))
    print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0)))
    print("TCGA expression std ", np.std(normalized_data, axis = 0))
    print("TCGA expression std ", len(np.std(normalized_data, axis = 0)))

    #Rebuild the dataframe around the standardized values
    joined_df = pd.DataFrame(normalized_data, index = joined_df.index, columns = joined_df.columns)
    print("Final dataframe ", joined_df.shape)
    print('RANGE: ', (np.max(joined_df.values) - np.min(joined_df.values) ))

    #Record expression data; create the output folder first so the write
    #cannot fail only because the folder is missing
    os.makedirs(output_folder, exist_ok=True)
    joined_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_MICROARRAY_EXPRESSION.tsv', sep = '\t')
    print(joined_df)
72 |
#Script entry: take the cancer type and the TCGA cohort name from the command line
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type, tcga_type)
77 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_All_VAE_Embeddings.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
#Read cancer type and TCGA cohort name from the command line
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]

#Run the VAE encoding script once per latent dimension, with start=0 and end=100
dims = [5, 10, 25, 50, 75, 100]
for dim in dims:
    #run_line_magic replaces the deprecated InteractiveShell.magic();
    #the argument string passed to %run is unchanged
    run_args = "-i 'Encode_TCGA_Data_with_VAE.py' '" + cancer_type + "' " + tcga_type + " " + str(dim) + " " + str(0) + " " + str(100)
    get_ipython().run_line_magic("run", run_args)
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_DeepProfile_TCGA_Embeddings.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA RNA-Seq DeepProfile embeddings
3 | ###############################
4 |
5 | import pandas as pd
6 | import numpy as np
7 | import csv
8 | import sys
9 |
#Command-line arguments: cancer type folder and TCGA cohort name
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]

#Latent dimensions and number of runs per dimension of the VAE embeddings to load
dims = [5, 10, 25, 50, 75, 100]
run = 100

input_folder = '../../ALL_CANCER_FILES/{}/'.format(cancer_type)
output_folder = input_folder + 'TCGA_FILES/'

#Load every VAE embedding file and keep its value matrix
data_list = []
for dim in dims:
    for i in range(run):
        embedding_path = '{}TCGA_FILES/TCGA_{}_RNASeq_Expression_VAE_encoded_{}L_{}.tsv'.format(input_folder, tcga_type, dim, i)
        data_df = pd.read_table(embedding_path, index_col = 0)
        print("TCGA VAE embedding ", data_df.shape)
        data_list.append(data_df.values)

#Place all embeddings side by side along the column axis
joined_data = np.concatenate(data_list, axis=1)
print("Joined VAE embedding ",joined_data.shape)

#Load the ensemble label of every embedding column
L = 150
labels_path = '{}{}_TRAINING_DATA_kmeans_ENSEMBLE_LABELS_{}L.txt'.format(input_folder, cancer_type, L)
labels_df = pd.read_table(labels_path, header= None)
labels = labels_df.values
print("DeepProfile ensemble labels ", len(labels))

#Each ensemble node is the mean of the embedding columns that carry its label
ensemble_embeddings = np.zeros((joined_data.shape[0], L))
for label in range(L):
    indices = np.where(labels == label)[0]
    average_values = joined_data[:, indices].mean(axis = 1)
    ensemble_embeddings[:, label] = average_values

#Write the ensemble embedding, indexed by the sample names of the last file read
print(ensemble_embeddings.shape)
ensemble_embeddings_df = pd.DataFrame(ensemble_embeddings, index = data_df.index, columns = np.arange(L))
ensemble_embeddings_df.to_csv('{}{}_DeepProfile_TCGA_RNASeq_Embedding_{}L.tsv'.format(output_folder, tcga_type, L), sep = '\t')
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Create_TCGA_Rnaseq_PCs.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for recording top PCs of TCGA RNA-Seq data
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import PCA
9 | import sklearn.preprocessing
10 |
11 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type, n_components = 1000):
    """Project preprocessed TCGA RNA-Seq expression onto PCs fitted on the training data.

    A PCA model is fitted on the training matrix of `cancer_type` and then used
    to transform the preprocessed TCGA expression of `tcga_type`.

    Args:
        cancer_type: Folder name under '../../ALL_CANCER_FILES/' that holds the
            training matrix.
        tcga_type: TCGA cohort name used in the input and output file names.
        n_components: Number of principal components to keep. Defaults to 1000,
            the value that was previously hard-coded; the encoding scripts in
            this directory read the resulting '..._PCA_1000L.tsv' file.

    Returns:
        None. The encoded data is written to
        '<output_folder>TCGA_RNASEQ_<tcga_type>_PCA_<n_components>L.tsv'.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read training data
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Apply PCA to training data (np.nan_to_num makes every entry finite first)
    training_data = np.nan_to_num(data_df.values)

    #NOTE(review): no random_state is given; if sklearn picks a randomized
    #solver the components can differ between runs -- confirm this is acceptable
    pca = PCA(n_components = n_components)
    pca.fit(training_data)
    components = pca.components_
    print("PCA components ", components.shape)

    #Read TCGA RNA-Seq expression
    tcga_df = pd.read_table(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0)
    print("TCGA expression ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))

    #Encode test data using trained PCA model
    encoded_data = pca.transform(tcga_df.values)
    print("Encoded TCGA data ", encoded_data.shape)

    #Record the encoded data, indexed by the TCGA sample names
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    encoded_df.to_csv(output_folder + 'TCGA_RNASEQ_' + tcga_type + '_PCA_' + str(n_components) + 'L.tsv', sep = '\t')
43 |
44 |
#Script entry: take the cancer type and the TCGA cohort name from the command line
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type, tcga_type)
49 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_AE.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA RNA-Seq AE embeddings
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import math
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 | from keras.models import model_from_json
13 | from sklearn import preprocessing
14 |
15 | import tensorflow as tf
16 | from keras.layers import Input, Dense, Lambda, Layer, Activation
17 | from keras.layers.normalization import BatchNormalization
18 | from keras.models import Model
19 | from keras import backend as K
20 | from keras import metrics, optimizers
21 | from keras.callbacks import Callback
22 | import keras
23 |
24 | import csv
25 | import sys
26 |
#Prevent tensorflow from using all the memory
#NOTE(review): `sess` is never handed to Keras in this script (e.g. through
#K.set_session) -- confirm that allow_growth still takes effect as intended
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return metrics.mse between the input and its reconstruction."""
    return metrics.mse(x_input, x_decoded)

#Read user inputs
import sys
cancer = sys.argv[1]
tcga_name = sys.argv[2]
print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/'

#Folds to encode (start inclusive, end exclusive) and size of the encoding
start = 0
end = 10
dimension = 150

#Read TCGA RNA-Seq input data
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)
print("RNA-Seq expression dataframe ", input_df_test.shape)

#Encode test data with all 10 AE models
for fold in range(start, end):
    print("AE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load AE models: architecture from the .json file, weights from the .h5 file
    model_prefix = input_folder + 'AE_FILES/AE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)
    #The with-block closes the file even when reading fails
    with open(model_prefix + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(model_prefix + '.h5')
    print("Loaded model from disk")

    # Encode test data using the AE model
    test_encoded = encoder.predict(input_df_test)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    print("Encoded data ", test_encoded_df.shape)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_AE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_DAE.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA RNA-Seq DAE embeddings
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import math
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 | from keras.models import model_from_json
13 | from sklearn import preprocessing
14 |
15 | import tensorflow as tf
16 | from keras.layers import Input, Dense, Lambda, Layer, Activation
17 | from keras.layers.normalization import BatchNormalization
18 | from keras.models import Model
19 | from keras import backend as K
20 | from keras import metrics, optimizers
21 | from keras.callbacks import Callback
22 | import keras
23 |
24 | import csv
25 | import sys
26 |
#Prevent tensorflow from using all the memory
#NOTE(review): `sess` is never handed to Keras in this script (e.g. through
#K.set_session) -- confirm that allow_growth still takes effect as intended
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Define reconstruction loss
def reconstruction_loss(x_input, x_decoded):
    """Return metrics.mse between the input and its reconstruction."""
    return metrics.mse(x_input, x_decoded)

#Read user inputs
import sys
cancer = sys.argv[1]
tcga_name = sys.argv[2]
print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/'

#Folds to encode (start inclusive, end exclusive) and size of the encoding
start = 0
end = 10
dimension = 150

#Read TCGA RNA-Seq input data
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)
print("RNA-Seq expression dataframe ", input_df_test.shape)

#Encode test data with all 10 DAE models
for fold in range(start, end):
    print("DAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load DAE models: architecture from the .json file, weights from the .h5 file
    model_prefix = input_folder + 'DAE_FILES/DAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)
    #The with-block closes the file even when reading fails
    with open(model_prefix + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(model_prefix + '.h5')
    print("Loaded model from disk")

    # Encode test data using the DAE model
    test_encoded = encoder.predict(input_df_test)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    print("Encoded data ", test_encoded_df.shape)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_DAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_ICA.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA RNA-Seq ICA embeddings
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import FastICA
9 | import sklearn.preprocessing
10 | from scipy.stats.mstats import winsorize
11 |
#Command-line arguments: cancer type and TCGA cohort name
import sys
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]
print("CANCER NAME: " + cancer_type)
print("TEST NAME: " + tcga_type)

input_folder = '../../ALL_CANCER_FILES/{}/'.format(cancer_type)
output_folder = '{}TCGA_FILES/'.format(input_folder)

#Load the training matrix; np.nan_to_num makes every entry finite
data_df = pd.read_table('{}{}_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'.format(input_folder, cancer_type), sep = '\t', index_col=0)
print("Training data ", data_df.shape)
training_data = np.nan_to_num(data_df.values)

#Load the preprocessed TCGA RNA-Seq expression that will be encoded
tcga_df = pd.read_table('{}TCGA_FILES/TCGA_{}_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.format(input_folder, tcga_type), index_col= 0)
print("TCGA data ", tcga_df.shape)
test_data = tcga_df.values

#Fit 10 ICA models that differ only in their random seed
for run in range(10):
    #Fit on the training data
    ica = FastICA(n_components = 150, random_state = 12345 * run, tol=0.001, max_iter = 100000)
    print(ica)
    ica.fit(training_data)
    components = ica.components_
    print("ICA components ", components.shape)

    #Transform the TCGA data and write it under a 1-based run number
    encoded_data = ica.transform(test_data)
    print("Encoded TCGA data ", encoded_data.shape)
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    encoded_df.to_csv('{}TCGA_RNASEQ_{}_ICA_150L_{}.tsv'.format(output_folder, tcga_type, run + 1), sep = '\t')
47 |
48 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_PCA.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA RNA-Seq PCA embeddings
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.decomposition import PCA
9 | import sklearn.preprocessing
10 | from scipy.stats.mstats import winsorize
11 | import sys
12 |
#Command-line arguments: cancer type and TCGA cohort name
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]
print("CANCER NAME: " + cancer_type)
print("TEST NAME: " + tcga_type)

input_folder = '../../ALL_CANCER_FILES/{}/'.format(cancer_type)
output_folder = '{}TCGA_FILES/'.format(input_folder)

#Load the training matrix; np.nan_to_num makes every entry finite
data_df = pd.read_table('{}{}_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'.format(input_folder, cancer_type), sep = '\t', index_col=0)
print("Training data ", data_df.shape)
training_data = np.nan_to_num(data_df.values)

#Fit a 150-component PCA on the training data
pca = PCA(n_components = 150)
pca.fit(training_data)
components = pca.components_
print("PCA components ", components.shape)

#Load the preprocessed TCGA RNA-Seq expression
tcga_df = pd.read_table('{}/TCGA_{}_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.format(output_folder, tcga_type), index_col= 0)
print("TCGA data ", tcga_df.shape)

#Project the TCGA data onto the fitted components
test_data = tcga_df.values
encoded_data = pca.transform(test_data)
print("Encoded TCGA data ", encoded_data.shape)

#Write the encoded data, indexed by the TCGA sample names
encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
encoded_df.to_csv('{}/TCGA_RNASEQ_{}_PCA_150L.tsv'.format(output_folder, tcga_type), sep = '\t')
45 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_RP.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA RNA-Seq RP embeddings
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import csv
8 | from sklearn.random_projection import GaussianRandomProjection
9 | import sklearn.preprocessing
10 | from scipy.stats.mstats import winsorize
11 |
#Command-line arguments: cancer type and TCGA cohort name
import sys
cancer_type = sys.argv[1]
tcga_type = sys.argv[2]
print("CANCER NAME: " + cancer_type)
print("TEST NAME: " + tcga_type)

input_folder = '../../ALL_CANCER_FILES/{}/'.format(cancer_type)
output_folder = '{}TCGA_FILES/'.format(input_folder)

#Load the training matrix; np.nan_to_num makes every entry finite
data_df = pd.read_table('{}{}_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv'.format(input_folder, cancer_type), sep = '\t', index_col=0)
print("Training data ", data_df.shape)
training_data = np.nan_to_num(data_df.values)

#Load the preprocessed TCGA RNA-Seq expression that will be encoded
tcga_df = pd.read_table('{}TCGA_FILES/TCGA_{}_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.format(input_folder, tcga_type), index_col= 0)
print("TCGA data ", tcga_df.shape)
test_data = tcga_df.values

#Fit 10 random projections that differ only in their random seed
for run in range(10):
    #Fit on the training data
    transformer = GaussianRandomProjection(n_components = 150, random_state = run * 12345)
    transformer.fit(training_data)
    components = transformer.components_
    print("RP components ", components.shape)

    #Transform the TCGA data and write it under a 1-based run number
    encoded_data = transformer.transform(test_data)
    print("Encoded TCGA data ", encoded_data.shape)
    encoded_df = pd.DataFrame(encoded_data, index = tcga_df.index)
    encoded_df.to_csv('{}TCGA_RNASEQ_{}_RP_150L_{}.tsv'.format(output_folder, tcga_type, run + 1), sep = '\t')
46 |
47 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Encode_TCGA_Data_with_VAE.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for encoding TCGA RNA-Seq expression using VAE models
3 | ###############################
4 |
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 |
9 | import math
10 | from sklearn.metrics import mean_squared_error
11 | import matplotlib.pyplot as plt
12 |
13 | import tensorflow as tf
14 | from keras.layers import Input, Dense, Lambda, Layer, Activation
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.models import Model
17 | from keras import backend as K
18 | from keras import metrics, optimizers
19 | from keras.callbacks import Callback
20 | import keras
21 |
22 | import csv
23 | import sys
24 | from keras.models import model_from_json
25 | from sklearn import preprocessing
26 |
#Prevent tensorflow from using all the memory
#NOTE(review): `sess` is never handed to Keras in this script (e.g. through
#K.set_session) -- confirm that allow_growth still takes effect as intended
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

#Method for defining the VAE loss
def vae_loss(x_input, x_decoded):
    """Return the mean of the scaled reconstruction error plus beta times the KL term.

    NOTE(review): this function is not called anywhere in this script, and
    original_dim, z_mean, z_log_var and beta are not defined here -- calling it
    only works if the surrounding namespace provides them.
    """
    reconstruction_loss = original_dim * metrics.mse(x_input, x_decoded)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss))


#Read user inputs: cancer type, TCGA cohort, latent dimension and fold range
import sys
cancer = sys.argv[1]
tcga_name = sys.argv[2]
dimension = int(sys.argv[3])
start = int(sys.argv[4])
end = int(sys.argv[5])

print("CANCER NAME: " + cancer)
print("TEST NAME: " + tcga_name)

input_folder = '../../ALL_CANCER_FILES/' + cancer + '/'
output_folder = '../../ALL_CANCER_FILES/' + cancer + '/TCGA_FILES/'

#Read input data
input_df_test = pd.read_table(input_folder + 'TCGA_FILES/TCGA_RNASEQ_' + tcga_name + '_PCA_1000L.tsv', index_col = 0)
print("TCGA expression dataframe ", input_df_test.shape)

#Encode the TCGA data with every VAE fold in [start, end)
for fold in range(start, end):
    print("VAE model with " + str(dimension) + " nodes and fold " + str(fold))

    #Load VAE models: architecture from the .json file, weights from the .h5 file
    model_prefix = input_folder + '/VAE_FILES/VAE_' + cancer + '_encoder_' + str(dimension) + 'L_' + str(fold)
    #The with-block closes the file even when reading fails
    with open(model_prefix + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    encoder = model_from_json(loaded_model_json)

    encoder.load_weights(model_prefix + '.h5')
    print("Loaded model from disk")

    #Encode test data using the VAE model
    test_encoded = encoder.predict(input_df_test)
    test_encoded_df = pd.DataFrame(test_encoded, index = input_df_test.index)
    test_encoded_df.to_csv(output_folder + 'TCGA_' + tcga_name + '_RNASeq_Expression_VAE_encoded_' + str(dimension) + 'L_' + str(fold) + '.tsv', sep = '\t')
74 |
75 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Example_Run_All.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Example for training TCGA models for a cancer type
3 | ###############################
4 | import sys
5 |
#Scripts of the example pipeline, in execution order
pipeline_scripts = [
    ##STEP 1: Preprocessing Data
    "Preprocess_TCGA_Rnaseq_Expression.py",
    "Create_TCGA_Rnaseq_PCs.py",

    ##STEP 2: Encoding Expression with DeepProfile
    "Create_All_VAE_Embeddings.py",
    "Create_DeepProfile_TCGA_Embeddings.py",

    ##STEP 3: Encoding Expression with Competitor Models
    "Encode_TCGA_Data_with_PCA.py",
    "Encode_TCGA_Data_with_ICA.py",
    "Encode_TCGA_Data_with_RP.py",
    "Encode_TCGA_Data_with_AE.py",
    "Encode_TCGA_Data_with_DAE.py",
]

#Every script is run with %run -i and the arguments "BRCA BRCA";
#run_line_magic replaces the deprecated InteractiveShell.magic()
for pipeline_script in pipeline_scripts:
    get_ipython().run_line_magic("run", "-i " + pipeline_script + " BRCA BRCA")
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for preprocessing TCGA RNA-Seq expression
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import sklearn.preprocessing
8 |
9 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Preprocess TCGA RNA-Seq expression for one cancer type and save it as a TSV.

    The function reads the '<tcga_type>.uncv2.mRNAseq_RSEM_normalized_log2.txt'
    table, shortens the column names to the part before '|', drops '?' and
    duplicated names, keeps the samples whose second-to-last name character is
    '0', aligns the columns to the columns of the training matrix, standardizes
    every column and writes the result.

    Args:
        cancer_type: Folder name under '../../ALL_CANCER_FILES/' that holds the
            training matrix and receives the output.
        tcga_type: TCGA cohort name used in the input and output file names.

    Returns:
        None. The result is written to
        '<output_folder>TCGA_<tcga_type>_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.
    """
    import os  #Local import: os is not part of this file's import block

    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read TCGA RNA-Seq expression and put the samples in rows
    print("Reading expression...")
    tcga_df = pd.read_table('../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt', sep = '\t', index_col= 0)
    tcga_df = tcga_df.transpose()
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df.head())

    #Map to gene names: keep the text before the first '|'.
    #split() leaves a name without '|' untouched instead of raising ValueError
    print("Correcting gene names...")
    gene_names = [n.split('|', 1)[0] for n in tcga_df.columns]
    tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = gene_names)

    #Eliminate unknown genes ('?') and keep the first of any duplicated name
    tcga_df = tcga_df.iloc[:, tcga_df.columns.values != '?']
    tcga_df = tcga_df.loc[:,~tcga_df.columns.duplicated()]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df)

    #Eliminate normal samples: keep names whose second-to-last character is '0'
    print("Eliminating normal samples..")
    sample_codes = [s[-2:] for s in tcga_df.index]
    print("Sample codes ", np.unique(sample_codes))
    normal_codes = [s[-2] for s in tcga_df.index]
    cancer_samples = np.where(np.asarray(normal_codes) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)

    #Read training data
    print("Combining with training data...")
    data_df = pd.read_table(input_folder + cancer_type + '_DATA_TOP2_JOINED_BATCH_CORRECTED_CLEANED.tsv', sep = '\t', index_col=0)
    print("Training data ", data_df.shape)

    #Get only training genes from the expression data:
    #stack both tables, keep the training columns in training order,
    #then keep the trailing rows, which are the TCGA samples
    joined_df = pd.concat([data_df, tcga_df], join = 'outer')
    joined_df = joined_df[data_df.columns]
    joined_df = joined_df.iloc[-1 * tcga_df.shape[0]:, :]
    #Training columns absent from the TCGA table are all-NaN here and end up as 0
    joined_df = joined_df.fillna(joined_df.mean().fillna(0))
    print("TCGA expression ", joined_df.shape)
    print("TCGA expression ", joined_df.head())

    #Standardize every column (sklearn.preprocessing.scale)
    print("Standardizing the data...")
    normalized_data = sklearn.preprocessing.scale(joined_df.values)
    print("TCGA expression mean ", np.mean(normalized_data, axis = 0))
    print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0)))
    print("TCGA expression std ", np.std(normalized_data, axis = 0))
    print("TCGA expression std ", len(np.std(normalized_data, axis = 0)))

    #Rebuild the dataframe around the standardized values
    joined_df = pd.DataFrame(normalized_data, index = joined_df.index, columns = joined_df.columns)
    print("Final dataframe ", joined_df.shape)
    print('RANGE: ', (np.max(joined_df.values) - np.min(joined_df.values) ))

    #Record expression data; create the output folder first so the write
    #cannot fail only because the folder is missing
    os.makedirs(output_folder, exist_ok=True)
    joined_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
75 |
76 |
#Script entry: take the cancer type and the TCGA cohort name from the command line
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type, tcga_type)
81 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression_All_Genes.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for preprocessing TCGA RNA-Seq expression
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import sklearn.preprocessing
8 |
9 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Preprocess TCGA RNA-Seq expression over all genes and save it as a TSV.

    The function reads the '<tcga_type>.uncv2.mRNAseq_RSEM_normalized_log2.txt'
    table, shortens the column names to the part before '|', drops '?' and
    duplicated names, keeps the samples whose second-to-last name character is
    '0', standardizes every column, fills the remaining missing values with the
    column mean and writes the result. No training matrix is involved.

    Args:
        cancer_type: Folder name under '../../ALL_CANCER_FILES/' that receives
            the output.
        tcga_type: TCGA cohort name used in the input and output file names.

    Returns:
        None. The result is written to
        '<output_folder>TCGA_<tcga_type>_ALL_GENES_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.
    """
    import os  #Local import: os is not part of this file's import block

    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read TCGA RNA-Seq expression and put the samples in rows
    print("Reading expression...")
    tcga_df = pd.read_table('../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt', sep = '\t', index_col= 0)
    tcga_df = tcga_df.transpose()
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df.head())

    #Map to gene names: keep the text before the first '|'.
    #split() leaves a name without '|' untouched instead of raising ValueError
    print("Correcting gene names...")
    gene_names = [n.split('|', 1)[0] for n in tcga_df.columns]
    tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = gene_names)

    #Eliminate unknown genes ('?') and keep the first of any duplicated name
    tcga_df = tcga_df.iloc[:, tcga_df.columns.values != '?']
    tcga_df = tcga_df.loc[:,~tcga_df.columns.duplicated()]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df)

    #Eliminate normal samples: keep names whose second-to-last character is '0'
    print("Eliminating normal samples..")
    sample_codes = [s[-2:] for s in tcga_df.index]
    print("Sample codes ", np.unique(sample_codes))
    normal_codes = [s[-2] for s in tcga_df.index]
    cancer_samples = np.where(np.asarray(normal_codes) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)

    #Standardize every column (sklearn.preprocessing.scale).
    #Missing values are only filled afterwards, so the order of these two
    #steps is kept exactly as it was
    print("Standardizing the data...")
    normalized_data = sklearn.preprocessing.scale(tcga_df.values)
    print("TCGA expression mean ", np.mean(normalized_data, axis = 0))
    print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0)))
    print("TCGA expression std ", np.std(normalized_data, axis = 0))
    print("TCGA expression std ", len(np.std(normalized_data, axis = 0)))

    #Rebuild the dataframe and fill missing values with the column mean
    #(0 when the column mean is itself missing)
    tcga_df = pd.DataFrame(normalized_data, index = tcga_df.index, columns = tcga_df.columns)
    tcga_df = tcga_df.fillna(tcga_df.mean().fillna(0))
    print("Final dataframe ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))

    #Record expression data; create the output folder first so the write
    #cannot fail only because the folder is missing
    os.makedirs(output_folder, exist_ok=True)
    tcga_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_ALL_GENES_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
    print(tcga_df)
64 |
#Script entry: take the cancer type and the TCGA cohort name from the command line
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type, tcga_type)
69 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_EMBEDDINGS/Preprocess_TCGA_Rnaseq_Expression_All_Genes_Uncorrected.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for preprocessing TCGA RNA-Seq expression
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import sklearn.preprocessing
8 |
9 | #Define method for preprocessing data
def create_data(cancer_type, tcga_type):
    """Save TCGA RNA-Seq expression over all genes without standardizing it.

    The function reads the '<tcga_type>.uncv2.mRNAseq_RSEM_normalized_log2.txt'
    table, shortens the column names to the part before '|', drops '?' and
    duplicated names, keeps the samples whose second-to-last name character is
    '0', fills missing values with the column mean and writes the result. The
    expression values themselves are not rescaled.

    Args:
        cancer_type: Folder name under '../../ALL_CANCER_FILES/' that receives
            the output.
        tcga_type: TCGA cohort name used in the input and output file names.

    Returns:
        None. The result is written to
        '<output_folder>TCGA_<tcga_type>_ALL_GENES_NOT_PREPROCESSED_RNASEQ_EXPRESSION.tsv'.
    """
    import os  #Local import: os is not part of this file's import block

    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read TCGA RNA-Seq expression and put the samples in rows
    print("Reading expression...")
    tcga_df = pd.read_table('../TCGA_DATA/TCGA_RNASEQ/' + tcga_type + '.uncv2.mRNAseq_RSEM_normalized_log2.txt', sep = '\t', index_col= 0)
    tcga_df = tcga_df.transpose()
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df.head())

    #Map to gene names: keep the text before the first '|'.
    #split() leaves a name without '|' untouched instead of raising ValueError
    print("Correcting gene names...")
    gene_names = [n.split('|', 1)[0] for n in tcga_df.columns]
    tcga_df = pd.DataFrame(tcga_df.values, index = tcga_df.index, columns = gene_names)

    #Eliminate unknown genes ('?') and keep the first of any duplicated name
    tcga_df = tcga_df.iloc[:, tcga_df.columns.values != '?']
    tcga_df = tcga_df.loc[:,~tcga_df.columns.duplicated()]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression ", tcga_df)

    #Eliminate normal samples: keep names whose second-to-last character is '0'
    print("Eliminating normal samples..")
    sample_codes = [s[-2:] for s in tcga_df.index]
    print("Sample codes ", np.unique(sample_codes))
    normal_codes = [s[-2] for s in tcga_df.index]
    cancer_samples = np.where(np.asarray(normal_codes) == '0')[0]
    print("Total number of samples ", len(tcga_df.index))
    print("Total number of cancer samples ", len(cancer_samples))
    tcga_df = tcga_df.iloc[cancer_samples, :]
    print("TCGA expression ", tcga_df.shape)
    print("TCGA expression cancer samples ", tcga_df.index)

    #The values are deliberately left unscaled in this variant; the message
    #now says so instead of claiming that the data is being standardized
    print("Skipping standardization (values are left uncorrected)...")
    normalized_data = tcga_df.values
    print("TCGA expression mean ", np.mean(normalized_data, axis = 0))
    print("TCGA expression mean ", len(np.mean(normalized_data, axis = 0)))
    print("TCGA expression std ", np.std(normalized_data, axis = 0))
    print("TCGA expression std ", len(np.std(normalized_data, axis = 0)))

    #Rebuild the dataframe and fill missing values with the column mean
    #(0 when the column mean is itself missing)
    tcga_df = pd.DataFrame(normalized_data, index = tcga_df.index, columns = tcga_df.columns)
    tcga_df = tcga_df.fillna(tcga_df.mean().fillna(0))
    print("Final dataframe ", tcga_df.shape)
    print('RANGE: ', (np.max(tcga_df.values) - np.min(tcga_df.values) ))

    #Record expression data; create the output folder first so the write
    #cannot fail only because the folder is missing
    os.makedirs(output_folder, exist_ok=True)
    tcga_df.to_csv(output_folder + 'TCGA_' + tcga_type + '_ALL_GENES_NOT_PREPROCESSED_RNASEQ_EXPRESSION.tsv', sep = '\t')
    print(tcga_df)
64 |
#Command-line entry point: argv[1] is the cancer folder name, argv[2] the TCGA type
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]
create_data(cancer_type, tcga_type)
69 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_Joined_Survival_Dataframes.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating joined TCGA survival dataframes and DeepProfile embeddings
3 | ###############################
4 |
5 | import pandas as pd
6 | import numpy as np
7 | import sys
8 |
def createJoinedDf(tcga_type, cancer_type):
    """Join a DeepProfile embedding with the survival table of one TCGA type.

    Both inputs are read from, and the result is written to,
    ``../../ALL_CANCER_FILES/<cancer_type>/TCGA_FILES/``.

    * ``TCGA_<tcga_type>_Survival_df.tsv`` (comma-separated) is cast to float;
      rows with any NaN or with ``Survival_in_days <= 0`` are dropped, the two
      columns are renamed to ``fustat`` / ``futime`` and the index is upper-cased.
    * ``<tcga_type>_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv`` gets its index
      truncated to the first 12 characters and its columns renamed
      ``Node 1 .. Node k``, where k is the number of columns actually present
      (previously hard-coded to 150, which failed for any other width).

    The two tables are inner-joined on the index, sorted by index, reduced to
    one row per index value and saved as
    ``DeepProfile_Embedding_and_<tcga_type>_Survival_df.tsv`` (tab-separated).

    Parameters
    ----------
    tcga_type : str
        TCGA code used in the file names.
    cancer_type : str
        Name of the cancer folder under ``ALL_CANCER_FILES``.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read survival dataframe
    surv_df = pd.read_table(input_folder + 'TCGA_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = ',')
    surv_df = surv_df.astype(float)
    print("Survival dataframe ", surv_df.shape)

    #Drop samples with missing values or a non-positive survival time
    has_nan = np.isnan(surv_df.values).any(axis = 1)
    non_positive_time = surv_df['Survival_in_days'].values <= 0
    surv_df = surv_df.drop(surv_df.index[has_nan | non_positive_time])
    surv_df.columns = ['fustat', 'futime']
    print("Survival dataframe ", surv_df.shape)

    #Read DeepProfile embedding
    data_df = pd.read_table(input_folder + tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv', index_col = 0)
    print("DeepProfile embedding ", data_df.shape)

    print("Surv samples ", surv_df.index)
    print("Data samples ", data_df.index)

    #Match sample indices: upper-case survival ids, 12-character embedding ids
    surv_df.index = [s.upper() for s in surv_df.index]
    data_df.columns = ['Node ' + str(i) for i in range(1, data_df.shape[1] + 1)]
    data_df.index = [s[:12] for s in data_df.index]

    print("Surv samples ", surv_df.index)
    print("Data samples ", data_df.index)

    #Create joined dataframe from the samples available in both datasets
    joined_df = data_df.merge(surv_df, left_index=True, right_index=True)
    joined_df = joined_df.sort_index()
    joined_df = joined_df.loc[~joined_df.index.duplicated(keep='first')]
    print("Joined dataframe ", joined_df.shape)
    print(joined_df)
    joined_df.to_csv(output_folder + '/DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', sep = '\t')
60 |
#Command-line entry point: argv[1] is the cancer folder name, argv[2] the TCGA type
cancer_type, tcga_type = sys.argv[1], sys.argv[2]

#Note the argument order: the TCGA type comes first in createJoinedDf
createJoinedDf(tcga_type, cancer_type)
66 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_Joined_Survival_Dataframes_Cancer_Types.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating joined cancer types TCGA survival dataframes
3 | ###############################
4 |
5 | import pandas as pd
6 | import numpy as np
7 |
#Method for combining datasets
def create_Data(cancer):
    """Stack the embedding+survival tables of every subtype of *cancer*.

    The subtypes are looked up in the module-level ``test_cases`` list at the
    position where ``cancer`` occurs in ``cancer_types``. Each subtype file
    ``DeepProfile_Embedding_and_<subtype>_Survival_df.tsv`` is read from
    ``../../ALL_CANCER_FILES/<cancer>/TCGA_FILES/``, the tables are
    concatenated row-wise and written back to the same folder as
    ``DeepProfile_Embedding_and_<cancer>_Survival_df.tsv`` (tab-separated).
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'

    #Position of this cancer type in the module-level list
    matches = np.where(np.asarray(cancer_types) == cancer)[0]
    position = matches[0]

    subtype_frames = []
    for test in test_cases[position]:
        print("TCGA type ", test)
        subtype_path = input_folder + '/DeepProfile_Embedding_and_' + test + '_Survival_df.tsv'
        subtype_df = pd.read_table(subtype_path, sep = '\t', index_col = 0)
        print("Survival dataframe ", subtype_df.shape)
        subtype_frames.append(subtype_df)

    #Combine dataframes
    joined_df = pd.concat(subtype_frames)
    print("Joined survival dataframe ", joined_df.shape)
    joined_df.to_csv(output_folder + '/DeepProfile_Embedding_and_' + cancer + '_Survival_df.tsv', sep = '\t')
26 |
#Cancer types to process and, at the same position, the TCGA subtypes each one combines
cancer_types = ['LUNG']
test_cases = [ ['LUAD', 'LUSC']]

for cancer_name in cancer_types:
    print("Cancer type ", cancer_name)
    create_Data(cancer_name)
33 |
#Method for combining survival dataframes
def create_Data(cancer):
    """Stack the survival tables of every subtype of *cancer*.

    This definition replaces the earlier ``create_Data`` in this module from
    this point on. The subtypes are looked up in the module-level
    ``test_cases`` list at the position where ``cancer`` occurs in
    ``cancer_types``. Each ``TCGA_<subtype>_Survival_df.tsv`` is read as a
    comma-separated file from ``../../ALL_CANCER_FILES/<cancer>/TCGA_FILES/``;
    the concatenation is written to ``TCGA_<cancer>_Survival_df.tsv``.

    NOTE(review): the inputs are read with ``sep=','`` but the combined file
    is written with ``sep='\\t'`` under the same file-name pattern; a reader
    expecting commas will not parse it. Confirm which separator is intended.
    """
    input_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'
    output_folder = '../../ALL_CANCER_FILES/' + cancer + '/' + 'TCGA_FILES/'

    #Position of this cancer type in the module-level list
    matches = np.where(np.asarray(cancer_types) == cancer)[0]
    position = matches[0]

    subtype_frames = []
    for test in test_cases[position]:
        print("TCGA type ", test)
        subtype_path = input_folder + 'TCGA_' + test + '_Survival_df.tsv'
        subtype_df = pd.read_table(subtype_path, sep = ',', index_col = 0)
        print("Survival dataframe ", subtype_df.shape)
        subtype_frames.append(subtype_df)

    #Combine dataframes
    joined_df = pd.concat(subtype_frames)
    print("Joined survival dataframe ", joined_df.shape)
    joined_df.to_csv(output_folder + 'TCGA_' + cancer + '_Survival_df.tsv', sep = '\t')
52 |
#Cancer types to process and, at the same position, the TCGA subtypes each one combines
cancer_types = ['LUNG']
test_cases = [ ['LUAD', 'LUSC']]

for cancer_name in cancer_types:
    print("Cancer type ", cancer_name)
    create_Data(cancer_name)
59 |
60 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/CREATE_SURVIVAL_DATAFRAMES/Create_TCGA_Survival_Dataframes.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for creating TCGA survival dataframes
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import math
8 |
#Method for defining the survival dataframe
def createSurvivalDF(cancer_type, tcga_type):
    """Build a (Status, Survival_in_days) table from a TCGA clinical file.

    Reads ``../TCGA_DATA/TCGA_CLINICAL_DATA/<tcga_type>.clin.merged.picked.txt``
    and transposes it so that each row is one record of the clinical table.
    For every row, ``vital_status`` is converted with ``int()``:

    * 0      -> ``Status`` False, time taken from ``days_to_last_followup``
    * other  -> ``Status`` True,  time taken from ``days_to_death``

    The result is written (comma-separated, pandas default) to
    ``../../ALL_CANCER_FILES/<cancer_type>/TCGA_FILES/TCGA_<tcga_type>_Survival_df.tsv``.
    """
    clinical_path = '../TCGA_DATA/TCGA_CLINICAL_DATA/' + tcga_type + '.clin.merged.picked.txt'
    output_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'

    #Read clinical data
    survival_df = pd.read_table(clinical_path, index_col = 0).transpose()
    print("TCGA clinical dataframe ", survival_df.shape)
    print("TCGA clinical dataframe ", survival_df.columns)

    #Extract vital status, days to death, and days to follow up
    vital_values = survival_df['vital_status'].values
    death_days = survival_df['days_to_death'].values
    followup_days = survival_df['days_to_last_followup'].values

    #Status is True unless vital_status converts to 0; pick the matching time column
    deceased = [int(flag) != 0 for flag in vital_values]
    days = [to_death if dead else to_followup
            for dead, to_death, to_followup in zip(deceased, death_days, followup_days)]

    #Create joined dataframe
    joined_df = pd.concat(
        [pd.DataFrame(deceased, index = survival_df.index, columns = ['Status']),
         pd.DataFrame(days, index = survival_df.index, columns = ['Survival_in_days'])],
        axis = 1)
    print("TCGA survival dataframe ", joined_df)
    joined_df.to_csv(output_folder + '/TCGA_' + tcga_type + '_Survival_df.tsv')
44 |
45 |
#Command-line entry point: argv[1] is the cancer folder name, argv[2] the TCGA type
import sys
cancer_type, tcga_type = sys.argv[1], sys.argv[2]

createSurvivalDF(cancer_type, tcga_type)
51 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for predicting survival status of patients
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | from scipy import stats
8 | from sklearn import metrics
9 | import random
10 |
11 | from sklearn.metrics import roc_auc_score
12 | from sklearn.model_selection import GridSearchCV
13 | from sklearn.model_selection import LeaveOneOut
14 | from sklearn.model_selection import KFold
15 | from sklearn import linear_model
16 | from sklearn.linear_model import LogisticRegression
17 | from sklearn.metrics import average_precision_score
18 | from sklearn.metrics import accuracy_score
19 | from sklearn.preprocessing import StandardScaler
20 | from sklearn.metrics import roc_curve, auc
21 |
#Define method for training models
def trait_classification_accuracy(X, Y):
    """Score an L1-penalised logistic regression with 20-fold cross-validation.

    The rows of ``X`` / ``Y`` are split by a shuffled ``KFold(20)`` with a
    fixed random state. In every fold the features are standardised with a
    scaler fitted on the training rows only, ``C`` is chosen by a 5-fold
    ``GridSearchCV`` on ROC-AUC, and the held-out predictions plus the second
    column of ``predict_proba`` are stored at the held-out positions.

    Returns
    -------
    list
        ``[accuracy, roc_auc]`` computed over all held-out predictions.
    """
    n_rows = X.shape[0]
    held_out_labels = np.zeros(n_rows)
    held_out_scores = np.zeros(n_rows)

    #Candidate regularisation strengths for the inner grid search
    c_grid = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]

    #Do cross validation
    splitter = KFold(20, shuffle = True, random_state = 123456)
    for train_rows, test_rows in splitter.split(X):
        #Normalize with statistics of the training rows only
        scaler = StandardScaler().fit(X[train_rows])
        train_std = scaler.transform(X[train_rows])
        test_std = scaler.transform(X[test_rows])

        #Tune parameters
        base_model = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000,
                                        solver = 'liblinear')
        search = GridSearchCV(base_model, c_grid, cv = 5, scoring='roc_auc', n_jobs = -1)
        search.fit(train_std, Y[train_rows])

        #Record predictions and probabilities
        held_out_labels[test_rows] = search.predict(test_std)
        held_out_scores[test_rows] = search.predict_proba(test_std)[:, 1]

    #Calculate accuracy and ROC-AUC
    return [accuracy_score(Y, held_out_labels), roc_auc_score(Y, held_out_scores)]
64 |
#Define method for predicting survival
def predict_survival(cancer_type, tcga_type, method, run_index, seed):
    """Run one balanced-subsample survival classification for an embedding.

    Loads the embedding produced by ``method`` from the module-level
    ``input_folder``, aligns it with the module-level label series ``Y_df``
    on the first 12 characters of the sample id, down-samples the larger
    class to the size of the smaller one (seeded with ``12345 * seed``) and
    scores the result with ``trait_classification_accuracy``.

    Fixes over the previous version:

    * an unsupported ``method`` raises ``ValueError`` instead of failing later
      with an unbound ``X_df``;
    * ids that repeat after truncation are reduced to their first row, so the
      row positions derived from the labels always refer to the same samples
      in the feature table;
    * when both classes have the same size, ``argmin`` and ``argmax`` both
      return 0, which used to select class 0 twice and drop class 1 entirely;
      the "high" class is now always the other class.

    Parameters
    ----------
    cancer_type : str
        Unused here; kept for interface compatibility.
    tcga_type : str
        TCGA code used in the embedding file name.
    method : str
        One of 'PCA', 'ICA', 'RP', 'AE', 'DAE', 'VAE', 'DeepProfile'.
        'VAE' additionally reads the module-level ``VAE_dim``.
    run_index : int
        Run number used in the embedding file name (ignored for PCA and DeepProfile).
    seed : int
        Sub-sampling seed multiplier.

    Returns
    -------
    list
        ``[accuracy, roc_auc]`` as returned by ``trait_classification_accuracy``.
    """
    if method == 'PCA':
        file_name = 'TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv'
    elif method in ('ICA', 'RP'):
        file_name = 'TCGA_RNASEQ_' + tcga_type + '_' + method + '_150L_' + str(run_index) + '.tsv'
    elif method in ('AE', 'DAE'):
        file_name = 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_150L_' + str(run_index) + '.tsv'
    elif method == 'VAE':
        file_name = 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_' + VAE_dim + 'L_' + str(run_index) + '.tsv'
    elif method == 'DeepProfile':
        file_name = tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv'
    else:
        raise ValueError("Unknown method: " + str(method))

    #Read expression data
    X_df = pd.read_table(input_folder + file_name, index_col = 0, sep = '\t')
    print("Expression data ", X_df.shape)
    print("Expression data ", X_df.index)

    #Now, replace X_df index to match with Y_df index (first 12 characters)
    X_df.index = [str(sample)[:12] for sample in X_df.index]

    #Keep one row per id so that features and labels stay aligned by position
    X_df = X_df[~X_df.index.duplicated(keep='first')]
    labels = Y_df[~Y_df.index.duplicated(keep='first')]

    #Take intersecting samples in datasets (np.intersect1d returns them sorted)
    intersecting_samples = np.intersect1d(X_df.index, labels.index)
    subX_df = X_df.loc[intersecting_samples]
    subY_df = labels.loc[intersecting_samples]

    print("X dataframe ", subX_df.shape)
    print("Y dataframe ", subY_df.shape)

    print("X dataframe ", subX_df.index)
    print("Y dataframe ", subY_df.index)

    label_values = np.ravel(subY_df.values)
    sample_indices = [np.flatnonzero(label_values == 0), np.flatnonzero(label_values == 1)]
    sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
    print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])

    #Now select the class with highest number of samples and subsample.
    #high_class is defined as "the other class" so that a tie cannot pick class 0 twice.
    low_class = int(np.argmin(sample_counts))
    high_class = 1 - low_class
    print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
    random.seed(12345 * seed)
    random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
    selected_indices = np.sort(sample_indices[high_class][random_indices])

    subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
    subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])
    subX_df = subX_df.sort_index()
    subY_df = subY_df.sort_index()

    results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))

    return results
131 |
#Read user inputs
import os
import sys
cancer_type = sys.argv[1] #main cancer type
tcga_type = sys.argv[2] #TCGA type
method = sys.argv[3] #name of the method
run_index = int(sys.argv[4]) #run index
#Latent dimension, only needed (and only used in file names) for the VAE method
VAE_dim = sys.argv[5] if len(sys.argv) > 5 else None
if method == 'VAE' and VAE_dim is None:
    sys.exit("Method 'VAE' needs the latent dimension as fifth argument")

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
output_folder = 'Prediction_Results/'

Y_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = '\t')
print("Survival dataframe ", Y_df.shape)

print("ALIVE..")
print( Y_df[Y_df['fustat'] == 0]['futime'])
print( np.mean(Y_df[Y_df['fustat'] == 0]['futime'].values))

five_years = 5 * 365

#Select all dead patients, only if they died within 5 years.
#Boolean masks over the full table are used on purpose: the previous code took
#row positions inside the dead-only subset and applied them to the full table,
#which selected unrelated patients.
died_within_5y = (Y_df['fustat'] == 1) & (Y_df['futime'] < five_years)
print("Dead within 5 year ", Y_df.loc[died_within_5y, ['fustat', 'futime']])
print("Dead within 5 year ", np.max(Y_df.loc[died_within_5y, 'futime']))

#Select all patients followed for more than 5 years.
#NOTE(review): this mask does not look at fustat, so a patient who died after
#the 5-year mark is kept with label 1; confirm that this is intended.
followed_beyond_5y = Y_df['futime'] > five_years
print("Alive after 5 year ", Y_df.loc[followed_beyond_5y, ['fustat', 'futime']])
print("Alive after 5 year ", np.min(Y_df.loc[followed_beyond_5y, 'futime']))

#Keep only the label column of the selected patients (original row order)
Y_df = Y_df.loc[died_within_5y | followed_beyond_5y, 'fustat']
Y_df = Y_df.dropna()
print("Survival dataframe \n ", Y_df)

class0_count = len(np.where(Y_df.values == 0)[0])
class1_count = len(np.where(Y_df.values == 1)[0])

all_accuracies = []
all_aucs = []

#Repeat the balanced sub-sampling with 50 different seeds
for sampling_index in range(50):
    result = predict_survival(cancer_type, tcga_type, method, run_index, sampling_index)
    print("Accuracy: ", result[0])
    print("ROC-AUC: ", result[1])
    all_accuracies.append(result[0])
    all_aucs.append(result[1])

print("FINAL RESULTS...")
print("MEAN ACCURACY ", np.mean(np.asarray(all_accuracies)))
print("MEAN ROC-AUC ", np.mean(np.asarray(all_aucs)))

#Save results to a file; create the folder first so a long run is not lost at the end
results_folder = output_folder + cancer_type
os.makedirs(results_folder, exist_ok=True)
result_prefix = results_folder + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + tcga_type + '_' + method + '_'
if method == 'VAE':
    result_prefix = result_prefix + VAE_dim + 'L_'
result_prefix = result_prefix + str(run_index)

np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
196 |
197 |
198 |
199 |
200 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival_Raw_Data.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Predicting survival status of patients using raw gene data
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | from scipy import stats
8 | from sklearn import metrics
9 | import random
10 |
11 | from sklearn.metrics import roc_auc_score
12 | from sklearn.model_selection import GridSearchCV
13 | from sklearn.model_selection import LeaveOneOut
14 | from sklearn.model_selection import KFold
15 | from sklearn import linear_model
16 | from sklearn.linear_model import LogisticRegression
17 | from sklearn.metrics import average_precision_score
18 | from sklearn.metrics import accuracy_score
19 | from sklearn.preprocessing import StandardScaler
20 | from sklearn.metrics import roc_curve, auc
21 |
#Define method for training models
def trait_classification_accuracy(X, Y):
    """Score an L1-penalised logistic regression with 20-fold cross-validation.

    A shuffled ``KFold(20)`` with a fixed random state splits the rows. Per
    fold, a ``StandardScaler`` fitted on the training rows transforms both
    parts, a 5-fold ``GridSearchCV`` (ROC-AUC) selects ``C``, and the held-out
    predictions and the second ``predict_proba`` column are written into
    arrays covering all rows.

    Returns
    -------
    list
        ``[accuracy, roc_auc]`` over the held-out predictions of all folds.
    """
    total = X.shape[0]
    predicted = np.zeros(total)
    scores = np.zeros(total)

    #Candidate regularisation strengths for the inner grid search
    grid = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]

    #Do cross validation
    folds = KFold(20, shuffle = True, random_state = 123456)
    for fit_rows, eval_rows in folds.split(X):
        #Normalize with statistics of the training rows only
        scaler = StandardScaler()
        scaler.fit(X[fit_rows])
        fit_std = scaler.transform(X[fit_rows])
        eval_std = scaler.transform(X[eval_rows])

        #Tune parameters
        estimator = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000,
                                       solver = 'liblinear')
        tuner = GridSearchCV(estimator, grid, cv = 5, scoring='roc_auc', n_jobs = -1)
        tuner.fit(fit_std, Y[fit_rows])

        #Record predictions and probabilities
        predicted[eval_rows] = tuner.predict(eval_std)
        scores[eval_rows] = tuner.predict_proba(eval_std)[:, 1]

    #Calculate accuracy and ROC-AUC
    accuracy = accuracy_score(Y, predicted)
    auc_value = roc_auc_score(Y, scores)
    return [accuracy, auc_value]
64 |
#Define method for predicting survival
def predict_survival(X_inp,Y_inp,cancer_type, tcga_type, seed):
    """Run one balanced-subsample survival classification on given data.

    ``X_inp`` and ``Y_inp`` are used row by row (the row positions found in
    the labels are applied to the features), so both must describe the same
    samples in the same order. The larger class is down-sampled to the size
    of the smaller one with ``random`` seeded by ``12345 * seed``; the result
    is sorted by index and scored with ``trait_classification_accuracy``.

    Fix over the previous version: with equally sized classes ``argmin`` and
    ``argmax`` both return 0, which selected class 0 twice and dropped class 1;
    the "high" class is now always the other class.

    Parameters
    ----------
    X_inp : pandas.DataFrame
        Feature rows.
    Y_inp : pandas.Series or pandas.DataFrame
        Labels, compared against 0 and 1.
    cancer_type, tcga_type : str
        Unused here; kept for interface compatibility.
    seed : int
        Sub-sampling seed multiplier.

    Returns
    -------
    list
        ``[accuracy, roc_auc]`` as returned by ``trait_classification_accuracy``.
    """
    subX_df = X_inp
    subY_df = Y_inp

    print("X dataframe ", subX_df.shape)
    print("Y dataframe ", subY_df.shape)

    print("X dataframe ", subX_df.index)
    print("Y dataframe ", subY_df.index)

    sample_indices = [np.where(subY_df.values == False)[0], np.where(subY_df.values == True)[0]]
    sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
    print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])

    #Now select the class with highest number of samples and subsample.
    #high_class is defined as "the other class" so that a tie cannot pick class 0 twice.
    low_class = int(np.argmin(sample_counts))
    high_class = 1 - low_class
    print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
    random.seed(12345 * seed)
    random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
    selected_indices = np.sort(sample_indices[high_class][random_indices])

    subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
    subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])
    subX_df = subX_df.sort_index()
    subY_df = subY_df.sort_index()

    results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))

    return results
100 |
#Read user inputs
import os

run_index = 0

#Folder name of each cancer type -> TCGA code used in its file names
CANCER_TO_TCGA = {
    'BRCA': 'BRCA', 'AML': 'LAML',
    'COLON': 'COADREAD',
    'BRAIN': 'GBMLGG', 'OV': 'OV',
    'SARCOMA': 'SARC', 'KIDNEY': 'KIPAN',
    'LIVER': 'LIHC', 'STOMACH': 'STAD',
    'SKIN': 'SKCM', 'UTERINE': 'UCEC',
    'HEAD_NECK': 'HNSC', 'PANCREAS': 'PAAD',
    'CERVICAL': 'CESC', 'BLADDER': 'BLCA', 'LUNG': 'LUNG',
}

#Only a subset of the cancer types in CANCER_TO_TCGA is processed here
for cancer_type in ['HEAD_NECK', 'PANCREAS',
                    'CERVICAL', 'BLADDER', 'LUNG']:

    input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
    output_folder = 'Prediction_Results/'

    if cancer_type == 'LUNG':
        #LUNG labels are the concatenation of its two subtypes
        tcga_types = ['LUAD', 'LUSC']
        df_list = []
        for tcga_type in tcga_types:
            Y_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv',
                                 index_col = 0, sep = '\t')
            print("Survival dataframe ", Y_df.shape)
            df_list.append(Y_df)

        Y_df = pd.concat(df_list, axis = 0)
        #NOTE(review): as in the previous version, the expression table below is
        #loaded for the last subtype only (LUSC), so LUAD patients present in
        #Y_df have no expression rows. Confirm whether both subtype tables
        #should be concatenated instead.
        tcga_type = tcga_types[-1]
    else:
        tcga_type = CANCER_TO_TCGA[cancer_type]
        Y_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv',
                             index_col = 0, sep = '\t')
    print("Survival dataframe ", Y_df.shape)

    print("ALIVE..")
    print( Y_df[Y_df['fustat'] == 0]['futime'])
    print( np.mean(Y_df[Y_df['fustat'] == 0]['futime'].values))

    five_years = 5 * 365

    #Select all dead patients, only if they died within 5 years.
    #Boolean masks over the full table are used on purpose: the previous code took
    #row positions inside the dead-only subset and applied them to the full table,
    #which selected unrelated patients.
    died_within_5y = (Y_df['fustat'] == 1) & (Y_df['futime'] < five_years)
    print("Dead within 5 year ", Y_df.loc[died_within_5y, ['fustat', 'futime']])
    print("Dead within 5 year ", np.max(Y_df.loc[died_within_5y, 'futime']))

    #Select all patients followed for more than 5 years.
    #NOTE(review): this mask does not look at fustat, so a patient who died after
    #the 5-year mark is kept with label 1; confirm that this is intended.
    followed_beyond_5y = Y_df['futime'] > five_years
    print("Alive after 5 year ", Y_df.loc[followed_beyond_5y, ['fustat', 'futime']])
    print("Alive after 5 year ", np.min(Y_df.loc[followed_beyond_5y, 'futime']))

    #Keep only the label column of the selected patients (original row order)
    Y_df = Y_df.loc[died_within_5y | followed_beyond_5y, 'fustat']
    Y_df = Y_df.dropna()
    print("Survival dataframe \n ", Y_df)

    #Load the expression table and take the rows of the labelled patients
    raw_data_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/' + 'TCGA_FILES/'
    tcga_df = pd.read_table(raw_data_folder + 'TCGA_' + tcga_type + '_PREPROCESSED_RNASEQ_EXPRESSION.tsv', index_col= 0)
    #Now, replace X_df index to match with Y_df index (first 12 characters)
    tcga_df.index = [sample[:12] for sample in tcga_df.index]
    tcga_labeled_df = tcga_df.loc[Y_df.index,:]
    tcga_labeled_df = tcga_labeled_df[~tcga_labeled_df.index.duplicated(keep='first')]

    class0_count = len(np.where(Y_df.values == 0)[0])
    class1_count = len(np.where(Y_df.values == 1)[0])

    all_accuracies = []
    all_aucs = []

    #Repeat the balanced sub-sampling with 50 different seeds
    for sampling_index in range(50):
        result = predict_survival(tcga_labeled_df, Y_df, cancer_type, tcga_type, sampling_index)
        print("Accuracy: ", result[0])
        print("ROC-AUC: ", result[1])
        all_accuracies.append(result[0])
        all_aucs.append(result[1])

    #Save results; create the folder first so a long run is not lost at the end
    method = 'RAW'
    results_folder = output_folder + cancer_type
    os.makedirs(results_folder, exist_ok=True)
    result_prefix = results_folder + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_' + str(run_index)
    np.savetxt(result_prefix + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
    np.savetxt(result_prefix + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
203 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Predict_Survival_Subtypes_Joined.py:
--------------------------------------------------------------------------------
1 | ###############################
2 | #Script for predicting survival status of patients
3 | ###############################
4 |
5 | import numpy as np
6 | import pandas as pd
7 | from scipy import stats
8 | from sklearn import metrics
9 | import random
10 |
11 | from sklearn.metrics import roc_auc_score
12 | from sklearn.model_selection import GridSearchCV
13 | from sklearn.model_selection import LeaveOneOut
14 | from sklearn.model_selection import KFold
15 | from sklearn import linear_model
16 | from sklearn.linear_model import LogisticRegression
17 | from sklearn.metrics import average_precision_score
18 | from sklearn.metrics import accuracy_score
19 | from sklearn.preprocessing import StandardScaler
20 | from sklearn.metrics import roc_curve, auc
21 |
#Define method for training models
def trait_classification_accuracy(X, Y):
    """Score an L1-penalised logistic regression with 20-fold cross-validation.

    Rows are split by a shuffled ``KFold(20)`` with a fixed random state. In
    each fold the features are standardised using the training rows only, a
    5-fold ``GridSearchCV`` on ROC-AUC picks ``C``, and the predictions and
    the second ``predict_proba`` column for the held-out rows are stored.

    Returns
    -------
    list
        ``[accuracy, roc_auc]`` over the held-out predictions of all folds.
    """
    row_count = X.shape[0]
    out_of_fold_pred = np.zeros(row_count)
    out_of_fold_prob = np.zeros(row_count)

    #Candidate regularisation strengths for the inner grid search
    search_space = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100]}]

    #Do cross validation
    cross_validator = KFold(20, shuffle = True, random_state = 123456)
    for learn_idx, check_idx in cross_validator.split(X):
        X_learn = X[learn_idx]
        X_check = X[check_idx]

        #Normalize with statistics of the training rows only
        scaler = StandardScaler().fit(X_learn)
        X_learn_std = scaler.transform(X_learn)
        X_check_std = scaler.transform(X_check)

        #Tune parameters
        logreg = LogisticRegression(random_state=12345, penalty = 'l1', max_iter=1000,
                                    solver = 'liblinear')
        grid_search = GridSearchCV(logreg, search_space, cv = 5, scoring='roc_auc', n_jobs = -1)
        grid_search.fit(X_learn_std, Y[learn_idx])

        #Record predictions and probabilities
        out_of_fold_pred[check_idx] = grid_search.predict(X_check_std)
        out_of_fold_prob[check_idx] = grid_search.predict_proba(X_check_std)[:, 1]

    #Calculate accuracy and ROC-AUC
    return [accuracy_score(Y, out_of_fold_pred), roc_auc_score(Y, out_of_fold_prob)]
64 |
65 |
#Define method for predicting survival
def predict_survival(cancer_type, tcga_types, method, run_index, seed):
    """Run one balanced-subsample survival classification over joined subtypes.

    Loads the embedding produced by ``method`` for every entry of
    ``tcga_types`` from the module-level ``input_folder``, stacks them,
    aligns them with the module-level label series ``Y_df`` on the first 12
    characters of the sample id, down-samples the larger class to the size
    of the smaller one (seeded with ``12345 * seed``) and scores the result
    with ``trait_classification_accuracy``.

    Fixes over the previous version:

    * an unsupported ``method`` raises ``ValueError`` instead of failing later
      with an unbound ``X_df``;
    * ids that repeat after truncation are reduced to their first row, so the
      row positions derived from the labels always refer to the same samples
      in the feature table;
    * when both classes have the same size, ``argmin`` and ``argmax`` both
      return 0, which used to select class 0 twice and drop class 1 entirely;
      the "high" class is now always the other class.

    Parameters
    ----------
    cancer_type : str
        Unused here; kept for interface compatibility.
    tcga_types : list of str
        TCGA codes whose embeddings are concatenated.
    method : str
        One of 'PCA', 'ICA', 'RP', 'AE', 'DAE', 'VAE', 'DeepProfile'.
        'VAE' additionally reads the module-level ``VAE_dim``.
    run_index : int
        Run number used in the embedding file name (ignored for PCA and DeepProfile).
    seed : int
        Sub-sampling seed multiplier.

    Returns
    -------
    list
        ``[accuracy, roc_auc]`` as returned by ``trait_classification_accuracy``.
    """
    print(tcga_types)

    df_list = []
    for tcga_type in tcga_types:
        if method == 'PCA':
            file_name = 'TCGA_RNASEQ_' + tcga_type + '_PCA_150L.tsv'
        elif method in ('ICA', 'RP'):
            file_name = 'TCGA_RNASEQ_' + tcga_type + '_' + method + '_150L_' + str(run_index) + '.tsv'
        elif method in ('AE', 'DAE'):
            file_name = 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_150L_' + str(run_index) + '.tsv'
        elif method == 'VAE':
            file_name = 'TCGA_' + tcga_type + '_RNASeq_Expression_' + method + '_encoded_' + VAE_dim + 'L_' + str(run_index) + '.tsv'
        elif method == 'DeepProfile':
            file_name = tcga_type + '_DeepProfile_TCGA_RNASeq_Embedding_150L.tsv'
        else:
            raise ValueError("Unknown method: " + str(method))

        df_list.append(pd.read_table(input_folder + file_name, index_col = 0, sep = '\t'))

    X_df = pd.concat(df_list, axis = 0)
    print("Expression data joined ", X_df.shape)

    #Now, replace X_df index to match with Y_df index (first 12 characters)
    X_df.index = [str(sample)[:12] for sample in X_df.index]

    #Keep one row per id so that features and labels stay aligned by position
    X_df = X_df[~X_df.index.duplicated(keep='first')]
    labels = Y_df[~Y_df.index.duplicated(keep='first')]

    #Take intersecting samples in datasets (np.intersect1d returns them sorted)
    intersecting_samples = np.intersect1d(X_df.index, labels.index)
    subX_df = X_df.loc[intersecting_samples]
    subY_df = labels.loc[intersecting_samples]

    print("X dataframe ", subX_df.shape)
    print("Y dataframe ", subY_df.shape)

    print("X dataframe ", subX_df.index)
    print("Y dataframe ", subY_df.index)

    label_values = np.ravel(subY_df.values)
    sample_indices = [np.flatnonzero(label_values == 0), np.flatnonzero(label_values == 1)]
    sample_counts = [len(sample_indices[0]), len(sample_indices[1])]
    print("SAMPLES LABEL 0: ", sample_counts[0], " SAMPLES LABEL 1: ", sample_counts[1])

    #Now select the class with highest number of samples and subsample.
    #high_class is defined as "the other class" so that a tie cannot pick class 0 twice.
    low_class = int(np.argmin(sample_counts))
    high_class = 1 - low_class
    print("Lower class size ", sample_counts[low_class], "samples subsampled from class ", high_class)
    random.seed(12345 * seed)
    random_indices = random.sample(list(np.arange(0, sample_counts[high_class])), sample_counts[low_class])
    selected_indices = np.sort(sample_indices[high_class][random_indices])

    subX_df = pd.concat([subX_df.iloc[sample_indices[low_class]], subX_df.iloc[selected_indices]])
    subY_df = pd.concat([subY_df.iloc[sample_indices[low_class]], subY_df.iloc[selected_indices]])
    subX_df = subX_df.sort_index()
    subY_df = subY_df.sort_index()

    results = trait_classification_accuracy(subX_df.values, np.ravel(subY_df.values))
    return results
135 |
#Read user inputs
import sys
cancer_type = sys.argv[1] #main cancer type
method = sys.argv[2] #name of the method
run_index = int(sys.argv[3]) #run index
#Latent dimension, only needed (and only used in file names) for the VAE method
VAE_dim = sys.argv[4] if len(sys.argv) > 4 else None
if method == 'VAE' and VAE_dim is None:
    sys.exit("Method 'VAE' needs the latent dimension as fourth argument")

#TCGA subtypes that are joined for each supported cancer type
SUBTYPES_BY_CANCER = {'LUNG': ['LUAD', 'LUSC']}
if cancer_type not in SUBTYPES_BY_CANCER:
    sys.exit("No subtypes defined for cancer type " + cancer_type)
tcga_types = SUBTYPES_BY_CANCER[cancer_type]

input_folder = '../../ALL_CANCER_FILES/' + cancer_type + '/TCGA_FILES/'
output_folder = 'Prediction_Results/'

#Join data for cancer subtypes
df_list = []
for tcga_type in tcga_types:
    Y_df = pd.read_table(input_folder + 'DeepProfile_Embedding_and_' + tcga_type + '_Survival_df.tsv', index_col = 0, sep = '\t')
    print("Survival dataframe ", Y_df.shape)
    df_list.append(Y_df)

Y_df = pd.concat(df_list, axis = 0)
print("JOINED survival dataframe ", Y_df.shape)

print("ALIVE..")
print( Y_df[Y_df['fustat'] == 0]['futime'])
print( np.mean(Y_df[Y_df['fustat'] == 0]['futime'].values))

five_years = 5 * 365

#Select all dead patients, only if they died within 5 years.
#Boolean masks over the full table are used on purpose: the previous code took
#row positions inside the dead-only subset and applied them to the full table,
#which selected unrelated patients.
died_within_5y = (Y_df['fustat'] == 1) & (Y_df['futime'] < five_years)
print("Dead within 5 year ", Y_df.loc[died_within_5y, ['fustat', 'futime']])
print("Dead within 5 year ", np.max(Y_df.loc[died_within_5y, 'futime']))

#Select all patients followed for more than 5 years.
#NOTE(review): this mask does not look at fustat, so a patient who died after
#the 5-year mark is kept with label 1; confirm that this is intended.
followed_beyond_5y = Y_df['futime'] > five_years
print("Alive after 5 year ", Y_df.loc[followed_beyond_5y, ['fustat', 'futime']])
print("Alive after 5 year ", np.min(Y_df.loc[followed_beyond_5y, 'futime']))

#Keep only the label column of the selected patients (original row order)
Y_df = Y_df.loc[died_within_5y | followed_beyond_5y, 'fustat']
Y_df = Y_df.dropna()
print("Survival dataframe \n ", Y_df)

class0_count = len(np.where(Y_df.values == 0)[0])
class1_count = len(np.where(Y_df.values == 1)[0])

all_accuracies = []
all_aucs = []

#Repeat the balanced sub-sampling with 50 different seeds
for sampling_index in range(50):
    result = predict_survival(cancer_type, tcga_types, method, run_index, sampling_index)
    print("Accuracy: ", result[0])
    print("ROC-AUC: ", result[1])
    all_accuracies.append(result[0])
    all_aucs.append(result[1])

print("FINAL RESULTS...")
print("MEAN ACCURACY ", np.mean(np.asarray(all_accuracies)))
print("MEAN ROC-AUC ", np.mean(np.asarray(all_aucs)))
199 | #Save results to a file
200 | if method == 'VAE':
201 | np.savetxt(output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_' + VAE_dim + 'L_' + str(run_index) + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
202 | np.savetxt(output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_' + VAE_dim + 'L_' + str(run_index) + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
203 |
204 | else:
205 | np.savetxt(output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_' + str(run_index) + '_ACCs.txt', np.asarray(all_accuracies), delimiter='\n')
206 | np.savetxt(output_folder + cancer_type + '/TCGA_Survival_5year_LR_Balanced_Subsample_20FOLD_50Runs_' + cancer_type + '_' + method + '_' + str(run_index) + '_AUCs.txt', np.asarray(all_aucs), delimiter='\n')
207 |
208 |
209 |
210 |
211 |
--------------------------------------------------------------------------------
/TCGA_SURVIVAL_PREDICTION/PREDICT_SURVIVAL/Run_Models.py:
--------------------------------------------------------------------------------
"""Run all survival-prediction models for every cancer type.

Must be executed inside IPython (for example ``%run Run_Models.py 0``),
because every model is launched through the ``%run -i`` line magic.

Usage: Run_Models.py <run>
"""
import sys

# Cancer-type folder names and the matching TCGA dataset names (parallel lists)
cancer_types = ['BRCA', 'AML',
                'COLON',
                'BRAIN', 'OV',
                'SARCOMA', 'KIDNEY',
                'LIVER', 'STOMACH',
                'SKIN', 'UCEC',
                'HEAD_NECK', 'PANCREAS',
                'CERVICAL', 'BLADDER', 'LUNG']

tcga_types = ['BRCA', 'LAML',
              'COADREAD',
              'GBMLGG', 'OV',
              'SARC', 'KIPAN',
              'LIHC', 'STAD',
              'SKCM', 'UTERINE',
              'HNSC', 'PAAD',
              'CESC', 'BLCA', 'LUNG']

# One entry per model run: (method name, offset added to the run index,
# extra trailing argument or None). The VAE entries pass the latent dimension.
# NOTE(review): ICA and RP are invoked with run + 1 -- presumably those
# embeddings use 1-based run numbering; confirm against the embedding files.
METHOD_RUNS = [
    ('PCA', 0, None),
    ('DeepProfile', 0, None),
    ('ICA', 1, None),
    ('RP', 1, None),
    ('AE', 0, None),
    ('DAE', 0, None),
    ('VAE', 0, '5'),
    ('VAE', 0, '10'),
    ('VAE', 0, '25'),
    ('VAE', 0, '50'),
    ('VAE', 0, '75'),
    ('VAE', 0, '100'),
]


def build_run_commands(cancer_type, tcga_type, run):
    """Return the ``%run`` argument strings for one cancer type.

    Args:
        cancer_type: cancer-type folder name, e.g. ``'BRCA'``.
        tcga_type: matching TCGA dataset name; not passed to the script for
            ``'LUNG'``, which uses the subtype-joining script instead.
        run: integer run index given on the command line.

    Returns:
        A list of strings, one per entry of ``METHOD_RUNS`` and in the same
        order, each suitable for ``run_line_magic('run', ...)``.
    """
    if cancer_type == 'LUNG':
        script_and_target = ['Predict_Survival_Subtypes_Joined.py', cancer_type]
    else:
        script_and_target = ['Predict_Survival.py', cancer_type, tcga_type]

    commands = []
    for method, run_offset, extra_arg in METHOD_RUNS:
        parts = ['-i'] + script_and_target + [method, str(run + run_offset)]
        if extra_arg is not None:
            parts.append(extra_arg)
        commands.append(' '.join(parts))
    return commands


def main():
    """Launch every model of every cancer type through IPython's %run magic."""
    run = int(sys.argv[1])
    # get_ipython is injected by IPython; run_line_magic replaces the
    # deprecated InteractiveShell.magic() call.
    shell = get_ipython()  # noqa: F821

    for cancer_type, tcga_type in zip(cancer_types, tcga_types):
        print("------------")
        print(cancer_type)
        print(tcga_type)

        for command in build_run_commands(cancer_type, tcga_type, run):
            shell.run_line_magic('run', command)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------