├── Autoendocer_codes
│   ├── Autoenco_cancer_vanilla_FInal.ipynb
│   ├── Autoencode_Sparse_cancer_Final.ipynb
│   ├── Autoencoder_Denoising_final.ipynb
│   └── Varational_autoencoder_Final.ipynb
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/Autoendocer_codes/Autoenco_cancer_vanilla_FInal.ipynb:
--------------------------------------------------------------------------------

import os
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

from numpy.random import seed
from sklearn.preprocessing import minmax_scale, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from keras.layers import Input, Dense
from keras.models import Model

# Load the merged GBM multi-omics data set (mRNA, miRNA, DNA methylation)
Data_set = pd.read_csv("/home/effrancodelos/Deep_learnig/Autoencoder_Notebook/Nova_data_agosto_2020/GBM_mRNA_miRNA_DNA.csv")
Data_set.head(n=20)

# Keep the string column (patient identifiers) aside
Pacients = Data_set['Pacients']

# Remove the string column before scaling
Data_set.drop(['Pacients'], axis=1, inplace=True)

# Data normalization: min-max scale every feature to [0, 1]
normalized_df = minmax_scale(Data_set, axis=0)
# normalized_df = StandardScaler().fit_transform(Data_set)

normalized_df

# Split the data: 80% training, 20% validation
X_train, X_val = train_test_split(normalized_df, test_size=0.2, random_state=7)

# Number of input features
ncol = normalized_df.shape[1]

# Size of the bottleneck (encoded) layer
encoding_dim = 100

# Network configuration
input_dim = Input(shape=(ncol,))

# Encoder layers
encoded1 = Dense(500, activation='tanh')(input_dim)
encoded2 = Dense(encoding_dim, activation='tanh')(encoded1)

# Decoder layers
decoded1 = Dense(500, activation='tanh')(encoded2)
decoded2 = Dense(ncol, activation='sigmoid')(decoded1)

# Combine encoder and decoder layers
autoencoder = Model(inputs=input_dim, outputs=decoded2)

# Compile the model
# autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
autoencoder.compile(optimizer='adam', loss='mean_squared_error')  # best-performing setting

autoencoder.summary()

# Fit the model: the autoencoder learns to reconstruct its own input
autoencoder.fit(X_train, X_train, epochs=70, batch_size=256, shuffle=False,
                validation_data=(X_val, X_val))

# Obtain the reduced data: a separate model that maps inputs to the bottleneck layer
encoder = Model(inputs=input_dim, outputs=encoded2)
encoded_input = Input(shape=(encoding_dim,))

encoded_train = pd.DataFrame(encoder.predict(normalized_df))
encoded_train = encoded_train.add_prefix('feature_')

# Re-attach the patient identifiers
encoded_train['Pacients'] = Pacients
# encoded_train['Radiations'] = Radiation

# Output data set
print(encoded_train.shape)
encoded_train.head()

encoded_train.to_csv("/home/effrancodelos/Deep_learnig/Autoencoder_Notebook/Nova_data_agosto_2020/GBM_mRNA_miRNA_DNA_Vanilla.csv", index=False)
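Each notebook in this repository stops after writing the 100-dimensional embedding to CSV; the clustering and silhouette-based choice of the number of subtypes described in the README happen outside these notebooks. A minimal sketch of that downstream step, assuming the CSV written above is read back in and that k-means is the clustering method (both the relative file name and the clustering choice are illustrative, not taken from this repository):

# Hypothetical downstream step (not part of this repository): cluster the embedding
# and pick the number of subtypes with the silhouette score.
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

embedding = pd.read_csv("GBM_mRNA_miRNA_DNA_Vanilla.csv")   # output of the notebook above
patients = embedding.pop("Pacients")                        # keep identifiers aside
X = embedding.values

# Try a range of cluster counts and keep the one with the best silhouette score.
scores = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

best_k = max(scores, key=scores.get)
subtypes = pd.DataFrame({
    "Pacients": patients,
    "subtype": KMeans(n_clusters=best_k, n_init=10, random_state=0).fit_predict(X),
})
print(scores, best_k)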
--------------------------------------------------------------------------------
/Autoendocer_codes/Autoencode_Sparse_cancer_Final.ipynb:
--------------------------------------------------------------------------------

import os
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

from numpy.random import seed
from sklearn.preprocessing import minmax_scale, StandardScaler
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model
from keras import regularizers
import tensorflow as tf

# Input: merged data set for colon adenocarcinoma (COAD)
Data_set = pd.read_csv("/home/effrancodelos/Deep_learnig/Data_Nova/COAD/COAD_COX/data_Merge_Colon_t.csv")
Data_set.head(n=20)

# Keep the string column (patient identifiers) aside
Pacients = Data_set['Pacients']

# Remove the string column before scaling
Data_set.drop(['Pacients'], axis=1, inplace=True)

# Min-max scale every feature to [0, 1]
Data_set_scaled = minmax_scale(Data_set, axis=0)
# Data_set_scaled = StandardScaler().fit_transform(Data_set)

# Number of input features
ncol = Data_set_scaled.shape[1]

# Split the data set: 80% for training, 20% for test
X_train, X_test = train_test_split(Data_set_scaled, train_size=0.8, random_state=2017)

len(X_test)
"metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import numpy as np # linear algebra\n", 11 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 12 | "\n", 13 | "from numpy.random import seed\n", 14 | "from sklearn.preprocessing import minmax_scale\n", 15 | "from sklearn.model_selection import train_test_split\n", 16 | "from keras.layers import Input, Dense\n", 17 | "from keras.models import Model\n", 18 | "from sklearn.preprocessing import StandardScaler\n", 19 | "from keras import regularizers\n", 20 | "import tensorflow as tf\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "scrolled": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "#input\n", 32 | "Data_set= pd.read_csv(\"/home/effrancodelos/Deep_learnig/Data_Nova/COAD/COAD_COX/data_Merge_Colon_t.csv\")\n", 33 | "Data_set.head(n=20)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "#Create a data-set with the string columns\n", 45 | "\n", 46 | "Pacients = Data_set['Pacients']\n", 47 | "\n", 48 | "\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "#Remove string column\n", 58 | "Data_set.drop(['Pacients'], axis=1, inplace=True)\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "Data_set_scaled =minmax_scale(Data_set, axis = 0)\n", 68 | "#Data_set_scaled = StandardScaler().fit_transform(Data_set)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "ncol = Data_set_scaled.shape[1]\n", 78 | "\n", 79 | "\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "#split the data set 0.8 for training and 20% for test\n", 89 | "X_train, X_test = train_test_split(Data_set_scaled, train_size = 0.8, random_state = seed(2017))\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "len(X_test)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "encoding_dim = 100" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "input_dim = Input(shape = (ncol, ))\n", 117 | "# Encoder Layers\n", 118 | "\n", 119 | "encoded2 = Dense(500, activation = 'tanh',activity_regularizer=regularizers.l1_l2(l1=0.000001,l2=0.00001))(input_dim)\n", 120 | "#encoded3 = Dense(500, activation = 'relu',activity_regularizer=regularizers.l1(10e-5))(input_dim)\n", 121 | "encoded3 = Dense(encoding_dim, activation = 'tanh')(encoded2)\n", 122 | "\n", 123 | "# Decoder Layers\n", 124 | "\n", 125 | "#decoded1 = Dense(500, activation = 'relu')(encoded3)\n", 126 | "decoded1 = Dense(500, activation = 'tanh',activity_regularizer=regularizers.l1_l2(l1=0.000001, l2=0.00001))(encoded3)\n", 127 | "decoded2 = Dense(ncol, activation = 'sigmoid')(decoded1)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# Combine Encoder and 
--------------------------------------------------------------------------------
/Autoendocer_codes/Autoencoder_Denoising_final.ipynb:
--------------------------------------------------------------------------------

from keras.datasets import mnist
from keras.layers import Input, Dense
from keras.models import Model
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale, StandardScaler
from sklearn.model_selection import train_test_split
from numpy.random import seed

import matplotlib.pyplot as plt
%matplotlib inline

# Import the merged data set for colon adenocarcinoma (COAD)
Data_set = pd.read_csv("/home/effrancodelos/Deep_learnig/Data_Nova/COAD/COAD_COX/data_Merge_Colon_t.csv")
Data_set.head(n=20)

# Keep the string column (patient identifiers) aside
Pacients = Data_set['Pacients']
# Radiation = Data_set['radiations']

# Remove the string column before scaling
Data_set.drop(['Pacients'], axis=1, inplace=True)
# Data_set.drop(Data_set.columns[0], axis=1, inplace=True)

print(Data_set.shape)
# Data transformation: min-max scale every feature to [0, 1]
Data_set_scaled = minmax_scale(Data_set, axis=0)
# Number of input features
ncol = Data_set_scaled.shape[1]

# Split the data set: 80% for training, 20% for test
X_train, X_test = train_test_split(Data_set_scaled, train_size=0.8, random_state=2017)

# Add Gaussian noise to the training data (effective standard deviation: 0.5 * 0.3 = 0.15)
noise_factor = 0.5
X_train_noisy = X_train + noise_factor * np.random.normal(loc=0.0, scale=0.3, size=X_train.shape)

# Add Gaussian noise to the test data
X_test_noisy = X_test + noise_factor * np.random.normal(loc=0.0, scale=0.3, size=X_test.shape)

# Clip (limit) the values in the noisy arrays to [0, 1]
# X_train_noisy = np.clip(X_train_noisy, 0., 1.)
# X_test_noisy = np.clip(X_test_noisy, 0., 1.)
# Size of the bottleneck (encoded) layer
encoding_dim = 100

# Network configuration
input_dim = Input(shape=(ncol,))

# Encoder layers
encoded1 = Dense(units=500, activation='tanh')(input_dim)
encoded2 = Dense(encoding_dim, activation='tanh')(encoded1)

# Decoder layers
decoded1 = Dense(500, activation='tanh')(encoded2)
decoded2 = Dense(ncol, activation='sigmoid')(decoded1)

# Build the autoencoder
autoencoder = Model(input_dim, decoded2)

# Extract the encoder
encoder = Model(input_dim, encoded2)

# Compile the autoencoder
autoencoder.compile(optimizer='adam', loss='mean_squared_error')  # best-performing setting
# autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

autoencoder.summary()

# Fit the network: reconstruct the clean data from the noisy input
autoencoder.fit(X_train_noisy, X_train,
                epochs=40,
                batch_size=32,
                shuffle=True,
                validation_data=(X_test_noisy, X_test))

encoder = Model(inputs=input_dim, outputs=encoded2)
encoded_input = Input(shape=(encoding_dim,))

# Corrupt the full scaled data set and encode it
Data_set_scaled_noisy = Data_set_scaled + noise_factor * np.random.normal(loc=0.0, scale=0.3, size=Data_set_scaled.shape)

encoded_train = pd.DataFrame(encoder.predict(Data_set_scaled_noisy))
encoded_train = encoded_train.add_prefix('feature_')

# Re-attach the patient identifiers
encoded_train['Pacients'] = Pacients
# encoded_train['Radiations'] = Radiation

print(encoded_train.shape)
encoded_train.head()

encoded_train.to_csv("/home/effrancodelos/Deep_learnig/Data_Nova/COAD/COAD_COX/data_Merge_Colon_Cox_Denoising_1.csv", index=False)
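The notebook never measures how well the network actually removes the injected noise. A quick, hypothetical check, reusing the variables defined above rather than adding a cell from the original notebook, is to compare the reconstruction error of the noisy input against the error of the denoised output:

# Hypothetical check (not in the notebook): does passing the noisy test data through
# the autoencoder bring it closer to the clean test data?
denoised = autoencoder.predict(X_test_noisy)
mse_noisy = np.mean((X_test_noisy - X_test) ** 2)     # error before denoising
mse_denoised = np.mean((denoised - X_test) ** 2)      # error after denoising
print(f"MSE noisy vs clean: {mse_noisy:.4f}, denoised vs clean: {mse_denoised:.4f}")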
--------------------------------------------------------------------------------
/Autoendocer_codes/Varational_autoencoder_Final.ipynb:
--------------------------------------------------------------------------------

from keras.layers import Input, Dense, Lambda, Layer, Add, Multiply
from keras.models import Model, Sequential
from keras.datasets import mnist
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model
from keras import backend as K
from keras.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt

import argparse
import os

# Load the merged data set for colon adenocarcinoma (COAD)
Data_set = pd.read_csv("/home/effrancodelos/Deep_learnig/Data_Nova/COAD/COAD_COX/data_Merge_Colon_t.csv")
Data_set.head(n=20)

# Keep the string column (patient identifiers) aside
Pacients = Data_set['Pacients']
# Radiation = Data_set['radiations']

# Remove the string column before scaling
Data_set.drop(['Pacients'], axis=1, inplace=True)
# Data_set.drop(Data_set.columns[0], axis=1, inplace=True)

# Min-max scale every feature to [0, 1]
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_norm = scaler.fit_transform(Data_set)

df_norm

# Network hyperparameters
original_dim = df_norm.shape[1]
input_shape = (original_dim, )
intermediate_dim = int(original_dim / 2)
batch_size = 128
latent_dim = 100
epochs = 80
epsilon_std = 1.0


class KLDivergenceLayer(Layer):
    """Identity transform layer that adds the KL divergence
    to the final model loss.
    """

    def __init__(self, *args, **kwargs):
        self.is_placeholder = True
        super(KLDivergenceLayer, self).__init__(*args, **kwargs)

    def call(self, inputs):
        mu, log_var = inputs

        kl_batch = -.5 * K.sum(1 + log_var -
                               K.square(mu) -
                               K.exp(log_var), axis=-1)

        self.add_loss(K.mean(kl_batch), inputs=inputs)

        return inputs
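Inside vae_arc below, the reparameterization step draws the noise ε through a second Input tensor and combines it with Multiply and Add layers, so that z = μ + σ ⊙ ε stays differentiable with respect to μ and log σ². For reference, the same step is often written as a single Lambda layer that samples ε inside the graph; a minimal sketch of that equivalent formulation (an alternative, not the code this notebook uses):

# Equivalent reparameterization written as a Lambda layer (alternative to the
# eps Input used in vae_arc); assumes z_mu and z_log_var defined as in this notebook.
def sampling(args):
    z_mu, z_log_var = args
    eps = K.random_normal(shape=(K.shape(z_mu)[0], latent_dim), stddev=epsilon_std)
    return z_mu + K.exp(0.5 * z_log_var) * eps

# z = Lambda(sampling)([z_mu, z_log_var])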
| "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# VAE Architecture\n", 132 | "# * original_dim - Original Input Dimension\n", 133 | "# * intermediate_dim - Hidden Layer Dimension\n", 134 | "# * latent_dim - Latent/Embedding Dimension\n", 135 | "def vae_arc(original_dim, intermediate_dim, latent_dim):\n", 136 | " # Decode\n", 137 | " decoder = Sequential([\n", 138 | " Dense(intermediate_dim, input_dim=latent_dim, activation='relu'),\n", 139 | " Dense(original_dim, activation='sigmoid')\n", 140 | " ])\n", 141 | "\n", 142 | " # Encode\n", 143 | " x = Input(shape=(original_dim,))\n", 144 | " h = Dense(intermediate_dim, activation='relu')(x)\n", 145 | "\n", 146 | " z_mu = Dense(latent_dim)(h)\n", 147 | " z_log_var = Dense(latent_dim)(h)\n", 148 | "\n", 149 | " z_mu, z_log_var = KLDivergenceLayer()([z_mu, z_log_var])\n", 150 | " z_sigma = Lambda(lambda t: K.exp(.5*t))(z_log_var)\n", 151 | "\n", 152 | " eps = Input(tensor=K.random_normal(stddev=epsilon_std,\n", 153 | " shape=(K.shape(x)[0], latent_dim)))\n", 154 | " z_eps = Multiply()([z_sigma, eps])\n", 155 | " z = Add()([z_mu, z_eps])\n", 156 | "\n", 157 | " x_pred = decoder(z)\n", 158 | " \n", 159 | " return x, eps, z_mu, x_pred" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "def nll(y_true, y_pred):\n", 169 | " \"\"\" Negative log likelihood (Bernoulli). \"\"\"\n", 170 | "\n", 171 | " # keras.losses.binary_crossentropy gives the mean\n", 172 | " # over the last axis. we require the sum\n", 173 | " return K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "#Netwrok Configuration\n", 183 | "x, eps, z_mu, x_pred = vae_arc(original_dim, intermediate_dim, latent_dim)\n", 184 | "vae = Model(inputs=[x, eps], outputs=x_pred)\n", 185 | "vae.compile(optimizer='adam', loss=nll)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "vae.summary()\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "#Data split\n", 204 | "from sklearn.model_selection import train_test_split\n", 205 | "\n", 206 | "# \n", 207 | "X_train, X_test, y_train, y_test = train_test_split(df_norm, df_norm, \n", 208 | " test_size=0.33, random_state=42)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "filepath =\"weights.hdf5\"\n", 218 | "checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')\n", 219 | "callbacks_list = [checkpoint]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "hist = vae.fit(X_train, X_train,\n", 229 | " epochs=epochs,\n", 230 | " batch_size=batch_size,\n", 231 | " callbacks=callbacks_list,\n", 232 | " validation_data=(X_test, X_test))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "def plt_hist(hist):\n", 242 | " # summarize history for loss\n", 243 | " plt.plot(hist.history['loss'])\n", 244 
| " plt.plot(hist.history['val_loss'])\n", 245 | " plt.title('model loss')\n", 246 | " plt.ylabel('loss')\n", 247 | " plt.xlabel('epoch')\n", 248 | " plt.legend(['train', 'test'], loc='upper left')" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "plt_hist(hist)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "encoder = Model(x, z_mu)\n", 267 | "z_df = encoder.predict(df_norm, batch_size=batch_size)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "encoded_train = pd.DataFrame(encoder.predict(df_norm, batch_size=batch_size))\n", 277 | "encoded_train = encoded_train.add_prefix('feature_')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "encoded_train['Pacients'] = Pacients" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "encoded_train" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "encoded_train.to_csv(\"/home/effrancodelos/Deep_learnig/Data_Nova/COAD/COAD_COX/data_Merge_Colon_COX_Variotional_1.csv\", index=False)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [] 313 | } 314 | ], 315 | "metadata": { 316 | "kernelspec": { 317 | "display_name": "Python 3", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.8.3" 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 2 336 | } 337 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Biological Networks Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

MIT License

Copyright (c) 2020 Biological Networks Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Performance comparison of deep learning autoencoders for cancer subtype detection using multi-omics data

A heterogeneous disease like cancer is activated through multiple pathways and different perturbations. Depending upon the activated pathway(s), patients' survival varies significantly and they respond to drugs with different efficacy. Therefore, cancer subtype detection using genomics-level data is a significant research problem. Subtype detection is often a complex problem and in most cases requires multi-omics data fusion to achieve accurate subtyping. Different data fusion and subtyping approaches have been proposed, such as kernel-based fusion, matrix factorization, and deep learning autoencoders. In this paper, we compared the performance of different deep learning autoencoders for cancer subtype detection. We performed cancer subtype detection on four cancer types from The Cancer Genome Atlas (TCGA) datasets using four autoencoder implementations. We also predicted the optimal number of subtypes in each cancer type using the silhouette score. We observed that the detected subtypes exhibit significant differences in survival profiles. Furthermore, we compared the effect of feature selection and similarity measures on subtype detection. To evaluate the results, we selected the Glioblastoma multiforme (GBM) dataset and identified the differentially expressed genes in each of the subtypes detected by the autoencoders; these findings coincide well with other genomic studies and can be corroborated with the involved pathways and biological functions. Thus, the results from the autoencoders, obtained through the interaction of different data types of cancer, can be used for the prediction and characterization of patient subgroups and survival profiles.
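The subtypes produced by these pipelines are evaluated, as described above, by how strongly their survival profiles differ. A hypothetical sketch of that comparison using the lifelines package; the file name and the column names ('subtype', 'time', 'event') are assumptions for illustration and are not part of this repository:

# Hypothetical sketch (not part of this repository): compare survival between the
# detected subtypes with Kaplan-Meier curves and a log-rank test, using lifelines.
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import multivariate_logrank_test

# Assumed columns: 'subtype' from the clustering step, 'time' (days) and 'event'
# (1 = death observed) from the TCGA clinical table, joined on the patient ID.
df = pd.read_csv("subtypes_with_clinical.csv")

kmf = KaplanMeierFitter()
for name, grp in df.groupby("subtype"):
    kmf.fit(grp["time"], grp["event"], label=f"subtype {name}")
    kmf.plot_survival_function()

result = multivariate_logrank_test(df["time"], df["subtype"], df["event"])
print("log-rank p-value:", result.p_value)
plt.show()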