├── CITEseq.py
├── README.md
├── dim_reduct_CITEseq.py
├── dim_reduct_scNMTseq.py
├── scNMTseq.py
├── scProteomics.txt
├── scRNAseq.txt
├── tSNE_on_Autoencoder_CITEseq.py
└── tsne_on_autoencoder_scNMTseq.py

/CITEseq.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se

import os
import numpy as np
import pandas as pd
from umap import UMAP
from sklearn.manifold import TSNE
from keras.layers import Input, Dense, concatenate
from keras.models import Model
from keras.utils import plot_model
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

########################################## READ AND TRANSFORM DATA ##############################################
os.chdir("/home/nikolay/WABI/Misc/SingleCell/CITEseq/")
scRNAseq = pd.read_csv('scRNAseq.txt',sep='\t')
scProteomics = pd.read_csv('scProteomics.txt',sep='\t')
print(scRNAseq.shape)
print(scProteomics.shape)
print("\n")
print(scRNAseq.iloc[0:5,0:5])
print(scProteomics.iloc[0:5,0:5])

# Last column holds the cluster assignments, the remaining columns are features
X_scRNAseq = scRNAseq.values[:,0:(scRNAseq.shape[1]-1)]
Y_scRNAseq = scRNAseq.values[:,scRNAseq.shape[1]-1]
X_scProteomics = scProteomics.values[:,0:(scProteomics.shape[1]-1)]
Y_scProteomics = scProteomics.values[:,scProteomics.shape[1]-1]
print("\n")
print(X_scRNAseq[0:5,0:5])
print(Y_scRNAseq[0:5])

# LOG-TRANSFORM DATA
X_scRNAseq = np.log(X_scRNAseq + 1)
print(X_scRNAseq[0:5,0:5])
X_scProteomics = np.log(X_scProteomics + 1)
print(X_scProteomics[0:5,0:5])

################################################## AUTOENCODER ##################################################

# Input Layer
ncol_scRNAseq = X_scRNAseq.shape[1]
input_dim_scRNAseq = Input(shape = (ncol_scRNAseq, ), name = "scRNAseq")
ncol_scProteomics = X_scProteomics.shape[1]
input_dim_scProteomics = Input(shape = (ncol_scProteomics, ), name = "scProteomics")

# Dimensions of Encoder for each OMIC
encoding_dim_scRNAseq = 50
encoding_dim_scProteomics = 10

# Encoder layer for each OMIC
encoded_scRNAseq = Dense(encoding_dim_scRNAseq, activation = 'linear', name = "Encoder_scRNAseq")(input_dim_scRNAseq)
encoded_scProteomics = Dense(encoding_dim_scProteomics, activation = 'linear', name = "Encoder_scProteomics")(input_dim_scProteomics)

# Merging Encoder layers from different OMICs
merge = concatenate([encoded_scRNAseq, encoded_scProteomics])

# Bottleneck compression
bottleneck = Dense(50, kernel_initializer = 'uniform', activation = 'linear', name = "Bottleneck")(merge)

# Inverse merging
merge_inverse = Dense(encoding_dim_scRNAseq + encoding_dim_scProteomics, activation = 'elu', name = "Concatenate_Inverse")(bottleneck)

# Decoder layer for each OMIC
decoded_scRNAseq = Dense(ncol_scRNAseq, activation = 'sigmoid', name = "Decoder_scRNAseq")(merge_inverse)
decoded_scProteomics = Dense(ncol_scProteomics, activation = 'sigmoid', name = "Decoder_scProteomics")(merge_inverse)

# Combining Encoder and Decoder into an Autoencoder model
autoencoder = Model(inputs = [input_dim_scRNAseq, input_dim_scProteomics], outputs = [decoded_scRNAseq, decoded_scProteomics])
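
A quick, optional sanity check on the integration above (not part of CITEseq.py) is to score how well the bottleneck representation separates the known cluster labels, e.g. with a silhouette score. A minimal sketch, assuming scikit-learn is installed; synthetic blobs stand in for the real bottleneck_representation and Y_scRNAseq arrays:

# Hypothetical quality check: silhouette score of an embedding against known labels
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

embedding, labels = make_blobs(n_samples = 500, n_features = 50, centers = 5, random_state = 123)  # stand-ins
score = silhouette_score(embedding, labels)  # in [-1, 1]; higher means better-separated clusters
print("Silhouette score of the embedding: %.3f" % score)
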
# Compile Autoencoder
autoencoder.compile(optimizer = 'adam', loss={'Decoder_scRNAseq': 'mean_squared_error', 'Decoder_scProteomics': 'mean_squared_error'})
autoencoder.summary()

# Autoencoder graph (requires the pydot and graphviz packages)
plot_model(autoencoder, to_file='autoencoder_graph.png')

# Autoencoder training
estimator = autoencoder.fit([X_scRNAseq, X_scProteomics], [X_scRNAseq, X_scProteomics], epochs = 100, batch_size = 128, validation_split = 0.2, shuffle = True, verbose = 1)
print("Training Loss: ",estimator.history['loss'][-1])
print("Validation Loss: ",estimator.history['val_loss'][-1])
#plt.figure(figsize=(20, 15))
plt.plot(estimator.history['loss'])
plt.plot(estimator.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper right')
plt.show()

# Encoder model: maps the two inputs to the bottleneck representation
encoder = Model(inputs = [input_dim_scRNAseq, input_dim_scProteomics], outputs = bottleneck)
bottleneck_representation = encoder.predict([X_scRNAseq, X_scProteomics])
print(pd.DataFrame(bottleneck_representation).shape)
print(pd.DataFrame(bottleneck_representation).iloc[0:5,0:5])

# Dimensionality reduction plot
#plt.figure(figsize=(20, 15))
plt.scatter(bottleneck_representation[:, 0], bottleneck_representation[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
plt.title('Autoencoder Data Integration')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
#plt.colorbar()
plt.show()

# tSNE on Autoencoder bottleneck representation
model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1)
tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
plt.title('tSNE on Autoencoder: Data Integration, CITEseq')
plt.xlabel("tSNE1")
plt.ylabel("tSNE2")
plt.show()

# UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP)
#model_umap = UMAP(n_neighbors = 20, min_dist = 0.3, n_components = 2)
#umap = model_umap.fit_transform(bottleneck_representation)
#plt.scatter(umap[:, 0], umap[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
#plt.title('UMAP on Autoencoder')
#plt.xlabel("UMAP1")
#plt.ylabel("UMAP2")
#plt.show()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Here I show how to use Deep Learning (Keras autoencoders) for biological and biomedical data integration, using the CITEseq and scNMTseq single-cell multi-OMICs data sets as examples.
--------------------------------------------------------------------------------
/dim_reduct_CITEseq.py:
--------------------------------------------------------------------------------
print("\n" + "You specified input file: " + infile + "\n") 22 | 23 | expr = pd.read_csv(infile,sep='\t') 24 | print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n") 25 | print("\n" + "A few first lines of input file: " + "\n") 26 | print(expr.iloc[0:4, 0:4]) 27 | print("\n" + "Last column corresponds to cluster assignments: " + "\n") 28 | print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]]) 29 | 30 | # LOG-TRANSFORM DATA 31 | X = expr.values[:,0:(expr.shape[1]-1)] 32 | Y = expr.values[:,expr.shape[1]-1] 33 | print("\n" + "You have following unique cluster labels: " + "\n") 34 | print(set(Y)) 35 | print("\n" + "Log-transforming data..." + "\n") 36 | X = np.log(X + 1) 37 | 38 | # REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA) 39 | n_input = 50 40 | x_train = PCA(n_components = n_input).fit_transform(X) 41 | y_train = Y 42 | #plt.figure(figsize=(20, 15)) 43 | plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab20', s = 10) 44 | plt.title('Principal Component Analysis (PCA)') 45 | plt.xlabel("PC1") 46 | plt.ylabel("PC2") 47 | plt.show() 48 | print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n") 49 | 50 | # REDUCE DIMENSIONS WITH AUTOENCODER 51 | model = Sequential() 52 | model.add(Dense(30, activation='elu', input_shape=(n_input,))) 53 | model.add(Dense(20, activation='elu')) 54 | model.add(Dense(10, activation='elu')) 55 | model.add(Dense(2, activation='linear', name="bottleneck")) 56 | model.add(Dense(10, activation='elu')) 57 | model.add(Dense(20, activation='elu')) 58 | model.add(Dense(30, activation='elu')) 59 | model.add(Dense(n_input, activation='sigmoid')) 60 | model.compile(loss = 'mean_squared_error', optimizer = Adam()) 61 | model.summary() 62 | 63 | history = model.fit(x_train, x_train, batch_size = 128, epochs = 500, shuffle = False, verbose = 1, validation_split = 0.2) 64 | print("\n" + "Training Loss: ", history.history['loss'][-1]) 65 | print("Validation Loss: ", history.history['val_loss'][-1]) 66 | plt.plot(history.history['loss']) 67 | plt.plot(history.history['val_loss']) 68 | plt.title('Model Loss') 69 | plt.ylabel('Loss') 70 | plt.xlabel('Epoch') 71 | plt.legend(['Train', 'Validate'], loc='upper right') 72 | plt.show() 73 | 74 | encoder = Model(model.input, model.get_layer('bottleneck').output) 75 | bottleneck_representation = encoder.predict(x_train) 76 | 77 | # PLOT DIMENSIONALITY REDUCTION 78 | #plt.figure(figsize=(20, 15)) 79 | plt.scatter(bottleneck_representation[:,0], bottleneck_representation[:,1], c = y_train, s = 10, cmap = 'tab20') 80 | plt.title('Autoencoder: 8 Layers') 81 | plt.xlabel("Dimension 1") 82 | plt.ylabel("Dimension 2") 83 | plt.show() 84 | 85 | # REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE) 86 | #model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1) 87 | #tsne = model_tsne.fit_transform(x_train) 88 | #plt.figure(figsize=(20, 15)) 89 | #plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab20', s = 10) 90 | #plt.title('tSNE on PCA') 91 | #plt.xlabel("tSNE1") 92 | #plt.ylabel("tSNE2") 93 | #plt.show() 94 | 95 | # VISUALIZE AUTOENCODER 96 | #from ann_visualizer.visualize import ann_viz 97 | #ann_viz(model, title = "Autoencoder", view = True) 98 | -------------------------------------------------------------------------------- /dim_reduct_scNMTseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Nikolay Oskolkov, WABI 
/dim_reduct_scNMTseq.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
# Run this script as: python dim_reduct_scNMTseq.py <input_file>

import sys
import numpy as np
import pandas as pd
from umap import UMAP
from keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from keras.optimizers import Adam
from sklearn.decomposition import PCA
from keras.models import Sequential, Model

import warnings
warnings.filterwarnings("ignore")

# READ DATA
infile = str(sys.argv[1])
print("\n" + "You specified input file: " + infile + "\n")

expr = pd.read_csv(infile,sep='\t')
print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n")
print("\n" + "First few lines of input file: " + "\n")
print(expr.iloc[0:4, 0:4])
print("\n" + "Last column corresponds to cluster assignments: " + "\n")
print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]])

# LOG-TRANSFORM DATA
X = expr.values[:,0:(expr.shape[1]-1)]
Y = expr.values[:,expr.shape[1]-1]
print("\n" + "You have the following unique cluster labels: " + "\n")
print(set(Y))
print("\n" + "Log-transforming data..." + "\n")
X = np.log(X + 1)

# REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA)
n_input = 20
x_train = PCA(n_components = n_input).fit_transform(X)
y_train = Y
plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab10', s = 10)
plt.title('Principal Component Analysis (PCA)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n")

# REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE)
model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
tsne = model_tsne.fit_transform(x_train)
plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab10', s = 10)
plt.title('tSNE on PCA')
plt.xlabel("tSNE1")
plt.ylabel("tSNE2")
plt.show()

# UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP)
model_umap = UMAP(n_neighbors = 11, min_dist = 0.1, n_components = 2)
umap = model_umap.fit_transform(x_train)
plt.scatter(umap[:, 0], umap[:, 1], c = y_train, cmap = 'tab10', s = 10)
plt.title('UMAP on PCA: scNMTseq, scRNAseq')
plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.show()


# REDUCE DIMENSIONS WITH AUTOENCODER
#model = Sequential()
#model.add(Dense(20, activation='elu', input_shape=(n_input,)))
#model.add(Dense(10, activation='elu'))
#model.add(Dense(2, activation='linear', name="bottleneck"))
#model.add(Dense(10, activation='elu'))
#model.add(Dense(20, activation='elu'))
#model.add(Dense(n_input, activation='sigmoid'))
#model.compile(loss = 'mean_squared_error', optimizer = Adam())
#model.summary()

#history = model.fit(x_train, x_train, batch_size = 16, epochs = 500, shuffle = False, verbose = 1, validation_split = 0.2)
#print("\n" + "Training Loss: ", history.history['loss'][-1])
#print("Validation Loss: ", history.history['val_loss'][-1])
#plt.plot(history.history['loss'])
#plt.plot(history.history['val_loss'])
#plt.title('Model Loss')
#plt.ylabel('Loss')
#plt.xlabel('Epoch')
#plt.legend(['Train', 'Validate'], loc='upper right')
#plt.show()

#encoder = Model(model.input, model.get_layer('bottleneck').output)
#bottleneck_representation = encoder.predict(x_train)

# PLOT DIMENSIONALITY REDUCTION
#plt.scatter(bottleneck_representation[:,0], bottleneck_representation[:,1], c = y_train, s = 10, cmap = 'tab10')
#plt.title('Autoencoder')
#plt.xlabel("Dimension 1")
#plt.ylabel("Dimension 2")
#plt.show()

# VISUALIZE AUTOENCODER
#from ann_visualizer.visualize import ann_viz
#ann_viz(model, title = "Autoencoder", view = True)
--------------------------------------------------------------------------------
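
UMAP is stochastic, so the embedding produced by dim_reduct_scNMTseq.py will differ between runs. A minimal sketch of an optional tweak (not in the original script) that pins the result with umap-learn's random_state parameter; the input matrix here is a synthetic stand-in:

# Hypothetical reproducibility tweak: fix the UMAP seed so repeated runs agree
import numpy as np
from umap import UMAP

x_train = np.random.normal(size = (120, 20))  # stand-in for the PCA-reduced matrix
model_umap = UMAP(n_neighbors = 11, min_dist = 0.1, n_components = 2, random_state = 123)
umap_embedding = model_umap.fit_transform(x_train)
print(umap_embedding.shape)  # (120, 2)
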
/scNMTseq.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se

import os
import numpy as np
import pandas as pd
from umap import UMAP
from sklearn.manifold import TSNE
from keras.layers import Input, Dense, Dropout, concatenate
from keras.models import Model
from keras.utils import plot_model
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

########################################## READ AND TRANSFORM DATA ##############################################
os.chdir("/home/nikolay/WABI/Misc/SingleCell/scNMTseq")
scRNAseq = pd.read_csv('scRNAseq.txt',sep='\t')
scBSseq = pd.read_csv('scBSseq.txt',sep='\t')
scATACseq = pd.read_csv('scATACseq.txt',sep='\t')
print(scRNAseq.shape)
print(scBSseq.shape)
print(scATACseq.shape)
print("\n")
print(scRNAseq.iloc[0:5,0:5])
print(scBSseq.iloc[0:5,0:5])
print(scATACseq.iloc[0:5,0:5])

# Last column holds the cluster assignments, the remaining columns are features
X_scRNAseq = scRNAseq.values[:,0:(scRNAseq.shape[1]-1)]
Y_scRNAseq = scRNAseq.values[:,scRNAseq.shape[1]-1]
X_scBSseq = scBSseq.values[:,0:(scBSseq.shape[1]-1)]
Y_scBSseq = scBSseq.values[:,scBSseq.shape[1]-1]
X_scATACseq = scATACseq.values[:,0:(scATACseq.shape[1]-1)]
Y_scATACseq = scATACseq.values[:,scATACseq.shape[1]-1]
print("\n")
print(X_scRNAseq[0:5,0:5])
print(Y_scRNAseq[0:5])

# LOG-TRANSFORM DATA
X_scRNAseq = np.log(X_scRNAseq + 1)
print(X_scRNAseq[0:5,0:5])
X_scBSseq = np.log(X_scBSseq + 1)
print(X_scBSseq[0:5,0:5])
X_scATACseq = np.log(X_scATACseq + 1)
print(X_scATACseq[0:5,0:5])

################################################## AUTOENCODER ##################################################

# Input Layer
ncol_scRNAseq = X_scRNAseq.shape[1]
input_dim_scRNAseq = Input(shape = (ncol_scRNAseq, ), name = "scRNAseq")
ncol_scBSseq = X_scBSseq.shape[1]
input_dim_scBSseq = Input(shape = (ncol_scBSseq, ), name = "scBSseq")
ncol_scATACseq = X_scATACseq.shape[1]
input_dim_scATACseq = Input(shape = (ncol_scATACseq, ), name = "scATACseq")

# Dimensions of Encoder for each OMIC
#encoding_dim_scRNAseq = 18
#encoding_dim_scBSseq = 26
#encoding_dim_scATACseq = 3

encoding_dim_scRNAseq = 30
encoding_dim_scBSseq = 30
encoding_dim_scATACseq = 30

# Dropout on Input Layer
dropout_scRNAseq = Dropout(0.2, name = "Dropout_scRNAseq")(input_dim_scRNAseq)
dropout_scBSseq = Dropout(0.2, name = "Dropout_scBSseq")(input_dim_scBSseq)
dropout_scATACseq = Dropout(0.2, name = "Dropout_scATACseq")(input_dim_scATACseq)

# Encoder layer for each OMIC
encoded_scRNAseq = Dense(encoding_dim_scRNAseq, activation = 'elu', name = "Encoder_scRNAseq")(dropout_scRNAseq)
encoded_scBSseq = Dense(encoding_dim_scBSseq, activation = 'elu', name = "Encoder_scBSseq")(dropout_scBSseq)
encoded_scATACseq = Dense(encoding_dim_scATACseq, activation = 'elu', name = "Encoder_scATACseq")(dropout_scATACseq)

# Merging Encoder layers from different OMICs
merge = concatenate([encoded_scRNAseq, encoded_scBSseq, encoded_scATACseq])

# Bottleneck compression
bottleneck = Dense(50, kernel_initializer = 'uniform', activation = 'linear', name = "Bottleneck")(merge)

# Inverse merging
merge_inverse = Dense(encoding_dim_scRNAseq + encoding_dim_scBSseq + encoding_dim_scATACseq, activation = 'elu', name = "Concatenate_Inverse")(bottleneck)

# Decoder layer for each OMIC
decoded_scRNAseq = Dense(ncol_scRNAseq, activation = 'sigmoid', name = "Decoder_scRNAseq")(merge_inverse)
decoded_scBSseq = Dense(ncol_scBSseq, activation = 'sigmoid', name = "Decoder_scBSseq")(merge_inverse)
decoded_scATACseq = Dense(ncol_scATACseq, activation = 'sigmoid', name = "Decoder_scATACseq")(merge_inverse)

# Combining Encoder and Decoder into an Autoencoder model
autoencoder = Model(inputs = [input_dim_scRNAseq, input_dim_scBSseq, input_dim_scATACseq], outputs = [decoded_scRNAseq, decoded_scBSseq, decoded_scATACseq])

# Compile Autoencoder (note the per-OMIC losses: MSE for scRNAseq, binary cross-entropy for scBSseq and scATACseq)
autoencoder.compile(optimizer = 'adam', loss={'Decoder_scRNAseq': 'mean_squared_error', 'Decoder_scBSseq': 'binary_crossentropy', 'Decoder_scATACseq': 'binary_crossentropy'})
autoencoder.summary()

# Autoencoder graph (requires the pydot and graphviz packages)
plot_model(autoencoder, to_file='autoencoder_graph.png')

# Autoencoder training
estimator = autoencoder.fit([X_scRNAseq, X_scBSseq, X_scATACseq], [X_scRNAseq, X_scBSseq, X_scATACseq], epochs = 130, batch_size = 16, validation_split = 0.2, shuffle = True, verbose = 1)
print("Training Loss: ",estimator.history['loss'][-1])
print("Validation Loss: ",estimator.history['val_loss'][-1])
#plt.figure(figsize=(20, 15))
plt.plot(estimator.history['loss'])
plt.plot(estimator.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper right')
plt.show()

# Encoder model: maps the three inputs to the bottleneck representation
encoder = Model(inputs = [input_dim_scRNAseq, input_dim_scBSseq, input_dim_scATACseq], outputs = bottleneck)
bottleneck_representation = encoder.predict([X_scRNAseq, X_scBSseq, X_scATACseq])
print(pd.DataFrame(bottleneck_representation).shape)
print(pd.DataFrame(bottleneck_representation).iloc[0:5,0:5])

# Dimensionality reduction plot
#plt.figure(figsize=(20, 15))
plt.scatter(bottleneck_representation[:, 0], bottleneck_representation[:, 1], c = Y_scRNAseq, cmap = 'tab10', s = 10)
plt.title('Autoencoder: Data Integration, scNMTseq')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
#plt.colorbar()
plt.show()

# tSNE on Autoencoder bottleneck representation
model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y_scRNAseq, cmap = 'tab10', s = 10)
plt.title('tSNE on Autoencoder: Data Integration, scNMTseq')
plt.xlabel("tSNE1") 137 | plt.ylabel("tSNE2") 138 | plt.show() 139 | 140 | # UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP) 141 | model_umap = UMAP(n_neighbors = 11, min_dist = 0.1, n_components = 2) 142 | umap = model_umap.fit_transform(bottleneck_representation) 143 | plt.scatter(umap[:, 0], umap[:, 1], c = Y_scRNAseq, cmap = 'tab10', s = 10) 144 | plt.title('UMAP on Autoencoder: Data Integration, scNMTseq') 145 | plt.xlabel("UMAP1") 146 | plt.ylabel("UMAP2") 147 | plt.show() 148 | -------------------------------------------------------------------------------- /tSNE_on_Autoencoder_CITEseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se 3 | 4 | import sys 5 | import keras 6 | import numpy as np 7 | import pandas as pd 8 | from keras import regularizers 9 | from keras.layers import Dense 10 | import matplotlib.pyplot as plt 11 | from keras.layers import Dropout 12 | from sklearn.manifold import TSNE 13 | from keras.optimizers import Adam 14 | from sklearn.decomposition import PCA 15 | from keras.models import Sequential, Model 16 | 17 | import warnings 18 | warnings.filterwarnings("ignore") 19 | 20 | # READ DATA 21 | infile = str(sys.argv[1]) 22 | print("\n" + "You specified input file: " + infile + "\n") 23 | 24 | expr = pd.read_csv(infile,sep='\t') 25 | print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n") 26 | print("\n" + "A few first lines of input file: " + "\n") 27 | print(expr.iloc[0:4, 0:4]) 28 | print("\n" + "Last column corresponds to cluster assignments: " + "\n") 29 | print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]]) 30 | 31 | # LOG-TRANSFORM DATA 32 | X = expr.values[:,0:(expr.shape[1]-1)] 33 | Y = expr.values[:,expr.shape[1]-1] 34 | print("\n" + "You have following unique cluster labels: " + "\n") 35 | print(set(Y)) 36 | print("\n" + "Log-transforming data..." 
+ "\n") 37 | X = np.log(X + 1) 38 | 39 | # REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA) 40 | n_input = 10 41 | x_train = PCA(n_components = n_input).fit_transform(X) 42 | y_train = Y 43 | #plt.figure(figsize=(20, 15)) 44 | plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab20', s = 10) 45 | plt.title('Principal Component Analysis (PCA)') 46 | plt.xlabel("PC1") 47 | plt.ylabel("PC2") 48 | plt.show() 49 | print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n") 50 | 51 | # REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE) 52 | model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1) 53 | tsne = model_tsne.fit_transform(x_train) 54 | #plt.figure(figsize=(20, 15)) 55 | plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab20', s = 10) 56 | plt.title('tSNE on PCA') 57 | plt.xlabel("tSNE1") 58 | plt.ylabel("tSNE2") 59 | plt.show() 60 | 61 | # REDUCE DIMENSIONS WITH AUTOENCODER 62 | model = Sequential() 63 | model.add(Dropout(0.2, input_shape=(X.shape[1],))) 64 | model.add(Dense(10, activation = 'elu')) 65 | model.add(Dense(8, activation = 'elu')) 66 | model.add(Dense(6, activation = 'elu')) 67 | model.add(Dense(4, activation = 'linear', name = "bottleneck")) 68 | model.add(Dense(6, activation = 'elu')) 69 | model.add(Dense(8, activation = 'elu')) 70 | model.add(Dense(10, activation = 'elu')) 71 | model.add(Dense(X.shape[1], activation = 'sigmoid')) 72 | model.compile(loss = 'mean_squared_error', optimizer = Adam()) 73 | model.summary() 74 | 75 | history = model.fit(X, X, batch_size = 128, epochs = 100, shuffle = True, verbose = 1, validation_split = 0.2) 76 | print("\n" + "Training Accuracy: ", history.history['loss'][-1]) 77 | print("Validation Accuracy: ", history.history['val_loss'][-1], "\n") 78 | plt.plot(history.history['loss']) 79 | plt.plot(history.history['val_loss']) 80 | plt.title('Model Loss') 81 | plt.ylabel('Loss') 82 | plt.xlabel('Epoch') 83 | plt.legend(['Train', 'Validate'], loc='upper right') 84 | plt.show() 85 | 86 | encoder = Model(model.input, model.get_layer('bottleneck').output) 87 | bottleneck_representation = encoder.predict(X) 88 | 89 | model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1) 90 | tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation) 91 | #plt.figure(figsize=(20, 15)) 92 | plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y, cmap = 'tab20', s = 10) 93 | plt.title('tSNE on Autoencoder: 8 Layers') 94 | plt.xlabel("tSNE1") 95 | plt.ylabel("tSNE2") 96 | plt.show() 97 | 98 | # VISUALIZE AUTOENCODER 99 | #from ann_visualizer.visualize import ann_viz 100 | #ann_viz(model, title = "Autoencoder", view = True) 101 | -------------------------------------------------------------------------------- /tsne_on_autoencoder_scNMTseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se 3 | 4 | import sys 5 | import keras 6 | import numpy as np 7 | import pandas as pd 8 | from keras import regularizers 9 | from keras.layers import Dense 10 | import matplotlib.pyplot as plt 11 | from keras.layers import Dropout 12 | from sklearn.manifold import TSNE 13 | from keras.optimizers import Adam 14 | from sklearn.decomposition import PCA 15 | from keras.models import Sequential, Model 16 | 17 | import warnings 18 | 
/tsne_on_autoencoder_scNMTseq.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
# Run this script as: python tsne_on_autoencoder_scNMTseq.py <input_file>

import sys
import numpy as np
import pandas as pd
from keras import regularizers
from keras.layers import Dense
import matplotlib.pyplot as plt
from keras.layers import Dropout
from sklearn.manifold import TSNE
from keras.optimizers import Adam
from sklearn.decomposition import PCA
from keras.models import Sequential, Model

import warnings
warnings.filterwarnings("ignore")

# READ DATA
infile = str(sys.argv[1])
print("\n" + "You specified input file: " + infile + "\n")

expr = pd.read_csv(infile,sep='\t')
print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n")
print("\n" + "First few lines of input file: " + "\n")
print(expr.iloc[0:4, 0:4])
print("\n" + "Last column corresponds to cluster assignments: " + "\n")
print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]])

# LOG-TRANSFORM DATA
X = expr.values[:,0:(expr.shape[1]-1)]
Y = expr.values[:,expr.shape[1]-1]
print("\n" + "You have the following unique cluster labels: " + "\n")
print(set(Y))
print("\n" + "Log-transforming data..." + "\n")
X = np.log(X + 1)

# REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA)
n_input = 5
x_train = PCA(n_components = n_input).fit_transform(X)
y_train = Y
plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab10', s = 10)
plt.title('Principal Component Analysis (PCA)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n")

# REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE)
model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
tsne = model_tsne.fit_transform(x_train)
plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab10', s = 10)
plt.title('tSNE on PCA')
plt.xlabel("tSNE1")
plt.ylabel("tSNE2")
plt.show()

# REDUCE DIMENSIONS WITH AUTOENCODER (trained on the full log-transformed matrix, not the PCA scores)
model = Sequential()
model.add(Dropout(0.2, input_shape=(X.shape[1],)))
model.add(Dense(20, activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))  # L1 strength 10e-5 = 1e-4
model.add(Dense(10, activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))
model.add(Dense(5, activation = 'linear', name = "bottleneck"))
model.add(Dense(10, activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))
model.add(Dense(20, activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))
model.add(Dense(X.shape[1], activation = 'sigmoid'))
model.compile(loss = 'mean_squared_error', optimizer = Adam())
model.summary()

history = model.fit(X, X, batch_size = 16, epochs = 200, shuffle = True, verbose = 1, validation_split = 0.2)
print("\n" + "Training Loss: ", history.history['loss'][-1])
print("Validation Loss: ", history.history['val_loss'][-1], "\n")
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validate'], loc='upper right')
plt.show()

# Extract the 5D bottleneck activations from the trained Autoencoder
encoder = Model(model.input, model.get_layer('bottleneck').output)
bottleneck_representation = encoder.predict(X)

# tSNE on the bottleneck representation
model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y, cmap = 'tab10', s = 10)
plt.title('tSNE on Autoencoder')
plt.xlabel("tSNE1")
plt.ylabel("tSNE2")
plt.show()

# VISUALIZE AUTOENCODER
#from ann_visualizer.visualize import ann_viz
#ann_viz(model, title = "Autoencoder", view = True)
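
tSNE results are sensitive to the perplexity parameter, which must stay below the number of cells (the value 11 above suits the small scNMTseq data set). A minimal sketch, with synthetic data standing in for the bottleneck representation, of scanning a few perplexities to check the stability of the embedding:

# Hypothetical perplexity scan on a small synthetic data set
import numpy as np
from sklearn.manifold import TSNE

data = np.random.normal(size = (100, 5))  # stand-in for bottleneck_representation
for perplexity in [5, 11, 30]:
    tsne = TSNE(n_components = 2, perplexity = perplexity, random_state = 123).fit_transform(data)
    print("perplexity =", perplexity, "-> embedding shape:", tsne.shape)
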
--------------------------------------------------------------------------------