├── CITEseq.py
├── README.md
├── dim_reduct_CITEseq.py
├── dim_reduct_scNMTseq.py
├── scNMTseq.py
├── scProteomics.txt
├── scRNAseq.txt
├── tSNE_on_Autoencoder_CITEseq.py
└── tsne_on_autoencoder_scNMTseq.py


/CITEseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
  3 | 
  4 | import os
  5 | import numpy as np
  6 | import pandas as pd
  7 | from umap import UMAP
  8 | import matplotlib as mpl
  9 | from sklearn.manifold import TSNE
 10 | from keras.layers import Input, Dense, Dropout
 11 | from keras.layers.merge import concatenate
 12 | from keras.models import Model
 13 | from keras.utils import plot_model
 14 | import matplotlib.pyplot as plt
 15 | 
 16 | import warnings
 17 | warnings.filterwarnings('ignore')
 18 | 
 19 | ########################################## READ AND TRANSFORM DATA ##############################################
 20 | os.chdir("/home/nikolay/WABI/Misc/SingleCell/CITEseq/")
 21 | scRNAseq = pd.read_csv('scRNAseq.txt',sep='\t')
 22 | scProteomics = pd.read_csv('scProteomics.txt',sep='\t')
 23 | print(scRNAseq.shape)
 24 | print(scProteomics.shape)
 25 | print("\n")
 26 | print(scRNAseq.iloc[0:5,0:5])
 27 | print(scProteomics.iloc[0:5,0:5])
 28 | 
 29 | X_scRNAseq = scRNAseq.values[:,0:(scRNAseq.shape[1]-1)]
 30 | Y_scRNAseq = scRNAseq.values[:,scRNAseq.shape[1]-1]
 31 | X_scProteomics = scProteomics.values[:,0:(scProteomics.shape[1]-1)]
 32 | Y_scProteomics = scProteomics.values[:,scProteomics.shape[1]-1]
 33 | print("\n")
 34 | print(X_scRNAseq[0:5,0:5])
 35 | print(Y_scRNAseq[0:5])
 36 | 
 37 | # LOG-TRANSFORM DATA
 38 | X_scRNAseq = np.log(X_scRNAseq + 1)
 39 | print(X_scRNAseq[0:5,0:5])
 40 | X_scProteomics = np.log(X_scProteomics + 1)
 41 | print(X_scProteomics[0:5,0:5])
 42 | 
 43 | ################################################## AUTOENCODER ##################################################
 44 | 
 45 | # Input Layer
 46 | ncol_scRNAseq = X_scRNAseq.shape[1]
 47 | input_dim_scRNAseq = Input(shape = (ncol_scRNAseq, ), name = "scRNAseq")
 48 | ncol_scProteomics = X_scProteomics.shape[1]
 49 | input_dim_scProteomics = Input(shape = (ncol_scProteomics, ), name = "scProteomics")
 50 | 
 51 | # Dimensions of Encoder for each OMIC
 52 | encoding_dim_scRNAseq = 50
 53 | encoding_dim_scProteomics = 10
 54 | 
 55 | # Encoder layer for each OMIC
 56 | encoded_scRNAseq = Dense(encoding_dim_scRNAseq, activation = 'linear', name = "Encoder_scRNAseq")(input_dim_scRNAseq)
 57 | encoded_scProteomics = Dense(encoding_dim_scProteomics, activation = 'linear', name = "Encoder_scProteomics")(input_dim_scProteomics)
 58 | 
 59 | # Merging Encoder layers from different OMICs
 60 | merge = concatenate([encoded_scRNAseq, encoded_scProteomics])
 61 | 
 62 | # Bottleneck compression
 63 | bottleneck = Dense(50, kernel_initializer = 'uniform', activation = 'linear', name = "Bottleneck")(merge)
 64 | 
 65 | #Inverse merging
 66 | merge_inverse = Dense(encoding_dim_scRNAseq + encoding_dim_scProteomics, activation = 'elu', name = "Concatenate_Inverse")(bottleneck)
 67 | 
 68 | # Decoder layer for each OMIC
 69 | decoded_scRNAseq = Dense(ncol_scRNAseq, activation = 'sigmoid', name = "Decoder_scRNAseq")(merge_inverse)
 70 | decoded_scProteomics = Dense(ncol_scProteomics, activation = 'sigmoid', name = "Decoder_scProteomics")(merge_inverse)
 71 | 
 72 | # Combining Encoder and Decoder into an Autoencoder model
 73 | autoencoder = Model(input = [input_dim_scRNAseq, input_dim_scProteomics], output = [decoded_scRNAseq, decoded_scProteomics])
 74 | 
 75 | # Compile Autoencoder
 76 | autoencoder.compile(optimizer = 'adam', loss={'Decoder_scRNAseq': 'mean_squared_error', 'Decoder_scProteomics': 'mean_squared_error'})
 77 | autoencoder.summary()
 78 | 
 79 | # Autoencoder graph
 80 | plot_model(autoencoder, to_file='autoencoder_graph.png')
 81 | 
 82 | # Autoencoder training
 83 | estimator = autoencoder.fit([X_scRNAseq, X_scProteomics], [X_scRNAseq, X_scProteomics], epochs = 100, batch_size = 128, validation_split = 0.2, shuffle = True, verbose = 1)
 84 | print("Training Loss: ",estimator.history['loss'][-1])
 85 | print("Validation Loss: ",estimator.history['val_loss'][-1])
 86 | #plt.figure(figsize=(20, 15))
 87 | plt.plot(estimator.history['loss'])
 88 | plt.plot(estimator.history['val_loss'])
 89 | plt.title('Model Loss')
 90 | plt.ylabel('Loss')
 91 | plt.xlabel('Epoch')
 92 | plt.legend(['Train','Validation'], loc = 'upper right')
 93 | plt.show()
 94 | 
 95 | # Encoder model
 96 | encoder = Model(input = [input_dim_scRNAseq, input_dim_scProteomics], output = bottleneck)
 97 | bottleneck_representation = encoder.predict([X_scRNAseq, X_scProteomics])
 98 | print(pd.DataFrame(bottleneck_representation).shape)
 99 | print(pd.DataFrame(bottleneck_representation).iloc[0:5,0:5])
100 | 
101 | # Dimensionality reduction plot
102 | #plt.figure(figsize=(20, 15))
103 | plt.scatter(bottleneck_representation[:, 0], bottleneck_representation[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
104 | plt.title('Autoencoder Data Integration')
105 | plt.xlabel('Dimension 1')
106 | plt.ylabel('Dimension 2')
107 | #plt.colorbar()
108 | plt.show()
109 | 
110 | # tSNE on Autoencoder bottleneck representation
111 | model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1)
112 | tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
113 | plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
114 | plt.title('tSNE on Autoencoder: Data Integration, CITEseq')
115 | plt.xlabel("tSNE1")
116 | plt.ylabel("tSNE2")
117 | plt.show()
118 | 
119 | # UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP)
120 | #model_umap = UMAP(n_neighbors = 20, min_dist = 0.3, n_components = 2)
121 | #umap = model_umap.fit_transform(bottleneck_representation)
122 | #plt.scatter(umap[:, 0], umap[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
123 | #plt.title('UMAP on Autoencoder')
124 | #plt.xlabel("UMAP1")
125 | #plt.ylabel("UMAP2")
126 | #plt.show()
127 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Here I show how to use Deep Learning for biological and biomedical Data Integration
2 | 


--------------------------------------------------------------------------------
/dim_reduct_CITEseq.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
 3 | # Run this script as python dim_reduct_CITEseq.py scRNAseq.txt
 4 | 
 5 | import sys
 6 | import keras
 7 | import numpy as np
 8 | import pandas as pd
 9 | from keras.layers import Dense
10 | import matplotlib.pyplot as plt
11 | from sklearn.manifold import TSNE
12 | from keras.optimizers import Adam
13 | from sklearn.decomposition import PCA
14 | from keras.models import Sequential, Model
15 | 
16 | import warnings
17 | warnings.filterwarnings("ignore")
18 | 
19 | # READ DATA
20 | infile = str(sys.argv[1])
21 | print("\n" + "You specified input file: " + infile + "\n")
22 | 
23 | expr = pd.read_csv(infile,sep='\t')
24 | print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n")
25 | print("\n" + "A few first lines of input file: " + "\n")
26 | print(expr.iloc[0:4, 0:4])
27 | print("\n" + "Last column corresponds to cluster assignments: " + "\n")
28 | print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]])
29 | 
30 | # LOG-TRANSFORM DATA
31 | X = expr.values[:,0:(expr.shape[1]-1)]
32 | Y = expr.values[:,expr.shape[1]-1]
33 | print("\n" + "You have following unique cluster labels: " + "\n")
34 | print(set(Y))
35 | print("\n" + "Log-transforming data..." + "\n")
36 | X = np.log(X + 1)
37 | 
38 | # REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA)
39 | n_input = 50
40 | x_train = PCA(n_components = n_input).fit_transform(X)
41 | y_train = Y
42 | #plt.figure(figsize=(20, 15))
43 | plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab20', s = 10)
44 | plt.title('Principal Component Analysis (PCA)')
45 | plt.xlabel("PC1")
46 | plt.ylabel("PC2")
47 | plt.show()
48 | print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n")
49 | 
50 | # REDUCE DIMENSIONS WITH AUTOENCODER
51 | model = Sequential()
52 | model.add(Dense(30,       activation='elu', input_shape=(n_input,)))
53 | model.add(Dense(20,       activation='elu'))
54 | model.add(Dense(10,       activation='elu'))
55 | model.add(Dense(2,        activation='linear', name="bottleneck"))
56 | model.add(Dense(10,       activation='elu'))
57 | model.add(Dense(20,       activation='elu'))
58 | model.add(Dense(30,       activation='elu'))
59 | model.add(Dense(n_input,  activation='sigmoid'))
60 | model.compile(loss = 'mean_squared_error', optimizer = Adam())
61 | model.summary()
62 | 
63 | history = model.fit(x_train, x_train, batch_size = 128, epochs = 500, shuffle = False, verbose = 1, validation_split = 0.2)
64 | print("\n" + "Training Loss: ", history.history['loss'][-1])
65 | print("Validation Loss: ", history.history['val_loss'][-1])
66 | plt.plot(history.history['loss'])
67 | plt.plot(history.history['val_loss'])
68 | plt.title('Model Loss')
69 | plt.ylabel('Loss')
70 | plt.xlabel('Epoch')
71 | plt.legend(['Train', 'Validate'], loc='upper right')
72 | plt.show()
73 | 
74 | encoder = Model(model.input, model.get_layer('bottleneck').output)
75 | bottleneck_representation = encoder.predict(x_train)
76 | 
77 | # PLOT DIMENSIONALITY REDUCTION
78 | #plt.figure(figsize=(20, 15)) 
79 | plt.scatter(bottleneck_representation[:,0], bottleneck_representation[:,1], c = y_train, s = 10, cmap = 'tab20')
80 | plt.title('Autoencoder: 8 Layers')
81 | plt.xlabel("Dimension 1")
82 | plt.ylabel("Dimension 2")
83 | plt.show()
84 | 
85 | # REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE)
86 | #model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1)
87 | #tsne = model_tsne.fit_transform(x_train)
88 | #plt.figure(figsize=(20, 15))
89 | #plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab20', s = 10)
90 | #plt.title('tSNE on PCA')
91 | #plt.xlabel("tSNE1")
92 | #plt.ylabel("tSNE2")
93 | #plt.show()
94 | 
95 | # VISUALIZE AUTOENCODER
96 | #from ann_visualizer.visualize import ann_viz
97 | #ann_viz(model, title = "Autoencoder", view = True)
98 | 


--------------------------------------------------------------------------------
/dim_reduct_scNMTseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
  3 | 
  4 | import sys
  5 | import keras
  6 | import numpy as np
  7 | import pandas as pd
  8 | from umap import UMAP
  9 | from keras.layers import Dense
 10 | import matplotlib.pyplot as plt
 11 | from sklearn.manifold import TSNE
 12 | from keras.optimizers import Adam
 13 | from sklearn.decomposition import PCA
 14 | from keras.models import Sequential, Model
 15 | 
 16 | import warnings
 17 | warnings.filterwarnings("ignore")
 18 | 
 19 | # READ DATA
 20 | infile = str(sys.argv[1])
 21 | print("\n" + "You specified input file: " + infile + "\n")
 22 | 
 23 | expr = pd.read_csv(infile,sep='\t')
 24 | print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n")
 25 | print("\n" + "A few first lines of input file: " + "\n")
 26 | print(expr.iloc[0:4, 0:4])
 27 | print("\n" + "Last column corresponds to cluster assignments: " + "\n")
 28 | print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]])
 29 | 
 30 | # LOG-TRANSFORM DATA
 31 | X = expr.values[:,0:(expr.shape[1]-1)]
 32 | Y = expr.values[:,expr.shape[1]-1]
 33 | print("\n" + "You have following unique cluster labels: " + "\n")
 34 | print(set(Y))
 35 | print("\n" + "Log-transforming data..." + "\n")
 36 | X = np.log(X + 1)
 37 | 
 38 | # REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA)
 39 | n_input = 20
 40 | x_train = PCA(n_components = n_input).fit_transform(X)
 41 | y_train = Y
 42 | plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab10', s = 10)
 43 | plt.title('Principal Component Analysis (PCA)')
 44 | plt.xlabel("PC1")
 45 | plt.ylabel("PC2")
 46 | plt.show()
 47 | print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n")
 48 | 
 49 | # REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE)
 50 | model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
 51 | tsne = model_tsne.fit_transform(x_train)
 52 | plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab10', s = 10)
 53 | plt.title('tSNE on PCA')
 54 | plt.xlabel("tSNE1")
 55 | plt.ylabel("tSNE2")
 56 | plt.show()
 57 | 
 58 | # UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP)
 59 | model_umap = UMAP(n_neighbors = 11, min_dist = 0.1, n_components = 2)
 60 | umap = model_umap.fit_transform(x_train)
 61 | plt.scatter(umap[:, 0], umap[:, 1], c = y_train, cmap = 'tab10', s = 10)
 62 | plt.title('UMAP on PCA: scNMTseq, scRNAseq')
 63 | plt.xlabel("UMAP1")
 64 | plt.ylabel("UMAP2")
 65 | plt.show()
 66 | 
 67 | 
 68 | # REDUCE DIMENSIONS WITH AUTOENCODER
 69 | #model = Sequential()
 70 | #model.add(Dense(20,    activation='elu', input_shape=(n_input,)))
 71 | #model.add(Dense(10,    activation='elu'))
 72 | #model.add(Dense(2,     activation='linear', name="bottleneck"))
 73 | #model.add(Dense(10,    activation='elu'))
 74 | #model.add(Dense(20,    activation='elu'))
 75 | #model.add(Dense(n_input,  activation='sigmoid'))
 76 | #model.compile(loss = 'mean_squared_error', optimizer = Adam())
 77 | #model.summary()
 78 | 
 79 | #history = model.fit(x_train, x_train, batch_size = 16, epochs = 500, shuffle = False, verbose = 1, validation_split = 0.2)
 80 | #print("\n" + "Training Accuracy: ", history.history['loss'][-1])
 81 | #print("Validation Accuracy: ", history.history['val_loss'][-1])
 82 | #plt.plot(history.history['loss'])
 83 | #plt.plot(history.history['val_loss'])
 84 | #plt.title('Model Loss')
 85 | #plt.ylabel('Loss')
 86 | #plt.xlabel('Epoch')
 87 | #plt.legend(['Train', 'Validate'], loc='upper right')
 88 | #plt.show()
 89 | 
 90 | #encoder = Model(model.input, model.get_layer('bottleneck').output)
 91 | #bottleneck_representation = encoder.predict(x_train)
 92 | 
 93 | # PLOT DIMENSIONALITY REDUCTION 
 94 | #plt.scatter(bottleneck_representation[:,0], bottleneck_representation[:,1], c = y_train, s = 10, cmap = 'tab10')
 95 | #plt.title('Autoencoder')
 96 | #plt.xlabel("Dimension 1")
 97 | #plt.ylabel("Dimension 2")
 98 | #plt.show()
 99 | 
100 | # VISUALIZE AUTOENCODER
101 | #from ann_visualizer.visualize import ann_viz
102 | #ann_viz(model, title = "Autoencoder", view = True)
103 | 


--------------------------------------------------------------------------------
/scNMTseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
  3 | 
  4 | import os
  5 | import numpy as np
  6 | import pandas as pd
  7 | from umap import UMAP
  8 | import matplotlib as mpl
  9 | from sklearn.manifold import TSNE
 10 | from keras.layers import Input, Dense, Dropout
 11 | from keras.layers.merge import concatenate
 12 | from keras.models import Model
 13 | from keras.utils import plot_model
 14 | import matplotlib.pyplot as plt
 15 | 
 16 | import warnings
 17 | warnings.filterwarnings('ignore')
 18 | 
 19 | ########################################## READ AND TRANSFORM DATA ##############################################
 20 | os.chdir("/home/nikolay/WABI/Misc/SingleCell/scNMTseq")
 21 | scRNAseq = pd.read_csv('scRNAseq.txt',sep='\t')
 22 | scBSseq = pd.read_csv('scBSseq.txt',sep='\t')
 23 | scATACseq = pd.read_csv('scATACseq.txt',sep='\t')
 24 | print(scRNAseq.shape)
 25 | print(scBSseq.shape)
 26 | print(scATACseq.shape)
 27 | print("\n")
 28 | print(scRNAseq.iloc[0:5,0:5])
 29 | print(scBSseq.iloc[0:5,0:5])
 30 | print(scATACseq.iloc[0:5,0:5])
 31 | 
 32 | X_scRNAseq = scRNAseq.values[:,0:(scRNAseq.shape[1]-1)]
 33 | Y_scRNAseq = scRNAseq.values[:,scRNAseq.shape[1]-1]
 34 | X_scBSseq = scBSseq.values[:,0:(scBSseq.shape[1]-1)]
 35 | Y_scBSseq = scBSseq.values[:,scBSseq.shape[1]-1]
 36 | X_scATACseq = scATACseq.values[:,0:(scATACseq.shape[1]-1)]
 37 | Y_scATACseq = scATACseq.values[:,scATACseq.shape[1]-1]
 38 | print("\n")
 39 | print(X_scRNAseq[0:5,0:5])
 40 | print(Y_scRNAseq[0:5])
 41 | 
 42 | # LOG-TRANSFORM DATA
 43 | X_scRNAseq = np.log(X_scRNAseq + 1)
 44 | print(X_scRNAseq[0:5,0:5])
 45 | X_scBSseq = np.log(X_scBSseq + 1)
 46 | print(X_scBSseq[0:5,0:5])
 47 | X_scATACseq = np.log(X_scATACseq + 1)
 48 | print(X_scATACseq[0:5,0:5])
 49 | 
 50 | ################################################## AUTOENCODER ##################################################
 51 | 
 52 | # Input Layer
 53 | ncol_scRNAseq = X_scRNAseq.shape[1]
 54 | input_dim_scRNAseq = Input(shape = (ncol_scRNAseq, ), name = "scRNAseq")
 55 | ncol_scBSseq = X_scBSseq.shape[1]
 56 | input_dim_scBSseq = Input(shape = (ncol_scBSseq, ), name = "scBSseq")
 57 | ncol_scATACseq = X_scATACseq.shape[1]
 58 | input_dim_scATACseq = Input(shape = (ncol_scATACseq, ), name = "scATACseq")
 59 | 
 60 | # Dimensions of Encoder for each OMIC
 61 | #encoding_dim_scRNAseq = 18
 62 | #encoding_dim_scBSseq = 26
 63 | #encoding_dim_scATACseq = 3
 64 | 
 65 | encoding_dim_scRNAseq = 30
 66 | encoding_dim_scBSseq = 30
 67 | encoding_dim_scATACseq = 30
 68 | 
 69 | # Dropout on Input Layer
 70 | dropout_scRNAseq = Dropout(0.2, name = "Dropout_scRNAseq")(input_dim_scRNAseq)
 71 | dropout_scBSseq = Dropout(0.2, name = "Dropout_scBSseq")(input_dim_scBSseq)
 72 | dropout_scATACseq = Dropout(0.2, name = "Dropout_scATACseq")(input_dim_scATACseq) 
 73 | 
 74 | # Encoder layer for each OMIC
 75 | encoded_scRNAseq = Dense(encoding_dim_scRNAseq, activation = 'elu', name = "Encoder_scRNAseq")(dropout_scRNAseq)
 76 | encoded_scBSseq = Dense(encoding_dim_scBSseq, activation = 'elu', name = "Encoder_scBSseq")(dropout_scBSseq)
 77 | encoded_scATACseq = Dense(encoding_dim_scATACseq, activation = 'elu', name = "Encoder_scATACseq")(dropout_scATACseq)
 78 | 
 79 | # Merging Encoder layers from different OMICs
 80 | merge = concatenate([encoded_scRNAseq, encoded_scBSseq, encoded_scATACseq])
 81 | 
 82 | # Bottleneck compression
 83 | bottleneck = Dense(50, kernel_initializer = 'uniform', activation = 'linear', name = "Bottleneck")(merge)
 84 | 
 85 | #Inverse merging
 86 | merge_inverse = Dense(encoding_dim_scRNAseq + encoding_dim_scBSseq + encoding_dim_scATACseq, activation = 'elu', name = "Concatenate_Inverse")(bottleneck)
 87 | 
 88 | # Decoder layer for each OMIC
 89 | decoded_scRNAseq = Dense(ncol_scRNAseq, activation = 'sigmoid', name = "Decoder_scRNAseq")(merge_inverse)
 90 | decoded_scBSseq = Dense(ncol_scBSseq, activation = 'sigmoid', name = "Decoder_scBSseq")(merge_inverse)
 91 | decoded_scATACseq = Dense(ncol_scATACseq, activation = 'sigmoid', name = "Decoder_scATACseq")(merge_inverse)
 92 | 
 93 | # Combining Encoder and Decoder into an Autoencoder model
 94 | autoencoder = Model(input = [input_dim_scRNAseq, input_dim_scBSseq, input_dim_scATACseq], output = [decoded_scRNAseq, decoded_scBSseq, decoded_scATACseq])
 95 | 
 96 | # Compile Autoencoder
 97 | autoencoder.compile(optimizer = 'adam', loss={'Decoder_scRNAseq': 'mean_squared_error', 'Decoder_scBSseq': 'binary_crossentropy', 'Decoder_scATACseq': 'binary_crossentropy'})
 98 | autoencoder.summary()
 99 | 
100 | # Autoencoder graph
101 | plot_model(autoencoder, to_file='autoencoder_graph.png')
102 | 
103 | # Autoencoder training
104 | estimator = autoencoder.fit([X_scRNAseq, X_scBSseq, X_scATACseq], [X_scRNAseq, X_scBSseq, X_scATACseq], epochs = 130, batch_size = 16, validation_split = 0.2, shuffle = True, verbose = 1)
105 | print("Training Loss: ",estimator.history['loss'][-1])
106 | print("Validation Loss: ",estimator.history['val_loss'][-1])
107 | #plt.figure(figsize=(20, 15))
108 | plt.plot(estimator.history['loss'])
109 | plt.plot(estimator.history['val_loss'])
110 | plt.title('Model Loss')
111 | plt.ylabel('Loss')
112 | plt.xlabel('Epoch')
113 | plt.legend(['Train','Validation'], loc = 'upper right')
114 | plt.show()
115 | 
116 | # Encoder model
117 | encoder = Model(input = [input_dim_scRNAseq, input_dim_scBSseq, input_dim_scATACseq], output = bottleneck)
118 | bottleneck_representation = encoder.predict([X_scRNAseq, X_scBSseq, X_scATACseq])
119 | print(pd.DataFrame(bottleneck_representation).shape)
120 | print(pd.DataFrame(bottleneck_representation).iloc[0:5,0:5])
121 | 
122 | # Dimensionality reduction plot
123 | #plt.figure(figsize=(20, 15))
124 | plt.scatter(bottleneck_representation[:, 0], bottleneck_representation[:, 1], c = Y_scRNAseq, cmap = 'tab10', s = 10)
125 | plt.title('Autoencoder: Data Integration, scNMTseq')
126 | plt.xlabel('Dimension 1')
127 | plt.ylabel('Dimension 2')
128 | #plt.colorbar()
129 | plt.show()
130 | 
131 | # tSNE on Autoencoder bottleneck representation
132 | model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
133 | tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
134 | plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y_scRNAseq, cmap = 'tab10', s = 10)
135 | plt.title('tSNE on Autoencoder: Data Integration, scNMTseq')
136 | plt.xlabel("tSNE1")
137 | plt.ylabel("tSNE2")
138 | plt.show()
139 | 
140 | # UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP)
141 | model_umap = UMAP(n_neighbors = 11, min_dist = 0.1, n_components = 2)
142 | umap = model_umap.fit_transform(bottleneck_representation)
143 | plt.scatter(umap[:, 0], umap[:, 1], c = Y_scRNAseq, cmap = 'tab10', s = 10)
144 | plt.title('UMAP on Autoencoder: Data Integration, scNMTseq')
145 | plt.xlabel("UMAP1")
146 | plt.ylabel("UMAP2")
147 | plt.show()
148 | 


--------------------------------------------------------------------------------
/tSNE_on_Autoencoder_CITEseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
  3 | 
  4 | import sys
  5 | import keras
  6 | import numpy as np
  7 | import pandas as pd
  8 | from keras import regularizers
  9 | from keras.layers import Dense
 10 | import matplotlib.pyplot as plt
 11 | from keras.layers import Dropout
 12 | from sklearn.manifold import TSNE
 13 | from keras.optimizers import Adam
 14 | from sklearn.decomposition import PCA
 15 | from keras.models import Sequential, Model
 16 | 
 17 | import warnings
 18 | warnings.filterwarnings("ignore")
 19 | 
 20 | # READ DATA
 21 | infile = str(sys.argv[1])
 22 | print("\n" + "You specified input file: " + infile + "\n")
 23 | 
 24 | expr = pd.read_csv(infile,sep='\t')
 25 | print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n")
 26 | print("\n" + "A few first lines of input file: " + "\n")
 27 | print(expr.iloc[0:4, 0:4])
 28 | print("\n" + "Last column corresponds to cluster assignments: " + "\n")
 29 | print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]])
 30 | 
 31 | # LOG-TRANSFORM DATA
 32 | X = expr.values[:,0:(expr.shape[1]-1)]
 33 | Y = expr.values[:,expr.shape[1]-1]
 34 | print("\n" + "You have following unique cluster labels: " + "\n")
 35 | print(set(Y))
 36 | print("\n" + "Log-transforming data..." + "\n")
 37 | X = np.log(X + 1)
 38 | 
 39 | # REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA)
 40 | n_input = 10
 41 | x_train = PCA(n_components = n_input).fit_transform(X)
 42 | y_train = Y
 43 | #plt.figure(figsize=(20, 15))
 44 | plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab20', s = 10)
 45 | plt.title('Principal Component Analysis (PCA)')
 46 | plt.xlabel("PC1")
 47 | plt.ylabel("PC2")
 48 | plt.show()
 49 | print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n")
 50 | 
 51 | # REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE)
 52 | model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1)
 53 | tsne = model_tsne.fit_transform(x_train)
 54 | #plt.figure(figsize=(20, 15))
 55 | plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab20', s = 10)
 56 | plt.title('tSNE on PCA')
 57 | plt.xlabel("tSNE1")
 58 | plt.ylabel("tSNE2")
 59 | plt.show()
 60 | 
 61 | # REDUCE DIMENSIONS WITH AUTOENCODER
 62 | model = Sequential()
 63 | model.add(Dropout(0.2,  input_shape=(X.shape[1],)))
 64 | model.add(Dense(10,     activation = 'elu'))
 65 | model.add(Dense(8,      activation = 'elu'))
 66 | model.add(Dense(6,      activation = 'elu'))
 67 | model.add(Dense(4,      activation = 'linear', name = "bottleneck"))
 68 | model.add(Dense(6,      activation = 'elu'))
 69 | model.add(Dense(8,      activation = 'elu'))
 70 | model.add(Dense(10,     activation = 'elu'))
 71 | model.add(Dense(X.shape[1],   activation = 'sigmoid'))
 72 | model.compile(loss = 'mean_squared_error', optimizer = Adam())
 73 | model.summary()
 74 | 
 75 | history = model.fit(X, X, batch_size = 128, epochs = 100, shuffle = True, verbose = 1, validation_split = 0.2)
 76 | print("\n" + "Training Accuracy: ", history.history['loss'][-1])
 77 | print("Validation Accuracy: ", history.history['val_loss'][-1], "\n")
 78 | plt.plot(history.history['loss'])
 79 | plt.plot(history.history['val_loss'])
 80 | plt.title('Model Loss')
 81 | plt.ylabel('Loss')
 82 | plt.xlabel('Epoch')
 83 | plt.legend(['Train', 'Validate'], loc='upper right')
 84 | plt.show()
 85 | 
 86 | encoder = Model(model.input, model.get_layer('bottleneck').output)
 87 | bottleneck_representation = encoder.predict(X)
 88 | 
 89 | model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1)
 90 | tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
 91 | #plt.figure(figsize=(20, 15))
 92 | plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y, cmap = 'tab20', s = 10)
 93 | plt.title('tSNE on Autoencoder: 8 Layers')
 94 | plt.xlabel("tSNE1")
 95 | plt.ylabel("tSNE2")
 96 | plt.show()
 97 | 
 98 | # VISUALIZE AUTOENCODER
 99 | #from ann_visualizer.visualize import ann_viz
100 | #ann_viz(model, title = "Autoencoder", view = True)
101 | 


--------------------------------------------------------------------------------
/tsne_on_autoencoder_scNMTseq.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # Nikolay Oskolkov, WABI Long-Term Support, nikolay.oskolkov@scilifelab.se
 3 | 
 4 | import sys
 5 | import keras
 6 | import numpy as np
 7 | import pandas as pd
 8 | from keras import regularizers
 9 | from keras.layers import Dense
10 | import matplotlib.pyplot as plt
11 | from keras.layers import Dropout
12 | from sklearn.manifold import TSNE
13 | from keras.optimizers import Adam
14 | from sklearn.decomposition import PCA
15 | from keras.models import Sequential, Model
16 | 
17 | import warnings
18 | warnings.filterwarnings("ignore")
19 | 
20 | # READ DATA
21 | infile = str(sys.argv[1])
22 | print("\n" + "You specified input file: " + infile + "\n")
23 | 
24 | expr = pd.read_csv(infile,sep='\t')
25 | print("\n" + "Dimensions of input file: " + str(expr.shape) + "\n")
26 | print("\n" + "A few first lines of input file: " + "\n")
27 | print(expr.iloc[0:4, 0:4])
28 | print("\n" + "Last column corresponds to cluster assignments: " + "\n")
29 | print(expr.iloc[0:4, (expr.shape[1]-4):expr.shape[1]])
30 | 
31 | # LOG-TRANSFORM DATA
32 | X = expr.values[:,0:(expr.shape[1]-1)]
33 | Y = expr.values[:,expr.shape[1]-1]
34 | print("\n" + "You have following unique cluster labels: " + "\n")
35 | print(set(Y))
36 | print("\n" + "Log-transforming data..." + "\n")
37 | X = np.log(X + 1)
38 | 
39 | # REDUCE DIMENSIONS WITH PRINCIPAL COMPONENT ANALYSIS (PCA)
40 | n_input = 5
41 | x_train = PCA(n_components = n_input).fit_transform(X)
42 | y_train = Y
43 | plt.scatter(x_train[:, 0], x_train[:, 1], c = y_train, cmap = 'tab10', s = 10)
44 | plt.title('Principal Component Analysis (PCA)')
45 | plt.xlabel("PC1")
46 | plt.ylabel("PC2")
47 | plt.show()
48 | print("\n" + "Dimensions of reduced data set: " + str(x_train.shape) + "\n")
49 | 
50 | # REDUCE DIMENSIONS WITH T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING (tSNE)
51 | model_tsne = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
52 | tsne = model_tsne.fit_transform(x_train)
53 | plt.scatter(tsne[:, 0], tsne[:, 1], c = y_train, cmap = 'tab10', s = 10)
54 | plt.title('tSNE on PCA')
55 | plt.xlabel("tSNE1")
56 | plt.ylabel("tSNE2")
57 | plt.show()
58 | 
59 | # REDUCE DIMENSIONS WITH AUTOENCODER
60 | model = Sequential()
61 | model.add(Dropout(0.2,  input_shape=(X.shape[1],)))
62 | model.add(Dense(20,     activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))
63 | model.add(Dense(10,     activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))
64 | model.add(Dense(5,      activation = 'linear', name = "bottleneck"))
65 | model.add(Dense(10,     activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))
66 | model.add(Dense(20,     activation = 'elu', activity_regularizer=regularizers.l1(10e-5)))
67 | model.add(Dense(X.shape[1],   activation = 'sigmoid'))
68 | model.compile(loss = 'mean_squared_error', optimizer = Adam())
69 | model.summary()
70 | 
71 | history = model.fit(X, X, batch_size = 16, epochs = 200, shuffle = True, verbose = 1, validation_split = 0.2)
72 | print("\n" + "Training Accuracy: ", history.history['loss'][-1])
73 | print("Validation Accuracy: ", history.history['val_loss'][-1], "\n")
74 | plt.plot(history.history['loss'])
75 | plt.plot(history.history['val_loss'])
76 | plt.title('Model Loss')
77 | plt.ylabel('Loss')
78 | plt.xlabel('Epoch')
79 | plt.legend(['Train', 'Validate'], loc='upper right')
80 | plt.show()
81 | 
82 | encoder = Model(model.input, model.get_layer('bottleneck').output)
83 | bottleneck_representation = encoder.predict(X)
84 | 
85 | model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 11, n_iter = 1000, verbose = 1)
86 | tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
87 | plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y, cmap = 'tab10', s = 10)
88 | plt.title('tSNE on Autoencoder')
89 | plt.xlabel("tSNE1")
90 | plt.ylabel("tSNE2")
91 | plt.show()
92 | 
93 | # VISUALIZE AUTOENCODER
94 | #from ann_visualizer.visualize import ann_viz
95 | #ann_viz(model, title = "Autoencoder", view = True)
96 | 


--------------------------------------------------------------------------------