├── README.md
├── dataset
    ├── Ecoli.py
    └── rnaSeq.py
├── eval.py
├── imputer.py
├── models
    ├── Embedding
    │   └── model.py
    └── End_to_End
    │   ├── layers.py
    │   └── nets.py
├── options.py
├── requirements.txt
├── run.py
├── train_test.py
└── utils
    └── functions.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Graph Feature Autoencoder
 2 | This repository implements the Graph Feature Auto-Encoder described in the accompanying paper on the prediction of gene expression values.
 3 | 
 4 | ## Requirements
 5 | Please install the packages listed in requirements.txt before use:
 6 | - torch==1.4.0
 7 | - torch-cluster==1.5.4
 8 | - torch-geometric==1.5.0
 9 | - torch-scatter==2.0.4
10 | - torch-sparse==0.6.0
11 | - scikit-learn==0.21.3
12 | - numpy==1.16.4
13 | - networkx==2.3
14 | - magic-impute==2.0.3
15 | 
16 | You can install all of the required libraries with:
17 | ```
18 | pip install -r requirements.txt
19 | ```
20 | ## Dataset
21 | 
22 | Datasets for both E. coli and mouse are available at:
23 | 
24 | https://drive.google.com/drive/folders/1wQCwjwkkfmzydWW3DPgvj-JEOv05_shj?usp=sharing
25 | 
26 | Download the datasets and place them in a local directory, then pass that directory to the scripts with the `--datadir` option (see options.py).
27 | 
28 | 
29 | ## Experiments 
30 | 
31 | You can run the experiments using the run.py file.
32 | 
33 | The options for the different experiments are defined in options.py.
34 | 
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/dataset/Ecoli.py:
--------------------------------------------------------------------------------
  1 | from torch_geometric.data import InMemoryDataset,Data
  2 | from torch_geometric.utils import dense_to_sparse, to_undirected, remove_self_loops
  3 | import torch
  4 | import numpy as np
  5 | from sklearn.model_selection import train_test_split
  6 | import pandas as pd
  7 | import os.path as osp
  8 | 
  9 | 
 10 | class Ecoli_Exp(InMemoryDataset):
 11 |     def __init__(self, root, network='TF_net', imputation=False, Normalize=False, transform=None, pre_transform=None):
 12 |         self.network = network
 13 |         self.normalize = Normalize
 14 |         self.imputation = imputation
 15 |         super(InMemoryDataset, self).__init__(root, transform, pre_transform)
 16 |         self.data, self.slices = torch.load(self.processed_paths[0])
 17 | 
 18 | 
 19 |     @property
 20 |     def raw_file_names(self):
 21 |         return []
 22 | 
 23 | 
 24 |     @property
 25 |     def processed_file_names(self):
 26 |         return ['processed_{}_imputation_data.pt'.format(self.network) if self.imputation
 27 |                 else 'processed_{}_data'.format(self.network)]
 28 | 
 29 |     def download(self):
 30 |         pass
 31 | 
 32 | 
 33 |     def read_TF_net(self, root):
 34 |         TF_gene = pd.read_csv(root + "/network_tf_gene.txt", skiprows=34, header=None, usecols=[0, 1, 2, 4],
 35 |                               delimiter='\t')
 36 |         TF_gene = TF_gene.apply(lambda x: x.astype(str).str.lower())
 37 |         TF_gene = TF_gene[((TF_gene[2] == '-') | (TF_gene[2] == '+'))]
 38 |         TF_gene = TF_gene.drop_duplicates(subset=[0, 1])
 39 | 
 40 |         Ecoli = pd.read_table(root + '/avg_E_coli_v4_Build_6_exps466probes4297.tab')
 41 | 
 42 |         Ecoli['E_coli_v4_Build_6:genes'] = Ecoli['E_coli_v4_Build_6:genes'].str.split('_').str[0]
 43 |         Ecoli = Ecoli.apply(lambda x: x.astype(str).str.lower())
 44 |         Ecoli = Ecoli.rename(columns={"E_coli_v4_Build_6:genes": "Genes"})
 45 | 
 46 |         tf_regdb = TF_gene[0].isin(Ecoli.Genes) & TF_gene[1].isin(Ecoli.Genes)
 47 |         Filt_regDB = TF_gene[tf_regdb]
 48 |         Filt_DB_genes = np.union1d(Filt_regDB[0].unique(), Filt_regDB[1].unique())
 49 |         Ecoli_Filt = Ecoli[Ecoli.Genes.isin(Filt_DB_genes)]
 50 | 
 51 |         Adj = np.zeros([len(Filt_DB_genes), len(Filt_DB_genes)])
 52 |         features = np.zeros([len(Filt_DB_genes), Ecoli.shape[1] - 1])
 53 |         for i in range(len(Filt_regDB)):
 54 |             row = np.where(Filt_DB_genes == Filt_regDB.iloc[i][0])[0][0]
 55 |             col = np.where(Filt_DB_genes == Filt_regDB.iloc[i][1])[0][0]
 56 |             if (Filt_regDB.iloc[i][2] == '+'):
 57 |                 Adj[row][col] = 1
 58 |                 Adj[col][row] = 1
 59 |             else:
 60 |                 Adj[row][col] = 1
 61 |                 Adj[row][col] = 1
 62 | 
 63 |         print(len(Adj.nonzero()[0]))
 64 |         for i in range(len(Filt_DB_genes)):
 65 |             features[i] = Ecoli[Ecoli.Genes == Filt_DB_genes[i]].iloc[:, 1:]
 66 | 
 67 |         return dense_to_sparse(torch.tensor(Adj))[0], torch.tensor(features, dtype=torch.float32)
 68 | 
 69 |     def read_PPI(self, root):
 70 |         BioGrid = pd.read_csv(root + '/BIOGRID-ORGANISM-Escherichia_coli_K12_W3110-3.5.180.tab2.txt', delimiter='\t')
 71 |         BioGrid['Official Symbol Interactor A'] = BioGrid['Official Symbol Interactor A'].str.lower()
 72 |         BioGrid['Official Symbol Interactor B'] = BioGrid['Official Symbol Interactor B'].str.lower()
 73 |         BioGrid = BioGrid.rename(
 74 |             columns={"Official Symbol Interactor A": "Gene_A", "Official Symbol Interactor B": "Gene_B"})
 75 | 
 76 |         Ecoli = pd.read_table(root + '/avg_E_coli_v4_Build_6_exps466probes4297.tab')
 77 | 
 78 |         Ecoli['E_coli_v4_Build_6:genes'] = Ecoli['E_coli_v4_Build_6:genes'].str.split('_').str[0]
 79 |         Ecoli = Ecoli.apply(lambda x: x.astype(str).str.lower())
 80 |         Ecoli = Ecoli.rename(columns={"E_coli_v4_Build_6:genes": "Genes"})
 81 | 
 82 |         Filt_BioGrid_indeces = BioGrid.Gene_A.isin(Ecoli.Genes) & BioGrid.Gene_B.isin(
 83 |             Ecoli.Genes)  # & BioGrid['Experimental System Name'] != 'Biochemical Activity'
 84 |         Filt_BioGrid = BioGrid[Filt_BioGrid_indeces]
 85 |         Filt_BioGrid_PPI = Filt_BioGrid[Filt_BioGrid['Experimental System Type'] == 'physical']
 86 |         Filt_BioGrid_PPI_Genes = np.union1d(Filt_BioGrid_PPI.Gene_A.unique(), Filt_BioGrid_PPI.Gene_B.unique())
 87 |         Ecoli_Filt_PPI = Ecoli[Ecoli.Genes.isin(Filt_BioGrid_PPI_Genes)]
 88 | 
 89 |         Adj = np.zeros([len(Filt_BioGrid_PPI_Genes), len(Filt_BioGrid_PPI_Genes)])
 90 |         features = np.zeros([len(Filt_BioGrid_PPI_Genes), Ecoli_Filt_PPI.shape[1] - 1])
 91 |         for i in range(len(Filt_BioGrid_PPI)):
 92 |             row = np.where(Filt_BioGrid_PPI_Genes == Filt_BioGrid_PPI.iloc[i][7])[0][0]
 93 |             col = np.where(Filt_BioGrid_PPI_Genes == Filt_BioGrid_PPI.iloc[i][8])[0][0]
 94 |             Adj[row][col] = 1
 95 |             Adj[col][row] = 1
 96 | 
 97 |         for i in range(len(Filt_BioGrid_PPI_Genes)):
 98 |             features[i] = Ecoli[Ecoli.Genes == Filt_BioGrid_PPI_Genes[i]].iloc[:, 1:]
 99 | 
100 |         return dense_to_sparse(torch.tensor(Adj))[0], torch.tensor(features, dtype=torch.float32)
101 | 
102 |     def read_Genetic(self, root):
103 |         BioGrid = pd.read_csv(root + '/BIOGRID-ORGANISM-Escherichia_coli_K12_W3110-3.5.180.tab2.txt', delimiter='\t')
104 |         BioGrid['Official Symbol Interactor A'] = BioGrid['Official Symbol Interactor A'].str.lower()
105 |         BioGrid['Official Symbol Interactor B'] = BioGrid['Official Symbol Interactor B'].str.lower()
106 |         BioGrid = BioGrid.rename(
107 |             columns={"Official Symbol Interactor A": "Gene_A", "Official Symbol Interactor B": "Gene_B"})
108 | 
109 |         Ecoli = pd.read_table(root + '/avg_E_coli_v4_Build_6_exps466probes4297.tab')
110 | 
111 |         Ecoli['E_coli_v4_Build_6:genes'] = Ecoli['E_coli_v4_Build_6:genes'].str.split('_').str[0]
112 |         Ecoli = Ecoli.apply(lambda x: x.astype(str).str.lower())
113 |         Ecoli = Ecoli.rename(columns={"E_coli_v4_Build_6:genes": "Genes"})
114 | 
115 |         Filt_BioGrid_indeces = BioGrid.Gene_A.isin(Ecoli.Genes) & BioGrid.Gene_B.isin(
116 |             Ecoli.Genes)  # & BioGrid['Experimental System Name'] != 'Biochemical Activity'
117 |         Filt_BioGrid = BioGrid[Filt_BioGrid_indeces]
118 |         Filt_BioGrid_Genetic = Filt_BioGrid[Filt_BioGrid['Experimental System Type'] == 'genetic']
119 |         Filt_BioGrid_Genetic_Genes = np.union1d(Filt_BioGrid_Genetic.Gene_A.unique(),
120 |                                                 Filt_BioGrid_Genetic.Gene_B.unique())
121 |         Ecoli_Filt_Genetic = Ecoli[Ecoli.Genes.isin(Filt_BioGrid_Genetic_Genes)]
122 | 
123 |         Adj = np.zeros([len(Filt_BioGrid_Genetic_Genes), len(Filt_BioGrid_Genetic_Genes)])
124 |         features = np.zeros([len(Filt_BioGrid_Genetic_Genes), Ecoli_Filt_Genetic.shape[1] - 1])
125 |         for i in range(len(Filt_BioGrid_Genetic)):
126 |             row = np.where(Filt_BioGrid_Genetic_Genes == Filt_BioGrid_Genetic.iloc[i][7])[0][0]
127 |             col = np.where(Filt_BioGrid_Genetic_Genes == Filt_BioGrid_Genetic.iloc[i][8])[0][0]
128 |             Adj[row][col] = 1
129 |             Adj[col][row] = 1
130 | 
131 |         for i in range(len(Filt_BioGrid_Genetic_Genes)):
132 |             features[i] = Ecoli[Ecoli.Genes == Filt_BioGrid_Genetic_Genes[i]].iloc[:, 1:]
133 | 
134 |         return dense_to_sparse(torch.tensor(Adj))[0], torch.tensor(features, dtype=torch.float32)
135 | 
136 | 
137 |     def process(self):
138 |         if self.network == 'TF_net':
139 |             edge_index, x = self.read_TF_net(self.root)
140 |         elif self.network == 'PPI':
141 |             edge_index, x = self.read_PPI(self.root)
142 |         elif self.network == 'Genetic':
143 |             edge_index, x = self.read_Genetic(self.root)
144 |         assert self.network in ['TF_net', 'PPI', 'Genetic'], 'currently supported graphs are Transcription factors,' \
145 |                                                              ' Protein-Protein Interaction, and Genetics for E-Coli'
146 |         y = x
147 |         edge_index = to_undirected(edge_index)
148 |         edge_index = remove_self_loops(edge_index)[0]
149 |         data = Data(x=x, edge_index=edge_index, y=y)
150 |         if self.imputation:
151 |             data.indices = np.indices([x.size(0), x.size(1)]).reshape(2, -1)
152 | 
153 |         torch.save(self.collate([data]), self.processed_paths[0])
154 | 
155 | 
156 | 


--------------------------------------------------------------------------------
/dataset/rnaSeq.py:
--------------------------------------------------------------------------------
 1 | from torch_geometric.data import InMemoryDataset,Data
 2 | from torch_geometric.utils import dense_to_sparse, to_undirected, remove_self_loops
 3 | import torch
 4 | import numpy as np
 5 | from sklearn.model_selection import train_test_split
 6 | 
 7 | 
 8 | 
 9 | class RnaSeq(InMemoryDataset):
10 | 
11 |     def __init__(self, root, network='MousePPI', transform=None, pre_transform=None):
12 |         self.network = network
13 |         super(InMemoryDataset, self).__init__(root, transform, pre_transform)
14 | 
15 |         if self.network == 'MousePPI':
16 |             self.data, self.slices = torch.load(self.processed_paths[0])
17 | 
18 |     @property
19 |     def raw_file_names(self):
20 |         return []
21 | 
22 |     @property
23 |     def processed_file_names(self):
24 |         return ['MousePPI_processed_rnaSeq_data.pt']
25 | 
26 |     def download(self):
27 |         pass
28 | 
29 |     def index_to_mask(self, indices, index, shape):
30 |         mask = torch.zeros(shape)
31 |         mask[indices[0, index], indices[1, index]] = 1
32 |         return mask
33 | 
34 |     def process(self):
35 |         edge_index = torch.tensor(np.array(
36 |             np.load(self.root + '/' + self.network + '.npy', allow_pickle=True), dtype=np.int))
37 |         gene_names = None
38 |         if self.network == 'MousePPI':
39 |             x = torch.tensor(np.load(self.root + '/' + 'mouse_rnaSeq.npy', allow_pickle=True), dtype=torch.float)
40 |         else:
41 |             x = np.load(self.root + '/' + 'rnaSeq.npy', allow_pickle=True)
42 |             gene_names = x[:, 0]
43 |             x = torch.tensor(np.array(x[:, 1:], dtype=np.float), dtype=torch.float)
44 |         print(x.size(0))
45 |         matrix_mask = torch.zeros([x.size(0), x.size(1)])
46 |         matrix_mask[x.nonzero(as_tuple=True)] = 1
47 |         indices = np.array(x.data.numpy().nonzero())
48 |         ix_train, ix_test = train_test_split(np.arange(len(indices[0])), test_size=0.25, random_state=42)
49 | 
50 |         data = Data(x=x, edge_index=edge_index, y=x, nonzeromask=matrix_mask)
51 |         data.gene_names = gene_names if gene_names is not None else None
52 |         if self.network == 'MousePPI':
53 |             torch.save(self.collate([data]), self.processed_paths[0])


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from sklearn.metrics import mean_squared_error as scimse
  3 | from torch_geometric.utils import to_undirected
  4 | import numpy as np
  5 | import networkx as nx
  6 | from sklearn.model_selection import KFold
  7 | from train_test import train_epoch, test
  8 | import copy
  9 | from sklearn.linear_model import LinearRegression
 10 | from sklearn.ensemble import RandomForestRegressor
 11 | from utils.functions import index_to_mask
 12 | from magic import MAGIC
 13 | 
 14 | def supervised_prediction_eval(model_class, data, opts):
 15 | 
 16 |     loss_train = []
 17 |     criterion = torch.nn.MSELoss()
 18 |     kf = KFold(n_splits=3, random_state=opts.seed)
 19 |     kf_feats = KFold(n_splits=3, random_state=opts.seed)
 20 | 
 21 |     mse = []
 22 | 
 23 |     for k, train_test_indices in enumerate(kf.split(data.x)):
 24 |         print('Fold number: {:d}'.format(k))
 25 |         y_pred = []
 26 |         train_index, test_index = train_test_indices
 27 |         eval_data = copy.deepcopy(data)
 28 |         if opts.random_graph:
 29 |             print('Random Graph used')
 30 |             G_rand = nx.gnp_random_graph(data.x.shape[0],opts.random_graph_alpha)
 31 |             eval_data.edge_index = to_undirected(torch.tensor(np.array(G_rand.edges()).T).to(opts.device))
 32 |             print(eval_data)
 33 |         train_feats_indeces, test_feats_indeces = next(kf_feats.split(np.arange(data.y.size(1))))
 34 |         if not opts.no_features:
 35 |             eval_data.x = data.x[:, train_feats_indeces]
 36 |         eval_data.y = data.y[:, test_feats_indeces]
 37 |         eval_data.train_mask = index_to_mask(train_index, eval_data.x.size(0))
 38 |         eval_data.test_mask = index_to_mask(test_index, eval_data.x.size(0))
 39 |         for exp_num in range(eval_data.y.size(1)):
 40 |             if (model_class == LinearRegression) | (model_class == RandomForestRegressor):
 41 |                 model = model_class()
 42 |                 model.fit(eval_data.x[eval_data.train_mask], eval_data.y[eval_data.train_mask, exp_num])
 43 |                 pred = model.predict(eval_data.x[eval_data.test_mask])
 44 |                 test_loss = scimse(pred,
 45 |                        eval_data.y[eval_data.test_mask, exp_num])
 46 |                 print('Exp: {:03d}, Loss: {:.5f}'
 47 |                       .format(exp_num, test_loss))
 48 |                 y_pred.append(pred)
 49 |             else:
 50 |                 torch.manual_seed(opts.seed)
 51 |                 if torch.cuda.is_available():
 52 |                     torch.cuda.manual_seed_all(opts.seed)
 53 | 
 54 |                 model = model_class(eval_data.num_features, opts).to(opts.device)
 55 |                 optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate)
 56 |                 best_loss = 1e9
 57 |                 for epoch in range(1, opts.epochs + 1):
 58 |                     loss_train = train_epoch(model, eval_data, optimizer, opts, exp_num, criterion)
 59 |                     if loss_train < best_loss:
 60 |                         best_loss = loss_train
 61 |                         best_model = copy.deepcopy(model)
 62 |                 loss_test = test(best_model, eval_data, exp_num, criterion, opts)
 63 |                 print('Exp: {:03d}, Loss: {:.5f}, TestLoss: {:.5f}'.
 64 |                       format(exp_num, loss_train, loss_test))
 65 |                 with torch.no_grad():
 66 |                     y_pred.append(best_model(eval_data))
 67 | 
 68 |         for i in range(eval_data.y.size(1)):
 69 |             if (model_class == LinearRegression) | (model_class == RandomForestRegressor):
 70 |                 mse.append(scimse(y_pred[i],
 71 |                                   eval_data.y[eval_data.test_mask, i]))
 72 |             else:
 73 |                 mse.append(scimse(y_pred[i][eval_data.test_mask.cpu().numpy()].cpu().numpy(),
 74 |                                   eval_data.y[eval_data.test_mask, i].cpu().numpy().reshape([-1, 1])))
 75 |     print('Average+-std Error for test expression values: {:.5f}+-{:.5f}'.format(np.mean(mse), np.std(mse)))
 76 |     return mse
 77 | 
 78 | def embedding_prediction_eval(model_class, data, opts):
 79 |     loss_train = []
 80 | 
 81 |     kf = KFold(n_splits=3, random_state=opts.seed, shuffle=True)
 82 |     kf_feats = KFold(n_splits=3, random_state=opts.seed, shuffle=True)
 83 | 
 84 |     mse_lr = []
 85 |     mse_rf = []
 86 | 
 87 |     for k, train_test_indices in enumerate(kf.split(data.x)):
 88 |         print('Fold number: {:d}'.format(k))
 89 |         y_pred = []
 90 |         train_index, test_index = train_test_indices
 91 |         eval_data = copy.deepcopy(data)
 92 |         train_feats_indeces, test_feats_indeces = next(kf_feats.split(np.arange(data.y.size(1))))
 93 |         if not opts.no_features:
 94 |             eval_data.x = data.x[:, train_feats_indeces]
 95 |         eval_data.y = data.y[:, test_feats_indeces]
 96 |         eval_data.train_mask = index_to_mask(train_index, eval_data.x.size(0))
 97 |         eval_data.test_mask = index_to_mask(test_index, eval_data.x.size(0))
 98 |         model = model_class(eval_data.num_features, 32).to(opts.device)
 99 |         if torch.cuda.is_available():
100 |             torch.cuda.manual_seed_all(opts.seed)
101 |         optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate)
102 |         print('Training the auto encoder!')
103 |         for epoch in range(1, opts.epochs + 1):
104 |             if epoch % 10 == 0:
105 |                 print('Epoch number: {:03d}'.format(epoch))
106 |             train_epoch(model, eval_data, optimizer, opts)
107 |         for exp_num in range(eval_data.y.size(1)):
108 |             torch.manual_seed(opts.seed)
109 |             z = model.encode(eval_data.x, eval_data.edge_index)
110 |             model.fit_predictor(z[eval_data.train_mask].cpu().data.numpy(),
111 |                                 eval_data.y[eval_data.train_mask, exp_num].cpu().data.numpy())
112 | 
113 |             loss_test_lr, loss_test_rf = test(model, eval_data, exp_num, scimse, opts)
114 |             model.eval()
115 |             print('Exp: {:03d}, TestLoss_lr: {:.5f}, TestLoss_rf: {:.5f}'.
116 |                   format(exp_num, loss_test_lr, loss_test_rf))
117 |             with torch.no_grad():
118 |                 y_pred.append(model.predict(eval_data.x, eval_data.edge_index))
119 |         for i in range(eval_data.y.size(1)):
120 |             mse_lr.append(scimse(y_pred[i][0][eval_data.test_mask.cpu().numpy()],
121 |                               eval_data.y[eval_data.test_mask, i].cpu().numpy().reshape([-1, 1])))
122 |             mse_rf.append(scimse(y_pred[i][1][eval_data.test_mask.cpu().numpy()],
123 |                                  eval_data.y[eval_data.test_mask, i].cpu().numpy().reshape([-1, 1])))
124 | 
125 |     print('Average+-std Error for test expression values LR: {:.5f}+-{:.5f}'.format(np.mean(mse_lr), np.std(mse_lr)))
126 |     print('Average+-std Error for test expression values RF: {:.5f}+-{:.5f}'.format(np.mean(mse_rf), np.std(mse_rf)))
127 |     return mse_lr, mse_rf
128 | 
129 | 
def imputation_eval(model_class, data, opts):
    """Evaluate imputation of masked matrix entries with 3-fold CV.

    The candidate entries (every entry for the ``'Ecoli'`` dataset, the
    non-zero entries otherwise) are split into folds. Test entries are zeroed
    in the input and the model is scored on them.

    Args:
        model_class: ``MAGIC`` or a class constructed as
            ``model_class(num_features, opts)``.
        data: Graph data object providing ``x``, ``y`` and, for ``MAGIC``,
            ``nonzeromask``. It is not modified.
        opts: Options object; ``seed``, ``dataset``, ``device``,
            ``learning_rate`` and ``epochs`` are read here.

    Returns:
        Mean test loss over the folds.
    """
    if model_class == MAGIC:
        # Work on a copy: transposing the caller's object in place would make
        # a second call operate on the un-transposed matrix again.
        data = copy.deepcopy(data)
        data.x = data.y = data.x.t()
        data.nonzeromask = data.nonzeromask.t()
    criterion = torch.nn.MSELoss()
    kf = KFold(n_splits=3, random_state=opts.seed, shuffle=True)
    loss_test = []
    if opts.dataset == 'Ecoli':
        indices = np.indices([data.x.size(0), data.x.size(1)]).reshape(2, -1)
    else:
        indices = np.array(data.x.cpu().data.numpy().nonzero())

    for k, (train_index, test_index) in enumerate(kf.split(np.arange(len(indices[0])))):
        print('Fold number: {:d}'.format(k))
        eval_data = copy.deepcopy(data)
        eval_data.train_mask = index_to_mask([indices[0, train_index], indices[1, train_index]],
                                             eval_data.x.size()).to(opts.device)
        eval_data.test_mask = index_to_mask([indices[0, test_index], indices[1, test_index]],
                                            eval_data.x.size()).to(opts.device)
        # Hide everything that is not a training entry from the model input.
        eval_data.x = eval_data.x * eval_data.train_mask
        if model_class == MAGIC:
            pred = model_class().fit_transform((eval_data.x*eval_data.train_mask).cpu().data.numpy())
            # NOTE(review): this MSE averages over all entries, including the
            # zeroed non-test ones — confirm it is comparable to test() below.
            loss_test.append(scimse(pred*eval_data.test_mask.cpu().data.numpy(),
                                    (eval_data.y*eval_data.test_mask).cpu().data.numpy()))
        else:
            model = model_class(eval_data.num_features, opts).to(opts.device)
            optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate)
            best_loss = 1e9
            # Fallbacks keep both names bound when no epoch runs or no epoch
            # improves on best_loss.
            best_model = model
            loss_train = float('nan')
            for epoch in range(1, opts.epochs + 1):
                loss_train = train_epoch(model, eval_data, optimizer, opts, criterion=criterion)
                if loss_train < best_loss:
                    best_loss = loss_train
                    best_model = copy.deepcopy(model)
                if epoch % 10 == 0:
                    print('Epoch number: {:03d}, Train_loss: {:.5f}'.format(epoch, loss_train))
            loss_test.append(test(best_model, eval_data, None, criterion, opts))
            print('Loss: {:.5f}, TestLoss: {:.5f}'.format(loss_train, loss_test[k]))
    print('Average+-std Error for test RNA values: {:.5f}+-{:.5f}'.format(np.mean(loss_test), np.std(loss_test)))
    return np.mean(loss_test)
169 | 


--------------------------------------------------------------------------------
/imputer.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from train_test import train_epoch, test
 3 | from models.End_to_End.nets import AE_MLP
 4 | def impute(model_class, data, opts):
 5 |     criterion = torch.nn.MSELoss()
 6 |     if model_class == AE_MLP:
 7 |         data.x = data.y = data.x.t()
 8 |         data.nonzeromask = data.nonzeromask.t()
 9 |     model = model_class(data.num_features, opts).to(opts.device)
10 |     optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate)
11 |     for epoch in range(1, opts.epochs + 1):
12 |         loss_train = train_epoch(model, data, optimizer, opts, criterion=criterion)
13 |         if epoch%100 == 0:
14 |             print('Exp: {:03d}, Loss: {:.5f}'.
15 |                   format(epoch, loss_train))
16 |     return model(data)
17 | 


--------------------------------------------------------------------------------
/models/Embedding/model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from torch_geometric.nn import GCNConv, GAE
 4 | from sklearn.linear_model import LinearRegression
 5 | from sklearn.ensemble import RandomForestRegressor
 6 | 
 7 | 
 8 | class Encoder(torch.nn.Module):
 9 |     def __init__(self, in_channels, out_channels):
10 |         super(Encoder, self).__init__()
11 |         self.conv1 = GCNConv(in_channels, 64, cached=True)
12 |         self.conv2 = GCNConv(64, out_channels, cached=True)
13 | 
14 |     def forward(self, x, edge_index):
15 |         x = F.relu(self.conv1(x, edge_index))
16 |         return self.conv2(x, edge_index)
17 | 
18 | 
class Embedding_ExpGAE(GAE):
    """Graph auto-encoder whose embeddings feed two scikit-learn regressors.

    A linear regression and a random forest (20 trees, depth 2) are fitted on
    the node embeddings produced by the GCN ``Encoder``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__(encoder=Encoder(in_channels, out_channels))
        self.predictor_lr = LinearRegression()
        self.predictor_rf = RandomForestRegressor(n_estimators=20, max_depth=2)

    def fit_predictor(self, z, y):
        """Fit both regressors on embeddings ``z`` and targets ``y``."""
        for regressor in (self.predictor_lr, self.predictor_rf):
            regressor.fit(z, y)

    def predict(self, x, edge_index):
        """Return ``(linear_prediction, forest_prediction)`` for the graph.

        Each regressor receives its own NumPy conversion of the same
        embedding tensor.
        """
        z = self.encode(x, edge_index)
        linear_prediction = self.predictor_lr.predict(z.cpu().data.numpy())
        forest_prediction = self.predictor_rf.predict(z.cpu().data.numpy())
        return linear_prediction, forest_prediction
32 | 


--------------------------------------------------------------------------------
/models/End_to_End/layers.py:
--------------------------------------------------------------------------------
 1 | from torch_geometric.nn.conv import MessagePassing
 2 | from torch_geometric.utils import add_remaining_self_loops
 3 | from torch.nn import Linear
 4 | import torch
 5 | 
 6 | 
 7 | class FeatGraphConv(MessagePassing):
 8 |     def __init__(self, in_channels,hidden , out_channels, aggr='mean', bias = True,
 9 |                  **kwargs):
10 |         super(FeatGraphConv, self).__init__(aggr=aggr, **kwargs)
11 |         self.lin1 = Linear(2*hidden, out_channels, bias=bias)
12 |         self.lin2 = Linear(in_channels, hidden, bias=bias)
13 |     def forward(self, x, edge_index, edge_weight=None, size=None):
14 |         edge_index,edge_weight = add_remaining_self_loops(edge_index=edge_index,edge_weight = edge_weight)
15 |         h = self.lin2(x)
16 |         return self.propagate(edge_index, size=size, x=x, h=h,
17 |                               edge_weight=edge_weight)
18 | 
19 |     def message(self, h_j, edge_weight):
20 |         return h_j if edge_weight is None else edge_weight.view(-1, 1) * h_j
21 | 
22 |     def update(self, aggr_out, h):
23 |         return self.lin1(torch.cat((h, aggr_out), 1))


--------------------------------------------------------------------------------
/models/End_to_End/nets.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from torch_geometric.nn import SAGEConv, GraphConv, GCNConv
  3 | import torch.nn as nn
  4 | import torch
  5 | from torch.nn import Linear as Lin
  6 | 
  7 | from models.End_to_End.layers import FeatGraphConv
  8 | 
  9 | 
 10 | class FAE_FeatGraphConv(nn.Module):
 11 |     def __init__(self, in_channels, opts):
 12 |         super(FAE_FeatGraphConv, self).__init__()
 13 |         self.opts = opts
 14 |         if self.opts.problem == 'Prediction':
 15 |             self.conv1 = FeatGraphConv(in_channels, 64, 64, aggr='mean')
 16 |             self.conv2 = FeatGraphConv(64, 32, 32, aggr='mean')
 17 |             self.lin = Lin(32, 1)
 18 |         else:
 19 |             self.conv1 = FeatGraphConv(in_channels, 64, 32, aggr='mean')
 20 |             self.lin = Lin(32, in_channels)
 21 |     def forward(self, data):
 22 |         if self.opts.problem == 'Prediction':
 23 |             x, edge_index = data.x, data.edge_index
 24 |             x = torch.relu(self.conv1(x, edge_index))
 25 |             x = torch.relu(self.conv2(x, edge_index))
 26 |             return self.lin(x)
 27 |         else:
 28 |             x, edge_index = data.x, data.edge_index
 29 |             x = torch.relu(self.conv1(x, edge_index))
 30 |             x = self.lin(x)
 31 |             return x
 32 | 
 33 | 
 34 | class FAE_SAGEConv(nn.Module):
 35 | 
 36 |     def __init__(self , in_channels, opts):
 37 |         super(FAE_SAGEConv, self).__init__()
 38 |         self.opts = opts
 39 |         if self.opts.problem == 'Prediction':
 40 |             self.conv1 = SAGEConv(in_channels, 64)
 41 |             self.conv2 = SAGEConv(64, 32)
 42 |             self.lin = Lin(32, 1)
 43 |         else:
 44 |             self.conv1 = SAGEConv(in_channels, 64)
 45 |             self.lin = Lin(64, in_channels)
 46 |     def forward(self, data):
 47 |         if self.opts.problem == 'Prediction':
 48 |             x, edge_index = data.x, data.edge_index
 49 |             x = torch.relu(self.conv1(x, edge_index))
 50 |             x = torch.relu(self.conv2(x, edge_index))
 51 |             x = self.lin(x)
 52 |             return x
 53 |         else:
 54 |             x, edge_index = data.x, data.edge_index
 55 |             x = torch.relu(self.conv1(x, edge_index))
 56 |             x = self.lin(x)
 57 |             return x
 58 | 
 59 | 
 60 | class FAE_GCN(nn.Module):
 61 |     def __init__(self, in_channels, opts):
 62 |         super(FAE_GCN, self).__init__()
 63 |         self.opts = opts
 64 |         if self.opts.problem == 'Prediction':
 65 |             self.conv1 = GCNConv(in_channels, 64)
 66 |             self.conv2 = GCNConv(64, 32)
 67 |             self.lin = Lin(32, 1)
 68 |         else:
 69 |             self.conv1 = GCNConv(in_channels, 64)
 70 |             # self.conv2 = GCNConv(64, 32)
 71 |             self.lin = Lin(64, in_channels)
 72 | 
 73 |     def forward(self, data):
 74 |         if self.opts.problem == 'Prediction':
 75 |             x, edge_index = data.x, data.edge_index
 76 |             x = torch.relu(self.conv1(x, edge_index))
 77 |             x = torch.relu(self.conv2(x, edge_index))
 78 |             x = self.lin(x)
 79 |             return x
 80 |         else:
 81 |             x, edge_index = data.x, data.edge_index
 82 |             x = torch.relu(self.conv1(x, edge_index))
 83 |             # x = torch.relu(self.conv2(x, edge_index))
 84 |             x = self.lin(x)
 85 |             return x
 86 | 
 87 | 
 88 | class FAE_GraphConv(nn.Module):
 89 |     def __init__(self, in_channels, opts):
 90 |         super(FAE_GraphConv, self).__init__()
 91 |         self.opts = opts
 92 |         if self.opts.problem == 'Prediction':
 93 |             self.conv1 = GraphConv(in_channels, 64)
 94 |             self.conv2 = GraphConv(64, 32, aggr='sum')
 95 |             self.lin = Lin(32, 1)
 96 |         else:
 97 |             self.conv1 = GraphConv(in_channels, 64)
 98 |             # self.conv2 = GraphConv(64, 32,aggr='mean')
 99 |             self.lin = Lin(64, in_channels)
100 | 
101 |     def forward(self, data):
102 |         if self.opts.problem == 'Prediction':
103 |             x, edge_index = data.x, data.edge_index
104 |             x = torch.relu(self.conv1(x, edge_index))
105 |             x = torch.relu(self.conv2(x, edge_index))
106 |             x = self.lin(x)
107 |             return x
108 |         else:
109 |             x, edge_index = data.x, data.edge_index
110 |             x = torch.relu(self.conv1(x, edge_index))
111 |             # x = torch.relu(self.conv2(x, edge_index))
112 |             x = self.lin(x)
113 |             return x
114 | 
115 | 
class AE_MLP(nn.Module):
    """Graph-free multilayer perceptron; only ``data.x`` is used.

    With ``opts.problem == 'Prediction'`` the layers are
    ``in_channels -> 64 -> 32 -> 1``. Otherwise they are
    ``in_channels -> 64 -> in_channels``.
    """

    def __init__(self, in_channels, opts):
        super().__init__()
        self.opts = opts
        self.lin1 = Lin(in_channels, 64)
        if opts.problem == 'Prediction':
            self.lin2 = Lin(64, 32)
            self.lin3 = Lin(32, 1)
        else:
            self.lin2 = Lin(64, in_channels)

    def forward(self, data):
        """Apply the linear layers with ReLU between them (none after the last)."""
        hidden = torch.relu(self.lin1(data.x))
        if self.opts.problem == 'Prediction':
            return self.lin3(torch.relu(self.lin2(hidden)))
        return self.lin2(hidden)
139 | 


--------------------------------------------------------------------------------
/options.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import argparse
 4 | import torch
 5 | 
 6 | def get_options(args=None):
 7 |     parser = argparse.ArgumentParser(
 8 |         description="Graph Feature Auto-Encoder for Prediction of Gene Expression Values")
 9 | 
10 |     # Data
11 |     parser.add_argument('--problem', default='Imputation_eval', help="Want to predict or Impute the dataset "
12 |                                "(Values in ['Prediction', 'Imputation', 'Imputation_eval'])")
13 |     parser.add_argument('--network', type=str, default='MousePPI', help="(Values in ['MousePPI'] for RNA and "
14 |                                                        "['TF_net', 'PPI', 'Genetic'] for Ecoli)")
15 |     parser.add_argument('--dataset', type=str, default='RNA', help="(Values in ['Ecoli', 'RNA'])")
16 |     parser.add_argument('--datadir', type=str, default='../data/Expression_Values/SingleCellRNA/Mouse', 
17 |                        help="Path to data directory")
18 | 
19 |     # Model
20 | 
21 |     parser.add_argument('--model', type=str, default='FeatGraphConv', help="Values in ['GraphConv', 'GCN', 'SAGEConv',"
22 |                                                                            "'FeatGraphConv','MLP' ,"
23 |                                                                            " 'Magic', 'LR', 'RF']")
24 |     parser.add_argument('--embedding', action='store_true', help='Whether to make predictions on the graph embedding '
25 |                                                                  '(only in prediction problem)')
26 |     parser.add_argument('--random_graph', action='store_true', help='Whether to make predictions based on random graph structure')
27 |     parser.add_argument('--random_graph_alpha', type=float, default=.021, help="alpha hyper=parameter for generating"
28 |                                                                                "random graph in Prediction problem")
29 |     parser.add_argument('--hidden', type=int, default=64)
30 |     parser.add_argument('--out_channels', type=int, default=32)
31 | 
32 |     # Training
33 | 
34 |     parser.add_argument('--learning_rate', type=float, default=0.001)
35 |     parser.add_argument('--epochs', type=float, default=20000)
36 |     parser.add_argument('--seed', type=int, default=12345, help='Random seed to use')
37 |     parser.add_argument('--no_cuda', action='store_true', help='Disable CUDA')
38 |     parser.add_argument('--no_features', action='store_true',
39 |                         help='Whether to use Expression values as node features or not')
40 | 
41 |     opts = parser.parse_args(args)
42 | 
43 |     opts.use_cuda = torch.cuda.is_available() and not opts.no_cuda
44 | 
45 |     return opts


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch==1.4.0
 2 | torch-cluster==1.5.4
 3 | torch-geometric==1.5.0
 4 | torch-scatter==2.0.4
 5 | torch-sparse==0.6.0
 6 | scikit-learn==0.21.3
 7 | numpy==1.16.4
 8 | networkx==2.3
 9 | magic-impute==2.0.3
10 | 


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import pprint as pp
 3 | 
 4 | 
 5 | import torch
 6 | import numpy as np
 7 | 
 8 | from options import get_options
 9 | 
10 | from utils.functions import load_data_class, load_model
11 | from eval import supervised_prediction_eval, imputation_eval, embedding_prediction_eval
12 | from imputer import impute
13 | 
14 | 
def run(opts):
    """Run the experiment selected by ``opts.problem``.

    Prints the options, seeds torch, stores the chosen device on
    ``opts.device``, loads the first graph of the selected dataset, resolves
    the model class and dispatches to the matching evaluation or imputation
    routine. For ``'Imputation'`` the imputed matrix is saved to
    ``<model><network>_imputed.npy`` in the current directory.

    Args:
        opts: Namespace produced by ``get_options``.

    Raises:
        ValueError: If ``opts.problem`` is not one of ``'Prediction'``,
            ``'Imputation'`` or ``'Imputation_eval'``.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Validate before any expensive loading, and with an explicit raise: an
    # ``assert`` is stripped under ``python -O``, after which an unsupported
    # problem would fall through every branch below and silently do nothing.
    if opts.problem not in ('Prediction', 'Imputation', 'Imputation_eval'):
        raise ValueError(
            'only support prediction or imputation of expression values '
            '(got {!r})'.format(opts.problem))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Choose the dataset to use
    data_class = load_data_class(opts.dataset)

    # Load data from load_path
    data = data_class(root=opts.datadir, network=opts.network)[0]

    # Preprocess node features: with --no_features each node is described by
    # a one-hot identifier instead of the loaded features.
    if opts.no_features:
        print('node ids used')
        data.x = torch.eye(data.num_nodes).to(opts.device)

    data = data.to(opts.device)
    model_class = load_model(opts)
    print(data)

    if opts.problem == 'Prediction':
        if opts.embedding:
            embedding_prediction_eval(model_class, data, opts)
        else:
            supervised_prediction_eval(model_class, data, opts)
    elif opts.problem == 'Imputation_eval':
        imputation_eval(model_class, data, opts)
    else:  # 'Imputation'
        imputed = impute(model_class, data, opts)
        np.save(opts.model + opts.network + '_imputed.npy', imputed.cpu().detach().numpy())
53 |     
54 | 
55 | 
# Script entry point: parse the command-line options and run the experiment.
if __name__ == "__main__":
    run(get_options())


--------------------------------------------------------------------------------
/train_test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | def train_epoch(model, data, optimizer, opts, exp_num=None, criterion=None):
 5 |     model.train()
 6 |     optimizer.zero_grad()
 7 | 
 8 |     if opts.embedding:
 9 |         z = model.encode(data.x, data.edge_index)
10 |         loss = model.recon_loss(z, data.edge_index)
11 |     elif opts.problem == 'Prediction':
12 |         output = model(data)
13 |         loss = criterion(output[data.train_mask], data.y[data.train_mask, exp_num].reshape([-1, 1]))
14 |     elif opts.problem == 'Imputation_eval':
15 |         output = model(data)
16 |         loss = criterion(output * (data.train_mask), data.y * (data.train_mask))
17 |     else:
18 |         output = model(data)
19 |         loss = criterion(output * (data.nonzeromask), data.y * (data.nonzeromask))
20 |     loss.backward()
21 |     optimizer.step()
22 |     return loss.item()
23 | 
24 | @torch.no_grad()
25 | def test(model, data, exp_num, criterion, opts):
26 |     model.eval()
27 | 
28 |     if opts.embedding:
29 |         lr_out, rf_out = model.predict(data.x, data.edge_index)
30 |         loss_lr = criterion(lr_out[data.test_mask], data.y[data.test_mask, exp_num].cpu().data.numpy())
31 |         loss_rf = criterion(rf_out[data.test_mask], data.y[data.test_mask, exp_num].cpu().data.numpy())
32 |         return loss_lr, loss_rf
33 |     elif opts.problem == 'Prediction':
34 |         output = model(data)
35 |         loss = criterion(output[data.test_mask], data.y[data.test_mask, exp_num].reshape([-1, 1]))
36 |     else:
37 |         output = model(data)
38 |         loss = criterion(output*data.test_mask, data.y*data.test_mask)
39 |     return loss.item()
40 | 


--------------------------------------------------------------------------------
/utils/functions.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | def load_data_class(name):
 4 |     from dataset import Ecoli_Exp,RnaSeq
 5 |     dataset = {
 6 |         'Ecoli': Ecoli_Exp,
 7 |         'RNA': RnaSeq
 8 |     }.get(name, None)
 9 |     assert dataset is not None, "Currently unsupported problem: {}!".format(name)
10 |     return dataset
11 | 
def load_model(opts):
    """Return the model class selected by the options.

    When ``opts.embedding`` is set the embedding model is returned regardless
    of ``opts.model``; otherwise ``opts.model`` is looked up among the
    end-to-end networks and the MAGIC / scikit-learn baselines.

    Args:
        opts: Options object; ``embedding`` and ``model`` are read.

    Returns:
        The model class (not an instance).

    Raises:
        ValueError: If ``opts.embedding`` is false and ``opts.model`` is not a
            known model name. An explicit raise is used instead of ``assert``
            so the check survives ``python -O``.
    """
    from models.End_to_End.nets import FAE_GraphConv, FAE_GCN, FAE_SAGEConv, FAE_FeatGraphConv\
        , AE_MLP
    from models.Embedding.model import Embedding_ExpGAE
    from magic import MAGIC
    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import RandomForestRegressor

    if opts.embedding:
        return Embedding_ExpGAE

    models = {
        'GraphConv': FAE_GraphConv,
        'GCN': FAE_GCN,
        'SAGEConv': FAE_SAGEConv,
        'FeatGraphConv': FAE_FeatGraphConv,
        'MLP': AE_MLP,
        'Magic': MAGIC,
        'LR': LinearRegression,
        'RF': RandomForestRegressor,
    }
    model = models.get(opts.model, None)
    if model is None:
        raise ValueError("Currently unsupported model: {}!".format(opts.model))
    return model
35 | 
36 | 
def index_to_mask(index, size):
    """Build a boolean tensor of shape ``size`` that is True at ``index``.

    Args:
        index: Positions to mark; anything accepted by tensor index
            assignment.
        size: Shape passed to ``torch.zeros``.

    Returns:
        A ``torch.bool`` tensor, False everywhere except at ``index``.
    """
    selected = torch.zeros(size, dtype=torch.bool)
    selected[index] = True
    return selected


--------------------------------------------------------------------------------