├── README.md ├── dataset ├── Ecoli.py └── rnaSeq.py ├── eval.py ├── imputer.py ├── models ├── Embedding │ └── model.py └── End_to_End │ ├── layers.py │ └── nets.py ├── options.py ├── requirements.txt ├── run.py ├── train_test.py └── utils └── functions.py /README.md: -------------------------------------------------------------------------------- 1 | # Graph Feature Autoencoder 2 | A repo for implementation of Graph features autoencoder for prediction of expression values paper. 3 | 4 | ## Requirement 5 | Please install the requirement.txt file before use. 6 | - torch==1.4.0 7 | - torch-cluster==1.5.4 8 | - torch-geometric==1.5.0 9 | - torch-scatter==2.0.4 10 | - torch-sparse==0.6.0 11 | - scikit-learn==0.21.3 12 | - numpy==1.16.4 13 | - networkx==2.3 14 | - magic-impute==2.0.3 15 | 16 | You can also download the required libraries using: 17 | ``` 18 | pip install -r requirements.txt 19 | ``` 20 | ## Dataset 21 | 22 | Dataset is available for both Ecoli and Mouse in: 23 | 24 | https://drive.google.com/drive/folders/1wQCwjwkkfmzydWW3DPgvj-JEOv05_shj?usp=sharing 25 | 26 | Download the datasets and put them in a proper location. 27 | 28 | 29 | ## Experiments 30 | 31 | You can run the experiments using the run.py file. 32 | 33 | Different options for different experiments are available in the options.py. 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /dataset/Ecoli.py: -------------------------------------------------------------------------------- 1 | from torch_geometric.data import InMemoryDataset,Data 2 | from torch_geometric.utils import dense_to_sparse, to_undirected, remove_self_loops 3 | import torch 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split 6 | import pandas as pd 7 | import os.path as osp 8 | 9 | 10 | class Ecoli_Exp(InMemoryDataset): 11 | def __init__(self, root, network='TF_net', imputation=False, Normalize=False, transform=None, pre_transform=None): 12 | self.network = network 13 | self.normalize = Normalize 14 | self.imputation = imputation 15 | super(InMemoryDataset, self).__init__(root, transform, pre_transform) 16 | self.data, self.slices = torch.load(self.processed_paths[0]) 17 | 18 | 19 | @property 20 | def raw_file_names(self): 21 | return [] 22 | 23 | 24 | @property 25 | def processed_file_names(self): 26 | return ['processed_{}_imputation_data.pt'.format(self.network) if self.imputation 27 | else 'processed_{}_data'.format(self.network)] 28 | 29 | def download(self): 30 | pass 31 | 32 | 33 | def read_TF_net(self, root): 34 | TF_gene = pd.read_csv(root + "/network_tf_gene.txt", skiprows=34, header=None, usecols=[0, 1, 2, 4], 35 | delimiter='\t') 36 | TF_gene = TF_gene.apply(lambda x: x.astype(str).str.lower()) 37 | TF_gene = TF_gene[((TF_gene[2] == '-') | (TF_gene[2] == '+'))] 38 | TF_gene = TF_gene.drop_duplicates(subset=[0, 1]) 39 | 40 | Ecoli = pd.read_table(root + '/avg_E_coli_v4_Build_6_exps466probes4297.tab') 41 | 42 | Ecoli['E_coli_v4_Build_6:genes'] = Ecoli['E_coli_v4_Build_6:genes'].str.split('_').str[0] 43 | Ecoli = Ecoli.apply(lambda x: x.astype(str).str.lower()) 44 | Ecoli = Ecoli.rename(columns={"E_coli_v4_Build_6:genes": "Genes"}) 45 | 46 | tf_regdb = TF_gene[0].isin(Ecoli.Genes) & TF_gene[1].isin(Ecoli.Genes) 47 | Filt_regDB = TF_gene[tf_regdb] 48 | Filt_DB_genes = np.union1d(Filt_regDB[0].unique(), Filt_regDB[1].unique()) 49 | Ecoli_Filt = Ecoli[Ecoli.Genes.isin(Filt_DB_genes)] 50 | 51 | Adj = np.zeros([len(Filt_DB_genes), len(Filt_DB_genes)]) 52 | features = np.zeros([len(Filt_DB_genes), Ecoli.shape[1] - 1]) 53 | for i in range(len(Filt_regDB)): 54 | row = np.where(Filt_DB_genes == Filt_regDB.iloc[i][0])[0][0] 55 | col = np.where(Filt_DB_genes == Filt_regDB.iloc[i][1])[0][0] 56 | if (Filt_regDB.iloc[i][2] == '+'): 57 | Adj[row][col] = 1 58 | Adj[col][row] = 1 59 | else: 60 | Adj[row][col] = 1 61 | Adj[row][col] = 1 62 | 63 | print(len(Adj.nonzero()[0])) 64 | for i in range(len(Filt_DB_genes)): 65 | features[i] = Ecoli[Ecoli.Genes == Filt_DB_genes[i]].iloc[:, 1:] 66 | 67 | return dense_to_sparse(torch.tensor(Adj))[0], torch.tensor(features, dtype=torch.float32) 68 | 69 | def read_PPI(self, root): 70 | BioGrid = pd.read_csv(root + '/BIOGRID-ORGANISM-Escherichia_coli_K12_W3110-3.5.180.tab2.txt', delimiter='\t') 71 | BioGrid['Official Symbol Interactor A'] = BioGrid['Official Symbol Interactor A'].str.lower() 72 | BioGrid['Official Symbol Interactor B'] = BioGrid['Official Symbol Interactor B'].str.lower() 73 | BioGrid = BioGrid.rename( 74 | columns={"Official Symbol Interactor A": "Gene_A", "Official Symbol Interactor B": "Gene_B"}) 75 | 76 | Ecoli = pd.read_table(root + '/avg_E_coli_v4_Build_6_exps466probes4297.tab') 77 | 78 | Ecoli['E_coli_v4_Build_6:genes'] = Ecoli['E_coli_v4_Build_6:genes'].str.split('_').str[0] 79 | Ecoli = Ecoli.apply(lambda x: x.astype(str).str.lower()) 80 | Ecoli = Ecoli.rename(columns={"E_coli_v4_Build_6:genes": "Genes"}) 81 | 82 | Filt_BioGrid_indeces = BioGrid.Gene_A.isin(Ecoli.Genes) & BioGrid.Gene_B.isin( 83 | Ecoli.Genes) # & BioGrid['Experimental System Name'] != 'Biochemical Activity' 84 | Filt_BioGrid = BioGrid[Filt_BioGrid_indeces] 85 | Filt_BioGrid_PPI = Filt_BioGrid[Filt_BioGrid['Experimental System Type'] == 'physical'] 86 | Filt_BioGrid_PPI_Genes = np.union1d(Filt_BioGrid_PPI.Gene_A.unique(), Filt_BioGrid_PPI.Gene_B.unique()) 87 | Ecoli_Filt_PPI = Ecoli[Ecoli.Genes.isin(Filt_BioGrid_PPI_Genes)] 88 | 89 | Adj = np.zeros([len(Filt_BioGrid_PPI_Genes), len(Filt_BioGrid_PPI_Genes)]) 90 | features = np.zeros([len(Filt_BioGrid_PPI_Genes), Ecoli_Filt_PPI.shape[1] - 1]) 91 | for i in range(len(Filt_BioGrid_PPI)): 92 | row = np.where(Filt_BioGrid_PPI_Genes == Filt_BioGrid_PPI.iloc[i][7])[0][0] 93 | col = np.where(Filt_BioGrid_PPI_Genes == Filt_BioGrid_PPI.iloc[i][8])[0][0] 94 | Adj[row][col] = 1 95 | Adj[col][row] = 1 96 | 97 | for i in range(len(Filt_BioGrid_PPI_Genes)): 98 | features[i] = Ecoli[Ecoli.Genes == Filt_BioGrid_PPI_Genes[i]].iloc[:, 1:] 99 | 100 | return dense_to_sparse(torch.tensor(Adj))[0], torch.tensor(features, dtype=torch.float32) 101 | 102 | def read_Genetic(self, root): 103 | BioGrid = pd.read_csv(root + '/BIOGRID-ORGANISM-Escherichia_coli_K12_W3110-3.5.180.tab2.txt', delimiter='\t') 104 | BioGrid['Official Symbol Interactor A'] = BioGrid['Official Symbol Interactor A'].str.lower() 105 | BioGrid['Official Symbol Interactor B'] = BioGrid['Official Symbol Interactor B'].str.lower() 106 | BioGrid = BioGrid.rename( 107 | columns={"Official Symbol Interactor A": "Gene_A", "Official Symbol Interactor B": "Gene_B"}) 108 | 109 | Ecoli = pd.read_table(root + '/avg_E_coli_v4_Build_6_exps466probes4297.tab') 110 | 111 | Ecoli['E_coli_v4_Build_6:genes'] = Ecoli['E_coli_v4_Build_6:genes'].str.split('_').str[0] 112 | Ecoli = Ecoli.apply(lambda x: x.astype(str).str.lower()) 113 | Ecoli = Ecoli.rename(columns={"E_coli_v4_Build_6:genes": "Genes"}) 114 | 115 | Filt_BioGrid_indeces = BioGrid.Gene_A.isin(Ecoli.Genes) & BioGrid.Gene_B.isin( 116 | Ecoli.Genes) # & BioGrid['Experimental System Name'] != 'Biochemical Activity' 117 | Filt_BioGrid = BioGrid[Filt_BioGrid_indeces] 118 | Filt_BioGrid_Genetic = Filt_BioGrid[Filt_BioGrid['Experimental System Type'] == 'genetic'] 119 | Filt_BioGrid_Genetic_Genes = np.union1d(Filt_BioGrid_Genetic.Gene_A.unique(), 120 | Filt_BioGrid_Genetic.Gene_B.unique()) 121 | Ecoli_Filt_Genetic = Ecoli[Ecoli.Genes.isin(Filt_BioGrid_Genetic_Genes)] 122 | 123 | Adj = np.zeros([len(Filt_BioGrid_Genetic_Genes), len(Filt_BioGrid_Genetic_Genes)]) 124 | features = np.zeros([len(Filt_BioGrid_Genetic_Genes), Ecoli_Filt_Genetic.shape[1] - 1]) 125 | for i in range(len(Filt_BioGrid_Genetic)): 126 | row = np.where(Filt_BioGrid_Genetic_Genes == Filt_BioGrid_Genetic.iloc[i][7])[0][0] 127 | col = np.where(Filt_BioGrid_Genetic_Genes == Filt_BioGrid_Genetic.iloc[i][8])[0][0] 128 | Adj[row][col] = 1 129 | Adj[col][row] = 1 130 | 131 | for i in range(len(Filt_BioGrid_Genetic_Genes)): 132 | features[i] = Ecoli[Ecoli.Genes == Filt_BioGrid_Genetic_Genes[i]].iloc[:, 1:] 133 | 134 | return dense_to_sparse(torch.tensor(Adj))[0], torch.tensor(features, dtype=torch.float32) 135 | 136 | 137 | def process(self): 138 | if self.network == 'TF_net': 139 | edge_index, x = self.read_TF_net(self.root) 140 | elif self.network == 'PPI': 141 | edge_index, x = self.read_PPI(self.root) 142 | elif self.network == 'Genetic': 143 | edge_index, x = self.read_Genetic(self.root) 144 | assert self.network in ['TF_net', 'PPI', 'Genetic'], 'currently supported graphs are Transcription factors,' \ 145 | ' Protein-Protein Interaction, and Genetics for E-Coli' 146 | y = x 147 | edge_index = to_undirected(edge_index) 148 | edge_index = remove_self_loops(edge_index)[0] 149 | data = Data(x=x, edge_index=edge_index, y=y) 150 | if self.imputation: 151 | data.indices = np.indices([x.size(0), x.size(1)]).reshape(2, -1) 152 | 153 | torch.save(self.collate([data]), self.processed_paths[0]) 154 | 155 | 156 | -------------------------------------------------------------------------------- /dataset/rnaSeq.py: -------------------------------------------------------------------------------- 1 | from torch_geometric.data import InMemoryDataset,Data 2 | from torch_geometric.utils import dense_to_sparse, to_undirected, remove_self_loops 3 | import torch 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split 6 | 7 | 8 | 9 | class RnaSeq(InMemoryDataset): 10 | 11 | def __init__(self, root, network='MousePPI', transform=None, pre_transform=None): 12 | self.network = network 13 | super(InMemoryDataset, self).__init__(root, transform, pre_transform) 14 | 15 | if self.network == 'MousePPI': 16 | self.data, self.slices = torch.load(self.processed_paths[0]) 17 | 18 | @property 19 | def raw_file_names(self): 20 | return [] 21 | 22 | @property 23 | def processed_file_names(self): 24 | return ['MousePPI_processed_rnaSeq_data.pt'] 25 | 26 | def download(self): 27 | pass 28 | 29 | def index_to_mask(self, indices, index, shape): 30 | mask = torch.zeros(shape) 31 | mask[indices[0, index], indices[1, index]] = 1 32 | return mask 33 | 34 | def process(self): 35 | edge_index = torch.tensor(np.array( 36 | np.load(self.root + '/' + self.network + '.npy', allow_pickle=True), dtype=np.int)) 37 | gene_names = None 38 | if self.network == 'MousePPI': 39 | x = torch.tensor(np.load(self.root + '/' + 'mouse_rnaSeq.npy', allow_pickle=True), dtype=torch.float) 40 | else: 41 | x = np.load(self.root + '/' + 'rnaSeq.npy', allow_pickle=True) 42 | gene_names = x[:, 0] 43 | x = torch.tensor(np.array(x[:, 1:], dtype=np.float), dtype=torch.float) 44 | print(x.size(0)) 45 | matrix_mask = torch.zeros([x.size(0), x.size(1)]) 46 | matrix_mask[x.nonzero(as_tuple=True)] = 1 47 | indices = np.array(x.data.numpy().nonzero()) 48 | ix_train, ix_test = train_test_split(np.arange(len(indices[0])), test_size=0.25, random_state=42) 49 | 50 | data = Data(x=x, edge_index=edge_index, y=x, nonzeromask=matrix_mask) 51 | data.gene_names = gene_names if gene_names is not None else None 52 | if self.network == 'MousePPI': 53 | torch.save(self.collate([data]), self.processed_paths[0]) -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import mean_squared_error as scimse 3 | from torch_geometric.utils import to_undirected 4 | import numpy as np 5 | import networkx as nx 6 | from sklearn.model_selection import KFold 7 | from train_test import train_epoch, test 8 | import copy 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.ensemble import RandomForestRegressor 11 | from utils.functions import index_to_mask 12 | from magic import MAGIC 13 | 14 | def supervised_prediction_eval(model_class, data, opts): 15 | 16 | loss_train = [] 17 | criterion = torch.nn.MSELoss() 18 | kf = KFold(n_splits=3, random_state=opts.seed) 19 | kf_feats = KFold(n_splits=3, random_state=opts.seed) 20 | 21 | mse = [] 22 | 23 | for k, train_test_indices in enumerate(kf.split(data.x)): 24 | print('Fold number: {:d}'.format(k)) 25 | y_pred = [] 26 | train_index, test_index = train_test_indices 27 | eval_data = copy.deepcopy(data) 28 | if opts.random_graph: 29 | print('Random Graph used') 30 | G_rand = nx.gnp_random_graph(data.x.shape[0],opts.random_graph_alpha) 31 | eval_data.edge_index = to_undirected(torch.tensor(np.array(G_rand.edges()).T).to(opts.device)) 32 | print(eval_data) 33 | train_feats_indeces, test_feats_indeces = next(kf_feats.split(np.arange(data.y.size(1)))) 34 | if not opts.no_features: 35 | eval_data.x = data.x[:, train_feats_indeces] 36 | eval_data.y = data.y[:, test_feats_indeces] 37 | eval_data.train_mask = index_to_mask(train_index, eval_data.x.size(0)) 38 | eval_data.test_mask = index_to_mask(test_index, eval_data.x.size(0)) 39 | for exp_num in range(eval_data.y.size(1)): 40 | if (model_class == LinearRegression) | (model_class == RandomForestRegressor): 41 | model = model_class() 42 | model.fit(eval_data.x[eval_data.train_mask], eval_data.y[eval_data.train_mask, exp_num]) 43 | pred = model.predict(eval_data.x[eval_data.test_mask]) 44 | test_loss = scimse(pred, 45 | eval_data.y[eval_data.test_mask, exp_num]) 46 | print('Exp: {:03d}, Loss: {:.5f}' 47 | .format(exp_num, test_loss)) 48 | y_pred.append(pred) 49 | else: 50 | torch.manual_seed(opts.seed) 51 | if torch.cuda.is_available(): 52 | torch.cuda.manual_seed_all(opts.seed) 53 | 54 | model = model_class(eval_data.num_features, opts).to(opts.device) 55 | optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate) 56 | best_loss = 1e9 57 | for epoch in range(1, opts.epochs + 1): 58 | loss_train = train_epoch(model, eval_data, optimizer, opts, exp_num, criterion) 59 | if loss_train < best_loss: 60 | best_loss = loss_train 61 | best_model = copy.deepcopy(model) 62 | loss_test = test(best_model, eval_data, exp_num, criterion, opts) 63 | print('Exp: {:03d}, Loss: {:.5f}, TestLoss: {:.5f}'. 64 | format(exp_num, loss_train, loss_test)) 65 | with torch.no_grad(): 66 | y_pred.append(best_model(eval_data)) 67 | 68 | for i in range(eval_data.y.size(1)): 69 | if (model_class == LinearRegression) | (model_class == RandomForestRegressor): 70 | mse.append(scimse(y_pred[i], 71 | eval_data.y[eval_data.test_mask, i])) 72 | else: 73 | mse.append(scimse(y_pred[i][eval_data.test_mask.cpu().numpy()].cpu().numpy(), 74 | eval_data.y[eval_data.test_mask, i].cpu().numpy().reshape([-1, 1]))) 75 | print('Average+-std Error for test expression values: {:.5f}+-{:.5f}'.format(np.mean(mse), np.std(mse))) 76 | return mse 77 | 78 | def embedding_prediction_eval(model_class, data, opts): 79 | loss_train = [] 80 | 81 | kf = KFold(n_splits=3, random_state=opts.seed, shuffle=True) 82 | kf_feats = KFold(n_splits=3, random_state=opts.seed, shuffle=True) 83 | 84 | mse_lr = [] 85 | mse_rf = [] 86 | 87 | for k, train_test_indices in enumerate(kf.split(data.x)): 88 | print('Fold number: {:d}'.format(k)) 89 | y_pred = [] 90 | train_index, test_index = train_test_indices 91 | eval_data = copy.deepcopy(data) 92 | train_feats_indeces, test_feats_indeces = next(kf_feats.split(np.arange(data.y.size(1)))) 93 | if not opts.no_features: 94 | eval_data.x = data.x[:, train_feats_indeces] 95 | eval_data.y = data.y[:, test_feats_indeces] 96 | eval_data.train_mask = index_to_mask(train_index, eval_data.x.size(0)) 97 | eval_data.test_mask = index_to_mask(test_index, eval_data.x.size(0)) 98 | model = model_class(eval_data.num_features, 32).to(opts.device) 99 | if torch.cuda.is_available(): 100 | torch.cuda.manual_seed_all(opts.seed) 101 | optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate) 102 | print('Training the auto encoder!') 103 | for epoch in range(1, opts.epochs + 1): 104 | if epoch % 10 == 0: 105 | print('Epoch number: {:03d}'.format(epoch)) 106 | train_epoch(model, eval_data, optimizer, opts) 107 | for exp_num in range(eval_data.y.size(1)): 108 | torch.manual_seed(opts.seed) 109 | z = model.encode(eval_data.x, eval_data.edge_index) 110 | model.fit_predictor(z[eval_data.train_mask].cpu().data.numpy(), 111 | eval_data.y[eval_data.train_mask, exp_num].cpu().data.numpy()) 112 | 113 | loss_test_lr, loss_test_rf = test(model, eval_data, exp_num, scimse, opts) 114 | model.eval() 115 | print('Exp: {:03d}, TestLoss_lr: {:.5f}, TestLoss_rf: {:.5f}'. 116 | format(exp_num, loss_test_lr, loss_test_rf)) 117 | with torch.no_grad(): 118 | y_pred.append(model.predict(eval_data.x, eval_data.edge_index)) 119 | for i in range(eval_data.y.size(1)): 120 | mse_lr.append(scimse(y_pred[i][0][eval_data.test_mask.cpu().numpy()], 121 | eval_data.y[eval_data.test_mask, i].cpu().numpy().reshape([-1, 1]))) 122 | mse_rf.append(scimse(y_pred[i][1][eval_data.test_mask.cpu().numpy()], 123 | eval_data.y[eval_data.test_mask, i].cpu().numpy().reshape([-1, 1]))) 124 | 125 | print('Average+-std Error for test expression values LR: {:.5f}+-{:.5f}'.format(np.mean(mse_lr), np.std(mse_lr))) 126 | print('Average+-std Error for test expression values RF: {:.5f}+-{:.5f}'.format(np.mean(mse_rf), np.std(mse_rf))) 127 | return mse_lr, mse_rf 128 | 129 | 130 | def imputation_eval(model_class, data, opts): 131 | if model_class == MAGIC: 132 | data.x = data.y = data.x.t() 133 | data.nonzeromask = data.nonzeromask.t() 134 | criterion = torch.nn.MSELoss() 135 | kf = KFold(n_splits=3, random_state=opts.seed, shuffle=True) 136 | loss_test = [] 137 | if opts.dataset == 'Ecoli': 138 | indices = np.indices([data.x.size(0), data.x.size(1)]).reshape(2, -1) 139 | else: 140 | indices = np.array(data.x.cpu().data.numpy().nonzero()) 141 | for k, train_test_indices in enumerate(kf.split(np.arange(len(indices[0])))): 142 | print('Fold number: {:d}'.format(k)) 143 | train_index, test_index = train_test_indices 144 | eval_data = copy.deepcopy(data) 145 | eval_data.train_mask = index_to_mask([indices[0, train_index], indices[1, train_index]], 146 | eval_data.x.size()).to(opts.device) 147 | eval_data.test_mask = index_to_mask([indices[0, test_index], indices[1, test_index]], 148 | eval_data.x.size()).to(opts.device) 149 | eval_data.x = eval_data.x * eval_data.train_mask 150 | if model_class == MAGIC: 151 | pred = model_class().fit_transform((eval_data.x*eval_data.train_mask).cpu().data.numpy()) 152 | loss_test.append(scimse(pred*eval_data.test_mask.cpu().data.numpy(), 153 | (eval_data.y*eval_data.test_mask).cpu().data.numpy())) 154 | else: 155 | model = model_class(eval_data.num_features, opts).to(opts.device) 156 | optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate) 157 | best_loss = 1e9 158 | for epoch in range(1, opts.epochs + 1): 159 | loss_train = train_epoch(model, eval_data, optimizer, opts, criterion=criterion) 160 | if loss_train < best_loss: 161 | best_loss = loss_train 162 | best_model = copy.deepcopy(model) 163 | if epoch % 10 == 0: 164 | print('Epoch number: {:03d}, Train_loss: {:.5f}'.format(epoch, loss_train)) 165 | loss_test.append(test(best_model, eval_data, None, criterion, opts)) 166 | print('Loss: {:.5f}, TestLoss: {:.5f}'.format(loss_train, loss_test[k])) 167 | print('Average+-std Error for test RNA values: {:.5f}+-{:.5f}'.format(np.mean(loss_test), np.std(loss_test))) 168 | return np.mean(loss_test) 169 | -------------------------------------------------------------------------------- /imputer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from train_test import train_epoch, test 3 | from models.End_to_End.nets import AE_MLP 4 | def impute(model_class, data, opts): 5 | criterion = torch.nn.MSELoss() 6 | if model_class == AE_MLP: 7 | data.x = data.y = data.x.t() 8 | data.nonzeromask = data.nonzeromask.t() 9 | model = model_class(data.num_features, opts).to(opts.device) 10 | optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate) 11 | for epoch in range(1, opts.epochs + 1): 12 | loss_train = train_epoch(model, data, optimizer, opts, criterion=criterion) 13 | if epoch%100 == 0: 14 | print('Exp: {:03d}, Loss: {:.5f}'. 15 | format(epoch, loss_train)) 16 | return model(data) 17 | -------------------------------------------------------------------------------- /models/Embedding/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch_geometric.nn import GCNConv, GAE 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.ensemble import RandomForestRegressor 6 | 7 | 8 | class Encoder(torch.nn.Module): 9 | def __init__(self, in_channels, out_channels): 10 | super(Encoder, self).__init__() 11 | self.conv1 = GCNConv(in_channels, 64, cached=True) 12 | self.conv2 = GCNConv(64, out_channels, cached=True) 13 | 14 | def forward(self, x, edge_index): 15 | x = F.relu(self.conv1(x, edge_index)) 16 | return self.conv2(x, edge_index) 17 | 18 | 19 | class Embedding_ExpGAE(GAE): 20 | def __init__(self, in_channels, out_channels): 21 | encoder = Encoder(in_channels, out_channels) 22 | super(Embedding_ExpGAE, self).__init__(encoder=encoder) 23 | self.predictor_lr = LinearRegression() 24 | self.predictor_rf = RandomForestRegressor(n_estimators=20, max_depth=2) 25 | def fit_predictor(self, z, y): 26 | self.predictor_lr.fit(z, y) 27 | self.predictor_rf.fit(z, y) 28 | 29 | def predict(self, x, edge_index): 30 | z = self.encode(x, edge_index) 31 | return self.predictor_lr.predict(z.cpu().data.numpy()), self.predictor_rf.predict(z.cpu().data.numpy()) 32 | -------------------------------------------------------------------------------- /models/End_to_End/layers.py: -------------------------------------------------------------------------------- 1 | from torch_geometric.nn.conv import MessagePassing 2 | from torch_geometric.utils import add_remaining_self_loops 3 | from torch.nn import Linear 4 | import torch 5 | 6 | 7 | class FeatGraphConv(MessagePassing): 8 | def __init__(self, in_channels,hidden , out_channels, aggr='mean', bias = True, 9 | **kwargs): 10 | super(FeatGraphConv, self).__init__(aggr=aggr, **kwargs) 11 | self.lin1 = Linear(2*hidden, out_channels, bias=bias) 12 | self.lin2 = Linear(in_channels, hidden, bias=bias) 13 | def forward(self, x, edge_index, edge_weight=None, size=None): 14 | edge_index,edge_weight = add_remaining_self_loops(edge_index=edge_index,edge_weight = edge_weight) 15 | h = self.lin2(x) 16 | return self.propagate(edge_index, size=size, x=x, h=h, 17 | edge_weight=edge_weight) 18 | 19 | def message(self, h_j, edge_weight): 20 | return h_j if edge_weight is None else edge_weight.view(-1, 1) * h_j 21 | 22 | def update(self, aggr_out, h): 23 | return self.lin1(torch.cat((h, aggr_out), 1)) -------------------------------------------------------------------------------- /models/End_to_End/nets.py: -------------------------------------------------------------------------------- 1 | 2 | from torch_geometric.nn import SAGEConv, GraphConv, GCNConv 3 | import torch.nn as nn 4 | import torch 5 | from torch.nn import Linear as Lin 6 | 7 | from models.End_to_End.layers import FeatGraphConv 8 | 9 | 10 | class FAE_FeatGraphConv(nn.Module): 11 | def __init__(self, in_channels, opts): 12 | super(FAE_FeatGraphConv, self).__init__() 13 | self.opts = opts 14 | if self.opts.problem == 'Prediction': 15 | self.conv1 = FeatGraphConv(in_channels, 64, 64, aggr='mean') 16 | self.conv2 = FeatGraphConv(64, 32, 32, aggr='mean') 17 | self.lin = Lin(32, 1) 18 | else: 19 | self.conv1 = FeatGraphConv(in_channels, 64, 32, aggr='mean') 20 | self.lin = Lin(32, in_channels) 21 | def forward(self, data): 22 | if self.opts.problem == 'Prediction': 23 | x, edge_index = data.x, data.edge_index 24 | x = torch.relu(self.conv1(x, edge_index)) 25 | x = torch.relu(self.conv2(x, edge_index)) 26 | return self.lin(x) 27 | else: 28 | x, edge_index = data.x, data.edge_index 29 | x = torch.relu(self.conv1(x, edge_index)) 30 | x = self.lin(x) 31 | return x 32 | 33 | 34 | class FAE_SAGEConv(nn.Module): 35 | 36 | def __init__(self , in_channels, opts): 37 | super(FAE_SAGEConv, self).__init__() 38 | self.opts = opts 39 | if self.opts.problem == 'Prediction': 40 | self.conv1 = SAGEConv(in_channels, 64) 41 | self.conv2 = SAGEConv(64, 32) 42 | self.lin = Lin(32, 1) 43 | else: 44 | self.conv1 = SAGEConv(in_channels, 64) 45 | self.lin = Lin(64, in_channels) 46 | def forward(self, data): 47 | if self.opts.problem == 'Prediction': 48 | x, edge_index = data.x, data.edge_index 49 | x = torch.relu(self.conv1(x, edge_index)) 50 | x = torch.relu(self.conv2(x, edge_index)) 51 | x = self.lin(x) 52 | return x 53 | else: 54 | x, edge_index = data.x, data.edge_index 55 | x = torch.relu(self.conv1(x, edge_index)) 56 | x = self.lin(x) 57 | return x 58 | 59 | 60 | class FAE_GCN(nn.Module): 61 | def __init__(self, in_channels, opts): 62 | super(FAE_GCN, self).__init__() 63 | self.opts = opts 64 | if self.opts.problem == 'Prediction': 65 | self.conv1 = GCNConv(in_channels, 64) 66 | self.conv2 = GCNConv(64, 32) 67 | self.lin = Lin(32, 1) 68 | else: 69 | self.conv1 = GCNConv(in_channels, 64) 70 | # self.conv2 = GCNConv(64, 32) 71 | self.lin = Lin(64, in_channels) 72 | 73 | def forward(self, data): 74 | if self.opts.problem == 'Prediction': 75 | x, edge_index = data.x, data.edge_index 76 | x = torch.relu(self.conv1(x, edge_index)) 77 | x = torch.relu(self.conv2(x, edge_index)) 78 | x = self.lin(x) 79 | return x 80 | else: 81 | x, edge_index = data.x, data.edge_index 82 | x = torch.relu(self.conv1(x, edge_index)) 83 | # x = torch.relu(self.conv2(x, edge_index)) 84 | x = self.lin(x) 85 | return x 86 | 87 | 88 | class FAE_GraphConv(nn.Module): 89 | def __init__(self, in_channels, opts): 90 | super(FAE_GraphConv, self).__init__() 91 | self.opts = opts 92 | if self.opts.problem == 'Prediction': 93 | self.conv1 = GraphConv(in_channels, 64) 94 | self.conv2 = GraphConv(64, 32, aggr='sum') 95 | self.lin = Lin(32, 1) 96 | else: 97 | self.conv1 = GraphConv(in_channels, 64) 98 | # self.conv2 = GraphConv(64, 32,aggr='mean') 99 | self.lin = Lin(64, in_channels) 100 | 101 | def forward(self, data): 102 | if self.opts.problem == 'Prediction': 103 | x, edge_index = data.x, data.edge_index 104 | x = torch.relu(self.conv1(x, edge_index)) 105 | x = torch.relu(self.conv2(x, edge_index)) 106 | x = self.lin(x) 107 | return x 108 | else: 109 | x, edge_index = data.x, data.edge_index 110 | x = torch.relu(self.conv1(x, edge_index)) 111 | # x = torch.relu(self.conv2(x, edge_index)) 112 | x = self.lin(x) 113 | return x 114 | 115 | 116 | class AE_MLP(nn.Module): 117 | def __init__(self, in_channels, opts): 118 | super(AE_MLP, self).__init__() 119 | self.opts = opts 120 | if self.opts.problem == 'Prediction': 121 | self.lin1 = Lin(in_channels, 64) 122 | self.lin2 = Lin(64, 32) 123 | self.lin3 = Lin(32, 1) 124 | else: 125 | self.lin1 = Lin(in_channels, 64) 126 | self.lin2 = Lin(64, in_channels) 127 | 128 | def forward(self, data): 129 | if self.opts.problem == 'Prediction': 130 | x = data.x 131 | x = torch.relu(self.lin1(x)) 132 | x = torch.relu(self.lin2(x)) 133 | return self.lin3(x) 134 | else: 135 | x = data.x 136 | x = torch.relu(self.lin1(x)) 137 | x = self.lin2(x) 138 | return x 139 | -------------------------------------------------------------------------------- /options.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | import torch 5 | 6 | def get_options(args=None): 7 | parser = argparse.ArgumentParser( 8 | description="Graph Feature Auto-Encoder for Prediction of Gene Expression Values") 9 | 10 | # Data 11 | parser.add_argument('--problem', default='Imputation_eval', help="Want to predict or Impute the dataset " 12 | "(Values in ['Prediction', 'Imputation', 'Imputation_eval'])") 13 | parser.add_argument('--network', type=str, default='MousePPI', help="(Values in ['MousePPI'] for RNA and " 14 | "['TF_net', 'PPI', 'Genetic'] for Ecoli)") 15 | parser.add_argument('--dataset', type=str, default='RNA', help="(Values in ['Ecoli', 'RNA'])") 16 | parser.add_argument('--datadir', type=str, default='../data/Expression_Values/SingleCellRNA/Mouse', 17 | help="Path to data directory") 18 | 19 | # Model 20 | 21 | parser.add_argument('--model', type=str, default='FeatGraphConv', help="Values in ['GraphConv', 'GCN', 'SAGEConv'," 22 | "'FeatGraphConv','MLP' ," 23 | " 'Magic', 'LR', 'RF']") 24 | parser.add_argument('--embedding', action='store_true', help='Whether to make predictions on the graph embedding ' 25 | '(only in prediction problem)') 26 | parser.add_argument('--random_graph', action='store_true', help='Whether to make predictions based on random graph structure') 27 | parser.add_argument('--random_graph_alpha', type=float, default=.021, help="alpha hyper=parameter for generating" 28 | "random graph in Prediction problem") 29 | parser.add_argument('--hidden', type=int, default=64) 30 | parser.add_argument('--out_channels', type=int, default=32) 31 | 32 | # Training 33 | 34 | parser.add_argument('--learning_rate', type=float, default=0.001) 35 | parser.add_argument('--epochs', type=float, default=20000) 36 | parser.add_argument('--seed', type=int, default=12345, help='Random seed to use') 37 | parser.add_argument('--no_cuda', action='store_true', help='Disable CUDA') 38 | parser.add_argument('--no_features', action='store_true', 39 | help='Whether to use Expression values as node features or not') 40 | 41 | opts = parser.parse_args(args) 42 | 43 | opts.use_cuda = torch.cuda.is_available() and not opts.no_cuda 44 | 45 | return opts -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.4.0 2 | torch-cluster==1.5.4 3 | torch-geometric==1.5.0 4 | torch-scatter==2.0.4 5 | torch-sparse==0.6.0 6 | scikit-learn==0.21.3 7 | numpy==1.16.4 8 | networkx==2.3 9 | magic-impute==2.0.3 10 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | 2 | import pprint as pp 3 | 4 | 5 | import torch 6 | import numpy as np 7 | 8 | from options import get_options 9 | 10 | from utils.functions import load_data_class, load_model 11 | from eval import supervised_prediction_eval, imputation_eval, embedding_prediction_eval 12 | from imputer import impute 13 | 14 | 15 | def run(opts): 16 | 17 | # Pretty print the run args 18 | pp.pprint(vars(opts)) 19 | 20 | # Set the random seed 21 | torch.manual_seed(opts.seed) 22 | 23 | # Set the device 24 | opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu") 25 | 26 | # Choose the dataset to use 27 | data_class = load_data_class(opts.dataset) 28 | 29 | # Load data from load_path 30 | data = data_class(root=opts.datadir, network=opts.network)[0] 31 | 32 | 33 | # Preprocess node features 34 | if opts.no_features: 35 | print('node ids used') 36 | data.x = torch.eye(data.num_nodes).to(opts.device) 37 | 38 | data = data.to(opts.device) 39 | model_class = load_model(opts) 40 | assert opts.problem in ['Prediction', 'Imputation', 'Imputation_eval'], 'only support prediction or imputation of expression values' 41 | print(data) 42 | if opts.problem == 'Prediction': 43 | if not opts.embedding: 44 | supervised_prediction_eval(model_class, data, opts) 45 | else: 46 | embedding_prediction_eval(model_class, data, opts) 47 | 48 | elif opts.problem == 'Imputation_eval': 49 | imputation_eval(model_class, data, opts) 50 | elif opts.problem == 'Imputation': 51 | imputed = impute(model_class, data, opts) 52 | np.save(opts.model + opts.network + '_imputed.npy', imputed.cpu().detach().numpy()) 53 | 54 | 55 | 56 | if __name__ == "__main__": 57 | run(get_options()) -------------------------------------------------------------------------------- /train_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def train_epoch(model, data, optimizer, opts, exp_num=None, criterion=None): 5 | model.train() 6 | optimizer.zero_grad() 7 | 8 | if opts.embedding: 9 | z = model.encode(data.x, data.edge_index) 10 | loss = model.recon_loss(z, data.edge_index) 11 | elif opts.problem == 'Prediction': 12 | output = model(data) 13 | loss = criterion(output[data.train_mask], data.y[data.train_mask, exp_num].reshape([-1, 1])) 14 | elif opts.problem == 'Imputation_eval': 15 | output = model(data) 16 | loss = criterion(output * (data.train_mask), data.y * (data.train_mask)) 17 | else: 18 | output = model(data) 19 | loss = criterion(output * (data.nonzeromask), data.y * (data.nonzeromask)) 20 | loss.backward() 21 | optimizer.step() 22 | return loss.item() 23 | 24 | @torch.no_grad() 25 | def test(model, data, exp_num, criterion, opts): 26 | model.eval() 27 | 28 | if opts.embedding: 29 | lr_out, rf_out = model.predict(data.x, data.edge_index) 30 | loss_lr = criterion(lr_out[data.test_mask], data.y[data.test_mask, exp_num].cpu().data.numpy()) 31 | loss_rf = criterion(rf_out[data.test_mask], data.y[data.test_mask, exp_num].cpu().data.numpy()) 32 | return loss_lr, loss_rf 33 | elif opts.problem == 'Prediction': 34 | output = model(data) 35 | loss = criterion(output[data.test_mask], data.y[data.test_mask, exp_num].reshape([-1, 1])) 36 | else: 37 | output = model(data) 38 | loss = criterion(output*data.test_mask, data.y*data.test_mask) 39 | return loss.item() 40 | -------------------------------------------------------------------------------- /utils/functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def load_data_class(name): 4 | from dataset import Ecoli_Exp,RnaSeq 5 | dataset = { 6 | 'Ecoli': Ecoli_Exp, 7 | 'RNA': RnaSeq 8 | }.get(name, None) 9 | assert dataset is not None, "Currently unsupported problem: {}!".format(name) 10 | return dataset 11 | 12 | def load_model(opts): 13 | from models.End_to_End.nets import FAE_GraphConv, FAE_GCN, FAE_SAGEConv, FAE_FeatGraphConv\ 14 | , AE_MLP 15 | from models.Embedding.model import Embedding_ExpGAE 16 | from magic import MAGIC 17 | from sklearn.linear_model import LinearRegression 18 | from sklearn.ensemble import RandomForestRegressor 19 | if not opts.embedding: 20 | model = {'GraphConv': FAE_GraphConv, 21 | 'GCN': FAE_GCN, 22 | 'SAGEConv': FAE_SAGEConv , 23 | 'FeatGraphConv': FAE_FeatGraphConv, 24 | 'MLP': AE_MLP, 25 | 'Magic': MAGIC, 26 | 'LR': LinearRegression, 27 | 'RF': RandomForestRegressor 28 | }.get(opts.model, None) 29 | else: 30 | model = Embedding_ExpGAE 31 | 32 | 33 | assert model is not None, "Currently unsupported model: {}!".format(opts.model) 34 | return model 35 | 36 | 37 | def index_to_mask(index, size): 38 | 39 | mask = torch.zeros(size, dtype=torch.bool) 40 | mask[index] = 1 41 | return mask --------------------------------------------------------------------------------