├── LICENSE ├── README.md ├── environment.yml ├── graphics ├── overview.PNG └── overview.png └── src ├── Ecoli_Promoter_classification ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── evaluate.py ├── evaluate.sh ├── extract_motif.py ├── readme.md ├── run.sh ├── train.py └── v9d3.csv ├── Enchancer_classification ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── bert_enhancer_dataset.csv ├── evaluate.py ├── evaluate.sh ├── job.sh ├── job_eval.sh ├── run.sh ├── test.py ├── test.sh └── train.py ├── Eukaryotic_Promoters_Classification ├── README.md ├── human_non_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── human_non_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── human_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── human_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_non_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_non_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── run_human.sh └── run_mouse.sh ├── Non_Coding_Variant_Effects ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── compute_median_aucs.py ├── compute_val_aucs.py ├── preprocess_data.py ├── readme.md ├── restart.py ├── restart.sh ├── run.sh ├── test.py ├── test.sh └── train.py └── Viral_identification ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── evaluate_test.py ├── readme.md ├── run.sh ├── test.sh └── train.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Shujun-He 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished 
to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nucleic Transformer: Classifying DNA sequences with Self-attention and Convolutions 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5641875.svg)](https://doi.org/10.5281/zenodo.5641875) 4 | 5 | 6 | Source code to reproduce results in the paper "Nucleic Transformer: Classifying DNA sequences with Self-attention and Convolutions". 7 | 8 |

9 | 10 |

11 | 12 | 13 | ## How to use the models 14 | 15 | I also made a web app to use the models. Check it out at https://github.com/Shujun-He/Nucleic-Transformer-WebApp 16 | 17 | 18 | ## Requirements 19 | I included a file (environment.yml) to recreate the exact environment I used. Since I also use this environment for computer vision tasks, it includes some other packages as well. This should take around 10 minutes. After installing Anaconda: 20 | 21 | 22 | ``` 23 | conda env create -f environment.yml 24 | ``` 25 | 26 | Then, to activate the environment: 27 | 28 | ``` 29 | conda activate torch 30 | ``` 31 | 32 | Additionally, you will need Nvidia Apex: https://github.com/NVIDIA/apex 33 | 34 | ``` 35 | git clone https://github.com/NVIDIA/apex 36 | cd apex 37 | pip install . 38 | ``` 39 | 40 | 41 | 42 | ## Repo file structure 43 | 44 | The src folder includes all the code needed to reproduce results in the paper and the OpenVaccine competition. Additional instructions are in each folder. 45 | 46 | ```src/Ecoli_Promoter_classification``` includes all the code and files needed to reproduce results for E. coli promoter classification 47 | 48 | ```src/Eukaryotic_Promoters_Classification``` includes all the code and files needed to reproduce results for eukaryotic promoter classification 49 | 50 | 51 | 52 | ```src/Non_Coding_Variant_Effects``` includes all the code needed to reproduce results for the DeepSEA dataset 53 | 54 | ```src/Viral_identification``` includes all the code needed to reproduce results for the ViraMiner dataset 55 | 56 | ```src/Enchancer_classification``` includes all the code needed to reproduce results for the enhancer dataset 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /graphics/overview.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shujun-He/Nucleic-Transformer/2c020793335417111442684770009bbdf13a885c/graphics/overview.PNG -------------------------------------------------------------------------------- /graphics/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shujun-He/Nucleic-Transformer/2c020793335417111442684770009bbdf13a885c/graphics/overview.png -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class PromoterDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.data=[] 30 | for seq in sequences: 31 | self.data.append(nucleatide2int(seq)) 32 | 33 | self.data=np.asarray(self.data,dtype='int') 34 | self.labels=labels 35 | 36 | print(self.data.shape) 37 | print(self.labels.shape) 38 | 39 | def __len__(self): 40 | return len(self.labels) 41 | 42 | def __getitem__(self,idx): 43 | return
{'data':self.data[idx], 'labels':self.labels[idx]} 44 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import torch 13 | import random 14 | from sklearn.model_selection import StratifiedKFold 15 | 16 | 17 | def iter_split(data,labels,fold,nfolds=5,seed=2020): 18 | splits = StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True) 19 | splits = list(splits.split(data,labels)) 20 | # splits = np.zeros(len(data)).astype(np.int) 21 | # for i in range(nfolds): splits[splits[i][1]] = i 22 | # indices=np.arange(len(data)) 23 | train_indices=splits[fold][0] 24 | val_indices=splits[fold][1] 25 | return train_indices, val_indices 26 | 27 | def seed_everything(seed=42): 28 | random.seed(seed) 29 | os.environ['PYTHONHASHSEED'] = str(seed) 30 | np.random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 33 | torch.backends.cudnn.deterministic = True 34 | seed_everything(seed=42) 35 | 36 | def get_best_weights_from_fold(fold,top=1): 37 | csv_file='log_fold{}.csv'.format(fold) 38 | 39 | history=pd.read_csv(csv_file) 40 | scores=np.asarray(history.val_acc) 41 | top_epochs=scores.argsort()[-3:][::-1] 42 | print(scores[top_epochs]) 43 | os.system('mkdir best_weights') 44 | 45 | for i in range(top): 46 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 47 | print(weights_path) 48 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 49 | os.system('rm -r checkpoints_fold{}'.format(fold)) 50 | 51 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 52 | gold = gold.contiguous().view(-1) 53 | one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1) 54 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 55 | log_prb = F.log_softmax(pred, dim=1) 56 | loss = -(one_hot * log_prb) 57 | #loss=loss.sum(1).mean() 58 | return loss 59 | 60 | def mutate_dna_sequence(sequence,nmute=15): 61 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 62 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 63 | sequence[:,to_mutate]=mutation 64 | return sequence 65 | 66 | def get_MLM_mask(sequence,nmask=12): 67 | mask=np.zeros(sequence.shape,dtype='bool') 68 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 69 | mask[:,to_mask]=True 70 | return mask 71 | 72 | def get_complementary_sequence(sequence): 73 | complementary_sequence=sequence.copy() 74 | complementary_sequence[sequence==0]=1 75 | complementary_sequence[sequence==1]=0 76 | complementary_sequence[sequence==2]=3 77 | complementary_sequence[sequence==3]=2 78 | complementary_sequence=complementary_sequence[:,::-1] 79 | return complementary_sequence 80 | 81 | def update_lr(optimizer, lr): 82 | for param_group in optimizer.param_groups: 83 | param_group['lr'] = lr 84 | 85 | def save_weights(model,optimizer,epoch,folder): 86 | if os.path.isdir(folder)==False: 87 | os.makedirs(folder,exist_ok=True) 88 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 89 | 90 | def get_lr(optimizer): 91 | for param_group in optimizer.param_groups: 92 | lr=param_group['lr'] 93 | 
return lr 94 | 95 | def validate(model,device,dataset,batch_size=64): 96 | batches=len(dataset) 97 | model.train(False) 98 | total=0 99 | ground_truths=[] 100 | predictions=[] 101 | loss=0 102 | criterion=nn.CrossEntropyLoss() 103 | # dataset.switch_mode(training=False) 104 | # dataset.update_batchsize(batch_size) 105 | with torch.no_grad(): 106 | for data in tqdm(dataset): 107 | #data=dataset[i] 108 | X=data['data'].to(device).long() 109 | Y=data['labels'].to(device).long() 110 | output= model(X,None) 111 | del X 112 | loss+=criterion(output,Y) 113 | classification_predictions = torch.argmax(output,dim=1).squeeze() 114 | for pred in classification_predictions: 115 | predictions.append(pred.cpu().numpy()) 116 | for truth in Y: 117 | ground_truths.append(truth.cpu().numpy()) 118 | del output 119 | ground_truths=np.asarray(ground_truths) 120 | torch.cuda.empty_cache() 121 | val_loss=(loss/batches).cpu() 122 | predictions=np.asarray(predictions) 123 | binary_predictions=predictions.copy() 124 | binary_predictions[binary_predictions==2]=1 125 | binary_ground_truths=ground_truths.copy() 126 | binary_ground_truths[binary_ground_truths==2]=1 127 | #print(predictions) 128 | #print(ground_truths) 129 | #score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 130 | val_acc=Metrics.accuracy(predictions,ground_truths) 131 | val_sens=Metrics.sensitivity(predictions,ground_truths) 132 | val_spec=Metrics.specificity(predictions,ground_truths) 133 | binary_acc=np.sum(binary_predictions==binary_ground_truths)/len(binary_ground_truths) 134 | print('Accuracy: {}, Binary Accuracy: {} Val Loss: {}'.format(val_acc,binary_acc,val_loss)) 135 | return val_loss,val_acc,val_sens,val_spec 136 | 137 | 138 | def predict(model,device,dataset,batch_size=64): 139 | batches=len(dataset) 140 | model.train(False) 141 | total=0 142 | ground_truths=[] 143 | predictions=[] 144 | attention_weights=[] 145 | sequences=[] 146 | loss=0 147 | criterion=nn.CrossEntropyLoss() 148 | with torch.no_grad(): 149 | for data in tqdm(dataset): 150 | #data=dataset[i] 151 | X=data['data'].to(device,).long() 152 | Y=data['labels'].to(device,dtype=torch.int64) 153 | 154 | output,aw= model(X,None) 155 | #del X 156 | loss+=criterion(output,Y) 157 | classification_predictions = torch.argmax(output,dim=1).squeeze() 158 | for pred in output: 159 | predictions.append(pred.cpu().numpy()) 160 | for weight in aw: 161 | attention_weights.append(weight.cpu().numpy()) 162 | 163 | for t in Y: 164 | ground_truths.append(t.cpu().numpy()) 165 | for seq in X: 166 | sequences.append(seq.cpu().numpy()) 167 | del output 168 | torch.cuda.empty_cache() 169 | val_loss=(loss/batches).cpu() 170 | predictions=np.asarray(predictions) 171 | attention_weights=np.asarray(attention_weights) 172 | binary_predictions=predictions.copy() 173 | binary_predictions[binary_predictions==2]=1 174 | binary_ground_truths=ground_truths.copy() 175 | binary_ground_truths[binary_ground_truths==2]=1 176 | return predictions,attention_weights,np.asarray(sequences),np.asarray(ground_truths) 177 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 
15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | 
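A minimal usage sketch (not part of the repo) showing how the Metrics helpers above are called on 0/1 label arrays. The toy arrays are made up purely for illustration, and note that the MCC reported by evaluate.py below actually comes from sklearn.metrics.matthews_corrcoef rather than the local MCC helper:

```python
import numpy as np
from sklearn.metrics import matthews_corrcoef

import Metrics  # the module above; run from src/Ecoli_Promoter_classification

# made-up toy labels for illustration: 0 = negative, 1 = positive
preds  = np.array([1, 0, 1, 1, 0, 0])
truths = np.array([1, 0, 0, 1, 0, 1])

print("accuracy   :", Metrics.accuracy(preds, truths))
print("sensitivity:", Metrics.sensitivity(preds, truths))
print("specificity:", Metrics.specificity(preds, truths))
# evaluate.py reports MCC via scikit-learn:
print("MCC (sklearn):", matthews_corrcoef(truths, preds))
```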
-------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv('v9d3.csv') 
59 | 60 | sequences=np.asarray(df.seqs) 61 | labels=np.asarray(df.labels) 62 | 63 | train_indices, val_indices=iter_split(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[val_indices],labels[val_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 76 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 77 | dropout=opts.dropout,return_aw=True).to(device) 78 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 79 | 80 | # Initialization 81 | # opt_level = 'O1' 82 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 83 | 84 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 85 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 86 | 87 | #evaluation loop 88 | #ground_truths=dataset.labels[dataset.val_indices] 89 | ensemble_predictions=[] 90 | acc=[] 91 | 92 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 93 | print(weights_path) 94 | checkpoint=torch.load(weights_path) 95 | model.load_state_dict(checkpoint) 96 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 97 | # #validate(model,device,dataset,batch_size=batch_size*2) 98 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 99 | ensemble_predictions.append(predictions) 100 | ensemble_predictions=np.asarray(ensemble_predictions) 101 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 102 | model.cpu() 103 | del model 104 | #del optimizer 105 | torch.cuda.empty_cache() 106 | return ensemble_predictions, ground_truths, attention_weights, sequences 107 | 108 | opts=get_args() 109 | 110 | 111 | predictions=[] 112 | ground_truths=[] 113 | attention_weights=[] 114 | sequences=[] 115 | for i in range(5): 116 | ngram=[7] 117 | p,t,at,seq= evaluate_fold(i) 118 | predictions.append(p) 119 | ground_truths.append(t) 120 | print(at.shape) 121 | attention_weights.append(at) 122 | sequences.append(seq) 123 | 124 | 125 | probs=np.concatenate(predictions) 126 | ground_truths=np.concatenate(ground_truths) 127 | predictions=np.argmax(probs,axis=1) 128 | attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 129 | sequences=np.asarray(sequences).reshape(-1,81) 130 | acc=Metrics.accuracy(predictions,ground_truths) 131 | sens=Metrics.sensitivity(predictions,ground_truths) 132 | spec=Metrics.specificity(predictions,ground_truths) 133 | MCC=matthews_corrcoef(ground_truths,predictions) 134 | 135 | prediction_dict={'predictions':np.squeeze(predictions), 136 | 'ground_truths':np.squeeze(ground_truths), 137 | 'attention_weights':np.squeeze(attention_weights), 138 | 'sequences':np.squeeze(sequences.reshape(-1,81)) 139 | } 140 | 141 | with open("prediction_dict.p","wb+") as f: 142 | pickle.dump(prediction_dict,f) 143 | 144 | 145 | with open("cv.txt",'w+') as f: 146 | f.write(f"ACC: {acc}\n") 147 | f.write(f"sensitivity: {sens}\n") 148 | f.write(f"spec: {spec}\n") 149 | f.write(f"MCC: {MCC}\n") 150 | 
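evaluate.py above writes the per-fold outputs to prediction_dict.p (later consumed by extract_motif.py) and the cross-validation summary to cv.txt. A minimal sketch, not part of the repo, for inspecting the pickle before running motif extraction; the length-81 check simply mirrors the reshape(-1,81) used above:

```python
import pickle
import numpy as np

# load the dictionary written by evaluate.py
with open("prediction_dict.p", "rb") as f:
    prediction_dict = pickle.load(f)

# keys written above: predictions, ground_truths, attention_weights, sequences
for key, value in prediction_dict.items():
    arr = np.asarray(value)
    print(key, arr.shape, arr.dtype)

# the integer-encoded E. coli promoter sequences are stored as rows of length 81
assert prediction_dict["sequences"].shape[-1] == 81
```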
-------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | python -i evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d3.csv --kmers 7 --ninp 256 --nhid 1024 2 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | 
#plt.show() 112 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/readme.md: -------------------------------------------------------------------------------- 1 | # Source code to train nucleic transformer to reproduce results in the paper 2 | 3 | Dataset is included here in v9d3.csv 4 | 5 | To run: 6 | ```bash run.sh``` 7 | 8 | To get cross validation results: 9 | ```bash evaluate.sh``` 10 | 11 | Results will be in cv.txt 12 | 13 | To extract top promoter motifs based on attention weights: 14 | ```python extract_motif.py``` 15 | 16 | An eps file named promoter_motifs.eps will be generated 17 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d3.csv --kmers 7 --ninp 256 --nhid 1024 4 | done 5 | -------------------------------------------------------------------------------- /src/Enchancer_classification/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class PromoterDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.data=[] 30 | for seq in sequences: 31 | self.data.append(nucleatide2int(seq)) 32 | 33 | self.data=np.asarray(self.data,dtype='int') 34 | self.labels=labels 35 | 36 | print(self.data.shape) 37 | print(self.labels.shape) 38 | 39 | def __len__(self): 40 | return len(self.labels) 41 | 42 | def __getitem__(self,idx): 43 | return {'data':self.data[idx], 'labels':self.labels[idx]} 44 | -------------------------------------------------------------------------------- /src/Enchancer_classification/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import torch 13 | import random 14 | from sklearn.model_selection import StratifiedKFold 15 | 16 | 17 | def iter_split(data,labels,fold,nfolds=5,seed=2020): 18 | splits = StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True) 19 | splits = list(splits.split(data,labels)) 20 | # splits = np.zeros(len(data)).astype(np.int) 21 | # for i in range(nfolds): splits[splits[i][1]] = i 22 | # indices=np.arange(len(data)) 23 | train_indices=splits[fold][0] 24 | val_indices=splits[fold][1] 25 | return train_indices, val_indices 26 | 27 | def seed_everything(seed=42): 28 | random.seed(seed) 29 | os.environ['PYTHONHASHSEED'] = str(seed) 30 | np.random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 
33 | torch.backends.cudnn.deterministic = True 34 | seed_everything(seed=42) 35 | 36 | def get_best_weights_from_fold(fold,top=1): 37 | csv_file='log_fold{}.csv'.format(fold) 38 | 39 | history=pd.read_csv(csv_file) 40 | scores=np.asarray(history.val_acc) 41 | top_epochs=scores.argsort()[-3:][::-1] 42 | print(scores[top_epochs]) 43 | os.system('mkdir best_weights') 44 | 45 | for i in range(top): 46 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 47 | print(weights_path) 48 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 49 | os.system('rm -r checkpoints_fold{}'.format(fold)) 50 | 51 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 52 | gold = gold.contiguous().view(-1) 53 | one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1) 54 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 55 | log_prb = F.log_softmax(pred, dim=1) 56 | loss = -(one_hot * log_prb) 57 | #loss=loss.sum(1).mean() 58 | return loss 59 | 60 | def mutate_dna_sequence(sequence,nmute=15): 61 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 62 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 63 | sequence[:,to_mutate]=mutation 64 | return sequence 65 | 66 | def get_MLM_mask(sequence,nmask=12): 67 | mask=np.zeros(sequence.shape,dtype='bool') 68 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 69 | mask[:,to_mask]=True 70 | return mask 71 | 72 | def get_complementary_sequence(sequence): 73 | complementary_sequence=sequence.copy() 74 | complementary_sequence[sequence==0]=1 75 | complementary_sequence[sequence==1]=0 76 | complementary_sequence[sequence==2]=3 77 | complementary_sequence[sequence==3]=2 78 | complementary_sequence=complementary_sequence[:,::-1] 79 | return complementary_sequence 80 | 81 | def update_lr(optimizer, lr): 82 | for param_group in optimizer.param_groups: 83 | param_group['lr'] = lr 84 | 85 | def save_weights(model,optimizer,epoch,folder): 86 | if os.path.isdir(folder)==False: 87 | os.makedirs(folder,exist_ok=True) 88 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 89 | 90 | def get_lr(optimizer): 91 | for param_group in optimizer.param_groups: 92 | lr=param_group['lr'] 93 | return lr 94 | 95 | def validate(model,device,dataset,batch_size=64): 96 | batches=len(dataset) 97 | model.train(False) 98 | total=0 99 | ground_truths=[] 100 | predictions=[] 101 | loss=0 102 | criterion=nn.CrossEntropyLoss() 103 | # dataset.switch_mode(training=False) 104 | # dataset.update_batchsize(batch_size) 105 | with torch.no_grad(): 106 | for data in tqdm(dataset): 107 | #data=dataset[i] 108 | X=data['data'].to(device).long() 109 | Y=data['labels'].to(device).long() 110 | output= model(X,None) 111 | del X 112 | loss+=criterion(output,Y) 113 | classification_predictions = torch.argmax(output,dim=1).squeeze() 114 | for pred in classification_predictions: 115 | predictions.append(pred.cpu().numpy()) 116 | for truth in Y: 117 | ground_truths.append(truth.cpu().numpy()) 118 | del output 119 | ground_truths=np.asarray(ground_truths) 120 | torch.cuda.empty_cache() 121 | val_loss=(loss/batches).cpu() 122 | predictions=np.asarray(predictions) 123 | binary_predictions=predictions.copy() 124 | binary_predictions[binary_predictions==2]=1 125 | binary_ground_truths=ground_truths.copy() 126 | binary_ground_truths[binary_ground_truths==2]=1 127 | #print(predictions) 128 | #print(ground_truths) 129 | 
#score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 130 | val_acc=Metrics.accuracy(predictions,ground_truths) 131 | val_sens=Metrics.sensitivity(predictions,ground_truths) 132 | val_spec=Metrics.specificity(predictions,ground_truths) 133 | binary_acc=np.sum(binary_predictions==binary_ground_truths)/len(binary_ground_truths) 134 | print('Accuracy: {}, Binary Accuracy: {} Val Loss: {}'.format(val_acc,binary_acc,val_loss)) 135 | return val_loss,val_acc,val_sens,val_spec 136 | 137 | 138 | def predict(model,device,dataset,batch_size=64): 139 | batches=len(dataset) 140 | model.train(False) 141 | total=0 142 | ground_truths=[] 143 | predictions=[] 144 | attention_weights=[] 145 | sequences=[] 146 | loss=0 147 | criterion=nn.CrossEntropyLoss() 148 | with torch.no_grad(): 149 | for data in tqdm(dataset): 150 | #data=dataset[i] 151 | X=data['data'].to(device,).long() 152 | Y=data['labels'].to(device,dtype=torch.int64) 153 | 154 | output,aw= model(X,None) 155 | #del X 156 | loss+=criterion(output,Y) 157 | classification_predictions = torch.argmax(output,dim=1).squeeze() 158 | for pred in output: 159 | predictions.append(pred.cpu().numpy()) 160 | for weight in aw: 161 | attention_weights.append(weight.cpu().numpy()) 162 | 163 | for t in Y: 164 | ground_truths.append(t.cpu().numpy()) 165 | for seq in X: 166 | sequences.append(seq.cpu().numpy()) 167 | del output 168 | torch.cuda.empty_cache() 169 | val_loss=(loss/batches).cpu() 170 | predictions=np.asarray(predictions) 171 | attention_weights=np.asarray(attention_weights) 172 | binary_predictions=predictions.copy() 173 | binary_predictions[binary_predictions==2]=1 174 | binary_ground_truths=ground_truths.copy() 175 | binary_ground_truths[binary_ground_truths==2]=1 176 | return predictions,attention_weights,np.asarray(sequences),np.asarray(ground_truths) 177 | -------------------------------------------------------------------------------- /src/Enchancer_classification/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Enchancer_classification/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 
| Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Enchancer_classification/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Enchancer_classification/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path 
of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices=iter_split(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[val_indices],labels[val_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 76 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 77 | dropout=opts.dropout,return_aw=True).to(device) 78 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 79 | 80 | # Initialization 81 | # opt_level = 'O1' 82 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 83 | 84 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 85 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 86 | 87 | #evaluation loop 88 | #ground_truths=dataset.labels[dataset.val_indices] 89 
| ensemble_predictions=[] 90 | acc=[] 91 | 92 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 93 | print(weights_path) 94 | checkpoint=torch.load(weights_path) 95 | model.load_state_dict(checkpoint) 96 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 97 | # #validate(model,device,dataset,batch_size=batch_size*2) 98 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 99 | ensemble_predictions.append(predictions) 100 | ensemble_predictions=np.asarray(ensemble_predictions) 101 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 102 | model.cpu() 103 | del model 104 | #del optimizer 105 | torch.cuda.empty_cache() 106 | return ensemble_predictions, ground_truths, attention_weights, sequences 107 | 108 | opts=get_args() 109 | 110 | 111 | predictions=[] 112 | ground_truths=[] 113 | attention_weights=[] 114 | sequences=[] 115 | for i in range(5): 116 | ngram=[7] 117 | p,t,at,seq= evaluate_fold(i) 118 | predictions.append(p) 119 | ground_truths.append(t) 120 | #print(at.shape) 121 | #attention_weights.append(at) 122 | #print(seq.shape) 123 | #sequences.append(seq) 124 | 125 | 126 | probs=np.concatenate(predictions) 127 | ground_truths=np.concatenate(ground_truths) 128 | predictions=np.argmax(probs,axis=1) 129 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 130 | #sequences=np.asarray(sequences).reshape(-1,81) 131 | acc=Metrics.accuracy(predictions,ground_truths) 132 | sens=Metrics.sensitivity(predictions,ground_truths) 133 | spec=Metrics.specificity(predictions,ground_truths) 134 | MCC=matthews_corrcoef(ground_truths,predictions) 135 | 136 | # prediction_dict={'predictions':np.squeeze(predictions), 137 | # 'ground_truths':np.squeeze(ground_truths), 138 | # 'attention_weights':np.squeeze(attention_weights), 139 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 140 | # } 141 | 142 | # with open("prediction_dict.p","wb+") as f: 143 | # pickle.dump(prediction_dict,f) 144 | 145 | 146 | with open("cv.txt",'w+') as f: 147 | f.write(f"ACC: {acc}\n") 148 | f.write(f"sensitivity: {sens}\n") 149 | f.write(f"spec: {spec}\n") 150 | f.write(f"MCC: {MCC}\n") 151 | -------------------------------------------------------------------------------- /src/Enchancer_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | python -i evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 \ 2 | --path ../bert_enhancer_dataset.csv --kmers 7 --ninp 256 --nhid 1024 3 | -------------------------------------------------------------------------------- /src/Enchancer_classification/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##NECESSARY JOB SPECIFICATIONS 4 | #SBATCH --job-name=JobExample5 #Set the job name to "JobExample4" 5 | #SBATCH --time=02:30:00 #Set the wall clock limit to 1hr and 30min 6 | #SBATCH --ntasks=1 #Request 1 task 7 | #SBATCH --mem=5120M #Request 2560MB (2.5GB) per node 8 | #SBATCH --output=out #Send stdout/err to "Example4Out.[jobID]" 9 | #SBATCH --gres=gpu:rtx:1 #Request 1 "rtx" GPU per node 10 | #SBATCH --partition=gpu #Request the GPU partition/queue 11 | 12 | 13 | ##OPTIONAL JOB SPECIFICATIONS 14 | #SBATCH --account=132825315633 15 | #SBATCH --mail-type=ALL #Send email on all job events 16 | #SBATCH --mail-user=shujun@tamu.edu #Send all emails to email_address 17 | 18 | #First Executable Line 19 | 
#cd $SCRATCH 20 | cd /scratch/user/shujun/Nucleic-Transformer/src/promoter_classification_v9d4 21 | #module load Anaconda3 22 | #source /scratch/user/shujun/.conda/envs/torch/bin/activate 23 | #./run.sh 24 | 25 | for i in {0..4};do 26 | /scratch/user/shujun/.conda/envs/torch/bin/python train.py --fold $i --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d4.csv --kmers 7 --ninp 256 --nhid 1024 27 | done 28 | -------------------------------------------------------------------------------- /src/Enchancer_classification/job_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##NECESSARY JOB SPECIFICATIONS 4 | #SBATCH --job-name=JobExample5 #Set the job name to "JobExample4" 5 | #SBATCH --time=00:10:00 #Set the wall clock limit to 1hr and 30min 6 | #SBATCH --ntasks=1 #Request 1 task 7 | #SBATCH --mem=5120M #Request 2560MB (2.5GB) per node 8 | #SBATCH --output=out #Send stdout/err to "Example4Out.[jobID]" 9 | #SBATCH --gres=gpu:rtx:1 #Request 1 "rtx" GPU per node 10 | #SBATCH --partition=gpu #Request the GPU partition/queue 11 | 12 | 13 | ##OPTIONAL JOB SPECIFICATIONS 14 | #SBATCH --account=132825315633 15 | #SBATCH --mail-type=ALL #Send email on all job events 16 | #SBATCH --mail-user=shujun@tamu.edu #Send all emails to email_address 17 | 18 | #First Executable Line 19 | #cd $SCRATCH 20 | cd /scratch/user/shujun/Nucleic-Transformer/src/promoter_classification_v9d4 21 | #module load Anaconda3 22 | #source /scratch/user/shujun/.conda/envs/torch/bin/activate 23 | #./run.sh 24 | 25 | /scratch/user/shujun/.conda/envs/torch/bin/python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d4.csv --kmers 7 --ninp 256 --nhid 1024 26 | -------------------------------------------------------------------------------- /src/Enchancer_classification/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --kmer_aggregation --epochs 50 --nlayers 6 --nmute 15 \ 4 | --path bert_enhancer_dataset.csv \ 5 | --kmers 7 --ninp 256 --nhid 1024 6 | done 7 | -------------------------------------------------------------------------------- /src/Enchancer_classification/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, 
help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices=iter_split(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # exit() 67 | test_dataset=PromoterDataset(sequences,labels) 68 | test_dataloader=torch.utils.data.DataLoader(test_dataset,batch_size=opts.batch_size*2,shuffle=False) 69 | 70 | 71 | 72 | #init model 73 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 74 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 75 | dropout=opts.dropout,return_aw=True).to(device) 76 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 77 | 78 | # Initialization 79 | # opt_level = 'O1' 80 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 81 | 82 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 83 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 84 | 85 | #evaluation loop 86 | #ground_truths=dataset.labels[dataset.val_indices] 87 | ensemble_predictions=[] 88 | acc=[] 89 | 90 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 91 | print(weights_path) 92 | checkpoint=torch.load(weights_path) 93 | model.load_state_dict(checkpoint) 94 | 
predictions,attention_weights,sequences,ground_truths=predict(model,device,test_dataloader) 95 | # #validate(model,device,dataset,batch_size=batch_size*2) 96 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 97 | ensemble_predictions.append(predictions) 98 | ensemble_predictions=np.asarray(ensemble_predictions) 99 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 100 | model.cpu() 101 | del model 102 | #del optimizer 103 | torch.cuda.empty_cache() 104 | return ensemble_predictions, ground_truths, attention_weights, sequences 105 | 106 | opts=get_args() 107 | 108 | 109 | predictions=[] 110 | ground_truths=[] 111 | attention_weights=[] 112 | sequences=[] 113 | for i in range(5): 114 | ngram=[7] 115 | p,t,at,seq= evaluate_fold(i) 116 | predictions.append(p) 117 | ground_truths.append(t) 118 | #print(at.shape) 119 | #attention_weights.append(at) 120 | #print(seq.shape) 121 | #sequences.append(seq) 122 | 123 | 124 | probs=np.stack(predictions,0).mean(0) 125 | ground_truths=np.stack(ground_truths,0).mean(0) 126 | predictions=np.argmax(probs,axis=1) 127 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 128 | #sequences=np.asarray(sequences).reshape(-1,81) 129 | acc=Metrics.accuracy(predictions,ground_truths) 130 | sens=Metrics.sensitivity(predictions,ground_truths) 131 | spec=Metrics.specificity(predictions,ground_truths) 132 | MCC=matthews_corrcoef(ground_truths,predictions) 133 | 134 | # prediction_dict={'predictions':np.squeeze(predictions), 135 | # 'ground_truths':np.squeeze(ground_truths), 136 | # 'attention_weights':np.squeeze(attention_weights), 137 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 138 | # } 139 | 140 | # with open("prediction_dict.p","wb+") as f: 141 | # pickle.dump(prediction_dict,f) 142 | 143 | 144 | with open("test_results.txt",'w+') as f: 145 | f.write(f"ACC: {acc}\n") 146 | f.write(f"sensitivity: {sens}\n") 147 | f.write(f"spec: {spec}\n") 148 | f.write(f"MCC: {MCC}\n") 149 | -------------------------------------------------------------------------------- /src/Enchancer_classification/test.sh: -------------------------------------------------------------------------------- 1 | python -i test.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 \ 2 | --path ../bert_enhancer_test_dataset.csv --kmers 7 --ninp 256 --nhid 1024 3 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/README.md: -------------------------------------------------------------------------------- 1 | # Nucleic_Transformer_Eukaryotic_Promoters 2 | 3 | To run: 4 | 1. download datasets from release 5 | 2. create a new folder 'data' and put the csv files in said folder 6 | 3. 
in each folder there's a ```run.sh``` to run training for that specific dataset 7 | 8 | Folders with '''deepromoter''' suffix are code to run deepromoter training with the same hyperparameters and architecture in the deepromoter paper on the dataset described by the folder name 9 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def 
__init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | 
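# --- added note: prediction_dict is expected to be the pickle written by the evaluation
# script, with keys 'predictions', 'ground_truths', 'attention_weights' and 'sequences'
# (integer-encoded inputs); only those four keys are accessed below.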
df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/human_non_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 --lr_scale 0.2 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if 
target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, 
pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | 
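# --- added note: when the scheduler is constructed with optimizer=None, the guard above
# skips the parameter-group update, so step() can be called in a loop purely to inspect
# or plot the cosine schedule; the computed lr is returned either way.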
return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # 
plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path ../../data/human_non_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/Logger.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | 
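# --- added clarification: in confusion-matrix terms N1 = false negatives, N2 = false
# positives, N3 = all positives (TP+FN) and N4 = all negatives (TN+FP), so sens and spec
# below are the true-positive and true-negative rates. As written, the numerator uses
# (1-sens-spec) rather than (sens+spec-1) and therefore returns the negative of MCC;
# the evaluation scripts shown elsewhere in this repository compute MCC with
# sklearn.metrics.matthews_corrcoef instead.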
N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by 
group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/human_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 32 --lr_scale 0.1 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | 
self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row 
vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import 
torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | from DeePromoter import * 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices, test_indices=iter_split_strict(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # 
exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[test_indices],labels[test_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=DeePromoter([27, 14, 7]).to(device).to(device) 76 | model=nn.DataParallel(model) 77 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 78 | 79 | # Initialization 80 | # opt_level = 'O1' 81 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 82 | 83 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 84 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 85 | 86 | #evaluation loop 87 | #ground_truths=dataset.labels[dataset.val_indices] 88 | ensemble_predictions=[] 89 | acc=[] 90 | 91 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 92 | print(weights_path) 93 | checkpoint=torch.load(weights_path) 94 | model.load_state_dict(checkpoint) 95 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 96 | # #validate(model,device,dataset,batch_size=batch_size*2) 97 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 98 | ensemble_predictions.append(predictions) 99 | ensemble_predictions=np.asarray(ensemble_predictions) 100 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 101 | model.cpu() 102 | del model 103 | #del optimizer 104 | torch.cuda.empty_cache() 105 | return ensemble_predictions, ground_truths, attention_weights, sequences 106 | 107 | opts=get_args() 108 | 109 | 110 | predictions=[] 111 | ground_truths=[] 112 | #attention_weights=[] 113 | sequences=[] 114 | for i in range(5): 115 | ngram=[7] 116 | p,t,at,seq= evaluate_fold(i) 117 | predictions.append(p) 118 | ground_truths.append(t) 119 | #print(at.shape) 120 | #attention_weights.append(at) 121 | sequences.append(seq) 122 | 123 | 124 | probs=np.concatenate(predictions) 125 | ground_truths=np.concatenate(ground_truths) 126 | predictions=np.argmax(probs,axis=1) 127 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 128 | #sequences=np.asarray(sequences).reshape(-1,81) 129 | acc=Metrics.accuracy(predictions,ground_truths) 130 | sens=Metrics.sensitivity(predictions,ground_truths) 131 | spec=Metrics.specificity(predictions,ground_truths) 132 | MCC=matthews_corrcoef(ground_truths,predictions) 133 | precision=precision_score(ground_truths,predictions) 134 | recall=recall_score(ground_truths,predictions) 135 | f1=f1_score(ground_truths,predictions) 136 | # prediction_dict={'predictions':np.squeeze(predictions), 137 | # 'ground_truths':np.squeeze(ground_truths), 138 | # 'attention_weights':np.squeeze(attention_weights), 139 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 140 | # } 141 | 142 | # with open("prediction_dict.p","wb+") as f: 143 | # pickle.dump(prediction_dict,f) 144 | 145 | 146 | with open("cv.txt",'w+') as f: 147 | f.write(f"ACC: {acc}\n") 148 | f.write(f"sensitivity: {sens}\n") 149 | f.write(f"spec: {spec}\n") 150 | f.write(f"precision: {precision}\n") 151 | f.write(f"recall: {recall}\n") 152 | f.write(f"f1: {f1}\n") 153 | f.write(f"MCC: {MCC}\n") 154 | -------------------------------------------------------------------------------- 
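The evaluate.py listing above loads the best checkpoint of each of the five folds, predicts on that fold's held-out test split, applies a softmax to the logits, concatenates the folds, and writes accuracy, sensitivity, specificity, precision, recall, F1 and MCC to ```cv.txt```. Below is a minimal, self-contained sketch of just that aggregation and scoring step; it is not part of the repository. ```fold_logits``` and ```fold_labels``` are hypothetical stand-ins for what ```predict()``` from Functions.py is assumed to return, and the sklearn metrics are imported explicitly here, whereas evaluate.py appears to rely on its wildcard imports to bring them into scope.

```python
import numpy as np
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, f1_score

def softmax(logits):
    # numerically stable row-wise softmax
    z = logits - logits.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

np.random.seed(0)
# stand-ins for the per-fold outputs of predict(): 5 folds, 100 test sequences, 2 classes
fold_logits = [np.random.randn(100, 2) for _ in range(5)]
fold_labels = [np.random.randint(0, 2, 100) for _ in range(5)]

probs = np.concatenate([softmax(l) for l in fold_logits])  # stack the 5 held-out folds
labels = np.concatenate(fold_labels)
preds = probs.argmax(axis=1)                               # hard class predictions

print("ACC", (preds == labels).mean())
print("precision", precision_score(labels, preds))
print("recall", recall_score(labels, preds))
print("f1", f1_score(labels, preds))
print("MCC", matthews_corrcoef(labels, preds))
```

With real data, the five arrays would come from the five fold-specific checkpoints evaluated on their respective test splits, so every sequence is scored exactly once before the metrics are computed.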
/src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path 
../../data/human_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def 
__init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | 
df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/mouse_non_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 --lr_scale 0.2 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if 
target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, 
pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | 
return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # 
plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path ../../data/mouse_non_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/Logger.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | 
N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by 
group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/mouse_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 32 --lr_scale 0.1 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | 
self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row 
vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import 
torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | from DeePromoter import * 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices, test_indices=iter_split_strict(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # 
exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[test_indices],labels[test_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=DeePromoter([27, 14, 7]).to(device).to(device) 76 | model=nn.DataParallel(model) 77 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 78 | 79 | # Initialization 80 | # opt_level = 'O1' 81 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 82 | 83 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 84 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 85 | 86 | #evaluation loop 87 | #ground_truths=dataset.labels[dataset.val_indices] 88 | ensemble_predictions=[] 89 | acc=[] 90 | 91 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 92 | print(weights_path) 93 | checkpoint=torch.load(weights_path) 94 | model.load_state_dict(checkpoint) 95 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 96 | # #validate(model,device,dataset,batch_size=batch_size*2) 97 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 98 | ensemble_predictions.append(predictions) 99 | ensemble_predictions=np.asarray(ensemble_predictions) 100 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 101 | model.cpu() 102 | del model 103 | #del optimizer 104 | torch.cuda.empty_cache() 105 | return ensemble_predictions, ground_truths, attention_weights, sequences 106 | 107 | opts=get_args() 108 | 109 | 110 | predictions=[] 111 | ground_truths=[] 112 | #attention_weights=[] 113 | sequences=[] 114 | for i in range(5): 115 | ngram=[7] 116 | p,t,at,seq= evaluate_fold(i) 117 | predictions.append(p) 118 | ground_truths.append(t) 119 | #print(at.shape) 120 | #attention_weights.append(at) 121 | sequences.append(seq) 122 | 123 | 124 | probs=np.concatenate(predictions) 125 | ground_truths=np.concatenate(ground_truths) 126 | predictions=np.argmax(probs,axis=1) 127 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 128 | #sequences=np.asarray(sequences).reshape(-1,81) 129 | acc=Metrics.accuracy(predictions,ground_truths) 130 | sens=Metrics.sensitivity(predictions,ground_truths) 131 | spec=Metrics.specificity(predictions,ground_truths) 132 | MCC=matthews_corrcoef(ground_truths,predictions) 133 | precision=precision_score(ground_truths,predictions) 134 | recall=recall_score(ground_truths,predictions) 135 | f1=f1_score(ground_truths,predictions) 136 | # prediction_dict={'predictions':np.squeeze(predictions), 137 | # 'ground_truths':np.squeeze(ground_truths), 138 | # 'attention_weights':np.squeeze(attention_weights), 139 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 140 | # } 141 | 142 | # with open("prediction_dict.p","wb+") as f: 143 | # pickle.dump(prediction_dict,f) 144 | 145 | 146 | with open("cv.txt",'w+') as f: 147 | f.write(f"ACC: {acc}\n") 148 | f.write(f"sensitivity: {sens}\n") 149 | f.write(f"spec: {spec}\n") 150 | f.write(f"precision: {precision}\n") 151 | f.write(f"recall: {recall}\n") 152 | f.write(f"f1: {f1}\n") 153 | f.write(f"MCC: {MCC}\n") 154 | -------------------------------------------------------------------------------- 
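The evaluation script above instantiates the DeePromoter baseline whose fully connected head is sized by pushing a dummy tensor through the convolutional and BiLSTM stages (the `get_feature_shape` step in DeePromoter.py). Below is a minimal usage sketch, not part of the repository, assuming `DeePromoter.py` from this folder is importable; the batch size is illustrative and the kernel sizes follow `evaluate.py`.

```python
# Hypothetical usage sketch: instantiate the DeePromoter baseline defined above
# and run a dummy one-hot batch through it. Kernel sizes [27, 14, 7] follow
# evaluate.py; the batch size of 2 is purely illustrative.
import torch
from DeePromoter import DeePromoter  # assumes this folder is on the Python path

model = DeePromoter([27, 14, 7])   # __init__ runs get_feature_shape() on a dummy
                                   # (64, 300, 4) tensor to size the Linear head
dummy = torch.zeros(2, 300, 4)     # batch of 2 one-hot sequences of length 300
logits = model(dummy)              # conv branches -> BiLSTM -> flatten -> FC
print(logits.shape)                # torch.Size([2, 2]): two-class logits
```

Sizing the head with a dummy forward pass keeps the Linear layers consistent with whatever kernel list and pooling size are chosen, so the kernel configuration can be changed without editing the classifier by hand.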
/src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path 
../../data/mouse_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/run_human.sh: -------------------------------------------------------------------------------- 1 | cd human_tata 2 | bash run.sh 3 | cd .. 4 | 5 | cd human_non_tata 6 | bash run.sh 7 | cd .. 8 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/run_mouse.sh: -------------------------------------------------------------------------------- 1 | cd mouse_tata 2 | bash run.sh 3 | cd .. 4 | 5 | cd mouse_non_tata 6 | bash run.sh 7 | cd .. 8 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class DeepSeaDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.sequences=sequences 30 | self.labels=labels 31 | 32 | 33 | def __len__(self): 34 | return len(self.labels) 35 | 36 | def __getitem__(self,idx): 37 | sequence=self.sequences[idx].argmax(0) 38 | return {'data':sequence, 'labels':self.labels[idx]} 39 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import random 13 | 14 | def seed_everything(seed=42): 15 | random.seed(seed) 16 | os.environ['PYTHONHASHSEED'] = str(seed) 17 | np.random.seed(seed) 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | torch.backends.cudnn.deterministic = True 21 | 22 | def get_best_weights_from_fold(fold,top=3): 23 | csv_file='log_fold{}.csv'.format(fold) 24 | 25 | history=pd.read_csv(csv_file) 26 | scores=np.asarray(history.val_auc) 27 | top_epochs=scores.argsort()[-3:][::-1] 28 | print(scores[top_epochs]) 29 | os.system('mkdir best_weights') 30 | 31 | for i in range(top): 32 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 33 | print(weights_path) 34 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 35 | os.system('rm -r checkpoints_fold{}'.format(fold)) 36 | 37 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 38 | gold = gold.contiguous().view(-1) 39 | one_hot = torch.zeros_like(pred).scatter(1, 
gold.view(-1, 1), 1) 40 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 41 | log_prb = F.log_softmax(pred, dim=1) 42 | loss = -(one_hot * log_prb) 43 | #loss=loss.sum(1).mean() 44 | return loss 45 | 46 | def mutate_dna_sequence(sequence,nmute=15): 47 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 48 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 49 | sequence[:,to_mutate]=mutation 50 | return sequence 51 | 52 | def get_MLM_mask(sequence,nmask=12): 53 | mask=np.zeros(sequence.shape,dtype='bool') 54 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 55 | mask[:,to_mask]=True 56 | return mask 57 | 58 | def get_complementary_sequence_deepsea(sequence): 59 | #AGCT 60 | complementary_sequence=sequence.clone() 61 | complementary_sequence[sequence==0]=3 62 | complementary_sequence[sequence==1]=2 63 | complementary_sequence[sequence==2]=1 64 | complementary_sequence[sequence==3]=0 65 | complementary_sequence=complementary_sequence.flip(-1) 66 | return complementary_sequence 67 | 68 | def update_lr(optimizer, lr): 69 | for param_group in optimizer.param_groups: 70 | param_group['lr'] = lr 71 | 72 | def save_weights(model,optimizer,epoch,folder): 73 | if os.path.isdir(folder)==False: 74 | os.makedirs(folder,exist_ok=True) 75 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 76 | 77 | 78 | 79 | def validate(model,device,dataset,batch_size=64): 80 | batches=len(dataset) 81 | model.train(False) 82 | total=0 83 | predictions=[] 84 | outputs=[] 85 | ground_truths=[] 86 | loss=0 87 | criterion=nn.BCEWithLogitsLoss() 88 | with torch.no_grad(): 89 | for data in tqdm(dataset): 90 | X=data['data'].to(device).long() 91 | Y=data['labels'].to(device).float() 92 | 93 | output= model(X) 94 | del X 95 | loss+=criterion(output,Y) 96 | probs = torch.sigmoid(output) 97 | for pred in probs: 98 | predictions.append(pred.cpu().numpy()>0.5) 99 | for vector in probs: 100 | outputs.append(vector.cpu().numpy()) 101 | for t in Y: 102 | ground_truths.append(t.cpu().numpy()) 103 | del output 104 | torch.cuda.empty_cache() 105 | val_loss=(loss/batches).cpu() 106 | ground_truths=np.asarray(ground_truths).reshape(-1) 107 | predictions=np.asarray(predictions).reshape(-1) 108 | outputs=np.asarray(outputs).reshape(-1) 109 | #score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 110 | val_acc=Metrics.accuracy(predictions,ground_truths) 111 | auc=metrics.roc_auc_score(ground_truths,outputs) 112 | val_sens=Metrics.sensitivity(predictions,ground_truths) 113 | val_spec=Metrics.specificity(predictions,ground_truths) 114 | print('Val accuracy: {}, Val_auc: {}, Val Loss: {}'.format(val_acc,auc,val_loss)) 115 | return val_loss,auc,val_acc,val_sens,val_spec 116 | 117 | 118 | def predict(model,device,dataset,batch_size=64): 119 | batches=int(len(dataset.val_indices)/batch_size)+1 120 | model.train(False) 121 | total=0 122 | ground_truths=dataset.labels[dataset.val_indices] 123 | predictions=[] 124 | attention_weights=[] 125 | loss=0 126 | criterion=nn.CrossEntropyLoss() 127 | dataset.switch_mode(training=False) 128 | dataset.update_batchsize(batch_size) 129 | with torch.no_grad(): 130 | for i in tqdm(range(len(dataset))): 131 | data=dataset[i] 132 | X=torch.Tensor(data['data']).to(device,).long() 133 | Y=torch.Tensor(data['labels']).to(device,dtype=torch.int64) 134 | directions=data['directions'] 135 | directions=directions.reshape(len(directions),1)*np.ones(X.shape) 136 | 
directions=torch.Tensor(directions).to(device).long() 137 | output,_,_,aw= model(X,directions,None) 138 | del X 139 | loss+=criterion(output,Y) 140 | classification_predictions = torch.argmax(output,dim=1).squeeze() 141 | for pred in output: 142 | predictions.append(pred.cpu().numpy()) 143 | for weight in aw: 144 | attention_weights.append(weight.cpu().numpy()) 145 | 146 | del output 147 | torch.cuda.empty_cache() 148 | val_loss=(loss/batches).cpu() 149 | predictions=np.asarray(predictions) 150 | attention_weights=np.asarray(attention_weights) 151 | binary_predictions=predictions.copy() 152 | binary_predictions[binary_predictions==2]=1 153 | binary_ground_truths=ground_truths.copy() 154 | binary_ground_truths[binary_ground_truths==2]=1 155 | return predictions,attention_weights,np.asarray(dataset.data[dataset.val_indices]) 156 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | header=True 16 | else: 17 | header=False 18 | return header 19 | 20 | 21 | def _write_header(self): 22 | with open(self.file,"a") as f: 23 | string="" 24 | for attrib in self.columns: 25 | string+="{},".format(attrib) 26 | string=string[:len(string)-1] 27 | string+="\n" 28 | f.write(string) 29 | return self 30 | 31 | def log(self,row): 32 | if len(row)!=len(self.columns): 33 | raise Exception("Mismatch between row vector and number of columns in logger") 34 | with open(self.file,"a") as f: 35 | string="" 36 | for attrib in row: 37 | string+="{},".format(attrib) 38 | string=string[:len(string)-1] 39 | string+="\n" 40 | f.write(string) 41 | return self 42 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Metrics.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/compute_median_aucs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | df=pd.read_csv('test_aucs.csv') 4 | deepsea=pd.read_excel('../41592_2015_BFnmeth3547_MOESM646_ESM.xlsx') 5 | deepsea_aucs=deepsea.iloc[1:,4] 6 | deepsea_aucs[599]=1 7 | 8 | with open("test_results.txt",'w+') as f: 9 | f.write('###NT###\n') 10 | f.write(f"DNase_median_acu: {df.AUC.iloc[:125].median()}\n") 11 | f.write(f"TF_median_acu: {df.AUC.iloc[125:815].median()}\n") 12 | f.write(f"Histone_median_acu: {df.AUC.iloc[815:919].median()}\n") 13 | f.write('###Deep Sea###\n') 14 | f.write(f"DNase_median_acu: {deepsea_aucs[:125].median()}\n") 15 | f.write(f"TF_median_acu: {deepsea_aucs[125:815].median()}\n") 16 | f.write(f"Histone_median_acu: {deepsea_aucs[815:919].median()}\n") 17 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/compute_val_aucs.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn import metrics 3 | from tqdm import tqdm 4 | import numpy as np 5 | 6 | 7 | with open('test_results.p','rb') as f: 8 | outputs,ground_truths=pickle.load(f) 9 | 10 | aucs=[] 11 | for i in tqdm(range(598)): 12 | auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 13 | aucs.append(auc) 14 | 15 | aucs.append(1) 16 | for i in tqdm(range(599,919)): 17 | auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 18 | aucs.append(auc) 19 | 20 | import pandas as pd 21 | df=pd.DataFrame(columns=['AUC']) 22 | df['AUC']=aucs 23 | 24 | df.to_csv('test_aucs.csv') 25 | 26 | # #exit() 27 | # aucs=[] 28 | # for i in tqdm(range(125)): 29 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 30 | # aucs.append(auc) 31 | # all_aucs.append(auc) 32 | # 33 | # DNase_median_acu=np.median(aucs) 34 | # 35 | # 36 | # aucs=[] 37 | # for i in tqdm(range(125,598)): 38 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 39 | # aucs.append(auc) 40 | # 41 | # 42 | # for i in tqdm(range(599,815)): 43 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 44 | # aucs.append(auc) 45 | # 46 | # TF_median_acu=np.median(aucs) 47 | # 48 | # aucs=[] 49 | # for i in 
tqdm(range(815,919)): 50 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 51 | # aucs.append(auc) 52 | # 53 | # 54 | # Histone_median_acu=np.median(aucs) 55 | # 56 | # with open("test_results.txt",'w+') as f: 57 | # f.write(f"DNase_median_acu: {DNase_median_acu}\n") 58 | # f.write(f"TF_median_acu: {TF_median_acu}\n") 59 | # f.write(f"Histone_median_acu: {Histone_median_acu}\n") 60 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | import h5py 3 | import numpy as np 4 | 5 | f=h5py.File('deepsea_train/train.mat', 'r')# as f: 6 | train_seqs =np.array(f['trainxdata']).transpose(2,1,0).astype('uint8') 7 | train_labels =np.array(f['traindata']).transpose(1,0).astype('uint8') 8 | val_data = scipy.io.loadmat('deepsea_train/valid.mat') 9 | val_seqs = np.array(val_data['validxdata']).transpose(2,1,0).astype('uint8') 10 | val_labels = np.array(val_data['validdata']).transpose(1,0).astype('uint8') 11 | 12 | import pickle 13 | with open('DeepSea_TrainVal.p','wb+') as f: 14 | pickle.dump([train_seqs,train_labels,val_seqs,val_labels],f) 15 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/readme.md: -------------------------------------------------------------------------------- 1 | # classifying effects of non-coding variants 2 | 3 | 1. download datasets from http://deepsea.princeton.edu/media/code/deepsea_train_bundle.v0.9.tar.gz 4 | 2. create folder ```deepsea_train``` and unzip contents into folder and run ```preprocess.py``` to extract train val set to a numpy file (for faster data loading) 5 | 3. ```bash run.sh``` to run training 6 | 4. ```bash test.sh``` to make inference on the test set 7 | 5. 
```compute_val_aucs.py``` and ```compute_median_aucs.py``` to calculate test aucs and median aucs in TF/DNS/HM 8 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/restart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python restart.py --gpu_id 0,1 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 3 \ 5 | --batch_size 256 --kmers 13 --lr_scale 0.1 --ninp 512 --nhid 2048 --num_workers 32 \ 6 | --nclass 919 --nhead 8 --restart_epoch 20 7 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python train.py --gpu_id 0,1 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 3 \ 5 | --batch_size 512 --kmers 7 --lr_scale 1 --ninp 1024 --nhid 4096 --num_workers 32 \ 6 | --nclass 919 --nhead 16 --weight_decay 1e-6 --dropout 0.2 7 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python validate.py --gpu_id 0,1 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 3 \ 5 | --batch_size 1024 --kmers 7 --lr_scale 0.1 --ninp 1024 --nhid 4096 --num_workers 32 \ 6 | --nclass 919 --nhead 16 7 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import time 5 | from Functions import * 6 | from Dataset import * 7 | from Network import * 8 | from LrScheduler import * 9 | import Metrics 10 | from Logger import CSVLogger 11 | import argparse 12 | try: 13 | #from apex.parallel import DistributedDataParallel as DDP 14 | from apex.fp16_utils import * 15 | from apex import amp, optimizers 16 | from apex.multi_tensor_apply import multi_tensor_applier 17 | except ImportError: 18 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 19 | 20 | 21 | def get_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 24 | parser.add_argument('--path', type=str, default='../', help='path of csv file with DNA sequences and labels') 25 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 26 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 27 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 28 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 29 | parser.add_argument('--nclass', type=int, default=919, help='number of classes from the linear decoder') 30 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 31 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 32 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 33 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 34 | parser.add_argument('--save_freq', type=int, default=1, 
help='saving checkpoints per save_freq epochs') 35 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 36 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 37 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 38 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 39 | parser.add_argument('--kmers', type=int, nargs='+', default=[7], help='k-mers to be aggregated') 40 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 41 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 42 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 43 | parser.set_defaults(kmer_aggregation=True) 44 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 45 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 46 | parser.add_argument('--val_freq', type=int, default=1, help='which fold to train') 47 | parser.add_argument('--num_workers', type=int, default=1, help='num_workers') 48 | opts = parser.parse_args() 49 | return opts 50 | 51 | #def train_fold(): 52 | 53 | opts=get_args() 54 | seed_everything(2020) 55 | #gpu selection 56 | os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id 57 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 58 | 59 | 60 | import pickle 61 | with open('DeepSea_TrainVal.p','rb') as f: 62 | train_seqs,train_labels,val_seqs,val_labels=pickle.load(f) 63 | 64 | #exit() 65 | 66 | dataset=DeepSeaDataset(train_seqs,train_labels) 67 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True,num_workers=opts.num_workers) 68 | val_dataset=DeepSeaDataset(val_seqs.transpose(2,1,0),val_labels.transpose(1,0)) 69 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*4,shuffle=False) 70 | 71 | #exit() 72 | #lr=0 73 | 74 | #checkpointing 75 | checkpoints_folder='checkpoints_fold{}'.format((opts.fold)) 76 | csv_file='log_fold{}.csv'.format((opts.fold)) 77 | columns=['epoch','train_loss', 78 | 'val_loss','val_auc','val_acc','val_sens','val_spec'] 79 | logger=CSVLogger(columns,csv_file) 80 | 81 | #build model and logger 82 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 83 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 84 | dropout=opts.dropout).to(device) 85 | optimizer=torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay) 86 | criterion=nn.BCEWithLogitsLoss(reduction='none') 87 | lr_schedule=lr_AIAYN(optimizer,opts.ninp,opts.warmup_steps,opts.lr_scale) 88 | # Initialization 89 | opt_level = 'O1' 90 | model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 91 | model = nn.DataParallel(model) 92 | #softmax = nn.Softmax(dim=1) 93 | 94 | pytorch_total_params = sum(p.numel() for p in model.parameters()) 95 | print('Total number of paramters: {}'.format(pytorch_total_params)) 96 | 97 | #print("Starting training for fold {}/{}".format(opts.fold,opts.nfolds)) 98 | #training loop 99 | for epoch in range(opts.epochs): 100 | model.train(True) 101 | t=time.time() 102 | total_loss=0 103 | optimizer.zero_grad() 104 | total_steps=len(dataloader) 105 | step=0 106 | for data in tqdm(dataloader): 107 | step+=1 108 | #for step in range(1): 109 | lr=lr_schedule.step() 110 | 
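# per-step flow: move the batch to the GPU, run the forward pass over all 919
# targets, and compute an element-wise BCEWithLogitsLoss in which positive
# labels are up-weighted 10x before averaging; the backward pass goes through
# apex AMP loss scaling and gradients are clipped to a max norm of 1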
src=data['data'].to(device).long() 111 | labels=data['labels'].to(device).float() 112 | #exit() 113 | #mutated_sequence=mutate_dna_sequence(src,opts.nmute).to(device) 114 | output=model(src) 115 | #loss_weight=torch.ones(len(output),device=device) 116 | loss_weight=torch.ones_like(labels) 117 | loss_weight[labels==1]=10 118 | loss=criterion(output.reshape(-1),labels.reshape(-1))*loss_weight.reshape(-1) 119 | loss=loss.mean() 120 | 121 | with amp.scale_loss(loss, optimizer) as scaled_loss: 122 | scaled_loss.backward() 123 | torch.nn.utils.clip_grad_norm_(model.parameters(), 1) 124 | optimizer.step() 125 | optimizer.zero_grad() 126 | total_loss+=loss 127 | # print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}" 128 | # .format(epoch+1, opts.epochs, step+1, total_steps, total_loss/(step+1) , lr,time.time()-t),end='\r',flush=True) #total_loss/(step+1) 129 | # #break 130 | print('') 131 | 132 | train_loss=total_loss/(step+1) 133 | 134 | if (epoch+1)%opts.val_freq==0: 135 | val_loss,auc,val_acc,val_sens,val_spec=validate(model,device,val_dataloader,batch_size=opts.batch_size*2) 136 | print("Epoch {} train loss: {}".format(epoch+1,train_loss)) 137 | 138 | to_log=[epoch+1,train_loss,val_loss,auc,val_acc,val_sens,val_spec] 139 | logger.log(to_log) 140 | 141 | 142 | if (epoch+1)%opts.save_freq==0: 143 | save_weights(model,optimizer,epoch,checkpoints_folder) 144 | 145 | 146 | get_best_weights_from_fold(opts.fold) 147 | 148 | #train_fold() 149 | -------------------------------------------------------------------------------- /src/Viral_identification/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class ViraminerDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.data=[] 30 | for seq in sequences: 31 | self.data.append(nucleatide2int(seq)) 32 | 33 | self.data=np.asarray(self.data,dtype='int') 34 | self.labels=np.asarray(labels,dtype='int') 35 | 36 | print(self.data.shape) 37 | print(self.labels.shape) 38 | 39 | def __len__(self): 40 | return len(self.labels) 41 | 42 | def __getitem__(self,idx): 43 | return {'data':self.data[idx], 'labels':self.labels[idx]} 44 | -------------------------------------------------------------------------------- /src/Viral_identification/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import random 13 | 14 | def seed_everything(seed=42): 15 | random.seed(seed) 16 | os.environ['PYTHONHASHSEED'] = str(seed) 17 | np.random.seed(seed) 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | torch.backends.cudnn.deterministic = True 21 | 22 | def 
get_best_weights_from_fold(fold,top=3): 23 | csv_file='log_fold{}.csv'.format(fold) 24 | 25 | history=pd.read_csv(csv_file) 26 | scores=np.asarray(history.val_auc) 27 | top_epochs=scores.argsort()[-3:][::-1] 28 | print(scores[top_epochs]) 29 | os.system('mkdir best_weights') 30 | 31 | for i in range(top): 32 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 33 | print(weights_path) 34 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 35 | os.system('rm -r checkpoints_fold{}'.format(fold)) 36 | 37 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 38 | gold = gold.contiguous().view(-1) 39 | one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1) 40 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 41 | log_prb = F.log_softmax(pred, dim=1) 42 | loss = -(one_hot * log_prb) 43 | #loss=loss.sum(1).mean() 44 | return loss 45 | 46 | def mutate_dna_sequence(sequence,nmute=15): 47 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 48 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 49 | sequence[:,to_mutate]=mutation 50 | return sequence 51 | 52 | def get_MLM_mask(sequence,nmask=12): 53 | mask=np.zeros(sequence.shape,dtype='bool') 54 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 55 | mask[:,to_mask]=True 56 | return mask 57 | 58 | def get_complementary_sequence(sequence): 59 | complementary_sequence=sequence.copy() 60 | complementary_sequence[sequence==0]=1 61 | complementary_sequence[sequence==1]=0 62 | complementary_sequence[sequence==2]=3 63 | complementary_sequence[sequence==3]=2 64 | complementary_sequence=complementary_sequence[:,::-1] 65 | return complementary_sequence 66 | 67 | def update_lr(optimizer, lr): 68 | for param_group in optimizer.param_groups: 69 | param_group['lr'] = lr 70 | 71 | def save_weights(model,optimizer,epoch,folder): 72 | if os.path.isdir(folder)==False: 73 | os.makedirs(folder,exist_ok=True) 74 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 75 | 76 | 77 | 78 | def validate(model,device,dataset,batch_size=64): 79 | batches=len(dataset) 80 | model.train(False) 81 | total=0 82 | predictions=[] 83 | outputs=[] 84 | ground_truths=[] 85 | loss=0 86 | criterion=nn.CrossEntropyLoss() 87 | with torch.no_grad(): 88 | for data in tqdm(dataset): 89 | X=data['data'].to(device) 90 | Y=data['labels'].to(device) 91 | 92 | output= model(X) 93 | del X 94 | loss+=criterion(output,Y) 95 | classification_predictions = torch.argmax(output,dim=1).squeeze() 96 | for pred in classification_predictions: 97 | predictions.append(pred.cpu().numpy()) 98 | for vector in output: 99 | outputs.append(vector.cpu().numpy()) 100 | for t in Y: 101 | ground_truths.append(t.cpu().numpy()) 102 | del output 103 | torch.cuda.empty_cache() 104 | val_loss=(loss/batches).cpu() 105 | ground_truths=np.asarray(ground_truths) 106 | predictions=np.asarray(predictions) 107 | outputs=np.asarray(outputs) 108 | #print(predictions) 109 | #print(ground_truths) 110 | #score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 111 | val_acc=Metrics.accuracy(predictions,ground_truths) 112 | auc=metrics.roc_auc_score(ground_truths,outputs[:,1]) 113 | val_sens=Metrics.sensitivity(predictions,ground_truths) 114 | val_spec=Metrics.specificity(predictions,ground_truths) 115 | print('Val accuracy: {}, Val Loss: {}'.format(val_acc,val_loss)) 116 | return val_loss,auc,val_acc,val_sens,val_spec 117 | 118 | 119 | def 
predict(model,device,dataset,batch_size=64): 120 | batches=int(len(dataset.val_indices)/batch_size)+1 121 | model.train(False) 122 | total=0 123 | ground_truths=dataset.labels[dataset.val_indices] 124 | predictions=[] 125 | attention_weights=[] 126 | loss=0 127 | criterion=nn.CrossEntropyLoss() 128 | dataset.switch_mode(training=False) 129 | dataset.update_batchsize(batch_size) 130 | with torch.no_grad(): 131 | for i in tqdm(range(len(dataset))): 132 | data=dataset[i] 133 | X=torch.Tensor(data['data']).to(device,).long() 134 | Y=torch.Tensor(data['labels']).to(device,dtype=torch.int64) 135 | directions=data['directions'] 136 | directions=directions.reshape(len(directions),1)*np.ones(X.shape) 137 | directions=torch.Tensor(directions).to(device).long() 138 | output,_,_,aw= model(X,directions,None) 139 | del X 140 | loss+=criterion(output,Y) 141 | classification_predictions = torch.argmax(output,dim=1).squeeze() 142 | for pred in output: 143 | predictions.append(pred.cpu().numpy()) 144 | for weight in aw: 145 | attention_weights.append(weight.cpu().numpy()) 146 | 147 | del output 148 | torch.cuda.empty_cache() 149 | val_loss=(loss/batches).cpu() 150 | predictions=np.asarray(predictions) 151 | attention_weights=np.asarray(attention_weights) 152 | binary_predictions=predictions.copy() 153 | binary_predictions[binary_predictions==2]=1 154 | binary_ground_truths=ground_truths.copy() 155 | binary_ground_truths[binary_ground_truths==2]=1 156 | return predictions,attention_weights,np.asarray(dataset.data[dataset.val_indices]) 157 | -------------------------------------------------------------------------------- /src/Viral_identification/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | header=True 16 | else: 17 | header=False 18 | return header 19 | 20 | 21 | def _write_header(self): 22 | with open(self.file,"a") as f: 23 | string="" 24 | for attrib in self.columns: 25 | string+="{},".format(attrib) 26 | string=string[:len(string)-1] 27 | string+="\n" 28 | f.write(string) 29 | return self 30 | 31 | def log(self,row): 32 | if len(row)!=len(self.columns): 33 | raise Exception("Mismatch between row vector and number of columns in logger") 34 | with open(self.file,"a") as f: 35 | string="" 36 | for attrib in row: 37 | string+="{},".format(attrib) 38 | string=string[:len(string)-1] 39 | string+="\n" 40 | f.write(string) 41 | return self 42 | -------------------------------------------------------------------------------- /src/Viral_identification/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | 
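# Noam schedule from "Attention is All You Need":
#   lr = factor * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
# i.e. a linear warm-up over warmup_steps steps followed by 1/sqrt(step) decay;
# with d_model=512, warmup_steps=3200 and factor=0.1 (the values used by run.sh)
# the peak learning rate reached at step 3200 is roughly 7.8e-5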
update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Viral_identification/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Viral_identification/evaluate_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import time 5 | from Functions import * 6 | from Dataset import * 7 | from Network import * 8 | from LrScheduler import * 9 | import Metrics 10 | from Logger import CSVLogger 11 | import argparse 12 | try: 13 | #from apex.parallel import DistributedDataParallel as DDP 14 | from apex.fp16_utils import * 15 | from apex import amp, optimizers 16 | from apex.multi_tensor_apply import multi_tensor_applier 17 | except ImportError: 18 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 19 | from tqdm import tqdm 20 | 21 | def get_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--gpu_id', type=str, default='0,1', help='which gpu to use') 24 | parser.add_argument('--path', type=str, default='../', help='path of csv file with DNA sequences and labels') 25 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 26 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 27 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 28 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 29 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 30 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 
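# the architecture flags (ninp above, nhead/nhid/nlayers/kmers below) need to
# match the values used at training time, otherwise the checkpoints restored
# from best_weights/ will not load into the rebuilt NucleicTransformer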
31 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 32 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 33 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 34 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 35 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 36 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 37 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 38 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 39 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 40 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 41 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 42 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 43 | parser.set_defaults(kmer_aggregation=True) 44 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 45 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 46 | parser.add_argument('--val_freq', type=int, default=1, help='which fold to train') 47 | opts = parser.parse_args() 48 | return opts 49 | 50 | 51 | opts=get_args() 52 | #gpu selection 53 | os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id 54 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 55 | #lr=0 56 | 57 | #checkpointing 58 | checkpoints_folder='checkpoints_fold{}'.format((opts.fold)) 59 | csv_file='log_fold{}.csv'.format((opts.fold)) 60 | columns=['epoch','train_loss','train_acc','recon_acc', 61 | 'val_loss','val_auc','val_acc','val_sens','val_spec'] 62 | #logger=CSVLogger(columns,csv_file) 63 | 64 | #build model and logger 65 | MODELS=[] 66 | for i in range(3): 67 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 68 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 69 | dropout=opts.dropout).to(device) 70 | optimizer=torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay) 71 | criterion=nn.CrossEntropyLoss(reduction='none') 72 | lr_schedule=lr_AIAYN(optimizer,opts.ninp,opts.warmup_steps,opts.lr_scale) 73 | # Initialization 74 | opt_level = 'O1' 75 | model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 76 | model = nn.DataParallel(model) 77 | 78 | 79 | pytorch_total_params = sum(p.numel() for p in model.parameters()) 80 | print('Total number of paramters: {}'.format(pytorch_total_params)) 81 | 82 | model.load_state_dict(torch.load("best_weights/fold0top{}.ckpt".format(i+1))) 83 | model.eval() 84 | MODELS.append(model) 85 | 86 | dict=MODELS[0].module.state_dict() 87 | for key in dict: 88 | for i in range(1,len(MODELS)): 89 | dict[key]=dict[key]+MODELS[i].module.state_dict()[key] 90 | 91 | dict[key]=dict[key]/float(len(MODELS)) 92 | 93 | MODELS[0].module.load_state_dict(dict) 94 | avg_model=MODELS[0] 95 | 96 | def geometric_mean(preds): 97 | gmean=np.ones(preds.shape[1:]) 98 | 99 | for pred in preds: 100 | gmean=gmean*pred 101 | 102 | gmean=gmean**(1/len(preds)) 103 | return gmean 104 | 105 | df=pd.read_csv('../fullset_test.csv',header=None) 106 | 107 | seqs=[] 108 | labels=[] 109 | 110 | 
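# fullset_test.csv is read without a header: column 1 holds the DNA sequence and
# column 2 the binary label, so each row is tokenized with nucleatide2int
# (A/T/G/C -> 0..3) and the labels are collected into an integer array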
for i in range(len(df)): 111 | seqs.append(nucleatide2int(df.iloc[i,1])) 112 | labels.append(df.iloc[i,2]) 113 | labels=np.asarray(labels).astype("int") 114 | seqs=np.asarray(seqs).astype("int") 115 | 116 | 117 | batch_size=128 118 | batches=np.around(len(df)/batch_size+0.5).astype('int') 119 | preds=[] 120 | softmax = nn.Softmax(dim=1) 121 | for i in tqdm(range(batches)): 122 | with torch.no_grad(): 123 | outputs=[] 124 | #for model in MODELS: 125 | x=torch.Tensor(seqs[i*batch_size:(i+1)*batch_size]).to(device).long() 126 | y=softmax(avg_model(x)) 127 | #outputs.append(softmax(y).cpu().numpy()) 128 | for vec in y: 129 | preds.append(vec.cpu().numpy()) 130 | 131 | from sklearn import metrics 132 | preds=np.asarray(preds) 133 | auc=metrics.roc_auc_score(labels,preds[:,1]) 134 | 135 | with open("test_results.p",'wb+') as f: 136 | pickle.dump([labels,preds],f) 137 | 138 | 139 | print(auc) 140 | with open("test_score.txt",'w+') as f: 141 | f.write("test auc score: {}".format(auc)) 142 | 143 | 144 | 145 | 146 | # for i in range(3,10): 147 | # ngrams=np.arange(2,i) 148 | # print(ngrams) 149 | # train_fold(0,ngrams) 150 | # # train_fold(0,[2,3,4]) 151 | -------------------------------------------------------------------------------- /src/Viral_identification/readme.md: -------------------------------------------------------------------------------- 1 | # Source code to train nucleic transformer to reproduce results in the paper for the viraminer dataset 2 | 3 | Dataset can be downloaded at https://github.com/NeuroCSUT/ViraMiner/tree/master/data/DNA_data 4 | 5 | Download fullset_test.csv, fullset_train.csv, and fullset_validation.csv and put them on directory above the folder where you plan to run training (their paths should be ../fullset_test.csv etc) 6 | 7 | To run training: ```./run.sh``` 8 | 9 | You might need to lower the batch size depending on what GPU you have. 
If you run into memory error with cuda, lower --batch_size in run.sh 10 | 11 | To check results on the test set: ```./evaluate_test.sh``` 12 | 13 | Test results will be saved in a pickle file named test_results.p, and the AUC score will be printed out to test_score.txt 14 | -------------------------------------------------------------------------------- /src/Viral_identification/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python train.py --gpu_id 0 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 6 \ 5 | --batch_size 128 --kmers 13 --lr_scale 0.1 --ninp 512 --nhid 2048 --num_workers 8 6 | -------------------------------------------------------------------------------- /src/Viral_identification/test.sh: -------------------------------------------------------------------------------- 1 | python evaluate_test.py --gpu_id 0,1 --kmer_aggregation --nmute 20 --epochs 100 --nlayers 6 \ 2 | --batch_size 128 --kmers 13 --lr_scale 0.1 --ninp 512 --nhid 2048 3 | -------------------------------------------------------------------------------- /src/Viral_identification/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import time 5 | from Functions import * 6 | from Dataset import * 7 | from Network import * 8 | from LrScheduler import * 9 | import Metrics 10 | from Logger import CSVLogger 11 | import argparse 12 | try: 13 | #from apex.parallel import DistributedDataParallel as DDP 14 | from apex.fp16_utils import * 15 | from apex import amp, optimizers 16 | from apex.multi_tensor_apply import multi_tensor_applier 17 | except ImportError: 18 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 19 | 20 | 21 | def get_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 24 | parser.add_argument('--path', type=str, default='../', help='path of csv file with DNA sequences and labels') 25 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 26 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 27 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 28 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 29 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 30 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 31 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 32 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 33 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 34 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 35 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 36 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 37 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 38 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 39 | 
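# --kmers accepts one or more k-mer sizes to be aggregated by the model;
# run.sh overrides the default [2,3,4,5,6] with a single 13-mer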
parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 40 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 41 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 42 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 43 | parser.set_defaults(kmer_aggregation=True) 44 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 45 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 46 | parser.add_argument('--val_freq', type=int, default=1, help='which fold to train') 47 | parser.add_argument('--num_workers', type=int, default=1, help='num_workers') 48 | opts = parser.parse_args() 49 | return opts 50 | 51 | def train_fold(): 52 | 53 | opts=get_args() 54 | seed_everything(2020) 55 | #gpu selection 56 | os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id 57 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 58 | 59 | train_df=pd.read_csv(os.path.join("..","fullset_train.csv")) 60 | val_df=pd.read_csv(os.path.join("..","fullset_validation.csv")) 61 | 62 | dataset=ViraminerDataset(train_df.iloc[:,1],train_df.iloc[:,2]) 63 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True,num_workers=opts.num_workers) 64 | val_dataset=ViraminerDataset(val_df.iloc[:,1],val_df.iloc[:,2]) 65 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 66 | 67 | #exit() 68 | #lr=0 69 | 70 | #checkpointing 71 | checkpoints_folder='checkpoints_fold{}'.format((opts.fold)) 72 | csv_file='log_fold{}.csv'.format((opts.fold)) 73 | columns=['epoch','train_acc', 74 | 'val_loss','val_auc','val_acc','val_sens','val_spec'] 75 | logger=CSVLogger(columns,csv_file) 76 | 77 | #build model and logger 78 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 79 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 80 | dropout=opts.dropout).to(device) 81 | optimizer=torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay) 82 | criterion=nn.CrossEntropyLoss(reduction='none') 83 | lr_schedule=lr_AIAYN(optimizer,opts.ninp,opts.warmup_steps,opts.lr_scale) 84 | # Initialization 85 | opt_level = 'O1' 86 | model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 87 | model = nn.DataParallel(model) 88 | softmax = nn.Softmax(dim=1) 89 | 90 | pytorch_total_params = sum(p.numel() for p in model.parameters()) 91 | print('Total number of paramters: {}'.format(pytorch_total_params)) 92 | 93 | print("Starting training for fold {}/{}".format(opts.fold,opts.nfolds)) 94 | #training loop 95 | for epoch in range(opts.epochs): 96 | model.train(True) 97 | t=time.time() 98 | total_loss=0 99 | optimizer.zero_grad() 100 | total_steps=len(dataloader) 101 | for step, data in enumerate(dataloader): 102 | #for step in range(1): 103 | lr=lr_schedule.step() 104 | src=data['data'] 105 | labels=data['labels'].to(device) 106 | mutated_sequence=mutate_dna_sequence(src,opts.nmute).to(device) 107 | output=model(mutated_sequence) 108 | loss_weight=torch.ones(len(output),device=device) 109 | loss=torch.mean(criterion(output,labels)) 110 | 111 | 112 | with amp.scale_loss(loss, optimizer) as scaled_loss: 113 | scaled_loss.backward() 114 | torch.nn.utils.clip_grad_norm_(model.parameters(), 1) 115 | optimizer.step() 116 | optimizer.zero_grad() 117 | total_loss+=loss 118 | 
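# progress is reported in place (end='\r') once per step: epoch, step, the
# running mean loss total_loss/(step+1), the current learning rate and elapsed time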
print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}" 119 | .format(epoch+1, opts.epochs, step+1, total_steps, total_loss/(step+1) , lr,time.time()-t),end='\r',flush=True) #total_loss/(step+1) 120 | #break 121 | print('') 122 | 123 | train_loss=total_loss/(step+1) 124 | 125 | if (epoch+1)%opts.val_freq==0: 126 | val_loss,auc,val_acc,val_sens,val_spec=validate(model,device,val_dataloader,batch_size=opts.batch_size*2) 127 | print("Epoch {} train loss: {}".format(epoch+1,train_loss)) 128 | 129 | to_log=[epoch+1,train_loss,val_loss,auc,val_acc,val_sens,val_spec] 130 | logger.log(to_log) 131 | 132 | 133 | if (epoch+1)%opts.save_freq==0: 134 | save_weights(model,optimizer,epoch,checkpoints_folder) 135 | 136 | 137 | get_best_weights_from_fold(opts.fold) 138 | 139 | train_fold() 140 | --------------------------------------------------------------------------------