├── LICENSE ├── README.md ├── environment.yml ├── graphics ├── overview.PNG └── overview.png └── src ├── Ecoli_Promoter_classification ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── evaluate.py ├── evaluate.sh ├── extract_motif.py ├── readme.md ├── run.sh ├── train.py └── v9d3.csv ├── Enchancer_classification ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── bert_enhancer_dataset.csv ├── evaluate.py ├── evaluate.sh ├── job.sh ├── job_eval.sh ├── run.sh ├── test.py ├── test.sh └── train.py ├── Eukaryotic_Promoters_Classification ├── README.md ├── human_non_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── human_non_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── human_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── human_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_non_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_non_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_tata │ ├── Dataset.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── mouse_tata_deepromoter │ ├── Dataset.py │ ├── DeePromoter.py │ ├── Functions.py │ ├── Logger.py │ ├── LrScheduler.py │ ├── Metrics.py │ ├── Network.py │ ├── check_log.py │ ├── evaluate.py │ ├── extract_motif.py │ ├── run.sh │ └── train.py ├── run_human.sh └── run_mouse.sh ├── Non_Coding_Variant_Effects ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── compute_median_aucs.py ├── compute_val_aucs.py ├── preprocess_data.py ├── readme.md ├── restart.py ├── restart.sh ├── run.sh ├── test.py ├── test.sh └── train.py └── Viral_identification ├── Dataset.py ├── Functions.py ├── Logger.py ├── LrScheduler.py ├── Metrics.py ├── Network.py ├── evaluate_test.py ├── readme.md ├── run.sh ├── test.sh └── train.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Shujun-He 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished 
to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nucleic Transformer: Classifying DNA sequences with Self-attention and Convolutions 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5641875.svg)](https://doi.org/10.5281/zenodo.5641875) 4 | 5 | 6 | Source code to reproduce results in the paper "Nucleic Transformer: Classifying DNA sequences with Self-attention and Convolutions". 7 | 8 |

9 | 10 |

11 | 12 | 13 | ## How to use the models 14 | 15 | I also made a web app to use the models. Check it out at https://github.com/Shujun-He/Nucleic-Transformer-WebApp 16 | 17 | 18 | ## Requirements 19 | I included a file (environment.yml) to recreate the exact environment I used. Since I also use this environment for computer vision tasks, it includes some other packages as well. This should take around 10 minutes. After installing Anaconda: 20 | 21 | 22 | ``` 23 | conda env create -f environment.yml 24 | ``` 25 | 26 | Then, to activate the environment: 27 | 28 | ``` 29 | conda activate torch 30 | ``` 31 | 32 | Additionally, you will need Nvidia Apex: https://github.com/NVIDIA/apex 33 | 34 | ``` 35 | git clone https://github.com/NVIDIA/apex 36 | cd apex 37 | pip install . 38 | ``` 39 | 40 | 41 | 42 | ## Repo file structure 43 | 44 | The src folder includes all the code needed to reproduce results in the paper and the OpenVaccine competition. Additional instructions are in each folder. 45 | 46 | ```src/Ecoli_Promoter_classification``` includes all the code and files needed to reproduce results for E. coli promoter classification 47 | 48 | ```src/Eukaryotic_Promoters_Classification``` includes all the code and files needed to reproduce results for eukaryotic promoter classification 49 | 50 | 51 | 52 | ```src/Non_Coding_Variant_Effects``` includes all the code needed to reproduce results for the DeepSEA dataset 53 | 54 | ```src/Viral_identification``` includes all the code needed to reproduce results for the ViraMiner dataset 55 | 56 | ```src/Enchancer_classification``` includes all the code needed to reproduce results for the enhancer dataset 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /graphics/overview.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shujun-He/Nucleic-Transformer/2c020793335417111442684770009bbdf13a885c/graphics/overview.PNG -------------------------------------------------------------------------------- /graphics/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shujun-He/Nucleic-Transformer/2c020793335417111442684770009bbdf13a885c/graphics/overview.png -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class PromoterDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.data=[] 30 | for seq in sequences: 31 | self.data.append(nucleatide2int(seq)) 32 | 33 | self.data=np.asarray(self.data,dtype='int') 34 | self.labels=labels 35 | 36 | print(self.data.shape) 37 | print(self.labels.shape) 38 | 39 | def __len__(self): 40 | return len(self.labels) 41 | 42 | def __getitem__(self,idx): 43 | return
{'data':self.data[idx], 'labels':self.labels[idx]} 44 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import torch 13 | import random 14 | from sklearn.model_selection import StratifiedKFold 15 | 16 | 17 | def iter_split(data,labels,fold,nfolds=5,seed=2020): 18 | splits = StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True) 19 | splits = list(splits.split(data,labels)) 20 | # splits = np.zeros(len(data)).astype(np.int) 21 | # for i in range(nfolds): splits[splits[i][1]] = i 22 | # indices=np.arange(len(data)) 23 | train_indices=splits[fold][0] 24 | val_indices=splits[fold][1] 25 | return train_indices, val_indices 26 | 27 | def seed_everything(seed=42): 28 | random.seed(seed) 29 | os.environ['PYTHONHASHSEED'] = str(seed) 30 | np.random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 33 | torch.backends.cudnn.deterministic = True 34 | seed_everything(seed=42) 35 | 36 | def get_best_weights_from_fold(fold,top=1): 37 | csv_file='log_fold{}.csv'.format(fold) 38 | 39 | history=pd.read_csv(csv_file) 40 | scores=np.asarray(history.val_acc) 41 | top_epochs=scores.argsort()[-3:][::-1] 42 | print(scores[top_epochs]) 43 | os.system('mkdir best_weights') 44 | 45 | for i in range(top): 46 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 47 | print(weights_path) 48 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 49 | os.system('rm -r checkpoints_fold{}'.format(fold)) 50 | 51 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 52 | gold = gold.contiguous().view(-1) 53 | one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1) 54 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 55 | log_prb = F.log_softmax(pred, dim=1) 56 | loss = -(one_hot * log_prb) 57 | #loss=loss.sum(1).mean() 58 | return loss 59 | 60 | def mutate_dna_sequence(sequence,nmute=15): 61 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 62 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 63 | sequence[:,to_mutate]=mutation 64 | return sequence 65 | 66 | def get_MLM_mask(sequence,nmask=12): 67 | mask=np.zeros(sequence.shape,dtype='bool') 68 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 69 | mask[:,to_mask]=True 70 | return mask 71 | 72 | def get_complementary_sequence(sequence): 73 | complementary_sequence=sequence.copy() 74 | complementary_sequence[sequence==0]=1 75 | complementary_sequence[sequence==1]=0 76 | complementary_sequence[sequence==2]=3 77 | complementary_sequence[sequence==3]=2 78 | complementary_sequence=complementary_sequence[:,::-1] 79 | return complementary_sequence 80 | 81 | def update_lr(optimizer, lr): 82 | for param_group in optimizer.param_groups: 83 | param_group['lr'] = lr 84 | 85 | def save_weights(model,optimizer,epoch,folder): 86 | if os.path.isdir(folder)==False: 87 | os.makedirs(folder,exist_ok=True) 88 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 89 | 90 | def get_lr(optimizer): 91 | for param_group in optimizer.param_groups: 92 | lr=param_group['lr'] 93 | 
return lr 94 | 95 | def validate(model,device,dataset,batch_size=64): 96 | batches=len(dataset) 97 | model.train(False) 98 | total=0 99 | ground_truths=[] 100 | predictions=[] 101 | loss=0 102 | criterion=nn.CrossEntropyLoss() 103 | # dataset.switch_mode(training=False) 104 | # dataset.update_batchsize(batch_size) 105 | with torch.no_grad(): 106 | for data in tqdm(dataset): 107 | #data=dataset[i] 108 | X=data['data'].to(device).long() 109 | Y=data['labels'].to(device).long() 110 | output= model(X,None) 111 | del X 112 | loss+=criterion(output,Y) 113 | classification_predictions = torch.argmax(output,dim=1).squeeze() 114 | for pred in classification_predictions: 115 | predictions.append(pred.cpu().numpy()) 116 | for truth in Y: 117 | ground_truths.append(truth.cpu().numpy()) 118 | del output 119 | ground_truths=np.asarray(ground_truths) 120 | torch.cuda.empty_cache() 121 | val_loss=(loss/batches).cpu() 122 | predictions=np.asarray(predictions) 123 | binary_predictions=predictions.copy() 124 | binary_predictions[binary_predictions==2]=1 125 | binary_ground_truths=ground_truths.copy() 126 | binary_ground_truths[binary_ground_truths==2]=1 127 | #print(predictions) 128 | #print(ground_truths) 129 | #score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 130 | val_acc=Metrics.accuracy(predictions,ground_truths) 131 | val_sens=Metrics.sensitivity(predictions,ground_truths) 132 | val_spec=Metrics.specificity(predictions,ground_truths) 133 | binary_acc=np.sum(binary_predictions==binary_ground_truths)/len(binary_ground_truths) 134 | print('Accuracy: {}, Binary Accuracy: {} Val Loss: {}'.format(val_acc,binary_acc,val_loss)) 135 | return val_loss,val_acc,val_sens,val_spec 136 | 137 | 138 | def predict(model,device,dataset,batch_size=64): 139 | batches=len(dataset) 140 | model.train(False) 141 | total=0 142 | ground_truths=[] 143 | predictions=[] 144 | attention_weights=[] 145 | sequences=[] 146 | loss=0 147 | criterion=nn.CrossEntropyLoss() 148 | with torch.no_grad(): 149 | for data in tqdm(dataset): 150 | #data=dataset[i] 151 | X=data['data'].to(device,).long() 152 | Y=data['labels'].to(device,dtype=torch.int64) 153 | 154 | output,aw= model(X,None) 155 | #del X 156 | loss+=criterion(output,Y) 157 | classification_predictions = torch.argmax(output,dim=1).squeeze() 158 | for pred in output: 159 | predictions.append(pred.cpu().numpy()) 160 | for weight in aw: 161 | attention_weights.append(weight.cpu().numpy()) 162 | 163 | for t in Y: 164 | ground_truths.append(t.cpu().numpy()) 165 | for seq in X: 166 | sequences.append(seq.cpu().numpy()) 167 | del output 168 | torch.cuda.empty_cache() 169 | val_loss=(loss/batches).cpu() 170 | predictions=np.asarray(predictions) 171 | attention_weights=np.asarray(attention_weights) 172 | binary_predictions=predictions.copy() 173 | binary_predictions[binary_predictions==2]=1 174 | binary_ground_truths=ground_truths.copy() 175 | binary_ground_truths[binary_ground_truths==2]=1 176 | return predictions,attention_weights,np.asarray(sequences),np.asarray(ground_truths) 177 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 
15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | 
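A minimal usage sketch (not part of the repo) showing how the Metrics helpers above are called on 0/1 label arrays. The toy arrays are made up purely for illustration, and note that the MCC reported by evaluate.py below actually comes from sklearn.metrics.matthews_corrcoef rather than the local MCC helper:

```python
import numpy as np
from sklearn.metrics import matthews_corrcoef

import Metrics  # the module above; run from src/Ecoli_Promoter_classification

# made-up toy labels for illustration: 0 = negative, 1 = positive
preds  = np.array([1, 0, 1, 1, 0, 0])
truths = np.array([1, 0, 0, 1, 0, 1])

print("accuracy   :", Metrics.accuracy(preds, truths))
print("sensitivity:", Metrics.sensitivity(preds, truths))
print("specificity:", Metrics.specificity(preds, truths))
# evaluate.py reports MCC via scikit-learn:
print("MCC (sklearn):", matthews_corrcoef(truths, preds))
```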
-------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv('v9d3.csv') 
59 | 60 | sequences=np.asarray(df.seqs) 61 | labels=np.asarray(df.labels) 62 | 63 | train_indices, val_indices=iter_split(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[val_indices],labels[val_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 76 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 77 | dropout=opts.dropout,return_aw=True).to(device) 78 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 79 | 80 | # Initialization 81 | # opt_level = 'O1' 82 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 83 | 84 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 85 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 86 | 87 | #evaluation loop 88 | #ground_truths=dataset.labels[dataset.val_indices] 89 | ensemble_predictions=[] 90 | acc=[] 91 | 92 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 93 | print(weights_path) 94 | checkpoint=torch.load(weights_path) 95 | model.load_state_dict(checkpoint) 96 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 97 | # #validate(model,device,dataset,batch_size=batch_size*2) 98 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 99 | ensemble_predictions.append(predictions) 100 | ensemble_predictions=np.asarray(ensemble_predictions) 101 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 102 | model.cpu() 103 | del model 104 | #del optimizer 105 | torch.cuda.empty_cache() 106 | return ensemble_predictions, ground_truths, attention_weights, sequences 107 | 108 | opts=get_args() 109 | 110 | 111 | predictions=[] 112 | ground_truths=[] 113 | attention_weights=[] 114 | sequences=[] 115 | for i in range(5): 116 | ngram=[7] 117 | p,t,at,seq= evaluate_fold(i) 118 | predictions.append(p) 119 | ground_truths.append(t) 120 | print(at.shape) 121 | attention_weights.append(at) 122 | sequences.append(seq) 123 | 124 | 125 | probs=np.concatenate(predictions) 126 | ground_truths=np.concatenate(ground_truths) 127 | predictions=np.argmax(probs,axis=1) 128 | attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 129 | sequences=np.asarray(sequences).reshape(-1,81) 130 | acc=Metrics.accuracy(predictions,ground_truths) 131 | sens=Metrics.sensitivity(predictions,ground_truths) 132 | spec=Metrics.specificity(predictions,ground_truths) 133 | MCC=matthews_corrcoef(ground_truths,predictions) 134 | 135 | prediction_dict={'predictions':np.squeeze(predictions), 136 | 'ground_truths':np.squeeze(ground_truths), 137 | 'attention_weights':np.squeeze(attention_weights), 138 | 'sequences':np.squeeze(sequences.reshape(-1,81)) 139 | } 140 | 141 | with open("prediction_dict.p","wb+") as f: 142 | pickle.dump(prediction_dict,f) 143 | 144 | 145 | with open("cv.txt",'w+') as f: 146 | f.write(f"ACC: {acc}\n") 147 | f.write(f"sensitivity: {sens}\n") 148 | f.write(f"spec: {spec}\n") 149 | f.write(f"MCC: {MCC}\n") 150 | 
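evaluate.py above writes the per-fold outputs to prediction_dict.p (later consumed by extract_motif.py) and the cross-validation summary to cv.txt. A minimal sketch, not part of the repo, for inspecting the pickle before running motif extraction; the length-81 check simply mirrors the reshape(-1,81) used above:

```python
import pickle
import numpy as np

# load the dictionary written by evaluate.py
with open("prediction_dict.p", "rb") as f:
    prediction_dict = pickle.load(f)

# keys written above: predictions, ground_truths, attention_weights, sequences
for key, value in prediction_dict.items():
    arr = np.asarray(value)
    print(key, arr.shape, arr.dtype)

# the integer-encoded E. coli promoter sequences are stored as rows of length 81
assert prediction_dict["sequences"].shape[-1] == 81
```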
-------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | python -i evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d3.csv --kmers 7 --ninp 256 --nhid 1024 2 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | 
#plt.show() 112 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/readme.md: -------------------------------------------------------------------------------- 1 | # Source code to train nucleic transformer to reproduce results in the paper 2 | 3 | Dataset is included here in v9d3.csv 4 | 5 | To run: 6 | ```bash run.sh``` 7 | 8 | To get cross validation results: 9 | ```bash evaluate.sh``` 10 | 11 | Results will be in cv.txt 12 | 13 | To extract top promoter motifs based on attention weights: 14 | ```python extract_motif.py``` 15 | 16 | An eps file named promoter_motifs.eps will be generated 17 | -------------------------------------------------------------------------------- /src/Ecoli_Promoter_classification/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d3.csv --kmers 7 --ninp 256 --nhid 1024 4 | done 5 | -------------------------------------------------------------------------------- /src/Enchancer_classification/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class PromoterDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.data=[] 30 | for seq in sequences: 31 | self.data.append(nucleatide2int(seq)) 32 | 33 | self.data=np.asarray(self.data,dtype='int') 34 | self.labels=labels 35 | 36 | print(self.data.shape) 37 | print(self.labels.shape) 38 | 39 | def __len__(self): 40 | return len(self.labels) 41 | 42 | def __getitem__(self,idx): 43 | return {'data':self.data[idx], 'labels':self.labels[idx]} 44 | -------------------------------------------------------------------------------- /src/Enchancer_classification/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import torch 13 | import random 14 | from sklearn.model_selection import StratifiedKFold 15 | 16 | 17 | def iter_split(data,labels,fold,nfolds=5,seed=2020): 18 | splits = StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True) 19 | splits = list(splits.split(data,labels)) 20 | # splits = np.zeros(len(data)).astype(np.int) 21 | # for i in range(nfolds): splits[splits[i][1]] = i 22 | # indices=np.arange(len(data)) 23 | train_indices=splits[fold][0] 24 | val_indices=splits[fold][1] 25 | return train_indices, val_indices 26 | 27 | def seed_everything(seed=42): 28 | random.seed(seed) 29 | os.environ['PYTHONHASHSEED'] = str(seed) 30 | np.random.seed(seed) 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed(seed) 
33 | torch.backends.cudnn.deterministic = True 34 | seed_everything(seed=42) 35 | 36 | def get_best_weights_from_fold(fold,top=1): 37 | csv_file='log_fold{}.csv'.format(fold) 38 | 39 | history=pd.read_csv(csv_file) 40 | scores=np.asarray(history.val_acc) 41 | top_epochs=scores.argsort()[-3:][::-1] 42 | print(scores[top_epochs]) 43 | os.system('mkdir best_weights') 44 | 45 | for i in range(top): 46 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 47 | print(weights_path) 48 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 49 | os.system('rm -r checkpoints_fold{}'.format(fold)) 50 | 51 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 52 | gold = gold.contiguous().view(-1) 53 | one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1) 54 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 55 | log_prb = F.log_softmax(pred, dim=1) 56 | loss = -(one_hot * log_prb) 57 | #loss=loss.sum(1).mean() 58 | return loss 59 | 60 | def mutate_dna_sequence(sequence,nmute=15): 61 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 62 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 63 | sequence[:,to_mutate]=mutation 64 | return sequence 65 | 66 | def get_MLM_mask(sequence,nmask=12): 67 | mask=np.zeros(sequence.shape,dtype='bool') 68 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 69 | mask[:,to_mask]=True 70 | return mask 71 | 72 | def get_complementary_sequence(sequence): 73 | complementary_sequence=sequence.copy() 74 | complementary_sequence[sequence==0]=1 75 | complementary_sequence[sequence==1]=0 76 | complementary_sequence[sequence==2]=3 77 | complementary_sequence[sequence==3]=2 78 | complementary_sequence=complementary_sequence[:,::-1] 79 | return complementary_sequence 80 | 81 | def update_lr(optimizer, lr): 82 | for param_group in optimizer.param_groups: 83 | param_group['lr'] = lr 84 | 85 | def save_weights(model,optimizer,epoch,folder): 86 | if os.path.isdir(folder)==False: 87 | os.makedirs(folder,exist_ok=True) 88 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 89 | 90 | def get_lr(optimizer): 91 | for param_group in optimizer.param_groups: 92 | lr=param_group['lr'] 93 | return lr 94 | 95 | def validate(model,device,dataset,batch_size=64): 96 | batches=len(dataset) 97 | model.train(False) 98 | total=0 99 | ground_truths=[] 100 | predictions=[] 101 | loss=0 102 | criterion=nn.CrossEntropyLoss() 103 | # dataset.switch_mode(training=False) 104 | # dataset.update_batchsize(batch_size) 105 | with torch.no_grad(): 106 | for data in tqdm(dataset): 107 | #data=dataset[i] 108 | X=data['data'].to(device).long() 109 | Y=data['labels'].to(device).long() 110 | output= model(X,None) 111 | del X 112 | loss+=criterion(output,Y) 113 | classification_predictions = torch.argmax(output,dim=1).squeeze() 114 | for pred in classification_predictions: 115 | predictions.append(pred.cpu().numpy()) 116 | for truth in Y: 117 | ground_truths.append(truth.cpu().numpy()) 118 | del output 119 | ground_truths=np.asarray(ground_truths) 120 | torch.cuda.empty_cache() 121 | val_loss=(loss/batches).cpu() 122 | predictions=np.asarray(predictions) 123 | binary_predictions=predictions.copy() 124 | binary_predictions[binary_predictions==2]=1 125 | binary_ground_truths=ground_truths.copy() 126 | binary_ground_truths[binary_ground_truths==2]=1 127 | #print(predictions) 128 | #print(ground_truths) 129 | 
#score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 130 | val_acc=Metrics.accuracy(predictions,ground_truths) 131 | val_sens=Metrics.sensitivity(predictions,ground_truths) 132 | val_spec=Metrics.specificity(predictions,ground_truths) 133 | binary_acc=np.sum(binary_predictions==binary_ground_truths)/len(binary_ground_truths) 134 | print('Accuracy: {}, Binary Accuracy: {} Val Loss: {}'.format(val_acc,binary_acc,val_loss)) 135 | return val_loss,val_acc,val_sens,val_spec 136 | 137 | 138 | def predict(model,device,dataset,batch_size=64): 139 | batches=len(dataset) 140 | model.train(False) 141 | total=0 142 | ground_truths=[] 143 | predictions=[] 144 | attention_weights=[] 145 | sequences=[] 146 | loss=0 147 | criterion=nn.CrossEntropyLoss() 148 | with torch.no_grad(): 149 | for data in tqdm(dataset): 150 | #data=dataset[i] 151 | X=data['data'].to(device,).long() 152 | Y=data['labels'].to(device,dtype=torch.int64) 153 | 154 | output,aw= model(X,None) 155 | #del X 156 | loss+=criterion(output,Y) 157 | classification_predictions = torch.argmax(output,dim=1).squeeze() 158 | for pred in output: 159 | predictions.append(pred.cpu().numpy()) 160 | for weight in aw: 161 | attention_weights.append(weight.cpu().numpy()) 162 | 163 | for t in Y: 164 | ground_truths.append(t.cpu().numpy()) 165 | for seq in X: 166 | sequences.append(seq.cpu().numpy()) 167 | del output 168 | torch.cuda.empty_cache() 169 | val_loss=(loss/batches).cpu() 170 | predictions=np.asarray(predictions) 171 | attention_weights=np.asarray(attention_weights) 172 | binary_predictions=predictions.copy() 173 | binary_predictions[binary_predictions==2]=1 174 | binary_ground_truths=ground_truths.copy() 175 | binary_ground_truths[binary_ground_truths==2]=1 176 | return predictions,attention_weights,np.asarray(sequences),np.asarray(ground_truths) 177 | -------------------------------------------------------------------------------- /src/Enchancer_classification/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Enchancer_classification/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 
| Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Enchancer_classification/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Enchancer_classification/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path 
of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices=iter_split(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[val_indices],labels[val_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 76 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 77 | dropout=opts.dropout,return_aw=True).to(device) 78 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 79 | 80 | # Initialization 81 | # opt_level = 'O1' 82 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 83 | 84 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 85 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 86 | 87 | #evaluation loop 88 | #ground_truths=dataset.labels[dataset.val_indices] 89 
| ensemble_predictions=[] 90 | acc=[] 91 | 92 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 93 | print(weights_path) 94 | checkpoint=torch.load(weights_path) 95 | model.load_state_dict(checkpoint) 96 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 97 | # #validate(model,device,dataset,batch_size=batch_size*2) 98 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 99 | ensemble_predictions.append(predictions) 100 | ensemble_predictions=np.asarray(ensemble_predictions) 101 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 102 | model.cpu() 103 | del model 104 | #del optimizer 105 | torch.cuda.empty_cache() 106 | return ensemble_predictions, ground_truths, attention_weights, sequences 107 | 108 | opts=get_args() 109 | 110 | 111 | predictions=[] 112 | ground_truths=[] 113 | attention_weights=[] 114 | sequences=[] 115 | for i in range(5): 116 | ngram=[7] 117 | p,t,at,seq= evaluate_fold(i) 118 | predictions.append(p) 119 | ground_truths.append(t) 120 | #print(at.shape) 121 | #attention_weights.append(at) 122 | #print(seq.shape) 123 | #sequences.append(seq) 124 | 125 | 126 | probs=np.concatenate(predictions) 127 | ground_truths=np.concatenate(ground_truths) 128 | predictions=np.argmax(probs,axis=1) 129 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 130 | #sequences=np.asarray(sequences).reshape(-1,81) 131 | acc=Metrics.accuracy(predictions,ground_truths) 132 | sens=Metrics.sensitivity(predictions,ground_truths) 133 | spec=Metrics.specificity(predictions,ground_truths) 134 | MCC=matthews_corrcoef(ground_truths,predictions) 135 | 136 | # prediction_dict={'predictions':np.squeeze(predictions), 137 | # 'ground_truths':np.squeeze(ground_truths), 138 | # 'attention_weights':np.squeeze(attention_weights), 139 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 140 | # } 141 | 142 | # with open("prediction_dict.p","wb+") as f: 143 | # pickle.dump(prediction_dict,f) 144 | 145 | 146 | with open("cv.txt",'w+') as f: 147 | f.write(f"ACC: {acc}\n") 148 | f.write(f"sensitivity: {sens}\n") 149 | f.write(f"spec: {spec}\n") 150 | f.write(f"MCC: {MCC}\n") 151 | -------------------------------------------------------------------------------- /src/Enchancer_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | python -i evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 \ 2 | --path ../bert_enhancer_dataset.csv --kmers 7 --ninp 256 --nhid 1024 3 | -------------------------------------------------------------------------------- /src/Enchancer_classification/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##NECESSARY JOB SPECIFICATIONS 4 | #SBATCH --job-name=JobExample5 #Set the job name to "JobExample4" 5 | #SBATCH --time=02:30:00 #Set the wall clock limit to 1hr and 30min 6 | #SBATCH --ntasks=1 #Request 1 task 7 | #SBATCH --mem=5120M #Request 2560MB (2.5GB) per node 8 | #SBATCH --output=out #Send stdout/err to "Example4Out.[jobID]" 9 | #SBATCH --gres=gpu:rtx:1 #Request 1 "rtx" GPU per node 10 | #SBATCH --partition=gpu #Request the GPU partition/queue 11 | 12 | 13 | ##OPTIONAL JOB SPECIFICATIONS 14 | #SBATCH --account=132825315633 15 | #SBATCH --mail-type=ALL #Send email on all job events 16 | #SBATCH --mail-user=shujun@tamu.edu #Send all emails to email_address 17 | 18 | #First Executable Line 19 | 
#cd $SCRATCH 20 | cd /scratch/user/shujun/Nucleic-Transformer/src/promoter_classification_v9d4 21 | #module load Anaconda3 22 | #source /scratch/user/shujun/.conda/envs/torch/bin/activate 23 | #./run.sh 24 | 25 | for i in {0..4};do 26 | /scratch/user/shujun/.conda/envs/torch/bin/python train.py --fold $i --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d4.csv --kmers 7 --ninp 256 --nhid 1024 27 | done 28 | -------------------------------------------------------------------------------- /src/Enchancer_classification/job_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##NECESSARY JOB SPECIFICATIONS 4 | #SBATCH --job-name=JobExample5 #Set the job name to "JobExample4" 5 | #SBATCH --time=00:10:00 #Set the wall clock limit to 1hr and 30min 6 | #SBATCH --ntasks=1 #Request 1 task 7 | #SBATCH --mem=5120M #Request 2560MB (2.5GB) per node 8 | #SBATCH --output=out #Send stdout/err to "Example4Out.[jobID]" 9 | #SBATCH --gres=gpu:rtx:1 #Request 1 "rtx" GPU per node 10 | #SBATCH --partition=gpu #Request the GPU partition/queue 11 | 12 | 13 | ##OPTIONAL JOB SPECIFICATIONS 14 | #SBATCH --account=132825315633 15 | #SBATCH --mail-type=ALL #Send email on all job events 16 | #SBATCH --mail-user=shujun@tamu.edu #Send all emails to email_address 17 | 18 | #First Executable Line 19 | #cd $SCRATCH 20 | cd /scratch/user/shujun/Nucleic-Transformer/src/promoter_classification_v9d4 21 | #module load Anaconda3 22 | #source /scratch/user/shujun/.conda/envs/torch/bin/activate 23 | #./run.sh 24 | 25 | /scratch/user/shujun/.conda/envs/torch/bin/python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 --path v9d4.csv --kmers 7 --ninp 256 --nhid 1024 26 | -------------------------------------------------------------------------------- /src/Enchancer_classification/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --kmer_aggregation --epochs 50 --nlayers 6 --nmute 15 \ 4 | --path bert_enhancer_dataset.csv \ 5 | --kmers 7 --ninp 256 --nhid 1024 6 | done 7 | -------------------------------------------------------------------------------- /src/Enchancer_classification/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, 
help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices=iter_split(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # exit() 67 | test_dataset=PromoterDataset(sequences,labels) 68 | test_dataloader=torch.utils.data.DataLoader(test_dataset,batch_size=opts.batch_size*2,shuffle=False) 69 | 70 | 71 | 72 | #init model 73 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 74 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 75 | dropout=opts.dropout,return_aw=True).to(device) 76 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 77 | 78 | # Initialization 79 | # opt_level = 'O1' 80 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 81 | 82 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 83 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 84 | 85 | #evaluation loop 86 | #ground_truths=dataset.labels[dataset.val_indices] 87 | ensemble_predictions=[] 88 | acc=[] 89 | 90 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 91 | print(weights_path) 92 | checkpoint=torch.load(weights_path) 93 | model.load_state_dict(checkpoint) 94 | 
predictions,attention_weights,sequences,ground_truths=predict(model,device,test_dataloader) 95 | # #validate(model,device,dataset,batch_size=batch_size*2) 96 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 97 | ensemble_predictions.append(predictions) 98 | ensemble_predictions=np.asarray(ensemble_predictions) 99 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 100 | model.cpu() 101 | del model 102 | #del optimizer 103 | torch.cuda.empty_cache() 104 | return ensemble_predictions, ground_truths, attention_weights, sequences 105 | 106 | opts=get_args() 107 | 108 | 109 | predictions=[] 110 | ground_truths=[] 111 | attention_weights=[] 112 | sequences=[] 113 | for i in range(5): 114 | ngram=[7] 115 | p,t,at,seq= evaluate_fold(i) 116 | predictions.append(p) 117 | ground_truths.append(t) 118 | #print(at.shape) 119 | #attention_weights.append(at) 120 | #print(seq.shape) 121 | #sequences.append(seq) 122 | 123 | 124 | probs=np.stack(predictions,0).mean(0) 125 | ground_truths=np.stack(ground_truths,0).mean(0) 126 | predictions=np.argmax(probs,axis=1) 127 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 128 | #sequences=np.asarray(sequences).reshape(-1,81) 129 | acc=Metrics.accuracy(predictions,ground_truths) 130 | sens=Metrics.sensitivity(predictions,ground_truths) 131 | spec=Metrics.specificity(predictions,ground_truths) 132 | MCC=matthews_corrcoef(ground_truths,predictions) 133 | 134 | # prediction_dict={'predictions':np.squeeze(predictions), 135 | # 'ground_truths':np.squeeze(ground_truths), 136 | # 'attention_weights':np.squeeze(attention_weights), 137 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 138 | # } 139 | 140 | # with open("prediction_dict.p","wb+") as f: 141 | # pickle.dump(prediction_dict,f) 142 | 143 | 144 | with open("test_results.txt",'w+') as f: 145 | f.write(f"ACC: {acc}\n") 146 | f.write(f"sensitivity: {sens}\n") 147 | f.write(f"spec: {spec}\n") 148 | f.write(f"MCC: {MCC}\n") 149 | -------------------------------------------------------------------------------- /src/Enchancer_classification/test.sh: -------------------------------------------------------------------------------- 1 | python -i test.py --gpu_id 0 --kmer_aggregation --epochs 150 --nlayers 6 --nmute 15 \ 2 | --path ../bert_enhancer_test_dataset.csv --kmers 7 --ninp 256 --nhid 1024 3 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/README.md: -------------------------------------------------------------------------------- 1 | # Nucleic_Transformer_Eukaryotic_Promoters 2 | 3 | To run: 4 | 1. download datasets from release 5 | 2. create a new folder 'data' and put the csv files in said folder 6 | 3. 
in each folder there's a ```run.sh``` to run training for that specific dataset 7 | 8 | Folders with '''deepromoter''' suffix are code to run deepromoter training with the same hyperparameters and architecture in the deepromoter paper on the dataset described by the folder name 9 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def 
__init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | 
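# --- added note: prediction_dict is expected to be the pickle written by the evaluation
# script, with keys 'predictions', 'ground_truths', 'attention_weights' and 'sequences'
# (integer-encoded inputs); only those four keys are accessed below.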
df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/human_non_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 --lr_scale 0.2 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if 
target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, 
pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | 
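# --- added note: when the scheduler is constructed with optimizer=None, the guard above
# skips the parameter-group update, so step() can be called in a loop purely to inspect
# or plot the cosine schedule; the computed lr is returned either way.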
return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # 
plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_non_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path ../../data/human_non_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/Logger.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | 
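# --- added clarification: in confusion-matrix terms N1 = false negatives, N2 = false
# positives, N3 = all positives (TP+FN) and N4 = all negatives (TN+FP), so sens and spec
# below are the true-positive and true-negative rates. As written, the numerator uses
# (1-sens-spec) rather than (sens+spec-1) and therefore returns the negative of MCC;
# the evaluation scripts shown elsewhere in this repository compute MCC with
# sklearn.metrics.matthews_corrcoef instead.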
N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by 
group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/human_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 32 --lr_scale 0.1 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | 
self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row 
vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import 
torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | from DeePromoter import * 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices, test_indices=iter_split_strict(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # 
exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[test_indices],labels[test_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=DeePromoter([27, 14, 7]).to(device).to(device) 76 | model=nn.DataParallel(model) 77 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 78 | 79 | # Initialization 80 | # opt_level = 'O1' 81 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 82 | 83 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 84 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 85 | 86 | #evaluation loop 87 | #ground_truths=dataset.labels[dataset.val_indices] 88 | ensemble_predictions=[] 89 | acc=[] 90 | 91 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 92 | print(weights_path) 93 | checkpoint=torch.load(weights_path) 94 | model.load_state_dict(checkpoint) 95 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 96 | # #validate(model,device,dataset,batch_size=batch_size*2) 97 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 98 | ensemble_predictions.append(predictions) 99 | ensemble_predictions=np.asarray(ensemble_predictions) 100 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 101 | model.cpu() 102 | del model 103 | #del optimizer 104 | torch.cuda.empty_cache() 105 | return ensemble_predictions, ground_truths, attention_weights, sequences 106 | 107 | opts=get_args() 108 | 109 | 110 | predictions=[] 111 | ground_truths=[] 112 | #attention_weights=[] 113 | sequences=[] 114 | for i in range(5): 115 | ngram=[7] 116 | p,t,at,seq= evaluate_fold(i) 117 | predictions.append(p) 118 | ground_truths.append(t) 119 | #print(at.shape) 120 | #attention_weights.append(at) 121 | sequences.append(seq) 122 | 123 | 124 | probs=np.concatenate(predictions) 125 | ground_truths=np.concatenate(ground_truths) 126 | predictions=np.argmax(probs,axis=1) 127 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 128 | #sequences=np.asarray(sequences).reshape(-1,81) 129 | acc=Metrics.accuracy(predictions,ground_truths) 130 | sens=Metrics.sensitivity(predictions,ground_truths) 131 | spec=Metrics.specificity(predictions,ground_truths) 132 | MCC=matthews_corrcoef(ground_truths,predictions) 133 | precision=precision_score(ground_truths,predictions) 134 | recall=recall_score(ground_truths,predictions) 135 | f1=f1_score(ground_truths,predictions) 136 | # prediction_dict={'predictions':np.squeeze(predictions), 137 | # 'ground_truths':np.squeeze(ground_truths), 138 | # 'attention_weights':np.squeeze(attention_weights), 139 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 140 | # } 141 | 142 | # with open("prediction_dict.p","wb+") as f: 143 | # pickle.dump(prediction_dict,f) 144 | 145 | 146 | with open("cv.txt",'w+') as f: 147 | f.write(f"ACC: {acc}\n") 148 | f.write(f"sensitivity: {sens}\n") 149 | f.write(f"spec: {spec}\n") 150 | f.write(f"precision: {precision}\n") 151 | f.write(f"recall: {recall}\n") 152 | f.write(f"f1: {f1}\n") 153 | f.write(f"MCC: {MCC}\n") 154 | -------------------------------------------------------------------------------- 
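The evaluate.py listing above loads the best checkpoint of each of the five folds, predicts on that fold's held-out test split, applies a softmax to the logits, concatenates the folds, and writes accuracy, sensitivity, specificity, precision, recall, F1 and MCC to ```cv.txt```. Below is a minimal, self-contained sketch of just that aggregation and scoring step; it is not part of the repository. ```fold_logits``` and ```fold_labels``` are hypothetical stand-ins for what ```predict()``` from Functions.py is assumed to return, and the sklearn metrics are imported explicitly here, whereas evaluate.py appears to rely on its wildcard imports to bring them into scope.

```python
import numpy as np
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, f1_score

def softmax(logits):
    # numerically stable row-wise softmax
    z = logits - logits.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

np.random.seed(0)
# stand-ins for the per-fold outputs of predict(): 5 folds, 100 test sequences, 2 classes
fold_logits = [np.random.randn(100, 2) for _ in range(5)]
fold_labels = [np.random.randint(0, 2, 100) for _ in range(5)]

probs = np.concatenate([softmax(l) for l in fold_logits])  # stack the 5 held-out folds
labels = np.concatenate(fold_labels)
preds = probs.argmax(axis=1)                               # hard class predictions

print("ACC", (preds == labels).mean())
print("precision", precision_score(labels, preds))
print("recall", recall_score(labels, preds))
print("f1", f1_score(labels, preds))
print("MCC", matthews_corrcoef(labels, preds))
```

With real data, the five arrays would come from the five fold-specific checkpoints evaluated on their respective test splits, so every sequence is scored exactly once before the metrics are computed.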
/src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/human_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path 
../../data/human_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/human_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def 
__init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | 
df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/mouse_non_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 --lr_scale 0.2 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if 
target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, 
pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | 
return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # 
plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_non_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path ../../data/mouse_non_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_non_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | return {'data':self.data[idx], 'labels':self.labels[idx]} 49 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/Logger.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | 
N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by 
group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 1 --kmer_aggregation --epochs 150 \ 4 | --nlayers 6 --nmute 45 --path ../../data/mouse_tata_dataset.csv --kmers 11 --ninp 256 --nhid 1024 \ 5 | --batch_size 32 --lr_scale 0.1 6 | done 7 | 8 | python evaluate.py --gpu_id 1 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 11 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | 28 | 29 | 30 | class PromoterDataset(torch.utils.data.Dataset): 31 | def __init__(self,sequences,labels): 32 | self.data=[] 33 | for seq in sequences: 34 | self.data.append(nucleatide2int(seq)) 35 | #print(self.data[-1].shape) 36 | #exit() 37 | #self.data=np.array(self.data,dtype='int') 38 | #exit() 39 | self.labels=labels 40 | 41 | print(len(self.data)) 42 | print(self.labels.shape) 43 | 44 | def __len__(self): 45 | return len(self.labels) 46 | 47 | def __getitem__(self,idx): 48 | 49 | sequence=torch.tensor(self.data[idx]).long() 50 | sequence=torch.nn.functional.one_hot(sequence,num_classes=4).float() 51 | 52 | return {'data':sequence, 'labels':self.labels[idx]} 53 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/DeePromoter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | from torch.nn.utils.rnn import pad_sequence 6 | from torch.utils.data import Dataset, DataLoader, random_split 7 | from torchvision import transforms, utils 8 | 9 | 10 | class ParallelCNN(nn.Module): 11 | def __init__(self, para_ker, pool_kernel=6, drop=0.5): 12 | """ 13 | Multiple CNN layer apply on input and concatenate the output 14 | :param para_ker: List of kernel size that will be used 15 | :param pool_kernel: Pooling parameter after CNN 16 | :param drop: Dropout parameter 17 | """ 18 | super(ParallelCNN, self).__init__() 19 | self.lseq = nn.ModuleList() 20 | for k in para_ker: 21 | seq = nn.Sequential( 22 | nn.Conv1d(4, 4, kernel_size=k, padding="same"), 23 | nn.ReLU(), 24 | nn.MaxPool1d(pool_kernel), 25 | nn.Dropout(drop) 26 | ) 27 | 
self.lseq.append(seq) 28 | 29 | def forward(self, inputs): 30 | """ 31 | :param inputs: DNA onehot sequences [batch_size x 4 x length] 32 | :return: Stack CNN output feature from different kernel size [batch_size x 12 x length] 33 | """ 34 | _x = list() 35 | for seq in self.lseq: 36 | x = seq(inputs) 37 | _x.append(x) 38 | # concate outputs of every conv layer to a tensor 39 | _x = torch.cat(_x, 1) 40 | return _x 41 | 42 | 43 | class BidirectionalLSTM(nn.Module): 44 | def __init__(self, input_size, hidden_size, output_size): 45 | super(BidirectionalLSTM, self).__init__() 46 | self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True) 47 | self.linear = nn.Linear(hidden_size * 2, output_size) 48 | 49 | def forward(self, inputs): 50 | """ 51 | :param inputs: visual feature [batch_size x T x input_size] 52 | :return: contextual feature [batch_size x T x output_size] 53 | """ 54 | 55 | self.rnn.flatten_parameters() 56 | recurrent, _ = self.rnn(inputs) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) 57 | output = self.linear(recurrent) # batch_size x T x output_size 58 | return output 59 | 60 | 61 | class DeePromoter(nn.Module): 62 | def __init__(self, para_ker, input_shape=(64, 300, 4), pool_kernel=6, drop=0.5): 63 | """ 64 | Deepromoter 65 | :param para_ker: List of kernel size that will be used 66 | :param input_shape: Specifies the input shape for model(fixed) 67 | :param pool_kernel: Pooling parameter after CNN 68 | :param drop: Dropout parameter 69 | """ 70 | super(DeePromoter, self).__init__() 71 | binode = len(para_ker) * 4 72 | 73 | self.pconv = ParallelCNN(para_ker, pool_kernel, drop) 74 | self.bilstm = BidirectionalLSTM(binode, binode, binode) 75 | self.flatten = nn.Flatten() 76 | x = torch.zeros(input_shape) 77 | shape = self.get_feature_shape(x) 78 | 79 | self.fc = nn.Sequential( 80 | nn.Linear(shape, shape), 81 | nn.ReLU(), 82 | nn.Linear(shape, 2), 83 | ) 84 | 85 | def get_feature_shape(self, x): 86 | """Pass a dummy input through to find the shape 87 | after flatten layer for Linear layer construction""" 88 | x = x.permute(0, 2, 1) 89 | x = self.pconv(x) 90 | x = x.permute(0, 2, 1) 91 | x = self.bilstm(x) 92 | x = self.flatten(x) 93 | return x.shape[1] 94 | 95 | def forward(self, x): 96 | x = x.permute(0, 2, 1) 97 | x = self.pconv(x) 98 | x = x.permute(0, 2, 1) 99 | x = self.bilstm(x) 100 | x = self.flatten(x) 101 | x = self.fc(x) 102 | return x 103 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | # with open(self.file, 'r') as csvfile: 16 | # sniffer = csv.Sniffer() 17 | # has_header = sniffer.has_header(csvfile.read(2048)) 18 | # header=csvfile.seek(0) 19 | header=True 20 | else: 21 | header=False 22 | return header 23 | 24 | 25 | def _write_header(self): 26 | with open(self.file,"a") as f: 27 | string="" 28 | for attrib in self.columns: 29 | string+="{},".format(attrib) 30 | string=string[:len(string)-1] 31 | string+="\n" 32 | f.write(string) 33 | return self 34 | 35 | def log(self,row): 36 | if len(row)!=len(self.columns): 37 | raise Exception("Mismatch between row 
vector and number of columns in logger") 38 | with open(self.file,"a") as f: 39 | string="" 40 | for attrib in row: 41 | string+="{},".format(attrib) 42 | string=string[:len(string)-1] 43 | string+="\n" 44 | f.write(string) 45 | return self 46 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/check_log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import 
torch.nn.functional as F 5 | import time 6 | from Functions import * 7 | from Dataset import * 8 | from Network import * 9 | from LrScheduler import * 10 | import Metrics 11 | from Logger import CSVLogger 12 | import argparse 13 | from DeePromoter import * 14 | try: 15 | #from apex.parallel import DistributedDataParallel as DDP 16 | from apex.fp16_utils import * 17 | from apex import amp, optimizers 18 | from apex.multi_tensor_apply import multi_tensor_applier 19 | except ImportError: 20 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 21 | import pickle 22 | #gpu selection 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | from sklearn.metrics import matthews_corrcoef 26 | def get_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 29 | parser.add_argument('--path', type=str, default='../v9d3.csv', help='path of csv file with DNA sequences and labels') 30 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 31 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 32 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 33 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 34 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 35 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 36 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 37 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 38 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 39 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 40 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 41 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 42 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 43 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 44 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 45 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 46 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 47 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 48 | parser.set_defaults(kmer_aggregation=True) 49 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 50 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 51 | opts = parser.parse_args() 52 | return opts 53 | 54 | def evaluate_fold(fold): 55 | 56 | #load data 57 | #opts=get_args() 58 | df=pd.read_csv(opts.path) 59 | 60 | sequences=np.asarray(df.sequence) 61 | labels=np.asarray(df.label) 62 | 63 | train_indices, val_indices, test_indices=iter_split_strict(sequences,labels,fold,opts.nfolds) 64 | # print(train_indices.shape) 65 | # print(val_indices.shape) 66 | # 
exit() 67 | dataset=PromoterDataset(sequences[train_indices],labels[train_indices]) 68 | val_dataset=PromoterDataset(sequences[test_indices],labels[test_indices]) 69 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True) 70 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 71 | 72 | 73 | 74 | #init model 75 | model=DeePromoter([27, 14, 7]).to(device).to(device) 76 | model=nn.DataParallel(model) 77 | #optimizer=torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay) 78 | 79 | # Initialization 80 | # opt_level = 'O1' 81 | # model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 82 | 83 | # pytorch_total_params = sum(p.numel() for p in model.parameters()) 84 | # print('Total number of paramters: {}'.format(pytorch_total_params)) 85 | 86 | #evaluation loop 87 | #ground_truths=dataset.labels[dataset.val_indices] 88 | ensemble_predictions=[] 89 | acc=[] 90 | 91 | weights_path="best_weights/fold{}top1.ckpt".format(fold,i+1) 92 | print(weights_path) 93 | checkpoint=torch.load(weights_path) 94 | model.load_state_dict(checkpoint) 95 | predictions,attention_weights,sequences,ground_truths=predict(model,device,val_dataloader) 96 | # #validate(model,device,dataset,batch_size=batch_size*2) 97 | predictions=np.exp(predictions)/np.sum(np.exp(predictions),axis=1).reshape(len(predictions),1) 98 | ensemble_predictions.append(predictions) 99 | ensemble_predictions=np.asarray(ensemble_predictions) 100 | ensemble_predictions=np.mean(np.asarray(ensemble_predictions),axis=0) 101 | model.cpu() 102 | del model 103 | #del optimizer 104 | torch.cuda.empty_cache() 105 | return ensemble_predictions, ground_truths, attention_weights, sequences 106 | 107 | opts=get_args() 108 | 109 | 110 | predictions=[] 111 | ground_truths=[] 112 | #attention_weights=[] 113 | sequences=[] 114 | for i in range(5): 115 | ngram=[7] 116 | p,t,at,seq= evaluate_fold(i) 117 | predictions.append(p) 118 | ground_truths.append(t) 119 | #print(at.shape) 120 | #attention_weights.append(at) 121 | sequences.append(seq) 122 | 123 | 124 | probs=np.concatenate(predictions) 125 | ground_truths=np.concatenate(ground_truths) 126 | predictions=np.argmax(probs,axis=1) 127 | #attention_weights=np.squeeze(np.concatenate(attention_weights,0)).astype('float16') 128 | #sequences=np.asarray(sequences).reshape(-1,81) 129 | acc=Metrics.accuracy(predictions,ground_truths) 130 | sens=Metrics.sensitivity(predictions,ground_truths) 131 | spec=Metrics.specificity(predictions,ground_truths) 132 | MCC=matthews_corrcoef(ground_truths,predictions) 133 | precision=precision_score(ground_truths,predictions) 134 | recall=recall_score(ground_truths,predictions) 135 | f1=f1_score(ground_truths,predictions) 136 | # prediction_dict={'predictions':np.squeeze(predictions), 137 | # 'ground_truths':np.squeeze(ground_truths), 138 | # 'attention_weights':np.squeeze(attention_weights), 139 | # 'sequences':np.squeeze(sequences.reshape(-1,81)) 140 | # } 141 | 142 | # with open("prediction_dict.p","wb+") as f: 143 | # pickle.dump(prediction_dict,f) 144 | 145 | 146 | with open("cv.txt",'w+') as f: 147 | f.write(f"ACC: {acc}\n") 148 | f.write(f"sensitivity: {sens}\n") 149 | f.write(f"spec: {spec}\n") 150 | f.write(f"precision: {precision}\n") 151 | f.write(f"recall: {recall}\n") 152 | f.write(f"f1: {f1}\n") 153 | f.write(f"MCC: {MCC}\n") 154 | -------------------------------------------------------------------------------- 
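The evaluation script above instantiates the DeePromoter baseline whose fully connected head is sized by pushing a dummy tensor through the convolutional and BiLSTM stages (the `get_feature_shape` step in DeePromoter.py). Below is a minimal usage sketch, not part of the repository, assuming `DeePromoter.py` from this folder is importable; the batch size is illustrative and the kernel sizes follow `evaluate.py`.

```python
# Hypothetical usage sketch: instantiate the DeePromoter baseline defined above
# and run a dummy one-hot batch through it. Kernel sizes [27, 14, 7] follow
# evaluate.py; the batch size of 2 is purely illustrative.
import torch
from DeePromoter import DeePromoter  # assumes this folder is on the Python path

model = DeePromoter([27, 14, 7])   # __init__ runs get_feature_shape() on a dummy
                                   # (64, 300, 4) tensor to size the Linear head
dummy = torch.zeros(2, 300, 4)     # batch of 2 one-hot sequences of length 300
logits = model(dummy)              # conv branches -> BiLSTM -> flatten -> FC
print(logits.shape)                # torch.Size([2, 2]): two-class logits
```

Sizing the head with a dummy forward pass keeps the Linear layers consistent with whatever kernel list and pooling size are chosen, so the kernel configuration can be changed without editing the classifier by hand.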
/src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/extract_motif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import argparse 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | font = {'family' : 'normal', 11 | 'weight' : 'bold', 12 | 'size' : 10} 13 | 14 | matplotlib.rc('font', **font) 15 | 16 | def get_args(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--kmers', type=int, default='7', help='kmer') 19 | opts = parser.parse_args() 20 | return opts 21 | 22 | opts=get_args() 23 | 24 | nts=[ 25 | "A", 26 | "T", 27 | "G", 28 | "C"] 29 | 30 | def int2nucleotide(nt_sequence,target_length=None): 31 | seq='' 32 | for nt in nt_sequence: 33 | seq+=nts[nt] 34 | return seq 35 | 36 | with open("prediction_dict.p","rb") as f: 37 | prediction_dict=pickle.load(f) 38 | 39 | 40 | df=pd.DataFrame(columns=['index','sequence']) 41 | 42 | def get_kmers(sequence,k): 43 | kmers=[] 44 | for i in range(len(sequence)-k+1): 45 | kmers.append(sequence[i:i+k]) 46 | return kmers 47 | 48 | os.system('mkdir aw_visualized') 49 | 50 | top=10 51 | count=0 52 | sequences=[] 53 | top_kmers=[] 54 | top_k_count=[] 55 | for i in tqdm(range(len(prediction_dict['sequences']))): 56 | 57 | count+=1 58 | sequence=int2nucleotide(prediction_dict['sequences'][i]) 59 | sequences.append(sequence) 60 | attention_weights=prediction_dict['attention_weights'][i] 61 | ground_truth=prediction_dict['ground_truths'][i] 62 | prediction=prediction_dict['predictions'][i] 63 | 64 | kmers=np.asarray(get_kmers(sequence,opts.kmers)) 65 | 66 | attention_weights=attention_weights[-1].sum(0) 67 | #attention_weights=attention_weights/attention_weights.sum() 68 | # plt.imshow(attention_weights.reshape(1,-1).astype('float32')) 69 | # plt.show() 70 | #exit() 71 | if ground_truth==1: 72 | state='positive' 73 | else: 74 | state='negative' 75 | 76 | if ground_truth==prediction: 77 | eval='correct' 78 | else: 79 | eval='wrong' 80 | if state=='positive' and eval=='correct': 81 | sorted_indices=np.argsort(attention_weights) 82 | #print(attention_weights[sorted_indices][-3:]) 83 | top_k=kmers[sorted_indices][-3:] 84 | for kmer in top_k: 85 | if kmer not in top_kmers: 86 | top_kmers.append(kmer) 87 | top_k_count.append(1) 88 | else: 89 | top_k_count[top_kmers.index(kmer)]=top_k_count[top_kmers.index(kmer)]+1 90 | #exit() 91 | 92 | top_kmers=np.asarray(top_kmers) 93 | top_k_count=np.asarray(top_k_count) 94 | 95 | #exit() 96 | 97 | top_indices=np.flip(np.argsort(top_k_count)) 98 | 99 | fig, ax = plt.subplots() 100 | x=np.arange(top) 101 | width=0.4 102 | bar=ax.bar(x,top_k_count[top_indices[:top]],edgecolor='k',linewidth=2) 103 | ax.set_ylabel('Num of appearancesin top 3',fontsize=10) 104 | #ax.set_title('Scores by group and gender') 105 | ax.set_xticks(x) 106 | ax.set_xticklabels(top_kmers[top_indices[:top]]) 107 | plt.setp(ax.get_xticklabels(), rotation=30, ha="right", 108 | rotation_mode="anchor") 109 | ax.legend() 110 | plt.savefig('promoter_motifs.eps') 111 | #plt.show() 112 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/mouse_tata_deepromoter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in {0..4};do 3 | python train.py --fold $i --gpu_id 0 --kmer_aggregation --epochs 50 \ 4 | --nlayers 6 --nmute 45 --path 
../../data/mouse_tata_dataset.csv --kmers 7 --ninp 256 --nhid 1024 \ 5 | --batch_size 64 6 | done 7 | 8 | python evaluate.py --gpu_id 0 --kmer_aggregation --epochs 150 \ 9 | --nlayers 6 --nmute 15 --kmers 7 --ninp 256 --nhid 1024 \ 10 | --path ../../data/mouse_tata_dataset.csv 11 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/run_human.sh: -------------------------------------------------------------------------------- 1 | cd human_tata 2 | bash run.sh 3 | cd .. 4 | 5 | cd human_non_tata 6 | bash run.sh 7 | cd .. 8 | -------------------------------------------------------------------------------- /src/Eukaryotic_Promoters_Classification/run_mouse.sh: -------------------------------------------------------------------------------- 1 | cd mouse_tata 2 | bash run.sh 3 | cd .. 4 | 5 | cd mouse_non_tata 6 | bash run.sh 7 | cd .. 8 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class DeepSeaDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.sequences=sequences 30 | self.labels=labels 31 | 32 | 33 | def __len__(self): 34 | return len(self.labels) 35 | 36 | def __getitem__(self,idx): 37 | sequence=self.sequences[idx].argmax(0) 38 | return {'data':sequence, 'labels':self.labels[idx]} 39 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import random 13 | 14 | def seed_everything(seed=42): 15 | random.seed(seed) 16 | os.environ['PYTHONHASHSEED'] = str(seed) 17 | np.random.seed(seed) 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | torch.backends.cudnn.deterministic = True 21 | 22 | def get_best_weights_from_fold(fold,top=3): 23 | csv_file='log_fold{}.csv'.format(fold) 24 | 25 | history=pd.read_csv(csv_file) 26 | scores=np.asarray(history.val_auc) 27 | top_epochs=scores.argsort()[-3:][::-1] 28 | print(scores[top_epochs]) 29 | os.system('mkdir best_weights') 30 | 31 | for i in range(top): 32 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 33 | print(weights_path) 34 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 35 | os.system('rm -r checkpoints_fold{}'.format(fold)) 36 | 37 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 38 | gold = gold.contiguous().view(-1) 39 | one_hot = torch.zeros_like(pred).scatter(1, 
gold.view(-1, 1), 1) 40 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 41 | log_prb = F.log_softmax(pred, dim=1) 42 | loss = -(one_hot * log_prb) 43 | #loss=loss.sum(1).mean() 44 | return loss 45 | 46 | def mutate_dna_sequence(sequence,nmute=15): 47 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 48 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 49 | sequence[:,to_mutate]=mutation 50 | return sequence 51 | 52 | def get_MLM_mask(sequence,nmask=12): 53 | mask=np.zeros(sequence.shape,dtype='bool') 54 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 55 | mask[:,to_mask]=True 56 | return mask 57 | 58 | def get_complementary_sequence_deepsea(sequence): 59 | #AGCT 60 | complementary_sequence=sequence.clone() 61 | complementary_sequence[sequence==0]=3 62 | complementary_sequence[sequence==1]=2 63 | complementary_sequence[sequence==2]=1 64 | complementary_sequence[sequence==3]=0 65 | complementary_sequence=complementary_sequence.flip(-1) 66 | return complementary_sequence 67 | 68 | def update_lr(optimizer, lr): 69 | for param_group in optimizer.param_groups: 70 | param_group['lr'] = lr 71 | 72 | def save_weights(model,optimizer,epoch,folder): 73 | if os.path.isdir(folder)==False: 74 | os.makedirs(folder,exist_ok=True) 75 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 76 | 77 | 78 | 79 | def validate(model,device,dataset,batch_size=64): 80 | batches=len(dataset) 81 | model.train(False) 82 | total=0 83 | predictions=[] 84 | outputs=[] 85 | ground_truths=[] 86 | loss=0 87 | criterion=nn.BCEWithLogitsLoss() 88 | with torch.no_grad(): 89 | for data in tqdm(dataset): 90 | X=data['data'].to(device).long() 91 | Y=data['labels'].to(device).float() 92 | 93 | output= model(X) 94 | del X 95 | loss+=criterion(output,Y) 96 | probs = torch.sigmoid(output) 97 | for pred in probs: 98 | predictions.append(pred.cpu().numpy()>0.5) 99 | for vector in probs: 100 | outputs.append(vector.cpu().numpy()) 101 | for t in Y: 102 | ground_truths.append(t.cpu().numpy()) 103 | del output 104 | torch.cuda.empty_cache() 105 | val_loss=(loss/batches).cpu() 106 | ground_truths=np.asarray(ground_truths).reshape(-1) 107 | predictions=np.asarray(predictions).reshape(-1) 108 | outputs=np.asarray(outputs).reshape(-1) 109 | #score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 110 | val_acc=Metrics.accuracy(predictions,ground_truths) 111 | auc=metrics.roc_auc_score(ground_truths,outputs) 112 | val_sens=Metrics.sensitivity(predictions,ground_truths) 113 | val_spec=Metrics.specificity(predictions,ground_truths) 114 | print('Val accuracy: {}, Val_auc: {}, Val Loss: {}'.format(val_acc,auc,val_loss)) 115 | return val_loss,auc,val_acc,val_sens,val_spec 116 | 117 | 118 | def predict(model,device,dataset,batch_size=64): 119 | batches=int(len(dataset.val_indices)/batch_size)+1 120 | model.train(False) 121 | total=0 122 | ground_truths=dataset.labels[dataset.val_indices] 123 | predictions=[] 124 | attention_weights=[] 125 | loss=0 126 | criterion=nn.CrossEntropyLoss() 127 | dataset.switch_mode(training=False) 128 | dataset.update_batchsize(batch_size) 129 | with torch.no_grad(): 130 | for i in tqdm(range(len(dataset))): 131 | data=dataset[i] 132 | X=torch.Tensor(data['data']).to(device,).long() 133 | Y=torch.Tensor(data['labels']).to(device,dtype=torch.int64) 134 | directions=data['directions'] 135 | directions=directions.reshape(len(directions),1)*np.ones(X.shape) 136 | 
directions=torch.Tensor(directions).to(device).long() 137 | output,_,_,aw= model(X,directions,None) 138 | del X 139 | loss+=criterion(output,Y) 140 | classification_predictions = torch.argmax(output,dim=1).squeeze() 141 | for pred in output: 142 | predictions.append(pred.cpu().numpy()) 143 | for weight in aw: 144 | attention_weights.append(weight.cpu().numpy()) 145 | 146 | del output 147 | torch.cuda.empty_cache() 148 | val_loss=(loss/batches).cpu() 149 | predictions=np.asarray(predictions) 150 | attention_weights=np.asarray(attention_weights) 151 | binary_predictions=predictions.copy() 152 | binary_predictions[binary_predictions==2]=1 153 | binary_ground_truths=ground_truths.copy() 154 | binary_ground_truths[binary_ground_truths==2]=1 155 | return predictions,attention_weights,np.asarray(dataset.data[dataset.val_indices]) 156 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | header=True 16 | else: 17 | header=False 18 | return header 19 | 20 | 21 | def _write_header(self): 22 | with open(self.file,"a") as f: 23 | string="" 24 | for attrib in self.columns: 25 | string+="{},".format(attrib) 26 | string=string[:len(string)-1] 27 | string+="\n" 28 | f.write(string) 29 | return self 30 | 31 | def log(self,row): 32 | if len(row)!=len(self.columns): 33 | raise Exception("Mismatch between row vector and number of columns in logger") 34 | with open(self.file,"a") as f: 35 | string="" 36 | for attrib in row: 37 | string+="{},".format(attrib) 38 | string=string[:len(string)-1] 39 | string+="\n" 40 | f.write(string) 41 | return self 42 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/Metrics.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/compute_median_aucs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | df=pd.read_csv('test_aucs.csv') 4 | deepsea=pd.read_excel('../41592_2015_BFnmeth3547_MOESM646_ESM.xlsx') 5 | deepsea_aucs=deepsea.iloc[1:,4] 6 | deepsea_aucs[599]=1 7 | 8 | with open("test_results.txt",'w+') as f: 9 | f.write('###NT###\n') 10 | f.write(f"DNase_median_acu: {df.AUC.iloc[:125].median()}\n") 11 | f.write(f"TF_median_acu: {df.AUC.iloc[125:815].median()}\n") 12 | f.write(f"Histone_median_acu: {df.AUC.iloc[815:919].median()}\n") 13 | f.write('###Deep Sea###\n') 14 | f.write(f"DNase_median_acu: {deepsea_aucs[:125].median()}\n") 15 | f.write(f"TF_median_acu: {deepsea_aucs[125:815].median()}\n") 16 | f.write(f"Histone_median_acu: {deepsea_aucs[815:919].median()}\n") 17 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/compute_val_aucs.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn import metrics 3 | from tqdm import tqdm 4 | import numpy as np 5 | 6 | 7 | with open('test_results.p','rb') as f: 8 | outputs,ground_truths=pickle.load(f) 9 | 10 | aucs=[] 11 | for i in tqdm(range(598)): 12 | auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 13 | aucs.append(auc) 14 | 15 | aucs.append(1) 16 | for i in tqdm(range(599,919)): 17 | auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 18 | aucs.append(auc) 19 | 20 | import pandas as pd 21 | df=pd.DataFrame(columns=['AUC']) 22 | df['AUC']=aucs 23 | 24 | df.to_csv('test_aucs.csv') 25 | 26 | # #exit() 27 | # aucs=[] 28 | # for i in tqdm(range(125)): 29 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 30 | # aucs.append(auc) 31 | # all_aucs.append(auc) 32 | # 33 | # DNase_median_acu=np.median(aucs) 34 | # 35 | # 36 | # aucs=[] 37 | # for i in tqdm(range(125,598)): 38 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 39 | # aucs.append(auc) 40 | # 41 | # 42 | # for i in tqdm(range(599,815)): 43 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 44 | # aucs.append(auc) 45 | # 46 | # TF_median_acu=np.median(aucs) 47 | # 48 | # aucs=[] 49 | # for i in 
tqdm(range(815,919)): 50 | # auc=metrics.roc_auc_score(ground_truths[:,i],outputs[:,i]) 51 | # aucs.append(auc) 52 | # 53 | # 54 | # Histone_median_acu=np.median(aucs) 55 | # 56 | # with open("test_results.txt",'w+') as f: 57 | # f.write(f"DNase_median_acu: {DNase_median_acu}\n") 58 | # f.write(f"TF_median_acu: {TF_median_acu}\n") 59 | # f.write(f"Histone_median_acu: {Histone_median_acu}\n") 60 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | import h5py 3 | import numpy as np 4 | 5 | f=h5py.File('deepsea_train/train.mat', 'r')# as f: 6 | train_seqs =np.array(f['trainxdata']).transpose(2,1,0).astype('uint8') 7 | train_labels =np.array(f['traindata']).transpose(1,0).astype('uint8') 8 | val_data = scipy.io.loadmat('deepsea_train/valid.mat') 9 | val_seqs = np.array(val_data['validxdata']).transpose(2,1,0).astype('uint8') 10 | val_labels = np.array(val_data['validdata']).transpose(1,0).astype('uint8') 11 | 12 | import pickle 13 | with open('DeepSea_TrainVal.p','wb+') as f: 14 | pickle.dump([train_seqs,train_labels,val_seqs,val_labels],f) 15 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/readme.md: -------------------------------------------------------------------------------- 1 | # classifying effects of non-coding variants 2 | 3 | 1. download datasets from http://deepsea.princeton.edu/media/code/deepsea_train_bundle.v0.9.tar.gz 4 | 2. create folder ```deepsea_train``` and unzip contents into folder and run ```preprocess.py``` to extract train val set to a numpy file (for faster data loading) 5 | 3. ```bash run.sh``` to run training 6 | 4. ```bash test.sh``` to make inference on the test set 7 | 5. 
```compute_val_aucs.py``` and ```compute_median_aucs.py``` to calculate test aucs and median aucs in TF/DNS/HM 8 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/restart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python restart.py --gpu_id 0,1 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 3 \ 5 | --batch_size 256 --kmers 13 --lr_scale 0.1 --ninp 512 --nhid 2048 --num_workers 32 \ 6 | --nclass 919 --nhead 8 --restart_epoch 20 7 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python train.py --gpu_id 0,1 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 3 \ 5 | --batch_size 512 --kmers 7 --lr_scale 1 --ninp 1024 --nhid 4096 --num_workers 32 \ 6 | --nclass 919 --nhead 16 --weight_decay 1e-6 --dropout 0.2 7 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python validate.py --gpu_id 0,1 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 3 \ 5 | --batch_size 1024 --kmers 7 --lr_scale 0.1 --ninp 1024 --nhid 4096 --num_workers 32 \ 6 | --nclass 919 --nhead 16 7 | -------------------------------------------------------------------------------- /src/Non_Coding_Variant_Effects/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import time 5 | from Functions import * 6 | from Dataset import * 7 | from Network import * 8 | from LrScheduler import * 9 | import Metrics 10 | from Logger import CSVLogger 11 | import argparse 12 | try: 13 | #from apex.parallel import DistributedDataParallel as DDP 14 | from apex.fp16_utils import * 15 | from apex import amp, optimizers 16 | from apex.multi_tensor_apply import multi_tensor_applier 17 | except ImportError: 18 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 19 | 20 | 21 | def get_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 24 | parser.add_argument('--path', type=str, default='../', help='path of csv file with DNA sequences and labels') 25 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 26 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 27 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 28 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 29 | parser.add_argument('--nclass', type=int, default=919, help='number of classes from the linear decoder') 30 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 31 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 32 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 33 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 34 | parser.add_argument('--save_freq', type=int, default=1, 
help='saving checkpoints per save_freq epochs') 35 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 36 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 37 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 38 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 39 | parser.add_argument('--kmers', type=int, nargs='+', default=[7], help='k-mers to be aggregated') 40 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 41 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 42 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 43 | parser.set_defaults(kmer_aggregation=True) 44 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 45 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 46 | parser.add_argument('--val_freq', type=int, default=1, help='which fold to train') 47 | parser.add_argument('--num_workers', type=int, default=1, help='num_workers') 48 | opts = parser.parse_args() 49 | return opts 50 | 51 | #def train_fold(): 52 | 53 | opts=get_args() 54 | seed_everything(2020) 55 | #gpu selection 56 | os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id 57 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 58 | 59 | 60 | import pickle 61 | with open('DeepSea_TrainVal.p','rb') as f: 62 | train_seqs,train_labels,val_seqs,val_labels=pickle.load(f) 63 | 64 | #exit() 65 | 66 | dataset=DeepSeaDataset(train_seqs,train_labels) 67 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True,num_workers=opts.num_workers) 68 | val_dataset=DeepSeaDataset(val_seqs.transpose(2,1,0),val_labels.transpose(1,0)) 69 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*4,shuffle=False) 70 | 71 | #exit() 72 | #lr=0 73 | 74 | #checkpointing 75 | checkpoints_folder='checkpoints_fold{}'.format((opts.fold)) 76 | csv_file='log_fold{}.csv'.format((opts.fold)) 77 | columns=['epoch','train_loss', 78 | 'val_loss','val_auc','val_acc','val_sens','val_spec'] 79 | logger=CSVLogger(columns,csv_file) 80 | 81 | #build model and logger 82 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 83 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 84 | dropout=opts.dropout).to(device) 85 | optimizer=torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay) 86 | criterion=nn.BCEWithLogitsLoss(reduction='none') 87 | lr_schedule=lr_AIAYN(optimizer,opts.ninp,opts.warmup_steps,opts.lr_scale) 88 | # Initialization 89 | opt_level = 'O1' 90 | model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 91 | model = nn.DataParallel(model) 92 | #softmax = nn.Softmax(dim=1) 93 | 94 | pytorch_total_params = sum(p.numel() for p in model.parameters()) 95 | print('Total number of paramters: {}'.format(pytorch_total_params)) 96 | 97 | #print("Starting training for fold {}/{}".format(opts.fold,opts.nfolds)) 98 | #training loop 99 | for epoch in range(opts.epochs): 100 | model.train(True) 101 | t=time.time() 102 | total_loss=0 103 | optimizer.zero_grad() 104 | total_steps=len(dataloader) 105 | step=0 106 | for data in tqdm(dataloader): 107 | step+=1 108 | #for step in range(1): 109 | lr=lr_schedule.step() 110 | 
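# per-step flow: move the batch to the GPU, run the forward pass over all 919
# targets, and compute an element-wise BCEWithLogitsLoss in which positive
# labels are up-weighted 10x before averaging; the backward pass goes through
# apex AMP loss scaling and gradients are clipped to a max norm of 1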
src=data['data'].to(device).long() 111 | labels=data['labels'].to(device).float() 112 | #exit() 113 | #mutated_sequence=mutate_dna_sequence(src,opts.nmute).to(device) 114 | output=model(src) 115 | #loss_weight=torch.ones(len(output),device=device) 116 | loss_weight=torch.ones_like(labels) 117 | loss_weight[labels==1]=10 118 | loss=criterion(output.reshape(-1),labels.reshape(-1))*loss_weight.reshape(-1) 119 | loss=loss.mean() 120 | 121 | with amp.scale_loss(loss, optimizer) as scaled_loss: 122 | scaled_loss.backward() 123 | torch.nn.utils.clip_grad_norm_(model.parameters(), 1) 124 | optimizer.step() 125 | optimizer.zero_grad() 126 | total_loss+=loss 127 | # print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}" 128 | # .format(epoch+1, opts.epochs, step+1, total_steps, total_loss/(step+1) , lr,time.time()-t),end='\r',flush=True) #total_loss/(step+1) 129 | # #break 130 | print('') 131 | 132 | train_loss=total_loss/(step+1) 133 | 134 | if (epoch+1)%opts.val_freq==0: 135 | val_loss,auc,val_acc,val_sens,val_spec=validate(model,device,val_dataloader,batch_size=opts.batch_size*2) 136 | print("Epoch {} train loss: {}".format(epoch+1,train_loss)) 137 | 138 | to_log=[epoch+1,train_loss,val_loss,auc,val_acc,val_sens,val_spec] 139 | logger.log(to_log) 140 | 141 | 142 | if (epoch+1)%opts.save_freq==0: 143 | save_weights(model,optimizer,epoch,checkpoints_folder) 144 | 145 | 146 | get_best_weights_from_fold(opts.fold) 147 | 148 | #train_fold() 149 | -------------------------------------------------------------------------------- /src/Viral_identification/Dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | import torch 7 | 8 | 9 | nt_int={ 10 | "A": 0, 11 | "T": 1, 12 | "G": 2, 13 | "C": 3,} 14 | 15 | def nucleatide2int(nt_sequence,target_length=None): 16 | int_sequence=[] 17 | for nt in nt_sequence: 18 | nt=nt.upper() 19 | if nt in nt_int: 20 | int_sequence.append(nt_int[nt]) 21 | int_sequence=np.asarray(int_sequence,dtype='int32') 22 | if target_length: 23 | int_sequence=np.pad(int_sequence,(0,target_length-len(int_sequence)),constant_values=-1) 24 | return int_sequence 25 | 26 | 27 | class ViraminerDataset(torch.utils.data.Dataset): 28 | def __init__(self,sequences,labels): 29 | self.data=[] 30 | for seq in sequences: 31 | self.data.append(nucleatide2int(seq)) 32 | 33 | self.data=np.asarray(self.data,dtype='int') 34 | self.labels=np.asarray(labels,dtype='int') 35 | 36 | print(self.data.shape) 37 | print(self.labels.shape) 38 | 39 | def __len__(self): 40 | return len(self.labels) 41 | 42 | def __getitem__(self,idx): 43 | return {'data':self.data[idx], 'labels':self.labels[idx]} 44 | -------------------------------------------------------------------------------- /src/Viral_identification/Functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from sklearn import metrics 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import Metrics 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | import random 13 | 14 | def seed_everything(seed=42): 15 | random.seed(seed) 16 | os.environ['PYTHONHASHSEED'] = str(seed) 17 | np.random.seed(seed) 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | torch.backends.cudnn.deterministic = True 21 | 22 | def 
get_best_weights_from_fold(fold,top=3): 23 | csv_file='log_fold{}.csv'.format(fold) 24 | 25 | history=pd.read_csv(csv_file) 26 | scores=np.asarray(history.val_auc) 27 | top_epochs=scores.argsort()[-3:][::-1] 28 | print(scores[top_epochs]) 29 | os.system('mkdir best_weights') 30 | 31 | for i in range(top): 32 | weights_path='checkpoints_fold{}/epoch{}.ckpt'.format(fold,history.epoch[top_epochs[i]]) 33 | print(weights_path) 34 | os.system('cp {} best_weights/fold{}top{}.ckpt'.format(weights_path,fold,i+1)) 35 | os.system('rm -r checkpoints_fold{}'.format(fold)) 36 | 37 | def smoothcrossentropyloss(pred,gold,n_class=2,smoothing=0.05): 38 | gold = gold.contiguous().view(-1) 39 | one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1) 40 | one_hot = one_hot * (1 - smoothing) + (1 - one_hot) * smoothing / (n_class - 1) 41 | log_prb = F.log_softmax(pred, dim=1) 42 | loss = -(one_hot * log_prb) 43 | #loss=loss.sum(1).mean() 44 | return loss 45 | 46 | def mutate_dna_sequence(sequence,nmute=15): 47 | mutation=torch.randint(0,4,size=(sequence.shape[0],nmute)) 48 | to_mutate = torch.randperm(sequence.shape[1])[:nmute] 49 | sequence[:,to_mutate]=mutation 50 | return sequence 51 | 52 | def get_MLM_mask(sequence,nmask=12): 53 | mask=np.zeros(sequence.shape,dtype='bool') 54 | to_mask=np.random.choice(len(sequence[0]),size=(nmask),replace=False) 55 | mask[:,to_mask]=True 56 | return mask 57 | 58 | def get_complementary_sequence(sequence): 59 | complementary_sequence=sequence.copy() 60 | complementary_sequence[sequence==0]=1 61 | complementary_sequence[sequence==1]=0 62 | complementary_sequence[sequence==2]=3 63 | complementary_sequence[sequence==3]=2 64 | complementary_sequence=complementary_sequence[:,::-1] 65 | return complementary_sequence 66 | 67 | def update_lr(optimizer, lr): 68 | for param_group in optimizer.param_groups: 69 | param_group['lr'] = lr 70 | 71 | def save_weights(model,optimizer,epoch,folder): 72 | if os.path.isdir(folder)==False: 73 | os.makedirs(folder,exist_ok=True) 74 | torch.save(model.state_dict(), folder+'/epoch{}.ckpt'.format(epoch+1)) 75 | 76 | 77 | 78 | def validate(model,device,dataset,batch_size=64): 79 | batches=len(dataset) 80 | model.train(False) 81 | total=0 82 | predictions=[] 83 | outputs=[] 84 | ground_truths=[] 85 | loss=0 86 | criterion=nn.CrossEntropyLoss() 87 | with torch.no_grad(): 88 | for data in tqdm(dataset): 89 | X=data['data'].to(device) 90 | Y=data['labels'].to(device) 91 | 92 | output= model(X) 93 | del X 94 | loss+=criterion(output,Y) 95 | classification_predictions = torch.argmax(output,dim=1).squeeze() 96 | for pred in classification_predictions: 97 | predictions.append(pred.cpu().numpy()) 98 | for vector in output: 99 | outputs.append(vector.cpu().numpy()) 100 | for t in Y: 101 | ground_truths.append(t.cpu().numpy()) 102 | del output 103 | torch.cuda.empty_cache() 104 | val_loss=(loss/batches).cpu() 105 | ground_truths=np.asarray(ground_truths) 106 | predictions=np.asarray(predictions) 107 | outputs=np.asarray(outputs) 108 | #print(predictions) 109 | #print(ground_truths) 110 | #score=metrics.cohen_kappa_score(ground_truths,predictions,weights='quadratic') 111 | val_acc=Metrics.accuracy(predictions,ground_truths) 112 | auc=metrics.roc_auc_score(ground_truths,outputs[:,1]) 113 | val_sens=Metrics.sensitivity(predictions,ground_truths) 114 | val_spec=Metrics.specificity(predictions,ground_truths) 115 | print('Val accuracy: {}, Val Loss: {}'.format(val_acc,val_loss)) 116 | return val_loss,auc,val_acc,val_sens,val_spec 117 | 118 | 119 | def 
predict(model,device,dataset,batch_size=64): 120 | batches=int(len(dataset.val_indices)/batch_size)+1 121 | model.train(False) 122 | total=0 123 | ground_truths=dataset.labels[dataset.val_indices] 124 | predictions=[] 125 | attention_weights=[] 126 | loss=0 127 | criterion=nn.CrossEntropyLoss() 128 | dataset.switch_mode(training=False) 129 | dataset.update_batchsize(batch_size) 130 | with torch.no_grad(): 131 | for i in tqdm(range(len(dataset))): 132 | data=dataset[i] 133 | X=torch.Tensor(data['data']).to(device,).long() 134 | Y=torch.Tensor(data['labels']).to(device,dtype=torch.int64) 135 | directions=data['directions'] 136 | directions=directions.reshape(len(directions),1)*np.ones(X.shape) 137 | directions=torch.Tensor(directions).to(device).long() 138 | output,_,_,aw= model(X,directions,None) 139 | del X 140 | loss+=criterion(output,Y) 141 | classification_predictions = torch.argmax(output,dim=1).squeeze() 142 | for pred in output: 143 | predictions.append(pred.cpu().numpy()) 144 | for weight in aw: 145 | attention_weights.append(weight.cpu().numpy()) 146 | 147 | del output 148 | torch.cuda.empty_cache() 149 | val_loss=(loss/batches).cpu() 150 | predictions=np.asarray(predictions) 151 | attention_weights=np.asarray(attention_weights) 152 | binary_predictions=predictions.copy() 153 | binary_predictions[binary_predictions==2]=1 154 | binary_ground_truths=ground_truths.copy() 155 | binary_ground_truths[binary_ground_truths==2]=1 156 | return predictions,attention_weights,np.asarray(dataset.data[dataset.val_indices]) 157 | -------------------------------------------------------------------------------- /src/Viral_identification/Logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import path 3 | 4 | 5 | class CSVLogger: 6 | def __init__(self,columns,file): 7 | self.columns=columns 8 | self.file=file 9 | if not self.check_header(): 10 | self._write_header() 11 | 12 | 13 | def check_header(self): 14 | if path.exists(self.file): 15 | header=True 16 | else: 17 | header=False 18 | return header 19 | 20 | 21 | def _write_header(self): 22 | with open(self.file,"a") as f: 23 | string="" 24 | for attrib in self.columns: 25 | string+="{},".format(attrib) 26 | string=string[:len(string)-1] 27 | string+="\n" 28 | f.write(string) 29 | return self 30 | 31 | def log(self,row): 32 | if len(row)!=len(self.columns): 33 | raise Exception("Mismatch between row vector and number of columns in logger") 34 | with open(self.file,"a") as f: 35 | string="" 36 | for attrib in row: 37 | string+="{},".format(attrib) 38 | string=string[:len(string)-1] 39 | string+="\n" 40 | f.write(string) 41 | return self 42 | -------------------------------------------------------------------------------- /src/Viral_identification/LrScheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def update_lr(optimizer, lr): 5 | for param_group in optimizer.param_groups: 6 | param_group['lr'] = lr 7 | 8 | class lr_AIAYN(): 9 | ''' 10 | Learning rate scheduler from the paper: 11 | Attention is All You Need 12 | ''' 13 | def __init__(self,optimizer,d_model,warmup_steps=4000,factor=1): 14 | self.optimizer=optimizer 15 | self.d_model=d_model 16 | self.warmup_steps=warmup_steps 17 | self.step_num=0 18 | self.factor=factor 19 | 20 | def step(self): 21 | self.step_num+=1 22 | lr=self.d_model**-0.5*np.min([self.step_num**-0.5, 23 | self.step_num*self.warmup_steps**-1.5])*self.factor 24 | 
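# Noam schedule from "Attention is All You Need":
#   lr = factor * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
# i.e. a linear warm-up over warmup_steps steps followed by 1/sqrt(step) decay;
# with d_model=512, warmup_steps=3200 and factor=0.1 (the values used by run.sh)
# the peak learning rate reached at step 3200 is roughly 7.8e-5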
update_lr(self.optimizer,lr) 25 | return lr 26 | 27 | 28 | class Cos_Anneal(): 29 | ''' 30 | Learning rate scheduler flat and anneal 31 | ''' 32 | def __init__(self,optimizer,max_lr,min_lr,T): 33 | self.optimizer=optimizer 34 | self.max_lr=max_lr 35 | self.min_lr=min_lr 36 | self.step_num=0 37 | self.T=T 38 | 39 | def step(self): 40 | pi=3.1415 41 | self.step_num+=1 42 | lr=self.min_lr+0.5*(self.max_lr-self.min_lr)*(1+np.cos(self.step_num/self.T*pi)) 43 | if self.optimizer: 44 | update_lr(self.optimizer,lr) 45 | return lr -------------------------------------------------------------------------------- /src/Viral_identification/Metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def accuracy(predictions,ground_truths): 5 | return np.sum(predictions==ground_truths)/len(ground_truths) 6 | 7 | 8 | def sensitivity(predictions,ground_truths): 9 | ''' 10 | Here it is assumed: 11 | 0=negative 12 | 1=positive 13 | ''' 14 | return 1-len(predictions[(predictions==0)*(ground_truths==1)])/len(ground_truths[ground_truths==1]) 15 | 16 | 17 | 18 | def specificity(predictions,ground_truths): 19 | ''' 20 | Here it is assumed: 21 | 0=negative 22 | 1=positive 23 | ''' 24 | return 1-len(predictions[(predictions==1)*(ground_truths==0)])/len(ground_truths[ground_truths==0]) 25 | 26 | def MCC(predictions,ground_truths): 27 | ''' 28 | Here it is assumed: 29 | 0=negative 30 | 1=positive 31 | ''' 32 | N1=len(predictions[(predictions==0)&(ground_truths==1)]) 33 | N2=len(predictions[(predictions==1)&(ground_truths==0)]) 34 | N3=len(ground_truths[ground_truths==1]) 35 | N4=len(ground_truths[ground_truths==0]) 36 | sens=1-N1/N3 37 | spec=1-N2/N4 38 | denom=np.sqrt((1+(N2-N1)/N3)*(1+(N1-N2)/N4)) 39 | return (1-sens-spec)/denom 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/Viral_identification/evaluate_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import time 5 | from Functions import * 6 | from Dataset import * 7 | from Network import * 8 | from LrScheduler import * 9 | import Metrics 10 | from Logger import CSVLogger 11 | import argparse 12 | try: 13 | #from apex.parallel import DistributedDataParallel as DDP 14 | from apex.fp16_utils import * 15 | from apex import amp, optimizers 16 | from apex.multi_tensor_apply import multi_tensor_applier 17 | except ImportError: 18 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 19 | from tqdm import tqdm 20 | 21 | def get_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--gpu_id', type=str, default='0,1', help='which gpu to use') 24 | parser.add_argument('--path', type=str, default='../', help='path of csv file with DNA sequences and labels') 25 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 26 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 27 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 28 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 29 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 30 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 
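# the architecture flags (ninp above, nhead/nhid/nlayers/kmers below) need to
# match the values used at training time, otherwise the checkpoints restored
# from best_weights/ will not load into the rebuilt NucleicTransformer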
31 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 32 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 33 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 34 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 35 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 36 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 37 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 38 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 39 | parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 40 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 41 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 42 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 43 | parser.set_defaults(kmer_aggregation=True) 44 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 45 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 46 | parser.add_argument('--val_freq', type=int, default=1, help='which fold to train') 47 | opts = parser.parse_args() 48 | return opts 49 | 50 | 51 | opts=get_args() 52 | #gpu selection 53 | os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id 54 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 55 | #lr=0 56 | 57 | #checkpointing 58 | checkpoints_folder='checkpoints_fold{}'.format((opts.fold)) 59 | csv_file='log_fold{}.csv'.format((opts.fold)) 60 | columns=['epoch','train_loss','train_acc','recon_acc', 61 | 'val_loss','val_auc','val_acc','val_sens','val_spec'] 62 | #logger=CSVLogger(columns,csv_file) 63 | 64 | #build model and logger 65 | MODELS=[] 66 | for i in range(3): 67 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 68 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 69 | dropout=opts.dropout).to(device) 70 | optimizer=torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay) 71 | criterion=nn.CrossEntropyLoss(reduction='none') 72 | lr_schedule=lr_AIAYN(optimizer,opts.ninp,opts.warmup_steps,opts.lr_scale) 73 | # Initialization 74 | opt_level = 'O1' 75 | model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 76 | model = nn.DataParallel(model) 77 | 78 | 79 | pytorch_total_params = sum(p.numel() for p in model.parameters()) 80 | print('Total number of paramters: {}'.format(pytorch_total_params)) 81 | 82 | model.load_state_dict(torch.load("best_weights/fold0top{}.ckpt".format(i+1))) 83 | model.eval() 84 | MODELS.append(model) 85 | 86 | dict=MODELS[0].module.state_dict() 87 | for key in dict: 88 | for i in range(1,len(MODELS)): 89 | dict[key]=dict[key]+MODELS[i].module.state_dict()[key] 90 | 91 | dict[key]=dict[key]/float(len(MODELS)) 92 | 93 | MODELS[0].module.load_state_dict(dict) 94 | avg_model=MODELS[0] 95 | 96 | def geometric_mean(preds): 97 | gmean=np.ones(preds.shape[1:]) 98 | 99 | for pred in preds: 100 | gmean=gmean*pred 101 | 102 | gmean=gmean**(1/len(preds)) 103 | return gmean 104 | 105 | df=pd.read_csv('../fullset_test.csv',header=None) 106 | 107 | seqs=[] 108 | labels=[] 109 | 110 | 
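# fullset_test.csv is read without a header: column 1 holds the DNA sequence and
# column 2 the binary label, so each row is tokenized with nucleatide2int
# (A/T/G/C -> 0..3) and the labels are collected into an integer array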
for i in range(len(df)): 111 | seqs.append(nucleatide2int(df.iloc[i,1])) 112 | labels.append(df.iloc[i,2]) 113 | labels=np.asarray(labels).astype("int") 114 | seqs=np.asarray(seqs).astype("int") 115 | 116 | 117 | batch_size=128 118 | batches=np.around(len(df)/batch_size+0.5).astype('int') 119 | preds=[] 120 | softmax = nn.Softmax(dim=1) 121 | for i in tqdm(range(batches)): 122 | with torch.no_grad(): 123 | outputs=[] 124 | #for model in MODELS: 125 | x=torch.Tensor(seqs[i*batch_size:(i+1)*batch_size]).to(device).long() 126 | y=softmax(avg_model(x)) 127 | #outputs.append(softmax(y).cpu().numpy()) 128 | for vec in y: 129 | preds.append(vec.cpu().numpy()) 130 | 131 | from sklearn import metrics 132 | preds=np.asarray(preds) 133 | auc=metrics.roc_auc_score(labels,preds[:,1]) 134 | 135 | with open("test_results.p",'wb+') as f: 136 | pickle.dump([labels,preds],f) 137 | 138 | 139 | print(auc) 140 | with open("test_score.txt",'w+') as f: 141 | f.write("test auc score: {}".format(auc)) 142 | 143 | 144 | 145 | 146 | # for i in range(3,10): 147 | # ngrams=np.arange(2,i) 148 | # print(ngrams) 149 | # train_fold(0,ngrams) 150 | # # train_fold(0,[2,3,4]) 151 | -------------------------------------------------------------------------------- /src/Viral_identification/readme.md: -------------------------------------------------------------------------------- 1 | # Source code to train nucleic transformer to reproduce results in the paper for the viraminer dataset 2 | 3 | Dataset can be downloaded at https://github.com/NeuroCSUT/ViraMiner/tree/master/data/DNA_data 4 | 5 | Download fullset_test.csv, fullset_train.csv, and fullset_validation.csv and put them on directory above the folder where you plan to run training (their paths should be ../fullset_test.csv etc) 6 | 7 | To run training: ```./run.sh``` 8 | 9 | You might need to lower the batch size depending on what GPU you have. 
If you run into memory error with cuda, lower --batch_size in run.sh 10 | 11 | To check results on the test set: ```./evaluate_test.sh``` 12 | 13 | Test results will be saved in a pickle file named test_results.p, and the AUC score will be printed out to test_score.txt 14 | -------------------------------------------------------------------------------- /src/Viral_identification/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python train.py --gpu_id 0 --kmer_aggregation --nmute 40 --epochs 60 --nlayers 6 \ 5 | --batch_size 128 --kmers 13 --lr_scale 0.1 --ninp 512 --nhid 2048 --num_workers 8 6 | -------------------------------------------------------------------------------- /src/Viral_identification/test.sh: -------------------------------------------------------------------------------- 1 | python evaluate_test.py --gpu_id 0,1 --kmer_aggregation --nmute 20 --epochs 100 --nlayers 6 \ 2 | --batch_size 128 --kmers 13 --lr_scale 0.1 --ninp 512 --nhid 2048 3 | -------------------------------------------------------------------------------- /src/Viral_identification/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import time 5 | from Functions import * 6 | from Dataset import * 7 | from Network import * 8 | from LrScheduler import * 9 | import Metrics 10 | from Logger import CSVLogger 11 | import argparse 12 | try: 13 | #from apex.parallel import DistributedDataParallel as DDP 14 | from apex.fp16_utils import * 15 | from apex import amp, optimizers 16 | from apex.multi_tensor_apply import multi_tensor_applier 17 | except ImportError: 18 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.") 19 | 20 | 21 | def get_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--gpu_id', type=str, default='0', help='which gpu to use') 24 | parser.add_argument('--path', type=str, default='../', help='path of csv file with DNA sequences and labels') 25 | parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train') 26 | parser.add_argument('--batch_size', type=int, default=24, help='size of each batch during training') 27 | parser.add_argument('--weight_decay', type=float, default=0, help='weight dacay used in optimizer') 28 | parser.add_argument('--ntoken', type=int, default=4, help='number of tokens to represent DNA nucleotides (should always be 4)') 29 | parser.add_argument('--nclass', type=int, default=2, help='number of classes from the linear decoder') 30 | parser.add_argument('--ninp', type=int, default=512, help='ninp for transformer encoder') 31 | parser.add_argument('--nhead', type=int, default=8, help='nhead for transformer encoder') 32 | parser.add_argument('--nhid', type=int, default=2048, help='nhid for transformer encoder') 33 | parser.add_argument('--nlayers', type=int, default=6, help='nlayers for transformer encoder') 34 | parser.add_argument('--save_freq', type=int, default=1, help='saving checkpoints per save_freq epochs') 35 | parser.add_argument('--dropout', type=float, default=.1, help='transformer dropout') 36 | parser.add_argument('--warmup_steps', type=int, default=3200, help='training schedule warmup steps') 37 | parser.add_argument('--lr_scale', type=float, default=0.1, help='learning rate scale') 38 | parser.add_argument('--nmute', type=int, default=18, help='number of mutations during training') 39 | 
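# --kmers accepts one or more k-mer sizes to be aggregated by the model;
# run.sh overrides the default [2,3,4,5,6] with a single 13-mer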
parser.add_argument('--kmers', type=int, nargs='+', default=[2,3,4,5,6], help='k-mers to be aggregated') 40 | #parser.add_argument('--kmer_aggregation', type=bool, default=True, help='k-mers to be aggregated') 41 | parser.add_argument('--kmer_aggregation', dest='kmer_aggregation', action='store_true') 42 | parser.add_argument('--no_kmer_aggregation', dest='kmer_aggregation', action='store_false') 43 | parser.set_defaults(kmer_aggregation=True) 44 | parser.add_argument('--nfolds', type=int, default=5, help='number of cross validation folds') 45 | parser.add_argument('--fold', type=int, default=0, help='which fold to train') 46 | parser.add_argument('--val_freq', type=int, default=1, help='which fold to train') 47 | parser.add_argument('--num_workers', type=int, default=1, help='num_workers') 48 | opts = parser.parse_args() 49 | return opts 50 | 51 | def train_fold(): 52 | 53 | opts=get_args() 54 | seed_everything(2020) 55 | #gpu selection 56 | os.environ["CUDA_VISIBLE_DEVICES"] = opts.gpu_id 57 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 58 | 59 | train_df=pd.read_csv(os.path.join("..","fullset_train.csv")) 60 | val_df=pd.read_csv(os.path.join("..","fullset_validation.csv")) 61 | 62 | dataset=ViraminerDataset(train_df.iloc[:,1],train_df.iloc[:,2]) 63 | dataloader=torch.utils.data.DataLoader(dataset,batch_size=opts.batch_size,shuffle=True,num_workers=opts.num_workers) 64 | val_dataset=ViraminerDataset(val_df.iloc[:,1],val_df.iloc[:,2]) 65 | val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=opts.batch_size*2,shuffle=False) 66 | 67 | #exit() 68 | #lr=0 69 | 70 | #checkpointing 71 | checkpoints_folder='checkpoints_fold{}'.format((opts.fold)) 72 | csv_file='log_fold{}.csv'.format((opts.fold)) 73 | columns=['epoch','train_acc', 74 | 'val_loss','val_auc','val_acc','val_sens','val_spec'] 75 | logger=CSVLogger(columns,csv_file) 76 | 77 | #build model and logger 78 | model=NucleicTransformer(opts.ntoken, opts.nclass, opts.ninp, opts.nhead, opts.nhid, 79 | opts.nlayers, opts.kmer_aggregation, kmers=opts.kmers, 80 | dropout=opts.dropout).to(device) 81 | optimizer=torch.optim.Adam(model.parameters(), weight_decay=opts.weight_decay) 82 | criterion=nn.CrossEntropyLoss(reduction='none') 83 | lr_schedule=lr_AIAYN(optimizer,opts.ninp,opts.warmup_steps,opts.lr_scale) 84 | # Initialization 85 | opt_level = 'O1' 86 | model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) 87 | model = nn.DataParallel(model) 88 | softmax = nn.Softmax(dim=1) 89 | 90 | pytorch_total_params = sum(p.numel() for p in model.parameters()) 91 | print('Total number of paramters: {}'.format(pytorch_total_params)) 92 | 93 | print("Starting training for fold {}/{}".format(opts.fold,opts.nfolds)) 94 | #training loop 95 | for epoch in range(opts.epochs): 96 | model.train(True) 97 | t=time.time() 98 | total_loss=0 99 | optimizer.zero_grad() 100 | total_steps=len(dataloader) 101 | for step, data in enumerate(dataloader): 102 | #for step in range(1): 103 | lr=lr_schedule.step() 104 | src=data['data'] 105 | labels=data['labels'].to(device) 106 | mutated_sequence=mutate_dna_sequence(src,opts.nmute).to(device) 107 | output=model(mutated_sequence) 108 | loss_weight=torch.ones(len(output),device=device) 109 | loss=torch.mean(criterion(output,labels)) 110 | 111 | 112 | with amp.scale_loss(loss, optimizer) as scaled_loss: 113 | scaled_loss.backward() 114 | torch.nn.utils.clip_grad_norm_(model.parameters(), 1) 115 | optimizer.step() 116 | optimizer.zero_grad() 117 | total_loss+=loss 118 | 
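# progress is reported in place (end='\r') once per step: epoch, step, the
# running mean loss total_loss/(step+1), the current learning rate and elapsed time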
print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.3f} Lr:{:.6f} Time: {:.1f}" 119 | .format(epoch+1, opts.epochs, step+1, total_steps, total_loss/(step+1) , lr,time.time()-t),end='\r',flush=True) #total_loss/(step+1) 120 | #break 121 | print('') 122 | 123 | train_loss=total_loss/(step+1) 124 | 125 | if (epoch+1)%opts.val_freq==0: 126 | val_loss,auc,val_acc,val_sens,val_spec=validate(model,device,val_dataloader,batch_size=opts.batch_size*2) 127 | print("Epoch {} train loss: {}".format(epoch+1,train_loss)) 128 | 129 | to_log=[epoch+1,train_loss,val_loss,auc,val_acc,val_sens,val_spec] 130 | logger.log(to_log) 131 | 132 | 133 | if (epoch+1)%opts.save_freq==0: 134 | save_weights(model,optimizer,epoch,checkpoints_folder) 135 | 136 | 137 | get_best_weights_from_fold(opts.fold) 138 | 139 | train_fold() 140 | --------------------------------------------------------------------------------