├── prismnet ├── utils │ ├── __init__.py │ ├── acgu.npz │ ├── xprint.py │ ├── visualize.py │ ├── metrics.py │ └── datautils.py ├── __init__.py ├── engine │ ├── __init__.py │ └── train_loop.py ├── model │ ├── __init__.py │ ├── se.py │ ├── resnet.py │ ├── utils.py │ ├── smoothgrad.py │ └── PrismNet.py └── loader.py ├── tools ├── .DS_Store ├── gdata_bin.sh ├── generate_dataset.py └── main.py ├── data └── TIA1_Hela.h5 ├── .gitignore ├── requirements.txt ├── exp ├── prismnet │ ├── eval.sh │ ├── har.sh │ ├── infer.sh │ ├── saliency.sh │ ├── saliencyimg.sh │ ├── train.sh │ ├── saliencyimg_infer.sh │ └── train_all.sh └── logistic_reg │ ├── run.sh │ ├── gdata.py │ └── main.py ├── setup.py ├── LICENSE ├── motif_construct ├── motif_sig.R └── saliency_motif.pl └── README.md /prismnet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .xprint import log_print -------------------------------------------------------------------------------- /tools/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuixu/PrismNet/HEAD/tools/.DS_Store -------------------------------------------------------------------------------- /data/TIA1_Hela.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuixu/PrismNet/HEAD/data/TIA1_Hela.h5 -------------------------------------------------------------------------------- /prismnet/utils/acgu.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuixu/PrismNet/HEAD/prismnet/utils/acgu.npz -------------------------------------------------------------------------------- /prismnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .engine import * 2 | from .utils import log_print 3 | from .model import * 4 | -------------------------------------------------------------------------------- /prismnet/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .train_loop import train, validate, inference, compute_saliency, compute_saliency_img, compute_high_attention_region 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .vscode 3 | build 4 | dist 5 | prismnet.egg-info 6 | out 7 | log.txt 8 | events.out* 9 | *.pth 10 | .DS_Store 11 | *.pdf -------------------------------------------------------------------------------- /prismnet/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .PrismNet import PrismNet, PrismNet_large 2 | from .utils import param_num 3 | from .smoothgrad import GuidedBackpropSmoothGrad -------------------------------------------------------------------------------- /tools/gdata_bin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | d=clip_data 3 | for p in `cat data/${d}/all.list` 4 | do 5 | python -u tools/generate_dataset.py $p 1 5 data/$d 6 | done 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy==1.1.0 2 | termcolor 3 | h5py 4 | scikit-learn>=0.19.1 5 | torch==1.1.0 6 | tensorboardX 7 | tqdm>=4.28.1 8 | 
matplotlib>=3.0.2 9 | einops 10 | pandas 11 | -------------------------------------------------------------------------------- /exp/prismnet/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | d=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --eval \ 14 | --data_dir data/$d \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:5}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/har.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | infer_file=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --har \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:6}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | 8 | infer_file=$2 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --infer \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:6}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/saliency.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | infer_file=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --saliency \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:5}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/saliencyimg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | infer_file=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --saliency_img \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:5}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | d=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --train \ 13 | --eval \ 14 | --lr 0.001 \ 15 | --data_dir data/$d \ 16 | --p_name $p\ 17 | --out_dir $work_path \ 18 | --exp_name $exp\ 19 | ${@:5} 20 | 21 | #| tee $work_path/out/log.txt 22 | -------------------------------------------------------------------------------- /prismnet/utils/xprint.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | from termcolor import cprint 3 | except ImportError: 4 | cprint = None 5 | 6 | try: 7 | from pycrayon import CrayonClient 8 | except ImportError: 9 | CrayonClient = None 10 | 11 | def log_print(text, color=None, on_color=None, attrs=None): 12 | if cprint is not None: 13 | cprint(text, color=color, on_color=on_color, attrs=attrs) 14 | else: 15 | print(text) 16 | 17 | 18 | -------------------------------------------------------------------------------- /exp/prismnet/saliencyimg_infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | d=$2 8 | infer=$3 9 | 10 | exp=$name 11 | 12 | python -u tools/main.py \ 13 | --load_best \ 14 | --saliency_img \ 15 | --infer \ 16 | --infer_file $infer \ 17 | --data_dir data/$d \ 18 | --p_name $p\ 19 | --out_dir $work_path \ 20 | --exp_name $exp\ 21 | ${@:5}| tee $work_path/out/log.txt 22 | -------------------------------------------------------------------------------- /exp/logistic_reg/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | 4 | p=HEK293_RBP_HL_bind_matrix_total 5 | p2=$p 6 | la=10 7 | 8 | # part=Test 9 | mkdir $work_path/out 10 | mkdir $work_path/out/models 11 | mkdir $work_path/out/log 12 | 13 | train_data='data/halflife/'$p'.train.npz' 14 | test_data='data/halflife/'$p'.test.npz' 15 | pred_data='data/halflife/'${p2}'.test.npz' 16 | # CUDA_VISIBLE_DEVICE="0" 17 | python -u exp/logistic_reg/main.py \ 18 | --train_data $train_data \ 19 | --test_data $test_data \ 20 | --pred_data $pred_data \ 21 | --model_path $work_path/out/models/${p}_best.model \ 22 | --lam $la \ 23 | ${@:4}| tee -a $work_path/out/log/${p}.txt 24 | -------------------------------------------------------------------------------- /prismnet/model/se.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class SEBlock(nn.Module): 6 | def __init__(self, channel, reduction=2): 7 | super(SEBlock, self).__init__() 8 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 9 | self.fc = nn.Sequential( 10 | nn.Linear(channel, channel // reduction), 11 | nn.ReLU(inplace=True), 12 | nn.Linear(channel // reduction, channel), 13 | nn.Sigmoid() 14 | ) 15 | 16 | def forward(self, x): 17 | b, c, _, _ = x.size() 18 | y = self.avg_pool(x).view(b, c) 19 | y = self.fc(y).view(b, c, 1, 1) 20 | return y -------------------------------------------------------------------------------- /exp/prismnet/train_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | da=clip_data 5 | 6 | if [ ! -d $work_path/out ];then 7 | mkdir $work_path/out 8 | mkdir $work_path/out/log 9 | fi 10 | # N threads according to your GPU 11 | SEND_THREAD_NUM=2 12 | 13 | ########################### 14 | 15 | tmp_fifofile="/tmp/$$.fifo" 16 | mkfifo "$tmp_fifofile" 17 | exec 6<>"$tmp_fifofile" 18 | for ((i=0;i<$SEND_THREAD_NUM;i++));do 19 | echo 20 | done >&6 21 | 22 | 23 | for p in `cat data/${da}/all.list` 24 | do 25 | read -u6 26 | { 27 | id=${p}_PrismNet_pu 28 | ff=$work_path/out/evals/${id}.metrics 29 | lg=$work_path/out/log/${id}.log 30 | if [ ! 
-f $ff ] ; then 31 | echo ${p}" ===" 32 | $srun $work_path/train.sh $p $da > $lg 2>&1 33 | fi 34 | sleep 1 35 | echo >&6 36 | } & 37 | pid=$! 38 | echo $pid 39 | done 40 | 41 | wait 42 | exec 6>&- 43 | exit 0 44 | 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | # Author: Kui XU 4 | # Created Time : Mon 3 Jul 2017 09:42:31 PM CST 5 | # File Name: setup.py 6 | # Description: 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | 11 | with open('requirements.txt') as f: 12 | requirements = f.read().splitlines() 13 | 14 | setup(name='prismnet', 15 | version='0.1.1', 16 | description='PrismNet', 17 | packages=find_packages(), 18 | 19 | author='Kui XU', 20 | author_email='kuixu.cs@gmail.com', 21 | url='https://github.com/kuixu/PrismNet', 22 | install_requires=requirements, 23 | python_requires='>=3.6', 24 | 25 | classifiers=[ 26 | 'Development Status :: 4 - Beta', 27 | 'Intended Audience :: Science/Research', 28 | 'License :: OSI Approved :: MIT License', 29 | 'Programming Language :: Python :: 3.6', 30 | 'Operating System :: MacOS :: MacOS X', 31 | 'Operating System :: Microsoft :: Windows', 32 | 'Operating System :: POSIX :: Linux', 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 PrismNet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /motif_construct/motif_sig.R: -------------------------------------------------------------------------------- 1 | ########################################################## 2 | #This R script is to analysis the motif enrichment and cluster 3 | ########################################################## 4 | 5 | Args <- commandArgs() 6 | in_file = Args[6] 7 | out_file = Args[7] 8 | 9 | 10 | human_motif<-read.table(file=in_file, header = F, sep = "\t") 11 | 12 | head(human_motif) 13 | 14 | i = 1 15 | Pvalue <- c() 16 | Odd_ratio <- c() 17 | FDR <- c() 18 | for(i in 1:dim(human_motif)[1]){ 19 | Sum = human_motif[i,2]/human_motif[i,3] 20 | compare<-matrix(floor(c(human_motif[i,2],Sum*0.1,Sum*(1-human_motif[i,3]),Sum*0.9)),nr=2,dimnames= 21 | list(c("sites","not sites"),c("motif","random"))) 22 | fisher_test <- fisher.test(compare,alternative = "greater") 23 | Pvalue <- c(Pvalue, fisher_test$p.value) 24 | Odd_ratio <- c(Odd_ratio, fisher_test$estimate) 25 | } 26 | 27 | FDR <- p.adjust(Pvalue, method = "fdr") 28 | 29 | colnames(human_motif) <- c("motif", "number", "percent") 30 | data1 <- cbind(human_motif, Odd_ratio) 31 | data1 <- cbind(data1, Pvalue) 32 | data1 <- cbind(data1, FDR) 33 | write.table(data1, file = out_file, row.names = F, col.names = T, sep = "\t") 34 | -------------------------------------------------------------------------------- /exp/logistic_reg/gdata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pandas as pd 4 | 5 | def g_rand_data(N=5000, M=128): 6 | label = np.random.randint(2,size=N) 7 | data = np.random.rand(N,M) 8 | line = "" 9 | for i in range(N): 10 | datastr = " ".join(["{:d}:{:.3f}".format(j, data[i,j]) for j in range(M) ]) 11 | line += "{:d} {:s}\n".format(label[i], datastr) 12 | print(line) 13 | 14 | def save_file(data, filepath): 15 | print("Y min,max:", data[:,0].min(), data[:,0].max()) 16 | print("X min,max:", data[:,1:].min(), data[:,1:].max()) 17 | print("p/n:", data[:,0].sum()/data.shape[0],) 18 | N,M = data.shape 19 | with open(filepath,"w") as f: 20 | line = "" 21 | for i in range(N): 22 | datastr = " ".join(["{:d}:{:f}".format(j, data[i,j]) for j in range(1, M) ]) 23 | line += "{:d} {:s}\n".format(int(data[i,0]), datastr) 24 | print(line, file=f) 25 | def concat_data(filepath): 26 | raw_data = pd.read_csv(filepath,sep="\t") 27 | data=raw_data.to_numpy()[:,1:] 28 | 29 | # import pdb; pdb.set_trace() 30 | 31 | 32 | t_path = filepath.replace(".txt",".train") 33 | e_path = filepath.replace(".txt",".test") 34 | 35 | # import pdb; pdb.set_trace() 36 | # data = abs(np.concatenate((pos_samples, neg_samples))) 37 | # data = np.concatenate((pos_samples, neg_samples)) 38 | print("min,max:", data[:,1:].min(), data[:,1:].max()) 39 | dmin = data[:,0].min() 40 | dwid = data[:,0].max() - dmin 41 | # data[:,0] = (data[:,0] - dmin)/dwid 42 | # norm 43 | data[:,1:] = (data[:,1:] - data[:,1:].mean())/data[:,1:].std() 44 | N,M = data.shape 45 | perm =np.random.permutation(N) 46 | t_N = int(0.8*N) 47 | 48 | 49 | # save_file(data[perm][:t_N,:], t_path) 50 | np.savez_compressed(t_path+".npz", x=data[perm][:t_N,1:], y=data[perm][:t_N,0],dmin=dmin,dwid=dwid) 51 | print("Training file saved into:", t_path,",", t_N," samples.") 52 | # save_file(data[perm][t_N:,:], e_path) 53 | np.savez_compressed(e_path+".npz", x=data[perm][t_N:,1:], y=data[perm][t_N:,0],dmin=dmin,dwid=dwid) 54 | print("Testing file saved into:", 
e_path,",", N-t_N," samples.") 55 | 56 | 57 | 58 | 59 | if __name__ == "__main__": 60 | # g_rand_data() 61 | import glob 62 | for f in glob.glob("data/regu6/*.txt"): 63 | print(f) 64 | try: 65 | concat_data(f) 66 | except TypeError: 67 | pass 68 | 69 | -------------------------------------------------------------------------------- /tools/generate_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | import warnings 5 | warnings.filterwarnings('ignore',category=FutureWarning) 6 | warnings.filterwarnings('ignore',category=RuntimeWarning) 7 | import os, sys, h5py 8 | import pandas as pd 9 | import numpy as np 10 | np.random.seed(100) 11 | 12 | from prismnet.utils import datautils 13 | 14 | def read_csv(path): 15 | # load sequences 16 | df = pd.read_csv(path, sep='\t', header=None) 17 | df = df.loc[df[0]!="Type"] 18 | 19 | Type = 0 20 | loc = 1 21 | Seq = 2 22 | Str = 3 23 | Score = 4 24 | label = 5 25 | 26 | rnac_set = df[Type].to_numpy() 27 | sequences = df[Seq].to_numpy() 28 | structs = df[Str].to_numpy() 29 | targets = df[Score].to_numpy().astype(np.float32).reshape(-1,1) 30 | return sequences, structs, targets 31 | 32 | max_length = 101 33 | only_pos = False 34 | binary = True 35 | 36 | name = sys.argv[1] 37 | is_bin = sys.argv[2] 38 | in_ver = int(sys.argv[3]) 39 | data_path = sys.argv[4] 40 | 41 | print(name) 42 | 43 | 44 | 45 | outfile = name+'.h5' 46 | sequences, structs, targets = read_csv(os.path.join(data_path, name+'.tsv')) 47 | 48 | # combine inpute data 49 | one_hot = datautils.convert_one_hot(sequences, max_length) 50 | structure = np.zeros((len(structs), in_ver-4, max_length)) 51 | for i in range(len(structs)): 52 | struct = structs[i].split(',') 53 | ti = [float(t) for t in struct] 54 | ti = np.array(ti).reshape(1,-1) 55 | structure[i] = np.concatenate([ti], axis=0) 56 | 57 | data = np.concatenate([one_hot, structure], axis=1) 58 | 59 | # preprare targets 60 | if is_bin=="0": 61 | targets = datautils.rescale(targets) 62 | elif is_bin=="1": 63 | targets[targets<0] = 0 64 | targets[targets>0] = 1 65 | 66 | 67 | # split dataset into train, cross-validation, and test set 68 | train, test = datautils.split_dataset(data, targets, valid_frac=0.2) 69 | 70 | target_data_type = np.int32 if is_bin=="1" else np.float32 71 | # save dataset 72 | save_path = os.path.join(data_path, outfile) 73 | print(name, data.shape, len(train[0]), len(test[0]), test[1].max(), test[1].min()) 74 | # print('saving dataset: ', save_path) 75 | with h5py.File(save_path, "w") as f: 76 | dset = f.create_dataset("X_train", data=train[0].astype(np.float32), compression="gzip") 77 | dset = f.create_dataset("Y_train", data=train[1].astype(target_data_type), compression="gzip") 78 | dset = f.create_dataset("X_test", data=test[0].astype(np.float32), compression="gzip") 79 | dset = f.create_dataset("Y_test", data=test[1].astype(target_data_type), compression="gzip") 80 | -------------------------------------------------------------------------------- /prismnet/model/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ResidualBlock1D(nn.Module): 7 | 8 | def __init__(self, planes, downsample=True): 9 | super(ResidualBlock1D, self).__init__() 10 | self.c1 = nn.Conv1d(planes, planes, kernel_size=1, stride=1, bias=False) 11 | 
self.b1 = nn.BatchNorm1d(planes) 12 | self.c2 = nn.Conv1d(planes, planes*2, kernel_size=11, stride=1, 13 | padding=5, bias=False) 14 | self.b2 = nn.BatchNorm1d(planes*2) 15 | self.c3 = nn.Conv1d(planes*2, planes*8, kernel_size=1, stride=1, bias=False) 16 | self.b3 = nn.BatchNorm1d(planes * 8) 17 | self.downsample = nn.Sequential( 18 | nn.Conv1d(planes, planes*8, kernel_size=1, stride=1, bias=False), 19 | nn.BatchNorm1d(planes*8), 20 | ) 21 | self.relu = nn.ReLU(inplace=True) 22 | 23 | def forward(self, x): 24 | identity = x 25 | 26 | out = self.c1(x) 27 | out = self.b1(out) 28 | out = self.relu(out) 29 | 30 | out = self.c2(out) 31 | out = self.b2(out) 32 | out = self.relu(out) 33 | 34 | out = self.c3(out) 35 | out = self.b3(out) 36 | 37 | if self.downsample: 38 | identity = self.downsample(x) 39 | 40 | out += identity 41 | out = self.relu(out) 42 | 43 | return out 44 | 45 | class ResidualBlock2D(nn.Module): 46 | 47 | def __init__(self, planes, kernel_size=(11,5), padding=(5,2), downsample=True): 48 | super(ResidualBlock2D, self).__init__() 49 | self.c1 = nn.Conv2d(planes, planes, kernel_size=1, stride=1, bias=False) 50 | self.b1 = nn.BatchNorm2d(planes) 51 | self.c2 = nn.Conv2d(planes, planes*2, kernel_size=kernel_size, stride=1, 52 | padding=padding, bias=False) 53 | self.b2 = nn.BatchNorm2d(planes*2) 54 | self.c3 = nn.Conv2d(planes*2, planes*4, kernel_size=1, stride=1, bias=False) 55 | self.b3 = nn.BatchNorm2d(planes * 4) 56 | self.downsample = nn.Sequential( 57 | nn.Conv2d(planes, planes*4, kernel_size=1, stride=1, bias=False), 58 | nn.BatchNorm2d(planes*4), 59 | ) 60 | self.relu = nn.ReLU(inplace=True) 61 | 62 | def forward(self, x): 63 | identity = x 64 | 65 | out = self.c1(x) 66 | out = self.b1(out) 67 | out = self.relu(out) 68 | 69 | out = self.c2(out) 70 | out = self.b2(out) 71 | out = self.relu(out) 72 | 73 | out = self.c3(out) 74 | out = self.b3(out) 75 | 76 | if self.downsample: 77 | identity = self.downsample(x) 78 | out += identity 79 | out = self.relu(out) 80 | 81 | return out 82 | 83 | 84 | -------------------------------------------------------------------------------- /prismnet/loader.py: -------------------------------------------------------------------------------- 1 | import os, sys, pdb, h5py 2 | import os.path 3 | import numpy as np 4 | import torch 5 | import torch.utils.data 6 | 7 | class SeqicSHAPE(torch.utils.data.Dataset): 8 | def __init__(self, data_path, is_test=False, is_infer=False, use_structure=True): 9 | """data loader 10 | 11 | Args: 12 | data_path ([str]): h5 file path 13 | is_test (bool, optional): testset or not. Defaults to False. 
14 | """ 15 | if is_infer: 16 | self.dataset = self.__load_infer_data__(data_path, use_structure=use_structure) 17 | print("infer data: ", self.__len__()," use_structure: ", use_structure) 18 | else: 19 | dataset = h5py.File(data_path, 'r') 20 | X_train = np.array(dataset['X_train']).astype(np.float32) 21 | Y_train = np.array(dataset['Y_train']).astype(np.int32) 22 | X_test = np.array(dataset['X_test']).astype(np.float32) 23 | Y_test = np.array(dataset['Y_test']).astype(np.int32) 24 | if len(Y_train.shape) == 1: 25 | Y_train = np.expand_dims(Y_train, axis=1) 26 | Y_test = np.expand_dims(Y_test, axis=1) 27 | X_train = np.expand_dims(X_train, axis=3).transpose([0, 3, 2, 1]) 28 | X_test = np.expand_dims(X_test, axis=3).transpose([0, 3, 2, 1]) 29 | 30 | train = {'inputs': X_train, 'targets': Y_train} 31 | test = {'inputs': X_test, 'targets': Y_test} 32 | 33 | labels, nums = np.unique(Y_train,return_counts=True) 34 | print("train:", labels, nums) 35 | labels, nums = np.unique(Y_test,return_counts=True) 36 | print("test:", labels, nums) 37 | 38 | train = self.__prepare_data__(train) 39 | test = self.__prepare_data__(test) 40 | 41 | if is_test: 42 | self.dataset = test 43 | else: 44 | self.dataset = train 45 | 46 | 47 | 48 | def __load_infer_data__(self, data_path, use_structure=True): 49 | from prismnet.utils import datautils 50 | dataset = datautils.load_testset_txt(data_path, use_structure=use_structure, seq_length=101) 51 | return dataset 52 | 53 | 54 | def __prepare_data__(self, data): 55 | inputs = data['inputs'][:,:,:,:4] 56 | structure = data['inputs'][:,:,:,4:] 57 | structure = np.expand_dims(structure[:,:,:,0], axis=3) 58 | inputs = np.concatenate([inputs, structure], axis=3) 59 | data['inputs'] = inputs 60 | return data 61 | 62 | def __to_sequence__(self, x): 63 | x1 = np.zeros_like(x[0,:,:1]) 64 | for i in range(x1.shape[0]): 65 | # import pdb; pdb.set_trace() 66 | x1[i] = np.argmax(x[0,i,:4]) 67 | # import pdb; pdb.set_trace() 68 | return x1 69 | 70 | def __getitem__(self, index): 71 | """ 72 | Args: 73 | index (int): Index 74 | 75 | Returns: 76 | tuple: (image, target) where target is index of the target class. 
77 | """ 78 | x = self.dataset['inputs'][index] 79 | # x = self.__to_sequence__(x) 80 | y = self.dataset['targets'][index] 81 | return x, y 82 | 83 | 84 | def __len__(self): 85 | return len(self.dataset['inputs']) 86 | 87 | -------------------------------------------------------------------------------- /prismnet/model/utils.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | 4 | import torch 5 | from sklearn import metrics 6 | 7 | import torch 8 | from torch.optim.lr_scheduler import _LRScheduler 9 | from torch.optim.lr_scheduler import ReduceLROnPlateau 10 | 11 | def param_num(model): 12 | num_param0 = sum(p.numel() for p in model.parameters()) 13 | num_param1 = sum(p.numel() for p in model.parameters() if p.requires_grad) 14 | print("===========================") 15 | print("Total params:", num_param0) 16 | print("Trainable params:", num_param1) 17 | print("Non-trainable params:", num_param0-num_param1) 18 | print("===========================") 19 | 20 | def compute_acc_auc(output, y): 21 | y1 = y.to(device='cpu', dtype=torch.long).numpy() 22 | p_class = (output>=0.5).to(device='cpu').data.numpy() 23 | prob = output.to(device='cpu').data.numpy() 24 | acc = metrics.accuracy_score(y1, p_class) 25 | auc = 0.5 26 | try: 27 | auc = metrics.roc_auc_score(y1, prob) 28 | except Exception as e: 29 | pass 30 | 31 | return acc, auc 32 | 33 | class GradualWarmupScheduler(_LRScheduler): 34 | """ Gradually warm-up(increasing) learning rate in optimizer. 35 | Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'. 36 | Args: 37 | optimizer (Optimizer): Wrapped optimizer. 38 | multiplier: target learning rate = base lr * multiplier 39 | total_epoch: target learning rate is reached at total_epoch, gradually 40 | after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau) 41 | """ 42 | 43 | def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None): 44 | self.multiplier = multiplier 45 | if self.multiplier <= 1.: 46 | raise ValueError('multiplier should be greater than 1.') 47 | self.total_epoch = total_epoch 48 | self.after_scheduler = after_scheduler 49 | self.finished = False 50 | super().__init__(optimizer) 51 | 52 | def get_lr(self): 53 | if self.last_epoch > self.total_epoch: 54 | if self.after_scheduler: 55 | if not self.finished: 56 | self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs] 57 | self.finished = True 58 | return self.after_scheduler.get_lr() 59 | return [base_lr * self.multiplier for base_lr in self.base_lrs] 60 | 61 | return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs] 62 | 63 | def step_ReduceLROnPlateau(self, metrics, epoch=None): 64 | if epoch is None: 65 | epoch = self.last_epoch + 1 66 | self.last_epoch = epoch if epoch != 0 else 1 # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning 67 | if self.last_epoch <= self.total_epoch: 68 | warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) 
for base_lr in self.base_lrs] 69 | for param_group, lr in zip(self.optimizer.param_groups, warmup_lr): 70 | param_group['lr'] = lr 71 | else: 72 | if epoch is None: 73 | self.after_scheduler.step(metrics, None) 74 | else: 75 | self.after_scheduler.step(metrics, epoch - self.total_epoch) 76 | 77 | def step(self, epoch=None, metrics=None): 78 | if type(self.after_scheduler) != ReduceLROnPlateau: 79 | if self.finished and self.after_scheduler: 80 | if epoch is None: 81 | self.after_scheduler.step(None) 82 | else: 83 | self.after_scheduler.step(epoch - self.total_epoch) 84 | else: 85 | return super(GradualWarmupScheduler, self).step(epoch) 86 | else: 87 | self.step_ReduceLROnPlateau(metrics, epoch) -------------------------------------------------------------------------------- /prismnet/utils/visualize.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | import matplotlib as mpl 4 | mpl.use("pdf") 5 | import matplotlib.pyplot as plt 6 | import matplotlib.gridspec as gridspec 7 | from scipy.misc import imresize 8 | 9 | package_directory = os.path.dirname(os.path.abspath(__file__)) 10 | acgu_path = os.path.join(package_directory,'acgu.npz') 11 | chars = np.load(acgu_path,allow_pickle=True)['data'] 12 | 13 | def normalize_pwm(pwm, factor=None, MAX=None): 14 | if MAX is None: 15 | MAX = np.max(np.abs(pwm)) 16 | pwm = pwm/MAX 17 | if factor: 18 | pwm = np.exp(pwm*factor) 19 | norm = np.outer(np.ones(pwm.shape[0]), np.sum(np.abs(pwm), axis=0)) 20 | return pwm/norm 21 | 22 | def get_nt_height(pwm, height, norm): 23 | 24 | def entropy(p): 25 | s = 0 26 | for i in range(len(p)): 27 | if p[i] > 0: 28 | s -= p[i]*np.log2(p[i]) 29 | return s 30 | 31 | num_nt, num_seq = pwm.shape 32 | heights = np.zeros((num_nt,num_seq)) 33 | for i in range(num_seq): 34 | if norm == 1: 35 | total_height = height 36 | else: 37 | total_height = (np.log2(num_nt) - entropy(pwm[:, i]))*height 38 | 39 | heights[:,i] = np.floor(pwm[:,i]*np.minimum(total_height, height*2)) 40 | 41 | return heights.astype(int) 42 | 43 | def seq_logo(pwm, height=30, nt_width=10, norm=0, alphabet='rna', colormap='standard'): 44 | 45 | heights = get_nt_height(pwm, height, norm) 46 | num_nt, num_seq = pwm.shape 47 | width = np.ceil(nt_width*num_seq).astype(int) 48 | 49 | max_height = height*2 50 | logo = np.ones((max_height, width, 3)).astype(int)*255 51 | for i in range(num_seq): 52 | nt_height = np.sort(heights[:,i]) 53 | index = np.argsort(heights[:,i]) 54 | remaining_height = np.sum(heights[:,i]) 55 | offset = max_height-remaining_height 56 | 57 | for j in range(num_nt): 58 | if nt_height[j] <=0 : 59 | continue 60 | # resized dimensions of image 61 | nt_img = imresize(chars[index[j]], (nt_height[j], nt_width)) 62 | # determine location of image 63 | height_range = range(remaining_height-nt_height[j], remaining_height) 64 | width_range = range(i*nt_width, i*nt_width+nt_width) 65 | # 'annoying' way to broadcast resized nucleotide image 66 | if height_range: 67 | for k in range(3): 68 | for m in range(len(width_range)): 69 | logo[height_range+offset, width_range[m],k] = nt_img[:,m,k] 70 | 71 | remaining_height -= nt_height[j] 72 | 73 | return logo.astype(np.uint8) 74 | 75 | def plot_saliency(X, W, nt_width=100, norm_factor=3, str_null=None, outdir="results/"): 76 | # filter out zero-padding 77 | plot_index = np.where(np.sum(X[:4,:], axis=0)!=0)[0] 78 | num_nt = len(plot_index) 79 | trace_width = num_nt*nt_width 80 | trace_height = 400 81 | 82 | seq_str_mode = False 83 | if 
X.shape[0]>4: 84 | seq_str_mode = True 85 | assert str_null is not None, "Null region is not provided." 86 | 87 | # sequence logo 88 | img_seq_raw = seq_logo(X[:4, plot_index], height=nt_width, nt_width=nt_width) 89 | 90 | if seq_str_mode: 91 | # structure line 92 | str_raw = X[4, plot_index] 93 | if str_null.sum() > 0: 94 | str_raw[str_null.T==1] = -0.01 95 | 96 | line_str_raw = np.zeros(trace_width) 97 | for v in range(str_raw.shape[0]): 98 | line_str_raw[v*nt_width:(v+1)*nt_width] = (1-str_raw[v])*trace_height 99 | # i+=1 100 | 101 | # sequence saliency logo 102 | seq_sal = normalize_pwm(W[:4, plot_index], factor=norm_factor) 103 | img_seq_sal_logo = seq_logo(seq_sal, height=nt_width*5, nt_width=nt_width) 104 | img_seq_sal = imresize(W[:4, plot_index], size=(trace_height, trace_width)) 105 | 106 | if seq_str_mode: 107 | # structure saliency logo 108 | str_sal = W[4, plot_index].reshape(1,-1) 109 | img_str_sal = imresize(str_sal, size=(trace_height, trace_width)) 110 | 111 | # plot 112 | fig = plt.figure(figsize=(10.1,2)) 113 | gs = gridspec.GridSpec(nrows=4, ncols=1, height_ratios=[2.5, 1, 0.5, 1]) 114 | cmap_reversed = mpl.cm.get_cmap('jet') 115 | 116 | ax = fig.add_subplot(gs[0, 0]) 117 | ax.axis('off') 118 | ax.imshow(img_seq_sal_logo) 119 | plt.text(x=trace_width-400,y=10, s='PrismNet', fontsize=4) 120 | 121 | ax = fig.add_subplot(gs[1, 0]) 122 | ax.axis('off') 123 | ax.imshow(img_seq_sal, cmap=cmap_reversed) 124 | 125 | ax = fig.add_subplot(gs[2, 0]) 126 | ax.axis('off') 127 | ax.imshow(img_seq_raw) 128 | 129 | if seq_str_mode: 130 | ax = fig.add_subplot(gs[3, 0]) 131 | ax.axis('off') 132 | ax.imshow(img_str_sal, cmap=cmap_reversed) 133 | ax.plot(line_str_raw, '-', color='r', linewidth=1, scalex=False, scaley=False) 134 | 135 | # plot balck line to hide the -1(NULL structure score) 136 | x = (np.zeros(trace_width) + (1+0.01))*trace_height +1.5 137 | ax.plot(x, '-', color='white', linewidth=1.2, scalex=False, scaley=False) 138 | 139 | plt.subplots_adjust(wspace=0, hspace=0) 140 | 141 | # save figure 142 | filepath = outdir 143 | fig.savefig(filepath, format='pdf', dpi=300, bbox_inches='tight') 144 | plt.close('all') 145 | -------------------------------------------------------------------------------- /prismnet/model/smoothgrad.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Kui Xu, xukui.cs@gmail.com 4 | # 2019-02-25 5 | # ref smoothGrad 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import grad,Variable 11 | import numpy as np 12 | 13 | class SmoothGrad(object): 14 | def __init__(self, model, device='cpu', only_seq=False, train=False, 15 | x_stddev=0.015, t_stddev=0.015, nsamples=20, magnitude=2): 16 | self.model = model 17 | self.device = device 18 | self.train = train 19 | self.only_seq = only_seq 20 | self.x_stddev = x_stddev 21 | self.t_stddev = t_stddev 22 | self.nsamples = nsamples 23 | self.magnitude = magnitude 24 | self.features = model 25 | # import pdb; pdb.set_trace() 26 | 27 | def get_gradients(self, z, pred_label=None): 28 | self.model.eval() 29 | self.model.zero_grad() 30 | z = z.to(self.device) 31 | z.requires_grad=True 32 | output = self.model(z) 33 | output = torch.sigmoid(output) 34 | output.backward() 35 | return z.grad 36 | 37 | def get_smooth_gradients(self, z, y=None): 38 | return self.__call__(z, y) 39 | 40 | def __call__(self, z, y=None): 41 | """[summary] 42 | 43 | Args: 44 | z ([type]): [description] 45 | y ([type]): 
[description] 46 | x_stddev (float, optional): [description]. Defaults to 0.15. 47 | t_stddev (float, optional): [description]. Defaults to 0.15. 48 | nsamples (int, optional): [description]. Defaults to 20. 49 | magnitude (int, optional): magnitude:0,1,2; 0: original gradient, 1: absolute value of the gradient, 50 | 2: square value of the gradient. Defaults to 2. 51 | 52 | Returns: 53 | [type]: [description] 54 | """ 55 | 56 | # 1. for sequece 57 | x = z[:,:,:,:4] # .data.cpu() 58 | x_stddev = (self.x_stddev * (x.max()-x.min())).to(self.device).item() 59 | 60 | total_grad = torch.zeros(z.shape).to(self.device) 61 | x_noise = torch.zeros(x.shape).to(self.device) 62 | if not self.only_seq: 63 | # 2. for structure 64 | t = z[:,:,:,4:] #.data.cpu() 65 | t_stddev = (self.t_stddev * (t.max()-t.min())).to(self.device).item() 66 | #t_total_grad = torch.zeros(t.shape) 67 | t_noise = torch.zeros(t.shape).to(self.device) 68 | 69 | for i in range(self.nsamples): 70 | x_plus_noise = x + x_noise.zero_().normal_(0, x_stddev) 71 | if self.only_seq: 72 | z_plus_noise = x_plus_noise 73 | else: 74 | t_plus_noise = t + t_noise.zero_().normal_(0, t_stddev) 75 | z_plus_noise = torch.cat((x_plus_noise, t_plus_noise), dim=3) 76 | #print("z_plus_noise:",z_plus_noise.size()) 77 | grad = self.get_gradients(z_plus_noise, y) 78 | if self.magnitude == 1: 79 | total_grad += torch.abs(grad) 80 | elif self.magnitude == 2: 81 | total_grad += grad * grad 82 | 83 | # total_grad += grad * grad 84 | total_grad /= self.nsamples 85 | return total_grad 86 | 87 | def get_batch_gradients(self, X, Y=None): 88 | if Y is not None: 89 | assert len(X) == len(Y), "The size of input {} and target {} are not matched.".format(len(X), len(Y)) 90 | g = torch.zeros_like(X) 91 | for i in range(X.shape[0]): 92 | x = X[i:i+1] 93 | if Y is not None: 94 | y = Y[i:i+1] 95 | else: 96 | y = None 97 | g[i:i+1] = self.get_smooth_gradients(x, y) 98 | # g[i:i+1] = self.get_gradients(x, y) 99 | return g 100 | 101 | 102 | def generate_saliency(model, x, y=None, smooth=False, nsamples=2, stddev=0.15, only_seq=False, \ 103 | train=False): 104 | saliency = SmoothGrad(model, only_seq, train) 105 | x_grad = saliency.get_smooth_gradients(x, y, nsamples=nsamples, x_stddev=stddev, t_stddev=stddev) 106 | return x_grad 107 | 108 | 109 | 110 | class GuidedBackpropReLU(torch.autograd.Function): 111 | 112 | def __init__(self, inplace=False): 113 | super(GuidedBackpropReLU, self).__init__() 114 | self.inplace = inplace 115 | 116 | def forward(self, input): 117 | pos_mask = (input > 0).type_as(input) 118 | output = torch.addcmul( 119 | torch.zeros(input.size()).type_as(input), 120 | input, 121 | pos_mask) 122 | self.save_for_backward(input, output) 123 | return output 124 | 125 | def backward(self, grad_output): 126 | input, output = self.saved_tensors 127 | 128 | pos_mask_1 = (input > 0).type_as(grad_output) 129 | pos_mask_2 = (grad_output > 0).type_as(grad_output) 130 | grad_input = torch.addcmul( 131 | torch.zeros(input.size()).type_as(input), 132 | torch.addcmul( 133 | torch.zeros(input.size()).type_as(input), grad_output, pos_mask_1), 134 | pos_mask_2) 135 | 136 | return grad_input 137 | 138 | def __repr__(self): 139 | inplace_str = ', inplace' if self.inplace else '' 140 | return self.__class__.__name__ + ' (' \ 141 | + inplace_str + ')' 142 | 143 | class GuidedBackpropSmoothGrad(SmoothGrad): 144 | 145 | def __init__(self, model, device='cpu', only_seq=False, train=False, 146 | x_stddev=0.15, t_stddev=0.15, nsamples=20, magnitude=2): 147 | 
super(GuidedBackpropSmoothGrad, self).__init__( 148 | model, device, only_seq, train, x_stddev, t_stddev, nsamples, magnitude) 149 | for idx, module in self.features._modules.items(): 150 | if module.__class__.__name__ == 'ReLU': 151 | self.features._modules[idx] = GuidedBackpropReLU() 152 | 153 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PrismNet 2 | 3 | This is a [PyTorch](https://pytorch.org/) implementation of our paper: 4 | ## Predicting dynamic cellular protein-RNA interactions using deep learning and in vivo RNA structure 5 | Lei Sun*, Kui Xu*, Wenze Huang*, Yucheng T. Yang*, Pan Li, Lei Tang, Tuanlin Xiong, Qiangfeng Cliff Zhang 6 | 7 | *: indicates equal contribution. 8 | 9 | Cell Research Version: ([https://www.nature.com/articles/s41422-021-00476-y](https://www.nature.com/articles/s41422-021-00476-y)) 10 | 11 | bioRxiv preprint: ([https://www.biorxiv.org/content/10.1101/2020.05.05.078774v1](https://www.biorxiv.org/content/10.1101/2020.05.05.078774v1)) 12 | 13 | ![prismnet](https://github.com/kuixu/PrismNet/wiki/imgs/prismnet.png) 14 | 15 | 16 | 17 | ### Table of Contents 18 | - [Getting started](#getting-started) 19 | - [Datasets](#datasets) 20 | - [Usage](#usage) 21 | - [Copyright and License](#copyright-and-license) 22 | - [Reference](#reference) 23 | 24 | ## Getting started 25 | 26 | 27 | ### Requirements 28 | 29 | - Python 3.6 30 | - PyTorch 1.1.0, with NVIDIA CUDA support 31 | - pip 32 | 33 | ### Installation 34 | Clone the repository: 35 | 36 | ```bash 37 | git clone https://github.com/kuixu/PrismNet.git 38 | ``` 39 | Install packages: 40 | ```bash 41 | cd PrismNet 42 | pip install -r requirements.txt 43 | pip install -e . 44 | ``` 45 | 46 | ## Datasets 47 | 48 | ### Prepare the datasets 49 | 50 | Scripts and pipeline are in preparation; currently, we provide data for 172 samples in *.tsv format for training and testing PrismNet. 51 | 52 | ``` 53 | # Download data 54 | cd PrismNet/data 55 | wget https://zhanglabnet.oss-cn-beijing.aliyuncs.com/prismnet/data/clip_data.tgz 56 | tar zxvf clip_data.tgz 57 | 58 | # Generate training and validation set for binary classification 59 | cd PrismNet 60 | tools/gdata_bin.sh 61 | ``` 62 | 63 | 64 | ## Usage 65 | 66 | ### Network Architecture 67 | 68 | ![prismnet](https://github.com/kuixu/PrismNet/wiki/imgs/prismnet-arch.png) 69 | 70 | ### Training 71 | 72 | To train a single protein model from scratch, run 73 | ``` 74 | exp/EXP_NAME/train.sh pu PrismNet TIA1_Hela clip_data 75 | ``` 76 | where you replace `TIA1_Hela` with the name of the data file you want to use and `EXP_NAME` with a specific name for this experiment. Hyper-parameters can be tuned in `exp/prismnet/train.sh`. For available training options, please take a look at `tools/main.py`. 77 | 78 | To monitor the training process, add the option `-tfboard` in `exp/prismnet/train.sh`, and view the page at http://localhost:6006 using TensorBoard: 79 | ``` 80 | tensorboard --logdir exp/EXP_NAME/out/tfb 81 | ``` 82 | 83 | To train all the protein models, run 84 | ``` 85 | exp/EXP_NAME/train_all.sh 86 | ``` 87 | 88 | ### Evaluation 89 | For evaluation of the models, we provide the script `eval.sh`. You can run it using 90 | ``` 91 | exp/prismnet/eval.sh TIA1_Hela clip_data 92 | ``` 93 | 94 | ### Inference 95 | For running inference on new data (in the same format as the *.tsv files used in [Datasets](#datasets)) with the trained models, we provide the script `infer.sh`.
You can run it using 96 | ``` 97 | exp/prismnet/infer.sh TIA1_Hela /path/to/inference_file.tsv 98 | ``` 99 | 100 | ### Compute High Attention Regions 101 | For computing high attention regions using the trained models, we provide the script `har.sh`. You can run it using 102 | ``` 103 | exp/prismnet/har.sh TIA1_Hela /path/to/inference_file.tsv 104 | ``` 105 | 106 | ### Compute Saliency 107 | For computing saliency using the trained models, we provide the script `saliency.sh`. You can run it using 108 | ``` 109 | exp/prismnet/saliency.sh TIA1_Hela /path/to/inference_file.tsv 110 | ``` 111 | 112 | ### Plot Saliency Image 113 | For plotting saliency images using the trained models, we provide the script `saliencyimg.sh`. You can run it using 114 | ``` 115 | exp/prismnet/saliencyimg.sh TIA1_Hela /path/to/inference_file.tsv 116 | ``` 117 | 118 | ### Motif Construction 119 | For the construction and analysis of integrative motifs, users can use the scripts in `motif_construct/`: 120 | ``` 121 | perl saliency_motif.pl infile.txt sal outfile 122 | Rscript motif_sig.R outfile_motif_summary.txt outfile_motif_sig.txt 123 | ``` 124 | 125 | ### Integrative motif 126 | 127 | The integrative motifs can be downloaded [here](http://prismnet.zhanglab.net/data/Total_motifs-matrix-logo.xlsx). 128 | 129 | 130 | ### Half-Life Analysis (Example) 131 | 132 | #### Download half-life data 133 | ``` 134 | cd PrismNet/data 135 | wget https://zhanglabnet.oss-cn-beijing.aliyuncs.com/prismnet/data/halflife_data.tgz 136 | tar zxvf halflife_data.tgz 137 | ``` 138 | 139 | #### Requirements 140 | ``` 141 | pip install xgboost==1.3.0rc1 matplotlib scipy scikit-learn termplotlib 142 | ``` 143 | 144 | #### Run Example 145 | 146 | ``` 147 | exp/logistic_reg/run.sh 148 | ``` 149 | 150 | ### Dataset and Results Visualization 151 | 152 | We also provide a website [http://prismnet.zhanglab.net/](http://prismnet.zhanglab.net/) to visualize the icSHAPE data and the results. 153 | 154 | ## Copyright and License 155 | This project is free to use for non-commercial purposes - see the [LICENSE](LICENSE) file for details. 156 | 157 | ## Reference 158 | 159 | ``` 160 | @article {Sun2021cr, 161 | title = {Predicting dynamic cellular protein-RNA interactions using deep learning and in vivo RNA structure}, 162 | author = {Sun, Lei and Xu, Kui and Huang, Wenze and Yang, Yucheng T.
and Li, Pan and Tang, Lei and Xiong, Tuanlin and Zhang, Qiangfeng Cliff}, 163 | year = {2021}, 164 | doi = {https://doi.org/10.1038/s41422-021-00476-y}, 165 | journal = {Cell Research} 166 | } 167 | @article {Sun2021cell, 168 | title = {In vivo structural characterization of the whole SARS-CoV-2 RNA genome identifies host cell target proteins vulnerable to re-purposed drugs}, 169 | author = {Sun, Lei and Li, Pan and Ju, Xiaohui and Rao, Jian and Huang, Wenze and Zhang, Shaojun and Xiong, Tuanlin and Xu, Kui and Zhou, Xiaolin and Ren, Lili and Ding, Qiang and Wang, Jianwei and Zhang, Qiangfeng Cliff}, 170 | year = {2021}, 171 | doi = {https://doi.org/10.1016/j.cell.2021.02.008}, 172 | journal = {Cell} 173 | } 174 | ``` 175 | -------------------------------------------------------------------------------- /prismnet/model/PrismNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .resnet import ResidualBlock1D, ResidualBlock2D 5 | from .se import SEBlock 6 | 7 | class Conv2d(nn.Module): 8 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, relu=True, same_padding=False, bn=False): 9 | super(Conv2d, self).__init__() 10 | p0 = int((kernel_size[0] - 1) / 2) if same_padding else 0 11 | p1 = int((kernel_size[1] - 1) / 2) if same_padding else 0 12 | padding = (p0, p1) 13 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=padding) 14 | self.bn = nn.BatchNorm2d(out_channels) if bn else None 15 | self.relu = nn.ReLU(inplace=True) if relu else None 16 | 17 | def forward(self, x): 18 | x = self.conv(x) 19 | if self.bn is not None: 20 | x = self.bn(x) 21 | if self.relu is not None: 22 | x = self.relu(x) 23 | return x 24 | 25 | class PrismNet(nn.Module): 26 | def __init__(self, mode="pu"): 27 | super(PrismNet, self).__init__() 28 | self.mode = mode 29 | h_p, h_k = 2, 5 30 | if mode=="pu": 31 | self.n_features = 5 32 | elif mode=="seq": 33 | self.n_features = 4 34 | h_p, h_k = 1, 3 35 | elif mode=="str": 36 | self.n_features = 1 37 | h_p, h_k = 0, 1 38 | else: 39 | raise "mode error" 40 | 41 | base_channel = 8 42 | self.conv = Conv2d(1, base_channel, kernel_size=(11, h_k), bn = True, same_padding=True) 43 | self.se = SEBlock(base_channel) 44 | self.res2d = ResidualBlock2D(base_channel, kernel_size=(11, h_k), padding=(5, h_p)) 45 | self.res1d = ResidualBlock1D(base_channel*4) 46 | self.avgpool = nn.AvgPool2d((1,self.n_features)) 47 | self.gpool = nn.AdaptiveAvgPool1d(1) 48 | self.fc = nn.Linear(base_channel*4*8, 1) 49 | self._initialize_weights() 50 | 51 | def _initialize_weights(self): 52 | for m in self.modules(): 53 | if isinstance(m, nn.Conv2d): 54 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 55 | if m.bias is not None: 56 | nn.init.constant_(m.bias, 0) 57 | elif isinstance(m, nn.Conv1d): 58 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 59 | if m.bias is not None: 60 | nn.init.constant_(m.bias, 0) 61 | elif isinstance(m, nn.BatchNorm2d): 62 | nn.init.constant_(m.weight, 1) 63 | nn.init.constant_(m.bias, 0) 64 | elif isinstance(m, nn.BatchNorm1d): 65 | nn.init.constant_(m.weight, 1) 66 | nn.init.constant_(m.bias, 0) 67 | elif isinstance(m, nn.Linear): 68 | nn.init.normal_(m.weight, 0, 0.01) 69 | nn.init.constant_(m.bias, 0) 70 | 71 | def forward(self, input): 72 | """[forward] 73 | 74 | Args: 75 | input ([tensor],N,C,W,H): input features 76 | """ 77 | if self.mode=="seq": 78 | input 
= input[:,:,:,:4] 79 | elif self.mode=="str": 80 | input = input[:,:,:,4:] 81 | x = self.conv(input) 82 | x = F.dropout(x, 0.1, training=self.training) 83 | z = self.se(x) 84 | x = self.res2d(x*z) 85 | x = F.dropout(x, 0.5, training=self.training) 86 | x = self.avgpool(x) 87 | x = x.view(x.shape[0], x.shape[1], x.shape[2]) 88 | x = self.res1d(x) 89 | x = F.dropout(x, 0.3, training=self.training) 90 | x = self.gpool(x) 91 | x = x.view(x.shape[0], x.shape[1]) 92 | x = self.fc(x) 93 | return x 94 | 95 | 96 | class PrismNet_large(nn.Module): 97 | def __init__(self, mode="pu"): 98 | super(PrismNet_large, self).__init__() 99 | self.mode = mode 100 | h_p, h_k = 2, 5 101 | if mode=="pu": 102 | self.n_features = 5 103 | elif mode=="seq": 104 | self.n_features = 4 105 | h_p, h_k = 1, 3 106 | elif mode=="str": 107 | self.n_features = 1 108 | h_p, h_k = 0, 1 109 | else: 110 | raise "mode error" 111 | 112 | base_channel = 64 113 | self.conv = Conv2d(1, base_channel, kernel_size=(11, h_k), bn = True, same_padding=True) 114 | self.se = SEBlock(base_channel) 115 | self.res2d = ResidualBlock2D(base_channel, kernel_size=(11, h_k), padding=(5, h_p)) 116 | self.res1d = ResidualBlock1D(base_channel*4) 117 | self.avgpool = nn.AvgPool2d((1,self.n_features)) 118 | self.gpool = nn.AdaptiveAvgPool1d(1) 119 | self.fc = nn.Linear(base_channel*4*8, 1) 120 | self._initialize_weights() 121 | 122 | def _initialize_weights(self): 123 | for m in self.modules(): 124 | if isinstance(m, nn.Conv2d): 125 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 126 | if m.bias is not None: 127 | nn.init.constant_(m.bias, 0) 128 | elif isinstance(m, nn.Conv1d): 129 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 130 | if m.bias is not None: 131 | nn.init.constant_(m.bias, 0) 132 | elif isinstance(m, nn.BatchNorm2d): 133 | nn.init.constant_(m.weight, 1) 134 | nn.init.constant_(m.bias, 0) 135 | elif isinstance(m, nn.BatchNorm1d): 136 | nn.init.constant_(m.weight, 1) 137 | nn.init.constant_(m.bias, 0) 138 | elif isinstance(m, nn.Linear): 139 | nn.init.normal_(m.weight, 0, 0.01) 140 | nn.init.constant_(m.bias, 0) 141 | 142 | def forward(self, input): 143 | """[summary] 144 | 145 | Args: 146 | input ([tensor],N,C,W,H): input features 147 | """ 148 | if self.mode=="seq": 149 | input = input[:,:,:,:4] 150 | elif self.mode=="str": 151 | input = input[:,:,:,4:] 152 | x = self.conv(input) 153 | x = F.dropout(x, 0.1, training=self.training) 154 | z = self.se(x) 155 | x = self.res2d(x*z) 156 | x = F.dropout(x, 0.5, training=self.training) 157 | x = self.avgpool(x) 158 | x = x.view(x.shape[0], x.shape[1], x.shape[2]) 159 | x = self.res1d(x) 160 | x = F.dropout(x, 0.3, training=self.training) 161 | x = self.gpool(x) 162 | x = x.view(x.shape[0], x.shape[1]) 163 | x = self.fc(x) 164 | return x 165 | -------------------------------------------------------------------------------- /prismnet/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | from six.moves import cPickle 4 | from sklearn.metrics import roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score, confusion_matrix 5 | from scipy import stats 6 | 7 | 8 | __all__ = [ 9 | "pearsonr", 10 | "rsquare", 11 | "accuracy", 12 | "roc", 13 | "pr", 14 | "calculate_metrics" 15 | ] 16 | 17 | # class MLMetrics(object): 18 | class MLMetrics(object): 19 | def __init__(self, objective='binary'): 20 | self.objective = objective 21 | self.metrics = [] 22 | 23 | 
def update(self, label, pred, other_lst): 24 | met, _ = calculate_metrics(label, pred, self.objective) 25 | if len(other_lst)>0: 26 | met.extend(other_lst) 27 | self.metrics.append(met) 28 | self.compute_avg() 29 | 30 | def compute_avg(self): 31 | if len(self.metrics)>1: 32 | self.avg = np.array(self.metrics).mean(axis=0) 33 | self.sum = np.array(self.metrics).sum(axis=0) 34 | else: 35 | self.avg = self.metrics[0] 36 | self.sum = self.metrics[0] 37 | self.acc = self.avg[0] 38 | self.auc = self.avg[1] 39 | self.prc = self.avg[2] 40 | self.tp = int(self.sum[3]) 41 | self.tn = int(self.sum[4]) 42 | self.fp = int(self.sum[5]) 43 | self.fn = int(self.sum[6]) 44 | if len(self.avg)>7: 45 | self.other = self.avg[7:] 46 | 47 | 48 | def pearsonr(label, prediction): 49 | ndim = np.ndim(label) 50 | if ndim == 1: 51 | corr = [stats.pearsonr(label, prediction)] 52 | else: 53 | num_labels = label.shape[1] 54 | corr = [] 55 | for i in range(num_labels): 56 | #corr.append(np.corrcoef(label[:,i], prediction[:,i])) 57 | corr.append(stats.pearsonr(label[:,i], prediction[:,i])[0]) 58 | 59 | return corr 60 | 61 | 62 | def rsquare(label, prediction): 63 | ndim = np.ndim(label) 64 | if ndim == 1: 65 | y = label 66 | X = prediction 67 | m = np.dot(X,y)/np.dot(X, X) 68 | resid = y - m*X; 69 | ym = y - np.mean(y); 70 | rsqr2 = 1 - np.dot(resid.T,resid)/ np.dot(ym.T, ym); 71 | metric = [rsqr2] 72 | slope = [m] 73 | else: 74 | num_labels = label.shape[1] 75 | metric = [] 76 | slope = [] 77 | for i in range(num_labels): 78 | y = label[:,i] 79 | X = prediction[:,i] 80 | m = np.dot(X,y)/np.dot(X, X) 81 | resid = y - m*X; 82 | ym = y - np.mean(y); 83 | rsqr2 = 1 - np.dot(resid.T,resid)/ np.dot(ym.T, ym); 84 | metric.append(rsqr2) 85 | slope.append(m) 86 | return metric, slope 87 | 88 | 89 | def accuracy(label, prediction): 90 | ndim = np.ndim(label) 91 | if ndim == 1: 92 | metric = np.array(accuracy_score(label, np.round(prediction))) 93 | else: 94 | num_labels = label.shape[1] 95 | metric = np.zeros((num_labels)) 96 | for i in range(num_labels): 97 | metric[i] = accuracy_score(label[:,i], np.round(prediction[:,i])) 98 | return metric 99 | 100 | 101 | def roc(label, prediction): 102 | ndim = np.ndim(label) 103 | if ndim == 1: 104 | fpr, tpr, thresholds = roc_curve(label, prediction) 105 | score = auc(fpr, tpr) 106 | metric = np.array(score) 107 | curves = [(fpr, tpr)] 108 | else: 109 | num_labels = label.shape[1] 110 | curves = [] 111 | metric = np.zeros((num_labels)) 112 | for i in range(num_labels): 113 | fpr, tpr, thresholds = roc_curve(label[:,i], prediction[:,i]) 114 | score = auc(fpr, tpr) 115 | metric[i]= score 116 | curves.append((fpr, tpr)) 117 | return metric, curves 118 | 119 | 120 | def pr(label, prediction): 121 | ndim = np.ndim(label) 122 | if ndim == 1: 123 | precision, recall, thresholds = precision_recall_curve(label, prediction) 124 | score = auc(recall, precision) 125 | metric = np.array(score) 126 | curves = [(precision, recall)] 127 | else: 128 | num_labels = label.shape[1] 129 | curves = [] 130 | metric = np.zeros((num_labels)) 131 | for i in range(num_labels): 132 | precision, recall, thresholds = precision_recall_curve(label[:,i], prediction[:,i]) 133 | score = auc(recall, precision) 134 | metric[i] = score 135 | curves.append((precision, recall)) 136 | return metric, curves 137 | 138 | def tfnp(label, prediction): 139 | try: 140 | tn, fp, fn, tp = confusion_matrix(label, prediction).ravel() 141 | except Exception: 142 | tp, tn, fp, fn =0,0,0,0 143 | 144 | return tp, tn, fp, fn 145 | 146 | 147 | 
def calculate_metrics(label, prediction, objective): 148 | """calculate metrics for classification""" 149 | # import pdb; pdb.set_trace() 150 | 151 | 152 | if (objective == "binary") | (objective == 'hinge'): 153 | ndim = np.ndim(label) 154 | #if ndim == 1: 155 | # label = one_hot_labels(label) 156 | correct = accuracy(label, prediction) 157 | auc_roc, roc_curves = roc(label, prediction) 158 | auc_pr, pr_curves = pr(label, prediction) 159 | # import pdb; pdb.set_trace() 160 | if ndim == 2: 161 | prediction=prediction[:,0] 162 | label = label[:,0] 163 | # pred_class = prediction[:,0]>0.5 164 | pred_class = prediction>0.5 165 | # tp, tn, fp, fn = tfnp(label[:,0], pred_class) 166 | tp, tn, fp, fn = tfnp(label, pred_class) 167 | # tn8, fp8, fn8, tp8 = tfnp(label[:,0], prediction[prediction>0.8][:,0]) 168 | # import pdb; pdb.set_trace() 169 | mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr),tp, tn, fp, fn] 170 | std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr)] 171 | 172 | elif objective == "categorical": 173 | 174 | correct = np.mean(np.equal(np.argmax(label, axis=1), np.argmax(prediction, axis=1))) 175 | auc_roc, roc_curves = roc(label, prediction) 176 | auc_pr, pr_curves = pr(label, prediction) 177 | mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr)] 178 | std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr)] 179 | for i in range(label.shape[1]): 180 | label_c, prediction_c = label[:,i], prediction[:,i] 181 | auc_roc, roc_curves = roc(label_c, prediction_c) 182 | mean.append(np.nanmean(auc_roc)) 183 | std.append(np.nanstd(auc_roc)) 184 | 185 | 186 | elif (objective == 'squared_error') | (objective == 'kl_divergence') | (objective == 'cdf'): 187 | ndim = np.ndim(label) 188 | #if ndim == 1: 189 | # label = one_hot_labels(label) 190 | label[label<0.5] = 0 191 | label[label>=0.5] = 1 192 | # import pdb; pdb.set_trace() 193 | 194 | correct = accuracy(label, prediction) 195 | auc_roc, roc_curves = roc(label, prediction) 196 | auc_pr, pr_curves = pr(label, prediction) 197 | # import pdb; pdb.set_trace() 198 | if ndim == 2: 199 | prediction=prediction[:,0] 200 | label = label[:,0] 201 | # pred_class = prediction[:,0]>0.5 202 | pred_class = prediction>0.5 203 | # tp, tn, fp, fn = tfnp(label[:,0], pred_class) 204 | tp, tn, fp, fn = tfnp(label, pred_class) 205 | # mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr),tp, tn, fp, fn] 206 | # std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr)] 207 | 208 | 209 | # squared_error 210 | corr = pearsonr(label,prediction) 211 | rsqr, slope = rsquare(label, prediction) 212 | # mean = [np.nanmean(corr), np.nanmean(rsqr), np.nanmean(slope)] 213 | # std = [np.nanstd(corr), np.nanstd(rsqr), np.nanstd(slope)] 214 | 215 | mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr),tp, tn, fp, fn, np.nanmean(corr), np.nanmean(rsqr), np.nanmean(slope)] 216 | std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr), np.nanstd(corr), np.nanstd(rsqr), np.nanstd(slope)] 217 | 218 | else: 219 | mean = 0 220 | std = 0 221 | 222 | return [mean, std] 223 | -------------------------------------------------------------------------------- /prismnet/engine/train_loop.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse, os, copy 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from tqdm import tqdm 7 | import prismnet.model as arch 8 | from prismnet.utils 
import log_print, metrics, datautils 9 | 10 | def train(args, model, device, train_loader, criterion, optimizer): 11 | model.train() 12 | met = metrics.MLMetrics(objective='binary') 13 | for batch_idx, (x0, y0) in enumerate(train_loader): 14 | x, y = x0.float().to(device), y0.to(device).float() 15 | if y0.sum() ==0 or y0.sum() ==args.batch_size: 16 | continue 17 | optimizer.zero_grad() 18 | output = model(x) 19 | loss = criterion(output, y) 20 | prob = torch.sigmoid(output) 21 | 22 | y_np = y.to(device='cpu', dtype=torch.long).detach().numpy() 23 | p_np = prob.to(device='cpu').detach().numpy() 24 | met.update(y_np, p_np,[loss.item()]) 25 | loss.backward() 26 | torch.nn.utils.clip_grad_norm_(model.parameters(), 5) 27 | optimizer.step() 28 | 29 | return met 30 | 31 | def validate(args, model, device, test_loader, criterion): 32 | model.eval() 33 | y_all = [] 34 | p_all = [] 35 | l_all = [] 36 | with torch.no_grad(): 37 | for batch_idx, (x0, y0) in enumerate(test_loader): 38 | x, y = x0.float().to(device), y0.to(device).float() 39 | #if y0.sum() ==0: 40 | # import pdb; pdb.set_trace() 41 | output = model(x) 42 | loss = criterion(output, y) 43 | prob = torch.sigmoid(output) 44 | 45 | y_np = y.to(device='cpu', dtype=torch.long).numpy() 46 | p_np = prob.to(device='cpu').numpy() 47 | l_np = loss.item() 48 | 49 | y_all.append(y_np) 50 | p_all.append(p_np) 51 | l_all.append(l_np) 52 | 53 | y_all = np.concatenate(y_all) 54 | p_all = np.concatenate(p_all) 55 | l_all = np.array(l_all) 56 | 57 | met = metrics.MLMetrics(objective='binary') 58 | met.update(y_all, p_all,[l_all.mean()]) 59 | 60 | 61 | 62 | return met, y_all, p_all 63 | 64 | def inference(args, model, device, test_loader): 65 | model.eval() 66 | p_all = [] 67 | with torch.no_grad(): 68 | for batch_idx, (x0, y0) in enumerate(test_loader): 69 | x, y = x0.float().to(device), y0.to(device).float() 70 | output = model(x) 71 | prob = torch.sigmoid(output) 72 | 73 | p_np = prob.to(device='cpu').numpy() 74 | p_all.append(p_np) 75 | 76 | p_all = np.concatenate(p_all) 77 | return p_all 78 | 79 | 80 | def compute_saliency(args, model, device, test_loader, identity): 81 | from prismnet.model import GuidedBackpropSmoothGrad 82 | 83 | model.eval() 84 | 85 | saliency_dir = datautils.make_directory(args.out_dir, "out/saliency") 86 | saliency_path = os.path.join(saliency_dir, identity+'.sal') 87 | 88 | # sgrad = SmoothGrad(model, device=device) 89 | sgrad = GuidedBackpropSmoothGrad(model, device=device) 90 | sal = "" 91 | for batch_idx, (x0, y0) in enumerate(test_loader): 92 | X, Y = x0.float().to(device), y0.to(device).float() 93 | output = model(X) 94 | prob = torch.sigmoid(output) 95 | p_np = prob.to(device='cpu').detach().numpy().squeeze(-1) 96 | guided_saliency = sgrad.get_batch_gradients(X, Y) 97 | # import pdb; pdb.set_trace() 98 | N, NS, _, _ = guided_saliency.shape # (N, 101, 1, 5) 99 | 100 | for i in range(N): 101 | inr = batch_idx*args.batch_size + i 102 | str_sal = datautils.mat2str(np.squeeze(guided_saliency[i])) 103 | sal += "{}\t{:.6f}\t{}\n".format(inr, p_np[i], str_sal) 104 | 105 | f = open(saliency_path,"w") 106 | f.write(sal) 107 | f.close() 108 | print(saliency_path) 109 | 110 | 111 | def compute_saliency_img(args, model, device, test_loader, identity): 112 | from prismnet.model import GuidedBackpropSmoothGrad 113 | from prismnet.utils import visualize 114 | 115 | def saliency_img(X, mul_saliency, outdir="results"): 116 | """generate saliency image 117 | 118 | Args: 119 | X ([np.ndarray]): raw input(L x 5/4) 120 | mul_saliency 
([np.ndarray]): [description] 121 | outdir (str, optional): [description]. Defaults to "results". 122 | """ 123 | if X.shape[-1]==5: 124 | x_str = X[:,4:] 125 | str_null = np.zeros_like(x_str) 126 | ind =np.where(x_str == -1)[0] 127 | str_null[ind,0]=1 128 | 129 | ss = mul_saliency[:,:] 130 | s_str = mul_saliency[:,4:] 131 | s_str = (s_str - s_str.min())/(s_str.max() - s_str.min()) 132 | ss[:,4:] = s_str * (1-str_null) 133 | 134 | str_null=np.squeeze(str_null).T 135 | else: 136 | str_null = None 137 | ss = mul_saliency[:,:] 138 | 139 | visualize.plot_saliency( 140 | X.T, 141 | ss.T, 142 | nt_width=100, 143 | norm_factor=3, 144 | str_null=str_null, 145 | outdir=outdir 146 | ) 147 | 148 | 149 | prefix_n = len(str(len(test_loader.dataset))) 150 | datautils.make_directory(args.out_dir, "out/imgs/") 151 | imgs_dir = datautils.make_directory(args.out_dir, "out/imgs/"+identity) 152 | imgs_path = imgs_dir+'/{:0'+str(prefix_n)+'d}_{:.3f}.pdf' 153 | saliency_path = os.path.join(imgs_dir, 'all.sal') 154 | 155 | # sgrad = SmoothGrad(model, device=device) 156 | sgrad = GuidedBackpropSmoothGrad(model, device=device, magnitude=1) 157 | for batch_idx, (x0, y0) in enumerate(test_loader): 158 | X, Y = x0.float().to(device), y0.to(device).float() 159 | output = model(X) 160 | prob = torch.sigmoid(output) 161 | p_np = prob.to(device='cpu').detach().numpy().squeeze() 162 | guided_saliency = sgrad.get_batch_gradients(X, Y) 163 | mul_saliency = copy.deepcopy(guided_saliency) 164 | mul_saliency[:,:,:,:4] = guided_saliency[:,:,:,:4] * X[:,:,:,:4] 165 | N, NS, _, _ = guided_saliency.shape # (N, 101, 1, 5) 166 | sal = "" 167 | for i in tqdm(range(N)): 168 | inr = batch_idx*args.batch_size + i 169 | str_sal = datautils.mat2str(np.squeeze(guided_saliency[i])) 170 | sal += "{}\t{:.6f}\t{}\n".format(inr, p_np[i], str_sal) 171 | img_path = imgs_path.format(inr, p_np[i]) 172 | # import pdb; pdb.set_trace() 173 | saliency_img( 174 | X[i,0].to(device='cpu').detach().numpy(), 175 | mul_saliency[i,0].to(device='cpu').numpy(), 176 | outdir=img_path) 177 | if not os.path.exists(saliency_path): 178 | f = open(saliency_path,"w") 179 | f.write(sal) 180 | f.close() 181 | print(saliency_path) 182 | 183 | 184 | 185 | def compute_high_attention_region(args, model, device, test_loader, identity): 186 | from prismnet.model import GuidedBackpropSmoothGrad 187 | model.eval() 188 | har_dir = datautils.make_directory(args.out_dir, "out/har") 189 | har_path = os.path.join(har_dir, identity+'.har') 190 | 191 | L = 20 192 | har = "" 193 | # sgrad = SmoothGrad(model, device=device) 194 | sgrad = GuidedBackpropSmoothGrad(model, device=device) 195 | for batch_idx, (x0, y0) in enumerate(test_loader): 196 | X, Y = x0.float().to(device), y0.to(device).float() 197 | output = model(X) 198 | prob = torch.sigmoid(output) 199 | p_np = prob.to(device='cpu').detach().numpy().squeeze() 200 | guided_saliency = sgrad.get_batch_gradients(X, Y) 201 | 202 | attention_region = guided_saliency.sum(dim=3)[:,0,:].to(device='cpu').numpy() # (N, 101, 1) 203 | N,NS = attention_region.shape # (N, 101) 204 | for i in range(N): 205 | inr = batch_idx*args.batch_size + i 206 | iar = attention_region[i] 207 | ar_score = np.array([ iar[j:j+L].sum() for j in range(NS-L+1)]) 208 | # import pdb; pdb.set_trace() 209 | highest_ind = np.argmax(iar) 210 | har += "{}\t{:.6f}\t{}\t{}\n".format(inr, p_np[i], highest_ind, highest_ind+L) 211 | 212 | f = open(har_path,"w") 213 | f.write(har) 214 | f.close() 215 | print(har_path) 216 | 217 | 
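# --- Editor's note -----------------------------------------------------------
# Self-contained sketch (not part of the repository) of fixed-length
# high-attention-region scoring: a window of L consecutive positions is scored
# by the sum of the per-position saliency, and the best window is reported.
# In compute_high_attention_region() above, `ar_score` holds these window sums
# while the written start coordinate is `np.argmax(iar)` (the single
# highest-saliency position); the sketch below picks the window whose sum is
# largest.
def _top_saliency_window(per_position_saliency, L=20):
    """Return (start, end, score) of the best-scoring length-L window."""
    import numpy as np
    sal = np.asarray(per_position_saliency, dtype=float)
    scores = np.array([sal[j:j + L].sum() for j in range(len(sal) - L + 1)])
    start = int(np.argmax(scores))
    return start, start + L, float(scores[start])

if __name__ == "__main__":
    import numpy as np
    rng = np.random.default_rng(0)
    track = rng.random(101)        # a toy 101-nt saliency track
    track[40:60] += 1.0            # plant a high-attention stretch
    print(_top_saliency_window(track, L=20))   # start is expected to be 40
# ------------------------------------------------------------------------------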
-------------------------------------------------------------------------------- /tools/main.py: -------------------------------------------------------------------------------- 1 | import argparse, os, random 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.optim import lr_scheduler 7 | 8 | 9 | from tensorboardX import SummaryWriter 10 | from sklearn import metrics 11 | import numpy as np 12 | 13 | import prismnet.model as arch 14 | from prismnet import train, validate, inference, log_print, compute_saliency, compute_saliency_img, compute_high_attention_region 15 | #compute_high_attention_region 16 | 17 | # from prismnet.engine.train_loop import 18 | from prismnet.model.utils import GradualWarmupScheduler 19 | from prismnet.loader import SeqicSHAPE 20 | from prismnet.utils import datautils 21 | 22 | 23 | def fix_seed(seed): 24 | """ 25 | Seed all necessary random number generators. 26 | """ 27 | if seed is None: 28 | seed = random.randint(1, 10000) 29 | torch.set_num_threads(1) # Suggested for issues with deadlocks, etc. 30 | random.seed(seed) 31 | os.environ['PYTHONHASHSEED'] = str(seed) 32 | np.random.seed(seed) 33 | torch.manual_seed(seed) 34 | torch.cuda.manual_seed(seed) 35 | torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. 36 | torch.backends.cudnn.deterministic = True 37 | torch.backends.cudnn.benchmark = True 38 | torch.backends.cudnn.enabled = True 39 | # print("[Info] cudnn.deterministic set to True. CUDNN-optimized code may be slow.") 40 | 41 | def save_evals(out_dir, filename, dataname, predictions, label, met): 42 | evals_dir = datautils.make_directory(out_dir, "out/evals") 43 | metrics_path = os.path.join(evals_dir, filename+'.metrics') 44 | probs_path = os.path.join(evals_dir, filename+'.probs') 45 | with open(metrics_path,"w") as f: 46 | if "_reg" in filename: 47 | print("{:s}\t{:.3f}\t{:.3f}\t{:.3f}\t{:d}\t{:d}\t{:d}\t{:d}\t{:.3f}\t{:.3f}\t{:.3f}".format( 48 | dataname, 49 | met.acc, 50 | met.auc, 51 | met.prc, 52 | met.tp, 53 | met.tn, 54 | met.fp, 55 | met.fn, 56 | met.avg[7], 57 | met.avg[8], 58 | met.avg[9], 59 | ), file=f) 60 | else: 61 | print("{:s}\t{:.3f}\t{:.3f}\t{:.3f}\t{:d}\t{:d}\t{:d}\t{:d}".format( 62 | dataname, 63 | met.acc, 64 | met.auc, 65 | met.prc, 66 | met.tp, 67 | met.tn, 68 | met.fp, 69 | met.fn, 70 | ), file=f) 71 | with open(probs_path,"w") as f: 72 | for i in range(len(predictions)): 73 | print("{:.3f}\t{}".format(predictions[i,0], label[i,0]), file=f) 74 | print("Evaluation file:", metrics_path) 75 | print("Prediction file:", probs_path) 76 | 77 | def save_infers(out_dir, filename, predictions): 78 | evals_dir = datautils.make_directory(out_dir, "out/infer") 79 | probs_path = os.path.join(evals_dir, filename+'.probs') 80 | with open(probs_path,"w") as f: 81 | for i in range(len(predictions)): 82 | print("{:f}".format(predictions[i,0]), file=f) 83 | print("Prediction file:", probs_path) 84 | 85 | def main(): 86 | global writer, best_epoch 87 | # Training settings 88 | parser = argparse.ArgumentParser(description='Official version of PrismNet') 89 | # Data options 90 | parser.add_argument('--data_dir', type=str, default="data", help='data path') 91 | parser.add_argument('--exp_name', type=str, default="cnn", metavar='N', help='experiment name') 92 | parser.add_argument('--p_name', type=str, default="TIA1_Hela", metavar='N', help='protein name') 93 | parser.add_argument('--out_dir', type=str, default=".", help='output directory') 94 | 
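    # --- Editor's note --------------------------------------------------------
    # Hypothetical invocations (illustrative only; the shell scripts shipped
    # with the code hold the actual commands) showing how the flags defined in
    # this parser combine:
    #   training:   python tools/main.py --train --data_dir data --p_name TIA1_Hela --mode pu
    #   evaluation: python tools/main.py --eval --load_best --p_name TIA1_Hela --mode pu
    #   saliency:   python tools/main.py --saliency --load_best --p_name TIA1_Hela --mode pu \
    #                   --infer_file <tab-separated sequence/structure file>
    # --------------------------------------------------------------------------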
parser.add_argument('--mode', type=str, default="pu", help='data mode') 95 | parser.add_argument("--infer_file", type=str, help="infer file", default="") 96 | # Training Hyper-parameter 97 | parser.add_argument('--arch', default="PrismNet", help='network architecture') 98 | parser.add_argument('--lr_scheduler', default="warmup", help=' lr scheduler: warmup/cosine') 99 | parser.add_argument('--lr', type=float, default=0.0001, help='learning rate') 100 | parser.add_argument('--batch_size', type=int, default=64, help='input batch size') 101 | parser.add_argument('--nepochs', type=int, default=200, help='number of epochs to train') 102 | parser.add_argument('--pos_weight', type=int, default=2, help='positive class weight') 103 | parser.add_argument('--weight_decay', type=float, default=1e-6, help='weight decay, default=1e-6') 104 | parser.add_argument('--early_stopping', type=int, default=20, help='early stopping') 105 | # Training 106 | parser.add_argument('--load_best', action='store_true', help='load best model') 107 | parser.add_argument('--eval', action='store_true', help='eval mode') 108 | parser.add_argument('--train', action='store_true', help='train mode') 109 | parser.add_argument('--infer', action='store_true', help='infer mode') 110 | parser.add_argument('--infer_test', action='store_true', help='infer test from h5') 111 | parser.add_argument('--eval_test', action='store_true', help='eval test from h5') 112 | parser.add_argument('--saliency', action='store_true', help='compute saliency mode') 113 | parser.add_argument('--saliency_img', action='store_true', help='compute saliency and plot image mode') 114 | parser.add_argument('--har', action='store_true', help='compute highest attention region') 115 | # misc 116 | parser.add_argument('--tfboard', action='store_true', help='tf board') 117 | parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') 118 | parser.add_argument('--workers', type=int, help='number of data loading workers', default=2) 119 | parser.add_argument('--log_interval', type=int, default=100, help='log print interval') 120 | parser.add_argument('--seed', type=int, default=1024, help='manual seed') 121 | args = parser.parse_args() 122 | print(args) 123 | use_cuda = not args.no_cuda and torch.cuda.is_available() 124 | 125 | if args.mode == 'pu': 126 | args.nstr = 1 127 | else: 128 | args.nstr = 0 129 | 130 | # out dir 131 | data_path = args.data_dir + "/" + args.p_name + ".h5" 132 | identity = args.p_name+'_'+args.arch+"_"+args.mode 133 | datautils.make_directory(args.out_dir,"out/") 134 | model_dir = datautils.make_directory(args.out_dir,"out/models") 135 | model_path = os.path.join(model_dir, identity+"_{}.pth") 136 | 137 | if args.tfboard: 138 | tfb_dir = datautils.make_directory(args.out_dir,"out/tfb") 139 | writer = SummaryWriter(tfb_dir) 140 | else: 141 | writer = None 142 | # fix random seed 143 | fix_seed(args.seed) 144 | 145 | device = torch.device("cuda" if use_cuda else "cpu") 146 | kwargs = {'num_workers': args.workers, 'pin_memory': True} if use_cuda else {} 147 | 148 | #train_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path), \ 149 | # batch_size=args.batch_size, shuffle=True, **kwargs) 150 | #test_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path, is_test=True), \ 151 | # batch_size=args.batch_size*8, shuffle=False, **kwargs) 152 | #print("Train set:", len(train_loader.dataset)) 153 | #print("Test set:", len(test_loader.dataset)) 154 | 155 | 156 | print("Network Arch:", args.arch) 157 | model = 
getattr(arch, args.arch)(mode=args.mode) 158 | arch.param_num(model) 159 | # print(model) 160 | 161 | if args.load_best: 162 | filename = model_path.format("best") 163 | print("Loading model: {}".format(filename)) 164 | model.load_state_dict(torch.load(filename,map_location='cpu')) 165 | 166 | model = model.to(device) 167 | criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(args.pos_weight)) 168 | 169 | if args.train: 170 | 171 | train_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path), \ 172 | batch_size=args.batch_size, shuffle=True, **kwargs) 173 | 174 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path, is_test=True), \ 175 | batch_size=args.batch_size*8, shuffle=False, **kwargs) 176 | print("Train set:", len(train_loader.dataset)) 177 | print("Test set:", len(test_loader.dataset)) 178 | 179 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999), weight_decay=args.weight_decay) 180 | scheduler = GradualWarmupScheduler( 181 | optimizer, multiplier=8, total_epoch=float(args.nepochs), after_scheduler=None) 182 | 183 | best_auc = 0 184 | best_acc = 0 185 | best_epoch = 0 186 | for epoch in range(1, args.nepochs + 1): 187 | t_met = train(args, model, device, train_loader, criterion, optimizer) 188 | v_met, _, _ = validate(args, model, device, test_loader, criterion) 189 | scheduler.step(epoch) 190 | lr = scheduler.get_lr()[0] 191 | color_best='green' 192 | if best_auc < v_met.auc: 193 | best_auc = v_met.auc 194 | best_acc = v_met.acc 195 | best_epoch = epoch 196 | color_best = 'red' 197 | filename = model_path.format("best") 198 | torch.save(model.state_dict(), filename) 199 | if epoch - best_epoch > args.early_stopping: 200 | print("Early stop at %d, %s "%(epoch, args.exp_name)) 201 | break 202 | 203 | if args.tfboard and writer is not None: 204 | writer.add_scalar('loss/train', t_met.other[0], epoch) 205 | writer.add_scalar('acc/train', t_met.acc, epoch) 206 | writer.add_scalar('AUC/train', t_met.auc, epoch) 207 | writer.add_scalar('lr', lr, epoch) 208 | writer.add_scalar('loss/test', v_met.other[0], epoch) 209 | writer.add_scalar('acc/test', v_met.acc, epoch) 210 | writer.add_scalar('AUC/test', v_met.auc, epoch) 211 | line='{} \t Train Epoch: {} avg.loss: {:.4f} Acc: {:.2f}%, AUC: {:.4f} lr: {:.6f}'.format(\ 212 | args.p_name, epoch, t_met.other[0], t_met.acc, t_met.auc, lr) 213 | log_print(line, color='green', attrs=['bold']) 214 | 215 | line='{} \t Test Epoch: {} avg.loss: {:.4f} Acc: {:.2f}%, AUC: {:.4f} ({:.4f})'.format(\ 216 | args.p_name, epoch, v_met.other[0], v_met.acc, v_met.auc, best_auc) 217 | log_print(line, color=color_best, attrs=['bold']) 218 | 219 | print("{} auc: {:.4f} acc: {:.4f}".format(args.p_name, best_auc, best_acc)) 220 | 221 | filename = model_path.format("best") 222 | print("Loading model: {}".format(filename)) 223 | model.load_state_dict(torch.load(filename)) 224 | 225 | 226 | 227 | if args.eval: 228 | 229 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path, is_test=True), \ 230 | batch_size=args.batch_size*8, shuffle=False, **kwargs) 231 | print("Test set:", len(test_loader.dataset)) 232 | 233 | met, y_all, p_all = validate(args, model, device, test_loader, criterion) 234 | print("> eval {} auc: {:.4f} acc: {:.4f}".format(args.p_name, met.auc, met.acc)) 235 | save_evals(args.out_dir, identity, args.p_name, p_all, y_all, met) 236 | 237 | if args.infer and os.path.exists(args.infer_file): 238 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 239 | 
batch_size=args.batch_size, shuffle=False, **kwargs) 240 | 241 | p_all = inference(args, model, device, test_loader) 242 | identity = identity+"_"+ os.path.basename(args.infer_file).replace(".txt","") 243 | save_infers(args.out_dir, identity, p_all) 244 | 245 | if args.saliency and os.path.exists(args.infer_file): 246 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 247 | batch_size=args.batch_size, shuffle=False, **kwargs) 248 | compute_saliency(args, model, device, test_loader, identity) 249 | 250 | if args.saliency_img and os.path.exists(args.infer_file): 251 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 252 | batch_size=args.batch_size, shuffle=False, **kwargs) 253 | compute_saliency_img(args, model, device, test_loader, identity) 254 | 255 | if args.har and os.path.exists(args.infer_file): 256 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 257 | batch_size=args.batch_size, shuffle=False, **kwargs) 258 | compute_high_attention_region(args, model, device, test_loader, identity) 259 | 260 | 261 | 262 | 263 | 264 | if __name__ == '__main__': 265 | main() 266 | -------------------------------------------------------------------------------- /prismnet/utils/datautils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os, sys, h5py 6 | import numpy as np 7 | from copy import deepcopy 8 | 9 | 10 | 11 | def make_directory(path, foldername, verbose=1): 12 | """make a directory""" 13 | 14 | if not os.path.isdir(path): 15 | os.mkdir(path) 16 | print("making directory: " + path) 17 | 18 | outdir = os.path.join(path, foldername) 19 | if not os.path.isdir(outdir): 20 | os.mkdir(outdir) 21 | print("making directory: " + outdir) 22 | return outdir 23 | 24 | def finished(path, line_num): 25 | """check a results file is finished or not 26 | 27 | Args: 28 | path ([str]): [results file path] 29 | line_num ([int]): [target line number] 30 | """ 31 | 32 | if os.path.exists(path): 33 | with open(path, "r") as f: 34 | if line_num == len(f.readlines()): 35 | return True 36 | else: 37 | return False 38 | else: 39 | return False 40 | 41 | def get_file_names(dataset_path): 42 | file_names = [] 43 | for file_name in os.listdir(dataset_path): 44 | if os.path.splitext(file_name)[1] == '.h5': 45 | file_names.append(file_name) 46 | return file_names 47 | 48 | def md5(string): 49 | return hashlib.md5(string.encode('utf-8')).hexdigest() 50 | 51 | def mat2str(m): 52 | string="" 53 | if len(m.shape)==1: 54 | for j in range(m.shape[0]): 55 | string+= "%.3f," % m[j] 56 | else: 57 | for i in range(m.shape[0]): 58 | for j in range(m.shape[1]): 59 | string+= "%.3f," % m[i,j] 60 | return string 61 | 62 | def rescale(vec, thr=0.0): 63 | ind0 = np.where(vec>=thr)[0] 64 | u_norm = 0.5 * (vec[ind0]-thr)/(vec[ind0].max()) + 0.5 65 | ind2 = np.where(vec<0)[0] 66 | vec_norm = vec.copy() 67 | vec_norm[ind0] = u_norm 68 | vec_norm[ind2] = 0.0 69 | return vec_norm 70 | 71 | 72 | def decodeDNA(m): 73 | na=["A","C","G","U"] 74 | var,inds=np.where(m==1) 75 | seq="" 76 | for i in inds: 77 | seq=seq+na[i] 78 | return seq 79 | 80 | def str_onehot(vec): 81 | thr=0.15 82 | mask_str = np.zeros((2,vec.shape[-1])) 83 | ind =np.where(vec >= thr)[1] 84 | mask_str[1,ind]=1 85 | ind =np.where(vec < thr)[1] 86 | mask_str[0,ind]=1 87 | ind =np.where(vec == 
-1)[1] 88 | mask_str[0,ind]=0.5 89 | mask_str[1,ind]=0.5 90 | return mask_str 91 | 92 | def convert_one_hot(sequence, max_length=None): 93 | """convert DNA/RNA sequences to a one-hot representation""" 94 | 95 | one_hot_seq = [] 96 | for seq in sequence: 97 | seq = seq.upper() 98 | seq_length = len(seq) 99 | one_hot = np.zeros((4,seq_length)) 100 | index = [j for j in range(seq_length) if seq[j] == 'A'] 101 | one_hot[0,index] = 1 102 | index = [j for j in range(seq_length) if seq[j] == 'C'] 103 | one_hot[1,index] = 1 104 | index = [j for j in range(seq_length) if seq[j] == 'G'] 105 | one_hot[2,index] = 1 106 | index = [j for j in range(seq_length) if (seq[j] == 'U') | (seq[j] == 'T')] 107 | one_hot[3,index] = 1 108 | 109 | # handle boundary conditions with zero-padding 110 | if max_length: 111 | offset1 = int((max_length - seq_length)/2) 112 | offset2 = max_length - seq_length - offset1 113 | 114 | if offset1: 115 | one_hot = np.hstack([np.zeros((4,offset1)), one_hot]) 116 | if offset2: 117 | one_hot = np.hstack([one_hot, np.zeros((4,offset2))]) 118 | 119 | one_hot_seq.append(one_hot) 120 | 121 | # convert to numpy array 122 | one_hot_seq = np.array(one_hot_seq) 123 | 124 | return one_hot_seq 125 | 126 | def convert_cat_one_hot(targets): 127 | """convert DNA/RNA sequences to a one-hot representation""" 128 | t_length = len(targets) 129 | cat_num = len(np.unique(targets)) 130 | one_hot = np.zeros((t_length, cat_num)) 131 | for i in range(cat_num): 132 | index = np.where(targets==i)[0] 133 | one_hot[index,i]= 1 134 | return one_hot 135 | 136 | def seq_mutate(seq): 137 | mut_seq = [] 138 | for i in range(len(seq)): 139 | if seq[i] == "A" : 140 | mut_seq.extend([seq[0:i] + "C" + seq[(i+1):], seq[0:i] + "G" + seq[i+1:], seq[0:i] + "T" + seq[i+1:]]) 141 | elif seq[i] == "C" : 142 | mut_seq.extend([seq[0:i] + "A" + seq[i+1:], seq[0:i] + "G" + seq[i+1:], seq[0:i] + "T" + seq[i+1:]]) 143 | elif seq[i] == "G" : 144 | mut_seq.extend([seq[0:i] + "A" + seq[i+1:], seq[0:i] + "C" + seq[i+1:], seq[0:i] + "T" + seq[i+1:]]) 145 | else: 146 | mut_seq.extend([seq[0:i] + "A" + seq[i+1:], seq[0:i] + "C" + seq[i+1:], seq[0:i] + "G" + seq[i+1:]]) 147 | return mut_seq 148 | 149 | 150 | def load_dataset_hdf5(file_path, ss_type='seq'): 151 | 152 | def prepare_data(train, ss_type=None): 153 | if ss_type == 'struct': 154 | structure = train['inputs'][:,:,:,4:9] 155 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 156 | train['inputs'] = paired 157 | return train 158 | 159 | seq = train['inputs'][:,:,:,:4] 160 | 161 | if ss_type == 'pu': 162 | structure = train['inputs'][:,:,:,4:9] 163 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 164 | 165 | if structure.shape[-1]>3: 166 | unpaired = np.expand_dims(np.sum(structure[:,:,:,1:], axis=3), axis=3) 167 | seq = np.concatenate([seq, paired, unpaired], axis=3) 168 | elif structure.shape[-1]==1: 169 | seq = np.concatenate([seq, paired], axis=3) 170 | elif structure.shape[-1]==2: 171 | unpaired = np.expand_dims(structure[:,:,:,1], axis=3) 172 | seq = np.concatenate([seq, paired, unpaired], axis=3) 173 | elif structure.shape[-1]==3: 174 | unpaired = np.expand_dims(structure[:,:,:,1], axis=3) 175 | other = np.expand_dims(structure[:,:,:,2], axis=3) 176 | seq = np.concatenate([seq, paired, unpaired, other], axis=3) 177 | elif ss_type == 'p': 178 | structure = train['inputs'][:,:,:,4:9] 179 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 180 | seq = np.concatenate([seq, paired], axis=3) 181 | elif ss_type == 'struct': 182 | structure = train['inputs'][:,:,:,4:9] 
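            # --- Editor's note -------------------------------------------------
            # Channel layout assumed throughout prepare_data(): inputs[..., 0:4]
            # is the one-hot A/C/G/U sequence, inputs[..., 4] is the structure
            # (icSHAPE) channel used as "paired", and any remaining channels
            # (unpaired/others) are re-stacked per ss_type. A value of -1 in the
            # structure channel marks positions without a structure score (see
            # str_onehot() above and saliency_img() in train_loop.py).
            # -------------------------------------------------------------------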
183 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 184 | HIME = structure[:,:,:,1:] 185 | seq = np.concatenate([seq, paired, HIME], axis=3) 186 | train['inputs'] = seq 187 | return train 188 | 189 | # open dataset 190 | with h5py.File(file_path, 'r') as f: 191 | # load set A data 192 | X_train = np.array(f['X_train']) 193 | Y_train = np.array(f['Y_train']) 194 | X_test = np.array(f['X_test']) 195 | Y_test = np.array(f['Y_test']) 196 | 197 | 198 | 199 | # expand dims of targets 200 | if len(Y_train.shape) == 1: 201 | Y_train = np.expand_dims(Y_train, axis=1) 202 | Y_test = np.expand_dims(Y_test, axis=1) 203 | 204 | # add another dimension to make a 4d tensor 205 | X_train = np.expand_dims(X_train, axis=3).transpose([0, 2, 3, 1]) 206 | X_test = np.expand_dims(X_test, axis=3).transpose([0, 2, 3, 1]) 207 | 208 | # dictionary for each dataset 209 | train = {'inputs': X_train, 'targets': Y_train} 210 | test = {'inputs': X_test, 'targets': Y_test} 211 | 212 | 213 | # parse secondary structure profiles 214 | train = prepare_data(train, ss_type) 215 | test = prepare_data(test, ss_type) 216 | 217 | print("train:",train['inputs'].shape) 218 | print("test:",test['inputs'].shape) 219 | 220 | return train, test 221 | 222 | 223 | def process_data(train, test, method='log_norm'): 224 | """get the results for a single experiment specified by rbp_index. 225 | Then, preprocess the binding affinity intensities according to method. 226 | method: 227 | clip_norm - clip datapoints larger than 4 standard deviations from the mean 228 | log_norm - log transcormation 229 | both - perform clip and log normalization as separate targets (expands dimensions of targets) 230 | """ 231 | 232 | def normalize_data(data, method): 233 | if method == 'standard': 234 | MIN = np.min(data) 235 | data = np.log(data-MIN+1) 236 | sigma = np.mean(data) 237 | data_norm = (data)/sigma 238 | params = sigma 239 | if method == 'clip_norm': 240 | # standard-normal transformation 241 | significance = 4 242 | std = np.std(data) 243 | index = np.where(data > std*significance)[0] 244 | data[index] = std*significance 245 | mu = np.mean(data) 246 | sigma = np.std(data) 247 | data_norm = (data-mu)/sigma 248 | params = [mu, sigma] 249 | 250 | elif method == 'log_norm': 251 | # log-standard-normal transformation 252 | MIN = np.min(data) 253 | data = np.log(data-MIN+1) 254 | mu = np.mean(data) 255 | sigma = np.std(data) 256 | data_norm = (data-mu)/sigma 257 | params = [MIN, mu, sigma] 258 | 259 | elif method == 'both': 260 | data_norm1, params = normalize_data(data, 'clip_norm') 261 | data_norm2, params = normalize_data(data, 'log_norm') 262 | data_norm = np.hstack([data_norm1, data_norm2]) 263 | return data_norm, params 264 | 265 | 266 | # get binding affinities for a given rbp experiment 267 | Y_train = train['targets'] 268 | Y_test = test['targets'] 269 | #import pdb; pdb.set_trace() 270 | 271 | if len(Y_train.shape)==1: 272 | # filter NaN 273 | train_index = np.where(np.isnan(Y_train) == False)[0] 274 | test_index = np.where(np.isnan(Y_test) == False)[0] 275 | Y_train = Y_train[train_index] 276 | Y_test = Y_test[test_index] 277 | X_train = train['inputs'][train_index] 278 | X_test = test['inputs'][test_index] 279 | else: 280 | X_train = train['inputs'] 281 | X_test = test['inputs'] 282 | 283 | # normalize intenensities 284 | if method: 285 | Y_train, params_train = normalize_data(Y_train, method) 286 | Y_test, params_test = normalize_data(Y_test, method) 287 | 288 | # store sequences and intensities 289 | train = {'inputs': X_train, 
'targets': Y_train} 290 | test = {'inputs': X_test, 'targets': Y_test} 291 | 292 | return train, test 293 | 294 | 295 | def down_negative_samples(train, test, ratio=0.0): 296 | """get the results for a single experiment specified by rbp_index. 297 | Then, preprocess the binding affinity intensities according to method. 298 | method: 299 | clip_norm - clip datapoints larger than 4 standard deviations from the mean 300 | log_norm - log transcormation 301 | both - perform clip and log normalization as separate targets (expands dimensions of targets) 302 | """ 303 | if ratio==0.0: 304 | print("No negative down-sampling ratio.") 305 | return train, test 306 | 307 | X_train = train['inputs'] 308 | X_test = test['inputs'] 309 | 310 | Y_train = train['targets']#.astype(np.int32) 311 | Y_test = test['targets']#.astype(np.int32) 312 | 313 | pos_index_tr = np.where(Y_train==1)[0] 314 | pos_index_te = np.where(Y_test==1)[0] 315 | 316 | neg_index_tr = np.where(Y_train==0)[0] 317 | neg_index_te = np.where(Y_test==0)[0] 318 | 319 | n_down_neg_tr = int(ratio * (len(Y_train) - len(neg_index_tr))) 320 | n_down_neg_te = int(ratio * (len(Y_test) - len(neg_index_te))) 321 | 322 | dw_neg_index_tr = np.random.choice(neg_index_tr, size=n_down_neg_tr) 323 | dw_neg_index_te = np.random.choice(neg_index_te, size=n_down_neg_te) 324 | 325 | pos_neg_tr =np.concatenate((dw_neg_index_tr, pos_index_tr)) 326 | pos_neg_te =np.concatenate((dw_neg_index_te, pos_index_te)) 327 | 328 | train = {'inputs': X_train[pos_neg_tr], 'targets': Y_train[pos_neg_tr]} 329 | test = {'inputs': X_test[pos_neg_te], 'targets': Y_test[pos_neg_te]} 330 | 331 | return train, test 332 | 333 | 334 | def load_testset_txt_only_seq(filepath, test, return_trans_id=False, seq_length=101): 335 | print("Reading inference file(only seq):", filepath) 336 | if os.path.exists(filepath+"_test.h5"): 337 | print("loading from h5.") 338 | with h5py.File(filepath+"_test.h5", 'r') as f: 339 | # load set A data 340 | test['inputs'] = f['inputs'] 341 | test['targets'] = f['targets'] 342 | 343 | 344 | if return_trans_id: 345 | blob = np.load(filepath+"_tran.npz") 346 | trans_ids = blob['trans_ids'] 347 | return test, trans_ids 348 | else: 349 | return test 350 | 351 | seqs = [] 352 | trans_ids = [] 353 | with open(filepath,"r") as f: 354 | for line in f.readlines(): 355 | line=line.strip('\n').split('\t') 356 | if len(line[2])!=seq_length: 357 | continue 358 | trans_ids.append(line[0]) 359 | seqs.append(line[1]) 360 | print("Converting.") 361 | input = convert_one_hot(seqs, seq_length) 362 | print("Converted.") 363 | 364 | inputs = np.expand_dims(input, axis=3).transpose([0, 2, 3, 1]) 365 | targets = np.ones((inputs.shape[0],1)) 366 | targets[inputs.shape[0]-1]=0 367 | 368 | test['inputs'] =inputs 369 | test['targets'] =targets 370 | 371 | print("Saving into h5.") 372 | with h5py.File(filepath+"_test.h5", "w") as f: 373 | dset = f.create_dataset("inputs", data=inputs, compression="gzip") 374 | dset = f.create_dataset("targets", data=targets, compression="gzip") 375 | print("Saved.") 376 | 377 | if return_trans_id: 378 | trans_ids = np.array(trans_ids) 379 | return test, trans_ids 380 | else: 381 | return test 382 | 383 | 384 | 385 | def load_testset_txt(filepath, use_structure=True, seq_length=101): 386 | test = {} 387 | 388 | print("Reading inference file:", filepath) 389 | if os.path.exists(filepath+"_test.npz"): 390 | print("loading from npz.") 391 | 392 | f = np.load(filepath+"_test.npz", allow_pickle=True) 393 | test['inputs'] = f['inputs'] 394 | test['targets'] = 
f['targets'] 395 | 396 | return test 397 | 398 | in_ver = 5 399 | seqs = [] 400 | strs = [] 401 | with open(filepath,"r") as f: 402 | for line in f.readlines(): 403 | line=line.strip('\n').split('\t') 404 | if len(line[2])!=seq_length: 405 | continue 406 | seqs.append(line[2]) 407 | if use_structure: 408 | strs.append(line[3]) 409 | in_seq = convert_one_hot(seqs, seq_length) 410 | 411 | if use_structure: 412 | structure = np.zeros((len(seqs), in_ver-4, seq_length)) 413 | for i in range(len(seqs)): 414 | icshape = strs[i].strip(',').split(',') 415 | ti = [float(t) for t in icshape] 416 | ti = np.array(ti).reshape(1,-1) 417 | structure[i] = np.concatenate([ti], axis=0) 418 | input = np.concatenate([in_seq, structure], axis=1) 419 | else: 420 | input = in_seq 421 | 422 | inputs = np.expand_dims(input, axis=3).transpose([0, 3, 2, 1]) 423 | targets = np.ones((in_seq.shape[0],1)) 424 | 425 | targets[in_seq.shape[0]-1]=0 426 | 427 | test['inputs'] = inputs 428 | test['targets'] = targets 429 | print("Saving into npz.") 430 | np.savez_compressed(filepath+"_test.npz", inputs=inputs, targets=targets) 431 | print("Saved.") 432 | 433 | return test 434 | 435 | 436 | 437 | def load_testset_txt_mu(filepath, test, seq_length=101): 438 | print("Reading test file:", filepath) 439 | f_mu = open(filepath,"r") 440 | seqs = [] 441 | strs = [] 442 | use_pu = True 443 | if test['inputs'].shape[-1]==4: 444 | use_pu = False 445 | nf=0 446 | for line in f_mu.readlines(): 447 | nf+=1 448 | line=line.strip('\n').split('\t') 449 | if len(line[2])!=seq_length: 450 | continue 451 | seqs.append(line[2]) 452 | mut_seq=seq_mutate(line[2]) 453 | seqs.extend(mut_seq) 454 | if use_pu: 455 | strs.extend([line[3]] * len(seqs)) 456 | print("file line num:",nf) 457 | print("mut seq num:",len(seqs)) 458 | in_seq = munge.convert_one_hot(seqs, seq_length) 459 | in_ver = 5 460 | if use_pu: 461 | structure = np.zeros((len(seqs), in_ver-4, seq_length)) 462 | for i in range(len(seqs)): 463 | struct_list = strs[i].strip(',').split(',') 464 | ti = np.array([float(t) for t in struct_list]).reshape(1,-1) 465 | structure[i] = np.concatenate([ti], axis=0) 466 | input = np.concatenate([in_seq, structure], axis=1) 467 | else: 468 | input = in_seq 469 | 470 | inputs = np.expand_dims(input, axis=3).transpose([0, 2, 3, 1]) 471 | targets = np.ones((in_seq.shape[0],1)) 472 | 473 | targets[in_seq.shape[0]-1]=0 474 | 475 | test['inputs'] =inputs 476 | test['targets'] =targets 477 | return test 478 | 479 | 480 | def split_dataset(data, targets, valid_frac=0.2): 481 | 482 | ind0 = np.where(targets<0.5)[0] 483 | ind1 = np.where(targets>=0.5)[0] 484 | 485 | n_neg = int(len(ind0)*valid_frac) 486 | n_pos = int(len(ind1)*valid_frac) 487 | 488 | shuf_neg = np.random.permutation(len(ind0)) 489 | shuf_pos = np.random.permutation(len(ind1)) 490 | 491 | X_train = np.concatenate((data[ind1[shuf_pos[n_pos:]]], data[ind0[shuf_neg[n_neg:]]])) 492 | Y_train = np.concatenate((targets[ind1[shuf_pos[n_pos:]]], targets[ind0[shuf_neg[n_neg:]]])) 493 | train = (X_train, Y_train) 494 | 495 | X_test = np.concatenate((data[ind1[shuf_pos[:n_pos]]], data[ind0[shuf_neg[:n_neg]]])) 496 | Y_test = np.concatenate((targets[ind1[shuf_pos[:n_pos]]], targets[ind0[shuf_neg[:n_neg]]])) 497 | test = (X_test, Y_test) 498 | 499 | return train, test 500 | -------------------------------------------------------------------------------- /exp/logistic_reg/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | # Author: XU 
Kui 4 | # Created Time : 09 Nov 2020 11:14:31 PM CST 5 | # Description: 6 | decription: x 7 | """ 8 | import os,sys 9 | import numpy as np 10 | import xgboost as xgb 11 | import matplotlib 12 | matplotlib.use('pdf') 13 | import matplotlib.pyplot as plt 14 | import argparse 15 | from sklearn.metrics import r2_score 16 | # feature_list= ['AARS', 'AATF', 'ABCF1', 'AGGF1', 'AKAP1', 'AKAP8L', 'ALKBH5', 'APOBEC3C', 'AQR', 'ATXN2', 'AUH', 'BCCIP', 'BCLAF1', 'BUD13', 'C17ORF85', 'C22ORF28', 'CAPRIN1', 'CDC40', 'CPEB4', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF4', 'CPSF6', 'CPSF7', 'CSTF2', 'CSTF2T', 'DDX21', 'DDX24', 'DDX3X', 'DDX42', 'DDX51', 'DDX52', 'DDX55', 'DDX59', 'DDX6', 'DGCR8', 'DHX30', 'DKC1', 'DROSHA', 'EFTUD2', 'EIF3D', 'EIF3G', 'EIF3H', 'EIF4A3', 'eIF4AIII', 'EIF4G2', 'ELAVL1', 'EWSR1', 'EXOSC5', 'FAM120A', 'FASTKD2', 'FBL', 'FIP1L1', 'FKBP4', 'FMR1', 'FTO', 'FUS', 'FXR1', 'FXR2', 'G3BP1', 'GEMIN5', 'GNL3', 'GPKOW', 'GRWD1', 'GTF2F1', 'HLTF', 'HNRNPA1', 'HNRNPC', 'HNRNPD', 'HNRNPF', 'HNRNPK', 'HNRNPM', 'HNRNPU', 'HNRNPUL1', 'IGF2BP1', 'IGF2BP2', 'IGF2BP3', 'ILF3', 'KHDRBS1', 'KHSRP', 'LARP4', 'LARP7', 'LIN28A', 'LIN28B', 'LSM11', 'METAP2', 'METTL14', 'METTL3', 'MOV10', 'MTPAP', 'NCBP2', 'NIP7', 'NIPBL', 'NKRF', 'NOL12', 'NOLC1', 'NONO', 'NOP56', 'NOP58', 'NPM1', 'NUDT21', 'PABPC4', 'PABPN1', 'PCBP1', 'PCBP2', 'PHF6', 'PPIG', 'PRPF4', 'PRPF8', 'PTBP1', 'PTBP1PTBP2', 'PUM1', 'PUM2', 'PUS1', 'QKI', 'RBFOX2', 'RBM15', 'RBM22', 'RBM27', 'RBPMS', 'RPS11', 'RPS3', 'RTCB', 'SAFB2', 'SBDS', 'SDAD1', 'SERBP1', 'SF3A3', 'SF3B1', 'SF3B4', 'SLBP', 'SLTM', 'SMNDC1', 'SND1', 'SRRM4', 'SRSF1', 'SRSF7', 'SRSF9', 'SUB1', 'SUPV3L1', 'TAF15', 'TARDBP', 'TBRG4', 'TIA1', 'TIAL1', 'TNRC6A', 'TRA2A', 'TROVE2', 'U2AF1', 'U2AF2', 'U2AF65', 'UCHL5', 'UPF1', 'UTP18', 'UTP3', 'WDR3', 'WDR33', 'WDR43', 'WRN', 'WTAP', 'XRCC6', 'XRN2', 'YBX3', 'YTHDF2', 'YWHAG', 'ZC3H11A', 'ZC3H7B', 'ZNF622', 'ZNF800', 'ZRANB2'] 17 | feature_list= ['AARS', 'AATF', 'ABCF1', 'AGGF1', 'AKAP1', 'AKAP8L', 'ALKBH5', 'APOBEC3C', 'AQR', 'ATXN2', 'AUH', 'BCCIP', 'BCLAF1', 'BUD13', 'C17ORF85', 'C22ORF28', 'CAPRIN1', 'CDC40', 'CPEB4', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF4', 'CPSF6', 'CPSF7', 'CSTF2', 'CSTF2T', 'DDX21', 'DDX24', 'DDX3X', 'DDX42', 'DDX51', 'DDX52', 'DDX55', 'DDX59', 'DDX6', 'DGCR8', 'DHX30', 'DKC1', 'DROSHA', 'EFTUD2', 'EIF3D', 'EIF3G', 'EIF3H', 'EIF4A3', 'EIF4G2', 'ELAVL1', 'EWSR1', 'EXOSC5', 'FAM120A', 'FASTKD2', 'FBL', 'FIP1L1', 'FKBP4', 'FMR1', 'FTO', 'FUS', 'FXR1', 'FXR2', 'G3BP1', 'GEMIN5', 'GNL3', 'GPKOW', 'GRWD1', 'GTF2F1', 'HLTF', 'HNRNPA1', 'HNRNPC', 'HNRNPD', 'HNRNPF', 'HNRNPK', 'HNRNPM', 'HNRNPU', 'HNRNPUL1', 'IGF2BP1', 'IGF2BP2', 'IGF2BP3', 'ILF3', 'KHDRBS1', 'KHSRP', 'LARP4', 'LARP7', 'LIN28A', 'LIN28B', 'LSM11', 'METAP2', 'METTL14', 'METTL3', 'MOV10', 'MTPAP', 'NCBP2', 'NIP7', 'NIPBL', 'NKRF', 'NOL12', 'NOLC1', 'NONO', 'NOP56', 'NOP58', 'NPM1', 'NUDT21', 'PABPC4', 'PABPN1', 'PCBP1', 'PCBP2', 'PHF6', 'PPIG', 'PRPF4', 'PRPF8', 'PTBP1', 'PTBP1PTBP2', 'PUM1', 'PUM2', 'PUS1', 'QKI', 'RBFOX2', 'RBM15', 'RBM22', 'RBM27', 'RBPMS', 'RPS11', 'RPS3', 'RTCB', 'SAFB2', 'SBDS', 'SDAD1', 'SERBP1', 'SF3A3', 'SF3B1', 'SF3B4', 'SLBP', 'SLTM', 'SMNDC1', 'SND1', 'SRRM4', 'SRSF1', 'SRSF7', 'SRSF9', 'SUB1', 'SUPV3L1', 'TAF15', 'TARDBP', 'TBRG4', 'TIA1', 'TIAL1', 'TNRC6A', 'TRA2A', 'TROVE2', 'U2AF1', 'U2AF2', 'U2AF65', 'UCHL5', 'UPF1', 'UTP18', 'UTP3', 'WDR3', 'WDR33', 'WDR43', 'WRN', 'WTAP', 'XRCC6', 'XRN2', 'YBX3', 'YTHDF2', 'YWHAG', 'ZC3H11A', 'ZC3H7B', 'ZNF622', 'ZNF800', 'ZRANB2', 'eIF4AIII'] 18 | spec_list = 
["AUH","HNRNPC","HNRNPU","IGF2BP1","IGF2BP3","LIN28B","SND1","TAF15","TIA1","FMR1","FXR1","FXR2","ILF3","KHDRBS1","KHSRP","PTBP1","TARDBP","TNRC6A","XRN2","BCLAF1","DDX6","EXOSC5","G3BP1","LARP4","NCBP2","PABPN1","PCBP1","SUPV3L1","UPF1","YBX3","PABPC4","PUM1","PUM2","SERBP1","HNRNPD","HNRNPF","QKI"] 19 | 20 | top20_list = ['SND1', 'NPM1', 'KHDRBS1', 'GNL3', 'HNRNPUL1', 'TARDBP', 'ELAVL1', 'YTHDF2', 21 | 'YBX3', 'LIN28B', 'YWHAG', 'ZC3H7B', 'TIA1', 'PUM2', 'RBFOX2', 'SERBP1', 'RBPMS', 22 | 'RPS3', 'PUM1', 'PRPF8'] 23 | spec_top_list = ['AUH', 'BCLAF1', 'DDX6', 'ELAVL1', 'EXOSC5', 'FMR1', 'FXR1', 'FXR2', 'G3BP1', 'GNL3', 'HNRNPC', 'HNRNPD', 'HNRNPF', 'HNRNPU', 'HNRNPUL1', 'IGF2BP1', 'IGF2BP3', 'ILF3', 'KHDRBS1', 'KHSRP', 'LARP4', 'LIN28B', 'NCBP2', 'NPM1', 'PABPC4', 'PABPN1', 'PCBP1', 'PRPF8', 'PTBP1', 'PUM1', 'PUM2', 'QKI', 'RBFOX2', 'RBPMS', 'RPS3', 'SERBP1', 'SND1', 'SUPV3L1', 'TAF15', 'TARDBP', 'TIA1', 'TNRC6A', 'UPF1', 'XRN2', 'YBX3', 'YTHDF2', 'YWHAG', 'ZC3H7B'] 24 | import scipy.stats 25 | 26 | import termplotlib as tpl 27 | # from data_utils import load_data 28 | 29 | import pickle 30 | from sklearn import datasets, ensemble 31 | # from sklearn.ensemble import HistGradientBoostingRegressor 32 | from sklearn.inspection import permutation_importance 33 | 34 | 35 | 36 | def plot(x, y, label="plot"): 37 | fig = tpl.figure() 38 | fig.plot(x, y, label=label, width=50, height=15) 39 | fig.show() 40 | 41 | def plot_hist(sample,bins=40): 42 | counts, bin_edges = np.histogram(sample, bins=bins) 43 | fig = tpl.figure() 44 | fig.hist(counts, bin_edges, grid=[15, 25], orientation="horizontal",force_ascii=False) 45 | fig.show() 46 | 47 | def normy(x): 48 | return (x-x.min())/(x.max() - x.min()) 49 | 50 | def normx(x): 51 | return 1/(1 + np.exp(-x.astype("float"))) 52 | # return (x-x.mean())/x.std() 53 | 54 | def get_topk_important_fea(filepath, topk=4): 55 | global feature_list 56 | feature_name=np.array(feature_list) 57 | weight = np.load(filepath, allow_pickle=True) 58 | gain = weight['gain'].tolist() 59 | fea_gain = np.zeros(len(gain)) 60 | for i in range(len(gain)): 61 | fea_gain[i] = gain['f'+str(i)] 62 | topk_flist = fea_gain.argsort()[::-1][:topk] 63 | 64 | 65 | return topk_flist 66 | 67 | 68 | def get_topk_important_fea1(reg, topk=4): 69 | # global feature_list 70 | # feature_name=np.array(feature_list) 71 | # weight = np.load(filepath, allow_pickle=True) 72 | # fscore = bst.get_fscore() 73 | feature_importance = reg.feature_importances_ 74 | topk_flist = np.argsort(feature_importance)[::-1][:topk] 75 | return topk_flist 76 | 77 | def get_topk_important_fea2(bst, topk=4): 78 | global feature_list 79 | feature_name=np.array(feature_list) 80 | # weight = np.load(filepath, allow_pickle=True) 81 | # fscore = bst.get_fscore() 82 | fscore = bst.get_score(importance_type='gain') 83 | fea_fscore = np.zeros(len(fscore)) 84 | for i in range(len(fscore)):fea_fscore[i] = fscore['f'+str(i)] 85 | topk_flist = fea_fscore.argsort()[::-1][:topk] 86 | return topk_flist 87 | 88 | 89 | 90 | ## 91 | # this script demonstrate how to fit generalized linear model in xgboost 92 | # basically, we are using linear model, instead of tree for our boosters 93 | 94 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 95 | parser.add_argument('--batch-size', type=int, default=640, metavar='N', 96 | help='input batch size for training (default: 64)') 97 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 98 | help='input batch size for testing (default: 1000)') 99 | 
parser.add_argument('--epochs', type=int, default=400, metavar='N', 100 | help='number of epochs to train (default: 14)') 101 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 102 | help='learning rate (default: 1.0)') 103 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 104 | help='Learning rate step gamma (default: 0.7)') 105 | parser.add_argument('--no-cuda', action='store_true', default=False, 106 | help='disables CUDA training') 107 | parser.add_argument('--cv', action='store_true', default=False, 108 | help='quickly check a single pass') 109 | parser.add_argument('--seed', type=int, default=1, metavar='S', 110 | help='random seed (default: 1)') 111 | parser.add_argument('--log-interval', type=int, default=100, metavar='N', 112 | help='how many batches to wait before logging training status') 113 | parser.add_argument('--save-model', action='store_true', default=False, 114 | help='For Saving the current Model') 115 | parser.add_argument('--train_data', default='', type=str, 116 | help="path of the training data to use") 117 | parser.add_argument('--test_data', default='', type=str, 118 | help="path of the training data to use") 119 | parser.add_argument('--pred_data', default='', type=str, 120 | help="path of the training data to use") 121 | parser.add_argument('--model_path', default='', type=str, 122 | help="path of the training data to use") 123 | parser.add_argument('--reg', default='squarederror', type=str, 124 | help="path of the training data to use") 125 | parser.add_argument('--booster', default='gbtree', type=str, 126 | help="path of the training data to use") 127 | parser.add_argument('--lam', type=int, default=-1, 128 | help='L2 reg (default: -1)' 129 | ) 130 | parser.add_argument('--topk', type=int, default=0, 131 | help='topk features (default: 0)') 132 | parser.add_argument('--randk', type=int, default=0, 133 | help='random k features (default: 0)') 134 | parser.add_argument('--load_best', action='store_true', default=False, 135 | help='load best model') 136 | parser.add_argument('--fine_tune', action='store_true', default=False, 137 | help='fine tuning ') 138 | parser.add_argument('--cell_expr', action='store_true', default=False, 139 | help='using cell expression') 140 | parser.add_argument('--normx', action='store_true', default=False, 141 | help='norm input data') 142 | parser.add_argument('--plot', action='store_true', default=False, 143 | help='norm input data') 144 | parser.add_argument('--fsel', type=int, default=1, 145 | help='feature selector (default: -1)' 146 | ) 147 | parser.add_argument('--sellist', type=int, default=0, 148 | help='feature selector (default: -1)' 149 | ) 150 | args = parser.parse_args() 151 | 152 | traindata = args.train_data 153 | testdata = args.test_data 154 | preddata = args.pred_data 155 | 156 | if not os.path.exists(preddata): 157 | print(preddata," not found.") 158 | preddata = "" 159 | if args.fine_tune: 160 | traindata = preddata.replace(".train.npz",".test.npz") 161 | print("Fine-tune on ",traindata) 162 | 163 | print("Reading train data:",traindata) 164 | print("Reading test data:",testdata) 165 | print("Reading pred data:",preddata) 166 | 167 | t_data = np.load(traindata,allow_pickle=True) 168 | e_data = np.load(testdata,allow_pickle=True) 169 | if preddata!="": 170 | p_data = np.load(preddata,allow_pickle=True) 171 | 172 | t_x = t_data['x'] 173 | t_y = t_data['y'] 174 | e_x = e_data['x'] 175 | e_y = e_data['y'] 176 | if preddata!="": 177 | p_x = p_data['x'] 178 | p_y = p_data['y'] 179 | 
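# --- Editor's note -----------------------------------------------------------
# Layout of the .npz files loaded above, as implied by how this script indexes
# them: key 'x' is an (N, n_features) matrix with one column per RBP in
# `feature_list` (171 columns, judging by the np.random.randint(171, ...) call
# in the --randk branch below), and key 'y' is the length-N regression target,
# later rescaled with (y + 1) / 2. A toy file with the same structure could be
# written as (hypothetical, for illustration only):
#   np.savez_compressed("toy.train.npz",
#                       x=np.random.rand(1000, len(feature_list)),
#                       y=np.random.uniform(-1, 1, size=1000))
# ------------------------------------------------------------------------------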
180 | print(" train X: min,max: {:.3f} {:.3f} {}".format(t_x.min(), t_x.max(), t_x.shape)) 181 | print(" train Y: min,max: {:.3f} {:.3f}".format(t_y.min(), t_y.max())) 182 | print(" test X: min,max: {:.3f} {:.3f} {}".format(e_x.min(), e_x.max(), e_x.shape)) 183 | print(" test Y: min,max: {:.3f} {:.3f}".format(e_y.min(), e_y.max())) 184 | if preddata!="": 185 | print(" pred X: min,max: {:.3f} {:.3f}".format(p_x.min(), p_x.max(), p_x.shape)) 186 | print(" pred Y: min,max: {:.3f} {:.3f}".format(p_y.min(), p_y.max())) 187 | 188 | 189 | 190 | norm_x = args.normx 191 | if norm_x: 192 | t_x = normx(t_x) 193 | e_x = normx(e_x) 194 | if preddata!="": 195 | p_x = normx(p_x) 196 | 197 | # plot_hist(t_y) 198 | # print("-------------------------------------------") 199 | norm_y = True 200 | if norm_y: 201 | # t_y0 = np.zeros_like(t_y) 202 | # e_y0 = np.zeros_like(t_y) 203 | # import math 204 | # for i in range(t_y.shape[0]): 205 | # t_y0[i] = math.log(t_y[i]+1) 206 | 207 | # for i in range(e_y.shape[0]): 208 | # e_y0[i] = math.log(e_y[i]+1) 209 | # t_y = t_y0 210 | # e_y = e_y0 211 | # import pdb; pdb.set_trace() 212 | # t_y = np.log((t_y+1).astype("float")) 213 | # e_y = np.log((e_y+1).astype("float")) 214 | # t_x = abs(t_x) 215 | # e_x = abs(e_x) 216 | # t_y = np.log((t_y/2+2).astype("float")) 217 | # e_y = np.log((e_y/2+2).astype("float")) 218 | # t_x = t_x/10 219 | # e_x = e_x/10 220 | # t_y = np.log((t_y/2+2).astype("float")) 221 | # e_y = np.log((e_y/2+2).astype("float")) 222 | 223 | t_y = (t_y+1)/2 224 | e_y = (e_y+1)/2 225 | 226 | if preddata!="": 227 | # import pdb; pdb.set_trace() 228 | p_y = (p_y+1)/2 229 | 230 | 231 | # plot_hist(t_y) 232 | feature_name=np.array(feature_list) 233 | # if args.topk>0: 234 | # # filepath = args.model_path+"_weight_eval_test.npz" 235 | # # topk_list = get_topk_important_fea(filepath, topk=args.topk) 236 | # # feature_list = feature_name[topk_list] 237 | # # print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 238 | # bst = xgb.Booster(model_file=args.model_path) 239 | # topk_list = get_topk_important_fea2(bst,args.topk) 240 | # feature_list = feature_name[topk_list] 241 | # t_x = t_x[:,topk_list] 242 | # e_x = e_x[:,topk_list] 243 | # if preddata!="": 244 | # p_x = p_x[:,topk_list] 245 | # print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 246 | # # import pdb; pdb.set_trace() 247 | # args.model_path = args.model_path.replace("_best.model", "_topk{}_best.model".format(args.topk)) 248 | if args.sellist>0: 249 | if args.sellist==1: 250 | topk_list=[feature_list.index(p) for p in spec_list] 251 | 252 | elif args.sellist==2: 253 | topk_list=[feature_list.index(p) for p in spec_top_list] 254 | elif args.sellist==3: # top 20 255 | topk_list=[feature_list.index(p) for p in top20_list] 256 | else: 257 | raise "error no such list." 
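        # --- Editor's note ----------------------------------------------------
        # Raising a bare string is a TypeError on Python 3 ("exceptions must
        # derive from BaseException"); something like
        # raise ValueError("no such feature list") would be needed if this
        # branch were ever reached.
        # ----------------------------------------------------------------------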
258 | 259 | 260 | feature_list = feature_name[topk_list] 261 | t_x = t_x[:,topk_list] 262 | e_x = e_x[:,topk_list] 263 | if preddata!="": 264 | p_x = p_x[:,topk_list] 265 | print("Using Top {} features: {}".format(args.topk, topk_list)) 266 | print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 267 | # import pdb; pdb.set_trace() 268 | # args.model_path = args.model_path.replace("_best.skl", "_topk{}_best.skl".format(args.topk)) 269 | args.model_path = args.model_path.replace("_best.model", "_spec{}_best.model".format(args.sellist)) 270 | 271 | 272 | if args.topk>0: 273 | 274 | # filepath = args.model_path+"_weight_eval_test.npz" 275 | # topk_list = get_topk_important_fea(filepath, topk=args.topk) 276 | # feature_list = feature_name[topk_list] 277 | # print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 278 | # bst = xgb.Booster(model_file=args.model_path) 279 | 280 | if args.fsel ==1: 281 | skl_model_path = args.model_path.replace("_best.model", "_best.skl") 282 | reg0 = pickle.load(open(skl_model_path, 'rb')) 283 | print("topk important_fea") 284 | topk_list = get_topk_important_fea1(reg0,args.topk) 285 | elif args.fsel ==2: 286 | skl_model_path = args.model_path.replace("_best.model", "_best.skl") 287 | reg0 = pickle.load(open(skl_model_path, 'rb')) 288 | print("topk permutation_importance") 289 | 290 | result = permutation_importance(reg0, e_x, e_y, n_repeats=10, 291 | random_state=42, n_jobs=2) 292 | topk_list = result.importances_mean.argsort()[::-1][:args.topk]#[::-1] 293 | else: 294 | print("topk gain") 295 | bst = xgb.Booster(model_file=args.model_path) 296 | topk_list = get_topk_important_fea2(bst,args.topk) 297 | 298 | 299 | feature_list = feature_name[topk_list] 300 | t_x = t_x[:,topk_list] 301 | e_x = e_x[:,topk_list] 302 | if preddata!="": 303 | p_x = p_x[:,topk_list] 304 | print("Using Top {} features: {}".format(args.topk, topk_list)) 305 | print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 306 | # import pdb; pdb.set_trace() 307 | # args.model_path = args.model_path.replace("_best.skl", "_topk{}_best.skl".format(args.topk)) 308 | args.model_path = args.model_path.replace("_best.model", "_topk{}_best.model".format(args.topk)) 309 | 310 | if args.randk>0: 311 | # filepath = args.model_path+"_weight_eval_test.npz" 312 | topk_list = np.random.randint(171, size=args.randk) 313 | feature_list = feature_name[topk_list] 314 | t_x = t_x[:,topk_list] 315 | e_x = e_x[:,topk_list] 316 | if preddata!="": 317 | p_x = p_x[:,topk_list] 318 | print("Using Random {} features: {}".format(args.randk, feature_name[topk_list])) 319 | # print("Using Random {} features.".format(args.randk)) 320 | args.model_path = args.model_path.replace("_best.model", "_randk{}_best.model".format(args.randk)) 321 | 322 | print(" train X: min,max: {:.3f} {:.3f} {}".format(t_x.min(), t_x.max(), t_x.shape)) 323 | print(" train Y: min,max: {:.3f} {:.3f}".format(t_y.min(), t_y.max())) 324 | print(" test X: min,max: {:.3f} {:.3f} {}".format(e_x.min(), e_x.max(), e_x.shape)) 325 | print(" test Y: min,max: {:.3f} {:.3f}".format(e_y.min(), e_y.max())) 326 | if preddata!="": 327 | print(" pred X: min,max: {:.3f} {:.3f}".format(p_x.min(), p_x.max(), p_x.shape)) 328 | print(" pred Y: min,max: {:.3f} {:.3f}".format(p_y.min(), p_y.max())) 329 | 330 | 331 | # dtrain = xgb.DMatrix(t_x, label=t_y, feature_names=feature_list) 332 | # dtest = xgb.DMatrix(e_x, label=e_y, feature_names=feature_list) 333 | # if preddata!="": 334 | # dpred = xgb.DMatrix(p_x, 
label=p_y, feature_names=feature_list) 335 | # import pdb; pdb.set_trace() 336 | 337 | dtrain = xgb.DMatrix(t_x, label=t_y) 338 | dtest = xgb.DMatrix(e_x, label=e_y) 339 | if preddata!="": 340 | dpred = xgb.DMatrix(p_x, label=p_y) 341 | # change booster to gblinear, so that we are fitting a linear model 342 | # alpha is the L1 regularizer 343 | # lambda is the L2 regularizer 344 | # you can also set lambda_bias which is L2 regularizer on the bias term 345 | param = {'objective':'reg:squarederror', 'booster':'gbtree',"eval_metric": 'rmse', 346 | 'lambda': 16, 'eta':0.1} 347 | param = {'objective':'reg:'+args.reg, 'booster':args.booster,"eval_metric": 'rmse', 348 | 'lambda': 16, 'eta':0.1} 349 | print(param) 350 | # normally, you do not need to set eta (step_size) 351 | # XGBoost uses a parallel coordinate descent algorithm (shotgun), 352 | # there could be affection on convergence with parallelization on certain cases 353 | # setting eta to be smaller value, e.g 0.5 can make the optimization more stable 354 | # param['eta'] = 1 355 | 356 | ## 357 | # the rest of settings are the same 358 | ## 359 | watchlist = [(dtrain, 'train'),(dtest, 'eval'), ] 360 | num_round = 3000 361 | best_r = 0 362 | best_l = 0 363 | best_p = 0 364 | 365 | 366 | for la in range(0, 30, 2): 367 | if args.lam >= 0: 368 | param['lambda']=args.lam 369 | else: 370 | param['lambda'] = la 371 | print('lambda:', param['lambda']) 372 | 373 | early_stopping_rounds = 40 374 | if args.load_best: 375 | print("Loading best model.") 376 | bst = xgb.Booster(model_file=args.model_path) 377 | # topk_list = get_topk_important_fea2(bst,args.topk) 378 | # feature_list = feature_name[topk_list] 379 | # print("Using Top {} features22: {}".format(args.topk, feature_name[topk_list])) 380 | # import pdb; pdb.set_trace() 381 | # bst.save_model(args.model_path) 382 | if args.fine_tune: 383 | print("Fine tuning.") 384 | early_stop = xgb.callback.EarlyStopping( 385 | rounds=early_stopping_rounds, 386 | metric_name='rmse', 387 | save_best=True, 388 | data_name='eval' 389 | ) 390 | bst = xgb.train(param, dtrain, num_round, watchlist, callbacks=[early_stop],) 391 | args.model_path = args.model_path.replace("_best.model", "_finetune_best.model") 392 | elif args.cv: 393 | nfold = 5 394 | print("Do Cross Validation: {} fold.".format(nfold)) 395 | param['verbosity']=1 396 | hist = xgb.cv(param, dtrain, num_round, 397 | nfold=nfold, 398 | verbose_eval=True, 399 | early_stopping_rounds=early_stopping_rounds) 400 | print(hist) 401 | else: 402 | early_stop = xgb.callback.EarlyStopping( 403 | rounds=early_stopping_rounds, 404 | metric_name='rmse', 405 | save_best=True, 406 | data_name='eval' 407 | ) 408 | bst = xgb.train(param, dtrain, num_round, watchlist, callbacks=[early_stop],) 409 | bst.save_model(args.model_path) 410 | gain = bst.get_score(importance_type='gain') 411 | total_gain = bst.get_score(importance_type='total_gain') 412 | 413 | e_preds = bst.predict(dtest) 414 | e_labels = dtest.get_label() 415 | r, p = scipy.stats.pearsonr(e_labels, e_preds) 416 | # r2=r2_score(labels, preds) 417 | print("Test R: {:f}, R^2: {:f}, P-value: {:e}".format(r, r**2, p)) 418 | # if preddata!="": 419 | # r,p = predict(bst, dtest) 420 | if r> best_r: 421 | best_r = r 422 | best_p = p 423 | best_l = la 424 | print("### -> Best...") 425 | 426 | if preddata!="": 427 | # print("dpred: ",p_x.shape) 428 | p_preds = bst.predict(dpred) 429 | p_labels = dpred.get_label() 430 | r, p = scipy.stats.pearsonr(p_labels, p_preds) 431 | print("Pred R: {:f}, R^2: {:f}, P-value: 
{:e}".format(r, r**2, p)) 432 | else: 433 | p_labels=None 434 | p_preds=None 435 | 436 | 437 | np.savez_compressed(args.model_path+"_weight_eval_test.npz", 438 | gain=gain, 439 | total_gain=total_gain, 440 | eval_label=e_labels, 441 | eval_pred=e_preds, 442 | test_label=p_labels, 443 | test_pred=p_preds, 444 | ) 445 | if args.lam >= 0: # pred 446 | sys.exit(0) 447 | 448 | print("Best la: {}\nR: {:f}, R^2: {:f}, P-value: {:e}".format(best_l, best_r, best_r**2, p)) 449 | # print("Best R: {:f}, R^2: {:f}, lambda: {}".format(best_r, best_r**2, best_l)) 450 | # gain = bst.get_score(importance_type='gain') 451 | # total_gain = bst.get_score(importance_type='total_gain') 452 | # np.savez_compressed("low_fi_{:d}.npz".format(la), gain=gain, total_gain=total_gain) 453 | # xgb.plot_importance(bst,importance_type='gain', max_num_features=20) 454 | # plt.savefig("fi_{:d}.pdf".format(la)) 455 | # print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds)))) 456 | -------------------------------------------------------------------------------- /motif_construct/saliency_motif.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use Cwd; 4 | use List::Util qw/max min sum maxstr minstr shuffle/; 5 | 6 | my $infile = $ARGV[0]; 7 | my $prot_cell = $ARGV[1]; 8 | 9 | my $usage = "This script is to build the motif from PrismNet model output attention file. 10 | usage: $0 11 | 12 | example: perl saliency_motif.pl infile.sal outfile 13 | "; 14 | die $usage if $#ARGV<1; 15 | 16 | #the input file containing the attention signal. 17 | #my $infile = $prot_cell."_5v_binary_99999_binary_icbind_1_pu_ana0_test.txt"; 18 | 19 | 20 | my $site_file = $prot_cell."_seq_20_8"; 21 | my $kmer_file = $prot_cell."_seq_6kmer"; 22 | my $motif_file = $prot_cell."_motif_10"; 23 | 24 | my $bind_inf = &fdata_read3($infile); 25 | my $site_seq = &bind_select3($bind_inf, 10, 0.8, 0.2, $prot_cell); 26 | my %sseq = %{$site_seq}; 27 | 28 | open(OUT1, ">", $site_file."_seq.fa"); 29 | foreach my $k1 (sort {${$sseq{$b}}[0] <=> ${$sseq{$a}}[0]} keys %sseq){ 30 | print OUT1 ">",$k1,"|",${$sseq{$k1}}[0],"\n",${$sseq{$k1}}[2],"\n",${$sseq{$k1}}[3],"\n",${$sseq{$k1}}[4],"\n"; 31 | } 32 | close OUT1; 33 | 34 | 35 | my ($Kmer1, $Kmer_loc) = &kmer_cal2($site_seq, 6); 36 | my %kkmer1 = %{$Kmer1}; 37 | my %kkmer_loc = %{$Kmer_loc}; 38 | 39 | open(OUT1, ">", $kmer_file."_seq.txt"); 40 | foreach my $k1 (sort {$kkmer1{$b} <=> $kkmer1{$a}} keys %kkmer1){ 41 | print OUT1 $k1,"\t",$kkmer1{$k1},"\n"; 42 | } 43 | close OUT1; 44 | 45 | 46 | my ($Motif_matrix1, $Motif_matrix2) = &combine_kmer($Kmer1, "-ACGT", $Kmer_loc, $bind_inf, $prot_cell); 47 | &motif_print($Motif_matrix1, $prot_cell, 10, $motif_file."_seq.meme", "-ACGT"); 48 | 49 | my %mmat1 = %{$Motif_matrix1}; 50 | my %mmat2 = %{$Motif_matrix2}; 51 | open(OUT1, ">", $motif_file."_str.meme"); 52 | foreach my $k1 (sort {$a<=>$b} keys %mmat2){ 53 | my %tmp = %{$mmat2{$k1}}; 54 | print OUT1 $k1,"\n"; 55 | foreach my $k1 (sort {$a<=>$b} keys %tmp){ 56 | #print OUT1 ${$tmp{$k1}}[0],"|",${$tmp{$k1}}[1],"\t"; 57 | print OUT1 sprintf("%.4f", ${$tmp{$k1}}[0]),"|",sprintf("%.4f", ${$tmp{$k1}}[1]),"\t"; 58 | } 59 | print OUT1 "\n"; 60 | } 61 | close OUT1; 62 | 63 | open(OUT1, ">", $motif_file."_seq.txt"); 64 | foreach my $k1 (sort {$a<=>$b} keys %mmat1){ 65 | my %tmp = %{$mmat1{$k1}}; 66 | #print OUT1 $k1,"\n"; 67 | for(my $i=0; $i<=3; $i++){ 68 | foreach my $k2 (sort {$a<=>$b} keys %tmp){ 69 | 
#print OUT1 ${$tmp{$k1}}[0],"|",${$tmp{$k1}}[1],"\t"; 70 | print OUT1 sprintf("%.4f", ${$tmp{$k2}}[$i]),"\t"; 71 | } 72 | print OUT1 "\n"; 73 | } 74 | } 75 | close OUT1; 76 | 77 | open(OUT1, ">", $motif_file."_str.txt"); 78 | foreach my $k1 (sort {$a<=>$b} keys %mmat2){ 79 | my %tmp = %{$mmat2{$k1}}; 80 | #print OUT1 $k1,"\n"; 81 | for(my $i=0; $i<=1; $i++){ 82 | foreach my $k2 (sort {$a<=>$b} keys %tmp){ 83 | #print OUT1 ${$tmp{$k1}}[0],"|",${$tmp{$k1}}[1],"\t"; 84 | print OUT1 sprintf("%.4f", ${$tmp{$k2}}[$i]),"\t"; 85 | } 86 | print OUT1 "\n"; 87 | } 88 | } 89 | close OUT1; 90 | 91 | my $summary_file = $prot_cell."_summary.txt"; 92 | my $meme_file = $prot_cell."_motif_10_seq.meme"; 93 | my $seq_file = $prot_cell."_motif_10_seq.txt"; 94 | my $str_file = $prot_cell."_motif_10_str.txt"; 95 | 96 | my $tmeme_file = $prot_cell."_top10_motif_10_seq.meme"; 97 | my $tmeme_file2 = $prot_cell."_top10_motif_10_seq2.meme"; 98 | 99 | my $sum_out = $prot_cell."_motif_summary.txt"; 100 | my $seqstr_out = $prot_cell."_motif_prob.txt"; 101 | #RBFOX2_mes_summary.txt 102 | #RBFOX2_mes_motif_10_seq.txt 103 | #RBFOX2_mes_motif_10_str.txt 104 | #RBFOX2_mes_motif_10_seq.meme 105 | 106 | `head -n 240 $meme_file > $tmeme_file`; 107 | `cp $tmeme_file $tmeme_file2`; 108 | `tomtom -o $prot_cell $tmeme_file $tmeme_file2`; 109 | 110 | my $motif_similar = $prot_cell."/tomtom.txt"; 111 | my $Sinf = &read_summary($summary_file); 112 | my $Seq_inf = &read_seq_count($seq_file); 113 | my $Str_inf = &read_str_count($str_file); 114 | 115 | my %sinf = %{$Sinf}; my %seq_inf = %{$Seq_inf}; my %str_inf = %{$Str_inf}; 116 | my %finf = (); my %fsinf = (); 117 | 118 | my $Motif_com = &read_tomtom($motif_similar, $prot_cell, \%sinf); 119 | my %motif_com = %{$Motif_com}; 120 | my $num = 0; my $pnum = 0; my $unum = 0; 121 | foreach my $key (keys %sinf){ 122 | my $count = (${$sinf{$key}}[1] =~ s/U/U/g); 123 | if($count >= 4){ 124 | $unum = $unum + ${$sinf{$key}}[2]; 125 | }else{ 126 | $pnum = $pnum + ${$sinf{$key}}[2]; 127 | } 128 | $num = $num + ${$sinf{$key}}[2]; 129 | } 130 | 131 | foreach my $key ( sort{$a<=>$b} keys %motif_com){ 132 | my @sen = @{$motif_com{$key}}; 133 | #print $key,"\t",join("|", @sen),"\n"; 134 | } 135 | 136 | foreach my $key ( sort{$a<=>$b} keys %motif_com){ 137 | my @sen = @{$motif_com{$key}}; 138 | #print ">",$key,"\t",join("|",@sen),"\n"; 139 | if($#sen == -1){ 140 | $finf{$key} = $sinf{$key}; 141 | $fsinf{$key} = $seq_inf{$key}; 142 | ${$fsinf{$key}}{4} = ${$str_inf{$key}}{0}; 143 | ${$fsinf{$key}}{5} = ${$str_inf{$key}}{1}; 144 | }else{ 145 | $finf{$key} = $sinf{$key}; 146 | $fsinf{$key} = $seq_inf{$key}; 147 | ${$fsinf{$key}}{4} = ${$str_inf{$key}}{0}; 148 | ${$fsinf{$key}}{5} = ${$str_inf{$key}}{1}; 149 | for(my $i=0; $i<=$#sen; $i++){ 150 | my @sent1 = split(/\|/, $sen[$i]); 151 | my $shf = -$sent1[1]; 152 | ${$finf{$key}}[2] = ${$finf{$key}}[2] + ${$sinf{$sent1[0]}}[2]; 153 | for(my $i=0; $i<=3; $i++){ 154 | for(my $j=max(0-$shf, 0); $j<=min(9-$shf, 9); $j++){ 155 | ${${$fsinf{$key}}{$i}}[$j] = ${${$fsinf{$key}}{$i}}[$j] + ${${$seq_inf{$sent1[0]}}{$i}}[$j+$shf]; 156 | } 157 | } 158 | for(my $i=0; $i<=1; $i++){ 159 | for(my $j=max(0-$shf, 0); $j<=min(9-$shf, 9); $j++){ 160 | ${${$fsinf{$key}}{$i+4}}[$j] = ${${$fsinf{$key}}{$i+4}}[$j] + ${${$str_inf{$sent1[0]}}{$i}}[$j+$shf]; 161 | } 162 | } 163 | } 164 | } 165 | } 166 | 167 | open(OUT1, ">", $sum_out); 168 | open(OUT2, ">", $seqstr_out); 169 | print OUT1 "motif_id\tmotif_site\tmotif_weight\n"; 170 | 171 | my @labc = ("seq_A", "seq_C", "seq_G", "seq_U", 
"str_P", "str_U"); 172 | #print OUT1 $num,"\t",$unum,"\t",$unum/$num,"\t",$pnum,"\t",$pnum/$num,"\n"; 173 | foreach my $key (sort{${$finf{$b}}[2] <=> ${$finf{$a}}[2]} keys %finf){ 174 | print OUT1 ${$finf{$key}}[0],"|",${$finf{$key}}[1],"\t",${$finf{$key}}[2],"\t",${$finf{$key}}[2]/$num,"\n"; 175 | my $nsite = ${$finf{$key}}[2]; 176 | #for(my $i=0; $i<=5; $i++){ 177 | # for(my $j=0; $j<=9; $j++){ 178 | # print OUT2 ${${$fsinf{$key}}{$i}}[$j],"\t"; 179 | # } 180 | # print OUT2 "\n"; 181 | #} 182 | for(my $i=0; $i<=5; $i++){ 183 | print OUT2 $key."_".$labc[$i],"\t"; 184 | for(my $j=0; $j<=9; $j++){ 185 | print OUT2 ${${$fsinf{$key}}{$i}}[$j]/$nsite,"\t"; 186 | } 187 | print OUT2 "\n"; 188 | } 189 | } 190 | 191 | close OUT1; 192 | close OUT2; 193 | 194 | #id,int 195 | #label,int 196 | #Predictscore,float 197 | #Sequence,str,101 198 | #Icshape, float,101 199 | #Saliency,101x5(5v), 101x6(6v,7v) 200 | 201 | sub fdata_read3{ 202 | my $fdata_file = shift; 203 | my $i = 0; my $j = 0; my $r = 0; 204 | my $sen = ""; my $sen1 = ""; my $seq = ""; my $sna = ""; 205 | my @sen = (); my @sen1 = (); my @sen2 = (); 206 | my %inf = (); 207 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 208 | #11 5b724f3fcd6c7ec4fa054a09f038a70e 1.000 0.999 AUAAUUUUUUUCACUGUGCACCAGCAUCAGCAUCACUGUGUACCAGCAUCAGCAUCACUGUGUACCAGCAUCAGCAUCACUGUGUAUCAGCAUCAGCAUCACUG 0.55708003,0.0,0.0,0.0,0.0,0.5653001,0.0,0.0,0.0,0.45 0.55708003,0.0,0.0,0.0,0.0,0.5653001,0.0,0.0,0.0,0.45 209 | $sna = 0; 210 | while($sen = ){ 211 | chomp($sen); 212 | @sen1 = split(/\t/, $sen); 213 | if($sen1[2] == 1){ 214 | @sen2 = split(/\,/, $sen1[6]); 215 | my @ics = split(/\,/, $sen1[5]); 216 | my $num = 0; 217 | my @sent1 = (); 218 | my @psent = (); my @usent = (); 219 | for($i=0; $i<=100; $i++){ 220 | $num = max($sen2[$i*5], $sen2[$i*5+1], $sen2[$i*5+2], $sen2[$i*5+3]); 221 | push(@sent1, $num); 222 | push(@psent, $sen2[$i*5+4]); 223 | } 224 | my $seq_map = join("|", @sent1); 225 | $inf{$sen1[0]} = [$sen1[3], $sen1[4], $seq_map, join("|", @ics), join("|", @psent)]; 226 | } 227 | } 228 | close FILE1; 229 | return(\%inf); 230 | } 231 | 232 | sub bind_select3{ 233 | my $Inf = shift; my $len = shift; my $bind_score = shift; my $per = shift; my $protein_name = shift; 234 | my ($score1, $score2) = &max_per_seq_str($Inf, $len, $per, $bind_score); 235 | my %inf = %{$Inf}; my %site_inf = (); 236 | my $sna = ""; my $key = ""; 237 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 238 | if(${$inf{$key}}[0] < $bind_score){ 239 | last; 240 | } 241 | my @sent1 = split(/\|/, ${$inf{$key}}[2]); 242 | shift(@sent1); 243 | my @sent2 = split(/\|/, ${$inf{$key}}[4]); 244 | shift(@sent2); 245 | my $i = 0; my $j = 0; my $index = 0; my $maxn = 0; my $sum1 = 0; my $sum2 = 0; my $sta = -1; my $end = -1; my $tsum1 = 0; my $tsum2 = 0; 246 | my $samstr = &get_str(${$inf{$key}}[1], ${$inf{$key}}[3], $protein_name); 247 | $samstr =~s/\./U/g; 248 | $samstr =~s/\(/P/g; 249 | $samstr =~s/\)/P/g; 250 | for($i=0; $i<=$#sent1-$len+1; $i++){ 251 | $sum1 = 0; $sum2 = 0; 252 | for($j=0; $j<$len; $j++){ 253 | $sum1 = $sum1 + $sent1[$i+$j]; 254 | $sum2 = $sum2 + $sent2[$i+$j]; 255 | } 256 | if(($sum1 > $score1) && ($sum2 > $score2)){ 257 | if($end >= $i){ 258 | $end = $i + $len - 1; 259 | }elsif($end > 0){ 260 | my $sna = $key."_".$sta; 261 | $site_inf{$sna} = [$tsum1, $tsum2, substr(${$inf{$key}}[1], $sta, $end - $sta + 1), &subshape(${$inf{$key}}[3], $sta, $end - $sta + 1), substr($samstr, $sta, $end - $sta + 1)]; 262 | $sta = $i; 263 | $end = $i + $len - 1; 264 | 
$tsum1 = $sum1; 265 | $tsum2 = $sum2; 266 | }else{ 267 | $sta = $i; 268 | $end = $i + $len - 1; 269 | $tsum1 = $sum1; 270 | $tsum2 = $sum2; 271 | } 272 | } 273 | } 274 | if($end > 0){ 275 | my $sna = $key."_".$sta; 276 | $site_inf{$sna} = [$tsum1, $tsum2, substr(${$inf{$key}}[1], $sta, $end - $sta + 1), &subshape(${$inf{$key}}[3], $sta, $end - $sta + 1), substr($samstr, $sta, $end - $sta + 1)]; 277 | } 278 | } 279 | return(\%site_inf); 280 | } 281 | 282 | sub subshape{ 283 | my $Shape_list = shift; my $sta = shift; my $len = shift; 284 | #my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 285 | my @sent1 = split(/\|/, $Shape_list); 286 | my @sent2 = (); 287 | #shift(@sent1); 288 | my $i = 0; my $str_seq = ""; 289 | for($i=$sta; $i<=$sta+$len-1; $i++){ 290 | push(@sent2, $sent1[$i]); 291 | } 292 | $str_seq = join("|", @sent2); 293 | return ($str_seq); 294 | } 295 | 296 | sub subshape2{ 297 | my $Shape_list = shift; my $sta = shift; my $len = shift; 298 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 299 | my @sent1 = split(/\|/, $Shape_list); 300 | shift(@sent1); 301 | my $i = 0; my $str_seq = ""; 302 | for($i=$sta; $i<=$sta+$len-1; $i++){ 303 | if($sent1[$i] <= $dvalue[1]){ 304 | $str_seq = $str_seq."P"; 305 | }elsif($sent1[$i] <= $dvalue[2]){ 306 | $str_seq = $str_seq."Q"; 307 | }elsif($sent1[$i] <= $dvalue[3]){ 308 | $str_seq = $str_seq."S"; 309 | }else{ 310 | $str_seq = $str_seq."Z"; 311 | } 312 | } 313 | return ($str_seq); 314 | } 315 | 316 | sub shape2str{ 317 | my $Shape_list = shift; my $Str_list = shift; 318 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 319 | my @sent1 = split(/\|/, $Shape_list); 320 | my @sent2 = split(//, $Str_list); 321 | #shift(@sent1); 322 | my $i = 0; my $str_seq = ""; 323 | for($i=0; $i<=$#sent1; $i++){ 324 | if($sent1[$i] <= 0){ 325 | $str_seq = $str_seq.$sent2[$i]; 326 | }elsif($sent1[$i] <= $dvalue[2]){ 327 | $str_seq = $str_seq."P"; 328 | }else{ 329 | $str_seq = $str_seq."U"; 330 | } 331 | } 332 | return ($str_seq); 333 | } 334 | 335 | sub max_index{ 336 | my $list = shift; my $len = shift; 337 | my @sen1 = @{$list}; 338 | my $i = 0; my $j = 0; my $r = 0; my $index = 0; my $maxn = 0; my $sum = 0; 339 | for($i=0; $i<=$#sen1-$len+1; $i++){ 340 | $sum = 0; 341 | for($j=0; $j<$len; $j++){ 342 | $sum = $sum + $sen1[$i+$j]; 343 | } 344 | if($sum > $maxn){ 345 | $index = $i; 346 | $maxn = $sum; 347 | } 348 | } 349 | return ($index, $maxn); 350 | } 351 | 352 | sub max_per_seq{ 353 | my $Inf = shift; my $len = shift; my $per = shift; my $bind_score = shift; 354 | #my $list = shift; my $len = shift; 355 | my %inf = %{$Inf}; 356 | my $key = ""; my @total = (); my $i = 0; 357 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 358 | if(${$inf{$key}}[0] < $bind_score){ 359 | last; 360 | } 361 | my @sent1 = split(/\|/, ${$inf{$key}}[2]); 362 | shift(@sent1); 363 | for($i=0; $i<=$#sent1-$len+1; $i++){ 364 | push(@total, sum(@sent1[$i..($i+$len-1)])); 365 | } 366 | } 367 | @total = sort {$b <=> $a} @total; 368 | my $boun = $total[int(($#total + 1)*$per)-1]; 369 | return ($boun); 370 | } 371 | 372 | sub max_per_seq_str{ 373 | my $Inf = shift; my $len = shift; my $per = shift; my $bind_score = shift; 374 | #my $list = shift; my $len = shift; 375 | my %inf = %{$Inf}; 376 | my $key = ""; my @total1 = (); my @total2 = (); my $i = 0; 377 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 378 | if(${$inf{$key}}[0] < $bind_score){ 379 | last; 380 | } 381 | my @sent1 = split(/\|/, ${$inf{$key}}[2]); 382 | shift(@sent1); 383 | my @sent2 = 
split(/\|/, ${$inf{$key}}[4]); 384 | shift(@sent2); 385 | for($i=0; $i<=$#sent1-$len+1; $i++){ 386 | push(@total1, sum(@sent1[$i..($i+$len-1)])); 387 | } 388 | for($i=0; $i<=$#sent2-$len+1; $i++){ 389 | push(@total2, sum(@sent2[$i..($i+$len-1)])); 390 | } 391 | } 392 | @total1 = sort {$b <=> $a} @total1; 393 | @total2 = sort {$b <=> $a} @total2; 394 | my $boun1 = $total1[int(($#total1 + 1)*$per)-1]; 395 | my $boun2 = $total2[int(($#total2 + 1)*$per*2)-1]; 396 | return ($boun1, $boun2); 397 | } 398 | 399 | sub motif_print{ 400 | my $Motif = shift; my $out_pref = shift; my $motif_len = shift; my $pmotif_file = shift; my $ALPHA = shift; 401 | my %mot_inf = %{$Motif}; 402 | my $sen = ""; my $sen1 = ""; my $seq = ""; my $sna = ""; my $file = ""; my $ics = ""; 403 | my @sen = (); my @sen1 = (); my @sen2 = (); my @sent1 = (); my @sent2 = (); 404 | my @alpha = split(//, $ALPHA); 405 | my $sid = ""; my $key; 406 | my $i = 0; my $j = 0; my $r = 0; my $num = 0; my $tsum = 0; my $k1 = 0; 407 | 408 | #my $head = "MEME version 4.10.1 (Release date: Wed Mar 25 11:40:43 2015 +1000)\nstrands: +\n\nMOTIF "; 409 | #my $head_str = "MEME version 4.10.1 (Release date: Wed Mar 25 11:40:43 2015 +1000)\n\nALPHABET\nP\nQ\nS\nZ\nEND ALPHABET\n\nstrands: +\n\nMOTIF "; 410 | my $head = "MEME version 4.10.1 (Release date: Wed Mar 25 11:40:43 2015 +1000)\n\nALPHABET\n".$alpha[1]."\n".$alpha[2]."\n".$alpha[3]."\n".$alpha[4]."\nEND ALPHABET\n\nstrands: +\n\nMOTIF "; 411 | 412 | #open(FILE1, $pf_file)||die("open $pf_file error!\n"); 413 | open(OUT1, ">", $pmotif_file); 414 | $j = 0; 415 | foreach $key (sort{$a<=>$b} keys %mot_inf){ 416 | print OUT1 $head; 417 | print OUT1 $out_pref,$key,"\n"; 418 | my %motifi = %{$mot_inf{$key}}; 419 | @sen1 = @{$motifi{1}}; 420 | $tsum = $sen1[0] + $sen1[1] + $sen1[2] + $sen1[3]; 421 | print OUT1 "letter-probability matrix: alength= 4 w= $motif_len nsites = $tsum\n"; 422 | foreach $i (sort{$a<=>$b} keys %motifi){ 423 | @sen1 = @{$motifi{$i}}; 424 | $tsum = $sen1[0] + $sen1[1] + $sen1[2] + $sen1[3]; 425 | if($tsum > 0){ 426 | my $new_var = sprintf(" %.6f %.6f %.6f %.6f \n", $sen1[0]/$tsum, $sen1[1]/$tsum, $sen1[2]/$tsum, $sen1[3]/$tsum); 427 | print OUT1 $new_var; 428 | }else{ 429 | my $new_var = sprintf(" %.6f %.6f %.6f %.6f \n", 0.25, 0.25, 0.25, 0.25); 430 | print OUT1 $new_var; 431 | } 432 | } 433 | print OUT1 "\n"; 434 | } 435 | close OUT1; 436 | } 437 | 438 | sub kmer_cal2{ 439 | my $Inf = shift; my $len = shift; 440 | my %inf = %{$Inf}; my %kmer_seq = (); my %kmer_loc = (); 441 | my @sent = (); 442 | my $i = 0; my $key = ""; my $seq = ""; my $subseq = ""; my $sta = 0; my $end = 0; my $sna = ""; 443 | my $stru = ""; my $substru = ""; my $kmer_name = ""; 444 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 445 | @sent = split(/\_/, $key); 446 | $seq = ${$inf{$key}}[2]; 447 | #$stru = ${$inf{$key}}[4]; 448 | $stru = &shape2str(${$inf{$key}}[3], ${$inf{$key}}[4]); 449 | for($i=0; $i<=length($seq)-$len; $i++){ 450 | $subseq = substr($seq, $i, $len); 451 | $substru = substr($stru, $i, $len); 452 | $sta = $sent[1] + $i; $end = $sta + $len - 1; $sna = $sent[0]."_".$sta."_".$end; 453 | $kmer_name = $subseq."|".$substru; 454 | if(exists $kmer_seq{$kmer_name}){ 455 | $kmer_seq{$kmer_name} = $kmer_seq{$kmer_name} + 1; 456 | push(@{$kmer_loc{$kmer_name}}, $sna); 457 | }else{ 458 | $kmer_seq{$kmer_name} = 1; 459 | $kmer_loc{$kmer_name} = [$sna]; 460 | } 461 | } 462 | } 463 | return (\%kmer_seq, \%kmer_loc); 464 | } 465 | 466 | sub combine_kmer{ 467 | my $Inf = shift; my $ALPHA 
= shift; my $Kmer_loc = shift; my $Data_Inf = shift; my $protein_name = shift; 468 | my %inf = %{$Inf}; my %cinf = (); my %mot_inf = (); my %mot_str_inf = (); my %cinf_con = (); 469 | #my %kmer_loc = %{$Kmer_loc}; my %data_inf = %{$Data_Inf}; 470 | my $key = ""; my $k1 = ""; my $r = 0; my $flag = 0; my $exkey = ""; 471 | my $kmer_sht = sum(values %inf)*0.2; 472 | my $kmer_sh = 0; my $kmer_sum = 0; 473 | foreach $key ( sort{$inf{$b} <=> $inf{$a}} keys %inf){ 474 | $kmer_sum = $kmer_sum + $inf{$key}; 475 | if($kmer_sum > $kmer_sht){ 476 | $kmer_sh = $inf{$key}; 477 | last; 478 | } 479 | } 480 | if(max(values %inf) <= 5){ 481 | $kmer_sh = 0; 482 | } 483 | print $kmer_sh,"\n"; 484 | #my $kmer_sh = 0; 485 | open(OUT, ">", $protein_name."_summary.txt"); 486 | open(OUT2, ">", $protein_name."_summary2.txt"); 487 | print OUT $kmer_sh,"\n"; 488 | foreach $key ( sort{$inf{$b} <=> $inf{$a}} keys %inf){ 489 | if($inf{$key} <= $kmer_sh){ 490 | last; 491 | } 492 | $flag = 0; 493 | foreach $k1 (sort{${$cinf{$b}}[0] <=> ${$cinf{$a}}[0]} keys %cinf){ 494 | ($exkey, $flag) = &tcluster($k1, $key); 495 | if($flag == 1){ 496 | ${$cinf{$k1}}[0] = ${$cinf{$k1}}[0] + $inf{$key}; 497 | push(@{$cinf{$k1}}, $exkey, $inf{$key}); 498 | push(@{$cinf_con{$k1}}, $key); 499 | last; 500 | } 501 | } 502 | if($flag == 0){ 503 | $cinf{$key} = [$inf{$key}, $inf{$key}]; 504 | $cinf_con{$key} = [$key]; 505 | } 506 | } 507 | $r = 0; 508 | foreach $key ( sort {${$cinf{$b}}[0] <=> ${$cinf{$a}}[0]} keys %cinf){ 509 | $r = $r + 1; 510 | my ($mot1, $mot2, $num1) = &build_motif2($key, $cinf{$key}, $ALPHA, "-PU"); 511 | print OUT $key,"\t",$num1,"\n"; 512 | print OUT2 $key,"\t",$num1,"\n"; 513 | for(my $i=0; $i<=$#{$cinf{$key}}; $i++){ 514 | print OUT2 ${$cinf{$key}}[$i],"\t"; 515 | } 516 | print OUT2 "\n"; 517 | $mot_inf{$r} = $mot1; 518 | $mot_str_inf{$r} = $mot2; 519 | } 520 | close OUT; 521 | close OUT2; 522 | return (\%mot_inf, \%mot_str_inf); 523 | } 524 | 525 | sub icSHAPE_str2{ 526 | my $Shape_value = shift; 527 | #my $Shape_list = shift; my $sta = shift; my $len = shift; 528 | my $str_seq = ""; my $str_pro = ""; 529 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 530 | if($Shape_value < $dvalue[0]){ 531 | $str_seq = "N"; 532 | $str_pro = 0; 533 | }elsif($Shape_value <= $dvalue[1]){ 534 | $str_seq = "P"; 535 | $str_pro = 1 - ($Shape_value - $dvalue[0])/($dvalue[1] - $dvalue[0])*0.8; 536 | }elsif($Shape_value <= $dvalue[2]){ 537 | $str_seq = "P"; 538 | $str_pro = 0.2 - ($Shape_value - $dvalue[1])/($dvalue[2] - $dvalue[1])*0.2; 539 | }elsif($Shape_value <= $dvalue[3]){ 540 | $str_seq = "U"; 541 | $str_pro = ($Shape_value - $dvalue[2])/($dvalue[3] - $dvalue[2])*0.2; 542 | }else{ 543 | $str_seq = "U"; 544 | $str_pro = 0.2 + ($Shape_value - $dvalue[3])/($dvalue[4] - $dvalue[3])*0.8; 545 | } 546 | return ($str_seq, $str_pro); 547 | } 548 | 549 | sub icSHAPE_str{ 550 | my $Shape_value = shift; 551 | #my $Shape_list = shift; my $sta = shift; my $len = shift; 552 | my $str_seq = ""; my $str_pro = ""; 553 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 554 | if($Shape_value < $dvalue[0]){ 555 | $str_seq = "N"; 556 | $str_pro = 0; 557 | }elsif($Shape_value <= $dvalue[1]){ 558 | $str_seq = "P"; 559 | $str_pro = 1 - ($Shape_value - $dvalue[0])/($dvalue[1] - $dvalue[0])*0.5; 560 | }elsif($Shape_value <= $dvalue[2]){ 561 | $str_seq = "P"; 562 | $str_pro = 0.5 - ($Shape_value - $dvalue[1])/($dvalue[2] - $dvalue[1])*0.5; 563 | }elsif($Shape_value <= $dvalue[3]){ 564 | $str_seq = "U"; 565 | $str_pro = ($Shape_value - $dvalue[2])/($dvalue[3] - 
$dvalue[2])*0.5; 566 | }else{ 567 | $str_seq = "U"; 568 | $str_pro = 0.5 + ($Shape_value - $dvalue[3])/($dvalue[4] - $dvalue[3])*0.5; 569 | } 570 | return ($str_seq, $str_pro); 571 | } 572 | 573 | sub fivechar{ 574 | my $char1 = shift; my $char2 = shift; 575 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; my $mismatch = 0; my $flag = 0; 576 | if(substr($char1, 0, 4) eq substr($char2, 1, 4)){ 577 | return ("-".$char2."---", 1); 578 | }elsif(substr($char1, 1, 4) eq substr($char2, 0, 4)){ 579 | return ("---".$char2."-", 1); 580 | } 581 | for($i=0; $i<=$#sen1; $i++){ 582 | if($sen1[$i] eq $sen2[$i]){ 583 | $r = $r + 1; 584 | } 585 | } 586 | $mismatch = length($char1) - $r; 587 | if($mismatch == 1){ 588 | return ("--".$char2."--", 1); 589 | } 590 | if(substr($char1, 0, 3) eq substr($char2, 2, 3)){ 591 | return ($char2."----", 1); 592 | }elsif(substr($char1, 2, 3) eq substr($char2, 0, 3)){ 593 | return ("----".$char2, 1); 594 | } 595 | for($i=0; $i<=$#sen1-1; $i++){ 596 | if($sen1[$i] eq $sen2[$i+1]){ 597 | $r = $r + 1; 598 | } 599 | } 600 | $mismatch = length($char1) - 1 - $r; 601 | if($mismatch == 1){ 602 | return ("-".$char2."---", 1); 603 | } 604 | for($i=1; $i<=$#sen1; $i++){ 605 | if($sen1[$i] eq $sen2[$i-1]){ 606 | $r = $r + 1; 607 | } 608 | } 609 | $mismatch = length($char1) - 1 - $r; 610 | if($mismatch == 1){ 611 | return ("---".$char2."-", 1); 612 | } 613 | return($char2, 0); 614 | } 615 | 616 | sub tcluster{ 617 | my $char1 = shift; my $char2 = shift; 618 | my @sen1 = split(/\|/, $char1); my @sen2 = split(/\|/, $char2); 619 | my @sent1 = split(//, $sen1[1]); 620 | my $r = 0; my $i = 0; 621 | for($i=0; $i<=$#sent1; $i++){ 622 | if($sent1[$i] eq "P"){ 623 | $r = $r + 1; 624 | } 625 | } 626 | #$r = $r/($#sent1 + 1); 627 | my ($ch1, $flag1) = &clusterchar1($sen1[0], $sen2[0]); 628 | my ($ch2, $flag2) = &clusterchar1($sen1[1], $sen2[1]); 629 | if(($flag1 == 1)&&($flag2 == 1)){ 630 | return($ch1."|".$ch2, 1); 631 | } 632 | if(($flag1 == 0)&&($flag2 == 1)&&($r > 4)){ 633 | my ($ch01, $flag01) = &clusterchar2($sen1[0], $sen2[0]); 634 | if($flag01 == 1){ 635 | return($ch01."|".$ch2, 1); 636 | } 637 | if(&mismatch($sen1[0], $sen2[0]) <= $r - 3){ 638 | return("--".$sen2[0]."--|".$ch2, 1); 639 | } 640 | } 641 | return($char2, 0); 642 | } 643 | 644 | sub tcluster2{ 645 | my $char1 = shift; my $char2 = shift; 646 | my @sen1 = split(/\|/, $char1); my @sen2 = split(/\|/, $char2); 647 | my @sent1 = split(//, $sen1[1]); 648 | my $r = 0; my $i = 0; 649 | for($i=0; $i<=$#sent1; $i++){ 650 | if($sent1[$i] eq "P"){ 651 | $r = $r + 1; 652 | } 653 | } 654 | $r = $r/($#sent1 + 1); 655 | my ($ch1, $flag1) = &clusterchar1($sen1[0], $sen2[0]); 656 | my ($ch2, $flag2) = &clusterchar1($sen1[1], $sen2[1]); 657 | if(($flag1 == 1)&&($flag2 == 1)){ 658 | return($ch1."|".$ch2, 1); 659 | }elsif(($flag1 == 1)&&($flag2 == 0)){ 660 | my ($ch02, $flag02) = &clusterchar2($sen1[1], $sen2[1]); 661 | if($flag02 == 1){ 662 | return($ch1."|".$ch02, 1); 663 | } 664 | }elsif(($flag1 == 0)&&($flag2 == 1)){ 665 | my ($ch01, $flag01) = &clusterchar2($sen1[0], $sen2[0]); 666 | if($flag01 == 1){ 667 | return($ch01."|".$ch2, 1); 668 | } 669 | } 670 | return($char2, 0); 671 | } 672 | 673 | sub clusterchar1{ 674 | my $char1 = shift; my $char2 = shift; 675 | my $tnum = length($char1); 676 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; my $mismatch = 0; my $flag = 0; 677 | if($char1 eq $char2){ 678 | return ("--".$char2."--", 1); 679 | }elsif(substr($char1, 0, $tnum-1) 
eq substr($char2, 1, $tnum-1)){ 680 | return ("-".$char2."---", 1); 681 | }elsif(substr($char1, 1, $tnum-1) eq substr($char2, 0, $tnum-1)){ 682 | return ("---".$char2."-", 1); 683 | } 684 | $r = 0; 685 | for($i=0; $i<=$#sen1; $i++){ 686 | if($sen1[$i] eq $sen2[$i]){ 687 | $r = $r + 1; 688 | } 689 | } 690 | $mismatch = length($char1) - $r; 691 | if($mismatch <= 1){ 692 | return ("--".$char2."--", 1); 693 | } 694 | return($char2, 0); 695 | } 696 | 697 | sub clusterchar2{ 698 | my $char1 = shift; my $char2 = shift; 699 | my $tnum = length($char1); 700 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; my $mismatch = 0; my $flag = 0; 701 | if($char1 eq $char2){ 702 | return ("--".$char2."--", 1); 703 | }elsif(substr($char1, 0, $tnum-1) eq substr($char2, 1, $tnum-1)){ 704 | return ("-".$char2."---", 1); 705 | }elsif(substr($char1, 1, $tnum-1) eq substr($char2, 0, $tnum-1)){ 706 | return ("---".$char2."-", 1); 707 | } 708 | $r = 0; 709 | for($i=0; $i<=$#sen1; $i++){ 710 | if($sen1[$i] eq $sen2[$i]){ 711 | $r = $r + 1; 712 | } 713 | } 714 | $mismatch = length($char1) - $r; 715 | if($mismatch <= 1){ 716 | return ("--".$char2."--", 1); 717 | } 718 | if(substr($char1, 0, $tnum-2) eq substr($char2, 2, $tnum-2)){ 719 | return ($char2."----", 1); 720 | }elsif(substr($char1, 2, $tnum-2) eq substr($char2, 0, $tnum-2)){ 721 | return ("----".$char2, 1); 722 | } 723 | $r = 0; 724 | for($i=0; $i<=$#sen1-1; $i++){ 725 | if($sen1[$i] eq $sen2[$i+1]){ 726 | $r = $r + 1; 727 | } 728 | } 729 | $mismatch = length($char1) - 1 - $r; 730 | if($mismatch <= 1){ 731 | return ("-".$char2."---", 1); 732 | } 733 | $r = 0; 734 | for($i=1; $i<=$#sen1; $i++){ 735 | if($sen1[$i] eq $sen2[$i-1]){ 736 | $r = $r + 1; 737 | } 738 | } 739 | $mismatch = length($char1) - 1 - $r; 740 | if($mismatch <= 1){ 741 | return ("---".$char2."-", 1); 742 | } 743 | return($char2, 0); 744 | } 745 | 746 | sub build_motif{ 747 | my $rep_seq = shift; my $cont = shift; my $ALPHA = shift; 748 | $rep_seq = "--".$rep_seq."--"; 749 | my @cons = @{$cont}; my @sen1 = split(//, $rep_seq); my @sen2 = split(//, $ALPHA); 750 | my %minf = (); 751 | my $i=0; my $j=0; my $tnum = $cons[1]; 752 | for($j=0; $j<=$#sen1; $j++){ 753 | if($sen1[$j] eq $sen2[0]){ 754 | $minf{$j} = [$tnum*0.25,$tnum*0.25,$tnum*0.25,$tnum*0.25]; 755 | }elsif($sen1[$j] eq $sen2[1]){ 756 | $minf{$j} = [$tnum,0,0,0]; 757 | }elsif($sen1[$j] eq $sen2[2]){ 758 | $minf{$j} = [0,$tnum,0,0]; 759 | }elsif($sen1[$j] eq $sen2[3]){ 760 | $minf{$j} = [0,0,$tnum,0]; 761 | }else{ 762 | $minf{$j} = [0,0,0,$tnum]; 763 | } 764 | } 765 | for($i=2; $i<=$#cons; $i=$i+2){ 766 | @sen1 = split(//, $cons[$i]); 767 | $tnum = $cons[$i+1]; 768 | for($j=0; $j<=$#sen1; $j++){ 769 | if($sen1[$j] eq $sen2[0]){ 770 | ${$minf{$j}}[0] = ${$minf{$j}}[0] + $tnum*0.25; 771 | ${$minf{$j}}[1] = ${$minf{$j}}[1] + $tnum*0.25; 772 | ${$minf{$j}}[2] = ${$minf{$j}}[2] + $tnum*0.25; 773 | ${$minf{$j}}[3] = ${$minf{$j}}[3] + $tnum*0.25; 774 | }elsif($sen1[$j] eq $sen2[1]){ 775 | ${$minf{$j}}[0] = ${$minf{$j}}[0] + $tnum; 776 | }elsif($sen1[$j] eq $sen2[2]){ 777 | ${$minf{$j}}[1] = ${$minf{$j}}[1] + $tnum; 778 | }elsif($sen1[$j] eq $sen2[3]){ 779 | ${$minf{$j}}[2] = ${$minf{$j}}[2] + $tnum; 780 | }else{ 781 | ${$minf{$j}}[3] = ${$minf{$j}}[3] + $tnum; 782 | } 783 | } 784 | } 785 | $tnum = $cons[0]; 786 | return (\%minf, $tnum); 787 | } 788 | 789 | sub build_motif2{ 790 | my $rep_seq = shift; my $cont = shift; my $ALPHA1 = shift; my $ALPHA2 = shift; 791 | #$rep_seq = "--".$rep_seq."--"; 792 | my @Sen = 
split(/\|/, $rep_seq); 793 | my @sent1 = split(//, "--".$Sen[0]."--"); my @sent2 = split(//, "--".$Sen[1]."--"); 794 | my @cons = @{$cont}; my @sen1 = split(//, $ALPHA1); my @sen2 = split(//, $ALPHA2); 795 | my %minf1 = (); my %minf2 = (); 796 | my $i=0; my $j=0; my $tnum = $cons[1]; 797 | for($j=0; $j<=$#sent1; $j++){ 798 | if($sent1[$j] eq $sen1[0]){ 799 | $minf1{$j} = [$tnum*0.25,$tnum*0.25,$tnum*0.25,$tnum*0.25]; 800 | }elsif($sent1[$j] eq $sen1[1]){ 801 | $minf1{$j} = [$tnum,0,0,0]; 802 | }elsif($sent1[$j] eq $sen1[2]){ 803 | $minf1{$j} = [0,$tnum,0,0]; 804 | }elsif($sent1[$j] eq $sen1[3]){ 805 | $minf1{$j} = [0,0,$tnum,0]; 806 | }else{ 807 | $minf1{$j} = [0,0,0,$tnum]; 808 | } 809 | } 810 | for($j=0; $j<=$#sent2; $j++){ 811 | if($sent2[$j] eq $sen2[0]){ 812 | $minf2{$j} = [$tnum*0.5,$tnum*0.5]; 813 | }elsif($sent2[$j] eq $sen2[1]){ 814 | $minf2{$j} = [$tnum,0]; 815 | }else{ 816 | $minf2{$j} = [0,$tnum]; 817 | } 818 | } 819 | for($i=2; $i<=$#cons; $i=$i+2){ 820 | @Sen = split(/\|/, $cons[$i]); 821 | @sent1 = split(//, $Sen[0]); @sent2 = split(//, $Sen[1]); $tnum = $cons[$i+1]; 822 | for($j=0; $j<=$#sent1; $j++){ 823 | if($sent1[$j] eq $sen1[0]){ 824 | ${$minf1{$j}}[0] = ${$minf1{$j}}[0] + $tnum*0.25; 825 | ${$minf1{$j}}[1] = ${$minf1{$j}}[1] + $tnum*0.25; 826 | ${$minf1{$j}}[2] = ${$minf1{$j}}[2] + $tnum*0.25; 827 | ${$minf1{$j}}[3] = ${$minf1{$j}}[3] + $tnum*0.25; 828 | }elsif($sent1[$j] eq $sen1[1]){ 829 | ${$minf1{$j}}[0] = ${$minf1{$j}}[0] + $tnum; 830 | }elsif($sent1[$j] eq $sen1[2]){ 831 | ${$minf1{$j}}[1] = ${$minf1{$j}}[1] + $tnum; 832 | }elsif($sent1[$j] eq $sen1[3]){ 833 | ${$minf1{$j}}[2] = ${$minf1{$j}}[2] + $tnum; 834 | }else{ 835 | ${$minf1{$j}}[3] = ${$minf1{$j}}[3] + $tnum; 836 | } 837 | } 838 | for($j=0; $j<=$#sent2; $j++){ 839 | if($sent2[$j] eq $sen2[0]){ 840 | ${$minf2{$j}}[0] = ${$minf2{$j}}[0] + $tnum*0.5; 841 | ${$minf2{$j}}[1] = ${$minf2{$j}}[1] + $tnum*0.5; 842 | }elsif($sent2[$j] eq $sen2[1]){ 843 | ${$minf2{$j}}[0] = ${$minf2{$j}}[0] + $tnum; 844 | }else{ 845 | ${$minf2{$j}}[1] = ${$minf2{$j}}[1] + $tnum; 846 | } 847 | } 848 | } 849 | $tnum = $cons[0]; 850 | return (\%minf1, \%minf2, $tnum); 851 | } 852 | 853 | sub kmer_cal3{ 854 | my $Inf = shift; 855 | my %inf = %{$Inf}; my %kmer_seq = (); my %kmer_loc = (); 856 | my @sent = (); my @psign1 = (0)x(16); my @usign1 = (0)x(16); my @sen1 = (); my @sen2 = (); my @sen3 = (); 857 | my $i = 0; my $j = 0; my $r = 0; 858 | my $key = ""; my $seq = ""; my $subseq = ""; my $sta = 0; my $end = 0; my $sna = ""; my $Num = 0; 859 | foreach $key (keys %inf){ 860 | $seq = ${$inf{$key}}[1]; 861 | $subseq = substr($seq, 5, 6); 862 | if(exists $kmer_seq{$subseq}){ 863 | $kmer_seq{$subseq} = $kmer_seq{$subseq} + 1; 864 | push(@{$kmer_loc{$subseq}}, $seq); 865 | }else{ 866 | $kmer_seq{$subseq} = 1; 867 | $kmer_loc{$subseq} = [$seq]; 868 | } 869 | } 870 | foreach $key ( sort{$kmer_seq{$b} <=> $kmer_seq{$a}} keys %kmer_seq){ 871 | push(@sent, $key); 872 | } 873 | #print $#sent+1,"\n"; 874 | $key = $sent[0]; 875 | @sen1 = @{$kmer_loc{$key}}; 876 | #print $#sen1+1,"\n"; 877 | for($i=0; $i<=$#sen1; $i++){ 878 | $Num = $Num + 1; 879 | @sen2 = split(//, $sen1[$i]); 880 | for($j=0; $j<=$#sen2; $j++){ 881 | if($sen2[$j] eq "P"){ 882 | $psign1[$j] = $psign1[$j] + 1; 883 | }else{ 884 | $usign1[$j] = $usign1[$j] + 1; 885 | } 886 | } 887 | } 888 | for($r=1; $r<=$#sent; $r++){ 889 | $sna = $sent[$r]; 890 | #print $key,"\t",$sna,"\n"; 891 | my ($sna2, $flag) = &clusterchar1($key, $sna); 892 | if($flag == 1){ 893 | @sen3 = split(//, $sna2); 894 | 
$sta = 0; 895 | for($i=0; $i<=3; $i++){ 896 | if($sen3[$i] eq "-"){ 897 | $sta = $sta + 1; 898 | } 899 | } 900 | @sen1 = @{$kmer_loc{$sna}}; 901 | #print $#sen1+1,"\n"; 902 | $sta = $sta - 2; 903 | for($i=0; $i<=$#sen1; $i++){ 904 | $Num = $Num + 1; 905 | @sen2 = split(//, $sen1[$i]); 906 | for($j=0; $j<=$#sen2; $j++){ 907 | #$j = $j + $sta; 908 | if((0<=$j + $sta)&&($j + $sta<=$#psign1)){ 909 | if($sen2[$j + $sta] eq "P"){ 910 | $psign1[$j] = $psign1[$j] + 1; 911 | }else{ 912 | $usign1[$j] = $usign1[$j] + 1; 913 | } 914 | } 915 | } 916 | } 917 | } 918 | } 919 | return (\@psign1, \@usign1, $Num); 920 | } 921 | 922 | sub build_str_motif{ 923 | my $rep_seq = shift; my $cont = shift; my $Kmer_loc = shift; my $Data_Inf = shift; my $protein_name = shift; 924 | my $flank_len = 5; 925 | my %kmer_loc = %{$Kmer_loc}; my %data_inf = %{$Data_Inf}; 926 | my %minf1 = (); my %minf2 = (); my %str_inf = (); 927 | my @cons = @{$cont}; my @sen1 = (); my @sen2 = (); 928 | my $i = 0; my $j = 0; my $r = 0; my $k = 0; my $num = 0; my $num1 = 0; my $num2 = 0; 929 | my @psign1 = (0)x(length($rep_seq) + 2*$flank_len); my @usign1 = (0)x(length($rep_seq) + 2*$flank_len); 930 | my @psign2 = (0)x(length($rep_seq) + 2*$flank_len); my @usign2 = (0)x(length($rep_seq) + 2*$flank_len); 931 | #open(OUT1, ">", "strtmp/".$rep_seq."str_test.txt"); 932 | for($i=0; $i<=$#cons; $i++){ 933 | #print $cons[$i],"|\t"; 934 | @sen1 = @{$kmer_loc{$cons[$i]}}; 935 | for($j=0; $j<=$#sen1; $j++){ 936 | #print $sen1[$j],"\t"; 937 | $num = $num + 1; 938 | #print $num,"\t"; 939 | @sen2 = split(/\_/, $sen1[$j]); 940 | if(!exists $data_inf{$sen2[0]}){ 941 | print $cons[$i],"|",$sen2[0],"\n"; 942 | next; 943 | } 944 | my @ics = split(/\|/, ${$data_inf{$sen2[0]}}[3]); 945 | my @psent = split(/\|/, ${$data_inf{$sen2[0]}}[4]); 946 | my $max_sign = max(@psent); my $min_sign = min(@psent); 947 | my $ave_ics = 0; my $n_ics = 0; 948 | 949 | my $sta = $sen2[1]; my $end = $sen2[2]; my $key = $sen2[0]; 950 | my $str_char = &get_str(${$data_inf{$key}}[1], ${$data_inf{$key}}[3], $protein_name); 951 | my $str_char1 = substr($str_char, $sta - 5, $end - $sta + 1 + 10); 952 | 953 | my $flag = 0; my $fnum = ""; 954 | if(length($str_char1) == 16){ 955 | my @str_sent = split(//, $str_char1); 956 | for($r=0; $r<=$#str_sent; $r++){ 957 | if($str_sent[$r] eq "."){ 958 | $fnum = $fnum."U"; 959 | }else{ 960 | $fnum = $fnum."P"; 961 | } 962 | } 963 | $str_inf{$num} = [$str_char1, $fnum]; 964 | } 965 | } 966 | #last; 967 | } 968 | #close OUT1; 969 | #print "strcuture site: ",$num,"\n"; 970 | #$k = keys %str_inf; 971 | #print $k,"\n"; 972 | my ($Psign1, $Usign1, $NUM) = kmer_cal2(\%str_inf); 973 | @psign1 = @{$Psign1}; @usign1 = @{$Usign1}; 974 | for($i=0; $i<=$#psign1; $i++){ 975 | #$minf{$i} = [$psign[$i], $usign[$i]]; 976 | if($psign1[$i] + $usign1[$i] != 0){ 977 | #my @prob = ($psign[$i]/($psign[$i] + $usign[$i]), $usign[$i]/($psign[$i] + $usign[$i])); 978 | #my $Height = &cal_entropy(\@prob); 979 | $minf1{$i} = [$psign1[$i]/($psign1[$i] + $usign1[$i]), $usign1[$i]/($psign1[$i] + $usign1[$i])]; 980 | #$minf{$i} = [$psign[$i]/($psign[$i] + $usign[$i])*$Height, $usign[$i]/($psign[$i] + $usign[$i])*$Height]; 981 | }else{ 982 | $minf1{$i} = [0.5, 0.5]; 983 | #$minf{$i} = [0, 0]; 984 | } 985 | } 986 | for($i=0; $i<=$#psign2; $i++){ 987 | #$minf{$i} = [$psign[$i], $usign[$i]]; 988 | if($psign2[$i] + $usign2[$i] != 0){ 989 | #my @prob = ($psign[$i]/($psign[$i] + $usign[$i]), $usign[$i]/($psign[$i] + $usign[$i])); 990 | #my $Height = &cal_entropy(\@prob); 991 | $minf2{$i} = 
[$psign2[$i]/($psign2[$i] + $usign2[$i]), $usign2[$i]/($psign2[$i] + $usign2[$i])]; 992 | #$minf{$i} = [$psign[$i]/($psign[$i] + $usign[$i])*$Height, $usign[$i]/($psign[$i] + $usign[$i])*$Height]; 993 | }else{ 994 | $minf2{$i} = [0.5, 0.5]; 995 | #$minf{$i} = [0, 0]; 996 | } 997 | } 998 | return (\%minf1, \%minf2, $num1, $NUM); 999 | } 1000 | 1001 | sub cal_entropy{ 1002 | my $sen = shift; 1003 | my @sent = @{$sen}; 1004 | my $i = 0; my $Sum = sum(@sent); my $Entropy = 0; 1005 | for($i=0; $i<=$#sent; $i++){ 1006 | $sent[$i] = $sent[$i]/$Sum; 1007 | if($sent[$i] > 0){ 1008 | $Entropy = $Entropy - $sent[$i]*log($sent[$i])/log(2); 1009 | } 1010 | } 1011 | return(1 - $Entropy); 1012 | } 1013 | 1014 | sub get_str{ 1015 | my $seqref = shift; my $ics = shift; my $protien = shift; 1016 | my $i; my $j; my $r; 1017 | my @sen1 = split(//, $seqref); my @sen2 = split(/\|/, $ics); 1018 | my $tmp_seq_file = $protien."_tmp_seq_file.txt"; my $tmp_shape_file = $protien."_tmp_shape_file.txt"; 1019 | open(SEQ, ">", $tmp_seq_file); 1020 | open(SHAPE, ">", $tmp_shape_file); 1021 | for($i=0; $i<=$#sen1; $i++){ 1022 | print SEQ $sen1[$i]; 1023 | } 1024 | print SEQ "\n"; 1025 | close SEQ; 1026 | $j = 1; 1027 | for($i = 0; $i<=$#sen2; $i++){ 1028 | $j = $i + 1; 1029 | if($sen2[$i] < 0){ 1030 | print SHAPE $j,"\t-1\n"; 1031 | }else{ 1032 | print SHAPE $j,"\t",$sen2[$i]*2,"\n"; 1033 | } 1034 | } 1035 | close SHAPE; 1036 | my $str_res = `RNAfold --noPS --shapeMethod="Dm8b−0.7" --shape=$tmp_shape_file < $tmp_seq_file`; 1037 | my @sent1 = split(/\n/,$str_res); 1038 | my $exa_seq = $sent1[0]; 1039 | my @sent2 = split(/\s/,$sent1[1]); 1040 | my $exa_str = $sent2[0]; 1041 | #my @sent3 = split(/\|/, $sna); 1042 | #$inf_str{$sna} = [$sent3[3], $exa_seq, $exa_str]; 1043 | return($exa_str); 1044 | } 1045 | 1046 | sub get_str2{ 1047 | my $seqref = shift; my $ics = shift; my $protien = shift; 1048 | my %inf_seq = %{$seqref}; my %inf_ics = %{$ics}; 1049 | my %inf_str = (); 1050 | my $sen; my $sna; my $sen1; my $seq; 1051 | my $i; my $j; my $r; my $count = 0; 1052 | my @sen = (); my @sen1 = (); my @sen2 = (); my @sen3 = (); 1053 | foreach $sna (keys %inf_seq){ 1054 | @sen1 = @{$inf_seq{$sna}}; 1055 | @sen2 = @{$inf_ics{$sna}}; 1056 | my $tmp_seq_file = $protien."_tmp_seq_file.txt"; my $tmp_shape_file = $protien."_tmp_shape_file.txt"; 1057 | open(SEQ, ">", $tmp_seq_file); 1058 | open(SHAPE, ">", $tmp_shape_file); 1059 | for($i=0; $i<=$#sen1; $i++){ 1060 | print SEQ $sen1[$i]; 1061 | } 1062 | print SEQ "\n"; 1063 | close SEQ; 1064 | $j = 1; 1065 | for($i = 0; $i<=$#sen2; $i++){ 1066 | if($sen2[$i] eq "NULL"){ 1067 | print SHAPE $j,"\t-1\n"; 1068 | }else{ 1069 | print SHAPE $j,"\t",$sen2[$i]*2,"\n"; 1070 | } 1071 | } 1072 | close SHAPE; 1073 | #my $str_res = `/Share2/home/zhangqf/usr/ViennaRNA-2.2.3/bin/RNAfold --noPS −−shapeMethod="Dm8b−0.7" --shape=tmp_shape_file.txt < tmp_seq_file.txt`; 1074 | my $str_res = `RNAfold --noPS --shapeMethod="Dm8b−0.7" --shape=$tmp_shape_file < $tmp_seq_file`; 1075 | my @sent1 = split(/\n/,$str_res); 1076 | my $exa_seq = $sent1[0]; 1077 | my @sent2 = split(/\s/,$sent1[1]); 1078 | my $exa_str = $sent2[0]; 1079 | my @sent3 = split(/\|/, $sna); 1080 | $inf_str{$sna} = [$sent3[3], $exa_seq, $exa_str]; 1081 | } 1082 | return (\%inf_str); 1083 | } 1084 | 1085 | sub mismatch{ 1086 | my $char1 = shift; my $char2 = shift; 1087 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; 1088 | for($i=0; $i<=$#sen1; $i++){ 1089 | if($sen1[$i] ne $sen2[$i]){ 1090 | $r = $r + 1; 1091 | } 
1092 | } 1093 | return($r); 1094 | } 1095 | 1096 | sub read_summary{ 1097 | my $fdata_file = shift; 1098 | my $i = 0; my $j = 0; my $r = 0; 1099 | my $sen = ""; 1100 | my @sen1 = (); my @sen2 = (); 1101 | my %inf = (); 1102 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1103 | #2 1104 | #UGCAUG|UUUUUU 157 1105 | #UGCAUG|PPPPPP 119 1106 | $sen = ; 1107 | while($sen = ){ 1108 | $i = $i + 1; 1109 | chomp($sen); 1110 | @sen1 = split(/\t/, $sen); 1111 | @sen2 = split(/\|/, $sen1[0]); 1112 | $inf{$i} = [$sen2[0], $sen2[1], $sen1[1]]; 1113 | if($i >= 10){ 1114 | last; 1115 | } 1116 | } 1117 | close FILE1; 1118 | return(\%inf); 1119 | } 1120 | 1121 | sub read_seq_count{ 1122 | my $fdata_file = shift; 1123 | my $i = 0; my $j = 0; my $r = 0; 1124 | my $sen = ""; 1125 | my @sen1 = (); my @sen2 = (); 1126 | my %inf = (); 1127 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1128 | #39.2500 31.0000 35.0000 0.0000 0.0000 157.0000 0.0000 8.2500 33.2500 39.2500 1129 | #39.2500 45.0000 15.0000 0.0000 157.0000 0.0000 0.0000 8.2500 42.2500 39.2500 1130 | while($sen = ){ 1131 | $i = $i + 1; 1132 | chomp($sen); 1133 | @sen1 = split(/\t/, $sen); 1134 | ${$inf{$i}}{0} = [@sen1[0..9]]; 1135 | for($j=1; $j<=3; $j++){ 1136 | $sen = ; 1137 | chomp($sen); 1138 | @sen1 = split(/\t/, $sen); 1139 | ${$inf{$i}}{$j} = [@sen1[0..9]]; 1140 | } 1141 | if($i >= 10){ 1142 | last; 1143 | } 1144 | } 1145 | close FILE1; 1146 | return(\%inf); 1147 | } 1148 | 1149 | sub read_str_count{ 1150 | my $fdata_file = shift; 1151 | my $i = 0; my $j = 0; my $r = 0; 1152 | my $sen = ""; 1153 | my @sen1 = (); my @sen2 = (); 1154 | my %inf = (); 1155 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1156 | #39.2500 31.0000 35.0000 0.0000 0.0000 157.0000 0.0000 8.2500 33.2500 39.2500 1157 | #39.2500 45.0000 15.0000 0.0000 157.0000 0.0000 0.0000 8.2500 42.2500 39.2500 1158 | while($sen = ){ 1159 | $i = $i + 1; 1160 | chomp($sen); 1161 | @sen1 = split(/\t/, $sen); 1162 | ${$inf{$i}}{0} = [@sen1[0..9]]; 1163 | for($j=1; $j<=1; $j++){ 1164 | $sen = ; 1165 | chomp($sen); 1166 | @sen1 = split(/\t/, $sen); 1167 | ${$inf{$i}}{$j} = [@sen1[0..9]]; 1168 | } 1169 | if($i >= 10){ 1170 | last; 1171 | } 1172 | } 1173 | close FILE1; 1174 | return(\%inf); 1175 | } 1176 | 1177 | sub read_tomtom{ 1178 | my $fdata_file = shift; my $prot_name = shift; my $Sinf = shift; 1179 | my $i = 0; my $j = 0; my $r = 0; 1180 | my $sen = ""; my $Pattern = $prot_name."_"; 1181 | my @sen1 = (); my @sen2 = (); my @sent1 = (); my @sent2 = (); 1182 | my %inf = (); my %uniq = (); my %sinf = %{$Sinf}; 1183 | $inf{1} = []; $uniq{1} = 1; 1184 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1185 | ##Query ID Target ID Optimal offset p-value E-value q-value Overlap Query consensus Target consensus Orientation 1186 | #RBFOX2_mes1 RBFOX2_mes1 0 2.32559e-10 2.32559e-09 4.65117e-09 10 ACTGCATGTA ACTGCATGTA + 1187 | #RBFOX2_mes3 RBFOX2_mes9 -1 0.00661106 0.0661106 0.0440737 9 AACATGTTCA AAATGTGCCA + 1188 | #RBFOX2_mes1 RBFOX2_mes4 1 0.0902425 0.902425 0.150404 9 ACTGCATGTA AAATGCTTGA - 1189 | #RBFOX2_mes3 RBFOX2_mes2 2 0.230382 2.30382 0.394583 8 AACATGTTCA TCTGCATGCT + 1190 | $sen = ; 1191 | while($sen = ){ 1192 | #$i = $i + 1; 1193 | chomp($sen); 1194 | @sen1 = split(/\t/, $sen); 1195 | if(($sen1[5] < 0.05) && ($sen1[9] eq "+") &&($sen1[1] ne $sen1[0])){ 1196 | $sen1[0]=~s/$prot_name/$Pattern/g; 1197 | $sen1[1]=~s/$prot_name/$Pattern/g; 1198 | @sent1 = split(/\_/, $sen1[0]); 1199 | @sent2 = split(/\_/, $sen1[1]); 1200 | 
if(&mismatch(${$sinf{$sent1[-1]}}[1], ${$sinf{$sent2[-1]}}[1]) <= 1){ 1201 | if((!exists $uniq{$sent1[-1]})&&(!exists $uniq{$sent2[-1]})){ 1202 | if(exists $inf{$sent1[-1]}){ 1203 | push(@{$inf{$sent1[-1]}}, $sent2[-1]."|".$sen1[2]."|".$sen1[7]."|".$sen1[8]); 1204 | $uniq{$sent1[-1]} = 1; 1205 | $uniq{$sent2[-1]} = 1; 1206 | }else{ 1207 | $inf{$sent1[-1]} = [$sent2[-1]."|".$sen1[2]."|".$sen1[7]."|".$sen1[8]]; 1208 | $uniq{$sent1[-1]} = 1; 1209 | $uniq{$sent2[-1]} = 1; 1210 | } 1211 | } 1212 | } 1213 | } 1214 | } 1215 | for($i=1; $i<=10; $i++){ 1216 | if((!exists $uniq{$i}) && (!exists $inf{$i})){ 1217 | $inf{$i} = []; 1218 | } 1219 | } 1220 | 1221 | close FILE1; 1222 | return(\%inf); 1223 | } 1224 | 1225 | exit; 1226 | --------------------------------------------------------------------------------
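Usage note (not part of the repository files above): saliency_motif.pl takes the PrismNet saliency/attention file as its first argument and an output prefix (protein_cell) as its second, e.g. "perl saliency_motif.pl infile.sal outfile", and it shells out to RNAfold (icSHAPE-constrained folding in get_str) and tomtom (motif clustering), so both tools must be on PATH. Among its outputs are "<prot_cell>_motif_summary.txt" (motif_id, motif_site, motif_weight) and "<prot_cell>_motif_prob.txt", which holds six rows per motif labelled "<id>_seq_A/C/G/U" and "<id>_str_P/U", each with ten tab-separated, site-normalised values. The following minimal sketch, which assumes numpy is available and uses a hypothetical helper name and file name, shows one way to load that probability table back into per-motif sequence and structure matrices for downstream inspection or plotting; it is an illustration of the file layout described above, not code shipped with PrismNet.

import numpy as np

def read_motif_prob(path):
    """Parse <prot_cell>_motif_prob.txt into {motif_id: (seq_matrix, str_matrix)}.

    Assumes the layout written by saliency_motif.pl: for every motif, six
    tab-separated rows labelled <id>_seq_A, <id>_seq_C, <id>_seq_G, <id>_seq_U,
    <id>_str_P, <id>_str_U, each followed by ten site-normalised values.
    """
    rows = {}
    with open(path) as fh:
        for line in fh:
            fields = line.rstrip("\t\n").split("\t")
            if not fields or not fields[0]:
                continue  # skip blank lines
            label, values = fields[0], [float(v) for v in fields[1:]]
            motif_id, channel = label.split("_", 1)  # e.g. "3", "seq_A"
            rows.setdefault(motif_id, {})[channel] = values
    motifs = {}
    for motif_id, channels in rows.items():
        # 4 x 10 nucleotide-probability matrix (A, C, G, U)
        seq_mat = np.array([channels[c] for c in ("seq_A", "seq_C", "seq_G", "seq_U")])
        # 2 x 10 structure-probability matrix (paired P, unpaired U)
        str_mat = np.array([channels[c] for c in ("str_P", "str_U")])
        motifs[motif_id] = (seq_mat, str_mat)
    return motifs

# Example with a hypothetical prefix:
# motifs = read_motif_prob("TIA1_Hela_motif_prob.txt")
# seq_mat, str_mat = motifs["1"]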