├── prismnet ├── utils │ ├── __init__.py │ ├── acgu.npz │ ├── xprint.py │ ├── visualize.py │ ├── metrics.py │ └── datautils.py ├── __init__.py ├── engine │ ├── __init__.py │ └── train_loop.py ├── model │ ├── __init__.py │ ├── se.py │ ├── resnet.py │ ├── utils.py │ ├── smoothgrad.py │ └── PrismNet.py └── loader.py ├── tools ├── .DS_Store ├── gdata_bin.sh ├── generate_dataset.py └── main.py ├── data └── TIA1_Hela.h5 ├── .gitignore ├── requirements.txt ├── exp ├── prismnet │ ├── eval.sh │ ├── har.sh │ ├── infer.sh │ ├── saliency.sh │ ├── saliencyimg.sh │ ├── train.sh │ ├── saliencyimg_infer.sh │ └── train_all.sh └── logistic_reg │ ├── run.sh │ ├── gdata.py │ └── main.py ├── setup.py ├── LICENSE ├── motif_construct ├── motif_sig.R └── saliency_motif.pl └── README.md /prismnet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .xprint import log_print -------------------------------------------------------------------------------- /tools/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuixu/PrismNet/HEAD/tools/.DS_Store -------------------------------------------------------------------------------- /data/TIA1_Hela.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuixu/PrismNet/HEAD/data/TIA1_Hela.h5 -------------------------------------------------------------------------------- /prismnet/utils/acgu.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuixu/PrismNet/HEAD/prismnet/utils/acgu.npz -------------------------------------------------------------------------------- /prismnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .engine import * 2 | from .utils import log_print 3 | from .model import * 4 | -------------------------------------------------------------------------------- /prismnet/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .train_loop import train, validate, inference, compute_saliency, compute_saliency_img, compute_high_attention_region 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .vscode 3 | build 4 | dist 5 | prismnet.egg-info 6 | out 7 | log.txt 8 | events.out* 9 | *.pth 10 | .DS_Store 11 | *.pdf -------------------------------------------------------------------------------- /prismnet/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .PrismNet import PrismNet, PrismNet_large 2 | from .utils import param_num 3 | from .smoothgrad import GuidedBackpropSmoothGrad -------------------------------------------------------------------------------- /tools/gdata_bin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | d=clip_data 3 | for p in `cat data/${d}/all.list` 4 | do 5 | python -u tools/generate_dataset.py $p 1 5 data/$d 6 | done 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy==1.1.0 2 | termcolor 3 | h5py 4 | scikit-learn>=0.19.1 5 | torch==1.1.0 6 | tensorboardX 7 | tqdm>=4.28.1 8 | 
matplotlib>=3.0.2 9 | einops 10 | pandas 11 | -------------------------------------------------------------------------------- /exp/prismnet/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | d=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --eval \ 14 | --data_dir data/$d \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:5}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/har.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | infer_file=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --har \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:6}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | 8 | infer_file=$2 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --infer \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:6}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/saliency.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | infer_file=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --saliency \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:5}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/saliencyimg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | infer_file=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --load_best \ 13 | --saliency_img \ 14 | --infer_file $infer_file \ 15 | --p_name $p\ 16 | --out_dir $work_path \ 17 | --exp_name $exp\ 18 | ${@:5}| tee $work_path/out/log.txt 19 | -------------------------------------------------------------------------------- /exp/prismnet/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | d=$2 8 | 9 | exp=$name 10 | 11 | python -u tools/main.py \ 12 | --train \ 13 | --eval \ 14 | --lr 0.001 \ 15 | --data_dir data/$d \ 16 | --p_name $p\ 17 | --out_dir $work_path \ 18 | --exp_name $exp\ 19 | ${@:5} 20 | 21 | #| tee $work_path/out/log.txt 22 | -------------------------------------------------------------------------------- /prismnet/utils/xprint.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | from termcolor import cprint 3 | except ImportError: 4 | cprint = None 5 | 6 | try: 7 | from pycrayon import CrayonClient 8 | except ImportError: 9 | CrayonClient = None 10 | 11 | def log_print(text, color=None, on_color=None, attrs=None): 12 | if cprint is not None: 13 | cprint(text, color=color, on_color=on_color, attrs=attrs) 14 | else: 15 | print(text) 16 | 17 | 18 | -------------------------------------------------------------------------------- /exp/prismnet/saliencyimg_infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | # echo `date +%Y%m%d%H%M%S` 5 | 6 | p=$1 7 | d=$2 8 | infer=$3 9 | 10 | exp=$name 11 | 12 | python -u tools/main.py \ 13 | --load_best \ 14 | --saliency_img \ 15 | --infer \ 16 | --infer_file $infer \ 17 | --data_dir data/$d \ 18 | --p_name $p\ 19 | --out_dir $work_path \ 20 | --exp_name $exp\ 21 | ${@:5}| tee $work_path/out/log.txt 22 | -------------------------------------------------------------------------------- /exp/logistic_reg/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | 4 | p=HEK293_RBP_HL_bind_matrix_total 5 | p2=$p 6 | la=10 7 | 8 | # part=Test 9 | mkdir $work_path/out 10 | mkdir $work_path/out/models 11 | mkdir $work_path/out/log 12 | 13 | train_data='data/halflife/'$p'.train.npz' 14 | test_data='data/halflife/'$p'.test.npz' 15 | pred_data='data/halflife/'${p2}'.test.npz' 16 | # CUDA_VISIBLE_DEVICE="0" 17 | python -u exp/logistic_reg/main.py \ 18 | --train_data $train_data \ 19 | --test_data $test_data \ 20 | --pred_data $pred_data \ 21 | --model_path $work_path/out/models/${p}_best.model \ 22 | --lam $la \ 23 | ${@:4}| tee -a $work_path/out/log/${p}.txt 24 | -------------------------------------------------------------------------------- /prismnet/model/se.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class SEBlock(nn.Module): 6 | def __init__(self, channel, reduction=2): 7 | super(SEBlock, self).__init__() 8 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 9 | self.fc = nn.Sequential( 10 | nn.Linear(channel, channel // reduction), 11 | nn.ReLU(inplace=True), 12 | nn.Linear(channel // reduction, channel), 13 | nn.Sigmoid() 14 | ) 15 | 16 | def forward(self, x): 17 | b, c, _, _ = x.size() 18 | y = self.avg_pool(x).view(b, c) 19 | y = self.fc(y).view(b, c, 1, 1) 20 | return y -------------------------------------------------------------------------------- /exp/prismnet/train_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | work_path=$(dirname $0) 3 | name=$(basename $work_path) 4 | da=clip_data 5 | 6 | if [ ! -d $work_path/out ];then 7 | mkdir $work_path/out 8 | mkdir $work_path/out/log 9 | fi 10 | # N threads according to your GPU 11 | SEND_THREAD_NUM=2 12 | 13 | ########################### 14 | 15 | tmp_fifofile="/tmp/$$.fifo" 16 | mkfifo "$tmp_fifofile" 17 | exec 6<>"$tmp_fifofile" 18 | for ((i=0;i<$SEND_THREAD_NUM;i++));do 19 | echo 20 | done >&6 21 | 22 | 23 | for p in `cat data/${da}/all.list` 24 | do 25 | read -u6 26 | { 27 | id=${p}_PrismNet_pu 28 | ff=$work_path/out/evals/${id}.metrics 29 | lg=$work_path/out/log/${id}.log 30 | if [ ! 
-f $ff ] ; then 31 | echo ${p}" ===" 32 | $srun $work_path/train.sh $p $da > $lg 2>&1 33 | fi 34 | sleep 1 35 | echo >&6 36 | } & 37 | pid=$! 38 | echo $pid 39 | done 40 | 41 | wait 42 | exec 6>&- 43 | exit 0 44 | 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | # Author: Kui XU 4 | # Created Time : Mon 3 Jul 2017 09:42:31 PM CST 5 | # File Name: setup.py 6 | # Description: 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | 11 | with open('requirements.txt') as f: 12 | requirements = f.read().splitlines() 13 | 14 | setup(name='prismnet', 15 | version='0.1.1', 16 | description='PrismNet', 17 | packages=find_packages(), 18 | 19 | author='Kui XU', 20 | author_email='kuixu.cs@gmail.com', 21 | url='https://github.com/kuixu/PrismNet', 22 | install_requires=requirements, 23 | python_requires='>=3.6', 24 | 25 | classifiers=[ 26 | 'Development Status :: 4 - Beta', 27 | 'Intended Audience :: Science/Research', 28 | 'License :: OSI Approved :: MIT License', 29 | 'Programming Language :: Python :: 3.6', 30 | 'Operating System :: MacOS :: MacOS X', 31 | 'Operating System :: Microsoft :: Windows', 32 | 'Operating System :: POSIX :: Linux', 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 PrismNet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /motif_construct/motif_sig.R: -------------------------------------------------------------------------------- 1 | ########################################################## 2 | #This R script is to analysis the motif enrichment and cluster 3 | ########################################################## 4 | 5 | Args <- commandArgs() 6 | in_file = Args[6] 7 | out_file = Args[7] 8 | 9 | 10 | human_motif<-read.table(file=in_file, header = F, sep = "\t") 11 | 12 | head(human_motif) 13 | 14 | i = 1 15 | Pvalue <- c() 16 | Odd_ratio <- c() 17 | FDR <- c() 18 | for(i in 1:dim(human_motif)[1]){ 19 | Sum = human_motif[i,2]/human_motif[i,3] 20 | compare<-matrix(floor(c(human_motif[i,2],Sum*0.1,Sum*(1-human_motif[i,3]),Sum*0.9)),nr=2,dimnames= 21 | list(c("sites","not sites"),c("motif","random"))) 22 | fisher_test <- fisher.test(compare,alternative = "greater") 23 | Pvalue <- c(Pvalue, fisher_test$p.value) 24 | Odd_ratio <- c(Odd_ratio, fisher_test$estimate) 25 | } 26 | 27 | FDR <- p.adjust(Pvalue, method = "fdr") 28 | 29 | colnames(human_motif) <- c("motif", "number", "percent") 30 | data1 <- cbind(human_motif, Odd_ratio) 31 | data1 <- cbind(data1, Pvalue) 32 | data1 <- cbind(data1, FDR) 33 | write.table(data1, file = out_file, row.names = F, col.names = T, sep = "\t") 34 | -------------------------------------------------------------------------------- /exp/logistic_reg/gdata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pandas as pd 4 | 5 | def g_rand_data(N=5000, M=128): 6 | label = np.random.randint(2,size=N) 7 | data = np.random.rand(N,M) 8 | line = "" 9 | for i in range(N): 10 | datastr = " ".join(["{:d}:{:.3f}".format(j, data[i,j]) for j in range(M) ]) 11 | line += "{:d} {:s}\n".format(label[i], datastr) 12 | print(line) 13 | 14 | def save_file(data, filepath): 15 | print("Y min,max:", data[:,0].min(), data[:,0].max()) 16 | print("X min,max:", data[:,1:].min(), data[:,1:].max()) 17 | print("p/n:", data[:,0].sum()/data.shape[0],) 18 | N,M = data.shape 19 | with open(filepath,"w") as f: 20 | line = "" 21 | for i in range(N): 22 | datastr = " ".join(["{:d}:{:f}".format(j, data[i,j]) for j in range(1, M) ]) 23 | line += "{:d} {:s}\n".format(int(data[i,0]), datastr) 24 | print(line, file=f) 25 | def concat_data(filepath): 26 | raw_data = pd.read_csv(filepath,sep="\t") 27 | data=raw_data.to_numpy()[:,1:] 28 | 29 | # import pdb; pdb.set_trace() 30 | 31 | 32 | t_path = filepath.replace(".txt",".train") 33 | e_path = filepath.replace(".txt",".test") 34 | 35 | # import pdb; pdb.set_trace() 36 | # data = abs(np.concatenate((pos_samples, neg_samples))) 37 | # data = np.concatenate((pos_samples, neg_samples)) 38 | print("min,max:", data[:,1:].min(), data[:,1:].max()) 39 | dmin = data[:,0].min() 40 | dwid = data[:,0].max() - dmin 41 | # data[:,0] = (data[:,0] - dmin)/dwid 42 | # norm 43 | data[:,1:] = (data[:,1:] - data[:,1:].mean())/data[:,1:].std() 44 | N,M = data.shape 45 | perm =np.random.permutation(N) 46 | t_N = int(0.8*N) 47 | 48 | 49 | # save_file(data[perm][:t_N,:], t_path) 50 | np.savez_compressed(t_path+".npz", x=data[perm][:t_N,1:], y=data[perm][:t_N,0],dmin=dmin,dwid=dwid) 51 | print("Training file saved into:", t_path,",", t_N," samples.") 52 | # save_file(data[perm][t_N:,:], e_path) 53 | np.savez_compressed(e_path+".npz", x=data[perm][t_N:,1:], y=data[perm][t_N:,0],dmin=dmin,dwid=dwid) 54 | print("Testing file saved into:", 
e_path,",", N-t_N," samples.") 55 | 56 | 57 | 58 | 59 | if __name__ == "__main__": 60 | # g_rand_data() 61 | import glob 62 | for f in glob.glob("data/regu6/*.txt"): 63 | print(f) 64 | try: 65 | concat_data(f) 66 | except TypeError: 67 | pass 68 | 69 | -------------------------------------------------------------------------------- /tools/generate_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | import warnings 5 | warnings.filterwarnings('ignore',category=FutureWarning) 6 | warnings.filterwarnings('ignore',category=RuntimeWarning) 7 | import os, sys, h5py 8 | import pandas as pd 9 | import numpy as np 10 | np.random.seed(100) 11 | 12 | from prismnet.utils import datautils 13 | 14 | def read_csv(path): 15 | # load sequences 16 | df = pd.read_csv(path, sep='\t', header=None) 17 | df = df.loc[df[0]!="Type"] 18 | 19 | Type = 0 20 | loc = 1 21 | Seq = 2 22 | Str = 3 23 | Score = 4 24 | label = 5 25 | 26 | rnac_set = df[Type].to_numpy() 27 | sequences = df[Seq].to_numpy() 28 | structs = df[Str].to_numpy() 29 | targets = df[Score].to_numpy().astype(np.float32).reshape(-1,1) 30 | return sequences, structs, targets 31 | 32 | max_length = 101 33 | only_pos = False 34 | binary = True 35 | 36 | name = sys.argv[1] 37 | is_bin = sys.argv[2] 38 | in_ver = int(sys.argv[3]) 39 | data_path = sys.argv[4] 40 | 41 | print(name) 42 | 43 | 44 | 45 | outfile = name+'.h5' 46 | sequences, structs, targets = read_csv(os.path.join(data_path, name+'.tsv')) 47 | 48 | # combine inpute data 49 | one_hot = datautils.convert_one_hot(sequences, max_length) 50 | structure = np.zeros((len(structs), in_ver-4, max_length)) 51 | for i in range(len(structs)): 52 | struct = structs[i].split(',') 53 | ti = [float(t) for t in struct] 54 | ti = np.array(ti).reshape(1,-1) 55 | structure[i] = np.concatenate([ti], axis=0) 56 | 57 | data = np.concatenate([one_hot, structure], axis=1) 58 | 59 | # preprare targets 60 | if is_bin=="0": 61 | targets = datautils.rescale(targets) 62 | elif is_bin=="1": 63 | targets[targets<0] = 0 64 | targets[targets>0] = 1 65 | 66 | 67 | # split dataset into train, cross-validation, and test set 68 | train, test = datautils.split_dataset(data, targets, valid_frac=0.2) 69 | 70 | target_data_type = np.int32 if is_bin=="1" else np.float32 71 | # save dataset 72 | save_path = os.path.join(data_path, outfile) 73 | print(name, data.shape, len(train[0]), len(test[0]), test[1].max(), test[1].min()) 74 | # print('saving dataset: ', save_path) 75 | with h5py.File(save_path, "w") as f: 76 | dset = f.create_dataset("X_train", data=train[0].astype(np.float32), compression="gzip") 77 | dset = f.create_dataset("Y_train", data=train[1].astype(target_data_type), compression="gzip") 78 | dset = f.create_dataset("X_test", data=test[0].astype(np.float32), compression="gzip") 79 | dset = f.create_dataset("Y_test", data=test[1].astype(target_data_type), compression="gzip") 80 | -------------------------------------------------------------------------------- /prismnet/model/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ResidualBlock1D(nn.Module): 7 | 8 | def __init__(self, planes, downsample=True): 9 | super(ResidualBlock1D, self).__init__() 10 | self.c1 = nn.Conv1d(planes, planes, kernel_size=1, stride=1, bias=False) 11 | 
self.b1 = nn.BatchNorm1d(planes) 12 | self.c2 = nn.Conv1d(planes, planes*2, kernel_size=11, stride=1, 13 | padding=5, bias=False) 14 | self.b2 = nn.BatchNorm1d(planes*2) 15 | self.c3 = nn.Conv1d(planes*2, planes*8, kernel_size=1, stride=1, bias=False) 16 | self.b3 = nn.BatchNorm1d(planes * 8) 17 | self.downsample = nn.Sequential( 18 | nn.Conv1d(planes, planes*8, kernel_size=1, stride=1, bias=False), 19 | nn.BatchNorm1d(planes*8), 20 | ) 21 | self.relu = nn.ReLU(inplace=True) 22 | 23 | def forward(self, x): 24 | identity = x 25 | 26 | out = self.c1(x) 27 | out = self.b1(out) 28 | out = self.relu(out) 29 | 30 | out = self.c2(out) 31 | out = self.b2(out) 32 | out = self.relu(out) 33 | 34 | out = self.c3(out) 35 | out = self.b3(out) 36 | 37 | if self.downsample: 38 | identity = self.downsample(x) 39 | 40 | out += identity 41 | out = self.relu(out) 42 | 43 | return out 44 | 45 | class ResidualBlock2D(nn.Module): 46 | 47 | def __init__(self, planes, kernel_size=(11,5), padding=(5,2), downsample=True): 48 | super(ResidualBlock2D, self).__init__() 49 | self.c1 = nn.Conv2d(planes, planes, kernel_size=1, stride=1, bias=False) 50 | self.b1 = nn.BatchNorm2d(planes) 51 | self.c2 = nn.Conv2d(planes, planes*2, kernel_size=kernel_size, stride=1, 52 | padding=padding, bias=False) 53 | self.b2 = nn.BatchNorm2d(planes*2) 54 | self.c3 = nn.Conv2d(planes*2, planes*4, kernel_size=1, stride=1, bias=False) 55 | self.b3 = nn.BatchNorm2d(planes * 4) 56 | self.downsample = nn.Sequential( 57 | nn.Conv2d(planes, planes*4, kernel_size=1, stride=1, bias=False), 58 | nn.BatchNorm2d(planes*4), 59 | ) 60 | self.relu = nn.ReLU(inplace=True) 61 | 62 | def forward(self, x): 63 | identity = x 64 | 65 | out = self.c1(x) 66 | out = self.b1(out) 67 | out = self.relu(out) 68 | 69 | out = self.c2(out) 70 | out = self.b2(out) 71 | out = self.relu(out) 72 | 73 | out = self.c3(out) 74 | out = self.b3(out) 75 | 76 | if self.downsample: 77 | identity = self.downsample(x) 78 | out += identity 79 | out = self.relu(out) 80 | 81 | return out 82 | 83 | 84 | -------------------------------------------------------------------------------- /prismnet/loader.py: -------------------------------------------------------------------------------- 1 | import os, sys, pdb, h5py 2 | import os.path 3 | import numpy as np 4 | import torch 5 | import torch.utils.data 6 | 7 | class SeqicSHAPE(torch.utils.data.Dataset): 8 | def __init__(self, data_path, is_test=False, is_infer=False, use_structure=True): 9 | """data loader 10 | 11 | Args: 12 | data_path ([str]): h5 file path 13 | is_test (bool, optional): testset or not. Defaults to False. 
14 | """ 15 | if is_infer: 16 | self.dataset = self.__load_infer_data__(data_path, use_structure=use_structure) 17 | print("infer data: ", self.__len__()," use_structure: ", use_structure) 18 | else: 19 | dataset = h5py.File(data_path, 'r') 20 | X_train = np.array(dataset['X_train']).astype(np.float32) 21 | Y_train = np.array(dataset['Y_train']).astype(np.int32) 22 | X_test = np.array(dataset['X_test']).astype(np.float32) 23 | Y_test = np.array(dataset['Y_test']).astype(np.int32) 24 | if len(Y_train.shape) == 1: 25 | Y_train = np.expand_dims(Y_train, axis=1) 26 | Y_test = np.expand_dims(Y_test, axis=1) 27 | X_train = np.expand_dims(X_train, axis=3).transpose([0, 3, 2, 1]) 28 | X_test = np.expand_dims(X_test, axis=3).transpose([0, 3, 2, 1]) 29 | 30 | train = {'inputs': X_train, 'targets': Y_train} 31 | test = {'inputs': X_test, 'targets': Y_test} 32 | 33 | labels, nums = np.unique(Y_train,return_counts=True) 34 | print("train:", labels, nums) 35 | labels, nums = np.unique(Y_test,return_counts=True) 36 | print("test:", labels, nums) 37 | 38 | train = self.__prepare_data__(train) 39 | test = self.__prepare_data__(test) 40 | 41 | if is_test: 42 | self.dataset = test 43 | else: 44 | self.dataset = train 45 | 46 | 47 | 48 | def __load_infer_data__(self, data_path, use_structure=True): 49 | from prismnet.utils import datautils 50 | dataset = datautils.load_testset_txt(data_path, use_structure=use_structure, seq_length=101) 51 | return dataset 52 | 53 | 54 | def __prepare_data__(self, data): 55 | inputs = data['inputs'][:,:,:,:4] 56 | structure = data['inputs'][:,:,:,4:] 57 | structure = np.expand_dims(structure[:,:,:,0], axis=3) 58 | inputs = np.concatenate([inputs, structure], axis=3) 59 | data['inputs'] = inputs 60 | return data 61 | 62 | def __to_sequence__(self, x): 63 | x1 = np.zeros_like(x[0,:,:1]) 64 | for i in range(x1.shape[0]): 65 | # import pdb; pdb.set_trace() 66 | x1[i] = np.argmax(x[0,i,:4]) 67 | # import pdb; pdb.set_trace() 68 | return x1 69 | 70 | def __getitem__(self, index): 71 | """ 72 | Args: 73 | index (int): Index 74 | 75 | Returns: 76 | tuple: (image, target) where target is index of the target class. 
77 | """ 78 | x = self.dataset['inputs'][index] 79 | # x = self.__to_sequence__(x) 80 | y = self.dataset['targets'][index] 81 | return x, y 82 | 83 | 84 | def __len__(self): 85 | return len(self.dataset['inputs']) 86 | 87 | -------------------------------------------------------------------------------- /prismnet/model/utils.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | 4 | import torch 5 | from sklearn import metrics 6 | 7 | import torch 8 | from torch.optim.lr_scheduler import _LRScheduler 9 | from torch.optim.lr_scheduler import ReduceLROnPlateau 10 | 11 | def param_num(model): 12 | num_param0 = sum(p.numel() for p in model.parameters()) 13 | num_param1 = sum(p.numel() for p in model.parameters() if p.requires_grad) 14 | print("===========================") 15 | print("Total params:", num_param0) 16 | print("Trainable params:", num_param1) 17 | print("Non-trainable params:", num_param0-num_param1) 18 | print("===========================") 19 | 20 | def compute_acc_auc(output, y): 21 | y1 = y.to(device='cpu', dtype=torch.long).numpy() 22 | p_class = (output>=0.5).to(device='cpu').data.numpy() 23 | prob = output.to(device='cpu').data.numpy() 24 | acc = metrics.accuracy_score(y1, p_class) 25 | auc = 0.5 26 | try: 27 | auc = metrics.roc_auc_score(y1, prob) 28 | except Exception as e: 29 | pass 30 | 31 | return acc, auc 32 | 33 | class GradualWarmupScheduler(_LRScheduler): 34 | """ Gradually warm-up(increasing) learning rate in optimizer. 35 | Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'. 36 | Args: 37 | optimizer (Optimizer): Wrapped optimizer. 38 | multiplier: target learning rate = base lr * multiplier 39 | total_epoch: target learning rate is reached at total_epoch, gradually 40 | after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau) 41 | """ 42 | 43 | def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None): 44 | self.multiplier = multiplier 45 | if self.multiplier <= 1.: 46 | raise ValueError('multiplier should be greater than 1.') 47 | self.total_epoch = total_epoch 48 | self.after_scheduler = after_scheduler 49 | self.finished = False 50 | super().__init__(optimizer) 51 | 52 | def get_lr(self): 53 | if self.last_epoch > self.total_epoch: 54 | if self.after_scheduler: 55 | if not self.finished: 56 | self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs] 57 | self.finished = True 58 | return self.after_scheduler.get_lr() 59 | return [base_lr * self.multiplier for base_lr in self.base_lrs] 60 | 61 | return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs] 62 | 63 | def step_ReduceLROnPlateau(self, metrics, epoch=None): 64 | if epoch is None: 65 | epoch = self.last_epoch + 1 66 | self.last_epoch = epoch if epoch != 0 else 1 # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning 67 | if self.last_epoch <= self.total_epoch: 68 | warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) 
for base_lr in self.base_lrs] 69 | for param_group, lr in zip(self.optimizer.param_groups, warmup_lr): 70 | param_group['lr'] = lr 71 | else: 72 | if epoch is None: 73 | self.after_scheduler.step(metrics, None) 74 | else: 75 | self.after_scheduler.step(metrics, epoch - self.total_epoch) 76 | 77 | def step(self, epoch=None, metrics=None): 78 | if type(self.after_scheduler) != ReduceLROnPlateau: 79 | if self.finished and self.after_scheduler: 80 | if epoch is None: 81 | self.after_scheduler.step(None) 82 | else: 83 | self.after_scheduler.step(epoch - self.total_epoch) 84 | else: 85 | return super(GradualWarmupScheduler, self).step(epoch) 86 | else: 87 | self.step_ReduceLROnPlateau(metrics, epoch) -------------------------------------------------------------------------------- /prismnet/utils/visualize.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | import matplotlib as mpl 4 | mpl.use("pdf") 5 | import matplotlib.pyplot as plt 6 | import matplotlib.gridspec as gridspec 7 | from scipy.misc import imresize 8 | 9 | package_directory = os.path.dirname(os.path.abspath(__file__)) 10 | acgu_path = os.path.join(package_directory,'acgu.npz') 11 | chars = np.load(acgu_path,allow_pickle=True)['data'] 12 | 13 | def normalize_pwm(pwm, factor=None, MAX=None): 14 | if MAX is None: 15 | MAX = np.max(np.abs(pwm)) 16 | pwm = pwm/MAX 17 | if factor: 18 | pwm = np.exp(pwm*factor) 19 | norm = np.outer(np.ones(pwm.shape[0]), np.sum(np.abs(pwm), axis=0)) 20 | return pwm/norm 21 | 22 | def get_nt_height(pwm, height, norm): 23 | 24 | def entropy(p): 25 | s = 0 26 | for i in range(len(p)): 27 | if p[i] > 0: 28 | s -= p[i]*np.log2(p[i]) 29 | return s 30 | 31 | num_nt, num_seq = pwm.shape 32 | heights = np.zeros((num_nt,num_seq)) 33 | for i in range(num_seq): 34 | if norm == 1: 35 | total_height = height 36 | else: 37 | total_height = (np.log2(num_nt) - entropy(pwm[:, i]))*height 38 | 39 | heights[:,i] = np.floor(pwm[:,i]*np.minimum(total_height, height*2)) 40 | 41 | return heights.astype(int) 42 | 43 | def seq_logo(pwm, height=30, nt_width=10, norm=0, alphabet='rna', colormap='standard'): 44 | 45 | heights = get_nt_height(pwm, height, norm) 46 | num_nt, num_seq = pwm.shape 47 | width = np.ceil(nt_width*num_seq).astype(int) 48 | 49 | max_height = height*2 50 | logo = np.ones((max_height, width, 3)).astype(int)*255 51 | for i in range(num_seq): 52 | nt_height = np.sort(heights[:,i]) 53 | index = np.argsort(heights[:,i]) 54 | remaining_height = np.sum(heights[:,i]) 55 | offset = max_height-remaining_height 56 | 57 | for j in range(num_nt): 58 | if nt_height[j] <=0 : 59 | continue 60 | # resized dimensions of image 61 | nt_img = imresize(chars[index[j]], (nt_height[j], nt_width)) 62 | # determine location of image 63 | height_range = range(remaining_height-nt_height[j], remaining_height) 64 | width_range = range(i*nt_width, i*nt_width+nt_width) 65 | # 'annoying' way to broadcast resized nucleotide image 66 | if height_range: 67 | for k in range(3): 68 | for m in range(len(width_range)): 69 | logo[height_range+offset, width_range[m],k] = nt_img[:,m,k] 70 | 71 | remaining_height -= nt_height[j] 72 | 73 | return logo.astype(np.uint8) 74 | 75 | def plot_saliency(X, W, nt_width=100, norm_factor=3, str_null=None, outdir="results/"): 76 | # filter out zero-padding 77 | plot_index = np.where(np.sum(X[:4,:], axis=0)!=0)[0] 78 | num_nt = len(plot_index) 79 | trace_width = num_nt*nt_width 80 | trace_height = 400 81 | 82 | seq_str_mode = False 83 | if 
X.shape[0]>4: 84 | seq_str_mode = True 85 | assert str_null is not None, "Null region is not provided." 86 | 87 | # sequence logo 88 | img_seq_raw = seq_logo(X[:4, plot_index], height=nt_width, nt_width=nt_width) 89 | 90 | if seq_str_mode: 91 | # structure line 92 | str_raw = X[4, plot_index] 93 | if str_null.sum() > 0: 94 | str_raw[str_null.T==1] = -0.01 95 | 96 | line_str_raw = np.zeros(trace_width) 97 | for v in range(str_raw.shape[0]): 98 | line_str_raw[v*nt_width:(v+1)*nt_width] = (1-str_raw[v])*trace_height 99 | # i+=1 100 | 101 | # sequence saliency logo 102 | seq_sal = normalize_pwm(W[:4, plot_index], factor=norm_factor) 103 | img_seq_sal_logo = seq_logo(seq_sal, height=nt_width*5, nt_width=nt_width) 104 | img_seq_sal = imresize(W[:4, plot_index], size=(trace_height, trace_width)) 105 | 106 | if seq_str_mode: 107 | # structure saliency logo 108 | str_sal = W[4, plot_index].reshape(1,-1) 109 | img_str_sal = imresize(str_sal, size=(trace_height, trace_width)) 110 | 111 | # plot 112 | fig = plt.figure(figsize=(10.1,2)) 113 | gs = gridspec.GridSpec(nrows=4, ncols=1, height_ratios=[2.5, 1, 0.5, 1]) 114 | cmap_reversed = mpl.cm.get_cmap('jet') 115 | 116 | ax = fig.add_subplot(gs[0, 0]) 117 | ax.axis('off') 118 | ax.imshow(img_seq_sal_logo) 119 | plt.text(x=trace_width-400,y=10, s='PrismNet', fontsize=4) 120 | 121 | ax = fig.add_subplot(gs[1, 0]) 122 | ax.axis('off') 123 | ax.imshow(img_seq_sal, cmap=cmap_reversed) 124 | 125 | ax = fig.add_subplot(gs[2, 0]) 126 | ax.axis('off') 127 | ax.imshow(img_seq_raw) 128 | 129 | if seq_str_mode: 130 | ax = fig.add_subplot(gs[3, 0]) 131 | ax.axis('off') 132 | ax.imshow(img_str_sal, cmap=cmap_reversed) 133 | ax.plot(line_str_raw, '-', color='r', linewidth=1, scalex=False, scaley=False) 134 | 135 | # plot balck line to hide the -1(NULL structure score) 136 | x = (np.zeros(trace_width) + (1+0.01))*trace_height +1.5 137 | ax.plot(x, '-', color='white', linewidth=1.2, scalex=False, scaley=False) 138 | 139 | plt.subplots_adjust(wspace=0, hspace=0) 140 | 141 | # save figure 142 | filepath = outdir 143 | fig.savefig(filepath, format='pdf', dpi=300, bbox_inches='tight') 144 | plt.close('all') 145 | -------------------------------------------------------------------------------- /prismnet/model/smoothgrad.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Kui Xu, xukui.cs@gmail.com 4 | # 2019-02-25 5 | # ref smoothGrad 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import grad,Variable 11 | import numpy as np 12 | 13 | class SmoothGrad(object): 14 | def __init__(self, model, device='cpu', only_seq=False, train=False, 15 | x_stddev=0.015, t_stddev=0.015, nsamples=20, magnitude=2): 16 | self.model = model 17 | self.device = device 18 | self.train = train 19 | self.only_seq = only_seq 20 | self.x_stddev = x_stddev 21 | self.t_stddev = t_stddev 22 | self.nsamples = nsamples 23 | self.magnitude = magnitude 24 | self.features = model 25 | # import pdb; pdb.set_trace() 26 | 27 | def get_gradients(self, z, pred_label=None): 28 | self.model.eval() 29 | self.model.zero_grad() 30 | z = z.to(self.device) 31 | z.requires_grad=True 32 | output = self.model(z) 33 | output = torch.sigmoid(output) 34 | output.backward() 35 | return z.grad 36 | 37 | def get_smooth_gradients(self, z, y=None): 38 | return self.__call__(z, y) 39 | 40 | def __call__(self, z, y=None): 41 | """[summary] 42 | 43 | Args: 44 | z ([type]): [description] 45 | y ([type]): 
[description] 46 | x_stddev (float, optional): [description]. Defaults to 0.15. 47 | t_stddev (float, optional): [description]. Defaults to 0.15. 48 | nsamples (int, optional): [description]. Defaults to 20. 49 | magnitude (int, optional): magnitude:0,1,2; 0: original gradient, 1: absolute value of the gradient, 50 | 2: square value of the gradient. Defaults to 2. 51 | 52 | Returns: 53 | [type]: [description] 54 | """ 55 | 56 | # 1. for sequece 57 | x = z[:,:,:,:4] # .data.cpu() 58 | x_stddev = (self.x_stddev * (x.max()-x.min())).to(self.device).item() 59 | 60 | total_grad = torch.zeros(z.shape).to(self.device) 61 | x_noise = torch.zeros(x.shape).to(self.device) 62 | if not self.only_seq: 63 | # 2. for structure 64 | t = z[:,:,:,4:] #.data.cpu() 65 | t_stddev = (self.t_stddev * (t.max()-t.min())).to(self.device).item() 66 | #t_total_grad = torch.zeros(t.shape) 67 | t_noise = torch.zeros(t.shape).to(self.device) 68 | 69 | for i in range(self.nsamples): 70 | x_plus_noise = x + x_noise.zero_().normal_(0, x_stddev) 71 | if self.only_seq: 72 | z_plus_noise = x_plus_noise 73 | else: 74 | t_plus_noise = t + t_noise.zero_().normal_(0, t_stddev) 75 | z_plus_noise = torch.cat((x_plus_noise, t_plus_noise), dim=3) 76 | #print("z_plus_noise:",z_plus_noise.size()) 77 | grad = self.get_gradients(z_plus_noise, y) 78 | if self.magnitude == 1: 79 | total_grad += torch.abs(grad) 80 | elif self.magnitude == 2: 81 | total_grad += grad * grad 82 | 83 | # total_grad += grad * grad 84 | total_grad /= self.nsamples 85 | return total_grad 86 | 87 | def get_batch_gradients(self, X, Y=None): 88 | if Y is not None: 89 | assert len(X) == len(Y), "The size of input {} and target {} are not matched.".format(len(X), len(Y)) 90 | g = torch.zeros_like(X) 91 | for i in range(X.shape[0]): 92 | x = X[i:i+1] 93 | if Y is not None: 94 | y = Y[i:i+1] 95 | else: 96 | y = None 97 | g[i:i+1] = self.get_smooth_gradients(x, y) 98 | # g[i:i+1] = self.get_gradients(x, y) 99 | return g 100 | 101 | 102 | def generate_saliency(model, x, y=None, smooth=False, nsamples=2, stddev=0.15, only_seq=False, \ 103 | train=False): 104 | saliency = SmoothGrad(model, only_seq, train) 105 | x_grad = saliency.get_smooth_gradients(x, y, nsamples=nsamples, x_stddev=stddev, t_stddev=stddev) 106 | return x_grad 107 | 108 | 109 | 110 | class GuidedBackpropReLU(torch.autograd.Function): 111 | 112 | def __init__(self, inplace=False): 113 | super(GuidedBackpropReLU, self).__init__() 114 | self.inplace = inplace 115 | 116 | def forward(self, input): 117 | pos_mask = (input > 0).type_as(input) 118 | output = torch.addcmul( 119 | torch.zeros(input.size()).type_as(input), 120 | input, 121 | pos_mask) 122 | self.save_for_backward(input, output) 123 | return output 124 | 125 | def backward(self, grad_output): 126 | input, output = self.saved_tensors 127 | 128 | pos_mask_1 = (input > 0).type_as(grad_output) 129 | pos_mask_2 = (grad_output > 0).type_as(grad_output) 130 | grad_input = torch.addcmul( 131 | torch.zeros(input.size()).type_as(input), 132 | torch.addcmul( 133 | torch.zeros(input.size()).type_as(input), grad_output, pos_mask_1), 134 | pos_mask_2) 135 | 136 | return grad_input 137 | 138 | def __repr__(self): 139 | inplace_str = ', inplace' if self.inplace else '' 140 | return self.__class__.__name__ + ' (' \ 141 | + inplace_str + ')' 142 | 143 | class GuidedBackpropSmoothGrad(SmoothGrad): 144 | 145 | def __init__(self, model, device='cpu', only_seq=False, train=False, 146 | x_stddev=0.15, t_stddev=0.15, nsamples=20, magnitude=2): 147 | 
super(GuidedBackpropSmoothGrad, self).__init__( 148 | model, device, only_seq, train, x_stddev, t_stddev, nsamples, magnitude) 149 | for idx, module in self.features._modules.items(): 150 | if module.__class__.__name__ == 'ReLU': 151 | self.features._modules[idx] = GuidedBackpropReLU() 152 | 153 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PrismNet 2 | 3 | This is a [PyTorch](https://pytorch.org/) implementation of our paper: 4 | ## Predicting dynamic cellular protein-RNA interactions using deep learning and in vivo RNA structure 5 | Lei Sun*, Kui Xu*, Wenze Huang*, Yucheng T. Yang*, Pan Li, Lei Tang, Tuanlin Xiong, Qiangfeng Cliff Zhang 6 | 7 | *: indicates equal contribution. 8 | 9 | Cell Research Version: ([https://www.nature.com/articles/s41422-021-00476-y](https://www.nature.com/articles/s41422-021-00476-y)) 10 | 11 | bioRxiv preprint: ([https://www.biorxiv.org/content/10.1101/2020.05.05.078774v1](https://www.biorxiv.org/content/10.1101/2020.05.05.078774v1)) 12 | 13 | ![prismnet](https://github.com/kuixu/PrismNet/wiki/imgs/prismnet.png) 14 | 15 | 16 | 17 | ### Table of Contents 18 | - [Getting started](#getting-started) 19 | - [Datasets](#datasets) 20 | - [Usage](#usage) 21 | - [Copyright and License](#copyright-and-license) 22 | - [Reference](#reference) 23 | 24 | ## Getting started 25 | 26 | 27 | ### Requirements 28 | 29 | - Python 3.6 30 | - PyTorch 1.1.0, with NVIDIA CUDA support 31 | - pip 32 | 33 | ### Installation 34 | Clone the repository: 35 | 36 | ```bash 37 | git clone https://github.com/kuixu/PrismNet.git 38 | ``` 39 | Install packages: 40 | ```bash 41 | cd PrismNet 42 | pip install -r requirements.txt 43 | pip install -e . 44 | ``` 45 | 46 | ## Datasets 47 | 48 | ### Prepare the datasets 49 | 50 | Scripts and pipeline are in preparation; currently, we provide data for 172 samples in *.tsv format for training and testing PrismNet. 51 | 52 | ``` 53 | # Download data 54 | cd PrismNet/data 55 | wget https://zhanglabnet.oss-cn-beijing.aliyuncs.com/prismnet/data/clip_data.tgz 56 | tar zxvf clip_data.tgz 57 | 58 | # Generate training and validation set for binary classification 59 | cd PrismNet 60 | tools/gdata_bin.sh 61 | ``` 62 | 63 | 64 | ## Usage 65 | 66 | ### Network Architecture 67 | 68 | ![prismnet](https://github.com/kuixu/PrismNet/wiki/imgs/prismnet-arch.png) 69 | 70 | ### Training 71 | 72 | To train a single protein model from scratch, run 73 | ``` 74 | exp/EXP_NAME/train.sh pu PrismNet TIA1_Hela clip_data 75 | ``` 76 | where you replace `TIA1_Hela` with the name of the data file you want to use and `EXP_NAME` with a specific name for this experiment. Hyper-parameters can be tuned in `exp/prismnet/train.sh`. For available training options, please take a look at `tools/main.py`. 77 | 78 | To monitor the training process, add the option `-tfboard` in `exp/prismnet/train.sh`, and view the page at http://localhost:6006 using TensorBoard: 79 | ``` 80 | tensorboard --logdir exp/EXP_NAME/out/tfb 81 | ``` 82 | 83 | To train all the protein models, run 84 | ``` 85 | exp/EXP_NAME/train_all.sh 86 | ``` 87 | 88 | ### Evaluation 89 | For evaluation of the models, we provide the script `eval.sh`. You can run it using 90 | ``` 91 | exp/prismnet/eval.sh TIA1_Hela clip_data 92 | ``` 93 | 94 | ### Inference 95 | For running inference on new data (in the same format as the *.tsv files used in [Datasets](#datasets)) with the trained models, we provide the script `infer.sh`.
You can run it using 96 | ``` 97 | exp/prismnet/infer.sh TIA1_Hela /path/to/inference_file.tsv 98 | ``` 99 | 100 | ### Compute High Attention Regions 101 | For computing high attention regions using the trained models, we provide the script `har.sh`. You can run it using 102 | ``` 103 | exp/prismnet/har.sh TIA1_Hela /path/to/inference_file.tsv 104 | ``` 105 | 106 | ### Compute Saliency 107 | For computing saliency using the trained models, we provide the script `saliency.sh`. You can run it using 108 | ``` 109 | exp/prismnet/saliency.sh TIA1_Hela /path/to/inference_file.tsv 110 | ``` 111 | 112 | ### Plot Saliency Image 113 | For plotting saliency images using the trained models, we provide the script `saliencyimg.sh`. You can run it using 114 | ``` 115 | exp/prismnet/saliencyimg.sh TIA1_Hela /path/to/inference_file.tsv 116 | ``` 117 | 118 | ### Motif Construction 119 | For the construction and analysis of integrative motifs, users can use the scripts in `motif_construct/`: 120 | ``` 121 | perl saliency_motif.pl infile.txt sal outfile 122 | Rscript motif_sig.R outfile_motif_summary.txt outfile_motif_sig.txt 123 | ``` 124 | 125 | ### Integrative motif 126 | 127 | The integrative motifs can be downloaded [here](http://prismnet.zhanglab.net/data/Total_motifs-matrix-logo.xlsx). 128 | 129 | 130 | ### Half-Life Analysis (Example) 131 | 132 | #### Download half-life data 133 | ``` 134 | cd PrismNet/data 135 | wget https://zhanglabnet.oss-cn-beijing.aliyuncs.com/prismnet/data/halflife_data.tgz 136 | tar zxvf halflife_data.tgz 137 | ``` 138 | 139 | #### Requirements 140 | ``` 141 | pip install xgboost==1.3.0rc1 matplotlib scipy scikit-learn termplotlib 142 | ``` 143 | 144 | #### Run Example 145 | 146 | ``` 147 | exp/logistic_reg/run.sh 148 | ``` 149 | 150 | ### Dataset and Results Visualization 151 | 152 | We also provide a website [http://prismnet.zhanglab.net/](http://prismnet.zhanglab.net/) to visualize the icSHAPE data and the results. 153 | 154 | ## Copyright and License 155 | This project is free to use for non-commercial purposes - see the [LICENSE](LICENSE) file for details. 156 | 157 | ## Reference 158 | 159 | ``` 160 | @article {Sun2021cr, 161 | title = {Predicting dynamic cellular protein-RNA interactions using deep learning and in vivo RNA structure}, 162 | author = {Sun, Lei and Xu, Kui and Huang, Wenze and Yang, Yucheng T.
and Li, Pan and Tang, Lei and Xiong, Tuanlin and Zhang, Qiangfeng Cliff}, 163 | year = {2021}, 164 | doi = {https://doi.org/10.1038/s41422-021-00476-y}, 165 | journal = {Cell Research} 166 | } 167 | @article {Sun2021cell, 168 | title = {In vivo structural characterization of the whole SARS-CoV-2 RNA genome identifies host cell target proteins vulnerable to re-purposed drugs}, 169 | author = {Sun, Lei and Li, Pan and Ju, Xiaohui and Rao, Jian and Huang, Wenze and Zhang, Shaojun and Xiong, Tuanlin and Xu, Kui and Zhou, Xiaolin and Ren, Lili and Ding, Qiang and Wang, Jianwei and Zhang, Qiangfeng Cliff}, 170 | year = {2021}, 171 | doi = {https://doi.org/10.1016/j.cell.2021.02.008}, 172 | journal = {Cell} 173 | } 174 | ``` 175 | -------------------------------------------------------------------------------- /prismnet/model/PrismNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .resnet import ResidualBlock1D, ResidualBlock2D 5 | from .se import SEBlock 6 | 7 | class Conv2d(nn.Module): 8 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, relu=True, same_padding=False, bn=False): 9 | super(Conv2d, self).__init__() 10 | p0 = int((kernel_size[0] - 1) / 2) if same_padding else 0 11 | p1 = int((kernel_size[1] - 1) / 2) if same_padding else 0 12 | padding = (p0, p1) 13 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=padding) 14 | self.bn = nn.BatchNorm2d(out_channels) if bn else None 15 | self.relu = nn.ReLU(inplace=True) if relu else None 16 | 17 | def forward(self, x): 18 | x = self.conv(x) 19 | if self.bn is not None: 20 | x = self.bn(x) 21 | if self.relu is not None: 22 | x = self.relu(x) 23 | return x 24 | 25 | class PrismNet(nn.Module): 26 | def __init__(self, mode="pu"): 27 | super(PrismNet, self).__init__() 28 | self.mode = mode 29 | h_p, h_k = 2, 5 30 | if mode=="pu": 31 | self.n_features = 5 32 | elif mode=="seq": 33 | self.n_features = 4 34 | h_p, h_k = 1, 3 35 | elif mode=="str": 36 | self.n_features = 1 37 | h_p, h_k = 0, 1 38 | else: 39 | raise "mode error" 40 | 41 | base_channel = 8 42 | self.conv = Conv2d(1, base_channel, kernel_size=(11, h_k), bn = True, same_padding=True) 43 | self.se = SEBlock(base_channel) 44 | self.res2d = ResidualBlock2D(base_channel, kernel_size=(11, h_k), padding=(5, h_p)) 45 | self.res1d = ResidualBlock1D(base_channel*4) 46 | self.avgpool = nn.AvgPool2d((1,self.n_features)) 47 | self.gpool = nn.AdaptiveAvgPool1d(1) 48 | self.fc = nn.Linear(base_channel*4*8, 1) 49 | self._initialize_weights() 50 | 51 | def _initialize_weights(self): 52 | for m in self.modules(): 53 | if isinstance(m, nn.Conv2d): 54 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 55 | if m.bias is not None: 56 | nn.init.constant_(m.bias, 0) 57 | elif isinstance(m, nn.Conv1d): 58 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 59 | if m.bias is not None: 60 | nn.init.constant_(m.bias, 0) 61 | elif isinstance(m, nn.BatchNorm2d): 62 | nn.init.constant_(m.weight, 1) 63 | nn.init.constant_(m.bias, 0) 64 | elif isinstance(m, nn.BatchNorm1d): 65 | nn.init.constant_(m.weight, 1) 66 | nn.init.constant_(m.bias, 0) 67 | elif isinstance(m, nn.Linear): 68 | nn.init.normal_(m.weight, 0, 0.01) 69 | nn.init.constant_(m.bias, 0) 70 | 71 | def forward(self, input): 72 | """[forward] 73 | 74 | Args: 75 | input ([tensor],N,C,W,H): input features 76 | """ 77 | if self.mode=="seq": 78 | input 
= input[:,:,:,:4] 79 | elif self.mode=="str": 80 | input = input[:,:,:,4:] 81 | x = self.conv(input) 82 | x = F.dropout(x, 0.1, training=self.training) 83 | z = self.se(x) 84 | x = self.res2d(x*z) 85 | x = F.dropout(x, 0.5, training=self.training) 86 | x = self.avgpool(x) 87 | x = x.view(x.shape[0], x.shape[1], x.shape[2]) 88 | x = self.res1d(x) 89 | x = F.dropout(x, 0.3, training=self.training) 90 | x = self.gpool(x) 91 | x = x.view(x.shape[0], x.shape[1]) 92 | x = self.fc(x) 93 | return x 94 | 95 | 96 | class PrismNet_large(nn.Module): 97 | def __init__(self, mode="pu"): 98 | super(PrismNet_large, self).__init__() 99 | self.mode = mode 100 | h_p, h_k = 2, 5 101 | if mode=="pu": 102 | self.n_features = 5 103 | elif mode=="seq": 104 | self.n_features = 4 105 | h_p, h_k = 1, 3 106 | elif mode=="str": 107 | self.n_features = 1 108 | h_p, h_k = 0, 1 109 | else: 110 | raise "mode error" 111 | 112 | base_channel = 64 113 | self.conv = Conv2d(1, base_channel, kernel_size=(11, h_k), bn = True, same_padding=True) 114 | self.se = SEBlock(base_channel) 115 | self.res2d = ResidualBlock2D(base_channel, kernel_size=(11, h_k), padding=(5, h_p)) 116 | self.res1d = ResidualBlock1D(base_channel*4) 117 | self.avgpool = nn.AvgPool2d((1,self.n_features)) 118 | self.gpool = nn.AdaptiveAvgPool1d(1) 119 | self.fc = nn.Linear(base_channel*4*8, 1) 120 | self._initialize_weights() 121 | 122 | def _initialize_weights(self): 123 | for m in self.modules(): 124 | if isinstance(m, nn.Conv2d): 125 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 126 | if m.bias is not None: 127 | nn.init.constant_(m.bias, 0) 128 | elif isinstance(m, nn.Conv1d): 129 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 130 | if m.bias is not None: 131 | nn.init.constant_(m.bias, 0) 132 | elif isinstance(m, nn.BatchNorm2d): 133 | nn.init.constant_(m.weight, 1) 134 | nn.init.constant_(m.bias, 0) 135 | elif isinstance(m, nn.BatchNorm1d): 136 | nn.init.constant_(m.weight, 1) 137 | nn.init.constant_(m.bias, 0) 138 | elif isinstance(m, nn.Linear): 139 | nn.init.normal_(m.weight, 0, 0.01) 140 | nn.init.constant_(m.bias, 0) 141 | 142 | def forward(self, input): 143 | """[summary] 144 | 145 | Args: 146 | input ([tensor],N,C,W,H): input features 147 | """ 148 | if self.mode=="seq": 149 | input = input[:,:,:,:4] 150 | elif self.mode=="str": 151 | input = input[:,:,:,4:] 152 | x = self.conv(input) 153 | x = F.dropout(x, 0.1, training=self.training) 154 | z = self.se(x) 155 | x = self.res2d(x*z) 156 | x = F.dropout(x, 0.5, training=self.training) 157 | x = self.avgpool(x) 158 | x = x.view(x.shape[0], x.shape[1], x.shape[2]) 159 | x = self.res1d(x) 160 | x = F.dropout(x, 0.3, training=self.training) 161 | x = self.gpool(x) 162 | x = x.view(x.shape[0], x.shape[1]) 163 | x = self.fc(x) 164 | return x 165 | -------------------------------------------------------------------------------- /prismnet/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | from six.moves import cPickle 4 | from sklearn.metrics import roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score, confusion_matrix 5 | from scipy import stats 6 | 7 | 8 | __all__ = [ 9 | "pearsonr", 10 | "rsquare", 11 | "accuracy", 12 | "roc", 13 | "pr", 14 | "calculate_metrics" 15 | ] 16 | 17 | # class MLMetrics(object): 18 | class MLMetrics(object): 19 | def __init__(self, objective='binary'): 20 | self.objective = objective 21 | self.metrics = [] 22 | 23 | 
def update(self, label, pred, other_lst): 24 | met, _ = calculate_metrics(label, pred, self.objective) 25 | if len(other_lst)>0: 26 | met.extend(other_lst) 27 | self.metrics.append(met) 28 | self.compute_avg() 29 | 30 | def compute_avg(self): 31 | if len(self.metrics)>1: 32 | self.avg = np.array(self.metrics).mean(axis=0) 33 | self.sum = np.array(self.metrics).sum(axis=0) 34 | else: 35 | self.avg = self.metrics[0] 36 | self.sum = self.metrics[0] 37 | self.acc = self.avg[0] 38 | self.auc = self.avg[1] 39 | self.prc = self.avg[2] 40 | self.tp = int(self.sum[3]) 41 | self.tn = int(self.sum[4]) 42 | self.fp = int(self.sum[5]) 43 | self.fn = int(self.sum[6]) 44 | if len(self.avg)>7: 45 | self.other = self.avg[7:] 46 | 47 | 48 | def pearsonr(label, prediction): 49 | ndim = np.ndim(label) 50 | if ndim == 1: 51 | corr = [stats.pearsonr(label, prediction)] 52 | else: 53 | num_labels = label.shape[1] 54 | corr = [] 55 | for i in range(num_labels): 56 | #corr.append(np.corrcoef(label[:,i], prediction[:,i])) 57 | corr.append(stats.pearsonr(label[:,i], prediction[:,i])[0]) 58 | 59 | return corr 60 | 61 | 62 | def rsquare(label, prediction): 63 | ndim = np.ndim(label) 64 | if ndim == 1: 65 | y = label 66 | X = prediction 67 | m = np.dot(X,y)/np.dot(X, X) 68 | resid = y - m*X; 69 | ym = y - np.mean(y); 70 | rsqr2 = 1 - np.dot(resid.T,resid)/ np.dot(ym.T, ym); 71 | metric = [rsqr2] 72 | slope = [m] 73 | else: 74 | num_labels = label.shape[1] 75 | metric = [] 76 | slope = [] 77 | for i in range(num_labels): 78 | y = label[:,i] 79 | X = prediction[:,i] 80 | m = np.dot(X,y)/np.dot(X, X) 81 | resid = y - m*X; 82 | ym = y - np.mean(y); 83 | rsqr2 = 1 - np.dot(resid.T,resid)/ np.dot(ym.T, ym); 84 | metric.append(rsqr2) 85 | slope.append(m) 86 | return metric, slope 87 | 88 | 89 | def accuracy(label, prediction): 90 | ndim = np.ndim(label) 91 | if ndim == 1: 92 | metric = np.array(accuracy_score(label, np.round(prediction))) 93 | else: 94 | num_labels = label.shape[1] 95 | metric = np.zeros((num_labels)) 96 | for i in range(num_labels): 97 | metric[i] = accuracy_score(label[:,i], np.round(prediction[:,i])) 98 | return metric 99 | 100 | 101 | def roc(label, prediction): 102 | ndim = np.ndim(label) 103 | if ndim == 1: 104 | fpr, tpr, thresholds = roc_curve(label, prediction) 105 | score = auc(fpr, tpr) 106 | metric = np.array(score) 107 | curves = [(fpr, tpr)] 108 | else: 109 | num_labels = label.shape[1] 110 | curves = [] 111 | metric = np.zeros((num_labels)) 112 | for i in range(num_labels): 113 | fpr, tpr, thresholds = roc_curve(label[:,i], prediction[:,i]) 114 | score = auc(fpr, tpr) 115 | metric[i]= score 116 | curves.append((fpr, tpr)) 117 | return metric, curves 118 | 119 | 120 | def pr(label, prediction): 121 | ndim = np.ndim(label) 122 | if ndim == 1: 123 | precision, recall, thresholds = precision_recall_curve(label, prediction) 124 | score = auc(recall, precision) 125 | metric = np.array(score) 126 | curves = [(precision, recall)] 127 | else: 128 | num_labels = label.shape[1] 129 | curves = [] 130 | metric = np.zeros((num_labels)) 131 | for i in range(num_labels): 132 | precision, recall, thresholds = precision_recall_curve(label[:,i], prediction[:,i]) 133 | score = auc(recall, precision) 134 | metric[i] = score 135 | curves.append((precision, recall)) 136 | return metric, curves 137 | 138 | def tfnp(label, prediction): 139 | try: 140 | tn, fp, fn, tp = confusion_matrix(label, prediction).ravel() 141 | except Exception: 142 | tp, tn, fp, fn =0,0,0,0 143 | 144 | return tp, tn, fp, fn 145 | 146 | 147 | 
def calculate_metrics(label, prediction, objective): 148 | """calculate metrics for classification""" 149 | # import pdb; pdb.set_trace() 150 | 151 | 152 | if (objective == "binary") | (objective == 'hinge'): 153 | ndim = np.ndim(label) 154 | #if ndim == 1: 155 | # label = one_hot_labels(label) 156 | correct = accuracy(label, prediction) 157 | auc_roc, roc_curves = roc(label, prediction) 158 | auc_pr, pr_curves = pr(label, prediction) 159 | # import pdb; pdb.set_trace() 160 | if ndim == 2: 161 | prediction=prediction[:,0] 162 | label = label[:,0] 163 | # pred_class = prediction[:,0]>0.5 164 | pred_class = prediction>0.5 165 | # tp, tn, fp, fn = tfnp(label[:,0], pred_class) 166 | tp, tn, fp, fn = tfnp(label, pred_class) 167 | # tn8, fp8, fn8, tp8 = tfnp(label[:,0], prediction[prediction>0.8][:,0]) 168 | # import pdb; pdb.set_trace() 169 | mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr),tp, tn, fp, fn] 170 | std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr)] 171 | 172 | elif objective == "categorical": 173 | 174 | correct = np.mean(np.equal(np.argmax(label, axis=1), np.argmax(prediction, axis=1))) 175 | auc_roc, roc_curves = roc(label, prediction) 176 | auc_pr, pr_curves = pr(label, prediction) 177 | mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr)] 178 | std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr)] 179 | for i in range(label.shape[1]): 180 | label_c, prediction_c = label[:,i], prediction[:,i] 181 | auc_roc, roc_curves = roc(label_c, prediction_c) 182 | mean.append(np.nanmean(auc_roc)) 183 | std.append(np.nanstd(auc_roc)) 184 | 185 | 186 | elif (objective == 'squared_error') | (objective == 'kl_divergence') | (objective == 'cdf'): 187 | ndim = np.ndim(label) 188 | #if ndim == 1: 189 | # label = one_hot_labels(label) 190 | label[label<0.5] = 0 191 | label[label>=0.5] = 1 192 | # import pdb; pdb.set_trace() 193 | 194 | correct = accuracy(label, prediction) 195 | auc_roc, roc_curves = roc(label, prediction) 196 | auc_pr, pr_curves = pr(label, prediction) 197 | # import pdb; pdb.set_trace() 198 | if ndim == 2: 199 | prediction=prediction[:,0] 200 | label = label[:,0] 201 | # pred_class = prediction[:,0]>0.5 202 | pred_class = prediction>0.5 203 | # tp, tn, fp, fn = tfnp(label[:,0], pred_class) 204 | tp, tn, fp, fn = tfnp(label, pred_class) 205 | # mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr),tp, tn, fp, fn] 206 | # std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr)] 207 | 208 | 209 | # squared_error 210 | corr = pearsonr(label,prediction) 211 | rsqr, slope = rsquare(label, prediction) 212 | # mean = [np.nanmean(corr), np.nanmean(rsqr), np.nanmean(slope)] 213 | # std = [np.nanstd(corr), np.nanstd(rsqr), np.nanstd(slope)] 214 | 215 | mean = [np.nanmean(correct), np.nanmean(auc_roc), np.nanmean(auc_pr),tp, tn, fp, fn, np.nanmean(corr), np.nanmean(rsqr), np.nanmean(slope)] 216 | std = [np.nanstd(correct), np.nanstd(auc_roc), np.nanstd(auc_pr), np.nanstd(corr), np.nanstd(rsqr), np.nanstd(slope)] 217 | 218 | else: 219 | mean = 0 220 | std = 0 221 | 222 | return [mean, std] 223 | -------------------------------------------------------------------------------- /prismnet/engine/train_loop.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse, os, copy 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from tqdm import tqdm 7 | import prismnet.model as arch 8 | from prismnet.utils 
import log_print, metrics, datautils 9 | 10 | def train(args, model, device, train_loader, criterion, optimizer): 11 | model.train() 12 | met = metrics.MLMetrics(objective='binary') 13 | for batch_idx, (x0, y0) in enumerate(train_loader): 14 | x, y = x0.float().to(device), y0.to(device).float() 15 | if y0.sum() ==0 or y0.sum() ==args.batch_size: 16 | continue 17 | optimizer.zero_grad() 18 | output = model(x) 19 | loss = criterion(output, y) 20 | prob = torch.sigmoid(output) 21 | 22 | y_np = y.to(device='cpu', dtype=torch.long).detach().numpy() 23 | p_np = prob.to(device='cpu').detach().numpy() 24 | met.update(y_np, p_np,[loss.item()]) 25 | loss.backward() 26 | torch.nn.utils.clip_grad_norm_(model.parameters(), 5) 27 | optimizer.step() 28 | 29 | return met 30 | 31 | def validate(args, model, device, test_loader, criterion): 32 | model.eval() 33 | y_all = [] 34 | p_all = [] 35 | l_all = [] 36 | with torch.no_grad(): 37 | for batch_idx, (x0, y0) in enumerate(test_loader): 38 | x, y = x0.float().to(device), y0.to(device).float() 39 | #if y0.sum() ==0: 40 | # import pdb; pdb.set_trace() 41 | output = model(x) 42 | loss = criterion(output, y) 43 | prob = torch.sigmoid(output) 44 | 45 | y_np = y.to(device='cpu', dtype=torch.long).numpy() 46 | p_np = prob.to(device='cpu').numpy() 47 | l_np = loss.item() 48 | 49 | y_all.append(y_np) 50 | p_all.append(p_np) 51 | l_all.append(l_np) 52 | 53 | y_all = np.concatenate(y_all) 54 | p_all = np.concatenate(p_all) 55 | l_all = np.array(l_all) 56 | 57 | met = metrics.MLMetrics(objective='binary') 58 | met.update(y_all, p_all,[l_all.mean()]) 59 | 60 | 61 | 62 | return met, y_all, p_all 63 | 64 | def inference(args, model, device, test_loader): 65 | model.eval() 66 | p_all = [] 67 | with torch.no_grad(): 68 | for batch_idx, (x0, y0) in enumerate(test_loader): 69 | x, y = x0.float().to(device), y0.to(device).float() 70 | output = model(x) 71 | prob = torch.sigmoid(output) 72 | 73 | p_np = prob.to(device='cpu').numpy() 74 | p_all.append(p_np) 75 | 76 | p_all = np.concatenate(p_all) 77 | return p_all 78 | 79 | 80 | def compute_saliency(args, model, device, test_loader, identity): 81 | from prismnet.model import GuidedBackpropSmoothGrad 82 | 83 | model.eval() 84 | 85 | saliency_dir = datautils.make_directory(args.out_dir, "out/saliency") 86 | saliency_path = os.path.join(saliency_dir, identity+'.sal') 87 | 88 | # sgrad = SmoothGrad(model, device=device) 89 | sgrad = GuidedBackpropSmoothGrad(model, device=device) 90 | sal = "" 91 | for batch_idx, (x0, y0) in enumerate(test_loader): 92 | X, Y = x0.float().to(device), y0.to(device).float() 93 | output = model(X) 94 | prob = torch.sigmoid(output) 95 | p_np = prob.to(device='cpu').detach().numpy().squeeze(-1) 96 | guided_saliency = sgrad.get_batch_gradients(X, Y) 97 | # import pdb; pdb.set_trace() 98 | N, NS, _, _ = guided_saliency.shape # (N, 101, 1, 5) 99 | 100 | for i in range(N): 101 | inr = batch_idx*args.batch_size + i 102 | str_sal = datautils.mat2str(np.squeeze(guided_saliency[i])) 103 | sal += "{}\t{:.6f}\t{}\n".format(inr, p_np[i], str_sal) 104 | 105 | f = open(saliency_path,"w") 106 | f.write(sal) 107 | f.close() 108 | print(saliency_path) 109 | 110 | 111 | def compute_saliency_img(args, model, device, test_loader, identity): 112 | from prismnet.model import GuidedBackpropSmoothGrad 113 | from prismnet.utils import visualize 114 | 115 | def saliency_img(X, mul_saliency, outdir="results"): 116 | """generate saliency image 117 | 118 | Args: 119 | X ([np.ndarray]): raw input(L x 5/4) 120 | mul_saliency 
([np.ndarray]): [description] 121 | outdir (str, optional): [description]. Defaults to "results". 122 | """ 123 | if X.shape[-1]==5: 124 | x_str = X[:,4:] 125 | str_null = np.zeros_like(x_str) 126 | ind =np.where(x_str == -1)[0] 127 | str_null[ind,0]=1 128 | 129 | ss = mul_saliency[:,:] 130 | s_str = mul_saliency[:,4:] 131 | s_str = (s_str - s_str.min())/(s_str.max() - s_str.min()) 132 | ss[:,4:] = s_str * (1-str_null) 133 | 134 | str_null=np.squeeze(str_null).T 135 | else: 136 | str_null = None 137 | ss = mul_saliency[:,:] 138 | 139 | visualize.plot_saliency( 140 | X.T, 141 | ss.T, 142 | nt_width=100, 143 | norm_factor=3, 144 | str_null=str_null, 145 | outdir=outdir 146 | ) 147 | 148 | 149 | prefix_n = len(str(len(test_loader.dataset))) 150 | datautils.make_directory(args.out_dir, "out/imgs/") 151 | imgs_dir = datautils.make_directory(args.out_dir, "out/imgs/"+identity) 152 | imgs_path = imgs_dir+'/{:0'+str(prefix_n)+'d}_{:.3f}.pdf' 153 | saliency_path = os.path.join(imgs_dir, 'all.sal') 154 | 155 | # sgrad = SmoothGrad(model, device=device) 156 | sgrad = GuidedBackpropSmoothGrad(model, device=device, magnitude=1) 157 | for batch_idx, (x0, y0) in enumerate(test_loader): 158 | X, Y = x0.float().to(device), y0.to(device).float() 159 | output = model(X) 160 | prob = torch.sigmoid(output) 161 | p_np = prob.to(device='cpu').detach().numpy().squeeze() 162 | guided_saliency = sgrad.get_batch_gradients(X, Y) 163 | mul_saliency = copy.deepcopy(guided_saliency) 164 | mul_saliency[:,:,:,:4] = guided_saliency[:,:,:,:4] * X[:,:,:,:4] 165 | N, NS, _, _ = guided_saliency.shape # (N, 101, 1, 5) 166 | sal = "" 167 | for i in tqdm(range(N)): 168 | inr = batch_idx*args.batch_size + i 169 | str_sal = datautils.mat2str(np.squeeze(guided_saliency[i])) 170 | sal += "{}\t{:.6f}\t{}\n".format(inr, p_np[i], str_sal) 171 | img_path = imgs_path.format(inr, p_np[i]) 172 | # import pdb; pdb.set_trace() 173 | saliency_img( 174 | X[i,0].to(device='cpu').detach().numpy(), 175 | mul_saliency[i,0].to(device='cpu').numpy(), 176 | outdir=img_path) 177 | if not os.path.exists(saliency_path): 178 | f = open(saliency_path,"w") 179 | f.write(sal) 180 | f.close() 181 | print(saliency_path) 182 | 183 | 184 | 185 | def compute_high_attention_region(args, model, device, test_loader, identity): 186 | from prismnet.model import GuidedBackpropSmoothGrad 187 | model.eval() 188 | har_dir = datautils.make_directory(args.out_dir, "out/har") 189 | har_path = os.path.join(har_dir, identity+'.har') 190 | 191 | L = 20 192 | har = "" 193 | # sgrad = SmoothGrad(model, device=device) 194 | sgrad = GuidedBackpropSmoothGrad(model, device=device) 195 | for batch_idx, (x0, y0) in enumerate(test_loader): 196 | X, Y = x0.float().to(device), y0.to(device).float() 197 | output = model(X) 198 | prob = torch.sigmoid(output) 199 | p_np = prob.to(device='cpu').detach().numpy().squeeze() 200 | guided_saliency = sgrad.get_batch_gradients(X, Y) 201 | 202 | attention_region = guided_saliency.sum(dim=3)[:,0,:].to(device='cpu').numpy() # (N, 101, 1) 203 | N,NS = attention_region.shape # (N, 101) 204 | for i in range(N): 205 | inr = batch_idx*args.batch_size + i 206 | iar = attention_region[i] 207 | ar_score = np.array([ iar[j:j+L].sum() for j in range(NS-L+1)]) 208 | # import pdb; pdb.set_trace() 209 | highest_ind = np.argmax(iar) 210 | har += "{}\t{:.6f}\t{}\t{}\n".format(inr, p_np[i], highest_ind, highest_ind+L) 211 | 212 | f = open(har_path,"w") 213 | f.write(har) 214 | f.close() 215 | print(har_path) 216 | 217 | 
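# --- Editor's note -----------------------------------------------------------
# Self-contained sketch (not part of the repository) of fixed-length
# high-attention-region scoring: a window of L consecutive positions is scored
# by the sum of the per-position saliency, and the best window is reported.
# In compute_high_attention_region() above, `ar_score` holds these window sums
# while the written start coordinate is `np.argmax(iar)` (the single
# highest-saliency position); the sketch below picks the window whose sum is
# largest.
def _top_saliency_window(per_position_saliency, L=20):
    """Return (start, end, score) of the best-scoring length-L window."""
    import numpy as np
    sal = np.asarray(per_position_saliency, dtype=float)
    scores = np.array([sal[j:j + L].sum() for j in range(len(sal) - L + 1)])
    start = int(np.argmax(scores))
    return start, start + L, float(scores[start])

if __name__ == "__main__":
    import numpy as np
    rng = np.random.default_rng(0)
    track = rng.random(101)        # a toy 101-nt saliency track
    track[40:60] += 1.0            # plant a high-attention stretch
    print(_top_saliency_window(track, L=20))   # start is expected to be 40
# ------------------------------------------------------------------------------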
-------------------------------------------------------------------------------- /tools/main.py: -------------------------------------------------------------------------------- 1 | import argparse, os, random 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.optim import lr_scheduler 7 | 8 | 9 | from tensorboardX import SummaryWriter 10 | from sklearn import metrics 11 | import numpy as np 12 | 13 | import prismnet.model as arch 14 | from prismnet import train, validate, inference, log_print, compute_saliency, compute_saliency_img, compute_high_attention_region 15 | #compute_high_attention_region 16 | 17 | # from prismnet.engine.train_loop import 18 | from prismnet.model.utils import GradualWarmupScheduler 19 | from prismnet.loader import SeqicSHAPE 20 | from prismnet.utils import datautils 21 | 22 | 23 | def fix_seed(seed): 24 | """ 25 | Seed all necessary random number generators. 26 | """ 27 | if seed is None: 28 | seed = random.randint(1, 10000) 29 | torch.set_num_threads(1) # Suggested for issues with deadlocks, etc. 30 | random.seed(seed) 31 | os.environ['PYTHONHASHSEED'] = str(seed) 32 | np.random.seed(seed) 33 | torch.manual_seed(seed) 34 | torch.cuda.manual_seed(seed) 35 | torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. 36 | torch.backends.cudnn.deterministic = True 37 | torch.backends.cudnn.benchmark = True 38 | torch.backends.cudnn.enabled = True 39 | # print("[Info] cudnn.deterministic set to True. CUDNN-optimized code may be slow.") 40 | 41 | def save_evals(out_dir, filename, dataname, predictions, label, met): 42 | evals_dir = datautils.make_directory(out_dir, "out/evals") 43 | metrics_path = os.path.join(evals_dir, filename+'.metrics') 44 | probs_path = os.path.join(evals_dir, filename+'.probs') 45 | with open(metrics_path,"w") as f: 46 | if "_reg" in filename: 47 | print("{:s}\t{:.3f}\t{:.3f}\t{:.3f}\t{:d}\t{:d}\t{:d}\t{:d}\t{:.3f}\t{:.3f}\t{:.3f}".format( 48 | dataname, 49 | met.acc, 50 | met.auc, 51 | met.prc, 52 | met.tp, 53 | met.tn, 54 | met.fp, 55 | met.fn, 56 | met.avg[7], 57 | met.avg[8], 58 | met.avg[9], 59 | ), file=f) 60 | else: 61 | print("{:s}\t{:.3f}\t{:.3f}\t{:.3f}\t{:d}\t{:d}\t{:d}\t{:d}".format( 62 | dataname, 63 | met.acc, 64 | met.auc, 65 | met.prc, 66 | met.tp, 67 | met.tn, 68 | met.fp, 69 | met.fn, 70 | ), file=f) 71 | with open(probs_path,"w") as f: 72 | for i in range(len(predictions)): 73 | print("{:.3f}\t{}".format(predictions[i,0], label[i,0]), file=f) 74 | print("Evaluation file:", metrics_path) 75 | print("Prediction file:", probs_path) 76 | 77 | def save_infers(out_dir, filename, predictions): 78 | evals_dir = datautils.make_directory(out_dir, "out/infer") 79 | probs_path = os.path.join(evals_dir, filename+'.probs') 80 | with open(probs_path,"w") as f: 81 | for i in range(len(predictions)): 82 | print("{:f}".format(predictions[i,0]), file=f) 83 | print("Prediction file:", probs_path) 84 | 85 | def main(): 86 | global writer, best_epoch 87 | # Training settings 88 | parser = argparse.ArgumentParser(description='Official version of PrismNet') 89 | # Data options 90 | parser.add_argument('--data_dir', type=str, default="data", help='data path') 91 | parser.add_argument('--exp_name', type=str, default="cnn", metavar='N', help='experiment name') 92 | parser.add_argument('--p_name', type=str, default="TIA1_Hela", metavar='N', help='protein name') 93 | parser.add_argument('--out_dir', type=str, default=".", help='output directory') 94 | 
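    # --- Editor's note --------------------------------------------------------
    # Hypothetical invocations (illustrative only; the shell scripts shipped
    # with the code hold the actual commands) showing how the flags defined in
    # this parser combine:
    #   training:   python tools/main.py --train --data_dir data --p_name TIA1_Hela --mode pu
    #   evaluation: python tools/main.py --eval --load_best --p_name TIA1_Hela --mode pu
    #   saliency:   python tools/main.py --saliency --load_best --p_name TIA1_Hela --mode pu \
    #                   --infer_file <tab-separated sequence/structure file>
    # --------------------------------------------------------------------------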
parser.add_argument('--mode', type=str, default="pu", help='data mode') 95 | parser.add_argument("--infer_file", type=str, help="infer file", default="") 96 | # Training Hyper-parameter 97 | parser.add_argument('--arch', default="PrismNet", help='network architecture') 98 | parser.add_argument('--lr_scheduler', default="warmup", help=' lr scheduler: warmup/cosine') 99 | parser.add_argument('--lr', type=float, default=0.0001, help='learning rate') 100 | parser.add_argument('--batch_size', type=int, default=64, help='input batch size') 101 | parser.add_argument('--nepochs', type=int, default=200, help='number of epochs to train') 102 | parser.add_argument('--pos_weight', type=int, default=2, help='positive class weight') 103 | parser.add_argument('--weight_decay', type=float, default=1e-6, help='weight decay, default=1e-6') 104 | parser.add_argument('--early_stopping', type=int, default=20, help='early stopping') 105 | # Training 106 | parser.add_argument('--load_best', action='store_true', help='load best model') 107 | parser.add_argument('--eval', action='store_true', help='eval mode') 108 | parser.add_argument('--train', action='store_true', help='train mode') 109 | parser.add_argument('--infer', action='store_true', help='infer mode') 110 | parser.add_argument('--infer_test', action='store_true', help='infer test from h5') 111 | parser.add_argument('--eval_test', action='store_true', help='eval test from h5') 112 | parser.add_argument('--saliency', action='store_true', help='compute saliency mode') 113 | parser.add_argument('--saliency_img', action='store_true', help='compute saliency and plot image mode') 114 | parser.add_argument('--har', action='store_true', help='compute highest attention region') 115 | # misc 116 | parser.add_argument('--tfboard', action='store_true', help='tf board') 117 | parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') 118 | parser.add_argument('--workers', type=int, help='number of data loading workers', default=2) 119 | parser.add_argument('--log_interval', type=int, default=100, help='log print interval') 120 | parser.add_argument('--seed', type=int, default=1024, help='manual seed') 121 | args = parser.parse_args() 122 | print(args) 123 | use_cuda = not args.no_cuda and torch.cuda.is_available() 124 | 125 | if args.mode == 'pu': 126 | args.nstr = 1 127 | else: 128 | args.nstr = 0 129 | 130 | # out dir 131 | data_path = args.data_dir + "/" + args.p_name + ".h5" 132 | identity = args.p_name+'_'+args.arch+"_"+args.mode 133 | datautils.make_directory(args.out_dir,"out/") 134 | model_dir = datautils.make_directory(args.out_dir,"out/models") 135 | model_path = os.path.join(model_dir, identity+"_{}.pth") 136 | 137 | if args.tfboard: 138 | tfb_dir = datautils.make_directory(args.out_dir,"out/tfb") 139 | writer = SummaryWriter(tfb_dir) 140 | else: 141 | writer = None 142 | # fix random seed 143 | fix_seed(args.seed) 144 | 145 | device = torch.device("cuda" if use_cuda else "cpu") 146 | kwargs = {'num_workers': args.workers, 'pin_memory': True} if use_cuda else {} 147 | 148 | #train_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path), \ 149 | # batch_size=args.batch_size, shuffle=True, **kwargs) 150 | #test_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path, is_test=True), \ 151 | # batch_size=args.batch_size*8, shuffle=False, **kwargs) 152 | #print("Train set:", len(train_loader.dataset)) 153 | #print("Test set:", len(test_loader.dataset)) 154 | 155 | 156 | print("Network Arch:", args.arch) 157 | model = 
getattr(arch, args.arch)(mode=args.mode) 158 | arch.param_num(model) 159 | # print(model) 160 | 161 | if args.load_best: 162 | filename = model_path.format("best") 163 | print("Loading model: {}".format(filename)) 164 | model.load_state_dict(torch.load(filename,map_location='cpu')) 165 | 166 | model = model.to(device) 167 | criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(args.pos_weight)) 168 | 169 | if args.train: 170 | 171 | train_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path), \ 172 | batch_size=args.batch_size, shuffle=True, **kwargs) 173 | 174 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path, is_test=True), \ 175 | batch_size=args.batch_size*8, shuffle=False, **kwargs) 176 | print("Train set:", len(train_loader.dataset)) 177 | print("Test set:", len(test_loader.dataset)) 178 | 179 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999), weight_decay=args.weight_decay) 180 | scheduler = GradualWarmupScheduler( 181 | optimizer, multiplier=8, total_epoch=float(args.nepochs), after_scheduler=None) 182 | 183 | best_auc = 0 184 | best_acc = 0 185 | best_epoch = 0 186 | for epoch in range(1, args.nepochs + 1): 187 | t_met = train(args, model, device, train_loader, criterion, optimizer) 188 | v_met, _, _ = validate(args, model, device, test_loader, criterion) 189 | scheduler.step(epoch) 190 | lr = scheduler.get_lr()[0] 191 | color_best='green' 192 | if best_auc < v_met.auc: 193 | best_auc = v_met.auc 194 | best_acc = v_met.acc 195 | best_epoch = epoch 196 | color_best = 'red' 197 | filename = model_path.format("best") 198 | torch.save(model.state_dict(), filename) 199 | if epoch - best_epoch > args.early_stopping: 200 | print("Early stop at %d, %s "%(epoch, args.exp_name)) 201 | break 202 | 203 | if args.tfboard and writer is not None: 204 | writer.add_scalar('loss/train', t_met.other[0], epoch) 205 | writer.add_scalar('acc/train', t_met.acc, epoch) 206 | writer.add_scalar('AUC/train', t_met.auc, epoch) 207 | writer.add_scalar('lr', lr, epoch) 208 | writer.add_scalar('loss/test', v_met.other[0], epoch) 209 | writer.add_scalar('acc/test', v_met.acc, epoch) 210 | writer.add_scalar('AUC/test', v_met.auc, epoch) 211 | line='{} \t Train Epoch: {} avg.loss: {:.4f} Acc: {:.2f}%, AUC: {:.4f} lr: {:.6f}'.format(\ 212 | args.p_name, epoch, t_met.other[0], t_met.acc, t_met.auc, lr) 213 | log_print(line, color='green', attrs=['bold']) 214 | 215 | line='{} \t Test Epoch: {} avg.loss: {:.4f} Acc: {:.2f}%, AUC: {:.4f} ({:.4f})'.format(\ 216 | args.p_name, epoch, v_met.other[0], v_met.acc, v_met.auc, best_auc) 217 | log_print(line, color=color_best, attrs=['bold']) 218 | 219 | print("{} auc: {:.4f} acc: {:.4f}".format(args.p_name, best_auc, best_acc)) 220 | 221 | filename = model_path.format("best") 222 | print("Loading model: {}".format(filename)) 223 | model.load_state_dict(torch.load(filename)) 224 | 225 | 226 | 227 | if args.eval: 228 | 229 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(data_path, is_test=True), \ 230 | batch_size=args.batch_size*8, shuffle=False, **kwargs) 231 | print("Test set:", len(test_loader.dataset)) 232 | 233 | met, y_all, p_all = validate(args, model, device, test_loader, criterion) 234 | print("> eval {} auc: {:.4f} acc: {:.4f}".format(args.p_name, met.auc, met.acc)) 235 | save_evals(args.out_dir, identity, args.p_name, p_all, y_all, met) 236 | 237 | if args.infer and os.path.exists(args.infer_file): 238 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 239 | 
batch_size=args.batch_size, shuffle=False, **kwargs) 240 | 241 | p_all = inference(args, model, device, test_loader) 242 | identity = identity+"_"+ os.path.basename(args.infer_file).replace(".txt","") 243 | save_infers(args.out_dir, identity, p_all) 244 | 245 | if args.saliency and os.path.exists(args.infer_file): 246 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 247 | batch_size=args.batch_size, shuffle=False, **kwargs) 248 | compute_saliency(args, model, device, test_loader, identity) 249 | 250 | if args.saliency_img and os.path.exists(args.infer_file): 251 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 252 | batch_size=args.batch_size, shuffle=False, **kwargs) 253 | compute_saliency_img(args, model, device, test_loader, identity) 254 | 255 | if args.har and os.path.exists(args.infer_file): 256 | test_loader = torch.utils.data.DataLoader(SeqicSHAPE(args.infer_file, is_infer=True), \ 257 | batch_size=args.batch_size, shuffle=False, **kwargs) 258 | compute_high_attention_region(args, model, device, test_loader, identity) 259 | 260 | 261 | 262 | 263 | 264 | if __name__ == '__main__': 265 | main() 266 | -------------------------------------------------------------------------------- /prismnet/utils/datautils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os, sys, h5py 6 | import numpy as np 7 | from copy import deepcopy 8 | 9 | 10 | 11 | def make_directory(path, foldername, verbose=1): 12 | """make a directory""" 13 | 14 | if not os.path.isdir(path): 15 | os.mkdir(path) 16 | print("making directory: " + path) 17 | 18 | outdir = os.path.join(path, foldername) 19 | if not os.path.isdir(outdir): 20 | os.mkdir(outdir) 21 | print("making directory: " + outdir) 22 | return outdir 23 | 24 | def finished(path, line_num): 25 | """check a results file is finished or not 26 | 27 | Args: 28 | path ([str]): [results file path] 29 | line_num ([int]): [target line number] 30 | """ 31 | 32 | if os.path.exists(path): 33 | with open(path, "r") as f: 34 | if line_num == len(f.readlines()): 35 | return True 36 | else: 37 | return False 38 | else: 39 | return False 40 | 41 | def get_file_names(dataset_path): 42 | file_names = [] 43 | for file_name in os.listdir(dataset_path): 44 | if os.path.splitext(file_name)[1] == '.h5': 45 | file_names.append(file_name) 46 | return file_names 47 | 48 | def md5(string): 49 | return hashlib.md5(string.encode('utf-8')).hexdigest() 50 | 51 | def mat2str(m): 52 | string="" 53 | if len(m.shape)==1: 54 | for j in range(m.shape[0]): 55 | string+= "%.3f," % m[j] 56 | else: 57 | for i in range(m.shape[0]): 58 | for j in range(m.shape[1]): 59 | string+= "%.3f," % m[i,j] 60 | return string 61 | 62 | def rescale(vec, thr=0.0): 63 | ind0 = np.where(vec>=thr)[0] 64 | u_norm = 0.5 * (vec[ind0]-thr)/(vec[ind0].max()) + 0.5 65 | ind2 = np.where(vec<0)[0] 66 | vec_norm = vec.copy() 67 | vec_norm[ind0] = u_norm 68 | vec_norm[ind2] = 0.0 69 | return vec_norm 70 | 71 | 72 | def decodeDNA(m): 73 | na=["A","C","G","U"] 74 | var,inds=np.where(m==1) 75 | seq="" 76 | for i in inds: 77 | seq=seq+na[i] 78 | return seq 79 | 80 | def str_onehot(vec): 81 | thr=0.15 82 | mask_str = np.zeros((2,vec.shape[-1])) 83 | ind =np.where(vec >= thr)[1] 84 | mask_str[1,ind]=1 85 | ind =np.where(vec < thr)[1] 86 | mask_str[0,ind]=1 87 | ind =np.where(vec == 
-1)[1] 88 | mask_str[0,ind]=0.5 89 | mask_str[1,ind]=0.5 90 | return mask_str 91 | 92 | def convert_one_hot(sequence, max_length=None): 93 | """convert DNA/RNA sequences to a one-hot representation""" 94 | 95 | one_hot_seq = [] 96 | for seq in sequence: 97 | seq = seq.upper() 98 | seq_length = len(seq) 99 | one_hot = np.zeros((4,seq_length)) 100 | index = [j for j in range(seq_length) if seq[j] == 'A'] 101 | one_hot[0,index] = 1 102 | index = [j for j in range(seq_length) if seq[j] == 'C'] 103 | one_hot[1,index] = 1 104 | index = [j for j in range(seq_length) if seq[j] == 'G'] 105 | one_hot[2,index] = 1 106 | index = [j for j in range(seq_length) if (seq[j] == 'U') | (seq[j] == 'T')] 107 | one_hot[3,index] = 1 108 | 109 | # handle boundary conditions with zero-padding 110 | if max_length: 111 | offset1 = int((max_length - seq_length)/2) 112 | offset2 = max_length - seq_length - offset1 113 | 114 | if offset1: 115 | one_hot = np.hstack([np.zeros((4,offset1)), one_hot]) 116 | if offset2: 117 | one_hot = np.hstack([one_hot, np.zeros((4,offset2))]) 118 | 119 | one_hot_seq.append(one_hot) 120 | 121 | # convert to numpy array 122 | one_hot_seq = np.array(one_hot_seq) 123 | 124 | return one_hot_seq 125 | 126 | def convert_cat_one_hot(targets): 127 | """convert DNA/RNA sequences to a one-hot representation""" 128 | t_length = len(targets) 129 | cat_num = len(np.unique(targets)) 130 | one_hot = np.zeros((t_length, cat_num)) 131 | for i in range(cat_num): 132 | index = np.where(targets==i)[0] 133 | one_hot[index,i]= 1 134 | return one_hot 135 | 136 | def seq_mutate(seq): 137 | mut_seq = [] 138 | for i in range(len(seq)): 139 | if seq[i] == "A" : 140 | mut_seq.extend([seq[0:i] + "C" + seq[(i+1):], seq[0:i] + "G" + seq[i+1:], seq[0:i] + "T" + seq[i+1:]]) 141 | elif seq[i] == "C" : 142 | mut_seq.extend([seq[0:i] + "A" + seq[i+1:], seq[0:i] + "G" + seq[i+1:], seq[0:i] + "T" + seq[i+1:]]) 143 | elif seq[i] == "G" : 144 | mut_seq.extend([seq[0:i] + "A" + seq[i+1:], seq[0:i] + "C" + seq[i+1:], seq[0:i] + "T" + seq[i+1:]]) 145 | else: 146 | mut_seq.extend([seq[0:i] + "A" + seq[i+1:], seq[0:i] + "C" + seq[i+1:], seq[0:i] + "G" + seq[i+1:]]) 147 | return mut_seq 148 | 149 | 150 | def load_dataset_hdf5(file_path, ss_type='seq'): 151 | 152 | def prepare_data(train, ss_type=None): 153 | if ss_type == 'struct': 154 | structure = train['inputs'][:,:,:,4:9] 155 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 156 | train['inputs'] = paired 157 | return train 158 | 159 | seq = train['inputs'][:,:,:,:4] 160 | 161 | if ss_type == 'pu': 162 | structure = train['inputs'][:,:,:,4:9] 163 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 164 | 165 | if structure.shape[-1]>3: 166 | unpaired = np.expand_dims(np.sum(structure[:,:,:,1:], axis=3), axis=3) 167 | seq = np.concatenate([seq, paired, unpaired], axis=3) 168 | elif structure.shape[-1]==1: 169 | seq = np.concatenate([seq, paired], axis=3) 170 | elif structure.shape[-1]==2: 171 | unpaired = np.expand_dims(structure[:,:,:,1], axis=3) 172 | seq = np.concatenate([seq, paired, unpaired], axis=3) 173 | elif structure.shape[-1]==3: 174 | unpaired = np.expand_dims(structure[:,:,:,1], axis=3) 175 | other = np.expand_dims(structure[:,:,:,2], axis=3) 176 | seq = np.concatenate([seq, paired, unpaired, other], axis=3) 177 | elif ss_type == 'p': 178 | structure = train['inputs'][:,:,:,4:9] 179 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 180 | seq = np.concatenate([seq, paired], axis=3) 181 | elif ss_type == 'struct': 182 | structure = train['inputs'][:,:,:,4:9] 
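            # --- Editor's note -------------------------------------------------
            # Channel layout assumed throughout prepare_data(): inputs[..., 0:4]
            # is the one-hot A/C/G/U sequence, inputs[..., 4] is the structure
            # (icSHAPE) channel used as "paired", and any remaining channels
            # (unpaired/others) are re-stacked per ss_type. A value of -1 in the
            # structure channel marks positions without a structure score (see
            # str_onehot() above and saliency_img() in train_loop.py).
            # -------------------------------------------------------------------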
183 | paired = np.expand_dims(structure[:,:,:,0], axis=3) 184 | HIME = structure[:,:,:,1:] 185 | seq = np.concatenate([seq, paired, HIME], axis=3) 186 | train['inputs'] = seq 187 | return train 188 | 189 | # open dataset 190 | with h5py.File(file_path, 'r') as f: 191 | # load set A data 192 | X_train = np.array(f['X_train']) 193 | Y_train = np.array(f['Y_train']) 194 | X_test = np.array(f['X_test']) 195 | Y_test = np.array(f['Y_test']) 196 | 197 | 198 | 199 | # expand dims of targets 200 | if len(Y_train.shape) == 1: 201 | Y_train = np.expand_dims(Y_train, axis=1) 202 | Y_test = np.expand_dims(Y_test, axis=1) 203 | 204 | # add another dimension to make a 4d tensor 205 | X_train = np.expand_dims(X_train, axis=3).transpose([0, 2, 3, 1]) 206 | X_test = np.expand_dims(X_test, axis=3).transpose([0, 2, 3, 1]) 207 | 208 | # dictionary for each dataset 209 | train = {'inputs': X_train, 'targets': Y_train} 210 | test = {'inputs': X_test, 'targets': Y_test} 211 | 212 | 213 | # parse secondary structure profiles 214 | train = prepare_data(train, ss_type) 215 | test = prepare_data(test, ss_type) 216 | 217 | print("train:",train['inputs'].shape) 218 | print("test:",test['inputs'].shape) 219 | 220 | return train, test 221 | 222 | 223 | def process_data(train, test, method='log_norm'): 224 | """get the results for a single experiment specified by rbp_index. 225 | Then, preprocess the binding affinity intensities according to method. 226 | method: 227 | clip_norm - clip datapoints larger than 4 standard deviations from the mean 228 | log_norm - log transcormation 229 | both - perform clip and log normalization as separate targets (expands dimensions of targets) 230 | """ 231 | 232 | def normalize_data(data, method): 233 | if method == 'standard': 234 | MIN = np.min(data) 235 | data = np.log(data-MIN+1) 236 | sigma = np.mean(data) 237 | data_norm = (data)/sigma 238 | params = sigma 239 | if method == 'clip_norm': 240 | # standard-normal transformation 241 | significance = 4 242 | std = np.std(data) 243 | index = np.where(data > std*significance)[0] 244 | data[index] = std*significance 245 | mu = np.mean(data) 246 | sigma = np.std(data) 247 | data_norm = (data-mu)/sigma 248 | params = [mu, sigma] 249 | 250 | elif method == 'log_norm': 251 | # log-standard-normal transformation 252 | MIN = np.min(data) 253 | data = np.log(data-MIN+1) 254 | mu = np.mean(data) 255 | sigma = np.std(data) 256 | data_norm = (data-mu)/sigma 257 | params = [MIN, mu, sigma] 258 | 259 | elif method == 'both': 260 | data_norm1, params = normalize_data(data, 'clip_norm') 261 | data_norm2, params = normalize_data(data, 'log_norm') 262 | data_norm = np.hstack([data_norm1, data_norm2]) 263 | return data_norm, params 264 | 265 | 266 | # get binding affinities for a given rbp experiment 267 | Y_train = train['targets'] 268 | Y_test = test['targets'] 269 | #import pdb; pdb.set_trace() 270 | 271 | if len(Y_train.shape)==1: 272 | # filter NaN 273 | train_index = np.where(np.isnan(Y_train) == False)[0] 274 | test_index = np.where(np.isnan(Y_test) == False)[0] 275 | Y_train = Y_train[train_index] 276 | Y_test = Y_test[test_index] 277 | X_train = train['inputs'][train_index] 278 | X_test = test['inputs'][test_index] 279 | else: 280 | X_train = train['inputs'] 281 | X_test = test['inputs'] 282 | 283 | # normalize intenensities 284 | if method: 285 | Y_train, params_train = normalize_data(Y_train, method) 286 | Y_test, params_test = normalize_data(Y_test, method) 287 | 288 | # store sequences and intensities 289 | train = {'inputs': X_train, 
'targets': Y_train} 290 | test = {'inputs': X_test, 'targets': Y_test} 291 | 292 | return train, test 293 | 294 | 295 | def down_negative_samples(train, test, ratio=0.0): 296 | """get the results for a single experiment specified by rbp_index. 297 | Then, preprocess the binding affinity intensities according to method. 298 | method: 299 | clip_norm - clip datapoints larger than 4 standard deviations from the mean 300 | log_norm - log transcormation 301 | both - perform clip and log normalization as separate targets (expands dimensions of targets) 302 | """ 303 | if ratio==0.0: 304 | print("No negative down-sampling ratio.") 305 | return train, test 306 | 307 | X_train = train['inputs'] 308 | X_test = test['inputs'] 309 | 310 | Y_train = train['targets']#.astype(np.int32) 311 | Y_test = test['targets']#.astype(np.int32) 312 | 313 | pos_index_tr = np.where(Y_train==1)[0] 314 | pos_index_te = np.where(Y_test==1)[0] 315 | 316 | neg_index_tr = np.where(Y_train==0)[0] 317 | neg_index_te = np.where(Y_test==0)[0] 318 | 319 | n_down_neg_tr = int(ratio * (len(Y_train) - len(neg_index_tr))) 320 | n_down_neg_te = int(ratio * (len(Y_test) - len(neg_index_te))) 321 | 322 | dw_neg_index_tr = np.random.choice(neg_index_tr, size=n_down_neg_tr) 323 | dw_neg_index_te = np.random.choice(neg_index_te, size=n_down_neg_te) 324 | 325 | pos_neg_tr =np.concatenate((dw_neg_index_tr, pos_index_tr)) 326 | pos_neg_te =np.concatenate((dw_neg_index_te, pos_index_te)) 327 | 328 | train = {'inputs': X_train[pos_neg_tr], 'targets': Y_train[pos_neg_tr]} 329 | test = {'inputs': X_test[pos_neg_te], 'targets': Y_test[pos_neg_te]} 330 | 331 | return train, test 332 | 333 | 334 | def load_testset_txt_only_seq(filepath, test, return_trans_id=False, seq_length=101): 335 | print("Reading inference file(only seq):", filepath) 336 | if os.path.exists(filepath+"_test.h5"): 337 | print("loading from h5.") 338 | with h5py.File(filepath+"_test.h5", 'r') as f: 339 | # load set A data 340 | test['inputs'] = f['inputs'] 341 | test['targets'] = f['targets'] 342 | 343 | 344 | if return_trans_id: 345 | blob = np.load(filepath+"_tran.npz") 346 | trans_ids = blob['trans_ids'] 347 | return test, trans_ids 348 | else: 349 | return test 350 | 351 | seqs = [] 352 | trans_ids = [] 353 | with open(filepath,"r") as f: 354 | for line in f.readlines(): 355 | line=line.strip('\n').split('\t') 356 | if len(line[2])!=seq_length: 357 | continue 358 | trans_ids.append(line[0]) 359 | seqs.append(line[1]) 360 | print("Converting.") 361 | input = convert_one_hot(seqs, seq_length) 362 | print("Converted.") 363 | 364 | inputs = np.expand_dims(input, axis=3).transpose([0, 2, 3, 1]) 365 | targets = np.ones((inputs.shape[0],1)) 366 | targets[inputs.shape[0]-1]=0 367 | 368 | test['inputs'] =inputs 369 | test['targets'] =targets 370 | 371 | print("Saving into h5.") 372 | with h5py.File(filepath+"_test.h5", "w") as f: 373 | dset = f.create_dataset("inputs", data=inputs, compression="gzip") 374 | dset = f.create_dataset("targets", data=targets, compression="gzip") 375 | print("Saved.") 376 | 377 | if return_trans_id: 378 | trans_ids = np.array(trans_ids) 379 | return test, trans_ids 380 | else: 381 | return test 382 | 383 | 384 | 385 | def load_testset_txt(filepath, use_structure=True, seq_length=101): 386 | test = {} 387 | 388 | print("Reading inference file:", filepath) 389 | if os.path.exists(filepath+"_test.npz"): 390 | print("loading from npz.") 391 | 392 | f = np.load(filepath+"_test.npz", allow_pickle=True) 393 | test['inputs'] = f['inputs'] 394 | test['targets'] = 
f['targets'] 395 | 396 | return test 397 | 398 | in_ver = 5 399 | seqs = [] 400 | strs = [] 401 | with open(filepath,"r") as f: 402 | for line in f.readlines(): 403 | line=line.strip('\n').split('\t') 404 | if len(line[2])!=seq_length: 405 | continue 406 | seqs.append(line[2]) 407 | if use_structure: 408 | strs.append(line[3]) 409 | in_seq = convert_one_hot(seqs, seq_length) 410 | 411 | if use_structure: 412 | structure = np.zeros((len(seqs), in_ver-4, seq_length)) 413 | for i in range(len(seqs)): 414 | icshape = strs[i].strip(',').split(',') 415 | ti = [float(t) for t in icshape] 416 | ti = np.array(ti).reshape(1,-1) 417 | structure[i] = np.concatenate([ti], axis=0) 418 | input = np.concatenate([in_seq, structure], axis=1) 419 | else: 420 | input = in_seq 421 | 422 | inputs = np.expand_dims(input, axis=3).transpose([0, 3, 2, 1]) 423 | targets = np.ones((in_seq.shape[0],1)) 424 | 425 | targets[in_seq.shape[0]-1]=0 426 | 427 | test['inputs'] = inputs 428 | test['targets'] = targets 429 | print("Saving into npz.") 430 | np.savez_compressed(filepath+"_test.npz", inputs=inputs, targets=targets) 431 | print("Saved.") 432 | 433 | return test 434 | 435 | 436 | 437 | def load_testset_txt_mu(filepath, test, seq_length=101): 438 | print("Reading test file:", filepath) 439 | f_mu = open(filepath,"r") 440 | seqs = [] 441 | strs = [] 442 | use_pu = True 443 | if test['inputs'].shape[-1]==4: 444 | use_pu = False 445 | nf=0 446 | for line in f_mu.readlines(): 447 | nf+=1 448 | line=line.strip('\n').split('\t') 449 | if len(line[2])!=seq_length: 450 | continue 451 | seqs.append(line[2]) 452 | mut_seq=seq_mutate(line[2]) 453 | seqs.extend(mut_seq) 454 | if use_pu: 455 | strs.extend([line[3]] * len(seqs)) 456 | print("file line num:",nf) 457 | print("mut seq num:",len(seqs)) 458 | in_seq = munge.convert_one_hot(seqs, seq_length) 459 | in_ver = 5 460 | if use_pu: 461 | structure = np.zeros((len(seqs), in_ver-4, seq_length)) 462 | for i in range(len(seqs)): 463 | struct_list = strs[i].strip(',').split(',') 464 | ti = np.array([float(t) for t in struct_list]).reshape(1,-1) 465 | structure[i] = np.concatenate([ti], axis=0) 466 | input = np.concatenate([in_seq, structure], axis=1) 467 | else: 468 | input = in_seq 469 | 470 | inputs = np.expand_dims(input, axis=3).transpose([0, 2, 3, 1]) 471 | targets = np.ones((in_seq.shape[0],1)) 472 | 473 | targets[in_seq.shape[0]-1]=0 474 | 475 | test['inputs'] =inputs 476 | test['targets'] =targets 477 | return test 478 | 479 | 480 | def split_dataset(data, targets, valid_frac=0.2): 481 | 482 | ind0 = np.where(targets<0.5)[0] 483 | ind1 = np.where(targets>=0.5)[0] 484 | 485 | n_neg = int(len(ind0)*valid_frac) 486 | n_pos = int(len(ind1)*valid_frac) 487 | 488 | shuf_neg = np.random.permutation(len(ind0)) 489 | shuf_pos = np.random.permutation(len(ind1)) 490 | 491 | X_train = np.concatenate((data[ind1[shuf_pos[n_pos:]]], data[ind0[shuf_neg[n_neg:]]])) 492 | Y_train = np.concatenate((targets[ind1[shuf_pos[n_pos:]]], targets[ind0[shuf_neg[n_neg:]]])) 493 | train = (X_train, Y_train) 494 | 495 | X_test = np.concatenate((data[ind1[shuf_pos[:n_pos]]], data[ind0[shuf_neg[:n_neg]]])) 496 | Y_test = np.concatenate((targets[ind1[shuf_pos[:n_pos]]], targets[ind0[shuf_neg[:n_neg]]])) 497 | test = (X_test, Y_test) 498 | 499 | return train, test 500 | -------------------------------------------------------------------------------- /exp/logistic_reg/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | # Author: XU 
Kui 4 | # Created Time : 09 Nov 2020 11:14:31 PM CST 5 | # Description: 6 | decription: x 7 | """ 8 | import os,sys 9 | import numpy as np 10 | import xgboost as xgb 11 | import matplotlib 12 | matplotlib.use('pdf') 13 | import matplotlib.pyplot as plt 14 | import argparse 15 | from sklearn.metrics import r2_score 16 | # feature_list= ['AARS', 'AATF', 'ABCF1', 'AGGF1', 'AKAP1', 'AKAP8L', 'ALKBH5', 'APOBEC3C', 'AQR', 'ATXN2', 'AUH', 'BCCIP', 'BCLAF1', 'BUD13', 'C17ORF85', 'C22ORF28', 'CAPRIN1', 'CDC40', 'CPEB4', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF4', 'CPSF6', 'CPSF7', 'CSTF2', 'CSTF2T', 'DDX21', 'DDX24', 'DDX3X', 'DDX42', 'DDX51', 'DDX52', 'DDX55', 'DDX59', 'DDX6', 'DGCR8', 'DHX30', 'DKC1', 'DROSHA', 'EFTUD2', 'EIF3D', 'EIF3G', 'EIF3H', 'EIF4A3', 'eIF4AIII', 'EIF4G2', 'ELAVL1', 'EWSR1', 'EXOSC5', 'FAM120A', 'FASTKD2', 'FBL', 'FIP1L1', 'FKBP4', 'FMR1', 'FTO', 'FUS', 'FXR1', 'FXR2', 'G3BP1', 'GEMIN5', 'GNL3', 'GPKOW', 'GRWD1', 'GTF2F1', 'HLTF', 'HNRNPA1', 'HNRNPC', 'HNRNPD', 'HNRNPF', 'HNRNPK', 'HNRNPM', 'HNRNPU', 'HNRNPUL1', 'IGF2BP1', 'IGF2BP2', 'IGF2BP3', 'ILF3', 'KHDRBS1', 'KHSRP', 'LARP4', 'LARP7', 'LIN28A', 'LIN28B', 'LSM11', 'METAP2', 'METTL14', 'METTL3', 'MOV10', 'MTPAP', 'NCBP2', 'NIP7', 'NIPBL', 'NKRF', 'NOL12', 'NOLC1', 'NONO', 'NOP56', 'NOP58', 'NPM1', 'NUDT21', 'PABPC4', 'PABPN1', 'PCBP1', 'PCBP2', 'PHF6', 'PPIG', 'PRPF4', 'PRPF8', 'PTBP1', 'PTBP1PTBP2', 'PUM1', 'PUM2', 'PUS1', 'QKI', 'RBFOX2', 'RBM15', 'RBM22', 'RBM27', 'RBPMS', 'RPS11', 'RPS3', 'RTCB', 'SAFB2', 'SBDS', 'SDAD1', 'SERBP1', 'SF3A3', 'SF3B1', 'SF3B4', 'SLBP', 'SLTM', 'SMNDC1', 'SND1', 'SRRM4', 'SRSF1', 'SRSF7', 'SRSF9', 'SUB1', 'SUPV3L1', 'TAF15', 'TARDBP', 'TBRG4', 'TIA1', 'TIAL1', 'TNRC6A', 'TRA2A', 'TROVE2', 'U2AF1', 'U2AF2', 'U2AF65', 'UCHL5', 'UPF1', 'UTP18', 'UTP3', 'WDR3', 'WDR33', 'WDR43', 'WRN', 'WTAP', 'XRCC6', 'XRN2', 'YBX3', 'YTHDF2', 'YWHAG', 'ZC3H11A', 'ZC3H7B', 'ZNF622', 'ZNF800', 'ZRANB2'] 17 | feature_list= ['AARS', 'AATF', 'ABCF1', 'AGGF1', 'AKAP1', 'AKAP8L', 'ALKBH5', 'APOBEC3C', 'AQR', 'ATXN2', 'AUH', 'BCCIP', 'BCLAF1', 'BUD13', 'C17ORF85', 'C22ORF28', 'CAPRIN1', 'CDC40', 'CPEB4', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF4', 'CPSF6', 'CPSF7', 'CSTF2', 'CSTF2T', 'DDX21', 'DDX24', 'DDX3X', 'DDX42', 'DDX51', 'DDX52', 'DDX55', 'DDX59', 'DDX6', 'DGCR8', 'DHX30', 'DKC1', 'DROSHA', 'EFTUD2', 'EIF3D', 'EIF3G', 'EIF3H', 'EIF4A3', 'EIF4G2', 'ELAVL1', 'EWSR1', 'EXOSC5', 'FAM120A', 'FASTKD2', 'FBL', 'FIP1L1', 'FKBP4', 'FMR1', 'FTO', 'FUS', 'FXR1', 'FXR2', 'G3BP1', 'GEMIN5', 'GNL3', 'GPKOW', 'GRWD1', 'GTF2F1', 'HLTF', 'HNRNPA1', 'HNRNPC', 'HNRNPD', 'HNRNPF', 'HNRNPK', 'HNRNPM', 'HNRNPU', 'HNRNPUL1', 'IGF2BP1', 'IGF2BP2', 'IGF2BP3', 'ILF3', 'KHDRBS1', 'KHSRP', 'LARP4', 'LARP7', 'LIN28A', 'LIN28B', 'LSM11', 'METAP2', 'METTL14', 'METTL3', 'MOV10', 'MTPAP', 'NCBP2', 'NIP7', 'NIPBL', 'NKRF', 'NOL12', 'NOLC1', 'NONO', 'NOP56', 'NOP58', 'NPM1', 'NUDT21', 'PABPC4', 'PABPN1', 'PCBP1', 'PCBP2', 'PHF6', 'PPIG', 'PRPF4', 'PRPF8', 'PTBP1', 'PTBP1PTBP2', 'PUM1', 'PUM2', 'PUS1', 'QKI', 'RBFOX2', 'RBM15', 'RBM22', 'RBM27', 'RBPMS', 'RPS11', 'RPS3', 'RTCB', 'SAFB2', 'SBDS', 'SDAD1', 'SERBP1', 'SF3A3', 'SF3B1', 'SF3B4', 'SLBP', 'SLTM', 'SMNDC1', 'SND1', 'SRRM4', 'SRSF1', 'SRSF7', 'SRSF9', 'SUB1', 'SUPV3L1', 'TAF15', 'TARDBP', 'TBRG4', 'TIA1', 'TIAL1', 'TNRC6A', 'TRA2A', 'TROVE2', 'U2AF1', 'U2AF2', 'U2AF65', 'UCHL5', 'UPF1', 'UTP18', 'UTP3', 'WDR3', 'WDR33', 'WDR43', 'WRN', 'WTAP', 'XRCC6', 'XRN2', 'YBX3', 'YTHDF2', 'YWHAG', 'ZC3H11A', 'ZC3H7B', 'ZNF622', 'ZNF800', 'ZRANB2', 'eIF4AIII'] 18 | spec_list = 
["AUH","HNRNPC","HNRNPU","IGF2BP1","IGF2BP3","LIN28B","SND1","TAF15","TIA1","FMR1","FXR1","FXR2","ILF3","KHDRBS1","KHSRP","PTBP1","TARDBP","TNRC6A","XRN2","BCLAF1","DDX6","EXOSC5","G3BP1","LARP4","NCBP2","PABPN1","PCBP1","SUPV3L1","UPF1","YBX3","PABPC4","PUM1","PUM2","SERBP1","HNRNPD","HNRNPF","QKI"] 19 | 20 | top20_list = ['SND1', 'NPM1', 'KHDRBS1', 'GNL3', 'HNRNPUL1', 'TARDBP', 'ELAVL1', 'YTHDF2', 21 | 'YBX3', 'LIN28B', 'YWHAG', 'ZC3H7B', 'TIA1', 'PUM2', 'RBFOX2', 'SERBP1', 'RBPMS', 22 | 'RPS3', 'PUM1', 'PRPF8'] 23 | spec_top_list = ['AUH', 'BCLAF1', 'DDX6', 'ELAVL1', 'EXOSC5', 'FMR1', 'FXR1', 'FXR2', 'G3BP1', 'GNL3', 'HNRNPC', 'HNRNPD', 'HNRNPF', 'HNRNPU', 'HNRNPUL1', 'IGF2BP1', 'IGF2BP3', 'ILF3', 'KHDRBS1', 'KHSRP', 'LARP4', 'LIN28B', 'NCBP2', 'NPM1', 'PABPC4', 'PABPN1', 'PCBP1', 'PRPF8', 'PTBP1', 'PUM1', 'PUM2', 'QKI', 'RBFOX2', 'RBPMS', 'RPS3', 'SERBP1', 'SND1', 'SUPV3L1', 'TAF15', 'TARDBP', 'TIA1', 'TNRC6A', 'UPF1', 'XRN2', 'YBX3', 'YTHDF2', 'YWHAG', 'ZC3H7B'] 24 | import scipy.stats 25 | 26 | import termplotlib as tpl 27 | # from data_utils import load_data 28 | 29 | import pickle 30 | from sklearn import datasets, ensemble 31 | # from sklearn.ensemble import HistGradientBoostingRegressor 32 | from sklearn.inspection import permutation_importance 33 | 34 | 35 | 36 | def plot(x, y, label="plot"): 37 | fig = tpl.figure() 38 | fig.plot(x, y, label=label, width=50, height=15) 39 | fig.show() 40 | 41 | def plot_hist(sample,bins=40): 42 | counts, bin_edges = np.histogram(sample, bins=bins) 43 | fig = tpl.figure() 44 | fig.hist(counts, bin_edges, grid=[15, 25], orientation="horizontal",force_ascii=False) 45 | fig.show() 46 | 47 | def normy(x): 48 | return (x-x.min())/(x.max() - x.min()) 49 | 50 | def normx(x): 51 | return 1/(1 + np.exp(-x.astype("float"))) 52 | # return (x-x.mean())/x.std() 53 | 54 | def get_topk_important_fea(filepath, topk=4): 55 | global feature_list 56 | feature_name=np.array(feature_list) 57 | weight = np.load(filepath, allow_pickle=True) 58 | gain = weight['gain'].tolist() 59 | fea_gain = np.zeros(len(gain)) 60 | for i in range(len(gain)): 61 | fea_gain[i] = gain['f'+str(i)] 62 | topk_flist = fea_gain.argsort()[::-1][:topk] 63 | 64 | 65 | return topk_flist 66 | 67 | 68 | def get_topk_important_fea1(reg, topk=4): 69 | # global feature_list 70 | # feature_name=np.array(feature_list) 71 | # weight = np.load(filepath, allow_pickle=True) 72 | # fscore = bst.get_fscore() 73 | feature_importance = reg.feature_importances_ 74 | topk_flist = np.argsort(feature_importance)[::-1][:topk] 75 | return topk_flist 76 | 77 | def get_topk_important_fea2(bst, topk=4): 78 | global feature_list 79 | feature_name=np.array(feature_list) 80 | # weight = np.load(filepath, allow_pickle=True) 81 | # fscore = bst.get_fscore() 82 | fscore = bst.get_score(importance_type='gain') 83 | fea_fscore = np.zeros(len(fscore)) 84 | for i in range(len(fscore)):fea_fscore[i] = fscore['f'+str(i)] 85 | topk_flist = fea_fscore.argsort()[::-1][:topk] 86 | return topk_flist 87 | 88 | 89 | 90 | ## 91 | # this script demonstrate how to fit generalized linear model in xgboost 92 | # basically, we are using linear model, instead of tree for our boosters 93 | 94 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 95 | parser.add_argument('--batch-size', type=int, default=640, metavar='N', 96 | help='input batch size for training (default: 64)') 97 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 98 | help='input batch size for testing (default: 1000)') 99 | 
parser.add_argument('--epochs', type=int, default=400, metavar='N', 100 | help='number of epochs to train (default: 14)') 101 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 102 | help='learning rate (default: 1.0)') 103 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 104 | help='Learning rate step gamma (default: 0.7)') 105 | parser.add_argument('--no-cuda', action='store_true', default=False, 106 | help='disables CUDA training') 107 | parser.add_argument('--cv', action='store_true', default=False, 108 | help='quickly check a single pass') 109 | parser.add_argument('--seed', type=int, default=1, metavar='S', 110 | help='random seed (default: 1)') 111 | parser.add_argument('--log-interval', type=int, default=100, metavar='N', 112 | help='how many batches to wait before logging training status') 113 | parser.add_argument('--save-model', action='store_true', default=False, 114 | help='For Saving the current Model') 115 | parser.add_argument('--train_data', default='', type=str, 116 | help="path of the training data to use") 117 | parser.add_argument('--test_data', default='', type=str, 118 | help="path of the training data to use") 119 | parser.add_argument('--pred_data', default='', type=str, 120 | help="path of the training data to use") 121 | parser.add_argument('--model_path', default='', type=str, 122 | help="path of the training data to use") 123 | parser.add_argument('--reg', default='squarederror', type=str, 124 | help="path of the training data to use") 125 | parser.add_argument('--booster', default='gbtree', type=str, 126 | help="path of the training data to use") 127 | parser.add_argument('--lam', type=int, default=-1, 128 | help='L2 reg (default: -1)' 129 | ) 130 | parser.add_argument('--topk', type=int, default=0, 131 | help='topk features (default: 0)') 132 | parser.add_argument('--randk', type=int, default=0, 133 | help='random k features (default: 0)') 134 | parser.add_argument('--load_best', action='store_true', default=False, 135 | help='load best model') 136 | parser.add_argument('--fine_tune', action='store_true', default=False, 137 | help='fine tuning ') 138 | parser.add_argument('--cell_expr', action='store_true', default=False, 139 | help='using cell expression') 140 | parser.add_argument('--normx', action='store_true', default=False, 141 | help='norm input data') 142 | parser.add_argument('--plot', action='store_true', default=False, 143 | help='norm input data') 144 | parser.add_argument('--fsel', type=int, default=1, 145 | help='feature selector (default: -1)' 146 | ) 147 | parser.add_argument('--sellist', type=int, default=0, 148 | help='feature selector (default: -1)' 149 | ) 150 | args = parser.parse_args() 151 | 152 | traindata = args.train_data 153 | testdata = args.test_data 154 | preddata = args.pred_data 155 | 156 | if not os.path.exists(preddata): 157 | print(preddata," not found.") 158 | preddata = "" 159 | if args.fine_tune: 160 | traindata = preddata.replace(".train.npz",".test.npz") 161 | print("Fine-tune on ",traindata) 162 | 163 | print("Reading train data:",traindata) 164 | print("Reading test data:",testdata) 165 | print("Reading pred data:",preddata) 166 | 167 | t_data = np.load(traindata,allow_pickle=True) 168 | e_data = np.load(testdata,allow_pickle=True) 169 | if preddata!="": 170 | p_data = np.load(preddata,allow_pickle=True) 171 | 172 | t_x = t_data['x'] 173 | t_y = t_data['y'] 174 | e_x = e_data['x'] 175 | e_y = e_data['y'] 176 | if preddata!="": 177 | p_x = p_data['x'] 178 | p_y = p_data['y'] 179 | 
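# --- Editor's note -----------------------------------------------------------
# Layout of the .npz files loaded above, as implied by how this script indexes
# them: key 'x' is an (N, n_features) matrix with one column per RBP in
# `feature_list` (171 columns, judging by the np.random.randint(171, ...) call
# in the --randk branch below), and key 'y' is the length-N regression target,
# later rescaled with (y + 1) / 2. A toy file with the same structure could be
# written as (hypothetical, for illustration only):
#   np.savez_compressed("toy.train.npz",
#                       x=np.random.rand(1000, len(feature_list)),
#                       y=np.random.uniform(-1, 1, size=1000))
# ------------------------------------------------------------------------------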
180 | print(" train X: min,max: {:.3f} {:.3f} {}".format(t_x.min(), t_x.max(), t_x.shape)) 181 | print(" train Y: min,max: {:.3f} {:.3f}".format(t_y.min(), t_y.max())) 182 | print(" test X: min,max: {:.3f} {:.3f} {}".format(e_x.min(), e_x.max(), e_x.shape)) 183 | print(" test Y: min,max: {:.3f} {:.3f}".format(e_y.min(), e_y.max())) 184 | if preddata!="": 185 | print(" pred X: min,max: {:.3f} {:.3f}".format(p_x.min(), p_x.max(), p_x.shape)) 186 | print(" pred Y: min,max: {:.3f} {:.3f}".format(p_y.min(), p_y.max())) 187 | 188 | 189 | 190 | norm_x = args.normx 191 | if norm_x: 192 | t_x = normx(t_x) 193 | e_x = normx(e_x) 194 | if preddata!="": 195 | p_x = normx(p_x) 196 | 197 | # plot_hist(t_y) 198 | # print("-------------------------------------------") 199 | norm_y = True 200 | if norm_y: 201 | # t_y0 = np.zeros_like(t_y) 202 | # e_y0 = np.zeros_like(t_y) 203 | # import math 204 | # for i in range(t_y.shape[0]): 205 | # t_y0[i] = math.log(t_y[i]+1) 206 | 207 | # for i in range(e_y.shape[0]): 208 | # e_y0[i] = math.log(e_y[i]+1) 209 | # t_y = t_y0 210 | # e_y = e_y0 211 | # import pdb; pdb.set_trace() 212 | # t_y = np.log((t_y+1).astype("float")) 213 | # e_y = np.log((e_y+1).astype("float")) 214 | # t_x = abs(t_x) 215 | # e_x = abs(e_x) 216 | # t_y = np.log((t_y/2+2).astype("float")) 217 | # e_y = np.log((e_y/2+2).astype("float")) 218 | # t_x = t_x/10 219 | # e_x = e_x/10 220 | # t_y = np.log((t_y/2+2).astype("float")) 221 | # e_y = np.log((e_y/2+2).astype("float")) 222 | 223 | t_y = (t_y+1)/2 224 | e_y = (e_y+1)/2 225 | 226 | if preddata!="": 227 | # import pdb; pdb.set_trace() 228 | p_y = (p_y+1)/2 229 | 230 | 231 | # plot_hist(t_y) 232 | feature_name=np.array(feature_list) 233 | # if args.topk>0: 234 | # # filepath = args.model_path+"_weight_eval_test.npz" 235 | # # topk_list = get_topk_important_fea(filepath, topk=args.topk) 236 | # # feature_list = feature_name[topk_list] 237 | # # print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 238 | # bst = xgb.Booster(model_file=args.model_path) 239 | # topk_list = get_topk_important_fea2(bst,args.topk) 240 | # feature_list = feature_name[topk_list] 241 | # t_x = t_x[:,topk_list] 242 | # e_x = e_x[:,topk_list] 243 | # if preddata!="": 244 | # p_x = p_x[:,topk_list] 245 | # print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 246 | # # import pdb; pdb.set_trace() 247 | # args.model_path = args.model_path.replace("_best.model", "_topk{}_best.model".format(args.topk)) 248 | if args.sellist>0: 249 | if args.sellist==1: 250 | topk_list=[feature_list.index(p) for p in spec_list] 251 | 252 | elif args.sellist==2: 253 | topk_list=[feature_list.index(p) for p in spec_top_list] 254 | elif args.sellist==3: # top 20 255 | topk_list=[feature_list.index(p) for p in top20_list] 256 | else: 257 | raise "error no such list." 
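        # --- Editor's note ----------------------------------------------------
        # Raising a bare string is a TypeError on Python 3 ("exceptions must
        # derive from BaseException"); something like
        # raise ValueError("no such feature list") would be needed if this
        # branch were ever reached.
        # ----------------------------------------------------------------------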
258 | 259 | 260 | feature_list = feature_name[topk_list] 261 | t_x = t_x[:,topk_list] 262 | e_x = e_x[:,topk_list] 263 | if preddata!="": 264 | p_x = p_x[:,topk_list] 265 | print("Using Top {} features: {}".format(args.topk, topk_list)) 266 | print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 267 | # import pdb; pdb.set_trace() 268 | # args.model_path = args.model_path.replace("_best.skl", "_topk{}_best.skl".format(args.topk)) 269 | args.model_path = args.model_path.replace("_best.model", "_spec{}_best.model".format(args.sellist)) 270 | 271 | 272 | if args.topk>0: 273 | 274 | # filepath = args.model_path+"_weight_eval_test.npz" 275 | # topk_list = get_topk_important_fea(filepath, topk=args.topk) 276 | # feature_list = feature_name[topk_list] 277 | # print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 278 | # bst = xgb.Booster(model_file=args.model_path) 279 | 280 | if args.fsel ==1: 281 | skl_model_path = args.model_path.replace("_best.model", "_best.skl") 282 | reg0 = pickle.load(open(skl_model_path, 'rb')) 283 | print("topk important_fea") 284 | topk_list = get_topk_important_fea1(reg0,args.topk) 285 | elif args.fsel ==2: 286 | skl_model_path = args.model_path.replace("_best.model", "_best.skl") 287 | reg0 = pickle.load(open(skl_model_path, 'rb')) 288 | print("topk permutation_importance") 289 | 290 | result = permutation_importance(reg0, e_x, e_y, n_repeats=10, 291 | random_state=42, n_jobs=2) 292 | topk_list = result.importances_mean.argsort()[::-1][:args.topk]#[::-1] 293 | else: 294 | print("topk gain") 295 | bst = xgb.Booster(model_file=args.model_path) 296 | topk_list = get_topk_important_fea2(bst,args.topk) 297 | 298 | 299 | feature_list = feature_name[topk_list] 300 | t_x = t_x[:,topk_list] 301 | e_x = e_x[:,topk_list] 302 | if preddata!="": 303 | p_x = p_x[:,topk_list] 304 | print("Using Top {} features: {}".format(args.topk, topk_list)) 305 | print("Using Top {} features: {}".format(args.topk, feature_name[topk_list])) 306 | # import pdb; pdb.set_trace() 307 | # args.model_path = args.model_path.replace("_best.skl", "_topk{}_best.skl".format(args.topk)) 308 | args.model_path = args.model_path.replace("_best.model", "_topk{}_best.model".format(args.topk)) 309 | 310 | if args.randk>0: 311 | # filepath = args.model_path+"_weight_eval_test.npz" 312 | topk_list = np.random.randint(171, size=args.randk) 313 | feature_list = feature_name[topk_list] 314 | t_x = t_x[:,topk_list] 315 | e_x = e_x[:,topk_list] 316 | if preddata!="": 317 | p_x = p_x[:,topk_list] 318 | print("Using Random {} features: {}".format(args.randk, feature_name[topk_list])) 319 | # print("Using Random {} features.".format(args.randk)) 320 | args.model_path = args.model_path.replace("_best.model", "_randk{}_best.model".format(args.randk)) 321 | 322 | print(" train X: min,max: {:.3f} {:.3f} {}".format(t_x.min(), t_x.max(), t_x.shape)) 323 | print(" train Y: min,max: {:.3f} {:.3f}".format(t_y.min(), t_y.max())) 324 | print(" test X: min,max: {:.3f} {:.3f} {}".format(e_x.min(), e_x.max(), e_x.shape)) 325 | print(" test Y: min,max: {:.3f} {:.3f}".format(e_y.min(), e_y.max())) 326 | if preddata!="": 327 | print(" pred X: min,max: {:.3f} {:.3f}".format(p_x.min(), p_x.max(), p_x.shape)) 328 | print(" pred Y: min,max: {:.3f} {:.3f}".format(p_y.min(), p_y.max())) 329 | 330 | 331 | # dtrain = xgb.DMatrix(t_x, label=t_y, feature_names=feature_list) 332 | # dtest = xgb.DMatrix(e_x, label=e_y, feature_names=feature_list) 333 | # if preddata!="": 334 | # dpred = xgb.DMatrix(p_x, 
label=p_y, feature_names=feature_list) 335 | # import pdb; pdb.set_trace() 336 | 337 | dtrain = xgb.DMatrix(t_x, label=t_y) 338 | dtest = xgb.DMatrix(e_x, label=e_y) 339 | if preddata!="": 340 | dpred = xgb.DMatrix(p_x, label=p_y) 341 | # change booster to gblinear, so that we are fitting a linear model 342 | # alpha is the L1 regularizer 343 | # lambda is the L2 regularizer 344 | # you can also set lambda_bias which is L2 regularizer on the bias term 345 | param = {'objective':'reg:squarederror', 'booster':'gbtree',"eval_metric": 'rmse', 346 | 'lambda': 16, 'eta':0.1} 347 | param = {'objective':'reg:'+args.reg, 'booster':args.booster,"eval_metric": 'rmse', 348 | 'lambda': 16, 'eta':0.1} 349 | print(param) 350 | # normally, you do not need to set eta (step_size) 351 | # XGBoost uses a parallel coordinate descent algorithm (shotgun), 352 | # there could be affection on convergence with parallelization on certain cases 353 | # setting eta to be smaller value, e.g 0.5 can make the optimization more stable 354 | # param['eta'] = 1 355 | 356 | ## 357 | # the rest of settings are the same 358 | ## 359 | watchlist = [(dtrain, 'train'),(dtest, 'eval'), ] 360 | num_round = 3000 361 | best_r = 0 362 | best_l = 0 363 | best_p = 0 364 | 365 | 366 | for la in range(0, 30, 2): 367 | if args.lam >= 0: 368 | param['lambda']=args.lam 369 | else: 370 | param['lambda'] = la 371 | print('lambda:', param['lambda']) 372 | 373 | early_stopping_rounds = 40 374 | if args.load_best: 375 | print("Loading best model.") 376 | bst = xgb.Booster(model_file=args.model_path) 377 | # topk_list = get_topk_important_fea2(bst,args.topk) 378 | # feature_list = feature_name[topk_list] 379 | # print("Using Top {} features22: {}".format(args.topk, feature_name[topk_list])) 380 | # import pdb; pdb.set_trace() 381 | # bst.save_model(args.model_path) 382 | if args.fine_tune: 383 | print("Fine tuning.") 384 | early_stop = xgb.callback.EarlyStopping( 385 | rounds=early_stopping_rounds, 386 | metric_name='rmse', 387 | save_best=True, 388 | data_name='eval' 389 | ) 390 | bst = xgb.train(param, dtrain, num_round, watchlist, callbacks=[early_stop],) 391 | args.model_path = args.model_path.replace("_best.model", "_finetune_best.model") 392 | elif args.cv: 393 | nfold = 5 394 | print("Do Cross Validation: {} fold.".format(nfold)) 395 | param['verbosity']=1 396 | hist = xgb.cv(param, dtrain, num_round, 397 | nfold=nfold, 398 | verbose_eval=True, 399 | early_stopping_rounds=early_stopping_rounds) 400 | print(hist) 401 | else: 402 | early_stop = xgb.callback.EarlyStopping( 403 | rounds=early_stopping_rounds, 404 | metric_name='rmse', 405 | save_best=True, 406 | data_name='eval' 407 | ) 408 | bst = xgb.train(param, dtrain, num_round, watchlist, callbacks=[early_stop],) 409 | bst.save_model(args.model_path) 410 | gain = bst.get_score(importance_type='gain') 411 | total_gain = bst.get_score(importance_type='total_gain') 412 | 413 | e_preds = bst.predict(dtest) 414 | e_labels = dtest.get_label() 415 | r, p = scipy.stats.pearsonr(e_labels, e_preds) 416 | # r2=r2_score(labels, preds) 417 | print("Test R: {:f}, R^2: {:f}, P-value: {:e}".format(r, r**2, p)) 418 | # if preddata!="": 419 | # r,p = predict(bst, dtest) 420 | if r> best_r: 421 | best_r = r 422 | best_p = p 423 | best_l = la 424 | print("### -> Best...") 425 | 426 | if preddata!="": 427 | # print("dpred: ",p_x.shape) 428 | p_preds = bst.predict(dpred) 429 | p_labels = dpred.get_label() 430 | r, p = scipy.stats.pearsonr(p_labels, p_preds) 431 | print("Pred R: {:f}, R^2: {:f}, P-value: 
{:e}".format(r, r**2, p)) 432 | else: 433 | p_labels=None 434 | p_preds=None 435 | 436 | 437 | np.savez_compressed(args.model_path+"_weight_eval_test.npz", 438 | gain=gain, 439 | total_gain=total_gain, 440 | eval_label=e_labels, 441 | eval_pred=e_preds, 442 | test_label=p_labels, 443 | test_pred=p_preds, 444 | ) 445 | if args.lam >= 0: # pred 446 | sys.exit(0) 447 | 448 | print("Best la: {}\nR: {:f}, R^2: {:f}, P-value: {:e}".format(best_l, best_r, best_r**2, p)) 449 | # print("Best R: {:f}, R^2: {:f}, lambda: {}".format(best_r, best_r**2, best_l)) 450 | # gain = bst.get_score(importance_type='gain') 451 | # total_gain = bst.get_score(importance_type='total_gain') 452 | # np.savez_compressed("low_fi_{:d}.npz".format(la), gain=gain, total_gain=total_gain) 453 | # xgb.plot_importance(bst,importance_type='gain', max_num_features=20) 454 | # plt.savefig("fi_{:d}.pdf".format(la)) 455 | # print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds)))) 456 | -------------------------------------------------------------------------------- /motif_construct/saliency_motif.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use Cwd; 4 | use List::Util qw/max min sum maxstr minstr shuffle/; 5 | 6 | my $infile = $ARGV[0]; 7 | my $prot_cell = $ARGV[1]; 8 | 9 | my $usage = "This script is to build the motif from PrismNet model output attention file. 10 | usage: $0 11 | 12 | example: perl saliency_motif.pl infile.sal outfile 13 | "; 14 | die $usage if $#ARGV<1; 15 | 16 | #the input file containing the attention signal. 17 | #my $infile = $prot_cell."_5v_binary_99999_binary_icbind_1_pu_ana0_test.txt"; 18 | 19 | 20 | my $site_file = $prot_cell."_seq_20_8"; 21 | my $kmer_file = $prot_cell."_seq_6kmer"; 22 | my $motif_file = $prot_cell."_motif_10"; 23 | 24 | my $bind_inf = &fdata_read3($infile); 25 | my $site_seq = &bind_select3($bind_inf, 10, 0.8, 0.2, $prot_cell); 26 | my %sseq = %{$site_seq}; 27 | 28 | open(OUT1, ">", $site_file."_seq.fa"); 29 | foreach my $k1 (sort {${$sseq{$b}}[0] <=> ${$sseq{$a}}[0]} keys %sseq){ 30 | print OUT1 ">",$k1,"|",${$sseq{$k1}}[0],"\n",${$sseq{$k1}}[2],"\n",${$sseq{$k1}}[3],"\n",${$sseq{$k1}}[4],"\n"; 31 | } 32 | close OUT1; 33 | 34 | 35 | my ($Kmer1, $Kmer_loc) = &kmer_cal2($site_seq, 6); 36 | my %kkmer1 = %{$Kmer1}; 37 | my %kkmer_loc = %{$Kmer_loc}; 38 | 39 | open(OUT1, ">", $kmer_file."_seq.txt"); 40 | foreach my $k1 (sort {$kkmer1{$b} <=> $kkmer1{$a}} keys %kkmer1){ 41 | print OUT1 $k1,"\t",$kkmer1{$k1},"\n"; 42 | } 43 | close OUT1; 44 | 45 | 46 | my ($Motif_matrix1, $Motif_matrix2) = &combine_kmer($Kmer1, "-ACGT", $Kmer_loc, $bind_inf, $prot_cell); 47 | &motif_print($Motif_matrix1, $prot_cell, 10, $motif_file."_seq.meme", "-ACGT"); 48 | 49 | my %mmat1 = %{$Motif_matrix1}; 50 | my %mmat2 = %{$Motif_matrix2}; 51 | open(OUT1, ">", $motif_file."_str.meme"); 52 | foreach my $k1 (sort {$a<=>$b} keys %mmat2){ 53 | my %tmp = %{$mmat2{$k1}}; 54 | print OUT1 $k1,"\n"; 55 | foreach my $k1 (sort {$a<=>$b} keys %tmp){ 56 | #print OUT1 ${$tmp{$k1}}[0],"|",${$tmp{$k1}}[1],"\t"; 57 | print OUT1 sprintf("%.4f", ${$tmp{$k1}}[0]),"|",sprintf("%.4f", ${$tmp{$k1}}[1]),"\t"; 58 | } 59 | print OUT1 "\n"; 60 | } 61 | close OUT1; 62 | 63 | open(OUT1, ">", $motif_file."_seq.txt"); 64 | foreach my $k1 (sort {$a<=>$b} keys %mmat1){ 65 | my %tmp = %{$mmat1{$k1}}; 66 | #print OUT1 $k1,"\n"; 67 | for(my $i=0; $i<=3; $i++){ 68 | foreach my $k2 (sort {$a<=>$b} keys %tmp){ 69 | 
#print OUT1 ${$tmp{$k1}}[0],"|",${$tmp{$k1}}[1],"\t"; 70 | print OUT1 sprintf("%.4f", ${$tmp{$k2}}[$i]),"\t"; 71 | } 72 | print OUT1 "\n"; 73 | } 74 | } 75 | close OUT1; 76 | 77 | open(OUT1, ">", $motif_file."_str.txt"); 78 | foreach my $k1 (sort {$a<=>$b} keys %mmat2){ 79 | my %tmp = %{$mmat2{$k1}}; 80 | #print OUT1 $k1,"\n"; 81 | for(my $i=0; $i<=1; $i++){ 82 | foreach my $k2 (sort {$a<=>$b} keys %tmp){ 83 | #print OUT1 ${$tmp{$k1}}[0],"|",${$tmp{$k1}}[1],"\t"; 84 | print OUT1 sprintf("%.4f", ${$tmp{$k2}}[$i]),"\t"; 85 | } 86 | print OUT1 "\n"; 87 | } 88 | } 89 | close OUT1; 90 | 91 | my $summary_file = $prot_cell."_summary.txt"; 92 | my $meme_file = $prot_cell."_motif_10_seq.meme"; 93 | my $seq_file = $prot_cell."_motif_10_seq.txt"; 94 | my $str_file = $prot_cell."_motif_10_str.txt"; 95 | 96 | my $tmeme_file = $prot_cell."_top10_motif_10_seq.meme"; 97 | my $tmeme_file2 = $prot_cell."_top10_motif_10_seq2.meme"; 98 | 99 | my $sum_out = $prot_cell."_motif_summary.txt"; 100 | my $seqstr_out = $prot_cell."_motif_prob.txt"; 101 | #RBFOX2_mes_summary.txt 102 | #RBFOX2_mes_motif_10_seq.txt 103 | #RBFOX2_mes_motif_10_str.txt 104 | #RBFOX2_mes_motif_10_seq.meme 105 | 106 | `head -n 240 $meme_file > $tmeme_file`; 107 | `cp $tmeme_file $tmeme_file2`; 108 | `tomtom -o $prot_cell $tmeme_file $tmeme_file2`; 109 | 110 | my $motif_similar = $prot_cell."/tomtom.txt"; 111 | my $Sinf = &read_summary($summary_file); 112 | my $Seq_inf = &read_seq_count($seq_file); 113 | my $Str_inf = &read_str_count($str_file); 114 | 115 | my %sinf = %{$Sinf}; my %seq_inf = %{$Seq_inf}; my %str_inf = %{$Str_inf}; 116 | my %finf = (); my %fsinf = (); 117 | 118 | my $Motif_com = &read_tomtom($motif_similar, $prot_cell, \%sinf); 119 | my %motif_com = %{$Motif_com}; 120 | my $num = 0; my $pnum = 0; my $unum = 0; 121 | foreach my $key (keys %sinf){ 122 | my $count = (${$sinf{$key}}[1] =~ s/U/U/g); 123 | if($count >= 4){ 124 | $unum = $unum + ${$sinf{$key}}[2]; 125 | }else{ 126 | $pnum = $pnum + ${$sinf{$key}}[2]; 127 | } 128 | $num = $num + ${$sinf{$key}}[2]; 129 | } 130 | 131 | foreach my $key ( sort{$a<=>$b} keys %motif_com){ 132 | my @sen = @{$motif_com{$key}}; 133 | #print $key,"\t",join("|", @sen),"\n"; 134 | } 135 | 136 | foreach my $key ( sort{$a<=>$b} keys %motif_com){ 137 | my @sen = @{$motif_com{$key}}; 138 | #print ">",$key,"\t",join("|",@sen),"\n"; 139 | if($#sen == -1){ 140 | $finf{$key} = $sinf{$key}; 141 | $fsinf{$key} = $seq_inf{$key}; 142 | ${$fsinf{$key}}{4} = ${$str_inf{$key}}{0}; 143 | ${$fsinf{$key}}{5} = ${$str_inf{$key}}{1}; 144 | }else{ 145 | $finf{$key} = $sinf{$key}; 146 | $fsinf{$key} = $seq_inf{$key}; 147 | ${$fsinf{$key}}{4} = ${$str_inf{$key}}{0}; 148 | ${$fsinf{$key}}{5} = ${$str_inf{$key}}{1}; 149 | for(my $i=0; $i<=$#sen; $i++){ 150 | my @sent1 = split(/\|/, $sen[$i]); 151 | my $shf = -$sent1[1]; 152 | ${$finf{$key}}[2] = ${$finf{$key}}[2] + ${$sinf{$sent1[0]}}[2]; 153 | for(my $i=0; $i<=3; $i++){ 154 | for(my $j=max(0-$shf, 0); $j<=min(9-$shf, 9); $j++){ 155 | ${${$fsinf{$key}}{$i}}[$j] = ${${$fsinf{$key}}{$i}}[$j] + ${${$seq_inf{$sent1[0]}}{$i}}[$j+$shf]; 156 | } 157 | } 158 | for(my $i=0; $i<=1; $i++){ 159 | for(my $j=max(0-$shf, 0); $j<=min(9-$shf, 9); $j++){ 160 | ${${$fsinf{$key}}{$i+4}}[$j] = ${${$fsinf{$key}}{$i+4}}[$j] + ${${$str_inf{$sent1[0]}}{$i}}[$j+$shf]; 161 | } 162 | } 163 | } 164 | } 165 | } 166 | 167 | open(OUT1, ">", $sum_out); 168 | open(OUT2, ">", $seqstr_out); 169 | print OUT1 "motif_id\tmotif_site\tmotif_weight\n"; 170 | 171 | my @labc = ("seq_A", "seq_C", "seq_G", "seq_U", 
"str_P", "str_U"); 172 | #print OUT1 $num,"\t",$unum,"\t",$unum/$num,"\t",$pnum,"\t",$pnum/$num,"\n"; 173 | foreach my $key (sort{${$finf{$b}}[2] <=> ${$finf{$a}}[2]} keys %finf){ 174 | print OUT1 ${$finf{$key}}[0],"|",${$finf{$key}}[1],"\t",${$finf{$key}}[2],"\t",${$finf{$key}}[2]/$num,"\n"; 175 | my $nsite = ${$finf{$key}}[2]; 176 | #for(my $i=0; $i<=5; $i++){ 177 | # for(my $j=0; $j<=9; $j++){ 178 | # print OUT2 ${${$fsinf{$key}}{$i}}[$j],"\t"; 179 | # } 180 | # print OUT2 "\n"; 181 | #} 182 | for(my $i=0; $i<=5; $i++){ 183 | print OUT2 $key."_".$labc[$i],"\t"; 184 | for(my $j=0; $j<=9; $j++){ 185 | print OUT2 ${${$fsinf{$key}}{$i}}[$j]/$nsite,"\t"; 186 | } 187 | print OUT2 "\n"; 188 | } 189 | } 190 | 191 | close OUT1; 192 | close OUT2; 193 | 194 | #id,int 195 | #label,int 196 | #Predictscore,float 197 | #Sequence,str,101 198 | #Icshape, float,101 199 | #Saliency,101x5(5v), 101x6(6v,7v) 200 | 201 | sub fdata_read3{ 202 | my $fdata_file = shift; 203 | my $i = 0; my $j = 0; my $r = 0; 204 | my $sen = ""; my $sen1 = ""; my $seq = ""; my $sna = ""; 205 | my @sen = (); my @sen1 = (); my @sen2 = (); 206 | my %inf = (); 207 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 208 | #11 5b724f3fcd6c7ec4fa054a09f038a70e 1.000 0.999 AUAAUUUUUUUCACUGUGCACCAGCAUCAGCAUCACUGUGUACCAGCAUCAGCAUCACUGUGUACCAGCAUCAGCAUCACUGUGUAUCAGCAUCAGCAUCACUG 0.55708003,0.0,0.0,0.0,0.0,0.5653001,0.0,0.0,0.0,0.45 0.55708003,0.0,0.0,0.0,0.0,0.5653001,0.0,0.0,0.0,0.45 209 | $sna = 0; 210 | while($sen = ){ 211 | chomp($sen); 212 | @sen1 = split(/\t/, $sen); 213 | if($sen1[2] == 1){ 214 | @sen2 = split(/\,/, $sen1[6]); 215 | my @ics = split(/\,/, $sen1[5]); 216 | my $num = 0; 217 | my @sent1 = (); 218 | my @psent = (); my @usent = (); 219 | for($i=0; $i<=100; $i++){ 220 | $num = max($sen2[$i*5], $sen2[$i*5+1], $sen2[$i*5+2], $sen2[$i*5+3]); 221 | push(@sent1, $num); 222 | push(@psent, $sen2[$i*5+4]); 223 | } 224 | my $seq_map = join("|", @sent1); 225 | $inf{$sen1[0]} = [$sen1[3], $sen1[4], $seq_map, join("|", @ics), join("|", @psent)]; 226 | } 227 | } 228 | close FILE1; 229 | return(\%inf); 230 | } 231 | 232 | sub bind_select3{ 233 | my $Inf = shift; my $len = shift; my $bind_score = shift; my $per = shift; my $protein_name = shift; 234 | my ($score1, $score2) = &max_per_seq_str($Inf, $len, $per, $bind_score); 235 | my %inf = %{$Inf}; my %site_inf = (); 236 | my $sna = ""; my $key = ""; 237 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 238 | if(${$inf{$key}}[0] < $bind_score){ 239 | last; 240 | } 241 | my @sent1 = split(/\|/, ${$inf{$key}}[2]); 242 | shift(@sent1); 243 | my @sent2 = split(/\|/, ${$inf{$key}}[4]); 244 | shift(@sent2); 245 | my $i = 0; my $j = 0; my $index = 0; my $maxn = 0; my $sum1 = 0; my $sum2 = 0; my $sta = -1; my $end = -1; my $tsum1 = 0; my $tsum2 = 0; 246 | my $samstr = &get_str(${$inf{$key}}[1], ${$inf{$key}}[3], $protein_name); 247 | $samstr =~s/\./U/g; 248 | $samstr =~s/\(/P/g; 249 | $samstr =~s/\)/P/g; 250 | for($i=0; $i<=$#sent1-$len+1; $i++){ 251 | $sum1 = 0; $sum2 = 0; 252 | for($j=0; $j<$len; $j++){ 253 | $sum1 = $sum1 + $sent1[$i+$j]; 254 | $sum2 = $sum2 + $sent2[$i+$j]; 255 | } 256 | if(($sum1 > $score1) && ($sum2 > $score2)){ 257 | if($end >= $i){ 258 | $end = $i + $len - 1; 259 | }elsif($end > 0){ 260 | my $sna = $key."_".$sta; 261 | $site_inf{$sna} = [$tsum1, $tsum2, substr(${$inf{$key}}[1], $sta, $end - $sta + 1), &subshape(${$inf{$key}}[3], $sta, $end - $sta + 1), substr($samstr, $sta, $end - $sta + 1)]; 262 | $sta = $i; 263 | $end = $i + $len - 1; 264 | 
$tsum1 = $sum1; 265 | $tsum2 = $sum2; 266 | }else{ 267 | $sta = $i; 268 | $end = $i + $len - 1; 269 | $tsum1 = $sum1; 270 | $tsum2 = $sum2; 271 | } 272 | } 273 | } 274 | if($end > 0){ 275 | my $sna = $key."_".$sta; 276 | $site_inf{$sna} = [$tsum1, $tsum2, substr(${$inf{$key}}[1], $sta, $end - $sta + 1), &subshape(${$inf{$key}}[3], $sta, $end - $sta + 1), substr($samstr, $sta, $end - $sta + 1)]; 277 | } 278 | } 279 | return(\%site_inf); 280 | } 281 | 282 | sub subshape{ 283 | my $Shape_list = shift; my $sta = shift; my $len = shift; 284 | #my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 285 | my @sent1 = split(/\|/, $Shape_list); 286 | my @sent2 = (); 287 | #shift(@sent1); 288 | my $i = 0; my $str_seq = ""; 289 | for($i=$sta; $i<=$sta+$len-1; $i++){ 290 | push(@sent2, $sent1[$i]); 291 | } 292 | $str_seq = join("|", @sent2); 293 | return ($str_seq); 294 | } 295 | 296 | sub subshape2{ 297 | my $Shape_list = shift; my $sta = shift; my $len = shift; 298 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 299 | my @sent1 = split(/\|/, $Shape_list); 300 | shift(@sent1); 301 | my $i = 0; my $str_seq = ""; 302 | for($i=$sta; $i<=$sta+$len-1; $i++){ 303 | if($sent1[$i] <= $dvalue[1]){ 304 | $str_seq = $str_seq."P"; 305 | }elsif($sent1[$i] <= $dvalue[2]){ 306 | $str_seq = $str_seq."Q"; 307 | }elsif($sent1[$i] <= $dvalue[3]){ 308 | $str_seq = $str_seq."S"; 309 | }else{ 310 | $str_seq = $str_seq."Z"; 311 | } 312 | } 313 | return ($str_seq); 314 | } 315 | 316 | sub shape2str{ 317 | my $Shape_list = shift; my $Str_list = shift; 318 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 319 | my @sent1 = split(/\|/, $Shape_list); 320 | my @sent2 = split(//, $Str_list); 321 | #shift(@sent1); 322 | my $i = 0; my $str_seq = ""; 323 | for($i=0; $i<=$#sent1; $i++){ 324 | if($sent1[$i] <= 0){ 325 | $str_seq = $str_seq.$sent2[$i]; 326 | }elsif($sent1[$i] <= $dvalue[2]){ 327 | $str_seq = $str_seq."P"; 328 | }else{ 329 | $str_seq = $str_seq."U"; 330 | } 331 | } 332 | return ($str_seq); 333 | } 334 | 335 | sub max_index{ 336 | my $list = shift; my $len = shift; 337 | my @sen1 = @{$list}; 338 | my $i = 0; my $j = 0; my $r = 0; my $index = 0; my $maxn = 0; my $sum = 0; 339 | for($i=0; $i<=$#sen1-$len+1; $i++){ 340 | $sum = 0; 341 | for($j=0; $j<$len; $j++){ 342 | $sum = $sum + $sen1[$i+$j]; 343 | } 344 | if($sum > $maxn){ 345 | $index = $i; 346 | $maxn = $sum; 347 | } 348 | } 349 | return ($index, $maxn); 350 | } 351 | 352 | sub max_per_seq{ 353 | my $Inf = shift; my $len = shift; my $per = shift; my $bind_score = shift; 354 | #my $list = shift; my $len = shift; 355 | my %inf = %{$Inf}; 356 | my $key = ""; my @total = (); my $i = 0; 357 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 358 | if(${$inf{$key}}[0] < $bind_score){ 359 | last; 360 | } 361 | my @sent1 = split(/\|/, ${$inf{$key}}[2]); 362 | shift(@sent1); 363 | for($i=0; $i<=$#sent1-$len+1; $i++){ 364 | push(@total, sum(@sent1[$i..($i+$len-1)])); 365 | } 366 | } 367 | @total = sort {$b <=> $a} @total; 368 | my $boun = $total[int(($#total + 1)*$per)-1]; 369 | return ($boun); 370 | } 371 | 372 | sub max_per_seq_str{ 373 | my $Inf = shift; my $len = shift; my $per = shift; my $bind_score = shift; 374 | #my $list = shift; my $len = shift; 375 | my %inf = %{$Inf}; 376 | my $key = ""; my @total1 = (); my @total2 = (); my $i = 0; 377 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 378 | if(${$inf{$key}}[0] < $bind_score){ 379 | last; 380 | } 381 | my @sent1 = split(/\|/, ${$inf{$key}}[2]); 382 | shift(@sent1); 383 | my @sent2 = 
split(/\|/, ${$inf{$key}}[4]); 384 | shift(@sent2); 385 | for($i=0; $i<=$#sent1-$len+1; $i++){ 386 | push(@total1, sum(@sent1[$i..($i+$len-1)])); 387 | } 388 | for($i=0; $i<=$#sent2-$len+1; $i++){ 389 | push(@total2, sum(@sent2[$i..($i+$len-1)])); 390 | } 391 | } 392 | @total1 = sort {$b <=> $a} @total1; 393 | @total2 = sort {$b <=> $a} @total2; 394 | my $boun1 = $total1[int(($#total1 + 1)*$per)-1]; 395 | my $boun2 = $total2[int(($#total2 + 1)*$per*2)-1]; 396 | return ($boun1, $boun2); 397 | } 398 | 399 | sub motif_print{ 400 | my $Motif = shift; my $out_pref = shift; my $motif_len = shift; my $pmotif_file = shift; my $ALPHA = shift; 401 | my %mot_inf = %{$Motif}; 402 | my $sen = ""; my $sen1 = ""; my $seq = ""; my $sna = ""; my $file = ""; my $ics = ""; 403 | my @sen = (); my @sen1 = (); my @sen2 = (); my @sent1 = (); my @sent2 = (); 404 | my @alpha = split(//, $ALPHA); 405 | my $sid = ""; my $key; 406 | my $i = 0; my $j = 0; my $r = 0; my $num = 0; my $tsum = 0; my $k1 = 0; 407 | 408 | #my $head = "MEME version 4.10.1 (Release date: Wed Mar 25 11:40:43 2015 +1000)\nstrands: +\n\nMOTIF "; 409 | #my $head_str = "MEME version 4.10.1 (Release date: Wed Mar 25 11:40:43 2015 +1000)\n\nALPHABET\nP\nQ\nS\nZ\nEND ALPHABET\n\nstrands: +\n\nMOTIF "; 410 | my $head = "MEME version 4.10.1 (Release date: Wed Mar 25 11:40:43 2015 +1000)\n\nALPHABET\n".$alpha[1]."\n".$alpha[2]."\n".$alpha[3]."\n".$alpha[4]."\nEND ALPHABET\n\nstrands: +\n\nMOTIF "; 411 | 412 | #open(FILE1, $pf_file)||die("open $pf_file error!\n"); 413 | open(OUT1, ">", $pmotif_file); 414 | $j = 0; 415 | foreach $key (sort{$a<=>$b} keys %mot_inf){ 416 | print OUT1 $head; 417 | print OUT1 $out_pref,$key,"\n"; 418 | my %motifi = %{$mot_inf{$key}}; 419 | @sen1 = @{$motifi{1}}; 420 | $tsum = $sen1[0] + $sen1[1] + $sen1[2] + $sen1[3]; 421 | print OUT1 "letter-probability matrix: alength= 4 w= $motif_len nsites = $tsum\n"; 422 | foreach $i (sort{$a<=>$b} keys %motifi){ 423 | @sen1 = @{$motifi{$i}}; 424 | $tsum = $sen1[0] + $sen1[1] + $sen1[2] + $sen1[3]; 425 | if($tsum > 0){ 426 | my $new_var = sprintf(" %.6f %.6f %.6f %.6f \n", $sen1[0]/$tsum, $sen1[1]/$tsum, $sen1[2]/$tsum, $sen1[3]/$tsum); 427 | print OUT1 $new_var; 428 | }else{ 429 | my $new_var = sprintf(" %.6f %.6f %.6f %.6f \n", 0.25, 0.25, 0.25, 0.25); 430 | print OUT1 $new_var; 431 | } 432 | } 433 | print OUT1 "\n"; 434 | } 435 | close OUT1; 436 | } 437 | 438 | sub kmer_cal2{ 439 | my $Inf = shift; my $len = shift; 440 | my %inf = %{$Inf}; my %kmer_seq = (); my %kmer_loc = (); 441 | my @sent = (); 442 | my $i = 0; my $key = ""; my $seq = ""; my $subseq = ""; my $sta = 0; my $end = 0; my $sna = ""; 443 | my $stru = ""; my $substru = ""; my $kmer_name = ""; 444 | foreach $key (sort {${$inf{$b}}[0] <=> ${$inf{$a}}[0]} keys %inf){ 445 | @sent = split(/\_/, $key); 446 | $seq = ${$inf{$key}}[2]; 447 | #$stru = ${$inf{$key}}[4]; 448 | $stru = &shape2str(${$inf{$key}}[3], ${$inf{$key}}[4]); 449 | for($i=0; $i<=length($seq)-$len; $i++){ 450 | $subseq = substr($seq, $i, $len); 451 | $substru = substr($stru, $i, $len); 452 | $sta = $sent[1] + $i; $end = $sta + $len - 1; $sna = $sent[0]."_".$sta."_".$end; 453 | $kmer_name = $subseq."|".$substru; 454 | if(exists $kmer_seq{$kmer_name}){ 455 | $kmer_seq{$kmer_name} = $kmer_seq{$kmer_name} + 1; 456 | push(@{$kmer_loc{$kmer_name}}, $sna); 457 | }else{ 458 | $kmer_seq{$kmer_name} = 1; 459 | $kmer_loc{$kmer_name} = [$sna]; 460 | } 461 | } 462 | } 463 | return (\%kmer_seq, \%kmer_loc); 464 | } 465 | 466 | sub combine_kmer{ 467 | my $Inf = shift; my $ALPHA 
= shift; my $Kmer_loc = shift; my $Data_Inf = shift; my $protein_name = shift; 468 | my %inf = %{$Inf}; my %cinf = (); my %mot_inf = (); my %mot_str_inf = (); my %cinf_con = (); 469 | #my %kmer_loc = %{$Kmer_loc}; my %data_inf = %{$Data_Inf}; 470 | my $key = ""; my $k1 = ""; my $r = 0; my $flag = 0; my $exkey = ""; 471 | my $kmer_sht = sum(values %inf)*0.2; 472 | my $kmer_sh = 0; my $kmer_sum = 0; 473 | foreach $key ( sort{$inf{$b} <=> $inf{$a}} keys %inf){ 474 | $kmer_sum = $kmer_sum + $inf{$key}; 475 | if($kmer_sum > $kmer_sht){ 476 | $kmer_sh = $inf{$key}; 477 | last; 478 | } 479 | } 480 | if(max(values %inf) <= 5){ 481 | $kmer_sh = 0; 482 | } 483 | print $kmer_sh,"\n"; 484 | #my $kmer_sh = 0; 485 | open(OUT, ">", $protein_name."_summary.txt"); 486 | open(OUT2, ">", $protein_name."_summary2.txt"); 487 | print OUT $kmer_sh,"\n"; 488 | foreach $key ( sort{$inf{$b} <=> $inf{$a}} keys %inf){ 489 | if($inf{$key} <= $kmer_sh){ 490 | last; 491 | } 492 | $flag = 0; 493 | foreach $k1 (sort{${$cinf{$b}}[0] <=> ${$cinf{$a}}[0]} keys %cinf){ 494 | ($exkey, $flag) = &tcluster($k1, $key); 495 | if($flag == 1){ 496 | ${$cinf{$k1}}[0] = ${$cinf{$k1}}[0] + $inf{$key}; 497 | push(@{$cinf{$k1}}, $exkey, $inf{$key}); 498 | push(@{$cinf_con{$k1}}, $key); 499 | last; 500 | } 501 | } 502 | if($flag == 0){ 503 | $cinf{$key} = [$inf{$key}, $inf{$key}]; 504 | $cinf_con{$key} = [$key]; 505 | } 506 | } 507 | $r = 0; 508 | foreach $key ( sort {${$cinf{$b}}[0] <=> ${$cinf{$a}}[0]} keys %cinf){ 509 | $r = $r + 1; 510 | my ($mot1, $mot2, $num1) = &build_motif2($key, $cinf{$key}, $ALPHA, "-PU"); 511 | print OUT $key,"\t",$num1,"\n"; 512 | print OUT2 $key,"\t",$num1,"\n"; 513 | for(my $i=0; $i<=$#{$cinf{$key}}; $i++){ 514 | print OUT2 ${$cinf{$key}}[$i],"\t"; 515 | } 516 | print OUT2 "\n"; 517 | $mot_inf{$r} = $mot1; 518 | $mot_str_inf{$r} = $mot2; 519 | } 520 | close OUT; 521 | close OUT2; 522 | return (\%mot_inf, \%mot_str_inf); 523 | } 524 | 525 | sub icSHAPE_str2{ 526 | my $Shape_value = shift; 527 | #my $Shape_list = shift; my $sta = shift; my $len = shift; 528 | my $str_seq = ""; my $str_pro = ""; 529 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 530 | if($Shape_value < $dvalue[0]){ 531 | $str_seq = "N"; 532 | $str_pro = 0; 533 | }elsif($Shape_value <= $dvalue[1]){ 534 | $str_seq = "P"; 535 | $str_pro = 1 - ($Shape_value - $dvalue[0])/($dvalue[1] - $dvalue[0])*0.8; 536 | }elsif($Shape_value <= $dvalue[2]){ 537 | $str_seq = "P"; 538 | $str_pro = 0.2 - ($Shape_value - $dvalue[1])/($dvalue[2] - $dvalue[1])*0.2; 539 | }elsif($Shape_value <= $dvalue[3]){ 540 | $str_seq = "U"; 541 | $str_pro = ($Shape_value - $dvalue[2])/($dvalue[3] - $dvalue[2])*0.2; 542 | }else{ 543 | $str_seq = "U"; 544 | $str_pro = 0.2 + ($Shape_value - $dvalue[3])/($dvalue[4] - $dvalue[3])*0.8; 545 | } 546 | return ($str_seq, $str_pro); 547 | } 548 | 549 | sub icSHAPE_str{ 550 | my $Shape_value = shift; 551 | #my $Shape_list = shift; my $sta = shift; my $len = shift; 552 | my $str_seq = ""; my $str_pro = ""; 553 | my @dvalue = (0.0, 0.088, 0.233, 0.484, 1.0); 554 | if($Shape_value < $dvalue[0]){ 555 | $str_seq = "N"; 556 | $str_pro = 0; 557 | }elsif($Shape_value <= $dvalue[1]){ 558 | $str_seq = "P"; 559 | $str_pro = 1 - ($Shape_value - $dvalue[0])/($dvalue[1] - $dvalue[0])*0.5; 560 | }elsif($Shape_value <= $dvalue[2]){ 561 | $str_seq = "P"; 562 | $str_pro = 0.5 - ($Shape_value - $dvalue[1])/($dvalue[2] - $dvalue[1])*0.5; 563 | }elsif($Shape_value <= $dvalue[3]){ 564 | $str_seq = "U"; 565 | $str_pro = ($Shape_value - $dvalue[2])/($dvalue[3] - 
$dvalue[2])*0.5; 566 | }else{ 567 | $str_seq = "U"; 568 | $str_pro = 0.5 + ($Shape_value - $dvalue[3])/($dvalue[4] - $dvalue[3])*0.5; 569 | } 570 | return ($str_seq, $str_pro); 571 | } 572 | 573 | sub fivechar{ 574 | my $char1 = shift; my $char2 = shift; 575 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; my $mismatch = 0; my $flag = 0; 576 | if(substr($char1, 0, 4) eq substr($char2, 1, 4)){ 577 | return ("-".$char2."---", 1); 578 | }elsif(substr($char1, 1, 4) eq substr($char2, 0, 4)){ 579 | return ("---".$char2."-", 1); 580 | } 581 | for($i=0; $i<=$#sen1; $i++){ 582 | if($sen1[$i] eq $sen2[$i]){ 583 | $r = $r + 1; 584 | } 585 | } 586 | $mismatch = length($char1) - $r; 587 | if($mismatch == 1){ 588 | return ("--".$char2."--", 1); 589 | } 590 | if(substr($char1, 0, 3) eq substr($char2, 2, 3)){ 591 | return ($char2."----", 1); 592 | }elsif(substr($char1, 2, 3) eq substr($char2, 0, 3)){ 593 | return ("----".$char2, 1); 594 | } 595 | for($i=0; $i<=$#sen1-1; $i++){ 596 | if($sen1[$i] eq $sen2[$i+1]){ 597 | $r = $r + 1; 598 | } 599 | } 600 | $mismatch = length($char1) - 1 - $r; 601 | if($mismatch == 1){ 602 | return ("-".$char2."---", 1); 603 | } 604 | for($i=1; $i<=$#sen1; $i++){ 605 | if($sen1[$i] eq $sen2[$i-1]){ 606 | $r = $r + 1; 607 | } 608 | } 609 | $mismatch = length($char1) - 1 - $r; 610 | if($mismatch == 1){ 611 | return ("---".$char2."-", 1); 612 | } 613 | return($char2, 0); 614 | } 615 | 616 | sub tcluster{ 617 | my $char1 = shift; my $char2 = shift; 618 | my @sen1 = split(/\|/, $char1); my @sen2 = split(/\|/, $char2); 619 | my @sent1 = split(//, $sen1[1]); 620 | my $r = 0; my $i = 0; 621 | for($i=0; $i<=$#sent1; $i++){ 622 | if($sent1[$i] eq "P"){ 623 | $r = $r + 1; 624 | } 625 | } 626 | #$r = $r/($#sent1 + 1); 627 | my ($ch1, $flag1) = &clusterchar1($sen1[0], $sen2[0]); 628 | my ($ch2, $flag2) = &clusterchar1($sen1[1], $sen2[1]); 629 | if(($flag1 == 1)&&($flag2 == 1)){ 630 | return($ch1."|".$ch2, 1); 631 | } 632 | if(($flag1 == 0)&&($flag2 == 1)&&($r > 4)){ 633 | my ($ch01, $flag01) = &clusterchar2($sen1[0], $sen2[0]); 634 | if($flag01 == 1){ 635 | return($ch01."|".$ch2, 1); 636 | } 637 | if(&mismatch($sen1[0], $sen2[0]) <= $r - 3){ 638 | return("--".$sen2[0]."--|".$ch2, 1); 639 | } 640 | } 641 | return($char2, 0); 642 | } 643 | 644 | sub tcluster2{ 645 | my $char1 = shift; my $char2 = shift; 646 | my @sen1 = split(/\|/, $char1); my @sen2 = split(/\|/, $char2); 647 | my @sent1 = split(//, $sen1[1]); 648 | my $r = 0; my $i = 0; 649 | for($i=0; $i<=$#sent1; $i++){ 650 | if($sent1[$i] eq "P"){ 651 | $r = $r + 1; 652 | } 653 | } 654 | $r = $r/($#sent1 + 1); 655 | my ($ch1, $flag1) = &clusterchar1($sen1[0], $sen2[0]); 656 | my ($ch2, $flag2) = &clusterchar1($sen1[1], $sen2[1]); 657 | if(($flag1 == 1)&&($flag2 == 1)){ 658 | return($ch1."|".$ch2, 1); 659 | }elsif(($flag1 == 1)&&($flag2 == 0)){ 660 | my ($ch02, $flag02) = &clusterchar2($sen1[1], $sen2[1]); 661 | if($flag02 == 1){ 662 | return($ch1."|".$ch02, 1); 663 | } 664 | }elsif(($flag1 == 0)&&($flag2 == 1)){ 665 | my ($ch01, $flag01) = &clusterchar2($sen1[0], $sen2[0]); 666 | if($flag01 == 1){ 667 | return($ch01."|".$ch2, 1); 668 | } 669 | } 670 | return($char2, 0); 671 | } 672 | 673 | sub clusterchar1{ 674 | my $char1 = shift; my $char2 = shift; 675 | my $tnum = length($char1); 676 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; my $mismatch = 0; my $flag = 0; 677 | if($char1 eq $char2){ 678 | return ("--".$char2."--", 1); 679 | }elsif(substr($char1, 0, $tnum-1) 
eq substr($char2, 1, $tnum-1)){ 680 | return ("-".$char2."---", 1); 681 | }elsif(substr($char1, 1, $tnum-1) eq substr($char2, 0, $tnum-1)){ 682 | return ("---".$char2."-", 1); 683 | } 684 | $r = 0; 685 | for($i=0; $i<=$#sen1; $i++){ 686 | if($sen1[$i] eq $sen2[$i]){ 687 | $r = $r + 1; 688 | } 689 | } 690 | $mismatch = length($char1) - $r; 691 | if($mismatch <= 1){ 692 | return ("--".$char2."--", 1); 693 | } 694 | return($char2, 0); 695 | } 696 | 697 | sub clusterchar2{ 698 | my $char1 = shift; my $char2 = shift; 699 | my $tnum = length($char1); 700 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; my $mismatch = 0; my $flag = 0; 701 | if($char1 eq $char2){ 702 | return ("--".$char2."--", 1); 703 | }elsif(substr($char1, 0, $tnum-1) eq substr($char2, 1, $tnum-1)){ 704 | return ("-".$char2."---", 1); 705 | }elsif(substr($char1, 1, $tnum-1) eq substr($char2, 0, $tnum-1)){ 706 | return ("---".$char2."-", 1); 707 | } 708 | $r = 0; 709 | for($i=0; $i<=$#sen1; $i++){ 710 | if($sen1[$i] eq $sen2[$i]){ 711 | $r = $r + 1; 712 | } 713 | } 714 | $mismatch = length($char1) - $r; 715 | if($mismatch <= 1){ 716 | return ("--".$char2."--", 1); 717 | } 718 | if(substr($char1, 0, $tnum-2) eq substr($char2, 2, $tnum-2)){ 719 | return ($char2."----", 1); 720 | }elsif(substr($char1, 2, $tnum-2) eq substr($char2, 0, $tnum-2)){ 721 | return ("----".$char2, 1); 722 | } 723 | $r = 0; 724 | for($i=0; $i<=$#sen1-1; $i++){ 725 | if($sen1[$i] eq $sen2[$i+1]){ 726 | $r = $r + 1; 727 | } 728 | } 729 | $mismatch = length($char1) - 1 - $r; 730 | if($mismatch <= 1){ 731 | return ("-".$char2."---", 1); 732 | } 733 | $r = 0; 734 | for($i=1; $i<=$#sen1; $i++){ 735 | if($sen1[$i] eq $sen2[$i-1]){ 736 | $r = $r + 1; 737 | } 738 | } 739 | $mismatch = length($char1) - 1 - $r; 740 | if($mismatch <= 1){ 741 | return ("---".$char2."-", 1); 742 | } 743 | return($char2, 0); 744 | } 745 | 746 | sub build_motif{ 747 | my $rep_seq = shift; my $cont = shift; my $ALPHA = shift; 748 | $rep_seq = "--".$rep_seq."--"; 749 | my @cons = @{$cont}; my @sen1 = split(//, $rep_seq); my @sen2 = split(//, $ALPHA); 750 | my %minf = (); 751 | my $i=0; my $j=0; my $tnum = $cons[1]; 752 | for($j=0; $j<=$#sen1; $j++){ 753 | if($sen1[$j] eq $sen2[0]){ 754 | $minf{$j} = [$tnum*0.25,$tnum*0.25,$tnum*0.25,$tnum*0.25]; 755 | }elsif($sen1[$j] eq $sen2[1]){ 756 | $minf{$j} = [$tnum,0,0,0]; 757 | }elsif($sen1[$j] eq $sen2[2]){ 758 | $minf{$j} = [0,$tnum,0,0]; 759 | }elsif($sen1[$j] eq $sen2[3]){ 760 | $minf{$j} = [0,0,$tnum,0]; 761 | }else{ 762 | $minf{$j} = [0,0,0,$tnum]; 763 | } 764 | } 765 | for($i=2; $i<=$#cons; $i=$i+2){ 766 | @sen1 = split(//, $cons[$i]); 767 | $tnum = $cons[$i+1]; 768 | for($j=0; $j<=$#sen1; $j++){ 769 | if($sen1[$j] eq $sen2[0]){ 770 | ${$minf{$j}}[0] = ${$minf{$j}}[0] + $tnum*0.25; 771 | ${$minf{$j}}[1] = ${$minf{$j}}[1] + $tnum*0.25; 772 | ${$minf{$j}}[2] = ${$minf{$j}}[2] + $tnum*0.25; 773 | ${$minf{$j}}[3] = ${$minf{$j}}[3] + $tnum*0.25; 774 | }elsif($sen1[$j] eq $sen2[1]){ 775 | ${$minf{$j}}[0] = ${$minf{$j}}[0] + $tnum; 776 | }elsif($sen1[$j] eq $sen2[2]){ 777 | ${$minf{$j}}[1] = ${$minf{$j}}[1] + $tnum; 778 | }elsif($sen1[$j] eq $sen2[3]){ 779 | ${$minf{$j}}[2] = ${$minf{$j}}[2] + $tnum; 780 | }else{ 781 | ${$minf{$j}}[3] = ${$minf{$j}}[3] + $tnum; 782 | } 783 | } 784 | } 785 | $tnum = $cons[0]; 786 | return (\%minf, $tnum); 787 | } 788 | 789 | sub build_motif2{ 790 | my $rep_seq = shift; my $cont = shift; my $ALPHA1 = shift; my $ALPHA2 = shift; 791 | #$rep_seq = "--".$rep_seq."--"; 792 | my @Sen = 
split(/\|/, $rep_seq); 793 | my @sent1 = split(//, "--".$Sen[0]."--"); my @sent2 = split(//, "--".$Sen[1]."--"); 794 | my @cons = @{$cont}; my @sen1 = split(//, $ALPHA1); my @sen2 = split(//, $ALPHA2); 795 | my %minf1 = (); my %minf2 = (); 796 | my $i=0; my $j=0; my $tnum = $cons[1]; 797 | for($j=0; $j<=$#sent1; $j++){ 798 | if($sent1[$j] eq $sen1[0]){ 799 | $minf1{$j} = [$tnum*0.25,$tnum*0.25,$tnum*0.25,$tnum*0.25]; 800 | }elsif($sent1[$j] eq $sen1[1]){ 801 | $minf1{$j} = [$tnum,0,0,0]; 802 | }elsif($sent1[$j] eq $sen1[2]){ 803 | $minf1{$j} = [0,$tnum,0,0]; 804 | }elsif($sent1[$j] eq $sen1[3]){ 805 | $minf1{$j} = [0,0,$tnum,0]; 806 | }else{ 807 | $minf1{$j} = [0,0,0,$tnum]; 808 | } 809 | } 810 | for($j=0; $j<=$#sent2; $j++){ 811 | if($sent2[$j] eq $sen2[0]){ 812 | $minf2{$j} = [$tnum*0.5,$tnum*0.5]; 813 | }elsif($sent2[$j] eq $sen2[1]){ 814 | $minf2{$j} = [$tnum,0]; 815 | }else{ 816 | $minf2{$j} = [0,$tnum]; 817 | } 818 | } 819 | for($i=2; $i<=$#cons; $i=$i+2){ 820 | @Sen = split(/\|/, $cons[$i]); 821 | @sent1 = split(//, $Sen[0]); @sent2 = split(//, $Sen[1]); $tnum = $cons[$i+1]; 822 | for($j=0; $j<=$#sent1; $j++){ 823 | if($sent1[$j] eq $sen1[0]){ 824 | ${$minf1{$j}}[0] = ${$minf1{$j}}[0] + $tnum*0.25; 825 | ${$minf1{$j}}[1] = ${$minf1{$j}}[1] + $tnum*0.25; 826 | ${$minf1{$j}}[2] = ${$minf1{$j}}[2] + $tnum*0.25; 827 | ${$minf1{$j}}[3] = ${$minf1{$j}}[3] + $tnum*0.25; 828 | }elsif($sent1[$j] eq $sen1[1]){ 829 | ${$minf1{$j}}[0] = ${$minf1{$j}}[0] + $tnum; 830 | }elsif($sent1[$j] eq $sen1[2]){ 831 | ${$minf1{$j}}[1] = ${$minf1{$j}}[1] + $tnum; 832 | }elsif($sent1[$j] eq $sen1[3]){ 833 | ${$minf1{$j}}[2] = ${$minf1{$j}}[2] + $tnum; 834 | }else{ 835 | ${$minf1{$j}}[3] = ${$minf1{$j}}[3] + $tnum; 836 | } 837 | } 838 | for($j=0; $j<=$#sent2; $j++){ 839 | if($sent2[$j] eq $sen2[0]){ 840 | ${$minf2{$j}}[0] = ${$minf2{$j}}[0] + $tnum*0.5; 841 | ${$minf2{$j}}[1] = ${$minf2{$j}}[1] + $tnum*0.5; 842 | }elsif($sent2[$j] eq $sen2[1]){ 843 | ${$minf2{$j}}[0] = ${$minf2{$j}}[0] + $tnum; 844 | }else{ 845 | ${$minf2{$j}}[1] = ${$minf2{$j}}[1] + $tnum; 846 | } 847 | } 848 | } 849 | $tnum = $cons[0]; 850 | return (\%minf1, \%minf2, $tnum); 851 | } 852 | 853 | sub kmer_cal3{ 854 | my $Inf = shift; 855 | my %inf = %{$Inf}; my %kmer_seq = (); my %kmer_loc = (); 856 | my @sent = (); my @psign1 = (0)x(16); my @usign1 = (0)x(16); my @sen1 = (); my @sen2 = (); my @sen3 = (); 857 | my $i = 0; my $j = 0; my $r = 0; 858 | my $key = ""; my $seq = ""; my $subseq = ""; my $sta = 0; my $end = 0; my $sna = ""; my $Num = 0; 859 | foreach $key (keys %inf){ 860 | $seq = ${$inf{$key}}[1]; 861 | $subseq = substr($seq, 5, 6); 862 | if(exists $kmer_seq{$subseq}){ 863 | $kmer_seq{$subseq} = $kmer_seq{$subseq} + 1; 864 | push(@{$kmer_loc{$subseq}}, $seq); 865 | }else{ 866 | $kmer_seq{$subseq} = 1; 867 | $kmer_loc{$subseq} = [$seq]; 868 | } 869 | } 870 | foreach $key ( sort{$kmer_seq{$b} <=> $kmer_seq{$a}} keys %kmer_seq){ 871 | push(@sent, $key); 872 | } 873 | #print $#sent+1,"\n"; 874 | $key = $sent[0]; 875 | @sen1 = @{$kmer_loc{$key}}; 876 | #print $#sen1+1,"\n"; 877 | for($i=0; $i<=$#sen1; $i++){ 878 | $Num = $Num + 1; 879 | @sen2 = split(//, $sen1[$i]); 880 | for($j=0; $j<=$#sen2; $j++){ 881 | if($sen2[$j] eq "P"){ 882 | $psign1[$j] = $psign1[$j] + 1; 883 | }else{ 884 | $usign1[$j] = $usign1[$j] + 1; 885 | } 886 | } 887 | } 888 | for($r=1; $r<=$#sent; $r++){ 889 | $sna = $sent[$r]; 890 | #print $key,"\t",$sna,"\n"; 891 | my ($sna2, $flag) = &clusterchar1($key, $sna); 892 | if($flag == 1){ 893 | @sen3 = split(//, $sna2); 894 | 
$sta = 0; 895 | for($i=0; $i<=3; $i++){ 896 | if($sen3[$i] eq "-"){ 897 | $sta = $sta + 1; 898 | } 899 | } 900 | @sen1 = @{$kmer_loc{$sna}}; 901 | #print $#sen1+1,"\n"; 902 | $sta = $sta - 2; 903 | for($i=0; $i<=$#sen1; $i++){ 904 | $Num = $Num + 1; 905 | @sen2 = split(//, $sen1[$i]); 906 | for($j=0; $j<=$#sen2; $j++){ 907 | #$j = $j + $sta; 908 | if((0<=$j + $sta)&&($j + $sta<=$#psign1)){ 909 | if($sen2[$j + $sta] eq "P"){ 910 | $psign1[$j] = $psign1[$j] + 1; 911 | }else{ 912 | $usign1[$j] = $usign1[$j] + 1; 913 | } 914 | } 915 | } 916 | } 917 | } 918 | } 919 | return (\@psign1, \@usign1, $Num); 920 | } 921 | 922 | sub build_str_motif{ 923 | my $rep_seq = shift; my $cont = shift; my $Kmer_loc = shift; my $Data_Inf = shift; my $protein_name = shift; 924 | my $flank_len = 5; 925 | my %kmer_loc = %{$Kmer_loc}; my %data_inf = %{$Data_Inf}; 926 | my %minf1 = (); my %minf2 = (); my %str_inf = (); 927 | my @cons = @{$cont}; my @sen1 = (); my @sen2 = (); 928 | my $i = 0; my $j = 0; my $r = 0; my $k = 0; my $num = 0; my $num1 = 0; my $num2 = 0; 929 | my @psign1 = (0)x(length($rep_seq) + 2*$flank_len); my @usign1 = (0)x(length($rep_seq) + 2*$flank_len); 930 | my @psign2 = (0)x(length($rep_seq) + 2*$flank_len); my @usign2 = (0)x(length($rep_seq) + 2*$flank_len); 931 | #open(OUT1, ">", "strtmp/".$rep_seq."str_test.txt"); 932 | for($i=0; $i<=$#cons; $i++){ 933 | #print $cons[$i],"|\t"; 934 | @sen1 = @{$kmer_loc{$cons[$i]}}; 935 | for($j=0; $j<=$#sen1; $j++){ 936 | #print $sen1[$j],"\t"; 937 | $num = $num + 1; 938 | #print $num,"\t"; 939 | @sen2 = split(/\_/, $sen1[$j]); 940 | if(!exists $data_inf{$sen2[0]}){ 941 | print $cons[$i],"|",$sen2[0],"\n"; 942 | next; 943 | } 944 | my @ics = split(/\|/, ${$data_inf{$sen2[0]}}[3]); 945 | my @psent = split(/\|/, ${$data_inf{$sen2[0]}}[4]); 946 | my $max_sign = max(@psent); my $min_sign = min(@psent); 947 | my $ave_ics = 0; my $n_ics = 0; 948 | 949 | my $sta = $sen2[1]; my $end = $sen2[2]; my $key = $sen2[0]; 950 | my $str_char = &get_str(${$data_inf{$key}}[1], ${$data_inf{$key}}[3], $protein_name); 951 | my $str_char1 = substr($str_char, $sta - 5, $end - $sta + 1 + 10); 952 | 953 | my $flag = 0; my $fnum = ""; 954 | if(length($str_char1) == 16){ 955 | my @str_sent = split(//, $str_char1); 956 | for($r=0; $r<=$#str_sent; $r++){ 957 | if($str_sent[$r] eq "."){ 958 | $fnum = $fnum."U"; 959 | }else{ 960 | $fnum = $fnum."P"; 961 | } 962 | } 963 | $str_inf{$num} = [$str_char1, $fnum]; 964 | } 965 | } 966 | #last; 967 | } 968 | #close OUT1; 969 | #print "strcuture site: ",$num,"\n"; 970 | #$k = keys %str_inf; 971 | #print $k,"\n"; 972 | my ($Psign1, $Usign1, $NUM) = kmer_cal2(\%str_inf); 973 | @psign1 = @{$Psign1}; @usign1 = @{$Usign1}; 974 | for($i=0; $i<=$#psign1; $i++){ 975 | #$minf{$i} = [$psign[$i], $usign[$i]]; 976 | if($psign1[$i] + $usign1[$i] != 0){ 977 | #my @prob = ($psign[$i]/($psign[$i] + $usign[$i]), $usign[$i]/($psign[$i] + $usign[$i])); 978 | #my $Height = &cal_entropy(\@prob); 979 | $minf1{$i} = [$psign1[$i]/($psign1[$i] + $usign1[$i]), $usign1[$i]/($psign1[$i] + $usign1[$i])]; 980 | #$minf{$i} = [$psign[$i]/($psign[$i] + $usign[$i])*$Height, $usign[$i]/($psign[$i] + $usign[$i])*$Height]; 981 | }else{ 982 | $minf1{$i} = [0.5, 0.5]; 983 | #$minf{$i} = [0, 0]; 984 | } 985 | } 986 | for($i=0; $i<=$#psign2; $i++){ 987 | #$minf{$i} = [$psign[$i], $usign[$i]]; 988 | if($psign2[$i] + $usign2[$i] != 0){ 989 | #my @prob = ($psign[$i]/($psign[$i] + $usign[$i]), $usign[$i]/($psign[$i] + $usign[$i])); 990 | #my $Height = &cal_entropy(\@prob); 991 | $minf2{$i} = 
[$psign2[$i]/($psign2[$i] + $usign2[$i]), $usign2[$i]/($psign2[$i] + $usign2[$i])]; 992 | #$minf{$i} = [$psign[$i]/($psign[$i] + $usign[$i])*$Height, $usign[$i]/($psign[$i] + $usign[$i])*$Height]; 993 | }else{ 994 | $minf2{$i} = [0.5, 0.5]; 995 | #$minf{$i} = [0, 0]; 996 | } 997 | } 998 | return (\%minf1, \%minf2, $num1, $NUM); 999 | } 1000 | 1001 | sub cal_entropy{ 1002 | my $sen = shift; 1003 | my @sent = @{$sen}; 1004 | my $i = 0; my $Sum = sum(@sent); my $Entropy = 0; 1005 | for($i=0; $i<=$#sent; $i++){ 1006 | $sent[$i] = $sent[$i]/$Sum; 1007 | if($sent[$i] > 0){ 1008 | $Entropy = $Entropy - $sent[$i]*log($sent[$i])/log(2); 1009 | } 1010 | } 1011 | return(1 - $Entropy); 1012 | } 1013 | 1014 | sub get_str{ 1015 | my $seqref = shift; my $ics = shift; my $protien = shift; 1016 | my $i; my $j; my $r; 1017 | my @sen1 = split(//, $seqref); my @sen2 = split(/\|/, $ics); 1018 | my $tmp_seq_file = $protien."_tmp_seq_file.txt"; my $tmp_shape_file = $protien."_tmp_shape_file.txt"; 1019 | open(SEQ, ">", $tmp_seq_file); 1020 | open(SHAPE, ">", $tmp_shape_file); 1021 | for($i=0; $i<=$#sen1; $i++){ 1022 | print SEQ $sen1[$i]; 1023 | } 1024 | print SEQ "\n"; 1025 | close SEQ; 1026 | $j = 1; 1027 | for($i = 0; $i<=$#sen2; $i++){ 1028 | $j = $i + 1; 1029 | if($sen2[$i] < 0){ 1030 | print SHAPE $j,"\t-1\n"; 1031 | }else{ 1032 | print SHAPE $j,"\t",$sen2[$i]*2,"\n"; 1033 | } 1034 | } 1035 | close SHAPE; 1036 | my $str_res = `RNAfold --noPS --shapeMethod="Dm8b−0.7" --shape=$tmp_shape_file < $tmp_seq_file`; 1037 | my @sent1 = split(/\n/,$str_res); 1038 | my $exa_seq = $sent1[0]; 1039 | my @sent2 = split(/\s/,$sent1[1]); 1040 | my $exa_str = $sent2[0]; 1041 | #my @sent3 = split(/\|/, $sna); 1042 | #$inf_str{$sna} = [$sent3[3], $exa_seq, $exa_str]; 1043 | return($exa_str); 1044 | } 1045 | 1046 | sub get_str2{ 1047 | my $seqref = shift; my $ics = shift; my $protien = shift; 1048 | my %inf_seq = %{$seqref}; my %inf_ics = %{$ics}; 1049 | my %inf_str = (); 1050 | my $sen; my $sna; my $sen1; my $seq; 1051 | my $i; my $j; my $r; my $count = 0; 1052 | my @sen = (); my @sen1 = (); my @sen2 = (); my @sen3 = (); 1053 | foreach $sna (keys %inf_seq){ 1054 | @sen1 = @{$inf_seq{$sna}}; 1055 | @sen2 = @{$inf_ics{$sna}}; 1056 | my $tmp_seq_file = $protien."_tmp_seq_file.txt"; my $tmp_shape_file = $protien."_tmp_shape_file.txt"; 1057 | open(SEQ, ">", $tmp_seq_file); 1058 | open(SHAPE, ">", $tmp_shape_file); 1059 | for($i=0; $i<=$#sen1; $i++){ 1060 | print SEQ $sen1[$i]; 1061 | } 1062 | print SEQ "\n"; 1063 | close SEQ; 1064 | $j = 1; 1065 | for($i = 0; $i<=$#sen2; $i++){ 1066 | if($sen2[$i] eq "NULL"){ 1067 | print SHAPE $j,"\t-1\n"; 1068 | }else{ 1069 | print SHAPE $j,"\t",$sen2[$i]*2,"\n"; 1070 | } 1071 | } 1072 | close SHAPE; 1073 | #my $str_res = `/Share2/home/zhangqf/usr/ViennaRNA-2.2.3/bin/RNAfold --noPS −−shapeMethod="Dm8b−0.7" --shape=tmp_shape_file.txt < tmp_seq_file.txt`; 1074 | my $str_res = `RNAfold --noPS --shapeMethod="Dm8b−0.7" --shape=$tmp_shape_file < $tmp_seq_file`; 1075 | my @sent1 = split(/\n/,$str_res); 1076 | my $exa_seq = $sent1[0]; 1077 | my @sent2 = split(/\s/,$sent1[1]); 1078 | my $exa_str = $sent2[0]; 1079 | my @sent3 = split(/\|/, $sna); 1080 | $inf_str{$sna} = [$sent3[3], $exa_seq, $exa_str]; 1081 | } 1082 | return (\%inf_str); 1083 | } 1084 | 1085 | sub mismatch{ 1086 | my $char1 = shift; my $char2 = shift; 1087 | my @sen1 = split(//, $char1); my @sen2 = split(//, $char2); my $i = 0; my $r = 0; 1088 | for($i=0; $i<=$#sen1; $i++){ 1089 | if($sen1[$i] ne $sen2[$i]){ 1090 | $r = $r + 1; 1091 | } 
1092 | } 1093 | return($r); 1094 | } 1095 | 1096 | sub read_summary{ 1097 | my $fdata_file = shift; 1098 | my $i = 0; my $j = 0; my $r = 0; 1099 | my $sen = ""; 1100 | my @sen1 = (); my @sen2 = (); 1101 | my %inf = (); 1102 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1103 | #2 1104 | #UGCAUG|UUUUUU 157 1105 | #UGCAUG|PPPPPP 119 1106 | $sen = ; 1107 | while($sen = ){ 1108 | $i = $i + 1; 1109 | chomp($sen); 1110 | @sen1 = split(/\t/, $sen); 1111 | @sen2 = split(/\|/, $sen1[0]); 1112 | $inf{$i} = [$sen2[0], $sen2[1], $sen1[1]]; 1113 | if($i >= 10){ 1114 | last; 1115 | } 1116 | } 1117 | close FILE1; 1118 | return(\%inf); 1119 | } 1120 | 1121 | sub read_seq_count{ 1122 | my $fdata_file = shift; 1123 | my $i = 0; my $j = 0; my $r = 0; 1124 | my $sen = ""; 1125 | my @sen1 = (); my @sen2 = (); 1126 | my %inf = (); 1127 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1128 | #39.2500 31.0000 35.0000 0.0000 0.0000 157.0000 0.0000 8.2500 33.2500 39.2500 1129 | #39.2500 45.0000 15.0000 0.0000 157.0000 0.0000 0.0000 8.2500 42.2500 39.2500 1130 | while($sen = ){ 1131 | $i = $i + 1; 1132 | chomp($sen); 1133 | @sen1 = split(/\t/, $sen); 1134 | ${$inf{$i}}{0} = [@sen1[0..9]]; 1135 | for($j=1; $j<=3; $j++){ 1136 | $sen = ; 1137 | chomp($sen); 1138 | @sen1 = split(/\t/, $sen); 1139 | ${$inf{$i}}{$j} = [@sen1[0..9]]; 1140 | } 1141 | if($i >= 10){ 1142 | last; 1143 | } 1144 | } 1145 | close FILE1; 1146 | return(\%inf); 1147 | } 1148 | 1149 | sub read_str_count{ 1150 | my $fdata_file = shift; 1151 | my $i = 0; my $j = 0; my $r = 0; 1152 | my $sen = ""; 1153 | my @sen1 = (); my @sen2 = (); 1154 | my %inf = (); 1155 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1156 | #39.2500 31.0000 35.0000 0.0000 0.0000 157.0000 0.0000 8.2500 33.2500 39.2500 1157 | #39.2500 45.0000 15.0000 0.0000 157.0000 0.0000 0.0000 8.2500 42.2500 39.2500 1158 | while($sen = ){ 1159 | $i = $i + 1; 1160 | chomp($sen); 1161 | @sen1 = split(/\t/, $sen); 1162 | ${$inf{$i}}{0} = [@sen1[0..9]]; 1163 | for($j=1; $j<=1; $j++){ 1164 | $sen = ; 1165 | chomp($sen); 1166 | @sen1 = split(/\t/, $sen); 1167 | ${$inf{$i}}{$j} = [@sen1[0..9]]; 1168 | } 1169 | if($i >= 10){ 1170 | last; 1171 | } 1172 | } 1173 | close FILE1; 1174 | return(\%inf); 1175 | } 1176 | 1177 | sub read_tomtom{ 1178 | my $fdata_file = shift; my $prot_name = shift; my $Sinf = shift; 1179 | my $i = 0; my $j = 0; my $r = 0; 1180 | my $sen = ""; my $Pattern = $prot_name."_"; 1181 | my @sen1 = (); my @sen2 = (); my @sent1 = (); my @sent2 = (); 1182 | my %inf = (); my %uniq = (); my %sinf = %{$Sinf}; 1183 | $inf{1} = []; $uniq{1} = 1; 1184 | open(FILE1, $fdata_file)||die("open $fdata_file error!\n"); 1185 | ##Query ID Target ID Optimal offset p-value E-value q-value Overlap Query consensus Target consensus Orientation 1186 | #RBFOX2_mes1 RBFOX2_mes1 0 2.32559e-10 2.32559e-09 4.65117e-09 10 ACTGCATGTA ACTGCATGTA + 1187 | #RBFOX2_mes3 RBFOX2_mes9 -1 0.00661106 0.0661106 0.0440737 9 AACATGTTCA AAATGTGCCA + 1188 | #RBFOX2_mes1 RBFOX2_mes4 1 0.0902425 0.902425 0.150404 9 ACTGCATGTA AAATGCTTGA - 1189 | #RBFOX2_mes3 RBFOX2_mes2 2 0.230382 2.30382 0.394583 8 AACATGTTCA TCTGCATGCT + 1190 | $sen = ; 1191 | while($sen = ){ 1192 | #$i = $i + 1; 1193 | chomp($sen); 1194 | @sen1 = split(/\t/, $sen); 1195 | if(($sen1[5] < 0.05) && ($sen1[9] eq "+") &&($sen1[1] ne $sen1[0])){ 1196 | $sen1[0]=~s/$prot_name/$Pattern/g; 1197 | $sen1[1]=~s/$prot_name/$Pattern/g; 1198 | @sent1 = split(/\_/, $sen1[0]); 1199 | @sent2 = split(/\_/, $sen1[1]); 1200 | 
if(&mismatch(${$sinf{$sent1[-1]}}[1], ${$sinf{$sent2[-1]}}[1]) <= 1){ 1201 | if((!exists $uniq{$sent1[-1]})&&(!exists $uniq{$sent2[-1]})){ 1202 | if(exists $inf{$sent1[-1]}){ 1203 | push(@{$inf{$sent1[-1]}}, $sent2[-1]."|".$sen1[2]."|".$sen1[7]."|".$sen1[8]); 1204 | $uniq{$sent1[-1]} = 1; 1205 | $uniq{$sent2[-1]} = 1; 1206 | }else{ 1207 | $inf{$sent1[-1]} = [$sent2[-1]."|".$sen1[2]."|".$sen1[7]."|".$sen1[8]]; 1208 | $uniq{$sent1[-1]} = 1; 1209 | $uniq{$sent2[-1]} = 1; 1210 | } 1211 | } 1212 | } 1213 | } 1214 | } 1215 | for($i=1; $i<=10; $i++){ 1216 | if((!exists $uniq{$i}) && (!exists $inf{$i})){ 1217 | $inf{$i} = []; 1218 | } 1219 | } 1220 | 1221 | close FILE1; 1222 | return(\%inf); 1223 | } 1224 | 1225 | exit; 1226 | --------------------------------------------------------------------------------
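Usage note (not part of the repository files above): saliency_motif.pl takes the PrismNet saliency/attention file as its first argument and an output prefix (protein_cell) as its second, e.g. "perl saliency_motif.pl infile.sal outfile", and it shells out to RNAfold (icSHAPE-constrained folding in get_str) and tomtom (motif clustering), so both tools must be on PATH. Among its outputs are "<prot_cell>_motif_summary.txt" (motif_id, motif_site, motif_weight) and "<prot_cell>_motif_prob.txt", which holds six rows per motif labelled "<id>_seq_A/C/G/U" and "<id>_str_P/U", each with ten tab-separated, site-normalised values. The following minimal sketch, which assumes numpy is available and uses a hypothetical helper name and file name, shows one way to load that probability table back into per-motif sequence and structure matrices for downstream inspection or plotting; it is an illustration of the file layout described above, not code shipped with PrismNet.

import numpy as np

def read_motif_prob(path):
    """Parse <prot_cell>_motif_prob.txt into {motif_id: (seq_matrix, str_matrix)}.

    Assumes the layout written by saliency_motif.pl: for every motif, six
    tab-separated rows labelled <id>_seq_A, <id>_seq_C, <id>_seq_G, <id>_seq_U,
    <id>_str_P, <id>_str_U, each followed by ten site-normalised values.
    """
    rows = {}
    with open(path) as fh:
        for line in fh:
            fields = line.rstrip("\t\n").split("\t")
            if not fields or not fields[0]:
                continue  # skip blank lines
            label, values = fields[0], [float(v) for v in fields[1:]]
            motif_id, channel = label.split("_", 1)  # e.g. "3", "seq_A"
            rows.setdefault(motif_id, {})[channel] = values
    motifs = {}
    for motif_id, channels in rows.items():
        # 4 x 10 nucleotide-probability matrix (A, C, G, U)
        seq_mat = np.array([channels[c] for c in ("seq_A", "seq_C", "seq_G", "seq_U")])
        # 2 x 10 structure-probability matrix (paired P, unpaired U)
        str_mat = np.array([channels[c] for c in ("str_P", "str_U")])
        motifs[motif_id] = (seq_mat, str_mat)
    return motifs

# Example with a hypothetical prefix:
# motifs = read_motif_prob("TIA1_Hela_motif_prob.txt")
# seq_mat, str_mat = motifs["1"]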