├── Docker
    └── Dockerfile
├── LICENSE
├── Manual.pdf
├── README.md
├── Testdata
    └── Testdata.zip
├── bin
    ├── Addchr2gtf.py
    ├── Annotate.py
    ├── Annotate.sh
    ├── CalcRPKM.sh
    ├── ChiDist_2.2.1_Combine.R
    ├── CombinePipeline_Predict.sh
    ├── CombinePipeline_Retrain.sh
    ├── CombinePipeline_before_FS.sh
    ├── CombinePipeline_before_FS_NoMappa.sh
    ├── CombinePipeline_startwith_ChiDist.sh
    ├── CombinePipeline_startwith_FS.sh
    ├── Data_preprocess_MyRetrain.py
    ├── ExtractChimericRead4Retrain.py
    ├── ExtractSimulatedChimericRead4Retrain.py
    ├── FilterChiDist.py
    ├── FindChiDist.py
    ├── FindFusionSupport.py
    ├── FindFusionSupport.sh
    ├── FindHomoPattern_RAM.py
    ├── FusionScore.py
    ├── GetExonPos.py
    ├── GetGenePos.py
    ├── Model1.py
    ├── Model1_Retrain.py
    ├── MyPredict.py
    ├── PreProcessing_SingleFile.py
    ├── RenameFastqFiles.py
    ├── ResultFinalOutput.py
    ├── ResultLastFiltered.py
    ├── Results_Filtered2Final.py
    ├── RmLowMappibility_ChimericRead.py
    ├── RmLowMappibility_ChimericRead_NoFilter.py
    ├── StarMapping_Chimeric.sh
    ├── TidyupFusionFinalResult.py
    ├── TidyupFusionFinalResult_FindSupCell.py
    └── run_RmLowMap.sh
├── data
    ├── hg19mappability75.txt.zip
    └── weight-V9-2.hdf5
└── scFusion.py


/Docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Refresh the scFusion scripts inside the published scFusion image.
 2 | FROM jzj2035198/scfusion:latest
 3 | 
 4 | # MAINTAINER is deprecated; LABEL carries the same contact information.
 5 | LABEL maintainer="Zijie Jin (jinzijie@pku.edu.cn)"
 6 | 
 7 | # git is needed for the clone below; the apt lists are removed in the same
 8 | # layer so they do not end up in the image.
 9 | RUN apt-get update -y \
10 |         && apt-get install -y git \
11 |         && rm -rf /var/lib/apt/lists/*
12 | 
13 | WORKDIR /usr/local/src/
14 | 
15 | RUN git clone --recursive https://github.com/ZijieJin/scFusion.git
16 | 
17 | # Overlay the freshly cloned scripts onto the scFusion-1.4 tree and expose the
18 | # result as /usr/local/src/scFusion; the raw clone is kept as scFusion_old.
19 | # NOTE(review): assumes the base image already ships /usr/local/src/scFusion-1.4 - confirm.
20 | RUN mv scFusion/bin/* scFusion-1.4/bin/ \
21 |         && mv scFusion/scFusion* scFusion-1.4/ \
22 |         && mv scFusion/ scFusion_old/ \
23 |         && mv scFusion-1.4/ scFusion/
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | For non-academic use, please email Prof. Xi (ruibinxi@math.pku.edu.cn) to obtain the paid commercial license.
 2 | 
 3 | For academic use, source code is licensed under MIT License. 
 4 | 
 5 | 
 6 | MIT License
 7 | 
 8 | Copyright (c) 2022 XiLab
 9 | 
10 | Permission is hereby granted, free of charge, to any person obtaining a copy
11 | of this software and associated documentation files (the "Software"), to deal
12 | in the Software without restriction, including without limitation the rights
13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 | copies of the Software, and to permit persons to whom the Software is
15 | furnished to do so, subject to the following conditions:
16 | 
17 | The above copyright notice and this permission notice shall be included in all
18 | copies or substantial portions of the Software.
19 | 
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 | SOFTWARE.
27 | 


--------------------------------------------------------------------------------
/Manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XiDsLab/scFusion/c5fc1bd43452d9f187e47242565e782e59c5afed/Manual.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # scFusion  [![DOI](https://zenodo.org/badge/372129480.svg)](https://zenodo.org/badge/latestdoi/372129480)
 2 | 
 3 | scFusion is a computational pipeline for detecting gene fusions at single-cell resolution. scFusion works on Linux/Mac OS. If you have any questions related to scFusion, please visit https://github.com/ZijieJin/scFusion and post them on the *Issues* page or email me: jinzijie@pku.edu.cn
 4 | 
 5 | ## Software Prerequisite
 6 | 
 7 | The software below should be in your PATH. **(They can be installed by conda and pip)**
 8 | 
 9 | - [STAR](https://github.com/alexdobin/STAR) >= 2.7.2d (tested on 2.7.2d and 2.7.8a)
10 | - samtools >= 1.0 (tested on version 1.10)
11 | - bedtools (tested on version 2.29.2)
12 | - python 3
13 | - R >= 3.5 
14 | - R package: stringr
15 | - python module: [pyensembl](https://github.com/openvax/pyensembl)
16 | - python module: pysam (tested on version 0.18.0)
17 | - python module: tensorflow, keras, and numpy (version 2.8.0, 2.8.0, and 1.22.3, respectively) **OR** tensorflow, keras, numpy, and scipy (version 2.3.0, 2.4.3, 1.18.5, and 1.4.1, respectively)
18 | 
19 | 
20 | ## Recommended Configuration
21 | 
22 | - 64 GB memory or more for each task
23 | 
24 | - 8 CPU cores or more for each task 
25 | 
26 | ## Optional Configuration
27 | 
28 | - Job scheduler (e.g. Slurm)
29 | 
30 | ## Data Requirement
31 | 
32 | - Single-cell RNA sequencing files from the Smart-Seq protocol. File names should be \*_1.fastq, \*_2.fastq (e.g. 1_1.fastq, 1_2.fastq, 2_1.fastq, 2_2.fastq). **\*_1.fq is not allowed.**
33 | 
34 | - Reference genome file (*.fa)(like hg19.fa, file size = ~3G)
35 | 
36 | - GTF annotation file (*.gtf) (Can be obtained from Ensembl (ftp://ftp.ensembl.org/pub/), NCBI, or UCSC)
37 | 
38 | - (Optional) [Mappability file](https://genome.ucsc.edu/cgi-bin/hgTables) (can be obtained from UCSC). **If you are using the hg38 version of the mappability file, please ensure the file format is the same as hg19's (only 4 columns: chr, start, end, value).** If you do not provide this file, scFusion will turn off the mappability filter.
39 | 
40 | ## Usage
41 | 
42 | Download all scripts, unzip the hg19mappability file in the data folder, and run scFusion.py.
43 | 
44 | See **Manual.pdf** for details.
45 | 
46 | 
47 | ## Alternative installation
48 | 
49 | scFusion is easy to use, consisting of Python, R, and Shell scripts. All prerequisites can be installed by conda and pip. If you have trouble with the installation, we provide a Dockerfile to build a Docker image. 
50 | 
51 | Assuming all the required files and folders are in the directory XXX, run
52 | 
53 | `docker run -v XXX:/data --rm jzj2035198/scfusion python -u /usr/local/src/scFusion/scFusion.py [commands]`
54 | 
55 | The `XXX:/data` means you map the XXX folder to /data, so all your files and directories in XXX can be found in /data. 
56 | 
57 | ## About the annotation file
58 | 
59 | The annotation file (\*.gtf) may come in different formats, so making scFusion compatible with all of them is difficult. scFusion expects 'gene_name' to be the gene name attribute in the annotation, and 'gene_type' or 'gene_biotype' to be the gene type attribute (e.g. pseudogene or lncRNA). 
60 | 
61 | ## Commercial use
62 | 
63 | For non-academic use, please email Prof. Xi (ruibinxi@math.pku.edu.cn) to obtain the paid commercial license.
64 | 
65 | For academic use, source code is licensed under MIT License. 
66 | 


--------------------------------------------------------------------------------
/Testdata/Testdata.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XiDsLab/scFusion/c5fc1bd43452d9f187e47242565e782e59c5afed/Testdata/Testdata.zip


--------------------------------------------------------------------------------
/bin/Addchr2gtf.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | infile = open(sys.argv[1])
 4 | 
 5 | for line in infile.readlines():
 6 | 	if line.startswith('#'):
 7 | 		print(line, end='')
 8 | 		continue
 9 | 	if not line.startswith('chr'):
10 | 		print('chr' + line, end='')
11 | 	else:
12 | 		print(line, end='')
13 | 
14 | infile.close()
15 | 


--------------------------------------------------------------------------------
/bin/Annotate.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import sys
 3 | import pyensembl
 4 | 
 5 | 
 6 | myfile = open(sys.argv[1])
 7 | gtffile = sys.argv[2]
 8 | outfile = open(sys.argv[1][:-4] + '_geneanno.sam', 'w')
 9 | data = pyensembl.Genome(reference_name='GRCH37', annotation_name='my_genome_features', gtf_path_or_url=gtffile)
10 | data.index()
11 | with myfile:
12 |     for line in myfile:
13 |         if line[0] == '@':
14 |             continue
15 |         info = line.split('\t')
16 |         chr = info[2]
17 |         pos = int(info[3])
18 |         nameresult = data.gene_names_at_locus(contig=chr, position=pos)
19 |         k = 20
20 |         if nameresult == []:
21 |             nameresult = data.gene_names_at_locus(contig=chr, position=pos+k)
22 |         if nameresult == []:
23 |             nameresult = data.gene_names_at_locus(contig=chr, position=pos - k)
24 |         genename = ''
25 |         for item in nameresult:
26 |             genename += item + ';'
27 |         genename = genename[:-1]
28 |         outfile.write(genename + '\t' + line)
29 | 


--------------------------------------------------------------------------------
/bin/Annotate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run Annotate.py on <sam_dir>/<i>.sam for every cell index i in [start, end].
 3 | # Usage: Annotate.sh <sam_dir> <start> <end> <gtf> <codedir>
 4 | path=$1
 5 | mystart=$2
 6 | myend=$3
 7 | gtf=$4
 8 | codedir=$5
 9 | for((i=${mystart};i<=${myend};i++))
10 | do
11 | 	file="${path}/${i}.sam"
12 | 	# CombinePipeline_before_FS*.sh only writes <i>.sam for cells that produced
13 | 	# chimeric output, so skip the gaps instead of letting Annotate.py crash on open().
14 | 	[ -f "${file}" ] || continue
15 | 	# Quoted so paths containing spaces survive word splitting.
16 | 	python "${codedir}/Annotate.py" "${file}" "${gtf}"
17 | done
18 | 


--------------------------------------------------------------------------------
/bin/CalcRPKM.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Compute per-region read counts and RPKM for each cell's coordinate-sorted BAM.
 3 | # Usage: CalcRPKM.sh <regions.bed> <star_dir> <outdir> <start> <end>
 4 | # Output: <outdir>/<i>.rpkm.txt with three tab-separated columns:
 5 | #         BED column 4, BED column 5 (the perl code reads the count from here), RPKM.
 6 | bed=$1
 7 | dir=$2
 8 | outdir=$3
 9 | mystart=$4
10 | myend=$5
11 | for((i=${mystart};i<=${myend};i++))
12 | do
13 | 	file=`ls "${dir}/${i}"/*Aligned.sortedByCoord.out.bam`
14 | 	samtools index "${file}"
15 | 	# Column 3 of idxstats is the mapped-read count per reference; sum it over all references.
16 | 	export total_reads=`samtools idxstats "${file}"|awk -F '\t' '{s+=$3}END{print s}'`
17 | 	# RPKM = 1e9 * count / (region length * total mapped reads). Report 0 when the region
18 | 	# is empty or the BAM has no mapped reads, instead of dying on a division by zero.
19 | 	bedtools multicov -bams "${file}" -bed "$bed" |perl -alne '{$len=$F[2]-$F[1];if($len <1 || !$ENV{total_reads}){print "$F[3]\t$F[4]\t0" }else{$rpkm=(1000000000*$F[4]/($len* $ENV{total_reads}));print "$F[3]\t$F[4]\t$rpkm"}}' >  "${outdir}/${i}.rpkm.txt"
20 | done


--------------------------------------------------------------------------------
/bin/ChiDist_2.2.1_Combine.R:
--------------------------------------------------------------------------------
  1 | ## README ####
  2 | ### V 2.2.1
  3 | ##############
  4 | 
  5 | options(warn=-1)
  6 | ## Negative log-likelihood of a zero-inflated NB (ZINB) model over the rows of the global `data_filtered`; minimised by optim() to get maximum-likelihood parameters. par has 9 entries; also reads the globals `gcprelist` and `numcell`.
  7 | CalcZINB = function(par)
  8 | {
  9 |   sum = 0 # running log-likelihood (the local shadows sum() only as a variable; calls to sum() still work)
 10 |   for (i in 1:dim(data_filtered)[1])
 11 |   {
 12 |     cc = as.numeric(stringr::str_split(stringr::str_sub(data_filtered$V4[i], 2, -2), ', ')[[1]]) # strip the surrounding brackets and parse the ', '-separated list (presumably per-cell supporting read counts)
 13 |     expr1 = as.numeric(stringr::str_split(stringr::str_sub(data_filtered$V15[i], 2, -2), ', ')[[1]])
 14 |     expr2 = as.numeric(stringr::str_split(stringr::str_sub(data_filtered$V16[i], 2, -2), ', ')[[1]])
 15 |     if (sum(is.na(expr1))>0 | sum(is.na(expr2))>0 | sum(is.na(cc))>0){
 16 |       next() # rows with unparsable entries contribute nothing
 17 |     }
 18 |     aveexpr1 = mean(expr1)
 19 |     aveexpr2 = mean(expr2)
 20 |     gc = (data_filtered$V13[i] + data_filtered$V14[i]) / 2
 21 |     gcpre = log(gcprelist[round(gc * 1000) + 1]) # gcprelist is indexed on a 0..1 grid in steps of 0.001
 22 |     lambda = exp(par[5]) # NB dispersion; the NB size parameter is 1 / lambda
 23 |     linpart = par[6] + par[7] * gcpre + par[8] * aveexpr1 + par[9] * aveexpr2
 24 |     zeroprob = min(exp(linpart) / (1 + exp(linpart)), 0.9999999999999) # logistic zero-inflation probability, capped below 1 so log(1 - zeroprob) stays finite
 25 |     for (j in 1:length(cc)) {
 26 |       mu = par[1] + par[2] * gcpre + exp(par[3]) * expr1[j] + exp(par[4]) * expr2[j] # log-mean; the expression coefficients are kept positive via exp()
 27 |       mu = exp(mu)
 28 |       sum = sum + dnbinom(as.numeric(cc[j]),
 29 |                           1 / lambda,
 30 |                           prob = 1 / (1 + mu * lambda),
 31 |                           log = TRUE) + log(1 - zeroprob)
 32 |       sum = sum - log(1 - dnbinom(0, 1 / lambda, prob = 1 / (1 + mu * lambda))) # renormalise because the NB is truncated at zero
 33 |     }
 34 |     sum = sum + (numcell - length(cc)) * log(zeroprob) # the remaining numcell - length(cc) cells each contribute a zero
 35 |     sum = sum - log(1 - zeroprob ^ numcell - numcell * (1 - zeroprob) *
 36 |                       zeroprob ^ (numcell - 1)) # condition on at least two cells supporting the fusion
 37 |     if (is.nan(sum)){
 38 |       break # stop accumulating once the likelihood is NaN; NaN is then returned
 39 |     }
 40 |   }
 41 |   - sum # optim() minimises, so return the negative log-likelihood
 42 | }
 43 | 
 44 | ## Simulation-based p-value for every row of the global `data` under the fitted ZINB parameters `par`; returns a numeric vector with one p-value per row. Also reads the globals `gcprelist` and `numcell`.
 45 | CalcPValueZINB_Fusion_2.0.0_3 = function(par)
 46 | {
 47 |   Pv = rep(0, dim(data)[1])
 48 |   for (i in 1:dim(data)[1]){
 49 |     cc = as.numeric(stringr::str_split(stringr::str_sub(data$V4[i], 2, -2), ', ')[[1]])
 50 |     if (sum(cc) <= 3 | data$V3[i] <= 1){
 51 |       Pv[i] = 1 # too little support: 3 or fewer reads in total, or V3 <= 1
 52 |       next()
 53 |     }
 54 |     expr1 = as.numeric(stringr::str_split(stringr::str_sub(data$V15[i], 2, -2), ', ')[[1]])
 55 |     expr2 = as.numeric(stringr::str_split(stringr::str_sub(data$V16[i], 2, -2), ', ')[[1]])
 56 |     if (sum(is.na(expr1))>0 | sum(is.na(expr2))>0 | sum(is.na(cc))>0){
 57 |       Pv[i] = 1 # unparsable entries are treated as non-significant
 58 |       next()
 59 |     }
 60 |     aveexpr1 = mean(expr1) / 2 # halved here and doubled again in the mean model below
 61 |     aveexpr2 = mean(expr2) / 2
 62 |     gc = (data$V13[i] + data$V14[i]) / 2
 63 |     gcpre = log(gcprelist[round(gc * 1000) + 1])
 64 |     numcellsup = length(cc) # not used below
 65 |     linpart = par[6] + par[7] * gcpre + par[8] * aveexpr1 + par[9] * aveexpr2
 66 |     zeroprob = min(exp(linpart) / (1 + exp(linpart)), 0.9995)
 67 |     betamu = 1 - zeroprob
 68 |     betavar = betamu * (1 - betamu) / numcell
 69 |     myalpha = (betamu * (1 - betamu) / betavar - 1) * betamu # myalpha / mybeta are computed but not used below
 70 |     mybeta = (1 / betamu - 1) * myalpha
 71 |     for (j in 1:length(cc)) {
 72 |       cc[j] = min(15, cc[j]) # cap per-entry counts at 15, matching the cap applied to the simulated counts
 73 |     }
 74 |     totalread = sum(cc) # observed statistic: total of the capped counts
 75 |     mu = par[1] + par[2] * gcpre + exp(par[3]) * 2 * aveexpr1 + exp(par[4]) * 2 * aveexpr2
 76 |     truemu = exp(mu)
 77 |     mu = max(truemu, 0.1) # floor the mean used for simulation
 78 |     lambda = exp(par[5])
 79 |     truevar = truemu + truemu ^ 2 * lambda
 80 |     lambda = max(0.001, (truevar - mu) / (mu ^ 2)) # re-derive the dispersion so the floored mean keeps the model variance
 81 |     simuset = rnbinom(10000, 1 / lambda, prob = 1 / (1 + mu * lambda))
 82 |     simuset = simuset[simuset > 0] # keep only positive draws (zero-truncated NB)
 83 |     whilecount = 0
 84 |     while (length(simuset) < 2000){
 85 |       newsimuset = rnbinom(10000, 1 / lambda, prob = 1 / (1 + mu * lambda))
 86 |       newsimuset = newsimuset[newsimuset>0]
 87 |       simuset = c(simuset, newsimuset)
 88 |       whilecount = whilecount + 1
 89 |       if (whilecount > 100){
 90 |         simuset = c(simuset, 1) # after 100 rounds pad with 1s so the loop always reaches 2000 values
 91 |       }
 92 |     }
 93 |     simuset[simuset >= 15] = 15
 94 |     totalreadsample = c()
 95 |     for (j in 1:100){
 96 |       nonzeronum = rbinom(1, numcell, 1-zeroprob) # number of cells with a non-zero count in this replicate
 97 |       usedreadnum = sample(simuset, nonzeronum, replace = T)
 98 |       totalreadsample = c(totalreadsample, sum(usedreadnum))
 99 |     }
100 |     totalreadmean = mean(totalreadsample)
101 |     totalreadsd = sd(totalreadsample)
102 |     thispvalue = pnorm(totalread, totalreadmean, totalreadsd, lower.tail = F) # upper-tail normal approximation built from the 100 simulated totals
103 |     Pv[i] = thispvalue
104 |   }
105 |   #Pv = p.adjust(Pv, method = 'fdr')
106 |   Pv
107 | }
108 | 
109 | library(splines)
110 | library(stringr)
111 | set.seed(1122) # fixed seed: the sampling and simulation below are reproducible
112 | Args = commandArgs() # full command line; the code below reads user arguments from index 6 onwards
113 | numcell = as.integer(Args[6])
114 | data = read.table(Args[7],sep='\t', stringsAsFactors = F)
115 | data_filtered = read.table(Args[8],sep='\t', stringsAsFactors = F)
116 | lastpar = rep(-100, 9) # sentinel meaning "no previously saved parameters"
117 | if (length(Args) >= 10){
118 |   if (file.exists(Args[10])){
119 |     load(Args[10]) # expected to restore `optres`, which this script save()s further down
120 |     lastpar = optres$par
121 |   }
122 | }
123 | 
124 | ### filter bad gc: drop rows where both V13 and V14 are <= 0.05 or both are >= 0.95
125 | neiflag = c()
126 | for (i in 1:dim(data)[1]) {
127 |   if (max(data$V14[i], data$V13[i]) <= 0.05||min(data$V14[i], data$V13[i]) >= 0.95) {
128 |     neiflag[i] = 1
129 |   } else{
130 |     neiflag[i] = 0
131 |   }
132 | }
133 | data = data.frame(data, neiflag)
134 | data = data[data$neiflag == 0,]
135 | row.names(data) = 1:dim(data)[1]
136 | 
137 | 
138 | # fit a spline for gc: mean V4 count per distinct (rounded) gc value, smoothed with a B-spline
139 | gc = round((data$V14 + data$V13) / 2, digits = 3)
140 | gclist = sort(unique(gc))
141 | gclist = round(gclist, digits = 3)
142 | gclist = unique(gclist)
143 | chimexpr = gclist # same length as gclist; every entry is overwritten in the loop below
144 | for (i in 1:length(gclist)) {
145 |   usedata = data[gc == gclist[i],]
146 |   exprtotal = 0
147 |   count = 0
148 |   for (j in 1:dim(usedata)[1]) {
149 |     cc = as.numeric(stringr::str_split(stringr::str_sub(usedata$V4[j], 2, -2), ', ')[[1]])
150 |     if (length(cc) >= 1) {
151 |       exprtotal = exprtotal + sum(cc)
152 |       count = count + length(cc)
153 |     }
154 |   }
155 |   chimexpr[i] = exprtotal / count
156 | }
157 | lmres = lm(chimexpr ~ bs(gclist, df = 5, intercept = T))
158 | basepre = predict(lmres, data.frame(gclist = gclist))
159 | diff = abs(chimexpr - basepre)
160 | thred = mean(diff) + 1.5 * sqrt(var(diff))
161 | for (i in 1:length(gclist)) {
162 |   pre = basepre[i]
163 |   if (chimexpr[i] - pre > thred) {
164 |     chimexpr[i] = pre # pull values far above the first fit down to the fitted value, then refit
165 |   }
166 | }
167 | lmres = lm(chimexpr ~ bs(gclist, df = 5, intercept = T))
168 | gcprelist = predict(lmres, data.frame(gclist = seq(0, 1, length.out = 1001))) # prediction on a 0..1 grid in steps of 0.001
169 | gcprelist[gcprelist <= 0] = 0.01 # keep predictions positive because the callers take log()
170 | 
171 | print('Start Reading ChiDist File!')
172 | neiflag = rep(0, dim(data)[1])
173 | totalcount = rep(0, dim(data)[1])
174 | datacc = rep('', dim(data)[1])
175 | dataexpmin = rep('', dim(data)[1])
176 | dataexpmax = rep('', dim(data)[1])
177 | datagcmin = rep(0, dim(data)[1])
178 | datagcmax = rep(0, dim(data)[1])
179 | for (i in 1:dim(data)[1]) {
180 |   cc = as.numeric(stringr::str_split(stringr::str_sub(data$V4[i], 2, -2), ', ')[[1]])
181 |   totalcount[i] = sum(cc) # total taken before any subsampling of low-count entries
182 |   smallindex = which(cc <= 2)
183 |   bigindex = which(cc > 2)
184 |   if (max(cc) < 2 | length(cc) < 2) {
185 |     neiflag[i] = 1 # drop rows with no entry >= 2 or with fewer than two entries
186 |   }
187 |   count1 = sum(cc<=2)
188 |   usesmallcellnum = floor(data$V5[i] * 1.1 + 2) # maximum number of low-count (<= 2) entries to keep
189 |   usesmallcell = c()
190 |   if (count1 <= usesmallcellnum){
191 |     usesmallcell = smallindex
192 |   }else{
193 |     usesmallcell = sample(smallindex, usesmallcellnum)
194 |   }
195 |   cc = cc[union(bigindex, usesmallcell)]
196 |   if (neiflag[i] == 1) {
197 |     next()
198 |   }
199 |   expmin = c()
200 |   expmax = c()
201 |   gene1expr = as.numeric(stringr::str_split(stringr::str_sub(data$V15[i], 2, -2), ', ')[[1]])
202 |   gene2expr = as.numeric(stringr::str_split(stringr::str_sub(data$V16[i], 2, -2), ', ')[[1]])
203 |   if (min(min(gene1expr), min(gene2expr)) < 0.0001) {
204 |     neiflag[i] = 1 # drop rows where either list contains a (near-)zero value
205 |     next()
206 |   }
207 |   datagcmin[i] = min(data$V14[i], data$V13[i])
208 |   datagcmax[i] = max(data$V14[i], data$V13[i])
209 |   for (j in 1:length(gene1expr)) {
210 |     expmin = c(expmin, log(1 + min(
211 |       as.numeric(gene1expr[j]), as.numeric(gene2expr[j])
212 |     ) / 100))
213 |     expmax = c(expmax, log(1 + max(
214 |       as.numeric(gene1expr[j]), as.numeric(gene2expr[j])
215 |     ) / 100))
216 |   }
217 |   expmin = expmin[union(bigindex, usesmallcell)] # same selection as applied to cc above
218 |   expmax = expmax[union(bigindex, usesmallcell)]
219 |   datacc[i] = paste0('[', str_c(as.character(cc), collapse = ', '), ']')
220 |   dataexpmin[i] = paste0('[', str_c(as.character(expmin), collapse = ', '), ']')
221 |   dataexpmax[i] = paste0('[', str_c(as.character(expmax), collapse = ', '), ']')
222 | }
223 | data$V4 = datacc # from here on V15/V16 hold the log-scaled per-entry min/max and V13/V14 the min/max gc
224 | data$V15 = dataexpmin
225 | data$V16 = dataexpmax
226 | data$V13 = datagcmin
227 | data$V14 = datagcmax
228 | data$neiflag = neiflag
229 | data$totalreadcount = totalcount
230 | data = data[data$neiflag == 0,]
231 | row.names(data) = 1:dim(data)[1]
232 | 
233 | if (norm(lastpar - rep(-100, 9), '2') <= 1){ # lastpar still equals the sentinel: no saved parameters, so build the training set from data_filtered
234 |   neiflag = rep(0, dim(data_filtered)[1])
235 |   totalsup = rep(0, dim(data_filtered)[1])
236 |   lastpos1 = c(-1)
237 |   lastpos2 = c(-1)
238 |   cellsupqs = quantile(data_filtered$V3)
239 |   highthres = cellsupqs[3] + cellsupqs[4] - cellsupqs[2] # median + (Q3 - Q1) of V3
240 |   for (i in 1:dim(data_filtered)[1]) {
241 |     cc = as.numeric(stringr::str_split(stringr::str_sub(data_filtered$V4[i], 2, -2), ',')[[1]])
242 |     m = mean(cc)
243 |     a = runif(1)
244 |     if (a > m ^ 0.1 * 70000 / dim(data_filtered)[1]) { # random thinning: a row is kept with probability about m^0.1 * 70000 / nrow
245 |       neiflag[i] = 1
246 |       next()
247 |     }
248 |     if (max(cc) < 2 && dim(data_filtered)[1]>150) {
249 |       neiflag[i] = 0 # both branches set 0; kept as in the original
250 |     }else{
251 |       neiflag[i] = 0
252 |     }
253 |     if (max(cc) < 2 && length(cc) <= 5){
254 |       neiflag[i] = 1
255 |     }
256 |     if (data_filtered$V3[i] > highthres && dim(data_filtered)[1]<150){
257 |       neiflag[i] = 1
258 |     }
259 |     for (k in 1:length(lastpos1)) { # drop rows whose (V9, V10) pair, in either order, matches a recently kept row
260 |       if (data_filtered$V9[i] == lastpos1[k] &&
261 |           data_filtered$V10[i] == lastpos2[k]||data_filtered$V9[i] == lastpos2[k] &&
262 |           data_filtered$V10[i] == lastpos1[k]) {
263 |         neiflag[i] = 1
264 |         break()
265 |       }
266 |     }
267 |     if (neiflag[i] == 0) {
268 |       lastpos1 = c(lastpos1, data_filtered$V9[i])
269 |       lastpos2 = c(lastpos2, data_filtered$V10[i])
270 |       if (length(lastpos1) > 100) { # remember only the 100 most recent positions
271 |         lastpos1 = lastpos1[2:101]
272 |         lastpos2 = lastpos2[2:101]
273 |       }
274 |     }
275 |     if (neiflag[i] == 1) {
276 |       next()
277 |     }
278 |     totalsup[i] = sum(cc)
279 |     gcpair = c(data_filtered$V13[i], data_filtered$V14[i]) # capture both GC values before either column is overwritten
280 |     data_filtered$V13[i] = min(gcpair); data_filtered$V14[i] = max(gcpair) # previously V14 was computed from the already-overwritten V13, losing the larger value
281 |     expmin = c()
282 |     expmax = c()
283 |     gene1expr = as.numeric(stringr::str_split(stringr::str_sub(data_filtered$V15[i], 2, -2), ', ')[[1]])
284 |     gene2expr = as.numeric(stringr::str_split(stringr::str_sub(data_filtered$V16[i], 2, -2), ', ')[[1]])
285 |     for (j in 1:length(gene1expr)) {
286 |       expmin = c(expmin, log(1 + min(
287 |         as.numeric(gene1expr[j]), as.numeric(gene2expr[j])
288 |       ) / 100))
289 |       expmax = c(expmax, log(1 + max(
290 |         as.numeric(gene1expr[j]), as.numeric(gene2expr[j])
291 |       ) / 100))
292 |     }
293 |     data_filtered$V15[i] = paste0('[', str_c(as.character(expmin), collapse = ', '), ']')
294 |     data_filtered$V16[i] = paste0('[', str_c(as.character(expmax), collapse = ', '), ']')
295 |     if (data_filtered$V7[i] == data_filtered$V8[i] &
296 |         abs(data_filtered$V10[i] - data_filtered$V9[i]) < 200000) { # V7 == V8 and positions less than 200000 apart: excluded from training
297 |       neiflag[i] = 1
298 |     } else{
299 |       neiflag[i] = 0
300 |     }
301 |     if (min(min(gene1expr), min(gene2expr)) < 0.0001) {
302 |       neiflag[i] = 1
303 |     }
304 |   }
305 |   data_filtered = data.frame(data_filtered, neiflag)
306 |   data_filtered = data_filtered[data_filtered$neiflag == 0,]
307 |   row.names(data_filtered) = 1:dim(data_filtered)[1]
308 |   
309 |   neiflag = rep(0, dim(data_filtered)[1])
310 |   for (i in 1:dim(data_filtered)[1]) { # second random thinning, this time scaled by 700 / nrow
311 |     cc = as.numeric(stringr::str_split(stringr::str_sub(data_filtered$V4[i], 2, -2), ',')[[1]])
312 |     m = mean(cc)
313 |     a = runif(1)
314 |     if (a > m ^ 0.1 * 700 / dim(data_filtered)[1]) {
315 |       neiflag[i] = 1
316 |     } else{
317 |       neiflag[i] = 0
318 |     }
319 |   }
320 |   data_filtered$neiflag = neiflag
321 |   data_filtered = data_filtered[data_filtered$neiflag == 0,]
322 |   row.names(data_filtered) = 1:dim(data_filtered)[1]
323 | }
324 | 
325 | 
326 | 
327 | if (norm(lastpar - rep(-100, 9), '2') < 1){ # no saved parameters were loaded: fit them now
328 |   print('Start Estimating Parameters!')
329 |   optres = optim(rnorm(9, mean=-0.4, sd=0.1), CalcZINB, control = list(maxit = 10000)) # random start; optim's default method is used
330 |   if (length(Args) >= 10){
331 |     save(optres, file = Args[10]) # persist the fit so that a later run can skip the estimation
332 |   }
333 | }else{
334 |   print('Skip Estimating Parameters!')
335 | }
336 | 
337 | 
338 | print('Start Calculating P-values!')
339 | PvalueFusion23 = CalcPValueZINB_Fusion_2.0.0_3(optres$par)
340 | data = cbind(data, PvalueFusion23)
341 | TrueFusion = rep(1, dim(data)[1])
342 | 
343 | print('Preparing for Output!')
344 | goodindex = which(TrueFusion <= 2) # TrueFusion is all 1, so this selects every row
345 | totalnumfusion = length(goodindex)
346 | allresult = data.frame(
347 |   'FusionName' = rep(NA, totalnumfusion),
348 |   CellSupport = rep(NA, totalnumfusion),
349 |   JunctionReadCount = rep(NA, totalnumfusion),
350 |   SpanningFragCount = rep(NA, totalnumfusion),
351 |   Position1 = rep(NA, totalnumfusion),
352 |   Position2 = rep(NA, totalnumfusion),
353 |   FakeProb = rep(NA, totalnumfusion),
354 |   Pv23 = rep(NA, totalnumfusion),
355 |   strand1 = rep(NA, totalnumfusion),
356 |   strand2 = rep(NA, totalnumfusion)
357 | )
358 | 
359 | fusioncandidateread = data.frame( # filled below but not written out in this script
360 |   JunctionRead = rep(NA, totalnumfusion),
361 |   Brkpnt = rep(NA, totalnumfusion),
362 |   Prob = rep(NA, totalnumfusion)
363 | )
364 | fusioncandidate = rep('', totalnumfusion)
365 | for (i in 1:totalnumfusion) {
366 |   fusioncandidate[i] = paste(data$V1[i], data$V2[i], sep = '--') # fusion name is "V1--V2"
367 | }
368 | allresult$FusionName = fusioncandidate
369 | allresult$JunctionReadCount = data$totalreadcount
370 | fusioncandidateread$JunctionRead = data$V19
371 | fusioncandidateread$Brkpnt = data$V20
372 | fusioncandidateread$Prob = data$V25
373 | allresult$FakeProb = data$V25
374 | allresult$Pv23 = data$PvalueFusion23
375 | allresult$CellSupport = data$V3
376 | allresult$strand1 = data$V21
377 | allresult$strand2 = data$V22
378 | spanningfragcount = rep(0, totalnumfusion)
379 | pos1 = rep('', totalnumfusion)
380 | pos2 = rep('', totalnumfusion)
381 | for (i in 1:totalnumfusion) {
382 |   if (as.numeric(data$V5[i]) == 0) {
383 |     spanningfragcount[i] = 0
384 |   } else{
385 |     spanningfragcount[i] = sum(as.numeric(stringr::str_split(
386 |       stringr::str_sub(data$V6[i], 2, -2), ', '
387 |     )[[1]])) # V6 is a bracketed ', '-separated list; its sum is reported as the spanning fragment count
388 |   }
389 |   pos1[i] = paste(data$V7[i], data$V9[i], sep = ':') # positions are written as "V7:V9" and "V8:V10"
390 |   pos2[i] = paste(data$V8[i], data$V10[i], sep = ':')
391 | }
392 | allresult$SpanningFragCount = spanningfragcount
393 | allresult$Position1 = pos1
394 | allresult$Position2 = pos2
395 | write.table(allresult, Args[9], sep = '\t', quote = F, row.names = F) # tab-separated table with a header row, written to the path in Args[9]
396 | 


--------------------------------------------------------------------------------
/bin/CombinePipeline_Predict.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run MyPredict.py, paste its Prob.txt next to ChiDist_middle.txt to form
 3 | # ChiDist.txt, then filter that table with FilterChiDist.py.
 4 | # Usage: CombinePipeline_Predict.sh <FilePath> <start> <end> <prefix|.> <weightfile> <genome.fa> <gtf> <codedir>
 5 | # start, end, the genome file and the GTF are accepted but not used in this script.
 6 | 
 7 | FilePath=$1
 8 | mystart=$2
 9 | myend=$3
10 | prefix=$4
11 | weightfile=$5
12 | hg19file=$6
13 | gtf=$7
14 | codedir=$8
15 | 
16 | # "." is the placeholder for "no prefix".
17 | if [ "${prefix}" = "." ]
18 | then
19 | 	prefix=""
20 | fi
21 | 
22 | mkdir -p "${FilePath}/ChiDist/"
23 | 
24 | # Paths are quoted so directories with spaces survive word splitting. The standalone
25 | # ${prefix} stays unquoted so an empty prefix still expands to no argument, as before.
26 | python "${codedir}/MyPredict.py"  "${FilePath}/ChiDist/${prefix}Prob.txt" "${weightfile}" ${prefix}
27 | paste "${FilePath}/ChiDist/${prefix}ChiDist_middle.txt" "${FilePath}/ChiDist/${prefix}Prob.txt" > "${FilePath}/ChiDist/${prefix}ChiDist.txt"
28 | python "${codedir}/FilterChiDist.py" "${FilePath}/ChiDist/${prefix}ChiDist.txt" > "${FilePath}/ChiDist/${prefix}ChiDist_filtered.txt"


--------------------------------------------------------------------------------
/bin/CombinePipeline_Retrain.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Retraining pipeline: extract chimeric reads, extract simulated chimeric reads,
 3 | # preprocess both, then run Model1_Retrain.py starting from the given weights.
 4 | # Usage: CombinePipeline_Retrain.sh <DatasetName> <prefix|.> <initweight> <epochnum> <codedir>
 5 | DatasetName=$1
 6 | prefix=$2
 7 | initweight=$3
 8 | epochnum=$4
 9 | codedir=$5
10 | mkdir -p "${DatasetName}/Retrain"
11 | mkdir -p "${DatasetName}/weights/"
12 | 
13 | # "." is the placeholder for "no prefix".
14 | if [ "${prefix}" = "." ]
15 | then
16 | 	prefix=""
17 | fi
18 | 
19 | # Paths are quoted so directories with spaces survive word splitting.
20 | python "${codedir}/ExtractChimericRead4Retrain.py" "${DatasetName}/ChiDist/${prefix}ChiDist_middle.txt" > "${DatasetName}/Retrain/${prefix}ChimericRead.txt"
21 | python "${codedir}/ExtractSimulatedChimericRead4Retrain.py" "${DatasetName}/Retrain/${prefix}ChimericRead.txt" "${DatasetName}/STARMapping/" > "${DatasetName}/Retrain/${prefix}SimuRead.txt"
22 | python "${codedir}/Data_preprocess_MyRetrain.py" "${DatasetName}/Retrain/${prefix}ChimericRead.txt" "${DatasetName}/Retrain/${prefix}SimuRead.txt" "${DatasetName}/Retrain/"
23 | # ${epochnum} is left unquoted so an empty value still expands to no argument, as before.
24 | python "${codedir}/Model1_Retrain.py" "${DatasetName}/Retrain/" "${initweight}" "${DatasetName}/weights/" ${epochnum}


--------------------------------------------------------------------------------
/bin/CombinePipeline_before_FS.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Per-cell steps before fusion scoring: filter each cell's chimeric SAM with
 3 | # RmLowMappibility_ChimericRead.py, then run Annotate.sh, FindFusionSupport.sh and CalcRPKM.sh.
 4 | # Usage: CombinePipeline_before_FS.sh <FilePath> <start> <end> <gtf> <mappabilityfile> <exonfile> <codedir>
 5 | 
 6 | FilePath=$1
 7 | mystart=$2
 8 | myend=$3
 9 | gtffile=$4
10 | mappabilityfile=$5
11 | exonfile=$6
12 | codedir=$7
13 | mkdir -p "${FilePath}/STARMapping"
14 | mkdir -p "${FilePath}/ChimericOut"
15 | mkdir -p "${FilePath}/Expr/"
16 | 
17 | 
18 | for ((i=${mystart};i<=${myend};i++))
19 | do
20 | 	# Empty when the cell has no chimeric output (ls reports that on stderr); such cells are skipped.
21 | 	file=`ls "${FilePath}/STARMapping/${i}"/*Chimeric.out.sam`
22 | 	if [[ -n ${file} ]]; then
23 | 		python "${codedir}/RmLowMappibility_ChimericRead.py" "${file}" "${FilePath}/ChimericOut/${i}.sam" "${mappabilityfile}" 1
24 | 	fi
25 | done
26 | 
27 | # Annotate.sh and CalcRPKM.sh use bash-only "for((...))" loops, so the helpers are run
28 | # with bash: a POSIX-only sh (e.g. dash) rejects that syntax.
29 | bash "${codedir}/Annotate.sh" "${FilePath}/ChimericOut/" "${mystart}" "${myend}" "${gtffile}" "${codedir}"
30 | bash "${codedir}/FindFusionSupport.sh" "${FilePath}/ChimericOut/" "${mystart}" "${myend}" "${codedir}"
31 | bash "${codedir}/CalcRPKM.sh" "${exonfile}" "${FilePath}/STARMapping" "${FilePath}/Expr/" "${mystart}" "${myend}"


--------------------------------------------------------------------------------
/bin/CombinePipeline_before_FS_NoMappa.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Variant of CombinePipeline_before_FS.sh without a mappability file: each cell's
 3 | # chimeric SAM goes through RmLowMappibility_ChimericRead_NoFilter.py, then
 4 | # Annotate.sh, FindFusionSupport.sh and CalcRPKM.sh are run.
 5 | # Usage: CombinePipeline_before_FS_NoMappa.sh <FilePath> <start> <end> <gtf> <exonfile> <codedir>
 6 | 
 7 | FilePath=$1
 8 | mystart=$2
 9 | myend=$3
10 | gtffile=$4
11 | exonfile=$5
12 | codedir=$6
13 | mkdir -p "${FilePath}/STARMapping"
14 | mkdir -p "${FilePath}/ChimericOut"
15 | mkdir -p "${FilePath}/Expr/"
16 | 
17 | 
18 | for ((i=${mystart};i<=${myend};i++))
19 | do
20 | 	# Empty when the cell has no chimeric output (ls reports that on stderr); such cells are skipped.
21 | 	file=`ls "${FilePath}/STARMapping/${i}"/*Chimeric.out.sam`
22 | 	if [[ -n ${file} ]]; then
23 | 		python "${codedir}/RmLowMappibility_ChimericRead_NoFilter.py" "${file}" "${FilePath}/ChimericOut/${i}.sam"
24 | 	fi
25 | done
26 | 
27 | # Annotate.sh and CalcRPKM.sh use bash-only "for((...))" loops, so the helpers are run
28 | # with bash: a POSIX-only sh (e.g. dash) rejects that syntax.
29 | bash "${codedir}/Annotate.sh" "${FilePath}/ChimericOut/" "${mystart}" "${myend}" "${gtffile}" "${codedir}"
30 | bash "${codedir}/FindFusionSupport.sh" "${FilePath}/ChimericOut/" "${mystart}" "${myend}" "${codedir}"
31 | bash "${codedir}/CalcRPKM.sh" "${exonfile}" "${FilePath}/STARMapping" "${FilePath}/Expr/" "${mystart}" "${myend}"


--------------------------------------------------------------------------------
/bin/CombinePipeline_startwith_ChiDist.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Final stage: run the R statistical model on the ChiDist tables, then tidy, threshold,
 3 | # attach supporting cells, apply the last filters and write the final output.
 4 | # Usage: CombinePipeline_startwith_ChiDist.sh <FilePath> <prefix|.> <numcell> <pv> <fakeprob> <gtf> <genepos> <codedir> <lncfilter> <nasfilter>
 5 | 
 6 | FilePath=$1
 7 | prefix=$2
 8 | numcell=$3
 9 | pv=$4
10 | fakeprob=$5
11 | gtf=$6
12 | genepos=$7
13 | codedir=$8
14 | lncfilter=$9
15 | nasfilter=${10}
16 | 
17 | # "." is the placeholder for "no prefix".
18 | if [ "${prefix}" = "." ]
19 | then
20 | 	prefix=""
21 | fi
22 | 
23 | mkdir -p "${FilePath}/FinalResult/"
24 | mkdir -p "${FilePath}/FinalResult/temp/"
25 | # Paths are quoted so directories with spaces survive word splitting. Plain values
26 | # (numcell, pv, fakeprob, lncfilter, nasfilter) stay unquoted so an empty one still
27 | # expands to no argument, as before.
28 | Rscript "${codedir}/ChiDist_2.2.1_Combine.R" ${numcell} "${FilePath}/ChiDist/${prefix}ChiDist.txt" "${FilePath}/ChiDist/${prefix}ChiDist_filtered.txt" "${FilePath}/FinalResult/temp/${prefix}Allresult.txt" "${FilePath}/FinalResult/temp/${prefix}pars.RData"
29 | python "${codedir}/TidyupFusionFinalResult.py" "${FilePath}/FinalResult/temp/${prefix}Allresult.txt" "${FilePath}/ChimericOut/" > "${FilePath}/FinalResult/temp/${prefix}Allresult_filtered.txt"
30 | python "${codedir}/Results_Filtered2Final.py" "${FilePath}/FinalResult/temp/${prefix}Allresult_filtered.txt" ${pv} ${fakeprob} ${numcell} > "${FilePath}/FinalResult/${prefix}Final.txt"
31 | python "${codedir}/TidyupFusionFinalResult_FindSupCell.py" "${FilePath}/FinalResult/${prefix}Final.txt" "${FilePath}/ChimericOut/" > "${FilePath}/FinalResult/${prefix}Final_Cells.txt"
32 | # NOTE(review): genepos is passed twice (2nd and 5th argument) - confirm against ResultLastFiltered.py.
33 | python "${codedir}/ResultLastFiltered.py" "${FilePath}/FinalResult/${prefix}Final_Cells.txt" "${genepos}" "${FilePath}/FinalResult/${prefix}Final_Cells_filtered.txt" "${gtf}" "${genepos}" ${lncfilter} ${nasfilter}
34 | python "${codedir}/ResultFinalOutput.py" "${FilePath}/FinalResult/${prefix}Final_Cells_filtered.txt" "${gtf}" "${FilePath}/ChimericOut/" "${FilePath}/FinalResult/${prefix}FinalOutput"
35 | 


--------------------------------------------------------------------------------
/bin/CombinePipeline_startwith_FS.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | FilePath=$1
 4 | mystart=$2
 5 | myend=$3
 6 | prefix=$4
 7 | hg19file=$5
 8 | gtf=$6
 9 | codedir=$7
10 | 
11 | if [ "${prefix}" = "." ]
12 | then
13 | 	prefix=""
14 | fi
15 | 
16 | mkdir -p ${FilePath}/ChiDist/
17 | python ${codedir}/FusionScore.py ${FilePath}/ChimericOut/ ${mystart} ${myend} ${FilePath}/Expr/ > ${FilePath}/ChimericOut/${prefix}FusionScore.txt
18 | python ${codedir}/FindHomoPattern_RAM.py ${FilePath}/ChimericOut/${prefix}FusionScore.txt ${hg19file} ${gtf} > ${FilePath}/ChiDist/${prefix}Homo.txt
19 | python ${codedir}/FindChiDist.py ${FilePath}/ChimericOut/ ${mystart} ${myend} ${FilePath}/Expr/ ${FilePath}/ChiDist/${prefix}Homo.txt ${prefix} > ${FilePath}/ChiDist/${prefix}ChiDist_middle.txt
20 | python ${codedir}/PreProcessing_SingleFile.py ${FilePath}/ChiDist/${prefix}FusionRead.txt ${prefix}
21 | 


--------------------------------------------------------------------------------
/bin/Data_preprocess_MyRetrain.py:
--------------------------------------------------------------------------------
  1 | from keras.models import Sequential
  2 | from keras.layers import Embedding,Dropout,Bidirectional,Flatten,Dense,LSTM,TimeDistributed
  3 | from keras.callbacks import ModelCheckpoint,CSVLogger
  4 | from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
  5 | import numpy as np
  6 | import sys
  7 | ########################################################################################################################
  8 | np.random.seed(1122)
  9 | ChimericFile = sys.argv[1]
 10 | FakeChimericFile = sys.argv[2]
 11 | Outdir = sys.argv[3]
 12 | with open(ChimericFile,'r') as f:
 13 |     ChimericRead_info = f.read()
 14 | 
 15 | ChimericRead_info = ChimericRead_info.split('\n')
 16 | 
 17 | ChimericRead =[]
 18 | # ChimericPoint=[]
 19 | Cont = 1
 20 | for readinfo in ChimericRead_info:
 21 |     readinfo_split = readinfo.split('\t')
 22 |     read = readinfo_split[0]
 23 |     try:
 24 |         MergePoint = int(readinfo_split[1])
 25 |     except:
 26 |         sys.stderr.write(readinfo)
 27 |     read_new = read[0:MergePoint]+'H'+read[MergePoint:]
 28 |     # read_new = read
 29 |     Point = np.zeros(60)
 30 |     Point[MergePoint-1] = 1
 31 |     Point[MergePoint] = 1
 32 |     if 'N' not in read_new:
 33 |         ChimericRead.append(read_new)
 34 |         read_new_inv = read_new.replace('A','5').replace('T','6').replace('C','7').replace('G','8')
 35 |         read_new_inv = read_new_inv[::-1]
 36 |         read_new_inv = read_new_inv.replace('5','T').replace('6','A').replace('7','G').replace('8','C')
 37 |         ChimericRead.append(read_new_inv)
 38 |         # ChimericPoint.append(Point)
 39 |     # print(Cont)
 40 |     Cont = Cont + 1
 41 | ########################################################################################################################
 42 | with open(FakeChimericFile,'r') as f:
 43 |     ManmadeRead_info = f.read()
 44 | 
 45 | ManmadeRead_info = ManmadeRead_info.split('\n')
 46 | 
 47 | ManmadeRead =[]
 48 | # ManmadePoint=[]
 49 | Cont = 1
 50 | for readinfo in ManmadeRead_info:
 51 |     readinfo_split = readinfo.split('\t')
 52 |     read = readinfo_split[0]
 53 |     try:
 54 |         MergePoint = int(readinfo_split[1])
 55 |     except:
 56 |         sys.stderr.write(readinfo)
 57 |     read_new = read[0:MergePoint]+'H'+read[MergePoint:]
 58 |     # read_new = read
 59 |     Point = np.zeros(61)
 60 |     Point[MergePoint-1] = 1
 61 |     Point[MergePoint] = 1
 62 | 
 63 |     if 'N' not in read_new:
 64 |         ManmadeRead.append(read_new)
 65 |         read_new_inv = read_new.replace('A','5').replace('T','6').replace('C','7').replace('G','8')
 66 |         read_new_inv = read_new_inv[::-1]
 67 |         read_new_inv = read_new_inv.replace('5','T').replace('6','A').replace('7','G').replace('8','C') 
 68 |         ManmadeRead.append(read_new_inv)
 69 |         # ManmadePoint.append(Point)
 70 | 
 71 |     # print(Cont)
 72 |     Cont = Cont + 1
 73 | ########################################################################################################################
 74 | 
 75 | Data1 = np.ndarray(shape=(len(ChimericRead),61,1),dtype=float)
 76 | for index in range(len(ChimericRead)):
 77 |     Data1[index,:,0] = np.array([int(c) for c in ChimericRead[index].replace('A','0').replace('T','1').replace('C','2').replace('G','3').replace('H','4')])
 78 |     # Data1[index,:,1] = ManmadePoint[index]
 79 | 
 80 | Data2 = np.ndarray(shape=(len(ManmadeRead),61,1),dtype=float)
 81 | for index in range(len(ManmadeRead)):
 82 |     Data2[index,:,0] = np.array([int(c) for c in ManmadeRead[index].replace('A','0').replace('T','1').replace('C','2').replace('G','3').replace('H','4')])
 83 |     # Data2[index,:,1] = ManmadePoint[index]
 84 | 
 85 | ########################################################################################################################
 86 | 
 87 | DataNum = min(Data1.shape[0],Data2.shape[0])
 88 | Data1 = Data1[0:DataNum,:,:]
 89 | Data2 = Data2[0:DataNum,:,:]
 90 | 
 91 | TraNum = int(DataNum*0.7)
 92 | 
 93 | Good_for_Tra = Data1[0:TraNum,:,:]
 94 | Simu_for_Tra = Data2[0:TraNum,:,:]
 95 | 
 96 | Good_for_Tst = Data1[TraNum:,:,:]
 97 | Simu_for_Tst = Data2[TraNum:,:,:]
 98 | 
 99 | 
100 | LIST = list(range(Good_for_Tra.shape[0]))
101 | np.random.shuffle(LIST)
102 | Good_for_Tra = Good_for_Tra[LIST,:,:]
103 | Simu_for_Tra = Simu_for_Tra[LIST,:,:]
104 | 
105 | 
106 | LIST = list(range(Good_for_Tst.shape[0]))
107 | np.random.shuffle(LIST)
108 | Good_for_Tst = Good_for_Tst[LIST,:,:]
109 | Simu_for_Tst = Simu_for_Tst[LIST,:,:]
110 |     
111 | 
112 | np.save(Outdir + '/Good_for_Tra.npy',Good_for_Tra)
113 | np.save(Outdir + '/Simu_for_Tra.npy',Simu_for_Tra)
114 | 
115 | np.save(Outdir + '/Good_for_Tst.npy',Good_for_Tst)
116 | np.save(Outdir + '/Simu_for_Tst.npy',Simu_for_Tst)
117 | 
118 | ########################################################################################################################
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 


--------------------------------------------------------------------------------
/bin/ExtractChimericRead4Retrain.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import division
 3 | import sys
 4 | import math
 5 | import random
 6 | 
 7 | 
 8 | # ***** readme *****
 9 | # This code extracts chimeric read from sam file for training, with pos and direction
10 | # The input is *ChiDist_middle.txt
def select_training_lines(lines):
    """Pick the random subset of input lines that is used for retraining.

    The first tenth of the lines is never used. From the remainder,
    ``min(int(0.4 * n), max(15000, n / 9))`` lines (rounded down) are sampled
    without replacement, where ``n`` is the total number of lines. Uses the
    global ``random`` state, so seed it beforehand for reproducible output.
    """
    totallines = len(lines)
    # int() keeps the sample size an integer on Python 2 as well, where
    # math.floor() returns a float that random.sample() cannot use.
    samplesize = int(math.floor(min(int(totallines * 0.4), max(15000, totallines / 9))))
    return random.sample(lines[int(totallines / 10):], samplesize)


def format_training_record(line):
    """Turn one input line into an output record, or return None to skip it.

    The output has four tab-separated fields built from the input columns:
    ``info[-4]``, ``info[-3]``, ``info[6]:info[8]:info[-2]`` and
    ``info[7]:info[9]:info[-1]``.

    A line is skipped when it has fewer than ten columns (this includes blank
    lines) or when either gene name (columns 0 and 1) starts with ``IG`` or
    ``TRA``.
    NOTE(review): the prefix test also matches genes such as IGF1 or TRAF3 -
    confirm that this is intended.
    """
    info = line.rstrip().split('\t')
    if len(info) < 10:
        return None
    gene1 = info[0]
    gene2 = info[1]
    if gene1.startswith(('IG', 'TRA')) or gene2.startswith(('IG', 'TRA')):
        return None
    return info[-4] + '\t' + info[-3] + '\t' + info[6] + ':' + info[8] + ':' + info[-2] + '\t' + info[7] + ':' + info[9] + ':' + info[-1]


def main(argv):
    """Command-line entry point: ``script <ChiDist_middle.txt>``; records go to stdout."""
    random.seed(1122)
    with open(argv[1]) as infile:
        lines = infile.readlines()
    for line in select_training_lines(lines):
        record = format_training_record(line)
        if record is not None:
            print(record)


# The work only runs when the file is executed as a script, which is how the
# pipeline starts it; importing the module no longer reads sys.argv.
if __name__ == '__main__':
    main(sys.argv)
24 | 


--------------------------------------------------------------------------------
/bin/ExtractSimulatedChimericRead4Retrain.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | import sys
  4 | import random
  5 | import os
  6 | import pysam
  7 | 
  8 | # ***** readme *****
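  9 | # This code simulates artificial chimeric reads for training: it joins 30-base segments taken from two randomly chosen aligned reads and reports each segment's chromosome, position and direction.
 10 | # Inputs: a chimeric-read file (only its number of lines is used) and a directory with one sub-folder per cell that contains an *Aligned.sortedByCoord.out.bam file.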
 11 | 
 12 | def ReverseComplement(str):
 13 |     return str[::-1].replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'g').upper()
 14 | 
 15 | 
 16 | random.seed(1122)
 17 | chimericfile = open(sys.argv[1])
 18 | mappingpath = sys.argv[2]
 19 | linenum = len(chimericfile.readlines())
 20 | chimericfile.close()
 21 | count = 0
 22 | cellindex = []
 23 | for dir in os.listdir(mappingpath):
 24 |     cellindex.append(dir)
 25 | while count < linenum:
 26 |     thisindex = random.sample(cellindex, 1)[0]
 27 |     try:
 28 |         found = False
 29 |         for file in os.listdir(mappingpath + thisindex):
 30 |             if file.find('Aligned.sortedByCoord.out.bam') > -1:
 31 |                 samfile = pysam.AlignmentFile(mappingpath + thisindex + '/' + file, 'rb')
 32 |                 found = True
 33 |                 break
 34 |     except:
 35 |         continue
 36 |     if not found:
 37 |         continue
 38 |     sam = []
 39 |     for r in samfile:
 40 |         sam.append(r)
 41 |     thiscount = 0
 42 |     while thiscount < len(sam) / 5 and count < linenum:
 43 |         while True:
 44 |             a = random.randint(1, len(sam)) - 1
 45 |             b = random.randint(1, len(sam)) - 1
 46 |             chr1 = str(sam[a].reference_name)
 47 |             chr2 = str(sam[b].reference_name)
 48 |             allread1 = sam[a].seq
 49 |             allread2 = sam[b].seq
 50 |             readlength = len(allread1)
 51 |             if not chr1.startswith('chr'):
 52 |                 chr1 = 'chr' + chr1
 53 |             if not chr2.startswith('chr'):
 54 |                 chr2 = 'chr' + chr2
 55 |             if len(sam[a].cigar) > 1 or len(sam[b].cigar) > 1:
 56 |                 continue
 57 |             if not (chr1 == '*' or chr2 == '*' or chr1 == 'chrM' or chr2 == 'chrM'):
 58 |                 break
 59 |         read1length = 30
 60 |         read2length = 60 - read1length
 61 |         c = random.randint(0, 60 - read1length - 1)
 62 |         d = random.randint(0, 60 - read2length - 1)
 63 |         try:
 64 |             read1 = sam[a].seq[c:c + read1length]
 65 |         except:
 66 |             sys.stderr.write(str(sam[a]))
 67 |         line2 = sam[b]
 68 |         try:
 69 |             read2 = sam[b].seq[d:d + read2length]
 70 |         except:
 71 |             sys.stderr.write(str(sam[b]))
 72 |         e = random.randint(0, 1)
 73 |         f = random.randint(0, 1)
 74 |         if e == 0:
 75 |             e = -1
 76 |             read2 = ReverseComplement(read2)
 77 |             pos2 = sam[b].pos + d + read2length - 1
 78 |         else:
 79 |             pos2 = sam[b].pos + d
 80 |         if f == 0:
 81 |             f = -1
 82 |             read1 = ReverseComplement(read1)
 83 |             pos1 = sam[a].pos + c
 84 |         else:
 85 |             pos1 = sam[a].pos + c + read1length - 1
 86 | 
 87 | 
 88 |         if f == -1:
 89 |             direct1 = '+'
 90 |         else:
 91 |             direct1 = '-'
 92 |         if e == 1:
 93 |             direct2 = '+'
 94 |         else:
 95 |             direct2 = '-'
 96 |         if read1.find('N') == -1 and read2.find('N') == -1 and len(read1 + read2) == 60:
 97 |             print(read1.upper() + read2.upper() + '\t' + str(read1length) + '\t', end='')
 98 |             print(chr1 + ':' + str(pos1) + ':' + direct1 + '\t' + chr2 + ':' + str(pos2) + ':' + direct2)
 99 |             count += 1
100 |             thiscount += 1
101 |     samfile.close()


--------------------------------------------------------------------------------
/bin/FilterChiDist.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import division
 3 | import sys
 4 | import numpy
 5 | 
 6 | # ***** readme *****
 7 | # This code filters out the fusion candidates that have fewer than 3 supporters.
 8 | 
 9 | 
def MakeString(mylist, sep='\t'):
    """Join the items of ``mylist`` into one string separated by ``sep``.

    Every item is converted with ``str()``, so the last element may be a
    non-string as well (it used to raise ``TypeError``). An empty list yields
    the empty string instead of raising ``IndexError``.

    :param mylist: sequence of items to join.
    :param sep: separator placed between consecutive items.
    :return: the joined string.
    """
    return sep.join(str(item) for item in mylist)
16 | 
17 | 
bigcount = 0
templines = []
linedic = {}
with open(sys.argv[1]) as ChiDistFile:
    for line in ChiDistFile:
        if line[0] == '#':
            continue
        info = line.split('\t')
        if len(info) < 4:
            continue
        # Column 3 holds a bracketed, ', '-separated list of counts; the
        # entries may be wrapped in single quotes.
        cc = info[3].replace('\n', '').replace('\r', '')
        try:
            # Keep the entries whose value is at least 1. The previous
            # list.remove() call looked for the unquoted text and therefore
            # failed on quoted entries, which silently dropped the whole
            # record.
            count = [item for item in cc[1:-1].split(', ') if int(item.replace("'", '')) >= 1]
        except ValueError:
            # Malformed count list: skip the record, as before.
            continue
        # Records with a single remaining entry are dropped; records with
        # none used to be dropped through a swallowed ZeroDivisionError.
        if len(count) <= 1:
            continue
        mysum = sum(int(item.replace("'", '')) for item in count)
        if mysum / len(count) > 5:
            continue
        info[3] = '[' + MakeString(count, ', ') + ']'
        info[2] = len(count)
        newline = MakeString(info)
        # When column 3 is the last column the rebuilt record has lost its
        # line break; restore it so that output records do not run together.
        if not newline.endswith('\n'):
            newline += '\n'
        templines.append(newline)
for line in templines:
    info = line.split('\t')
    # NOTE(review): the score is the length of the count list's *text*, not
    # the number of entries - confirm that this is intended.
    score = len(info[3])
    linedic[line] = score
numrecord = len(linedic)
# The highest-scoring records are withheld: at most 10, and at most one per
# 50 records.
skiplimit = min(10.0, numpy.floor(numrecord / 50))
for key in sorted(linedic, key=linedic.__getitem__, reverse=True):
    if bigcount < skiplimit:
        bigcount += 1
    else:
        print(key, end='')
55 | 


--------------------------------------------------------------------------------
/bin/FindChiDist.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | import sys
  4 | import math
  5 | import os
  6 | import numpy
  7 | 
  8 | 
  9 | # ***** readme *****
 10 | # This code finds the distribution of the chimeric read in each cell for each gene.
 11 | 
 12 | 
 13 | def ReverseComplement(str):
 14 |     return str[::-1].replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'g').upper()
 15 | 
 16 | 
 17 | def chr2num(str):
 18 |     if str.startswith('chr'):
 19 |         str = str[3:]
 20 |     if str == 'X':
 21 |         return 23
 22 |     if str == 'Y':
 23 |         return 24
 24 |     return int(str)
 25 | 
 26 | 
 27 | def SolveClip(str, readlen):
 28 |     num = []
 29 |     alp = []
 30 |     res = []
 31 |     start = 0
 32 |     scount = 0
 33 |     Ndel = 0
 34 |     length = len(str)
 35 |     lastalp = -1
 36 |     for i in range(length):
 37 |         if str[i].isalpha():
 38 |             alp.append(str[i])
 39 |             num.append(int(str[lastalp + 1:i]))
 40 |             lastalp = i
 41 |     i = 0
 42 |     while i < len(alp):
 43 |         if alp[i] == 'N':
 44 |             for j in range(i):
 45 |                 if alp[j] == 'S':
 46 |                     scount = 1
 47 |                     break
 48 |             del alp[i]
 49 |             Ndel += num[i]
 50 |             del num[i]
 51 |             continue
 52 |         i += 1
 53 |     lastm = -1
 54 |     i = 0
 55 |     while i < len(alp):
 56 |         if alp[i] == 'M':
 57 |             if lastm == -1:
 58 |                 lastm = i
 59 |                 continue
 60 |             if lastm == i - 1:
 61 |                 num[i - 1] += num[i]
 62 |                 del alp[i]
 63 |                 del num[i]
 64 |                 continue
 65 |             lastm = i
 66 |         i += 1
 67 |     numsum = [num[0]]
 68 |     for i in range(1, len(num)):
 69 |         numsum.append(numsum[-1] + num[i])
 70 |     for i in range(len(alp)):
 71 |         if alp[i] == 'M':
 72 |             if i == 0:
 73 |                 res = [numsum[0]]
 74 |             elif i == len(alp) - 1:
 75 |                 res.append(numsum[i - 1])
 76 |             else:
 77 |                 res.append(numsum[i - 1])
 78 |                 res.append(numsum[i])
 79 |     for i in range(len(alp)):
 80 |         if alp[i] == 'M':
 81 |             if i > 0:
 82 |                 start = numsum[i - 1]
 83 |             break
 84 |     return [res, alp, start, 0, Ndel, scount]
 85 | 
 86 | 
 87 | def AddCount(genename, q=1.0):
 88 |     genename = genename.split(';')
 89 |     for gene in genename:
 90 |         gene = gene.split('.')[0]
 91 |         if gene in ExprDic:
 92 |             if ExprDic[gene][1] == i:
 93 |                 ExprDic[gene][0][-1] += q
 94 |             else:
 95 |                 for k1 in range(ExprDic[gene][1]+1, i):
 96 |                     ExprDic[gene][0].append(0)
 97 |                 ExprDic[gene][0].append(q)
 98 |                 ExprDic[gene][1] = i
 99 |         else:
100 |             ExprDic[gene] = [[], i]
101 |             for k1 in range(start, i):
102 |                 ExprDic[gene][0].append(0)
103 |             ExprDic[gene][0].append(q)
104 | 
105 | 
def AddCount2(genename, q=1.0):
    """Add ``q`` to the gene's entry in the module-level ``ChimericExprDic``.

    The part of ``genename`` after the first ``'.'`` is ignored. A gene that
    has no entry yet starts at ``q``.
    """
    key = genename.split('.')[0]
    try:
        ChimericExprDic[key] += q
    except KeyError:
        ChimericExprDic[key] = q
112 | 
113 | 
114 | argnum = len(sys.argv)
115 | filedir = sys.argv[1]
116 | start = int(sys.argv[2])
117 | last = int(sys.argv[3])
118 | ExprDir = sys.argv[4]
119 | HomologyResultFile = open(sys.argv[5])
120 | FusionMatrix = {}
121 | EncompassingMatrix = {}
122 | FusionPos = {}
123 | Clusters = {}
124 | ClusterSize = {}
125 | ClusterCount = {}
126 | ExprDic = {}
127 | ChimericExprDic = {}
128 | GeneDictionary = {}
129 | HomologyDic = {}
130 | good_dist = 1
131 | goodcount = 0
132 | prefix = ''
133 | if argnum == 7:
134 |     prefix = sys.argv[6]
135 | if prefix == '.':
136 |     prefix = ''
137 | readfile = open(filedir + '../ChiDist/' + prefix + 'FusionRead.txt', 'w')
138 | # for finding reads near the brkpnts
139 | CandidateList = []
140 | CandidateVector = {}
141 | 
142 | 
143 | for line in HomologyResultFile.readlines():
144 |     if line[0] == '#':
145 |         continue
146 |     info = line.split('\t')
147 |     try:
148 |         fusionscore = float(info[2])
149 |         chr1 = info[3]
150 |         pos1 = info[5]
151 |         chr2 = info[4].replace(' ', '')
152 |         pos2 = info[6]
153 |         homoscore = info[11]
154 |         gccontent = [info[12], info[13]]
155 |         cellcount = info[8]
156 |         read1 = info[16]
157 |         read2 = info[17].replace('\n', '').replace('\r', '')
158 |         HomologyDic[chr1 + '\t' + chr2 + '\t' + pos1 + '\t' + pos2] = [homoscore, gccontent, cellcount, read1, read2]
159 |     except:
160 |         pass
161 | HomologyResultFile.close()
162 | for i in range(int(start), int(last) + 1):
163 |     try:
164 |         infile = open(filedir + '/' + str(i) + '_FusionSupport.txt')
165 |         exprfile = open(ExprDir + '/' + str(i) + '.rpkm.txt')
166 |         samfile = open(filedir + '/' + str(i) + '_geneanno.sam')
167 |     except:
168 |         continue
169 |     foundcount = 0
170 |     for key in CandidateVector:
171 |         if CandidateVector[key]:
172 |             foundcount += 1
173 |     sys.stderr.write('Starting: ' + str(i) + '\tCandidate Size: ' + str(len(CandidateVector)) + '\tFound Size: ' + str(foundcount) + '\n')
174 |     lastgene = ''
175 |     totalrc = 0
176 |     for line in exprfile.readlines():
177 |         info = line.split('\t')
178 |         gene = info[0]
179 |         rc = int(info[1])
180 |         if gene == lastgene:
181 |             totalrc += rc
182 |             continue
183 |         if lastgene != '':
184 |             AddCount(lastgene, totalrc)
185 |         lastgene = gene
186 |         totalrc = rc
187 |     AddCount(lastgene, totalrc)
188 |     lastname = ''
189 |     lines = []
190 |     badreadname = ''
191 |     allsamfilelines = samfile.readlines()
192 |     readlength = 1000
193 |     for line in allsamfilelines:
194 |         if line[0] == '#':
195 |             continue
196 |         info = line.split('\t')
197 |         thisname = info[1]
198 |         if thisname == badreadname:
199 |             continue
200 |         if thisname == lastname:
201 |             lines.append(line)
202 |         else:
203 |             if len(lines) == 2:
204 |                 info1 = lines[0].split('\t')
205 |                 info2 = lines[1].split('\t')
206 |                 if info1[0] != info2[0]:
207 |                     gene1 = info1[0]
208 |                     gene2 = info2[0]
209 |                     AddCount(gene1)
210 |                     AddCount(gene2)
211 |                     AddCount2(gene1)
212 |                     AddCount2(gene2)
213 |             elif len(lines) == 3:
214 |                 info1 = lines[0].split('\t')
215 |                 info2 = lines[1].split('\t')
216 |                 info3 = lines[2].split('\t')
217 |                 gene1 = info1[0].split('.')[0]
218 |                 gene2 = info2[0].split('.')[0]
219 |                 gene3 = info3[0].split('.')[0]
220 |                 if gene1 != gene2 or gene2 != gene3:
221 |                     readlength = len(info1[10])
222 |                     if info1[10] == info2[10] or info1[10] == ReverseComplement(info2[10]):
223 |                         AddCount(gene1, 0.5)
224 |                         AddCount(gene2, 0.5)
225 |                         AddCount(gene3)
226 |                         AddCount2(gene1, 0.5)
227 |                         AddCount2(gene2, 0.5)
228 |                         AddCount2(gene3)
229 |                         readinfo1 = info1
230 |                         readinfo2 = info2
231 |                     elif info1[10] == info3[10] or info1[10] == ReverseComplement(info3[10]):
232 |                         AddCount(gene1, 0.5)
233 |                         AddCount(gene2)
234 |                         AddCount(gene3, 0.5)
235 |                         AddCount2(gene1, 0.5)
236 |                         AddCount2(gene2)
237 |                         AddCount2(gene3, 0.5)
238 |                         readinfo1 = info1
239 |                         readinfo2 = info3
240 |                     elif info2[10] == info3[10] or info2[10] == ReverseComplement(info3[10]):
241 |                         AddCount(gene1)
242 |                         AddCount(gene2, 0.5)
243 |                         AddCount(gene3, 0.5)
244 |                         AddCount2(gene1)
245 |                         AddCount2(gene2, 0.5)
246 |                         AddCount2(gene3, 0.5)
247 |                         readinfo1 = info3
248 |                         readinfo2 = info2
249 |                     else:
250 |                         #sys.stderr.write('!!!!!' + info1[1])
251 |                         lines = [line]
252 |                         lastname = thisname
253 |                         readinfo2 = [1]
254 |                         readinfo1 = [1]  # don't run codes below, just want to read lines
255 |             lines = [line]
256 |             lastname = thisname
257 | 
258 |     CandidateList = []
259 |     for line in infile.readlines():
260 |         info = line.split('\t')
261 |         gene1 = info[0]
262 |         gene2 = info[1]
263 |         chromo1 = info[4]
264 |         chromo2 = info[5]
265 |         encompass = int(info[2])
266 |         if gene1 + '\t' + gene2 in EncompassingMatrix:
267 |             if encompass > 0:
268 |                 EncompassingMatrix[gene1 + '\t' + gene2].append(encompass)
269 |         elif gene2 + '\t' + gene1 in EncompassingMatrix:
270 |             if encompass > 0:
271 |                 EncompassingMatrix[gene2 + '\t' + gene1].append(encompass)
272 |         else:
273 |             if encompass > 0:
274 |                 EncompassingMatrix[gene1 + '\t' + gene2] = [encompass]
275 |             else:
276 |                 EncompassingMatrix[gene1 + '\t' + gene2] = []
277 |         splitcount = int(info[3])
278 |         if splitcount == 0:
279 |             continue
280 |         splitreadinfo = info[6].split(';')
281 |         splitreadinfo = list(set(splitreadinfo))        # remove duplication
282 |         subscore = []
283 |         Pos = {}
284 |         subgrp = []
285 |         for item in splitreadinfo:
286 |             if len(item) > 3:
287 |                 iteminfo = item.split('+')
288 |                 if not 6 <= int(iteminfo[1]) <= readlength - 6:
289 |                     continue
290 |                 pos = iteminfo[0].split(',')
291 |                 if pos[0] + ',' + pos[1] in Pos:
292 |                     Pos[pos[0] + ',' + pos[1]] += 1
293 |                 else:
294 |                     Pos[pos[0] + ',' + pos[1]] = 1
295 |                 badmap = iteminfo[2].split(',')
296 |                 # editscore = max(50 - abs(50 - int(iteminfo[1])) - int(badmap[0]) - int(badmap[1]), 0)
297 |                 ssscore = 1
298 |                 if len(subgrp) == 0:
299 |                     subgrp = [{pos[0] + ',' + pos[1]: 1}]
300 |                     subscore.append(ssscore)
301 |                 else:
302 |                     fin = False
303 |                     for j in range(len(subgrp)):
304 |                         for key in subgrp[j]:
305 |                             grpos0 = key.split(',')[0]
306 |                             grpos1 = key.split(',')[1]
307 |                             if abs(int(pos[0]) - int(grpos0)) <= 3 * good_dist and abs(int(pos[1]) - int(grpos1)) <= 3 * good_dist:
308 |                                 fin = True
309 |                                 if pos[0] + ',' + pos[1] in subgrp[j]:
310 |                                     subgrp[j][pos[0] + ',' + pos[1]] += 1
311 |                                 else:
312 |                                     subgrp[j][pos[0] + ',' + pos[1]] = 1
313 |                                 subscore[j] += ssscore
314 |                                 break
315 |                         if fin:
316 |                             break
317 |                     if not fin:
318 |                         subgrp.append({pos[0] + ',' + pos[1]: 1})
319 |                         subscore.append(ssscore)
320 |         if len(subgrp) == 0:
321 |             continue
322 |         rawscore = []
323 |         for j in subscore:
324 |             if j > 0.5:
325 |                 rawscore.append(j)
326 |             else:
327 |                 rawscore.append(0)
328 |         rev = False
329 |         AvePos = []
330 |         for mdict in subgrp:  # calculate average pos for each cluster
331 |             left = 0
332 |             right = 0
333 |             cc = 0
334 |             for key in mdict:
335 |                 pos = key.split(',')
336 |                 left += int(pos[0]) * mdict[key]
337 |                 right += int(pos[1]) * mdict[key]
338 |                 cc += mdict[key]
339 |             left = int(left / cc)
340 |             right = int(right / cc)
341 |             AvePos.append([left, right, cc])
342 |         if gene1 + '\t' + gene2 in FusionMatrix:
343 |             added = list(numpy.zeros(len(FusionMatrix[gene1 + '\t' + gene2])))
344 |             for j in range(len(subgrp)):
345 |                 dist = []
346 |                 for k in FusionPos[gene1 + '\t' + gene2][1]:
347 |                     d1 = abs(k[1][0] - AvePos[j][0])
348 |                     d2 = abs(k[1][1] - AvePos[j][1])
349 |                     if d1 <= good_dist and d2 <= good_dist:
350 |                         dist.append(d1 + d2)
351 |                     else:
352 |                         dist.append(10 * good_dist)
353 |                 if min(dist) >= 3 * good_dist:
354 |                     FusionPos[gene1 + '\t' + gene2][1].append([subgrp[j], AvePos[j], [i]])
355 |                     FusionMatrix[gene1 + '\t' + gene2].append([rawscore[j]])
356 |                     added.append(0)
357 |                 else:
358 |                     smallindex = dist.index(min(dist))
359 |                     if added[smallindex] == 0:
360 |                         FusionMatrix[gene1 + '\t' + gene2][smallindex].append(rawscore[j])
361 |                         added[smallindex] = 1
362 |                     else:
363 |                         FusionMatrix[gene1 + '\t' + gene2][smallindex][-1] += (rawscore[j])
364 |                     old = FusionPos[gene1 + '\t' + gene2][1][smallindex][1]
365 |                     newcc = AvePos[j][2] + old[2]
366 |                     newleft = int((old[2] * old[0] + AvePos[j][2] * AvePos[j][0]) / newcc)
367 |                     newright = int((old[2] * old[1] + AvePos[j][2] * AvePos[j][1]) / newcc)
368 |                     FusionPos[gene1 + '\t' + gene2][1][smallindex][1] = [newleft, newright, newcc]
369 |                     FusionPos[gene1 + '\t' + gene2][1][smallindex][2].append(i)
370 |                 if not [gene1, gene2, chromo1, chromo2, AvePos[j][0], AvePos[j][1]] in CandidateList and not [gene2, gene1, chromo2, chromo1, AvePos[j][1], AvePos[j][0]] in CandidateList:
371 |                     CandidateList.append([gene1, gene2, chromo1, chromo2, AvePos[j][0], AvePos[j][1]])
372 |             gene = gene1 + '\t' + gene2
373 |         elif gene2 + '\t' + gene1 in FusionMatrix:
374 |             added = list(numpy.zeros(len(FusionMatrix[gene2 + '\t' + gene1])))
375 |             for j in range(len(subgrp)):
376 |                 dist = []
377 |                 for k in FusionPos[gene2 + '\t' + gene1][1]:
378 |                     d1 = abs(k[1][0] - AvePos[j][1])
379 |                     d2 = abs(k[1][1] - AvePos[j][0])
380 |                     if d1 <= good_dist and d2 <= good_dist:
381 |                         dist.append(d1 + d2)
382 |                     else:
383 |                         dist.append(10 * good_dist)
384 |                 if min(dist) >= 3 * good_dist:
385 |                     FusionPos[gene2 + '\t' + gene1][1].append(
386 |                         [subgrp[j], [AvePos[j][1], AvePos[j][0], AvePos[j][2]], [i]])
387 |                     FusionMatrix[gene2 + '\t' + gene1].append([rawscore[j]])
388 |                     added.append(0)
389 |                 else:
390 |                     smallindex = dist.index(min(dist))
391 |                     if added[smallindex] == 0:
392 |                         FusionMatrix[gene2 + '\t' + gene1][smallindex].append(rawscore[j])
393 |                         added[smallindex] = 1
394 |                     else:
395 |                         FusionMatrix[gene2 + '\t' + gene1][smallindex][-1] += (rawscore[j])
396 |                     old = FusionPos[gene2 + '\t' + gene1][1][smallindex][1]
397 |                     newcc = AvePos[j][2] + old[2]
398 |                     newleft = int((old[2] * old[0] + AvePos[j][2] * AvePos[j][1]) / newcc)
399 |                     newright = int((old[2] * old[1] + AvePos[j][2] * AvePos[j][0]) / newcc)
400 |                     FusionPos[gene2 + '\t' + gene1][1][smallindex][1] = [newleft, newright, newcc]
401 |                     FusionPos[gene2 + '\t' + gene1][1][smallindex][2].append(i)
402 |                 if not [gene1, gene2, chromo1, chromo2, AvePos[j][0], AvePos[j][1]] in CandidateList and not [gene2, gene1, chromo2, chromo1, AvePos[j][1], AvePos[j][0]] in CandidateList:
403 |                     CandidateList.append([gene1, gene2, chromo1, chromo2, AvePos[j][0], AvePos[j][1]])
404 |             gene = gene2 + '\t' + gene1
405 |             rev = True
406 |         else:
407 |             FusionMatrix[gene1 + '\t' + gene2] = []
408 |             for item in rawscore:
409 |                 FusionMatrix[gene1 + '\t' + gene2].append([item])
410 |             FusionPos[gene1 + '\t' + gene2] = [[chromo1, chromo2], []]
411 |             for k in range(len(subgrp)):
412 |                 FusionPos[gene1 + '\t' + gene2][1].append([subgrp[k], AvePos[k], [i]])
413 |                 if not [gene1, gene2, chromo1, chromo2, AvePos[k][0], AvePos[k][1]] in CandidateList and not [gene2, gene1, chromo2, chromo1, AvePos[k][1], AvePos[k][0]] in CandidateList:
414 |                     CandidateList.append([gene1, gene2, chromo1, chromo2, AvePos[k][0], AvePos[k][1]])
415 |             gene = gene1 + '\t' + gene2
416 | 
417 |     for l in range(len(CandidateList)):
418 |         try:
419 |             aa = chr2num(CandidateList[l][2])
420 |             aa = chr2num(CandidateList[l][3])
421 |             thischr1 = chr2num(CandidateList[l][2])
422 |             thischr2 = chr2num(CandidateList[l][3])
423 |         except:
424 |             pass
425 |         pos1 = int(CandidateList[l][4])
426 |         pos2 = int(CandidateList[l][5])
427 |         if thischr1 > thischr2 or (thischr1 == thischr2 and pos2 > pos1):
428 |             thischr1, thischr2 = thischr2, thischr1
429 |             pos1, pos2 = pos2, pos1
430 |         if not (thischr1, thischr2, pos1, pos2) in CandidateVector:
431 |             CandidateVector[(thischr1, thischr2, pos1, pos2)] = []
432 | 
433 |     for line in allsamfilelines:
434 |         if line[0] == '#':
435 |             continue
436 |         info = line.split('\t')
437 |         thisname = info[1]
438 |         if thisname == badreadname:
439 |             continue
440 |         if thisname == lastname:
441 |             lines.append(line)
442 |         else:
443 |             if len(lines) == 2:
444 |                 info1 = lines[0].split('\t')
445 |                 info2 = lines[1].split('\t')
446 |             elif len(lines) == 3:
447 |                 info1 = lines[0].split('\t')
448 |                 info2 = lines[1].split('\t')
449 |                 info3 = lines[2].split('\t')
450 |                 gene1 = info1[0].split('.')[0]
451 |                 gene2 = info2[0].split('.')[0]
452 |                 gene3 = info3[0].split('.')[0]
453 |                 if gene1 != gene2 or gene2 != gene3:
454 |                     if info1[10] == info2[10] or info1[10] == ReverseComplement(info2[10]):
455 |                         readinfo1 = info1
456 |                         readinfo2 = info2
457 |                     elif info1[10] == info3[10] or info1[10] == ReverseComplement(info3[10]):
458 |                         readinfo1 = info1
459 |                         readinfo2 = info3
460 |                     elif info2[10] == info3[10] or info2[10] == ReverseComplement(info3[10]):
461 |                         readinfo1 = info3
462 |                         readinfo2 = info2
463 |                     else:
464 |                         # sys.stderr.write('!!!!!' + info1[1])
465 |                         lines = [line]
466 |                         lastname = thisname
467 |                         readinfo2 = [1]
468 |                         readinfo1 = [1]  # don't run codes below, just want to read lines
469 |                     if len(readinfo1) > 1 and len(gene1) > 1 and len(gene2) > 1 and len(gene3) > 1:  # normal conditions
470 |                         gene1 = readinfo1[0]
471 |                         gene2 = readinfo2[0]
472 |                         chromo1 = readinfo1[3]
473 |                         chromo2 = readinfo2[3]
474 |                         if not chromo1.startswith('chr'):
475 |                             chromo1 = 'chr' + chromo1
476 |                         if not chromo2.startswith('chr'):
477 |                             chromo2 = 'chr' + chromo2
478 |                         clip1 = readinfo1[6]
479 |                         clip2 = readinfo2[6]
480 |                         readlen = len(readinfo1[10])
481 |                         clipsplit1 = SolveClip(clip1, readlen)
482 |                         clipsplit2 = SolveClip(clip2, readlen)
483 |                         cc = False
484 |                         splitpnt1 = -1
485 |                         splitpnt2 = -1
486 |                         for i in range(len(clipsplit1[0])):
487 |                             for j in range(len(clipsplit2[0])):
488 |                                 if clipsplit2[0][j] == clipsplit1[0][i] or clipsplit2[0][j] + clipsplit1[0][i] == readlen:
489 |                                     splitpnt1 = clipsplit1[0][i]
490 |                                     splitpnt2 = clipsplit2[0][j]
491 |                                     cc = True
492 |                                     break
493 |                             if cc:
494 |                                 break
495 |                         if readlen - 0.5 > splitpnt1 > -0.5 and readlen - 0.5 > splitpnt2 > -0.5:
496 |                             if clipsplit1[1][i] == 'S':
497 |                                 brkpnt1 = int(readinfo1[4])
498 |                                 direct1 = '+'
499 |                             else:
500 |                                 brkpnt1 = splitpnt1 - clipsplit1[2] + int(readinfo1[4]) + clipsplit1[4] - 1
501 |                                 direct1 = '-'
502 |                             if clipsplit2[1][j] == 'S':
503 |                                 brkpnt2 = int(readinfo2[4])
504 |                                 direct2 = '+'
505 |                             else:
506 |                                 brkpnt2 = splitpnt2 - clipsplit2[2] + int(readinfo2[4]) + clipsplit2[4] - 1
507 |                                 direct2 = '-'
508 |                             try:
509 |                                 aa = chr2num(chromo1)
510 |                                 aa = chr2num(chromo2)
511 |                                 thisposlist = [chr2num(chromo1), chr2num(chromo2), brkpnt1, brkpnt2]
512 |                             except:
513 |                                 thisposlist = [chromo1, chromo2, brkpnt1, brkpnt2]
514 |                             exchange = False
515 |                             if thisposlist[0] > thisposlist[1] or (thisposlist[0] == thisposlist[1] and thisposlist[3] > thisposlist[2]):
516 |                                 thisposlist = [thisposlist[1], thisposlist[0], thisposlist[3], thisposlist[2]]
517 |                                 exchange = True
518 |                             found = False
519 |                             for k1 in range(-3, 4):
520 |                                 for k2 in range(-3, 4):
521 |                                     if (thisposlist[0], thisposlist[1], thisposlist[2] + k1, thisposlist[3] + k2) in CandidateVector:
522 |                                         found = True
523 |                                         thisposindex = (thisposlist[0], thisposlist[1], thisposlist[2] + k1, thisposlist[3] + k2)
524 |                             if found:
525 |                                 extractedRead = ''
526 |                                 if CandidateVector[thisposindex]:
527 |                                     if CandidateVector[thisposindex][1] != 30:
528 |                                         dist = min(CandidateVector[thisposindex][1], 60 - CandidateVector[thisposindex][1])
529 |                                         if 30 > splitpnt1 > dist:
530 |                                             extractedRead = readinfo1[10][:60]
531 |                                             inbrkpnt = splitpnt1
532 |                                         if 30 <= splitpnt1 <= readlen - 30:
533 |                                             extractedRead = readinfo1[10][splitpnt1 - 30:splitpnt1 + 30]
534 |                                             inbrkpnt = 30
535 |                                         if readlen - dist > splitpnt1 > readlen - 30:
536 |                                             extractedRead = readinfo1[10][readlen - 60:]
537 |                                             inbrkpnt = splitpnt1 - readlen + 60
538 |                                         if extractedRead != '':
539 |                                             if direct1 == '+':
540 |                                                 extractedRead = ReverseComplement(extractedRead)
541 |                                                 inbrkpnt = 60 - inbrkpnt
542 |                                             if exchange:
543 |                                                 extractedRead = ReverseComplement(extractedRead)
544 |                                                 inbrkpnt = 60 - inbrkpnt
545 |                                             CandidateVector[thisposindex][0] = extractedRead
546 |                                             CandidateVector[thisposindex][1] = inbrkpnt
547 |                                 else:
548 |                                     if splitpnt1 < 30:
549 |                                         extractedRead = readinfo1[10][:60]
550 |                                         inbrkpnt = splitpnt1
551 |                                     if 30 <= splitpnt1 <= readlen - 30:
552 |                                         extractedRead = readinfo1[10][splitpnt1 - 30:splitpnt1 + 30]
553 |                                         inbrkpnt = 30
554 |                                     if splitpnt1 > readlen - 30:
555 |                                         extractedRead = readinfo1[10][readlen - 60:]
556 |                                         inbrkpnt = splitpnt1 - readlen + 60
557 |                                     if direct1 == '+':
558 |                                         extractedRead = ReverseComplement(extractedRead)
559 |                                         inbrkpnt = 60 - inbrkpnt
560 |                                     if exchange:
561 |                                         extractedRead = ReverseComplement(extractedRead)
562 |                                         inbrkpnt = 60 - inbrkpnt
563 |                                         direct1, direct2 = direct2, direct1
564 |                                     CandidateVector[thisposindex] = [extractedRead, inbrkpnt, direct1, direct2]
565 |             lines = [line]
566 |             lastname = thisname
567 |     samfile.close()
568 |     infile.close()
569 |     exprfile.close()
# Right-pad each gene's value list with zeros: one 0 is appended for every
# index between the entry's recorded last index (element [1]) and `last`.
# Element [1] itself is left untouched.
for genename in ExprDic:
    gene_entry = ExprDic[genename]
    missing = last - gene_entry[1]
    if missing > 0:
        gene_entry[0].extend([0] * missing)
# For every cluster of every fusion, append a representative position pair as
# a new trailing element of the cluster record:
#   * element [0] of the cluster maps 'pos,pos' strings to counts;
#   * if the most frequent string holds at least 40% of the total count, its
#     two positions are appended, ordered so that they line up best with the
#     cluster's stored pair in element [1];
#   * otherwise the sentinel [-1, -1] is appended.
for gene in FusionMatrix:
    for cluster in FusionPos[gene][1]:
        votes = cluster[0]
        ranked = sorted(votes, key=votes.__getitem__, reverse=True)
        total = 0
        counts = []
        for label in ranked:
            total += votes[label]
            counts.append(int(votes[label]))
        if not (total > 0 and max(counts) / total >= 0.4):
            cluster.append([-1, -1])
            continue
        # The top-ranked key is always present in `votes`, so it can be used
        # directly instead of searching the dictionary for it.
        top_fields = ranked[0].split(',')
        pos0 = int(top_fields[0])
        pos1 = int(top_fields[1])
        anchor = cluster[1]
        straight = abs(pos0 - anchor[0]) + abs(pos1 - anchor[1])
        crossed = abs(pos1 - anchor[0]) + abs(pos0 - anchor[1])
        if straight < crossed:
            cluster.append([pos0, pos1])
        else:
            cluster.append([pos1, pos0])
# Build one tab-separated summary line per fusion cluster that has a valid
# representative breakpoint (element [3] of the cluster, not the -1 sentinel).
# A cluster is skipped when either gene has no usable entry in ExprDic or when
# neither orientation of its chromosome/position key is found in HomologyDic.
templines = []
for gene in FusionMatrix:
    for i in range(len(FusionPos[gene][1])):
        cluster = FusionPos[gene][1][i]
        if not cluster[3][0] > -0.5:
            continue
        genepart = gene.split('\t')
        usecell = list(set(cluster[2]))

        # Per-cell values for both genes. Any lookup failure (unknown gene
        # name, index out of range, malformed key) drops the cluster.
        # `except Exception` keeps the original best-effort behaviour without
        # swallowing KeyboardInterrupt/SystemExit.
        try:
            values1 = ExprDic[genepart[0].split('.')[0]][0]
            aaaaa = [values1[cell - start] for cell in usecell]
        except Exception:
            continue
        try:
            values2 = ExprDic[genepart[1].split('.')[0]][0]
            bbbbb = [values2[cell - start] for cell in usecell]
        except Exception:
            continue

        chrom1 = FusionPos[gene][0][0]
        chrom2 = FusionPos[gene][0][1]
        brk1 = str(cluster[3][0])
        brk2 = str(cluster[3][1])
        # HomologyDic may be keyed in either orientation of the pair.
        forward_key = chrom1 + '\t' + chrom2 + '\t' + brk1 + '\t' + brk2
        reverse_key = chrom2 + '\t' + chrom1 + '\t' + brk2 + '\t' + brk1
        if forward_key in HomologyDic:
            homology = HomologyDic[forward_key]
        elif reverse_key in HomologyDic:
            homology = HomologyDic[reverse_key]
        else:
            continue
        [homoscore, gccontent, cellcount, read1, read2] = homology

        # EncompassingMatrix may store the pair under the swapped gene order.
        engene = gene
        if gene not in EncompassingMatrix:
            engene = genepart[1] + '\t' + genepart[0]

        templine = '\t'.join([
            genepart[0], genepart[1],
            str(len(FusionMatrix[gene][i])), str(FusionMatrix[gene][i]),
            str(len(EncompassingMatrix[engene])), str(EncompassingMatrix[engene]),
            chrom1, chrom2, brk1, brk2,
            str(i), homoscore, gccontent[0], gccontent[1],
            str(aaaaa), str(bbbbb), read1, read2])
        templines.append(templine)
# Attach the extracted junction read to every summary line whose breakpoint
# pair (within +/-3 of each position) has a populated CandidateVector entry.
# Four columns are appended: read sequence, in-read breakpoint offset, and
# the two direction flags.
uselines = []
for i in range(len(templines)):
    info = templines[i].split('\t')
    try:
        thischr1 = chr2num(info[6])
        thischr2 = chr2num(info[7])
    except Exception:
        # The chromosome name cannot be converted. Previously the values left
        # over from an earlier record were silently reused, which could pair
        # this line with an unrelated candidate; skip the record instead.
        continue
    thispos1 = int(info[8])
    thispos2 = int(info[9])
    # Normalise the pair ordering before looking it up in CandidateVector.
    exchange = False
    if thischr1 > thischr2 or (thischr1 == thischr2 and thispos2 > thispos1):
        thischr1, thischr2 = thischr2, thischr1
        thispos1, thispos2 = thispos2, thispos1
        exchange = True
    # Tolerate a shift of up to 3 on either breakpoint; as before, the last
    # matching offset combination wins.
    thisposindex = None
    for k1 in range(-3, 4):
        for k2 in range(-3, 4):
            probe = (thischr1, thischr2, thispos1 + k1, thispos2 + k2)
            if probe in CandidateVector:
                thisposindex = probe
    if thisposindex is None or not CandidateVector[thisposindex]:
        continue
    candidate = CandidateVector[thisposindex]
    if exchange:
        # The line was swapped to match the key, so flip the stored read and
        # swap the direction flags to bring them back into the line's order.
        extra = [ReverseComplement(candidate[0]), str(60 - candidate[1]), candidate[3], candidate[2]]
    else:
        extra = [candidate[0], str(candidate[1]), candidate[2], candidate[3]]
    uselines.append(templines[i].rstrip() + '\t' + '\t'.join(extra))
674 | 
675 | 
# Rank the distinct annotated lines by the length of their fourth column
# (the textual form of the per-cluster list), longest first, then write the
# junction read of each line to `readfile` and echo the full line to stdout.
# Reads containing an 'N' are dropped.
linedic = {}
for record in uselines:
    linedic[record] = len(record.split('\t')[3])

ranked_records = sorted(linedic, key=linedic.__getitem__, reverse=True)
for record in ranked_records:
    fields = record.split('\t')
    read = fields[-4]
    splitpos = fields[-3]
    chromo1 = fields[6]
    chromo2 = fields[7]
    pos1 = fields[8]
    pos2 = fields[9]
    if 'N' in read:
        continue
    left_tag = chromo1 + ':' + pos1 + ':' + fields[-2]
    right_tag = chromo2 + ':' + pos2 + ':' + fields[-1]
    readfile.write(read + '\t' + splitpos + '\t' + left_tag + '\t' + right_tag + '\n')
    print(record)
readfile.close()
696 | 
697 | 


--------------------------------------------------------------------------------
/bin/FindFusionSupport.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import sys
  3 | # ***** readme *****
  4 | # This code analyses softclipping to find encompassing reads and split reads for one sam file
  5 | 
  6 | 
  7 | 
  8 | 
  9 | def ReverseComplement(str):
 10 |     return str[::-1].replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'g').upper()
 11 | 
 12 | 
 13 | def SolveClip(str, readlen):
 14 |     num = []
 15 |     alp = []
 16 |     res = []
 17 |     start = 0
 18 |     scount = 0
 19 |     Ndel = 0
 20 |     length = len(str)
 21 |     lastalp = -1
 22 |     for i in range(length):
 23 |         if str[i].isalpha():
 24 |             alp.append(str[i])
 25 |             num.append(int(str[lastalp + 1:i]))
 26 |             lastalp = i
 27 |     i = 0
 28 |     while i < len(alp):
 29 |         if alp[i] == 'N':
 30 |             for j in range(i):
 31 |                 if alp[j] == 'S':
 32 |                     scount = 1
 33 |                     break
 34 |             del alp[i]
 35 |             Ndel += num[i]
 36 |             del num[i]
 37 |             continue
 38 |         i += 1
 39 |     lastm = -1
 40 |     i = 0
 41 |     while i < len(alp):
 42 |         if alp[i] == 'M':
 43 |             if lastm == -1:
 44 |                 lastm = i
 45 |                 continue
 46 |             if lastm == i - 1:
 47 |                 num[i - 1] += num[i]
 48 |                 del alp[i]
 49 |                 del num[i]
 50 |                 continue
 51 |             lastm = i
 52 |         i += 1
 53 |     numsum = [num[0]]
 54 |     for i in range(1, len(num)):
 55 |         numsum.append(numsum[-1] + num[i])
 56 |     for i in range(len(alp)):
 57 |         if alp[i] == 'M':
 58 |             if i == 0:
 59 |                 res = [numsum[0]]
 60 |             elif i == len(alp) - 1:
 61 |                 res.append(numsum[i - 1])
 62 |             else:
 63 |                 res.append(numsum[i - 1])
 64 |                 res.append(numsum[i])
 65 |     for i in range(len(alp)):
 66 |         if alp[i] == 'M':
 67 |             if i > 0:
 68 |                 start = numsum[i - 1]
 69 |             break
 70 |     return [res, alp, start, 0, Ndel, scount]
 71 | 
 72 | 
 73 | def TakeoutFusionSupport(lines):
 74 |     if len(lines) == 2:
 75 |         info1 = lines[0].split('\t')
 76 |         info2 = lines[1].split('\t')
 77 |         if info1[0] != info2[0]:
 78 |             cgene1 = info1[0]
 79 |             cgene2 = info2[0]
 80 |             chromo1 = info1[3]
 81 |             chromo2 = info2[3]
 82 |             if not chromo1.startswith('chr'):
 83 |                 chromo1 = 'chr' + chromo1
 84 |             if not chromo2.startswith('chr'):
 85 |                 chromo2 = 'chr' + chromo2
 86 |             genelist1 = cgene1.split(';')
 87 |             genelist2 = cgene2.split(';')  # separate genes with semicolon
 88 |             for gene1 in genelist1:
 89 |                 for gene2 in genelist2:
 90 |                     if gene1 + '\t' + gene2 in geneset:
 91 |                         geneset[gene1 + '\t' + gene2][0] += 1
 92 |                     elif gene2 + '\t' + gene1 in geneset:
 93 |                         geneset[gene2 + '\t' + gene1][0] += 1
 94 |                     else:
 95 |                         geneset[gene1 + '\t' + gene2] = [1, 0, [], [chromo1, chromo2]]
 96 |     elif len(lines) == 3:
 97 |         info1 = lines[0].split('\t')
 98 |         info2 = lines[1].split('\t')
 99 |         info3 = lines[2].split('\t')
100 |         gene1 = info1[0]
101 |         gene2 = info2[0]
102 |         gene3 = info3[0]
103 |         if (gene1 != gene2 or gene2 != gene3) and (gene1 == gene2 or gene2 == gene3 or gene1 == gene3) and (gene1 != '' and gene2 != '' and gene3 != ''):
104 |             if info1[10] == info2[10] or info1[10] == ReverseComplement(info2[10]):
105 |                 readinfo1 = info1
106 |                 readinfo2 = info2
107 |             elif info1[10] == info3[10] or info1[10] == ReverseComplement(info3[10]):
108 |                 readinfo1 = info1
109 |                 readinfo2 = info3
110 |             elif info2[10] == info3[10] or info2[10] == ReverseComplement(info3[10]):
111 |                 readinfo1 = info3
112 |                 readinfo2 = info2
113 |             else:
114 |                 print('Three reads are different!')
115 |                 for ll in lines:
116 |                     print(ll, end='')
117 |                 return thisname
118 |             if readinfo1[0] != readinfo2[0]:
119 |                 clip1 = readinfo1[6]
120 |                 clip2 = readinfo2[6]
121 |                 readlen = len(readinfo1[10])
122 |                 clipsplit1 = SolveClip(clip1, readlen)
123 |                 clipsplit2 = SolveClip(clip2, readlen)
124 |                 cc = False
125 |                 splitpnt1 = -1
126 |                 splitpnt2 = -1
127 |                 for i in range(len(clipsplit1[0])):
128 |                     for j in range(len(clipsplit2[0])):
129 |                         if clipsplit2[0][j] == clipsplit1[0][i] or clipsplit2[0][j] + clipsplit1[0][i] == readlen:
130 |                             splitpnt1 = clipsplit1[0][i]
131 |                             splitpnt2 = clipsplit2[0][j]
132 |                             cc = True
133 |                             break
134 |                     if cc:
135 |                         break
136 |                 if readlen - 0.5 > splitpnt1 > -0.5 and readlen - 0.5 > splitpnt2 > 0.5:
137 |                     if clipsplit1[1][i] == 'S':
138 |                         brkpnt1 = int(readinfo1[4])
139 |                         direct1 = 1
140 |                     else:
141 |                         brkpnt1 = splitpnt1 - clipsplit1[2] + int(readinfo1[4]) + clipsplit1[4] - 1
142 |                         direct1 = -1
143 |                     if clipsplit2[1][j] == 'S':
144 |                         brkpnt2 = int(readinfo2[4])
145 |                         direct2 = 1
146 |                     else:
147 |                         brkpnt2 = splitpnt2 - clipsplit2[2] + int(readinfo2[4]) + clipsplit2[4] - 1
148 |                         direct2 = -1
149 |                     cgene1 = readinfo1[0]
150 |                     cgene2 = readinfo2[0]
151 |                     chromo1 = readinfo1[3]
152 |                     chromo2 = readinfo2[3]
153 |                     if not chromo1.startswith('chr'):
154 |                         chromo1 = 'chr' + chromo1
155 |                     if not chromo2.startswith('chr'):
156 |                         chromo2 = 'chr' + chromo2
157 |                     genelist1 = cgene1.split(';')
158 |                     genelist2 = cgene2.split(';')  # separate genes with semicolon
159 |                     for gene1 in genelist1:
160 |                         for gene2 in genelist2:
161 |                             if gene1 + '\t' + gene2 in geneset:
162 |                                 geneset[gene1 + '\t' + gene2][1] += 1
163 |                                 geneset[gene1 + '\t' + gene2][2].append(
164 |                                     [brkpnt1, brkpnt2, splitpnt1, clipsplit1[2], clipsplit2[2], direct1, direct2])
165 |                             elif gene2 + '\t' + gene1 in geneset:
166 |                                 geneset[gene2 + '\t' + gene1][1] += 1
167 |                                 geneset[gene2 + '\t' + gene1][2].append(
168 |                                     [brkpnt2, brkpnt1, splitpnt2, clipsplit2[2], clipsplit1[2], direct2, direct1])
169 |                             else:
170 |                                 geneset[gene1 + '\t' + gene2] = [0, 1,
171 |                                                                  [[brkpnt1, brkpnt2, splitpnt1, clipsplit1[2], clipsplit2[2], direct1, direct2]],
172 |                                                                  [chromo1, chromo2]]
173 |     return ''
174 | 
175 | 
# Driver: stream the annotated file given as argv[1], group consecutive
# records that share a read name (field [1]), feed every finished group to
# TakeoutFusionSupport, then dump the accumulated `geneset` to argv[2].
#
# geneset maps 'geneA\tgeneB' to
#   [count from two-record groups, count from three-record groups,
#    list of breakpoint evidence, [chromosomeA, chromosomeB]].
geneset = {}
lastname = ''
lines = []
badreadname = ''
thisname = 'hgfhfrjfjzjbest1122gh'
# Both files are handled by one `with`, so the output is closed (and flushed)
# even when parsing fails half-way.
with open(sys.argv[1]) as infile, open(sys.argv[2], 'w') as outfile:
    for line in infile:
        if not line.strip():
            # A blank line has no read-name field; skip it instead of
            # failing with IndexError.
            continue
        if line[0] == '#':
            continue
        info = line.split('\t')
        thisname = info[1]
        if thisname == badreadname:
            continue
        if thisname == lastname:
            lines.append(line)
        else:
            aa = TakeoutFusionSupport(lines)  # main procedure to record fusion.
            if len(aa) > 1:
                lastname = aa
            else:
                lastname = thisname
            lines = [line]
        if len(info[0]) <= 1:
            # No usable gene annotation: discard the whole read, including
            # any of its records that follow.
            badreadname = info[1]
            lastname = ''
            lines = []
    # Flush the group that was still open when the input ended.
    aa = TakeoutFusionSupport(lines)
    for key in geneset:
        entry = geneset[key]
        outfile.write(key + '\t' + str(entry[0]) + '\t' + str(entry[1]) + '\t' + entry[3][0] + '\t' + entry[3][1] + '\t')
        # Each distinct breakpoint description is written once. Sorting makes
        # the output reproducible; plain set iteration order is arbitrary.
        posstr = set()
        for ev in entry[2]:
            posstr.add(str(ev[0]) + ',' + str(ev[1]) + '+' + str(ev[2]) + '+' + str(ev[3]) + ',' + str(ev[4]) + ';')
        for item in sorted(posstr):
            outfile.write(item)
        outfile.write('\n')
219 | 


--------------------------------------------------------------------------------
/bin/FindFusionSupport.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | path=$1
 3 | mystart=$2
 4 | myend=$3
 5 | codedir=$4
 6 | for((i=${mystart};i<=${myend};i++))
 7 | do
 8 | 	file=${path}/${i}_geneanno.sam
 9 | 	python ${codedir}/FindFusionSupport.py ${file} ${file%_*}_FusionSupport.txt
10 | done
11 | 


--------------------------------------------------------------------------------
/bin/FindHomoPattern_RAM.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | import sys
  4 | import numpy as np
  5 | import math
  6 | 
  7 | 
  8 | # ***** readme *****
  9 | # This code wants to verify that fake fusion is more likely to be homo
 10 | 
 11 | 
 12 | def GCcontent(str1, str2):
 13 |     l1 = len(str1)
 14 |     l2 = len(str2)
 15 |     count = 0
 16 |     for i in range(l1):
 17 |         if str1[i].upper() == 'G' or str1[i].upper() == 'C':
 18 |             count += 1
 19 |     res1 = count / l1
 20 |     count = 0
 21 |     for i in range(l2):
 22 |         if str2[i].upper() == 'G' or str2[i].upper() == 'C':
 23 |             count += 1
 24 |     res2 = count / l2
 25 |     return [res1, res2]
 26 | 
 27 | 
 28 | def ReverseComplement(str):
 29 |     return str[::-1].replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'g').upper()
 30 | 
 31 | 
 32 | def Findbrkpnt_SWResult(string, tailpos, halfwidth):
 33 |     aa = []
 34 |     for i in range(len(string)):
 35 |         if string[i] != '-':
 36 |             aa.append(string[i])
 37 |     if tailpos < halfwidth:
 38 |         return len(string) + halfwidth - tailpos
 39 |     if tailpos - len(aa) >= halfwidth:
 40 |         return - tailpos + len(aa) + halfwidth
 41 |     j = len(string) - 1
 42 |     start = tailpos
 43 |     while tailpos > halfwidth:
 44 |         if string[j] != '-':
 45 |             start -= 1
 46 |         j -= 1
 47 |         if start == halfwidth:
 48 |             break
 49 |     return j + 1
 50 | 
 51 | 
 52 | def getline(thefilepath, desired_line_number):
 53 |     if desired_line_number < 1:
 54 |         return ''
 55 |     for current_line_number, line in enumerate(open(thefilepath, 'rU')):
 56 |         if current_line_number == desired_line_number - 1:
 57 |             return line
 58 |     return ''
 59 | 
 60 | 
 61 | def findpos(chromo, pos):
 62 |     row = int(math.ceil(pos / reffilelinelength))
 63 |     col = pos % reffilelinelength
 64 |     if col == 0:
 65 |         col = reffilelinelength
 66 |     return row, col
 67 | 
 68 | 
 69 | def getTCGAstring(chr, start, end):
 70 |     row1, col1 = findpos(chr, start)
 71 |     row2, col2 = findpos(chr, end)
 72 |     if row1 == row2:
 73 |         return refdict[chr][row1 - 1][col1 - 1:col2]
 74 |     resstring = refdict[chr][row1 - 1].rstrip()[col1 - 1:]
 75 |     for i in range(row1 + 1, row2):
 76 |         resstring += refdict[chr][i - 1].rstrip()
 77 |     return resstring + refdict[chr][row2 - 1][:col2]
 78 | 
 79 | 
 80 | def onethread(halfwindowwidth, fusionfile):
 81 |     geneset = {}
 82 |     for line in refannotfile.readlines():
 83 |         if line.startswith('#'):
 84 |             continue
 85 |         info = line.split('\t')
 86 |         if info[2] == 'exon':
 87 |             chromo = info[0]
 88 |             start = int(info[3])
 89 |             end = int(info[4])
 90 |             geneinfo = info[8].split('; ')
 91 |             genename = ''
 92 |             genetype = ''
 93 |             genetypeFound = False
 94 |             if len(geneinfo) > 4:
 95 |                 for item in geneinfo:
 96 |                     if item.find('gene_name') > -1:
 97 |                         genename = item[11:-1]
 98 |                     if item.find('gene_type') > -1:
 99 |                         genetype = item[11:-1]
100 |                         genetypeFound = True
101 |                     if item.find('gene_biotype') > -1:
102 |                         genetype = item[14:-1]
103 |                         genetypeFound = True
104 |             else:
105 |                 continue
106 |             if genename == '':
107 |                 continue
108 |             if genetype != 'protein_coding' and genetype != 'processed_transcript' and genetypeFound:
109 |                 continue
110 |             if chromo not in geneset:
111 |                 geneset[chromo] = {}
112 |             if genename not in geneset[chromo]:
113 |                 geneset[chromo][genename] = [[start, end], [[start, end]]]
114 |             else:
115 |                 found = False
116 |                 for item in geneset[chromo][genename][1]:
117 |                     if start <= item[1] and end >= item[0]:
118 |                         found = True
119 |                         break
120 |                 if found:
121 |                     continue
122 |                 geneset[chromo][genename][1].append([start, end])
123 |                 geneset[chromo][genename][0][0] = min(geneset[chromo][genename][0][0], start)
124 |                 geneset[chromo][genename][0][1] = max(geneset[chromo][genename][0][1], end)
125 |     refannotfile.close()
126 |     for i in geneset:
127 |         for j in geneset[i]:
128 |             geneset[i][j][1].sort()
129 | 
130 |     allline = fusionfile.readlines()
131 |     for line in allline:
132 |         if line[0] == '#':
133 |             continue
134 |         try:
135 |             info = line.split('\t')
136 |             gene1 = info[0]
137 |             gene2 = info[1]
138 |             chr1 = info[3].replace(' ', '')
139 |             chr2 = info[4].replace(' ', '')
140 |             if not chr1.startswith('chr'):
141 |                 chr1 = 'chr' + chr1
142 |             if not chr2.startswith('chr'):
143 |                 chr2 = 'chr' + chr2
144 |             pos1 = int(info[5].replace(' ', ''))
145 |             pos2 = int(info[6].replace(' ', ''))
146 |             exoninterval1 = -1
147 |             exoninterval2 = -1
148 |             for g1 in geneset[chr1]:
149 |                 if geneset[chr1][g1][0][0] <= pos1 <= geneset[chr1][g1][0][1]:
150 |                     gene1 = g1
151 |                     break
152 |             for g2 in geneset[chr2]:
153 |                 if geneset[chr2][g2][0][0] <= pos2 <= geneset[chr2][g2][0][1]:
154 |                     gene2 = g2
155 |                     break
156 |             if gene1 not in geneset[chr1] or gene2 not in geneset[chr2]:
157 |                 pass
158 |             else:
159 |                 for i in range(len(geneset[chr1][gene1][1])):
160 |                     if geneset[chr1][gene1][1][i][0] <= pos1 <= geneset[chr1][gene1][1][i][1]:
161 |                         exoninterval1 = i
162 |                         break
163 |                 for i in range(len(geneset[chr2][gene2][1])):
164 |                     if geneset[chr2][gene2][1][i][0] <= pos2 <= geneset[chr2][gene2][1][i][1]:
165 |                         exoninterval2 = i
166 |                         break
167 |             # 获取外显子的序列
168 |             if exoninterval2 == -1 or exoninterval1 == -1:
169 |                 res1d = getTCGAstring(chr1, pos1, pos1 + halfwindowslength)
170 |                 res1u = getTCGAstring(chr1, pos1 - halfwindowslength, pos1 - 1)
171 |                 res2d = getTCGAstring(chr2, pos2, pos2 + halfwindowslength)
172 |                 res2u = getTCGAstring(chr2, pos2 - halfwindowslength, pos2 - 1)
173 |             else:
174 |                 res1d = ''
175 |                 res2d = ''
176 |                 res1u = ''
177 |                 res2u = ''
178 |                 last1 = 0
179 |                 last2 = 0
180 |                 if geneset[chr1][gene1][1][exoninterval1][1] - pos1 + 1 >= halfwindowslength:
181 |                     res1d = getTCGAstring(chr1, pos1, pos1 + halfwindowslength - 1)
182 |                 else:
183 |                     res1d = getTCGAstring(chr1, pos1, geneset[chr1][gene1][1][exoninterval1][1])
184 |                     last1 = geneset[chr1][gene1][1][exoninterval1][1]
185 |                 if geneset[chr2][gene2][1][exoninterval2][1] - pos2 + 1 >= halfwindowslength:
186 |                     res2d = getTCGAstring(chr2, pos2, pos2 + halfwindowslength - 1)
187 |                 else:
188 |                     res2d = getTCGAstring(chr2, pos2, geneset[chr2][gene2][1][exoninterval2][1])
189 |                     last2 = geneset[chr2][gene2][1][exoninterval2][1]
190 |                 fi1 = exoninterval1
191 |                 fi2 = exoninterval2
192 |                 while len(res1d) < halfwindowslength:
193 |                     fi1 += 1
194 |                     if fi1 >= len(geneset[chr1][gene1][1]):
195 |                         break
196 |                     if last1 >= geneset[chr1][gene1][1][fi1][0]:
197 |                         break
198 |                     if geneset[chr1][gene1][1][fi1][1] - geneset[chr1][gene1][1][fi1][0] + 1 + len(
199 |                             res1d) >= halfwindowslength:
200 |                         res1d += getTCGAstring(chr1, geneset[chr1][gene1][1][fi1][0],
201 |                                                geneset[chr1][gene1][1][fi1][0] + halfwindowslength - len(res1d) - 1)
202 |                     else:
203 |                         res1d += getTCGAstring(chr1, geneset[chr1][gene1][1][fi1][0], geneset[chr1][gene1][1][fi1][1])
204 |                         last1 = geneset[chr1][gene1][1][fi1][1]
205 |                 while len(res2d) < halfwindowslength:
206 |                     fi2 += 1
207 |                     if fi2 >= len(geneset[chr2][gene2][1]):
208 |                         break
209 |                     if last2 >= geneset[chr2][gene2][1][fi2][0]:
210 |                         break
211 |                     if geneset[chr2][gene2][1][fi2][1] - geneset[chr2][gene2][1][fi2][0] + 1 + len(
212 |                             res2d) >= halfwindowslength:
213 |                         res2d += getTCGAstring(chr2, geneset[chr2][gene2][1][fi2][0],
214 |                                                geneset[chr2][gene2][1][fi2][0] + halfwindowslength - len(res2d) - 1)
215 |                     else:
216 |                         res2d += getTCGAstring(chr2, geneset[chr2][gene2][1][fi2][0], geneset[chr2][gene2][1][fi2][1])
217 |                         last2 = geneset[chr2][gene2][1][fi2][1]
218 |                 if - geneset[chr1][gene1][1][exoninterval1][0] + pos1 + 1 >= halfwindowslength:
219 |                     res1u = getTCGAstring(chr1, pos1 - halfwindowslength + 1, pos1)
220 |                 else:
221 |                     res1u = getTCGAstring(chr1, geneset[chr1][gene1][1][exoninterval1][0], pos1)
222 |                     last1 = geneset[chr1][gene1][1][exoninterval1][0]
223 |                 if - geneset[chr2][gene2][1][exoninterval2][0] + pos2 + 1 >= halfwindowslength:
224 |                     res2u = getTCGAstring(chr2, pos2 - halfwindowslength + 1, pos2)
225 |                 else:
226 |                     res2u = getTCGAstring(chr2, geneset[chr2][gene2][1][exoninterval2][0], pos2)
227 |                     last2 = geneset[chr2][gene2][1][exoninterval2][0]
228 |                 fi1 = exoninterval1
229 |                 fi2 = exoninterval2
230 |                 while len(res1u) < halfwindowslength:
231 |                     fi1 -= 1
232 |                     if fi1 < 0:
233 |                         break
234 |                     if last1 <= geneset[chr1][gene1][1][fi1][1]:
235 |                         break
236 |                     if geneset[chr1][gene1][1][fi1][1] - geneset[chr1][gene1][1][fi1][0] + 1 + len(
237 |                             res1u) >= halfwindowslength:
238 |                         res1u = getTCGAstring(chr1,
239 |                                               geneset[chr1][gene1][1][fi1][1] - halfwindowslength + len(res1u) + 1,
240 |                                               geneset[chr1][gene1][1][fi1][1]) + res1u
241 |                     else:
242 |                         res1u = getTCGAstring(chr1, geneset[chr1][gene1][1][fi1][0],
243 |                                               geneset[chr1][gene1][1][fi1][1]) + res1u
244 |                         last1 = geneset[chr1][gene1][1][fi1][0]
245 |                 while len(res2u) < halfwindowslength:
246 |                     fi2 -= 1
247 |                     if fi2 < 0:
248 |                         break
249 |                     if last2 <= geneset[chr2][gene2][1][fi2][1]:
250 |                         break
251 |                     if geneset[chr2][gene2][1][fi2][1] - geneset[chr2][gene2][1][fi2][0] + 1 + len(
252 |                             res2u) >= halfwindowslength:
253 |                         res2u = getTCGAstring(chr2,
254 |                                               geneset[chr2][gene2][1][fi2][1] - halfwindowslength + len(res2u) + 1,
255 |                                               geneset[chr2][gene2][1][fi2][1]) + res2u
256 |                     else:
257 |                         res2u = getTCGAstring(chr2, geneset[chr2][gene2][1][fi2][0],
258 |                                               geneset[chr2][gene2][1][fi2][1]) + res2u
259 |                         last2 = geneset[chr2][gene2][1][fi2][0]
260 |             string1 = res1u.lower() + res1d.upper()
261 |             string2 = res2u.lower() + res2d.upper()
262 |             gccontent = GCcontent(string1, string2)
263 |             print(line.rstrip() + '\t-\t-\t0\t' + str(gccontent[0]) + '\t' + str(gccontent[1]) + '\t0\t0\t' + string1 + '\t' + string2)
264 |         except:
265 |             sys.stderr.write('Bad Line:' + line)
266 | 
267 | 
def main():
    """Run the single-threaded pass with the module-level window width and fusion file handle."""
    onethread(halfwindowwidth=halfwindowwidth, fusionfile=fusionfile)
270 | 
271 | 
if __name__ == "__main__":
    # Usage: <script> <fusion candidates> <reference FASTA> <annotation GTF>
    halfwindowwidth = 10
    halfwindowslength = 100
    fusionfile = open(sys.argv[1])
    reffilepath = sys.argv[2]
    refannotfile = open(sys.argv[3])
    currentchr = ''
    # refdict[chromosome] = list of raw sequence lines (terminators kept).
    refdict = {}
    with open(reffilepath) as reference:
        for line in reference:
            if line.startswith('>'):
                # Sequence name is the first word of the header, normalised to a 'chr' prefix.
                currentchr = line.rstrip()[1:].split(' ')[0]
                if not currentchr.startswith('chr'):
                    currentchr = 'chr' + currentchr
                refdict[currentchr] = []
            else:
                refdict[currentchr].append(line)
    # Derive the FASTA line width from the sequence with the most lines: its first
    # line is certainly a full-width line, whereas the last sequence in the file
    # may be a short contig that does not fill one line.
    widestrows = []
    for seqrows in refdict.values():
        if len(seqrows) > 0 and (len(seqrows), len(seqrows[0].rstrip())) > (
                len(widestrows), len(widestrows[0].rstrip()) if widestrows else 0):
            widestrows = seqrows
    if not widestrows:
        sys.exit('No sequence found in reference file: ' + reffilepath)
    reffilelinelength = len(widestrows[0].rstrip())
    if reffilelinelength > 100:
        # Lines this long are treated as "one line per sequence" (no wrapping).
        reffilelinelength = 99999999999
    try:
        main()
    finally:
        # onethread closes refannotfile on success; closing twice is harmless.
        refannotfile.close()
        fusionfile.close()
293 | 


--------------------------------------------------------------------------------
/bin/FusionScore.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | import sys
  4 | import math
  5 | import os
  6 | 
  7 | 
# ***** readme *****
# This code calculates the fusion score for each cell and
# adds the raw score for every cell
 11 | 
 12 | 
 13 | def sigmoid(x):
 14 |     return 1 / (1 + math.exp(-x))
 15 | 
 16 | 
 17 | def ReverseComplement(str):
 18 |     return str[::-1].replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'g').upper()
 19 | 
 20 | 
 21 | def AddCount(genenum, q=1.0):
 22 |     genenum = genenum.split('.')[0]
 23 |     if genenum in ExprDic:
 24 |         ExprDic[genenum] += q
 25 |     else:
 26 |         ExprDic[genenum] = q
 27 | 
 28 | 
 29 | def GetExpr(gene):
 30 |     genelist = gene.split(',')
 31 |     sum = 0
 32 |     count = 0
 33 |     for item in genelist:
 34 |         if item in ExprDic:
 35 |             sum += ExprDic[item]
 36 |             count += 1
 37 |     if count > 0:
 38 |         return sum / count
 39 |     return 0
 40 | 
 41 | 
 42 | argnum = len(sys.argv)
 43 | filedir = sys.argv[1]
 44 | start = sys.argv[2]
 45 | last = sys.argv[3]
 46 | ExprDir = sys.argv[4]
 47 | FusionMatrix = {}
 48 | FusionPos = {}
 49 | Clusters = {}
 50 | ClusterSize = {}
 51 | ClusterCount = {}
 52 | ExprDic = {}
 53 | GeneDictionary = {}
 54 | good_dist = 20
 55 | EnableClustering = False
 56 | EmcmpscoreDic = {}
 57 | if argnum == 6:
 58 |     EnableClustering = True
 59 |     ClusteringFile = open(sys.argv[5])
 60 |     for line in ClusteringFile.readlines():
 61 |         if line[0] == '#':
 62 |             continue
 63 |         info = line.split('\t')
 64 |         cellname = info[0]
 65 |         cluster = info[1]
 66 |         Clusters[cellname] = cluster
 67 |         if cluster in ClusterSize:
 68 |             ClusterSize[cluster] += 1
 69 |         else:
 70 |             ClusterSize[cluster] = 1
 71 |         ClusterCount[cluster] = 0  # this var is for the final output fusion categories.
 72 |     ClusteringFile.close()
 73 | for i in range(int(start), int(last) + 1):
 74 |     try:
 75 |         infile = open(filedir + '/' + str(i) + '_FusionSupport.txt')
 76 |         exprfile = open(ExprDir + '/' + str(i) + '.rpkm.txt')
 77 |         samfile = open(filedir + '/' + str(i) + '_geneanno.sam')
 78 |     except:
 79 |         continue
 80 |     lastgene = ''
 81 |     totalrc = 0
 82 |     cellsup = 0
 83 |     ExprDic = {}
 84 |     for line in exprfile.readlines():
 85 |         info = line.split('\t')
 86 |         gene = info[0]
 87 |         rc = int(info[1])
 88 |         if gene == lastgene:
 89 |             totalrc += rc
 90 |             continue
 91 |         if lastgene != '':
 92 |             ExprDic[lastgene] = totalrc
 93 |         lastgene = gene
 94 |         totalrc = rc
 95 |     exprfile.close()
 96 |     ExprDic[lastgene] = totalrc
 97 |     lastname = ''
 98 |     lines = []
 99 |     badreadname = ''
100 |     for line in samfile.readlines():
101 |         if line[0] == '#':
102 |             continue
103 |         info = line.split('\t')
104 |         thisname = info[1]
105 |         if thisname == badreadname:
106 |             continue
107 |         if thisname == lastname:
108 |             lines.append(line)
109 |         else:
110 |             if len(lines) == 2:
111 |                 info1 = lines[0].split('\t')
112 |                 info2 = lines[1].split('\t')
113 |                 if info1[0] != info2[0]:
114 |                     gene1 = info1[0]
115 |                     gene2 = info2[0]
116 |                     AddCount(gene1)
117 |                     AddCount(gene2)
118 |             elif len(lines) == 3:
119 |                 info1 = lines[0].split('\t')
120 |                 info2 = lines[1].split('\t')
121 |                 info3 = lines[2].split('\t')
122 |                 gene1 = info1[0].split('.')[0]
123 |                 gene2 = info2[0].split('.')[0]
124 |                 gene3 = info3[0].split('.')[0]
125 |                 if gene1 != gene2 or gene2 != gene3:
126 |                     if info1[10] == info2[10] or info1[10] == ReverseComplement(info2[10]):
127 |                         AddCount(gene1, 0.5)
128 |                         AddCount(gene2, 0.5)
129 |                         AddCount(gene3)
130 |                     elif info1[10] == info3[10] or info1[10] == ReverseComplement(info3[10]):
131 |                         AddCount(gene1, 0.5)
132 |                         AddCount(gene2)
133 |                         AddCount(gene3, 0.5)
134 |                     elif info2[10] == info3[10] or info2[10] == ReverseComplement(info3[10]):
135 |                         AddCount(gene1)
136 |                         AddCount(gene2, 0.5)
137 |                         AddCount(gene3, 0.5)
138 |                     else:
139 |                         sys.stderr.write('!!!!!' + info1[1])
140 |                         lines = [line]
141 |                         lastname = thisname
142 |                         readinfo2 = [1]
143 |                         readinfo1 = [1]  # don't run codes below, just want to read lines
144 |             lines = [line]
145 |             lastname = thisname
146 |     samfile.close()
147 |     for line in infile.readlines():
148 |         info = line.split('\t')
149 |         gene1 = info[0]
150 |         gene2 = info[1]
151 |         chromo1 = info[4]
152 |         chromo2 = info[5]
153 |         encompass = int(info[2])
154 |         splitcount = int(info[3])
155 |         if splitcount + encompass == 0:
156 |             continue
157 |         splitreadinfo = info[6].split(';')
158 |         splitreadinfo = list(set(splitreadinfo))        # remove duplication
159 |         encmpscore = encompass
160 |         subscore = []
161 |         Pos = {}
162 |         try:
163 |             scale1 = ExprDic[gene1]
164 |             scale2 = ExprDic[gene2]
165 |         except:
166 |             sys.stderr.write('!!!' + gene1 + '\t' + gene2)
167 |             continue
168 |         scale = math.sqrt(math.log(1+scale1, 2) * math.log(1+scale2, 2))
169 |         subgrp = []
170 |         for item in splitreadinfo:
171 |             if len(item) > 3:
172 |                 iteminfo = item.split('+')
173 |                 pos = iteminfo[0].split(',')
174 |                 if pos[0] + ',' + pos[1] in Pos:
175 |                     Pos[pos[0] + ',' + pos[1]] += 1
176 |                 else:
177 |                     Pos[pos[0] + ',' + pos[1]] = 1
178 |                 badmap = iteminfo[2].split(',')
179 |                 editscore = max(50 - abs(50 - int(iteminfo[1])) - int(badmap[0]) - int(badmap[1]), 0)
180 |                 if editscore > 12:
181 |                     ssscore = 12 + (editscore - 12) / 10
182 |                 else:
183 |                     ssscore = editscore
184 |                 if len(subgrp) == 0:
185 |                     subgrp = [{pos[0] + ',' + pos[1]: 1}]
186 |                     subscore.append(ssscore)
187 |                 else:
188 |                     fin = False
189 |                     for j in range(len(subgrp)):
190 |                         for key in subgrp[j]:
191 |                             grpos0 = key.split(',')[0]
192 |                             grpos1 = key.split(',')[1]
193 |                             if abs(int(pos[0]) - int(grpos0)) <= 20 and abs(int(pos[1]) - int(grpos1)) <= 20:
194 |                                 fin = True
195 |                                 if pos[0] + ',' + pos[1] in subgrp[j]:
196 |                                     subgrp[j][pos[0] + ',' + pos[1]] += 1
197 |                                 else:
198 |                                     subgrp[j][pos[0] + ',' + pos[1]] = 1
199 |                                 subscore[j] += ssscore
200 |                                 break
201 |                         if fin:
202 |                             break
203 |                     if not fin:
204 |                         subgrp.append({pos[0] + ',' + pos[1]: 1})
205 |                         subscore.append(ssscore)
206 |         rawscore = []
207 |         for j in subscore:
208 |             if j > 0.5 or encmpscore > 0:
209 |                 # rawscore.append(sigmoid((j + encmpscore) / 20 - 1.7))
210 |                 rawscore.append(pow(1 + (j + encmpscore) / (7 + scale), 1.3))
211 |             else:
212 |                 rawscore.append(0)
213 |         rev = False
214 |         AvePos = []
215 |         for mdict in subgrp:  # calculate average pos for each cluster
216 |             left = 0
217 |             right = 0
218 |             cc = 0
219 |             for key in mdict:
220 |                 pos = key.split(',')
221 |                 left += int(pos[0]) * mdict[key]
222 |                 right += int(pos[1]) * mdict[key]
223 |                 cc += mdict[key]
224 |             left = left / cc
225 |             right = right / cc
226 |             AvePos.append([left, right, cc])
227 |         if gene1 + '\t' + gene2 in FusionMatrix:
228 |             if len(subgrp) == 0:
229 |                 if gene1 + '\t' + gene2 in EmcmpscoreDic:
230 |                     EmcmpscoreDic[gene1 + '\t' + gene2] += rawscore
231 |                 elif gene2 + '\t' + gene1 in EmcmpscoreDic:
232 |                     EmcmpscoreDic[gene2 + '\t' + gene1] += rawscore
233 |                 else:
234 |                     EmcmpscoreDic[gene1 + '\t' + gene2] = rawscore
235 |             for j in range(len(subgrp)):
236 |                 if len(FusionPos[gene1 + '\t' + gene2][1]) == 0 and len(subgrp) > 0:
237 |                     FusionMatrix[gene1 + '\t' + gene2] = rawscore
238 |                     for k in range(len(subgrp)):
239 |                         FusionPos[gene1 + '\t' + gene2][1].append([subgrp[k], AvePos[k], [i], 1])
240 |                 else:
241 |                     for subpos in AvePos:
242 |                         dist = []
243 |                         for k in FusionPos[gene1 + '\t' + gene2][1]:
244 |                             d1 = abs(k[1][0] - subpos[0])
245 |                             d2 = abs(k[1][1] - subpos[1])
246 |                             if d1 <= good_dist and d2 <= good_dist:
247 |                                 dist.append(d1 + d2)
248 |                             else:
249 |                                 dist.append(10 * good_dist)
250 |                         if len(dist) > 0:
251 |                             if min(dist) >= 3 * good_dist:
252 |                                 FusionPos[gene1 + '\t' + gene2][1].append([subgrp[j], AvePos[j], [i], 1])
253 |                                 FusionMatrix[gene1 + '\t' + gene2].append(rawscore[j])
254 |                             else:
255 |                                 smallindex = dist.index(min(dist))
256 |                                 FusionMatrix[gene1 + '\t' + gene2][smallindex] += rawscore[j]
257 |                                 old = FusionPos[gene1 + '\t' + gene2][1][smallindex][1]
258 |                                 newcc = subpos[2] + old[2]
259 |                                 newleft = (old[2] * old[0] + subpos[2] * subpos[0]) / newcc
260 |                                 newright = (old[2] * old[1] + subpos[2] * subpos[1]) / newcc
261 |                                 FusionPos[gene1 + '\t' + gene2][1][smallindex][1] = [newleft, newright, newcc]
262 |                                 FusionPos[gene1 + '\t' + gene2][1][smallindex][2].append(i)
263 |                                 FusionPos[gene1 + '\t' + gene2][1][smallindex][3] += 1
264 |             gene = gene1 + '\t' + gene2
265 |         elif gene2 + '\t' + gene1 in FusionMatrix:
266 |             if len(subgrp) == 0:
267 |                 if gene1 + '\t' + gene2 in EmcmpscoreDic:
268 |                     EmcmpscoreDic[gene1 + '\t' + gene2] += rawscore
269 |                 elif gene2 + '\t' + gene1 in EmcmpscoreDic:
270 |                     EmcmpscoreDic[gene2 + '\t' + gene1] += rawscore
271 |                 else:
272 |                     EmcmpscoreDic[gene1 + '\t' + gene2] = rawscore
273 |             for j in range(len(subgrp)):
274 |                 if len(FusionPos[gene2 + '\t' + gene1][1]) == 0 and len(subgrp) > 0:
275 |                     FusionMatrix[gene2 + '\t' + gene1] = rawscore
276 |                     for k in range(len(subgrp)):
277 |                         FusionPos[gene2 + '\t' + gene1][1].append(
278 |                             [subgrp[j], [AvePos[j][1], AvePos[j][0], AvePos[j][2]], [i], 1])
279 |                 else:
280 |                     for subpos in AvePos:
281 |                         dist = []
282 |                         for k in FusionPos[gene2 + '\t' + gene1][1]:
283 |                             d1 = abs(k[1][0] - subpos[1])
284 |                             d2 = abs(k[1][1] - subpos[0])
285 |                             if d1 <= good_dist and d2 <= good_dist:
286 |                                 dist.append(d1 + d2)
287 |                             else:
288 |                                 dist.append(10 * good_dist)
289 |                         if min(dist) >= 3 * good_dist:
290 |                             FusionPos[gene2 + '\t' + gene1][1].append(
291 |                                 [subgrp[j], [AvePos[j][1], AvePos[j][0], AvePos[j][2]], [i], 1])
292 |                             FusionMatrix[gene2 + '\t' + gene1].append(rawscore[j])
293 |                         else:
294 |                             smallindex = dist.index(min(dist))
295 |                             FusionMatrix[gene2 + '\t' + gene1][smallindex] += rawscore[j]
296 |                             old = FusionPos[gene2 + '\t' + gene1][1][smallindex][1]
297 |                             newcc = subpos[2] + old[2]
298 |                             newleft = (old[2] * old[0] + subpos[2] * subpos[1]) / newcc
299 |                             newright = (old[2] * old[1] + subpos[2] * subpos[0]) / newcc
300 |                             FusionPos[gene2 + '\t' + gene1][1][smallindex][1] = [newleft, newright, newcc]
301 |                             FusionPos[gene2 + '\t' + gene1][1][smallindex][2].append(i)
302 |                             FusionPos[gene2 + '\t' + gene1][1][smallindex][3] += 1
303 |             gene = gene2 + '\t' + gene1
304 |             rev = True
305 |         else:
306 |             if len(subgrp) == 0:
307 |                 if gene1 + '\t' + gene2 in EmcmpscoreDic:
308 |                     EmcmpscoreDic[gene1 + '\t' + gene2] += rawscore
309 |                 elif gene2 + '\t' + gene1 in EmcmpscoreDic:
310 |                     EmcmpscoreDic[gene2 + '\t' + gene1] += rawscore
311 |                 else:
312 |                     EmcmpscoreDic[gene1 + '\t' + gene2] = rawscore
313 |             FusionMatrix[gene1 + '\t' + gene2] = rawscore
314 |             FusionPos[gene1 + '\t' + gene2] = [[chromo1, chromo2], []]
315 |             for k in range(len(subgrp)):
316 |                 FusionPos[gene1 + '\t' + gene2][1].append([subgrp[k], AvePos[k], [i], 1])
317 |             gene = gene1 + '\t' + gene2
318 |     infile.close()
319 |     exprfile.close()
320 | '''
321 | for key in EmcmpscoreDic:
322 |     if key in FusionMatrix:
323 |         for i in range(len(FusionMatrix[key])):
324 |             FusionMatrix[key][i] += EmcmpscoreDic[key]
325 | '''
# Give every breakpoint cluster a representative position pair as a fifth element:
# the most frequent 'pos,pos' key when it accounts for at least 40% of the cluster's
# reads, otherwise [-1, -1].
for gene in FusionMatrix:
    for cluster in FusionPos[gene][1]:
        tally = cluster[0]
        total = sum(tally.values())
        # First key with the highest count, in dictionary order.
        best = max(tally, key=tally.__getitem__) if tally else None
        if best is None or not (total > 0 and int(tally[best]) / total >= 0.4):
            cluster.append([-1, -1])
            continue
        fields = best.split(',')
        pos0 = int(fields[0])
        pos1 = int(fields[1])
        mean_left = cluster[1][0]
        mean_right = cluster[1][1]
        # Orient the pair so that it lies closest to the cluster's stored mean positions.
        straight = abs(pos0 - mean_left) + abs(pos1 - mean_right)
        swapped = abs(pos1 - mean_left) + abs(pos0 - mean_right)
        if straight < swapped:
            cluster.append([pos0, pos1])
        else:
            cluster.append([pos1, pos0])
# Build one output line per breakpoint cluster that received a representative
# position: gene1, gene2, score, chromosome1, chromosome2, position1, position2,
# cluster index, number of supporting cells and, with clustering enabled, a label
# describing which cell clusters carry the fusion.
templines = []
for gene in FusionMatrix:
    for i in range(len(FusionPos[gene][1])):
        if FusionPos[gene][1][i][4][0] > -0.5:
            genepart = gene.split('\t')
            templine = genepart[0] + '\t' + genepart[1] + '\t' + str(
                FusionMatrix[gene][i]) + '\t' + FusionPos[gene][0][0] + '\t' + FusionPos[gene][0][1] + '\t' \
                       + str(FusionPos[gene][1][i][4][0]) + '\t' + str(FusionPos[gene][1][i][4][1]) + '\t' + str(i) \
                       + '\t' + str(FusionPos[gene][1][i][3])
            if EnableClustering:
                CluRes = ''
                EffClu = []
                cellset = FusionPos[gene][1][i][2]
                # Count supporting cells per cluster for THIS fusion only; a shared
                # counter would carry values over from the previously handled fusion.
                fusioncount = dict.fromkeys(ClusterSize, 0)
                for cell in cellset:
                    # Cells are stored as integer indices, while the clustering file
                    # yields string keys.
                    cellcluster = Clusters.get(str(cell), Clusters.get(cell))
                    if cellcluster in fusioncount:
                        fusioncount[cellcluster] += 1
                # Fraction of each cluster's cells that support the fusion.
                fraction = {}
                cclist = []
                for key in fusioncount:
                    fraction[key] = fusioncount[key] / float(ClusterSize[key])
                    cclist.append(fraction[key])
                if len(cclist) == 0 or max(cclist) < 0.5:
                    CluRes = "No_Spec_Clu"
                else:
                    for key in fraction:
                        if fraction[key] > 0.5 * max(cclist):
                            # Drop any line terminator still attached to the label.
                            EffClu.append(key.rstrip('\r\n'))
                    if len(EffClu) == 1:
                        CluRes = EffClu[0] + "_Spec"
                    else:
                        CluRes = "Share"
                        for item in EffClu:
                            CluRes += '_' + item
                templine += '\t' + CluRes + '\n'
            else:
                templine += '\n'
            templines.append(templine)
# Emit the assembled lines ordered by their score (third column), highest first.
# Identical lines collapse into one because they are used as dictionary keys.
linedic = {}
for text in templines:
    linedic[text] = float(text.split('\t')[2])
ranked = sorted(linedic.items(), key=lambda pair: pair[1], reverse=True)
for text, _ in ranked:
    print(text, end='')
390 | 


--------------------------------------------------------------------------------
/bin/GetExonPos.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import division
 3 | import sys
 4 | import os.path
 5 | 
 6 | 
 7 | # ***** readme *****
 8 | # 输入ref_annot文件，输出一个列表，是基因的位置
 9 | 
10 | reffile = open(sys.argv[1])
11 | genenameset = []
12 | for line in reffile.readlines():
13 |     if line.startswith('#'):
14 |         continue
15 |     info = line.split('\t')
16 |     if info[2] == 'gene':
17 |         chromo = info[0]
18 |         start = info[3]
19 |         end = info[4]
20 |         geneinfo = info[8].split('; ')
21 |         genename = ''
22 |         for item in geneinfo:
23 |             if item.find('gene_name') > -1:
24 |                 genename = item[11:-1]
25 |         if genename == '':
26 |             for item in geneinfo:
27 |                 if item.find('gene_id') > -1:
28 |                     genename = item[9:-1]
29 |         if genename == '':
30 |             continue
31 |         if genename not in genenameset:
32 |             print(chromo + '\t' + start + '\t' + end + '\t' + genename)
33 | 


--------------------------------------------------------------------------------
/bin/GetGenePos.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import division
 3 | import sys
 4 | import os.path
 5 | 
 6 | 
 7 | # ***** readme *****
 8 | # 输入ref_annot文件，输出一个列表，是基因的位置
 9 | 
10 | reffile = open(sys.argv[1])
11 | genenameset = []
12 | for line in reffile.readlines():
13 |     if line.startswith('#'):
14 |         continue
15 |     info = line.split('\t')
16 |     if info[2] == 'gene':
17 |         chromo = info[0]
18 |         start = info[3]
19 |         end = info[4]
20 |         geneinfo = info[8].split('; ')
21 |         genename = ''
22 |         for item in geneinfo:
23 |             if item.find('gene_name') > -1:
24 |                 genename = item[11:-1]
25 |             if item.find('gene_type') > -1:
26 |                 genetype = item[11:-1]
27 |             if item.find('gene_biotype') > -1:
28 |                 genetype = item[14:-1]
29 |         if genename == '':
30 |             for item in geneinfo:
31 |                 if item.find('gene_id') > -1:
32 |                     genename = item[9:-1]
33 |         if genename == '':
34 |             continue
35 |         if genename not in genenameset:
36 |             print(genename + '\t' + chromo + '\t' + start + '\t' + end + '\t' + genetype)
37 | 


--------------------------------------------------------------------------------
/bin/Model1.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Mon Sep  2 21:38:29 2019
  4 | 
  5 | @author: BioMed-X
  6 | """
  7 | 
  8 | 
  9 | from keras.models import Sequential,Model
 10 | from keras.layers import Embedding,Dropout,Bidirectional,Flatten,Dense,LSTM,TimeDistributed, Activation,Input,merge,concatenate
 11 | from keras.callbacks import ModelCheckpoint,CSVLogger
 12 | from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
 13 | from tensorflow.keras.optimizers import Adam
 14 | import numpy as np
 15 | from tensorflow.keras.utils import to_categorical
 16 | import os
 17 | 
 18 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 19 | 
 20 | def Cla_LSTM():
 21 | # 搭模型
 22 |     INPUT1 = Input(shape=(61,))
 23 |     # INPUT2 = Input(shape=(60,1))
 24 |     INPUT1_enco = Embedding(5,5,input_length=61)(INPUT1)
 25 | 
 26 |     # MERGE = merge((INPUT1_enco, INPUT2),mode='concat',concat_axis=-1)
 27 |     # MERGE = concatenate([INPUT1,INPUT2])
 28 |     LSTM1 = Bidirectional(LSTM(32,return_sequences=True),merge_mode='concat')(INPUT1_enco)
 29 |     DROP1 = Dropout(0.5)(LSTM1)
 30 |     
 31 |     LSTM2 = Bidirectional(LSTM(64,return_sequences=True),merge_mode='concat')(DROP1)
 32 |     DROP2 = Dropout(0.5)(LSTM2)
 33 |     
 34 |     LSTM3 = Bidirectional(LSTM(128,return_sequences=True),merge_mode='concat')(DROP2)
 35 |     DROP3 = Dropout(0.5)(LSTM3)
 36 |     
 37 |     '''
 38 |     LSTM4 = Bidirectional(LSTM(64,return_sequences=True),merge_mode='concat')(DROP3)
 39 |     DROP4 = Dropout(0.5)(LSTM4)    
 40 |     
 41 |     LSTM5 = Bidirectional(LSTM(128,return_sequences=True),merge_mode='concat')(DROP4)
 42 |     DROP5 = Dropout(0.5)(LSTM5) 
 43 | 
 44 |     LSTM6 = Bidirectional(LSTM(128,return_sequences=True),merge_mode='concat')(DROP5)
 45 |     DROP6 = Dropout(0.5)(LSTM6) 
 46 | 
 47 |     LSTM7 = Bidirectional(LSTM(36,return_sequences=True),merge_mode='concat')(DROP6)
 48 |     DROP7 = Dropout(0.5)(LSTM7) 
 49 |     '''
 50 |     LSTM8 = Bidirectional(LSTM(256, return_sequences=False), merge_mode='concat')(DROP3)
 51 |     DENSE1 = Dense(256)(LSTM8)
 52 |     DENSE2 = Dense(2)(DENSE1)
 53 |     
 54 |     ACT1 = Activation('softmax')(DENSE2)
 55 |     # model = Model(inputs=[INPUT1,INPUT2],outputs= ACT1)
 56 |     model = Model(inputs=INPUT1,outputs= ACT1)
 57 |     return model
 58 | 
 59 | if __name__ == '__main__':
 60 |     np.random.seed(1122)
 61 |     Good_for_Tra = np.load('Good_for_Tra.npy')
 62 |     Simu_for_Tra = np.load('Simu_for_Tra.npy')
 63 |     Good_for_Tst = np.load('Good_for_Tst.npy')
 64 |     Simu_for_Tst = np.load('Simu_for_Tst.npy')
 65 |     Tra_x = np.squeeze(np.concatenate((Good_for_Tra,Simu_for_Tra),axis=0))
 66 |     Tra_y = np.concatenate( (np.zeros((Good_for_Tra.shape[0],1)),np.ones((Simu_for_Tra.shape[0],1))), axis=0)
 67 |     Tst_x = np.squeeze(np.concatenate((Good_for_Tst,Simu_for_Tst),axis=0))
 68 |     Tst_y = np.concatenate((np.zeros((Good_for_Tst.shape[0], 1)), np.ones((Simu_for_Tst.shape[0], 1))), axis=0)
 69 |     Tra_y = to_categorical(Tra_y)
 70 |     Tst_y = to_categorical(Tst_y)
 71 | 
 72 |     LIST = list(range(Tra_x.shape[0]))
 73 |     np.random.shuffle(LIST)
 74 | 
 75 |     Tra_x = Tra_x[LIST,:]
 76 |     Tra_y = Tra_y[LIST,:]
 77 |     # Tra_x_input1 = Tra_x[..., 0]
 78 |     # Tra_x_input2 = Tra_x[..., 1][...,np.newaxis]
 79 | 
 80 |     LIST = list(range(Tst_x.shape[0]))
 81 |     np.random.shuffle(LIST)
 82 |     Tst_x = Tst_x[LIST,:]
 83 |     Tst_y = Tst_y[LIST,:]
 84 | 
 85 |     model = Cla_LSTM()
 86 |     model.load_weights('weight-010.hdf5')
 87 | 
 88 |     ADAM = Adam(learning_rate=0.001)
 89 |     model_checkpoint = ModelCheckpoint(filepath='weight-{epoch:03d}.hdf5', verbose=1, monitor='val_loss', save_best_only=True)
 90 | 
 91 |     model.compile(loss='binary_crossentropy', optimizer=ADAM, metrics=['accuracy'])
 92 |     csv_loger=CSVLogger('log.csv',append=True,separator=';')
 93 | 
 94 |     # 训练模型
 95 |     batch_size = 2500
 96 |     epochs = 200
 97 |     
 98 |     # model.fit(x=[Tra_x_input1,Tra_x_input2], y=Tra_y,batch_size=batch_size,epochs=epochs, verbose=1 ,callbacks=[model_checkpoint,csv_loger], validation_split=0.25, shuffle=True)
 99 |     model.fit(x=Tra_x, y=Tra_y,batch_size=batch_size,epochs=epochs, initial_epoch= 11,validation_data=(Tst_x, Tst_y),verbose=1 ,callbacks=[model_checkpoint,csv_loger])
100 | 
101 | 
102 | # 
103 | 
104 | '''
105 | model.fit(x_train, y_train, epochs=20, batch_size=128)
106 | score = model.evaluate(x_test, y_test, batch_size=128)
107 | 
108 | 
109 | model.add(Conv1D(filters=2, kernel_size=1 , activation='relu', name='conv1_1'))
110 | 
111 | model.add(Conv1D(filters=64, kernel_size=5 , activation='relu', name='conv1_1'))
112 | model.add(MaxPooling1D(2))
113 | 
114 | model.add(Conv1D(filters=64, kernel_size=5 , activation='relu', name='conv1_1'))
115 | 
116 | model.add(Conv1D(filters=2, kernel_size=2 , activation='relu', name='conv1_1'))
117 | 
118 | model.add(Bidirectional(LSTM(20,return_sequences=True),merge_mode='concat'))
119 | 
120 | model.summary()
121 | 
122 | model.add(Activation('softmax'));
123 | '''
124 | 
125 |     


--------------------------------------------------------------------------------
/bin/Model1_Retrain.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Mon Sep  2 21:38:29 2019
  4 | 
  5 | @author: BioMed-X
  6 | """
  7 | 
  8 | 
  9 | from keras.models import Sequential,Model
 10 | from keras.layers import Embedding,Dropout,Bidirectional,Flatten,Dense,LSTM,TimeDistributed, Activation,Input,merge,concatenate
 11 | from keras.callbacks import ModelCheckpoint,CSVLogger
 12 | from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
 13 | from tensorflow.keras.optimizers import Adam
 14 | import numpy as np
 15 | from tensorflow.keras.utils import to_categorical
 16 | import os
 17 | import sys
 18 | import random
 19 | import tensorflow as tf
 20 | 
 21 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 22 | 
 23 | def Cla_LSTM():
 24 | # 搭模型
 25 |     INPUT1 = Input(shape=(61,))
 26 |     # INPUT2 = Input(shape=(60,1))
 27 |     INPUT1_enco = Embedding(5,5,input_length=61)(INPUT1)
 28 | 
 29 |     # MERGE = merge((INPUT1_enco, INPUT2),mode='concat',concat_axis=-1)
 30 |     # MERGE = concatenate([INPUT1,INPUT2])
 31 |     LSTM1 = Bidirectional(LSTM(32,return_sequences=True),merge_mode='concat')(INPUT1_enco)
 32 |     DROP1 = Dropout(0.5)(LSTM1)
 33 |     
 34 |     LSTM2 = Bidirectional(LSTM(64,return_sequences=True),merge_mode='concat')(DROP1)
 35 |     DROP2 = Dropout(0.5)(LSTM2)
 36 |     
 37 |     LSTM3 = Bidirectional(LSTM(128,return_sequences=True),merge_mode='concat')(DROP2)
 38 |     DROP3 = Dropout(0.5)(LSTM3)
 39 |     
 40 |     '''
 41 |     LSTM4 = Bidirectional(LSTM(64,return_sequences=True),merge_mode='concat')(DROP3)
 42 |     DROP4 = Dropout(0.5)(LSTM4)    
 43 |     
 44 |     LSTM5 = Bidirectional(LSTM(128,return_sequences=True),merge_mode='concat')(DROP4)
 45 |     DROP5 = Dropout(0.5)(LSTM5) 
 46 | 
 47 |     LSTM6 = Bidirectional(LSTM(128,return_sequences=True),merge_mode='concat')(DROP5)
 48 |     DROP6 = Dropout(0.5)(LSTM6) 
 49 | 
 50 |     LSTM7 = Bidirectional(LSTM(36,return_sequences=True),merge_mode='concat')(DROP6)
 51 |     DROP7 = Dropout(0.5)(LSTM7) 
 52 |     '''
 53 |     LSTM8 = Bidirectional(LSTM(256, return_sequences=False), merge_mode='concat')(DROP3)
 54 |     DENSE1 = Dense(256)(LSTM8)
 55 |     DENSE2 = Dense(2)(DENSE1)
 56 |     
 57 |     ACT1 = Activation('softmax')(DENSE2)
 58 |     # model = Model(inputs=[INPUT1,INPUT2],outputs= ACT1)
 59 |     model = Model(inputs=INPUT1,outputs= ACT1)
 60 |     #model.summary()
 61 |     return model
 62 | 
 63 | if __name__ == '__main__':
 64 |     np.random.seed(1122)
 65 |     random.seed(1122)
 66 |     tf.random.set_seed(1122)
 67 |     npydir = sys.argv[1]
 68 |     weightfile = sys.argv[2]
 69 |     epochoutdir = sys.argv[3]
 70 |     itere = int(sys.argv[4])
 71 |     Good_for_Tra = np.load(npydir + '/Good_for_Tra.npy')
 72 |     Simu_for_Tra = np.load(npydir + '/Simu_for_Tra.npy')
 73 |     Good_for_Tst = np.load(npydir + '/Good_for_Tst.npy')
 74 |     Simu_for_Tst = np.load(npydir + '/Simu_for_Tst.npy')
 75 |     Tra_x = np.squeeze(np.concatenate((Good_for_Tra,Simu_for_Tra),axis=0))
 76 |     Tra_y = np.concatenate( (np.zeros((Good_for_Tra.shape[0],1)),np.ones((Simu_for_Tra.shape[0],1))), axis=0)
 77 |     Tst_x = np.squeeze(np.concatenate((Good_for_Tst,Simu_for_Tst),axis=0))
 78 |     Tst_y = np.concatenate((np.zeros((Good_for_Tst.shape[0], 1)), np.ones((Simu_for_Tst.shape[0], 1))), axis=0)
 79 |     Tra_y = to_categorical(Tra_y)
 80 |     Tst_y = to_categorical(Tst_y)
 81 | 
 82 |     LIST = list(range(Tra_x.shape[0]))
 83 |     np.random.shuffle(LIST)
 84 | 
 85 |     Tra_x = Tra_x[LIST,:]
 86 |     Tra_y = Tra_y[LIST,:]
 87 |     # Tra_x_input1 = Tra_x[..., 0]
 88 |     # Tra_x_input2 = Tra_x[..., 1][...,np.newaxis]
 89 | 
 90 |     LIST = list(range(Tst_x.shape[0]))
 91 |     np.random.shuffle(LIST)
 92 |     Tst_x = Tst_x[LIST,:]
 93 |     Tst_y = Tst_y[LIST,:]
 94 | 
 95 |     model = Cla_LSTM()
 96 |     model.load_weights(weightfile)
 97 | 
 98 |     ADAM = Adam(learning_rate=0.0001)
 99 |     model_checkpoint = ModelCheckpoint(filepath=epochoutdir + '/RetrainWeight-{epoch:03d}.hdf5', verbose=1, monitor='val_loss', save_best_only=True)
100 |     model_checkpoint2 = ModelCheckpoint(filepath=epochoutdir + '/RetrainWeight.hdf5', verbose=1, monitor='val_loss', save_best_only=True)
101 |     model.compile(loss='binary_crossentropy', optimizer=ADAM, metrics=['accuracy'])
102 |     csv_loger=CSVLogger('log.csv',append=True,separator=';')
103 | 
104 |     # 训练模型
105 |     batch_size = 500
106 |     epochs = itere
107 |     np.random.seed(1122)
108 |     # model.fit(x=[Tra_x_input1,Tra_x_input2], y=Tra_y,batch_size=batch_size,epochs=epochs, verbose=1 ,callbacks=[model_checkpoint,csv_loger], validation_split=0.25, shuffle=True)
109 |     model.fit(x=Tra_x, y=Tra_y,batch_size=batch_size,epochs=epochs,validation_data=(Tst_x, Tst_y),verbose=1 ,callbacks=[model_checkpoint,model_checkpoint2, csv_loger], shuffle=False)
110 | 
111 | 
112 | # 
113 | 
114 | '''
115 | model.fit(x_train, y_train, epochs=20, batch_size=128)
116 | score = model.evaluate(x_test, y_test, batch_size=128)
117 | 
118 | 
119 | model.add(Conv1D(filters=2, kernel_size=1 , activation='relu', name='conv1_1'))
120 | 
121 | model.add(Conv1D(filters=64, kernel_size=5 , activation='relu', name='conv1_1'))
122 | model.add(MaxPooling1D(2))
123 | 
124 | model.add(Conv1D(filters=64, kernel_size=5 , activation='relu', name='conv1_1'))
125 | 
126 | model.add(Conv1D(filters=2, kernel_size=2 , activation='relu', name='conv1_1'))
127 | 
128 | model.add(Bidirectional(LSTM(20,return_sequences=True),merge_mode='concat'))
129 | 
130 | model.summary()
131 | 
132 | model.add(Activation('softmax'));
133 | '''
134 | 
135 |     


--------------------------------------------------------------------------------
/bin/MyPredict.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Huang Wenjian
 4 | """
 5 | 
 6 | from keras.models import Sequential
 7 | from keras.layers import Embedding,Dropout,Bidirectional,Flatten,Dense,LSTM,TimeDistributed, Activation
 8 | from keras.callbacks import ModelCheckpoint,CSVLogger
 9 | from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
10 | from tensorflow.keras.optimizers import Adam
11 | import numpy as np
12 | from tensorflow.keras.utils import to_categorical
13 | from Model1 import Cla_LSTM
14 | import os
15 | import sys
16 | 
17 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def drop_channel_axis(reads):
    """Remove only the trailing length-1 axis of an encoded read array.

    A ``(N, L, 1)`` array becomes ``(N, L)``.  Unlike a bare ``np.squeeze``
    the leading batch axis is kept even when ``N == 1``, so a file holding a
    single read still yields a 2-D batch.  Arrays that are already 2-D are
    returned unchanged; any other shape falls back to ``np.squeeze``.
    """
    if reads.ndim == 3 and reads.shape[-1] == 1:
        return reads[:, :, 0]
    if reads.ndim == 2:
        return reads
    return np.squeeze(reads)


def main():
    """Score encoded reads and their reverse complements with the trained model.

    Command line: ``MyPredict.py <outfile> <weightfile> [prefix]``.

    ``<prefix>Reads.npy`` and ``<prefix>Reads_rev.npy`` are loaded from the
    directory part of ``<outfile>``.  For every read one line is written to
    ``<outfile>``: column 0 of the model output for the forward read, the
    same for the reverse-complement read, and the mean of the two, separated
    by tabs.  The output file is only created after prediction succeeded;
    when there are no reads an empty file is written and the model is not
    loaded.
    """
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    np.random.seed(1122)
    outpath = sys.argv[1]
    weightfile = sys.argv[2]
    prefix = ''
    if len(sys.argv) == 4:
        prefix = sys.argv[3]

    # Everything up to and including the last '/' of the output path.
    filedir = outpath[:outpath.rfind('/') + 1]

    Tst_x = drop_channel_axis(np.load(filedir + prefix + 'Reads.npy'))
    Tst_x_rev = drop_channel_axis(np.load(filedir + prefix + 'Reads_rev.npy'))

    if Tst_x.shape[0] == 0:
        # Nothing to predict: leave an empty result file behind.
        open(outpath, 'w').close()
        return

    model = Cla_LSTM()
    model.load_weights(weightfile)

    batch_size = 500

    Prob = model.predict(Tst_x, batch_size)
    Prob_rev = model.predict(Tst_x_rev, batch_size)
    AveProb = (Prob[:, 0] + Prob_rev[:, 0]) / 2

    with open(outpath, 'w') as outfile:
        for i in range(len(AveProb)):
            outfile.write(str(Prob[i, 0]) + '\t' + str(Prob_rev[i, 0]) + '\t' + str(AveProb[i]) + '\n')


if __name__ == '__main__':
    main()
46 | 


--------------------------------------------------------------------------------
/bin/PreProcessing_SingleFile.py:
--------------------------------------------------------------------------------
 1 | from keras.models import Sequential
 2 | from keras.layers import Embedding,Dropout,Bidirectional,Flatten,Dense,LSTM,TimeDistributed
 3 | from keras.callbacks import ModelCheckpoint,CSVLogger
 4 | from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
 5 | import numpy as np
 6 | import sys
 7 | ########################################################################################################################
 8 | def ReverseComplement(str):
 9 |     return str[::-1].replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'g').upper()
10 | 
11 | 
12 | 
# Length of an encoded read: the read itself plus the inserted 'H' marker.
ENCODED_LENGTH = 61
# Integer code of every symbol that may appear in a marked read.
BASE_CODES = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'H': 4}


def insert_junction_marker(read, merge_point):
    """Return ``read`` with the letter 'H' inserted at index ``merge_point``."""
    return read[0:merge_point] + 'H' + read[merge_point:]


def encode_reads(reads):
    """Encode marked reads as a float array of shape ``(len(reads), 61, 1)``.

    Letters are mapped case-insensitively with A=0, T=1, C=2, G=3, H=4.

    Raises:
        ValueError: if a read is not exactly 61 characters long or contains a
            character without a code.
    """
    data = np.zeros((len(reads), ENCODED_LENGTH, 1), dtype=float)
    for index, read in enumerate(reads):
        if len(read) != ENCODED_LENGTH:
            raise ValueError('Read %d has length %d, expected %d: %s'
                             % (index, len(read), ENCODED_LENGTH, read))
        try:
            data[index, :, 0] = [BASE_CODES[base] for base in read.upper()]
        except KeyError as err:
            raise ValueError('Read %d contains unsupported character %s: %s'
                             % (index, err, read))
    return data


def main():
    """Encode the chimeric reads of one file for the classifier.

    Command line: ``PreProcessing_SingleFile.py <readfile> [outprefix]``.

    Each input line holds a read and its merge point separated by a tab;
    lines with fewer than two columns are skipped.  The marker 'H' is
    inserted at the merge point and reads containing 'N' (in either case)
    are dropped.  The encoded reads and their encoded reverse complements
    are saved as ``<outprefix>Reads.npy`` and ``<outprefix>Reads_rev.npy``
    in the directory of the input file.
    """
    np.random.seed(1122)
    readfile = sys.argv[1]
    # Everything up to and including the last '/' of the input path.
    filedir = readfile[:readfile.rfind('/') + 1]
    outprefix = ''
    if len(sys.argv) == 3:
        outprefix = sys.argv[2]

    ChimericRead = []
    ChimericRead_rev = []
    with open(readfile, 'r') as f:
        for readinfo in f.read().split('\n'):
            readinfo_split = readinfo.split('\t')
            if len(readinfo_split) <= 1:
                continue
            read_new = insert_junction_marker(readinfo_split[0], int(readinfo_split[1]))
            # Upper-case before testing so that a lower-case 'n' is filtered
            # here instead of failing later during encoding.
            if 'N' not in read_new.upper():
                ChimericRead.append(read_new)
                ChimericRead_rev.append(ReverseComplement(read_new))

    np.save(filedir + outprefix + 'Reads.npy', encode_reads(ChimericRead))
    np.save(filedir + outprefix + 'Reads_rev.npy', encode_reads(ChimericRead_rev))


if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/bin/RenameFastqFiles.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import division
 3 | import sys
 4 | import math
 5 | import os
 6 | import numpy
 7 | 
 8 | # ***** readme *****
 9 | # 这个代码将所有的fastq文件重命名为数字系列，并且保存映射表
10 | # 也可以反过来操作
11 | 
12 | 
13 | folder = sys.argv[1]
14 | 
15 | mapfilepath = ''
16 | 
17 | if len(sys.argv) > 2:
18 |     mapfilepath = sys.argv[2]
19 | 
20 | mapdict = {}
21 | if mapfilepath == '':
22 |     if os.path.exists(folder + '/RenameList.txt'):
23 |         print('The input files may have already renamed, since RenameList.txt is detected in ' + folder + '. To force rename files, please remove RenameList.txt')
24 |         exit(1)
25 |     count = 0
26 |     for filename in os.listdir(folder):
27 |         if filename.endswith('_1.fastq'):
28 |             count += 1
29 |             os.rename(folder + '/' + filename, folder + '/' + str(count) + '_1.fastq')
30 |             mapdict[count] = filename[:-8]
31 |             try:
32 |                 os.rename(folder + '/' + filename[:-7] + '2.fastq', folder + '/' + str(count) + '_2.fastq')
33 |             except Exception as e:
34 |                 sys.stderr.write(str(e) + ' Rename the pair: ' + filename[:-7] + '_2.fastq failed.\n')
35 |         if filename.endswith('_1.fastq.gz'):
36 |             count += 1
37 |             os.rename(folder + '/' + filename, folder + '/' + str(count) + '_1.fastq.gz')
38 |             mapdict[count] = filename[:-11]
39 |             try:
40 |                 os.rename(folder + '/' + filename[:-10] + '2.fastq.gz', folder + '/' + str(count) + '_2.fastq.gz')
41 |             except Exception as e:
42 |                 sys.stderr.write(str(e) + ' Rename the pair: ' + filename[:-10] + '_2.fastq.gz failed.\n')
43 |     outfile = open(folder + '/RenameList.txt', 'w')
44 |     for key in mapdict:
45 |         outfile.write(str(key) + '\t' + mapdict[key] + '\n')
46 |     outfile.close()
47 | else:
48 |     if not os.path.exists(folder + '/RenameList.txt'):
49 |         print('RenameList.txt cannot be found in ' + folder + ', please check it')
50 |         exit(1)
51 |     mapfile = open(mapfilepath)
52 |     for line in mapfile.readlines():
53 |         info = line.rstrip().split('\t')
54 |         mapdict[int(info[0])] = info[1]
55 |         for key in mapdict:
56 |             if os.path.exists(folder + '/' + str(key) + '_1.fastq'):
57 |                 os.rename(folder + '/' + str(key) + '_1.fastq', folder + '/' + str(mapdict[key]) + '_1.fastq')
58 |             if os.path.exists(folder + '/' + str(key) + '_2.fastq'):
59 |                 os.rename(folder + '/' + str(key) + '_2.fastq', folder + '/' + str(mapdict[key]) + '_2.fastq')
60 |             if os.path.exists(folder + '/' + str(key) + '_1.fastq.gz'):
61 |                 os.rename(folder + '/' + str(key) + '_1.fastq.gz', folder + '/' + str(mapdict[key]) + '_1.fastq.gz')
62 |             if os.path.exists(folder + '/' + str(key) + '_2.fastq.gz'):
63 |                 os.rename(folder + '/' + str(key) + '_2.fastq.gz', folder + '/' + str(mapdict[key]) + '_2.fastq.gz')


--------------------------------------------------------------------------------
/bin/ResultFinalOutput.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | import sys
  4 | 
  5 | # ***** readme *****
  6 | # 添加信息： read，direct，insertsize, anchor
  7 | 
  8 | infile = open(sys.argv[1])
  9 | refannotfile = open(sys.argv[2])
 10 | ChimericOutDir = sys.argv[3]
 11 | OutPrefix = sys.argv[4]
 12 | #nodetailcell = sys.argv[5]
 13 | nodetailcell = 0
 14 | outfull = open(OutPrefix + '.full.txt', 'w')
 15 | outabridge = open(OutPrefix + '.abridged.txt', 'w')
 16 | genestrand = {}
 17 | genecount = {}
 18 | repeatcount = {}
 19 | posrecord = []
 20 | for line in refannotfile.readlines():
 21 |     if line.startswith('#'):
 22 |         continue
 23 |     info = line.split('\t')
 24 |     if info[2] == 'exon':
 25 |         chromo = info[0]
 26 |         start = int(info[3])
 27 |         end = int(info[4])
 28 |         strand = info[6]
 29 |         geneinfo = info[8].split('; ')
 30 |         genename = ''
 31 |         for item in geneinfo:
 32 |             if item.startswith('gene_name'):
 33 |                 genename = item[11:-1]
 34 |             if item.startswith('gene_type'):
 35 |                 genetype = item[11:-1]
 36 |             if item.startswith('gene_biotype'):
 37 |                 genetype = item[14:-1]
 38 |         if genename == '':
 39 |             for item in geneinfo:
 40 |                 if item.find('gene_id') > -1:
 41 |                     genename = item[9:-1]
 42 |         if genename == '':
 43 |             continue
 44 |         genestrand[genename] = strand
 45 | refannotfile.close()
 46 | outabridge.write(
 47 |     '#Fusion_id\tFusionGene\tPosition1\tPosition2\tstrands\tdirections\tSupportingCellNumber\tTotalSplitRead\tTotalDiscordantRead\tFakeProbability\tFDR\n')
 48 | outfull.write(
 49 |     '#Fusion_id\tFusionGene\tPosition1\tPosition2\tstrands\tdirections\tSupportingCellNumber\tSupportingCells\tTotalSplitRead\tSplitReads\tTotalDiscordantRead\tDiscordantReads\tFakeProbability\tFDR\n')
 50 | currentid = 0
 51 | lines = infile.readlines()
 52 | uselines = []
 53 | finaluselines = []
 54 | for line in lines:
 55 |     if line.startswith('#'):
 56 |         continue
 57 |     resinfo = line.rstrip('\n').split('\t')
 58 |     fusiongene1 = resinfo[0].split('--')[0]
 59 |     fusiongene2 = resinfo[0].split('--')[1]
 60 |     fusionpos1 = int(resinfo[4].split(':')[1])
 61 |     fusionpos2 = int(resinfo[5].split(':')[1])
 62 |     '''
 63 |     found = False
 64 |     for item in posrecord:
 65 |         if abs(fusionpos1 - item[0]) == abs(fusionpos2 - item[1]) or abs(fusionpos2 - item[0]) == abs(
 66 |                 fusionpos1 - item[1]) or fusionpos2 + item[1] == fusionpos1 + item[0] or fusionpos1 + item[
 67 |             1] == fusionpos2 + item[0]:
 68 |             found = True
 69 |             break
 70 |     if not found:
 71 |         posrecord.append([fusionpos1, fusionpos2])
 72 |         uselines.append(line)
 73 |         if fusiongene1 not in genecount:
 74 |             genecount[fusiongene1] = 0
 75 |         genecount[fusiongene1] += 1
 76 |         if fusiongene2 not in genecount:
 77 |             genecount[fusiongene2] = 0
 78 |         genecount[fusiongene2] += 1
 79 |     '''
 80 |     if fusiongene1 not in repeatcount:
 81 |         repeatcount[fusiongene1] = {}
 82 |     if fusiongene2 not in repeatcount:
 83 |         repeatcount[fusiongene2] = {}
 84 |     if fusionpos1 not in repeatcount[fusiongene1]:
 85 |         repeatcount[fusiongene1][fusionpos1] = []
 86 |     if fusionpos2 not in repeatcount[fusiongene2]:
 87 |         repeatcount[fusiongene2][fusionpos2] = []
 88 |     if fusiongene2 not in repeatcount[fusiongene1][fusionpos1]:
 89 |         repeatcount[fusiongene1][fusionpos1].append(fusiongene2)
 90 |     if fusiongene1 not in repeatcount[fusiongene2][fusionpos2]:
 91 |         repeatcount[fusiongene2][fusionpos2].append(fusiongene1)
 92 | '''
 93 | for line in uselines:
 94 |     if line.startswith('#'):
 95 |         continue
 96 |     resinfo = line.rstrip('\n').split('\t')
 97 |     fusiongene1 = resinfo[0].split('--')[0]
 98 |     fusiongene2 = resinfo[0].split('--')[1]
 99 |     if genecount[fusiongene1] >= 5 or genecount[fusiongene2] >= 5:
100 |         continue
101 |     finaluselines.append(line)
102 | '''
103 | for line in lines:
104 |     if line.startswith('#'):
105 |         continue
106 |     resinfo = line.rstrip('\n').split('\t')
107 |     cellsup = resinfo[-1].split(', ')[:-1]
108 |     fusiongene1 = resinfo[0].split('--')[0]
109 |     fusiongene2 = resinfo[0].split('--')[1]
110 |     fusionpos1 = int(resinfo[4].split(':')[1])
111 |     fusionpos2 = int(resinfo[5].split(':')[1])
112 |     direct1 = resinfo[9]
113 |     direct2 = resinfo[10]
114 |     try:
115 |         strand1 = genestrand[fusiongene1]
116 |         if direct1 == strand1:
117 |             direct1 = 'd'
118 |         else:
119 |             direct1 = 'u'
120 |     except KeyError:
121 |         strand1 = 'N/A'
122 |         direct1 = 'N/A'
123 |     try:
124 |         strand2 = genestrand[fusiongene2]
125 |         if direct2 == strand2:
126 |             direct2 = 'd'
127 |         else:
128 |             direct2 = 'u'
129 |     except KeyError:
130 |         strand2 = 'N/A'
131 |         direct2 = 'N/A'
132 |     anchor = []
133 |     insertsize = []
134 |     discordant = []
135 |     splitreadnum = []
136 |     totalsplitread = 0
137 |     totaldiscordant = 0
138 |     if len(repeatcount[fusiongene1][fusionpos1]) >= 5 or len(repeatcount[fusiongene2][fusionpos2]) >= 5:
139 |         continue
140 |     if nodetailcell != '1':
141 |         for cellsupitem in cellsup:
142 |             if cellsupitem == '' or cellsupitem == ' ':
143 |                 continue
144 |             try:
145 |                 fsfile = open(ChimericOutDir + '/' + cellsupitem + '_FusionSupport.txt')
146 |                 for fsline in fsfile.readlines():
147 |                     info = fsline.rstrip().split('\t')
148 |                     gene1 = info[0]
149 |                     gene2 = info[1]
150 |                     if gene1 == fusiongene1 and gene2 == fusiongene2 or gene1 == fusiongene2 and gene2 == fusiongene1:
151 |                         discordant.append(int(info[2]))
152 |                         if len(info) < 7:
153 |                             splitsup = []
154 |                         else:
155 |                             splitsup = info[6].split(';')
156 |                         splitcount = 0
157 |                         for item in splitsup:
158 |                             if item.find('+') == -1:
159 |                                 continue
160 |                             thispos1 = int(item.split('+')[0].split(',')[0])
161 |                             thispos2 = int(item.split('+')[0].split(',')[1])
162 |                             if abs(thispos2 - fusionpos2) + abs(thispos1 - fusionpos1) < 30 or abs(
163 |                                     thispos2 - fusionpos1) + abs(thispos1 - fusionpos2) < 30:
164 |                                 splitcount += 1
165 |                         splitreadnum.append(splitcount)
166 |                         break
167 |             except IOError:
168 |                 pass
169 |         totalsplitread = sum(splitreadnum)
170 |         totaldiscordant = sum(discordant)
171 |         cellsup = list(map(int, cellsup))
172 |     currentid += 1
173 |     if direct1 == 'd' and direct2 == 'u':
174 |         fgenes = resinfo[0].split('--')
175 |         outabridge.write(str(currentid) + '\t' + fgenes[1] + '--' + fgenes[0] + '\t' + resinfo[5] + '\t' + resinfo[
176 |             4] + '\t' + strand2 + '/' + strand1 + '\t' + direct2 + '/' + direct1 + '\t' +
177 |                          str(len(splitreadnum)) + '\t' + str(totalsplitread) + '\t' + str(totaldiscordant) + '\t' + resinfo[
178 |                              6] + '\t' + resinfo[8] + '\n')
179 |         outfull.write(str(currentid) + '\t' + fgenes[1] + '--' + fgenes[0] + '\t' + resinfo[5] + '\t' + resinfo[
180 |             4] + '\t' + strand2 + '/' + strand1 + '\t' + direct2 + '/' + direct1 + '\t' + str(
181 |             len(splitreadnum)) + '\t' + str(cellsup) + '\t' + str(totalsplitread) + '\t' + str(splitreadnum) + '\t' +
182 |                       str(totaldiscordant) + '\t' + str(discordant) + '\t' + resinfo[6] + '\t' + resinfo[8] + '\n')
183 |     else:
184 |         outabridge.write(str(currentid) + '\t' + resinfo[0] + '\t' + resinfo[4] + '\t' + resinfo[
185 |             5] + '\t' + strand1 + '/' + strand2 + '\t' + direct1 + '/' + direct2 + '\t' +
186 |                          str(len(splitreadnum)) + '\t' + str(totalsplitread) + '\t' + str(totaldiscordant) + '\t' + resinfo[
187 |                              6] + '\t' + resinfo[8] + '\n')
188 |         outfull.write(str(currentid) + '\t' + resinfo[0] + '\t' + resinfo[4] + '\t' + resinfo[
189 |             5] + '\t' + strand1 + '/' + strand2 + '\t' + direct1 + '/' + direct2 + '\t' + str(
190 |             len(splitreadnum)) + '\t' + str(cellsup) + '\t' + str(totalsplitread) + '\t' + str(splitreadnum) + '\t' +
191 |                       str(totaldiscordant) + '\t' + str(discordant) + '\t' + resinfo[6] + '\t' + resinfo[8] + '\n')
192 | infile.close()
193 | outabridge.close()
194 | outfull.close()
195 | 


--------------------------------------------------------------------------------
/bin/ResultLastFiltered.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | import sys
  4 | import os.path
  5 | 
  6 | resultfile = open(sys.argv[1])
  7 | geneposfile = open(sys.argv[2])
  8 | outfilepath = sys.argv[3]
  9 | annot = open(sys.argv[4])
 10 | geneposfile2path = sys.argv[5]
 11 | lncfilter = sys.argv[6]
 12 | nasfilter = sys.argv[7]
 13 | outfile = open(outfilepath, 'w')
 14 | trimmedfile = open(outfilepath + '.fail', 'w')
 15 | totalfile = open(outfilepath + '.total', 'w')
 16 | genedic = {}
 17 | linecount = 0
 18 | exondic = {}
 19 | annotgene = {}
 20 | LINCoption = True
 21 | NoApprovoption = True
 22 | for line in annot.readlines():
 23 |     if line.startswith('#'):
 24 |         continue
 25 |     info = line.split('\t')
 26 |     start = int(info[3])
 27 |     end = int(info[4])
 28 |     geneinfo = info[8].split('; ')
 29 |     genename = ''
 30 |     for item in geneinfo:
 31 |         if item.startswith('gene_name'):
 32 |             genename = item[11:-1]
 33 |             break
 34 |     if genename == '':
 35 |         for item in geneinfo:
 36 |             if item.startswith('gene_id'):
 37 |                 genename = item[9:-1]
 38 |     if info[2] == 'exon':
 39 |         if genename != '':
 40 |             if genename not in exondic:
 41 |                 exondic[genename] = []
 42 |             exondic[genename].append([start, end])
 43 |     if info[2] == 'gene':
 44 |         annotgene[genename] = [[start, end]]
 45 | for key in annotgene:
 46 |     if key not in exondic:
 47 |         exondic[key] = annotgene[key]
 48 | for line in geneposfile.readlines():
 49 |     linecount += 1
 50 |     info = line.rstrip().split('\t')
 51 |     if info[0] not in genedic:
 52 |         genedic[info[0]] = [linecount, info[1], int(info[2]), int(info[3]), info[4]]
 53 | if geneposfile2path != '':
 54 |     geneposfile2 = open(geneposfile2path)
 55 |     for line in geneposfile2.readlines():
 56 |         info = line.rstrip().split('\t')
 57 |         if info[0] not in genedic:
 58 |             genedic[info[0]] = [linecount, info[1], int(info[2]), int(info[3]), info[4]]
 59 |         else:
 60 |             if genedic[info[0]][4] != 'pseudogene' and genedic[info[0]][4] != 'lincRNA':
 61 |                 genedic[info[0]][4] = info[4]
 62 | posdict = {}
 63 | for line in resultfile.readlines():
 64 |     if line.startswith('#'):
 65 |         continue
 66 |     info = line.split('\t')
 67 |     genes = info[0].split('--')
 68 |     suppcells = int(info[1])
 69 |     splitread = int(info[2])
 70 |     discordant = int(info[3])
 71 |     pos1 = int(info[4].split(':')[1])
 72 |     pos2 = int(info[5].split(':')[1])
 73 |     posdict[pos1 + pos2] = 1
 74 |     flag = ['\t', '\t', '\t', '\t', '\t', '\t']
 75 |     if NoApprovoption and ((genes[0].find('RP') > -1 and genes[0].find('-') > -1 and genes[0].find('.') > -1) or \
 76 |             (genes[1].find('RP') > -1 and genes[1].find('-') > -1 and genes[1].find('.') > -1) or \
 77 |             (genes[0].find('A') > -1 and genes[0].find('.') > -1) or \
 78 |             (genes[1].find('A') > -1 and genes[1].find('.') > -1) or \
 79 |             genes[0].find('MT-') > -1 or genes[1].find('MT-') > -1 or \
 80 |             (genes[0].find('CT') > -1 and genes[0].find('-') > -1 and genes[0].find('.') > -1) or \
 81 |             (genes[1].find('CT') > -1 and genes[1].find('-') > -1 and genes[1].find('.') > -1)):
 82 |         flag[0] = '\tRP'
 83 |     if discordant / splitread > 10 or suppcells >= 100 and splitread / suppcells < 1.6:
 84 |         flag[1] = '\tTooManyDiscordant'
 85 |     if genedic[genes[0]][1] == genedic[genes[1]][1] and splitread / suppcells < 5 and (abs(genedic[genes[0]][0] - genedic[genes[1]][0]) < 2 or not (genedic[genes[0]][2] > genedic[genes[1]][3] or genedic[genes[1]][2] > genedic[genes[0]][3])) or genes[0] == genes[1]:
 86 |         flag[2] = '\tOverlap'
 87 |     if genedic[genes[0]][1] == genedic[genes[1]][1] and abs(pos1 - pos2) < 200000 and splitread < 150 and info[0].find('IGH') == -1:
 88 |         flag[2] = '\tOverlap'
 89 |     if LINCoption and (genedic[genes[0]][4] == 'lincRNA' or genedic[genes[1]][4] == 'lincRNA' or genedic[genes[0]][4] == 'lncRNA' or \
 90 |             genedic[genes[1]][4] == 'lncRNA' or genes[0].startswith('LINC') or genes[1].startswith('LINC')):
 91 |         flag[3] = '\tlncRNA'
 92 |     if genedic[genes[0]][4] == 'pseudogene' or genedic[genes[1]][4] == 'pseudogene':
 93 |         flag[4] = '\tpseudogene'
 94 |     gene1exon = False
 95 |     gene2exon = False
 96 |     for item in exondic[genes[0]]:
 97 |         if pos1 - 100 <= item[1] and item[0] <= pos1 + 100:
 98 |             gene1exon = True
 99 |             break
100 |     for item in exondic[genes[1]]:
101 |         if pos2 - 100 <= item[1] and item[0] <= pos2 + 100:
102 |             gene2exon = True
103 |             break
104 |     if not gene1exon or not gene2exon:
105 |         flag[5] = '\tintron'
106 |     if (flag[0] == '\t' or nasfilter == '1') and flag[1] == '\t' and flag[2] == '\t' and (flag[3] == '\t' or lncfilter == '1') and flag[4] == '\t' and flag[5] == '\t':
107 |         outfile.write(line)
108 |     else:
109 |         trimmedfile.write(line.rstrip() + flag[0] + flag[1] + flag[2] + flag[3] + flag[4] + flag[5] + '\n')
110 |     totalfile.write(line.rstrip() + flag[0] + flag[1] + flag[2] + flag[3] + flag[4]  + flag[5] + '\n')
111 | 


--------------------------------------------------------------------------------
/bin/Results_Filtered2Final.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import division
 3 | import sys
 4 | import math
 5 | import os
 6 | import numpy
 7 | 
 8 | # ***** readme *****
 9 | # 这个代码从结果文件AllResults_filtered.txt中提取出概率低、p值低的出来
10 | # 并且做出筛选
11 | 
12 | resultfile = open(sys.argv[1])
13 | Pvaluethres = float(sys.argv[2])
14 | probthres = float(sys.argv[3])
15 | totalcellnum = min(int(sys.argv[4]), 500)
16 | 
17 | 
18 | resultfilelines = resultfile.readlines()
19 | goodpv = []
20 | badpv = []
21 | FDRDict = {}
22 | setpcutoff = False
23 | threspv = 20
24 | setthrespv = False
25 | for line in resultfilelines:
26 |     if line.startswith('Fusion'):
27 |         continue
28 |     info = line.rstrip().split('\t')
29 |     if (int(info[2]) / int(info[1])) < 1.25 or int(info[1]) < 5:
30 |         badpv.append(float(info[7]))
31 |     else:
32 |         goodpv.append(float(info[7]))
33 | 
34 | for l in range(1000):
35 |     pcutoff = pow(10, -l - 0.5)
36 |     smallgoodpv = 0
37 |     smallbadpv = 0
38 |     for p in goodpv:
39 |         if p < pcutoff:
40 |             smallgoodpv += 1
41 |     for p in badpv:
42 |         if p < pcutoff:
43 |             smallbadpv += 1
44 |     aaa = smallbadpv / max(1, len(badpv)) * (len(goodpv) + len(badpv)) / max(1, smallgoodpv + smallbadpv)
45 |     #sys.stderr.write(str(aaa) + '\n')
46 |     if aaa < Pvaluethres and not setpcutoff:
47 |         threspv = l + 0.5
48 |         FDRDict[l] = aaa
49 |         FDRDict[-1] = aaa
50 |         setpcutoff = True
51 |         continue
52 |     if setpcutoff:
53 |         FDRDict[l] = min(aaa, FDRDict[-1])
54 |         FDRDict[-1] = FDRDict[l]
55 |     
56 | 
57 | 
58 | if len(sys.argv) > 5:
59 |     threspv = float(sys.argv[5])
60 |     for i in range(1000):
61 |         FDRDict[i] = pow(10, -i)
62 | 
63 | 
64 | for line in resultfilelines:
65 |     if line.startswith('Fusion'):
66 |         continue
67 |     info = line.rstrip().split('\t')
68 |     if float(info[6]) <= probthres and float(info[7]) <= pow(10, -threspv):
69 |         if float(info[7]) > 0:
70 |             fdrlevel = FDRDict[math.floor(-math.log10(float(info[7])) - 0.5)]
71 |         else:
72 |             fdrlevel = FDRDict[-1]
73 |         info.insert(8, str(fdrlevel))
74 |         sepe = '\t'
75 |         print(sepe.join(info))
76 | 


--------------------------------------------------------------------------------
/bin/RmLowMappibility_ChimericRead.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import sys
  3 | import codecs
  4 | 
  5 | 
  6 | # ***** readme *****
  7 | # This code remove the low mappability records of Chimeric sam by star
  8 | # if the mappability < 1, delete the record locating at this position
  9 | 
 10 | ChimericSamFile = codecs.open(sys.argv[1], 'r',  encoding='utf-8', errors='ignore')
 11 | OutputFile = open(sys.argv[2], 'w')
 12 | MappabilityFile = open(sys.argv[3])
 13 | thres = float(sys.argv[4])
 14 | Mappability = {}
 15 | chrflag = []
 16 | keys = {}
 17 | for line in MappabilityFile.readlines():
 18 |     if line[0] == '#' or len(line) < 3:
 19 |         continue
 20 |     info = line.split('\t')
 21 |     if float(info[3].replace('\n', '').replace('\r', '')) < thres:
 22 |         continue
 23 |     if info[0] not in chrflag:
 24 |         chrflag.append(info[0])
 25 |         Mappability[info[0]] = {}
 26 |     Mappability[info[0]][int(info[1])] = int(info[2])
 27 | for i in Mappability:
 28 |     keys[i] = list(Mappability[i].keys())
 29 |     keys[i].sort()
 30 | lastname = ''
 31 | linestore = []
 32 | flag = 0
 33 | aaaaa = 0
 34 | linecount = 0
 35 | lines = ChimericSamFile.readlines()
 36 | for line in lines:
 37 |     info = line.split('\t')
 38 |     if len(info) < 5:
 39 |         continue
 40 |     if len(line) > 10:
 41 |         if line[0] == '@':
 42 |             continue
 43 |         if info[2].find('M') > -1:
 44 |             continue
 45 |     if info[0] != lastname:
 46 |         bad = 0
 47 |         poss = []
 48 |         for item in linestore:
 49 |             info = item.split('\t')
 50 |             '''
 51 |             if info[5].find('N') > -1:
 52 |                 bad = 1
 53 |                 break
 54 |             '''
 55 |             poss.append(info[3])
 56 |         if len(poss) <= 1 or (len(poss) == 2 and abs(int(poss[0])-int(poss[1])) <= 10):
 57 |             bad = 1
 58 |         if bad == 0:
 59 |             mapflag = 1
 60 |             index = 0
 61 |             for subline in linestore:
 62 |                 if (index == 1 and poss[0] == poss[1]) or (index == 2 and (poss[2] == poss[1] or poss[2] == poss[0])):
 63 |                     index += 1
 64 |                     continue
 65 |                 index += 1
 66 |                 info = subline.split('\t')
 67 |                 if not info[2].startswith('chr'):
 68 |                     info[2] = 'chr' + info[2]
 69 |                 findstart = 0
 70 |                 if info[2] in keys:
 71 |                     findend = len(keys[info[2]]) - 1
 72 |                     if keys[info[2]][findstart] > int(info[3]):
 73 |                         mapflag = 0
 74 |                         break
 75 |                     if Mappability[info[2]][keys[info[2]][findend]] <= int(info[3]):
 76 |                         mapflag = 0
 77 |                         break
 78 |                     if keys[info[2]][findstart] <= int(info[3]) <= Mappability[info[2]][keys[info[2]][findstart]]:
 79 |                         continue
 80 |                     if keys[info[2]][findend] <= int(info[3]) <= Mappability[info[2]][keys[info[2]][findend]]:
 81 |                         continue
 82 |                     whilecount = 0
 83 |                     while True:
 84 |                         whilecount += 1
 85 |                         if findend - findstart <= 1:
 86 |                             mapflag = 0
 87 |                             break
 88 |                         mid = int((findend - findstart) * 0.618) + findstart
 89 |                         if keys[info[2]][mid] > int(info[3]):
 90 |                             findend = mid
 91 |                         elif keys[info[2]][mid] <= int(info[3]):
 92 |                             if Mappability[info[2]][keys[info[2]][mid]] <= int(info[3]):
 93 |                                 findstart = mid
 94 |                             else:
 95 |                                 ppp = mid
 96 |                                 break
 97 |             if mapflag == 1:
 98 |                 for subline in linestore:
 99 |                     flag += 1
100 |                     try:
101 |                         sep = '\t'
102 |                         subinfo = subline.split('\t')
103 |                         if not subinfo[2].startswith('chr'):
104 |                             subinfo[2] = 'chr' + subinfo[2]
105 |                         OutputFile.write(sep.join(subinfo))
106 |                     except:
107 |                         print(subline)
108 |         info = line.split('\t')
109 |         lastname = info[0]
110 |         linestore = [line]
111 |     else:
112 |         linestore.append(line)
113 |     if line == '':
114 |         break
115 | ChimericSamFile.close()
116 | MappabilityFile.close()
117 | OutputFile.close()
118 | 


--------------------------------------------------------------------------------
/bin/RmLowMappibility_ChimericRead_NoFilter.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import sys
 3 | import codecs
 4 | 
 5 | 
 6 | # ***** readme *****
 7 | # This code remove the low mappability records of Chimeric sam by star
 8 | # if the mappability < 1, delete the record locating at this position
 9 | 
10 | ChimericSamFile = codecs.open(sys.argv[1], 'r',  encoding='utf-8', errors='ignore')
11 | OutputFile = open(sys.argv[2], 'w')
12 | chrflag = []
13 | lastname = ''
14 | linestore = []
15 | flag = 0
16 | aaaaa = 0
17 | linecount = 0
18 | lines = ChimericSamFile.readlines()
19 | for line in lines:
20 |     info = line.split('\t')
21 |     if len(info) < 5:
22 |         continue
23 |     if len(line) > 10:
24 |         if line[0] == '@':
25 |             continue
26 |         if info[2].find('M') > -1:
27 |             continue
28 |     if info[0] != lastname:
29 |         bad = 0
30 |         poss = []
31 |         for item in linestore:
32 |             info = item.split('\t')
33 |             '''
34 |             if info[5].find('N') > -1:
35 |                 bad = 1
36 |                 break
37 |             '''
38 |             poss.append(info[3])
39 |         if len(poss) <= 1 or (len(poss) == 2 and abs(int(poss[0])-int(poss[1])) <= 10):
40 |             continue
41 |         for subline in linestore:
42 |             try:
43 |                 sep = '\t'
44 |                 subinfo = subline.split('\t')
45 |                 if not subinfo[2].startswith('chr'):
46 |                     subinfo[2] = 'chr' + subinfo[2]
47 |                 OutputFile.write(sep.join(subinfo))
48 |             except:
49 |                 print(subline)
50 |         info = line.split('\t')
51 |         lastname = info[0]
52 |         linestore = [line]
53 |     else:
54 |         linestore.append(line)
55 |     if line == '':
56 |         break
57 | ChimericSamFile.close()
58 | OutputFile.close()
59 | 


--------------------------------------------------------------------------------
/bin/StarMapping_Chimeric.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | filedir=$1
 3 | mystart=$2
 4 | myend=$3
 5 | outdir=$4
 6 | genomedir=$5
 7 | ncore=$6
 8 | for ((i=${mystart};i<=${myend};i++))
 9 | do
10 | 	if [ -f ${filedir}/${i}_1.fastq ];then
11 | 		mkdir -p ${outdir}/${i}
12 | 		STAR --runThreadN ${ncore} --genomeDir ${genomedir} --readFilesIn ${filedir}/${i}_1.fastq ${filedir}/${i}_2.fastq --outSAMtype BAM SortedByCoordinate --chimOutType SeparateSAMold --outSAMunmapped Within KeepPairs --quantMode GeneCounts --outFileNamePrefix ${outdir}/${i}/human --chimSegmentMin 12 --chimJunctionOverhangMin 8 --alignSJDBoverhangMin 10 --alignMatesGapMax 100000 --alignIntronMax 100000 --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --alignSplicedMateMapLminOverLmate 0 --alignSplicedMateMapLmin 30 --chimMultimapScoreRange 3 --chimScoreJunctionNonGTAG -4 --chimNonchimScoreDropMin 10 --peOverlapMMp 0.1
13 | 	fi
14 | done
15 | 


--------------------------------------------------------------------------------
/bin/TidyupFusionFinalResult.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import division
 3 | import sys
 4 | import os.path
 5 | 
 6 | 
 7 | # ***** readme *****
 8 | # This code removes the duplicates and add supporting info.
 9 | 
def CheckGoodGene(gene, pos, thiscount):
    """Decide whether a breakpoint of ``gene`` at ``pos`` should be reported.

    Reads the module-level ``FilteringSwitch`` and ``FusionGeneRecord``
    (``gene -> [[anchor position, summed read count, number of rows], ...]``).

    Parameters:
        gene: gene name, must be a key of ``FusionGeneRecord`` when filtering
            is enabled.
        pos: breakpoint position of the row being checked.
        thiscount: read count of the row being checked.

    Returns:
        True when filtering is disabled, or when the breakpoint belongs to a
        recorded cluster and is not a minor breakpoint (the row holds less
        than 40% of the gene's reads while its cluster has at least 3 rows).
        False otherwise.
    """
    if not FilteringSwitch:
        return True
    clusters = FusionGeneRecord[gene]
    total = 0
    for cluster in clusters:
        total += cluster[1]
    for cluster in clusters:
        # Same tolerance (<= 5) that is used when the clusters are built;
        # with the former `< 5` a row merged at distance 5 could never find
        # its own cluster and was always rejected.
        if abs(cluster[0] - pos) <= 5:
            # total == 0 means no reads were recorded for the gene at all, so
            # the share test is skipped instead of dividing by zero.
            if total > 0 and thiscount / total < 0.4 and cluster[2] >= 3:
                return False
            return True
    return False
22 | 
23 | 
def _record_breakpoint(gene, pos, reads):
    """Add one breakpoint observation of ``gene`` to ``FusionGeneRecord``.

    The observation is merged into the first cluster whose anchor position is
    within 5 bp; otherwise it starts a new cluster ``[pos, reads, 1]``.
    """
    clusters = FusionGeneRecord.setdefault(gene, [])
    for cluster in clusters:
        if abs(cluster[0] - pos) <= 5:
            cluster[1] += reads
            cluster[2] += 1
            return
    clusters.append([pos, reads, 1])


# Any extra command line argument switches the breakpoint filter off.
FilteringSwitch = True
if len(sys.argv) > 2:
    FilteringSwitch = False
FusionGeneRecord = {}
uselines = []

# Pass 1: echo the header lines and collect the breakpoint clusters per gene.
with open(sys.argv[1]) as ResultFile:
    for line in ResultFile:
        if not line.strip():
            # Blank lines carry no record; indexing them used to crash.
            continue
        if line[0] == '#' or line.startswith('FusionName'):
            # rstrip('\n') instead of [:-1]: a final line without a newline
            # must not lose its last character.
            print(line.rstrip('\n') + '\tSupportingCells')
            continue
        info = line.split('\t')
        pos1 = int(info[4].split(':')[1])
        pos2 = int(info[5].split(':')[1].rstrip('\n'))
        gene = info[0].split('--')
        gene1 = gene[0]
        gene2 = gene[1]
        uselines.append(line)
        _record_breakpoint(gene1, pos1, int(info[2]))
        _record_breakpoint(gene2, pos2, int(info[2]))

# Pass 2: print the rows whose two breakpoints pass CheckGoodGene and whose
# gene names differ in the part before the first '-'.
for line in uselines:
    info = line.split('\t')
    gene = info[0].split('--')
    gene1 = gene[0]
    gene2 = gene[1]
    gene1main = gene1.split('-')[0]
    gene2main = gene2.split('-')[0]
    pos1 = int(info[4].split(':')[1])
    pos2 = int(info[5].split(':')[1].rstrip('\n'))
    if CheckGoodGene(gene1, pos1, int(info[2])) and CheckGoodGene(gene2, pos2, int(info[2])) \
            and gene1main != gene2main:
        print(line.rstrip('\n'))
92 | 


--------------------------------------------------------------------------------
/bin/TidyupFusionFinalResult_FindSupCell.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | from __future__ import division
  3 | import sys
  4 | import os.path
  5 | 
  6 | 
  7 | # ***** readme *****
  8 | # This code removes the duplicates and add supporting info.
  9 | 
 10 | def CheckGoodGene(gene, pos):
 11 |     if not FilteringSwitch:
 12 |         return True
 13 |     sum = 0
 14 |     for item in FusionGeneRecord[gene]:
 15 |         sum += item[1]
 16 |     for item in FusionGeneRecord[gene]:
 17 |         if abs(item[0] - pos) < 5:
 18 |             if item[1] / sum < 0.4 and item[2] > 3:
 19 |                 return False
 20 |             return True
 21 |     return False
 22 | 
 23 | FilteringSwitch = True
 24 | FindSupCell = True
 25 | ResultFile = open(sys.argv[1])
 26 | if len(sys.argv) > 3:
 27 |     FilteringSwitch = False
 28 | FileDir = sys.argv[2]
 29 | PosRecord = {}
 30 | FusionGeneRecord = {}
 31 | uselines = []
 32 | for line in ResultFile.readlines():
 33 |     if line[0] == '#':
 34 |         print(line[:-1] + '\tSupportingCells')
 35 |         continue
 36 |     if line.startswith('FusionName'):
 37 |         print(line[:-1] + '\tSupportingCells')
 38 |         continue
 39 |     info = line.split('\t')
 40 |     pos1 = int(info[4].split(':')[1])
 41 |     pos2 = int(info[5].split(':')[1].rstrip('\n'))
 42 |     gene = info[0].split('--')
 43 |     gene1 = gene[0]
 44 |     gene2 = gene[1]
 45 |     '''
 46 |     if info[5] + '--' + info[4] in PosRecord or info[4] + '--' + info[5] in PosRecord:
 47 |         continue
 48 |     '''
 49 |     PosRecord[info[4] + '--' + info[5]] = 0
 50 |     uselines.append(line)
 51 |     if gene1 not in FusionGeneRecord:
 52 |         FusionGeneRecord[gene1] = [[pos1, int(info[2]), 1]]
 53 |     else:
 54 |         near = False
 55 |         for i in range(len(FusionGeneRecord[gene1])):
 56 |             if abs(FusionGeneRecord[gene1][i][0] - pos1) <= 5:
 57 |                 near = True
 58 |                 FusionGeneRecord[gene1][i][1] += int(info[2])
 59 |                 FusionGeneRecord[gene1][i][2] += 1
 60 |                 break
 61 |         if not near:
 62 |             FusionGeneRecord[gene1].append([pos1, int(info[2]), 1])
 63 |     if gene2 not in FusionGeneRecord:
 64 |         FusionGeneRecord[gene2] = [[pos2, int(info[2]), 1]]
 65 |     else:
 66 |         near = False
 67 |         for i in range(len(FusionGeneRecord[gene2])):
 68 |             if abs(FusionGeneRecord[gene2][i][0] - pos2) <= 5:
 69 |                 near = True
 70 |                 FusionGeneRecord[gene2][i][1] += int(info[2])
 71 |                 FusionGeneRecord[gene2][i][2] += 1
 72 |                 break
 73 |         if not near:
 74 |             FusionGeneRecord[gene2].append([pos2, int(info[2]), 1])
 75 | ResultFile.close()
 76 | 
 77 | 
 78 | FusionRecCells = {}
 79 | filecount = 0
 80 | filenames = os.listdir(FileDir)
 81 | for file in filenames:
 82 |     if file.endswith('_FusionSupport.txt'):
 83 |         thisfile = open(FileDir + file)
 84 |         filecount += 1
 85 |         cellindex = int(file.split('_')[0])
 86 |         FusionRecCells[cellindex] = {}
 87 |         for thisline in thisfile.readlines():
 88 |             info = thisline.split('\t')
 89 |             found = False
 90 |             if len(info) == 7:
 91 |                 readsup = info[6].rstrip(';')
 92 |                 splitinfo = readsup.split(';')
 93 |                 FusionRecCells[cellindex][info[0] + '--' + info[1]] = splitinfo
 94 |             elif len(info) == 6:
 95 |                 FusionRecCells[cellindex][info[0] + '--' + info[1]] = []
 96 |         thisfile.close()
 97 | 
 98 | 
 99 | 
100 | for line in uselines:
101 |     if line[0] == '#':
102 |         continue
103 |     if line.startswith('FusionName'):
104 |         continue
105 |     info = line.split('\t')
106 |     gene = info[0].split('--')
107 |     gene1 = gene[0]
108 |     gene2 = gene[1]
109 |     pos1 = int(info[4].split(':')[1])
110 |     pos2 = int(info[5].split(':')[1].rstrip('\n'))
111 |     if CheckGoodGene(gene1, pos1) and CheckGoodGene(gene2, pos2):
112 |         print(line[:-1], end='\t')
113 |         cellname = []
114 |         for cell in FusionRecCells:
115 |             if gene1 + '--' + gene2 in FusionRecCells[cell]:
116 |                 key = gene1 + '--' + gene2
117 |             elif gene2 + '--' + gene1 in FusionRecCells[cell]:
118 |                 key = gene2 + '--' + gene1
119 |             else:
120 |                 continue
121 |             if FusionRecCells[cell][key] == []:
122 |                 cellname.append(cell)
123 |             else:
124 |                 for item in FusionRecCells[cell][key]:
125 |                     if len(item) <= 3:
126 |                         continue
127 |                     a = item.split('+')[0]
128 |                     pp = a.split(',')
129 |                     try:
130 |                         if abs(pos1 - int(pp[0])) < 10 and abs(pos2 - int(pp[1])) < 10 or abs(pos2 - int(pp[0])) < 10 and abs(pos1 - int(pp[1])) < 10:
131 |                             cellname.append(cell)
132 |                             break
133 |                     except:
134 |                         sys.stderr.write(str(pos1) + '\t' + str(pos2) + '\t' + item)
135 |         cellname = sorted(cellname)
136 |         if cellname == []:
137 |             print('NoSupCell', end='')
138 |         else:
139 |             for item in cellname:
140 |                 print(str(item), end=', ')
141 |         print('')
142 | ResultFile.close()
143 | 


--------------------------------------------------------------------------------
/bin/run_RmLowMap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | start=$1
 3 | end=$2
 4 | filedir=$3
 5 | outdir=$4
 6 | thres=$5
 7 | mappabilityfile=$6
 8 | for ((i=${start};i<=${end};i++))
 9 | do
10 | 	file=`ls ${filedir}/${i}/*Chimeric.out.sam`
11 | 	if [[ -n ${file} ]]; then
12 | 		python codes/RmLowMappibility_ChimericRead.py ${file} ${outdir}/${i}.sam ${mappabilityfile} ${thres}
13 | 	fi
14 | done


--------------------------------------------------------------------------------
/data/hg19mappability75.txt.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XiDsLab/scFusion/c5fc1bd43452d9f187e47242565e782e59c5afed/data/hg19mappability75.txt.zip


--------------------------------------------------------------------------------
/data/weight-V9-2.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XiDsLab/scFusion/c5fc1bd43452d9f187e47242565e782e59c5afed/data/weight-V9-2.hdf5


--------------------------------------------------------------------------------
/scFusion.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import argparse
  4 | import subprocess
  5 | 
  6 | lastd = sys.argv[0].rfind('/')
  7 | if lastd == -1:
  8 |     programdir = './'
  9 | else:
 10 |     programdir = sys.argv[0][:lastd] + '/'
 11 | parser = argparse.ArgumentParser(description='Single-cell Gene Fusion Detection (c) Zjie Jin', add_help=False)
 12 | subparsers = parser.add_subparsers(help='Select the function', required=True)
 13 | parser1 = subparsers.add_parser('BuildSTARIndex', help='Build the STAR Index', add_help=False)
 14 | group11 = parser1.add_argument_group('Required parameters')
 15 | group12 = parser1.add_argument_group('Optional parameters')
 16 | group11.add_argument("-s", "--STARIndex", help='The STAR index output folder.')
 17 | group11.add_argument("-g", "--Genome", help='The reference file (*.fasta or *.fa).')
 18 | group11.add_argument("-a", "--Annotation", help='The gtf annotation file (*.gtf).')
 19 | group12.add_argument("-t", "--Thread", default='8', help='Number of threads can be used, default is 8.')
 20 | parser2 = subparsers.add_parser('Rename', help='Rename files using consecutive numbers', add_help=False)
 21 | group2 = parser2.add_argument_group('Required parameters')
 22 | group2.add_argument("-f", "--FileDir", help='The folder of input data.')
 23 | parser3 = subparsers.add_parser('RestoreName', help='Restore file names.', add_help=False)
 24 | group3 = parser3.add_argument_group('Required parameters')
 25 | group3.add_argument("-f", "--FileDir", help='The folder of input data.')
 26 | parser4 = subparsers.add_parser('Index', help='Generate some necessary files.', add_help=False)
 27 | group4 = parser4.add_argument_group('Required parameters')
 28 | group4.add_argument("-d", "--GenomeDir", help='The path of folder saving the generated files [GENOMEDIR].')
 29 | group4.add_argument("-g", "--Genome", help='The reference file (*.fasta or *.fa).')
 30 | group4.add_argument("-a", "--Annotation", help='The gtf annotation file (*.gtf).')
 31 | parser5 = subparsers.add_parser('ReadMapping', help='Mapping pair-end reads using STAR.', add_help=False)
 32 | group51 = parser5.add_argument_group('Required parameters')
 33 | group52 = parser5.add_argument_group('Optional parameters')
 34 | group51.add_argument("-f", "--FileDir", help='The folder of input data.')
 35 | group51.add_argument("-b", "--Begin", help='The first index of input files.')
 36 | group51.add_argument("-e", "--End", help='The last index of input files.')
 37 | group51.add_argument("-s", "--STARIndex", help='The STAR index folder.')
 38 | group51.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
 39 | group52.add_argument("-t", "--Thread", default='8', help='Number of threads can be used, default is 8')
 40 | parser6 = subparsers.add_parser('ReadProcessing', help='Process the chimeric reads.', add_help=False)
 41 | group61 = parser6.add_argument_group('Required parameters')
 42 | group62 = parser6.add_argument_group('Optional parameters')
 43 | group61.add_argument("-d", "--GenomeDir", help='The path of GENOMEDIR.')
 44 | group61.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
 45 | mappabilitygroup = group62.add_mutually_exclusive_group()
 46 | mappabilitygroup.add_argument("-m", "--Mappability", default=programdir + '/data/hg19mappability75.txt',
 47 |                               help='The mappability file, default is hg19mappability75.txt in the data folder.')
 48 | mappabilitygroup.add_argument("-M", "--NoMappabilityFilter", help='Keep all reads and do not apply mappability filter.',
 49 |                               action='store_true')
 50 | group61.add_argument("-b", "--Begin", help='The first index of input files.')
 51 | group61.add_argument("-e", "--End", help='The last index of input files.')
 52 | parser7 = subparsers.add_parser('FusionCandidate', help='Identify the fusion candidates', add_help=False)
 53 | group71 = parser7.add_argument_group('Required parameters')
 54 | group72 = parser7.add_argument_group('Optional parameters')
 55 | group71.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
 56 | group71.add_argument("-b", "--Begin", help='The first index of input files.')
 57 | group71.add_argument("-e", "--End", help='The last index of input files.')
 58 | group71.add_argument("-d", "--GenomeDir", help='The path of GENOMEDIR.')
 59 | group72.add_argument("-p", "--Prefix", default='.',
 60 |                      help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
 61 | parser8 = subparsers.add_parser('Retrain', help='Retrain the network using current data.', add_help=False)
 62 | group81 = parser8.add_argument_group('Required parameters')
 63 | group82 = parser8.add_argument_group('Optional parameters')
 64 | group81.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
 65 | group82.add_argument("-p", "--Prefix", default='.',
 66 |                      help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
 67 | group82.add_argument("-w", "--Weight", default=programdir + '/data/weight-V9-2.hdf5',
 68 |                      help='The initial weight file of the deep-learning network, default is "weight-V9-2.hdf5" in the data folder.')
 69 | group82.add_argument("-c", "--Epoch", default='10',
 70 |                      help='The number of epochs in the retraining step, default is 10')
 71 | parser9 = subparsers.add_parser('ArtifactScoring', help='Find the artifacts', add_help=False)
 72 | group91 = parser9.add_argument_group('Required parameters')
 73 | group92 = parser9.add_argument_group('Optional parameters')
 74 | group91.add_argument("-d", "--GenomeDir", help='The path of GENOMEDIR.')
 75 | group91.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
 76 | group91.add_argument("-b", "--Begin", help='The first index of input files.')
 77 | group91.add_argument("-e", "--End", help='The last index of input files.')
 78 | group92.add_argument("-p", "--Prefix", default='.',
 79 |                      help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
 80 | group92.add_argument("-w", "--Weight", default=programdir + '/data/weight-V9-2.hdf5',
 81 |                      help='The weight file used in the deep-learning network, default is "weight-V9-2.hdf5" in the data folder.')
 82 | parser10 = subparsers.add_parser('FusionReport', help='Report gene fusions.', add_help=False)
 83 | group101 = parser10.add_argument_group('Required parameters')
 84 | group102 = parser10.add_argument_group('Optional parameters')
 85 | group101.add_argument("-f", "--FileDir", help='The folder of input data.')
 86 | group101.add_argument("-d", "--GenomeDir", help='The path of GENOMEDIR.')
 87 | group101.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
 88 | group101.add_argument("-b", "--Begin", help='The first index of input files.')
 89 | group101.add_argument("-e", "--End", help='The last index of input files.')
 90 | group102.add_argument("-p", "--Prefix", default='.',
 91 |                       help='The prefix of result file, default is blank. This should be specified if users want to compare the results of different settings.')
 92 | group102.add_argument("-v", "--PvalueCutoff", default='0.05', help='Pvalue(FDR) cutoff of the statistical model, default is 0.05.')
 93 | group102.add_argument("-n", "--ArtifactScoreCutoff", default='0.75', help='Artifact score cutoff, default is 0.75')
 94 | group102.add_argument("--LncRNAFilterOff", help='turn off the LncRNA filter', action='store_true')
 95 | group102.add_argument("--NoApprovedSymbolFilterOff", help='turn off the no-approved-symbol filter', action='store_true')
 96 | parser11 = subparsers.add_parser('DeleteTempFiles', help='Delete temporary files generated by scFusion. ', add_help=False)
 97 | group111 = parser11.add_argument_group('Required parameters')
 98 | group112 = parser11.add_argument_group('Optional parameters')
 99 | group112.add_argument("--AllTempFiles", help='Delete all temporary files generated by scFusion and STAR.', action='store_true')
100 | group112.add_argument("--AllUnimportantTempFiles", help='Delete all unimportant temporary files generated by scFusion.', action='store_true')
101 | group112.add_argument("--STARMappingFiles", help='Delete STAR mapping results.', action='store_true')
102 | group111.add_argument("-o", "--OutDir", help='The output folder of the results and temporal files.')
103 | 
104 | 
# Called without any argument there is nothing to parse: show the top-level
# help and stop.  sys.exit() replaces the bare exit() helper because exit() is
# only injected by the `site` module for interactive use and is not available
# when Python is started with -S or from a frozen build.  The exit status
# stays 0, as before.
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit()

# Parse the command line.  The sub-command name in sys.argv[1] selects which
# of the handlers further down the script is executed.
args = parser.parse_args()
110 | 
if sys.argv[1] == 'Rename':
    # 'Rename' sub-command: runs bin/RenameFastqFiles.py on the input folder.
    # According to the success message below, the helper renames the files to
    # consecutive numbers and records the mapping in RenameList.txt.
    #
    # Exit status: 0 on success, 1 when a required option is missing or the
    # helper script fails (previously the script always exited with 0, so a
    # calling pipeline could not detect the failure).
    import shlex  # local import: only needed to quote user-supplied paths

    if not args.FileDir:
        parser2.print_help()
        print('Please specify all required parameters!')
        sys.exit(1)
    # Quote both paths so folders containing spaces or shell metacharacters
    # reach the helper script as a single, unmodified argument.
    aa = os.system('python ' + shlex.quote(programdir + '/bin/RenameFastqFiles.py') + ' ' +
                   shlex.quote(args.FileDir))
    if aa == 0:
        print('Successfully rename files to consecutive numbers!')
        print('Please keep RenameList.txt carefully, or the file names can not be restored.')
    else:
        print('ERROR!!!!!')
        sys.exit(1)
122 | 
if sys.argv[1] == 'RestoreName':
    # 'RestoreName' sub-command: runs bin/RenameFastqFiles.py with the input
    # folder and the RenameList.txt stored inside that folder as arguments.
    #
    # Exit status: 0 on success, 1 when a required option is missing or the
    # helper script fails (previously the script always exited with 0, so a
    # calling pipeline could not detect the failure).
    import shlex  # local import: only needed to quote user-supplied paths

    if not args.FileDir:
        parser3.print_help()
        print('Please specify all required parameters!')
        sys.exit(1)
    # Quote every path so folders containing spaces or shell metacharacters
    # reach the helper script as single, unmodified arguments.
    aa = os.system('python ' + shlex.quote(programdir + '/bin/RenameFastqFiles.py') + ' ' +
                   shlex.quote(args.FileDir) + ' ' +
                   shlex.quote(args.FileDir + '/RenameList.txt'))
    if aa == 0:
        print('Successfully restore file names!')
    else:
        print('ERROR!!!!!')
        sys.exit(1)
134 | 
if sys.argv[1] == 'BuildSTARIndex':
    # 'BuildSTARIndex' sub-command: calls `STAR --runMode genomeGenerate` with
    # the index folder, thread count, genome FASTA and GTF annotation given on
    # the command line.
    #
    # The exit status of STAR used to be discarded, so a failed index build
    # was silent.  It is now checked and reported like the other sub-commands:
    # exit status 0 on success, 1 on a missing option or a STAR failure.
    import shlex  # local import: only needed to quote user-supplied values

    if not args.STARIndex or not args.Genome or not args.Annotation:
        parser1.print_help()
        print('Please specify all required parameters!')
        sys.exit(1)
    # Quote every value so paths containing spaces or shell metacharacters
    # are passed to STAR as single arguments.
    aa = os.system(' '.join([
        'STAR --runMode genomeGenerate',
        '--genomeDir', shlex.quote(args.STARIndex),
        '--runThreadN', shlex.quote(args.Thread),
        '--genomeFastaFiles', shlex.quote(args.Genome),
        '--sjdbGTFfile', shlex.quote(args.Annotation),
    ]))
    if aa == 0:
        print('Finish building STAR index!')
    else:
        print('ERROR!!!!!')
        sys.exit(1)
143 | 
if sys.argv[1] == 'Index':
    # 'Index' sub-command: populate GenomeDir with the reference files.
    # All steps are chained with '&&' into one shell command, so the first
    # failing step stops the chain and makes the whole command fail.
    if not (args.GenomeDir and args.Genome and args.Annotation):
        parser4.print_help()
        print('Please specify all required parameters!')
        exit()

    ref_dir = args.GenomeDir
    bin_dir = programdir + '/bin/'
    gtf_copy = ref_dir + '/ref_annot.gtf'
    gtf_added = ref_dir + '/ref_annot.gtf.added'

    pipeline = [
        # create the folder and copy genome and annotation under fixed names
        'mkdir -p ' + ref_dir,
        'cp ' + args.Genome + ' ' + ref_dir + '/ref.fa',
        'cp ' + args.Annotation + ' ' + gtf_copy,
        # derive the files written next to them by the helper scripts
        'python ' + bin_dir + 'Addchr2gtf.py ' + gtf_copy + ' > ' + gtf_added,
        'python ' + bin_dir + 'GetExonPos.py ' + gtf_added + ' > ' + ref_dir + '/exon_probe.hg19.gene.new.bed',
        'python ' + bin_dir + 'GetGenePos.py ' + gtf_added + ' > ' + ref_dir + '/GenePos.txt',
        # register the processed annotation with pyensembl
        'pyensembl install --reference-name GRCH37 --annotation-name my_genome_features --gtf ' + gtf_added,
    ]
    status = os.system(' && '.join(pipeline))
    print('Finish Indexing!' if status == 0 else 'ERROR!!!!!')
161 | 
if sys.argv[1] == 'ReadMapping':
    # 'ReadMapping' sub-command: runs bin/StarMapping_Chimeric.sh for the
    # input files with indices Begin..End, writing into <OutDir>/STARMapping/.
    #
    # Fix: the mapping folder used to be built as OutDir + 'STARMapping/'
    # without a path separator, so an OutDir given without a trailing '/'
    # (e.g. "/data/out") produced "/data/outSTARMapping/".  That is not the
    # folder the 'DeleteTempFiles' sub-command removes (OutDir + '/STARMapping/').
    # The path is now joined properly.
    #
    # Exit status: 0 on success, 1 when a required option is missing or the
    # mapping script fails (previously always 0).
    import shlex  # local import: only needed to quote user-supplied values

    if not args.FileDir or not args.OutDir or not args.Begin or not args.End or not args.STARIndex:
        parser5.print_help()
        print('Please specify all required parameters!')
        sys.exit(1)
    # Keep the trailing '/' that the script has always been given.
    star_out = os.path.join(args.OutDir, 'STARMapping') + '/'
    # Quote every value so paths containing spaces or shell metacharacters
    # reach the script as single arguments.
    aa = os.system(' '.join([
        'sh', shlex.quote(programdir + '/bin/StarMapping_Chimeric.sh'),
        shlex.quote(args.FileDir), shlex.quote(args.Begin), shlex.quote(args.End),
        shlex.quote(star_out), shlex.quote(args.STARIndex), shlex.quote(args.Thread),
    ]))
    if aa == 0:
        print('Finish mapping! Index: ' + args.Begin + ' ~ ' + args.End)
    else:
        print('ERROR!!!!!')
        sys.exit(1)
174 | 
if sys.argv[1] == 'ReadProcessing':
    # 'ReadProcessing' sub-command: runs one of the two
    # CombinePipeline_before_FS shell scripts for the indices Begin..End.
    # A mappability file is only required when the mappability filter has not
    # been switched off.
    have_everything = (args.OutDir and args.Begin and args.End and args.GenomeDir
                       and (args.NoMappabilityFilter or args.Mappability))
    if not have_everything:
        parser6.print_help()
        print('Please specify all required parameters!')
        exit()

    bin_dir = programdir + '/bin/'
    annotation = args.GenomeDir + '/ref_annot.gtf.added'
    exon_bed = args.GenomeDir + '/exon_probe.hg19.gene.new.bed'
    if args.NoMappabilityFilter:
        # variant of the pipeline that takes no mappability file
        words = ['sh', bin_dir + 'CombinePipeline_before_FS_NoMappa.sh',
                 args.OutDir, args.Begin, args.End, annotation, exon_bed, bin_dir]
    else:
        # regular pipeline: the mappability file goes between annotation and exon BED
        words = ['sh', bin_dir + 'CombinePipeline_before_FS.sh',
                 args.OutDir, args.Begin, args.End, annotation, args.Mappability, exon_bed, bin_dir]
    status = os.system(' '.join(words))

    if status != 0:
        print('ERROR!!!!!')
    else:
        print('Finish Read Processing! Index: ' + args.Begin + ' ~ ' + args.End)
192 | 
if sys.argv[1] == 'FusionCandidate':
    # 'FusionCandidate' sub-command: runs bin/CombinePipeline_startwith_FS.sh
    # with the output folder, index range, result prefix, reference FASTA and
    # processed annotation.
    required = (args.OutDir, args.Begin, args.End, args.GenomeDir)
    if not all(required):
        parser7.print_help()
        print('Please specify all required parameters!')
        exit()

    bin_dir = programdir + '/bin/'
    words = [
        'sh', bin_dir + 'CombinePipeline_startwith_FS.sh',
        args.OutDir, args.Begin, args.End, args.Prefix,
        args.GenomeDir + '/ref.fa',
        args.GenomeDir + '/ref_annot.gtf.added',
        bin_dir,
    ]
    status = os.system(' '.join(words))
    print('Finish identifying fusion candidates' if status == 0 else 'ERROR!!!!!')
204 | 
if sys.argv[1] == 'Retrain':
    # 'Retrain' sub-command: runs bin/CombinePipeline_Retrain.sh with the
    # output folder, result prefix, starting weight file and epoch count.
    if args.OutDir:
        bin_dir = programdir + '/bin/'
        words = ['sh', bin_dir + 'CombinePipeline_Retrain.sh',
                 args.OutDir, args.Prefix, args.Weight, args.Epoch, bin_dir]
        status = os.system(' '.join(words))
    else:
        # the output folder is the only required option
        parser8.print_help()
        print('Please specify all required parameters!')
        exit()

    if status != 0:
        print('ERROR!!!!!')
    else:
        print('Finish Retraining!')
        print('New weight file was saved at ' + args.OutDir + '/weights/RetrainWeight.hdf5')
217 | 
if sys.argv[1] == 'ArtifactScoring':
    # 'ArtifactScoring' sub-command: runs bin/CombinePipeline_Predict.sh with
    # the output folder, index range, result prefix, weight file, reference
    # FASTA and processed annotation.
    if not (args.OutDir and args.Begin and args.End and args.GenomeDir):
        parser9.print_help()
        print('Please specify all required parameters!')
        exit()

    bin_dir = programdir + '/bin/'
    reference = args.GenomeDir + '/ref.fa'
    annotation = args.GenomeDir + '/ref_annot.gtf.added'
    command = ' '.join(['sh', bin_dir + 'CombinePipeline_Predict.sh',
                        args.OutDir, args.Begin, args.End, args.Prefix,
                        args.Weight, reference, annotation, bin_dir])
    status = os.system(command)
    if status != 0:
        print('ERROR!!!!!')
    else:
        print('Finish ArtifactScoring Step!')
230 | 
if sys.argv[1] == 'FusionReport':
    # 'FusionReport' sub-command: counts the input cells, runs
    # bin/CombinePipeline_startwith_ChiDist.sh and copies the final tables
    # from <OutDir>/FinalResult/ into <OutDir>/.
    #
    # Fixes compared with the previous version:
    #   * FASTQ paths were built as FileDir + '<i>_2.fastq' without a path
    #     separator, so no cell was counted (numcell == 0) unless FileDir ended
    #     with '/'.  The paths are now joined with os.path.join.
    #   * Paths are shell-quoted, so folders with spaces work and an empty
    #     prefix is still passed as an argument of its own.
    #   * A failing pipeline now ends the script with exit status 1
    #     (previously always 0); a missing required option does the same.
    #   * args.Prefix is no longer overwritten; a local copy is used instead.
    import shlex  # local import: only needed to quote user-supplied values

    if not (args.FileDir and args.OutDir and args.Begin and args.End and args.GenomeDir):
        parser10.print_help()
        print('Please specify all required parameters!')
        sys.exit(1)

    # A cell is counted when its second-mate FASTQ exists, plain or gzipped.
    numcell = 0
    for i in range(int(args.Begin), int(args.End) + 1):
        fastq = os.path.join(args.FileDir, str(i) + '_2.fastq')
        if os.path.exists(fastq) or os.path.exists(fastq + '.gz'):
            numcell += 1

    # The shell script receives '1' when a filter is switched OFF, '0' otherwise.
    lncfilter = '1' if args.LncRNAFilterOff else '0'
    nasfilter = '1' if args.NoApprovedSymbolFilterOff else '0'

    aa = os.system(' '.join([
        'sh', shlex.quote(programdir + '/bin/CombinePipeline_startwith_ChiDist.sh'),
        shlex.quote(args.OutDir), shlex.quote(args.Prefix), str(numcell),
        shlex.quote(args.PvalueCutoff), shlex.quote(args.ArtifactScoreCutoff),
        shlex.quote(args.GenomeDir + '/ref_annot.gtf.added'),
        shlex.quote(args.GenomeDir + '/GenePos.txt'),
        shlex.quote(programdir + '/bin/'),
        lncfilter, nasfilter,
    ]))
    if aa != 0:
        print('ERROR!!!!!')
        sys.exit(1)

    # '.' is the placeholder default of --Prefix and stands for "no prefix".
    prefix = '' if args.Prefix == '.' else args.Prefix
    result_dir = args.OutDir + '/FinalResult/'
    abridged = args.OutDir + '/' + prefix + 'Result.abridged.txt'
    full = args.OutDir + '/' + prefix + 'Result.full.txt'
    # Copy both final tables next to the other outputs ...
    os.system('cat ' + shlex.quote(result_dir + prefix + 'FinalOutput.abridged.txt') + ' > ' + shlex.quote(abridged))
    os.system('cat ' + shlex.quote(result_dir + prefix + 'FinalOutput.full.txt') + ' > ' + shlex.quote(full))
    # ... then move the working folder to Resulttemp/, the name that the
    # 'DeleteTempFiles' sub-command cleans up.
    os.system('mv ' + shlex.quote(result_dir) + ' ' + shlex.quote(args.OutDir + '/Resulttemp/'))
    print('Final Results are in ' + abridged + ' and ' + full)
262 | 
if sys.argv[1] == 'DeleteTempFiles':
    # 'DeleteTempFiles' sub-command: removes temporary folders and files below
    # OutDir.
    #   --AllTempFiles            removes every temporary folder, including
    #                             STARMapping/, Resulttemp/ and ChiDist/
    #   --STARMappingFiles        removes STARMapping/ only
    #   --AllUnimportantTempFiles removes weights/, Retrain/, Expr/,
    #                             ChimericOut/, the '*.*' entries of
    #                             Resulttemp/ and selected files of ChiDist/
    # --AllTempFiles takes precedence over the other two switches.
    #
    # The deletions used to be issued as `rm -r <OutDir>/...` through the
    # shell with an unquoted OutDir: a folder name containing a space made rm
    # act on unrelated paths, and every missing target produced an rm error.
    # They are now performed with shutil/glob; targets that do not exist are
    # skipped silently.  A missing --OutDir ends the script with status 1.
    import glob    # local imports: only this sub-command needs them
    import shutil

    if not args.OutDir:
        parser11.print_help()
        print('Please specify all required parameters!')
        sys.exit(1)

    def _remove_matching(pattern):
        """Delete every file or directory tree matching the glob ``pattern``."""
        for path in glob.glob(pattern):
            try:
                if os.path.isdir(path) and not os.path.islink(path):
                    shutil.rmtree(path)
                else:
                    # plain file, or a symlink (only the link itself is removed)
                    os.remove(path)
            except OSError as err:
                print('Could not remove ' + path + ': ' + str(err))

    # Folders removed by both --AllTempFiles and --AllUnimportantTempFiles.
    shared_dirs = ['weights', 'Retrain', 'Expr', 'ChimericOut']
    targets = []
    if args.AllTempFiles:
        targets += shared_dirs + ['STARMapping', 'Resulttemp', 'ChiDist']
    else:
        if args.STARMappingFiles:
            targets.append('STARMapping')
        if args.AllUnimportantTempFiles:
            targets += shared_dirs
            targets.append(os.path.join('Resulttemp', '*.*'))
            targets.append(os.path.join('ChiDist', '*.npy'))
            for name in ('ChiDist_middle.txt', 'FusionRead.txt', 'Homo.txt',
                         'Prob.txt', 'ChiDist_filtered.txt'):
                targets.append(os.path.join('ChiDist', name))

    # Escape OutDir so glob characters in the folder name are taken literally;
    # only the wildcards written above are expanded.
    base = glob.escape(args.OutDir)
    for target in targets:
        _remove_matching(os.path.join(base, target))
    print('Finish!')
292 | 
293 | 
294 | 
295 | 


--------------------------------------------------------------------------------