├── Strelka ├── Strelka.germline ├── README.md ├── manta.sh ├── Strelka.sh ├── getmanta.sh ├── getpy.sh ├── 1.batch_manta.py └── 2.batch_streka_somatic.py ├── ex_region.sort.bed.gz ├── 7.run_sompy.sh ├── 6.index_bam.sh ├── 5.run_merge.sh ├── GATK ├── BQSR2.sh ├── BQSR1.sh ├── 1.bqsr1.report.py ├── 2.bqsr.2.bqsr.py ├── 3.multiprocess_mutect2.py └── mutect2.sh ├── multiprocess_index_bam.py ├── README.md ├── 3.dedup.sh ├── 4.downsample.sh ├── multiprocess_merge.py ├── 1.QC.sh ├── 2.mapping.sh └── batch_extract.py /Strelka/Strelka.germline: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ex_region.sort.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zic12345/SR2019/HEAD/ex_region.sort.bed.gz -------------------------------------------------------------------------------- /Strelka/README.md: -------------------------------------------------------------------------------- 1 | As is recommended, Manta was run first and its results were used as input of Strelka2. 
-------------------------------------------------------------------------------- /7.run_sompy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | som.py truth_snvs.vcf.gz my_snvs.vcf.gz -f ex_region.sort.bed.gz -o result 3 | -------------------------------------------------------------------------------- /Strelka/manta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | (time yhrun -n 1 -c 24 ./runWorkflow.py --memGb=60 --mode=local) 2>manta.logfile 3 | -------------------------------------------------------------------------------- /Strelka/Strelka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | (time yhrun -n 1 -c 24 ./runWorkflow.py --memGb=60 --mode=local) 2>strelka.logfile 3 | -------------------------------------------------------------------------------- /6.index_bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Batch index the merged BAM files 4 | 5 | dir=/path/to/merged 6 | yhrun -n 1 -c 24 python3 multiprocess_index_bam.py $dir -------------------------------------------------------------------------------- /5.run_merge.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # Batch merge the BAM files generated by 4.downsample.sh, put all unmerged BAM files in one directory 4 | # which do not contain any other files, run this script with one parameter which represent the depth 5 | # you used in 4.downsample.sh 6 | 7 | dir=/path/to/unmerged 8 | yhrun -n 1 -c 24 python3 multiprocess_merge.py $dir $1 -------------------------------------------------------------------------------- /Strelka/getmanta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | manta='/path/to/manta-1.5.0.centos6_x86_64/bin/configManta.py' 3 | 
reference='/path/to/hg19.fa' 4 | normbam='something' 5 | tumorbam='something' 6 | runpath='something' 7 | bed='/path/to/ex_region.sort.bed.gz' 8 | $manta \ 9 | --normalBam $normbam \ 10 | --tumorBam $tumorbam \ 11 | --referenceFasta $reference \ 12 | --runDir $runpath \ 13 | --exome \ 14 | --callRegions $bed -------------------------------------------------------------------------------- /GATK/BQSR2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | #apply BQSR 3 | gatk='/path/to/gatk --java-options -Xmx60g' 4 | dbsnp='/path/to/dbsnp_138.hg19.vcf.gz' 5 | Mills='/path/to/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz' 6 | ref='/path/to/hg19.fa' 7 | bed='/path/to/ex_region.sort.bed' 8 | cd $1 9 | bam=$2 10 | recaltable=$3 11 | bqsrbam=$4 12 | 13 | time $gatk ApplyBQSR \ 14 | -R $ref \ 15 | -I $bam \ 16 | -L $bed \ 17 | -O $bqsrbam \ 18 | --bqsr-recal-file $recaltable 19 | 20 | -------------------------------------------------------------------------------- /multiprocess_index_bam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os,sys 3 | from multiprocessing import Pool 4 | os.chdir(sys.argv[1]) 5 | files = [x for x in os.listdir() if (x.endswith(".cram"))|(x.endswith('.bam'))] 6 | 7 | def replaceRG(infile): 8 | script = "samtools index -@ 2 %s"%infile 9 | os.system(script) 10 | 11 | if __name__ == "__main__": 12 | p=Pool(12) 13 | for infile in files: 14 | p.apply_async(replaceRG,args=(infile,)) 15 | p.close() 16 | p.join() 17 | -------------------------------------------------------------------------------- /Strelka/getpy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | strelka2='/path/to/strelka-2.9.7/bin/configureStrelkaSomaticWorkflow.py' 3 | reference='/BIGDATA1/scut_hldu_1/work/Low_frequency/NAYH/Genome/UCSC/hg19.fa' 4 | normbam='something' 5 | tumorbam='something' 6 
| runpath='something' 7 | indel='something' 8 | bed='/path/to/ex_region.sort.bed.gz' 9 | $strelka2 \ 10 | --normalBam $normbam \ 11 | --tumorBam $tumorbam \ 12 | --referenceFasta $reference \ 13 | --runDir $runpath \ 14 | --indelCandidates $indel \ 15 | --exome \ 16 | --callRegions $bed 17 | -------------------------------------------------------------------------------- /GATK/BQSR1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | gatk='/path/to/gatk --java-options -Xmx60g' 3 | dbsnp='/path/to/dbsnp_138.hg19.vcf.gz' 4 | Mills='/path/to/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz' 5 | ref='/path/to/hg19.fa' 6 | bed='/path/to/ex_region.sort.bed' 7 | cd $1 8 | bam=$2 9 | recaltable=$3 10 | bqsrbam=$4 11 | ##the first step of BQSR, generate a recalibrating table 12 | time $gatk BaseRecalibrator \ 13 | -I $bam \ 14 | -L $bed \ 15 | --known-sites $dbsnp \ 16 | --known-sites $Mills \ 17 | -R $ref \ 18 | -O $recaltable 19 | 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SR2019 2 | All codes used to generate data in article "Systematic comparison of somatic variant calling performance among different sequencing depth and mutation frequency" 3 | 4 | If you have problems, please contact the authors or submit issues. 5 | 6 | 7 | ## Notice 8 | When you downsampled and mixed the bam files, all of the RG tags in NA.md.bam and YH.md.bam will be kept, Strelka2 ignores RG tags so the result won't be affected, however, the result of programs which need the information of RG tags such as GATK mutect2 will be affected, **thus you need to run RG tag replacement on the mixed bam files before calling mutations**. 
9 | -------------------------------------------------------------------------------- /GATK/1.bqsr1.report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #batch run BQSR1.sh for files in a certain path 3 | import os,sys 4 | from multiprocessing import Pool 5 | wkdir=sys.argv[1] 6 | 7 | def bqsr(script): 8 | os.system(script) 9 | 10 | 11 | if __name__ == '__main__': 12 | files = sorted([x for x in os.listdir(wkdir) if ((x.endswith(".cram"))|(x.endswith(".bam")))]) 13 | p=Pool(2) 14 | for file in files: 15 | report=file+'.report' 16 | bqsrbam='bqsr.'+file 17 | script='/absolute/path/to/BQSR1.sh %s %s %s %s'%(wkdir,file,report,bqsrbam) 18 | p.apply_async(bqsr,args=(script,)) 19 | p.close() 20 | p.join() 21 | -------------------------------------------------------------------------------- /GATK/2.bqsr.2.bqsr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #batch run BQSR2.sh for files in a certain path 3 | import os,sys 4 | from multiprocessing import Pool 5 | wkdir=sys.argv[1] 6 | 7 | def bqsr(script): 8 | os.system(script) 9 | 10 | if __name__ == '__main__': 11 | files = sorted([x for x in os.listdir(wkdir) if ((x.endswith(".cram"))|(x.endswith(".bam")))]) 12 | p=Pool(6) 13 | 14 | for file in files: 15 | report=file+'.report' 16 | bqsrbam='bqsr.'+file 17 | script='/absolute/path/to/BQSR2.sh %s %s %s %s'%(wkdir,file,report,bqsrbam) 18 | p.apply_async(bqsr,args=(script,)) 19 | p.close() 20 | p.join() 21 | -------------------------------------------------------------------------------- /3.dedup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # remove duplications using picard 4 | naI='/path/to/na.addRG.bam' 5 | naO='/path/to/na.addRG.mdup.bam' 6 | naD='/path/to/na.dup.txt' 7 | yhI='/path/to/yh.addRG.bam' 8 | yhO='/path/to/yh.addRG.mdup.bam' 9 | yhD='/path/to/yh.dup.txt' 10 
| NAlog='/path/to/na.dup.log' 11 | YHlog='/path/to/yh.dup.log' 12 | 13 | (time yhrun -N 1 -n 1 java -jar $picard MarkDuplicates \ 14 | I=$naI \ 15 | O=$naO \ 16 | M=$naD \ 17 | REMOVE_DUPLICATES=true \ 18 | ASO=coordinate \ 19 | CREATE_INDEX=true) 2>&1 >$NAlog 20 | 21 | (time yhrun -N 1 -n 1 java -jar $picard MarkDuplicates \ 22 | I=$yhI \ 23 | O=$yhO \ 24 | M=$yhD \ 25 | REMOVE_DUPLICATES=true \ 26 | ASO=coordinate \ 27 | CREATE_INDEX=true) 2>&1 >$YHlog 28 | -------------------------------------------------------------------------------- /GATK/3.multiprocess_mutect2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #batch run mutect2.sh for files in a certain path 3 | import os,sys 4 | os.chdir(sys.argv[1]) 5 | files = [x for x in os.listdir() if ((x.endswith(".bam"))|(x.endswith(".cram")))] 6 | os.mkdir("mutect") 7 | 8 | for file in files: 9 | outgzvcf = "mutect/"+file+".vcf.gz" 10 | NAdepth=file.split("-")[1][:-1] 11 | YHdepth=file.split("-")[2][:-1] 12 | totaldepth = int(NAdepth)+int(YHdepth) 13 | YHpercent = int(YHdepth)/int(totaldepth) 14 | num = file.split("-")[4].split('.')[0] 15 | rgid="mix%sX_YH%s_%s"%(totaldepth,YHpercent,num) 16 | script = "yhbatch /path/to/mutect2.sh %s %s %s"%(file,rgid,outgzvcf) 17 | os.system(script) 18 | 19 | # "yhbach" is the command which submit batch jobs to the computer clusters, we allocated 1 node and 24 threads for fastp. 
-------------------------------------------------------------------------------- /GATK/mutect2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | gatk='/BIGDATA1/scut_hldu_1/work/Low_frequency/NAYH/mix-NA1-YH1/GATK/gatk-4.1.0.0/gatk --java-options -Xmx40g' 3 | ref='/BIGDATA1/scut_hldu_1/work/Low_frequency/NAYH/Genome/UCSC/hg19.fa' 4 | bed='/BIGDATA1/scut_hldu_1/work/Low_frequency/NAYH/Genome/AgilentV5/ex_region.sort.bed' 5 | norm='/BIGDATA1/scut_hldu_1/work/Low_frequency/NAYH/mix-NA1-YH1/100Xnorm/bqsr/bqsr.NA-100X-1.bam' 6 | gnomad='/BIGDATA1/scut_hldu_1/work/Low_frequency/NAYH/mix-NA1-YH1/GATK/bundle/af-only-gnomad.raw.sites.hg19.vcf.gz' 7 | tumor=$1 8 | tumorid=$2 9 | outvcf=$3 10 | (time $gatk Mutect2 \ 11 | -R $ref \ 12 | -I $tumor \ 13 | -I $norm \ 14 | -tumor $tumorid \ 15 | -normal NAnorm100X \ 16 | --germline-resource $gnomad \ 17 | --disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \ 18 | --native-pair-hmm-threads 24 \ 19 | -O $outvcf 20 | ) 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /4.downsample.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # Downsample the original NA12878 and YH-1 BAM files to different depth. 4 | # Here we use a python script to conduct batch extraction 5 | # This script needs two parameters, the first one is the total depth you 6 | # finally want, and the second parameter is "NA" or "YH", which means the 7 | # file you want to downsample. 8 | 9 | # e.g. 
If you want to generate 100X depth mixed files, with YH-1 percent 10 | # 1%, 5%, 10%, 20%, 30% and 40%, first run this script with parameters: 11 | # "100 NA", then run this script with parameters:"100 YH", you will get 12 | # the downsampled NA12878 and YH-1 BAM files, the depth of NA12878 BAM 13 | # files are: 99X, 95X, 90X, 80X, 70X and 60X, the depth of YH-1 BAM files 14 | # are 1X, 5X, 10X, 20X, 30X and 40X. These downsampled BAM files can be 15 | # merged using script 5.run_merge.sh to get the 100X depth mixed files 16 | # you wanted 17 | 18 | python3 batch_extract.py $1 $2 19 | 20 | 21 | -------------------------------------------------------------------------------- /multiprocess_merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os,sys 3 | from multiprocessing import Pool 4 | os.chdir(sys.argv[1]) 5 | na = [x for x in os.listdir() if 'NA' in x] 6 | total_depth = int(sys.argv[2]) 7 | 8 | def merge_bam(out,bam1,bam2): 9 | script = "samtools merge -n -@ 3 %s %s %s"%(out,bam1,bam2) 10 | os.system(script) 11 | 12 | 13 | if __name__ == "__main__": 14 | p = Pool(8) 15 | result = [] 16 | for naname in na: 17 | num = naname.split(".")[0].split("-")[-1] 18 | na_depth = int(naname.split(".")[0].split("-")[1].split('X')[0]) 19 | yh_depth = total_depth - na_depth 20 | yhname = "YH-%sX-%s.bam"%(yh_depth,num) 21 | bam1 = os.path.abspath(naname) 22 | bam2 = os.path.abspath(yhname) 23 | out = os.path.join(os.path.split(os.getcwd())[0],"NAYH-%sX-%sX-somatic-%s.bam"%(na_depth,yh_depth,num)) 24 | result.append(p.apply_async(merge_bam,args=(out,bam1,bam2))) 25 | p.close() 26 | p.join() 27 | -------------------------------------------------------------------------------- /1.QC.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # Raw data QC using fastp 4 | # specify paths for softwares and input/output files 5 | fastp='/path/to/fastp' 6 | 7 
| # infile 8 | NA12878_1='/path/to/NA12878.r1.fq.gz' 9 | NA12878_2='/path/to/NA12878.r2.fq.gz' 10 | YH_1='/path/to/YH.r1.fq.gz' 11 | YH_2='/path/to/YH.r2.fq.gz' 12 | 13 | # outfile 14 | NAout1='/path/to/naout.r1.fq.gz' 15 | NAout2='/path/to/naout.r2.fq.gz' 16 | NA12878js='/path/to/na12878.json' 17 | NA12878html='/path/to/na12878.html' 18 | YHout1='/path/to/yhout.r1.fq.gz' 19 | YHout2='/path/to/yhout.r2.fq.gz' 20 | YHjs='/path/to/yh.json' 21 | YHhtml='/path/to/yh.html' 22 | logNA='/path/to/na.log' 23 | logYH='/path/to/yh.log' 24 | 25 | # run fastp 26 | (time yhrun -N 1 -n 1 $fastp -w 24 -j $NA1287js -h $NA12878html -R "NA fastp Report" -i $NA12878_1 -o $NAout1 -I $NA12878_2 -O $NAout2) 2>&1 > $logNA 27 | (time yhrun -N 1 -n 1 $fastp -w 24 -j $YHjs -h $YHhtml -R "YH fastp Report" -i $YH_1 -o $YHout1 -I $YH_2 -O $YHout2) 2>&1 > $logYH 28 | # note:"yhrun" is the command which submit jobs to the computer clusters, we allocated 1 node and 24 threads for fastp. -------------------------------------------------------------------------------- /2.mapping.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # mapping to hg19 reference genome using bwa mem 4 | # specify paths for softwares and input/output files 5 | hg19='/path/to/hg19' 6 | bwa='/path/to/bwa' 7 | picard='/path/to/picard.jar' 8 | NA12878_1='/path/to/naout.r1.fq.gz' 9 | NA12878_2='/path/to/naout.r2.fq.gz' 10 | YH_1='/path/to/yhout.r1.fq.gz' 11 | YH_2='/path/to/yhout.r2.fq.gz' 12 | NAsam='/path/to/nasam.sam' 13 | NAbam='/path/to/na.addRG.bam' 14 | YHsam='/path/to/yhsam.sam' 15 | YHbam='/path/to/yh.addRG.bam' 16 | logNAmap='/path/to/na.log' 17 | logYHmap='/path/to/yh.log' 18 | lognatrans='/path/to/lognatrans.log' 19 | logyhtrans='/path/to/logyhtrans.log' 20 | 21 | # run bwa mem 22 | (time yhrun -N 1 -n 1 $bwa mem -t 6 $hg19 $NA12878_1 $NA12878_2 > $NAsam ) 2>&1 > $logNAmap 23 | 24 | (time yhrun -N 1 -n 1 $bwa mem -t 6 $hg19 $YH_1 $YH_2 > $YHsam ) 2>&1 > 
$logYHmap 25 | 26 | # convert SAM to BAM and add read groups 27 | (time yhrun -N 1 -n 1 java -jar $picard AddOrReplaceReadGroups \ 28 | I=$NAsam \ 29 | O=$NAbam \ 30 | RGID=NA \ 31 | RGLB=WES \ 32 | RGPL=Illumina \ 33 | RGPU=Novaseq \ 34 | RGSM=NA \ 35 | SO=coordinate \ 36 | CREATE_INDEX=true ) 2>&1 > $lognatrans 37 | 38 | (time yhrun -N 1 -n 1 java -jar $picard AddOrReplaceReadGroups \ 39 | I=$YHsam \ 40 | O=$YHbam \ 41 | RGID=YH \ 42 | RGLB=WES \ 43 | RGPL=Illumina \ 44 | RGPU=Novaseq \ 45 | RGSM=YH \ 46 | SO=coordinate \ 47 | CREATE_INDEX=true ) 2>&1 > $logyhtrans -------------------------------------------------------------------------------- /Strelka/1.batch_manta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # batch start manta software on computer cluster, require 3 parameters: normal BAM file dir, 4 | # tumor BAM file dir and work dir 5 | 6 | import os,sys,shutil 7 | import subprocess as sub 8 | normdir = sys.argv[1] 9 | tumordir = sys.argv[2] 10 | workdir = sys.argv[3] 11 | 12 | tumordic={x:os.path.join(tumordir,x) for x in os.listdir(tumordir) if ((x.endswith(".bam"))|(x.endswith(".cram")))} 13 | normdic={x[-5]:os.path.join(normdir,x) for x in os.listdir(normdir) if ((x.endswith(".bam"))|(x.endswith(".cram")))} 14 | 15 | if not os.path.exists(workdir): 16 | os.makedirs(workdir,exist_ok=True) 17 | os.chdir(workdir) 18 | 19 | mantash = "manta.sh" 20 | 21 | with open('getmanta.sh') as f: 22 | getpy = f.readlines() 23 | 24 | for file in tumordic: 25 | tumor_abspath = tumordic[file] 26 | tmp = getpy.copy() 27 | os.makedirs(file.split(".")[0],exist_ok=True) 28 | os.chdir(file.split(".")[0]) 29 | shutil.copy(mantash,"./") 30 | norm_abspath = normdic['1'] 31 | tmp[4] = tmp[4].replace("something",norm_abspath) 32 | tmp[5] = tmp[5].replace("something",tumor_abspath) 33 | tmp[6] = tmp[6].replace("something",os.getcwd()) 34 | with open("getmanta.sh",'w') as f: 35 | f.writelines(tmp) 36 | 
os.system("chmod +x ./getmanta.sh") 37 | process = sub.Popen("./getmanta.sh",shell=True) 38 | ret = process.wait() 39 | if ret == 0: 40 | os.system("yhbatch manta.sh") 41 | os.chdir("..") 42 | -------------------------------------------------------------------------------- /batch_extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os,sys 3 | from multiprocessing import Pool 4 | 5 | # the original depth of NA12878 and YH-1 6 | nadepth=812.40 7 | yhdepth=407.25 8 | work_dph=int(sys.argv[1]) 9 | 10 | napercent = [0.99,0.95,0.90,0.80,0.70,0.60] 11 | yhpercent = [0.01,0.05,0.10,0.20,0.30,0.40] 12 | 13 | yhpercent = [0.05] 14 | NAtotal='/path/to/na.addRG.mdup.bam' 15 | YHtotal='/path/to/yh.addRG.mdup.bam' 16 | downsample="java -Xmx3000m -Djava.io.tmpdir=/BIGDATA1/scut_hldu_1/tmp -jar /BIGDATA1/scut_hldu_1/bin/picard.jar DownsampleSam" 17 | 18 | # Important! Random seed must NOT be changed! Or you won't get the same bam file! 
19 | seeds = {1:6666,2:8888,3:9999} 20 | 21 | def extract(script): 22 | os.system(script) 23 | 24 | 25 | p=Pool(18) 26 | if sys.argv[2]=='NA': 27 | for i in napercent: 28 | na_dph = int(work_dph*i) 29 | p_na = na_dph/nadepth 30 | for a in seeds: 31 | rand_seed = seeds[a] 32 | script = "%s I=%s O=NA-%sX-%s.bam R=%s P=%s A=0.00000001"%(downsample,NAtotal,na_dph,a,rand_seed,p_na) 33 | p.apply_async(extract,args=(script,)) 34 | 35 | elif sys.argv[2]=='YH': 36 | for i in yhpercent: 37 | yh_dph = int(work_dph*i) 38 | p_yh = yh_dph/yhdepth 39 | for a in seeds: 40 | rand_seed = seeds[a] 41 | script = "%s I=%s O=YH-%sX-%s.bam R=%s P=%s A=0.00000001"%(downsample,YHtotal,yh_dph,a,rand_seed,p_yh) 42 | p.apply_async(extract,args=(script,)) 43 | 44 | p.close() 45 | p.join() 46 | -------------------------------------------------------------------------------- /Strelka/2.batch_streka_somatic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # batch start Strelka2 somatic mode on computer cluster, require 4 parameters: normal BAM file dir, 4 | # tumor BAM file dir, work dir and the according manta result directory (the 3rd parameter of 1.batch_manta.py) 5 | 6 | import os,sys,shutil 7 | import subprocess as sub 8 | normdir = sys.argv[1] 9 | tumordir = sys.argv[2] 10 | workdir = sys.argv[3] 11 | depthN = sys.argv[4] 12 | tumordic={x:os.path.join(tumordir,x) for x in os.listdir(tumordir) if ((x.endswith(".bam"))|(x.endswith(".cram")))} 13 | normdic={x[-5]:os.path.join(normdir,x) for x in os.listdir(normdir) if ((x.endswith(".bam"))|(x.endswith(".cram")))} 14 | 15 | if not os.path.exists(workdir): 16 | os.makedirs(workdir,exist_ok=True) 17 | os.chdir(workdir) 18 | manta_dir = os.path.join(os.path.split(workdir)[0],"manta-%sNorm"%depthN) 19 | strelkash = "Strelka.sh" 20 | 21 | with open('getpy.sh') as f: 22 | getpy = f.readlines() 23 | 24 | for file in tumordic: 25 | tumor_abspath = tumordic[file] 26 | tmp = 
getpy.copy() 27 | 28 | os.makedirs(file.split(".")[0],exist_ok=True) 29 | os.chdir(file.split(".")[0]) 30 | shutil.copy(strelkash,"./") 31 | norm_abspath = normdic['1'] 32 | manta_path = os.path.join(manta_dir,file.split(".")[0],"results/variants/candidateSmallIndels.vcf.gz") 33 | tmp[4] = tmp[4].replace("something",norm_abspath) 34 | tmp[5] = tmp[5].replace("something",tumor_abspath) 35 | tmp[6] = tmp[6].replace("something",os.getcwd()) 36 | tmp[7] = tmp[7].replace("something",manta_path) 37 | with open("getpy.sh",'w') as f: 38 | f.writelines(tmp) 39 | os.system("chmod +x ./getpy.sh") 40 | process = sub.Popen("./getpy.sh",shell=True) 41 | ret = process.wait() 42 | if ret == 0: 43 | os.system("yhbatch Strelka.sh") 44 | os.chdir("..") --------------------------------------------------------------------------------