├── .gitignore
├── .idea
├── .gitignore
├── .name
├── SPART.iml
├── SPART.py
├── inspectionProfiles
│ ├── Project_Default.xml
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── .readthedocs.yaml
├── 00_Contig_screen
├── chloroplast.paf
├── chloroplast.txt
├── fastp.sh
├── flye.sh
├── gemma_los.py
├── hifiasm.sh
├── mitochondrion.paf
├── mitochondrion.txt
├── rm_mt_cp.sh
└── verkko.sh
├── 01_Contig_scaffolding
├── Bionano_DLS_map.sh
├── HiC-Pro.sh
├── hicpro_config.txt
└── yahs.sh
├── 02_Gap_patching
├── Centromeric_region_analysis.sh
├── chip-seq.py
├── paf_filter.pl
├── renameagp.pl
└── wfmash_ragtag.sh
├── 03_Polishing
├── bwa_winnowmap.py
├── callsv_snv.py
├── calsv_snv.sh
├── clust.json
├── clust_align.json
├── conf_ck.yaml
└── conf_ck_align.yaml
├── 04_Evaluation
├── BUSCO.sh
├── bac.sh
├── ltr.sh
├── mapping_rates_coverages .sh
├── qv.sh
├── synteny.sh
├── while.sh
└── winnowmap.sh
├── 05_Annotation
├── Snakefile
├── clust.json
├── config.yaml
└── modules
│ ├── __pycache__
│ ├── fasta.cpython-310.pyc
│ ├── fasta.cpython-35.pyc
│ ├── fasta.cpython-39.pyc
│ ├── mygff.cpython-310.pyc
│ └── mygff.cpython-35.pyc
│ ├── fasta.py
│ └── mygff.py
├── LICENSE
├── README.md
├── SPART.py
├── SPART.yaml
├── clust.json
├── conf_ck.yaml
├── contacts.md
├── docs
├── Makefile
├── make.bat
└── source
│ ├── README.md
│ ├── README1.md
│ ├── conf.py
│ ├── index.rst
│ ├── pipeline.jpg
│ └── requirements.txt
├── example
├── README.md
├── cp
│ └── chloroplast.fasta
├── mt
│ └── mitochondrion.fasta
├── pcr
│ ├── PCRFREE_1.fq
│ └── PCRFREE_2.fq
├── rule.svg
└── verkko
│ └── verkko.fasta
└── pic
├── pipeline.jpg
└── rule.png
/.gitignore:
--------------------------------------------------------------------------------
1 | docs/build/
2 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/.idea/.gitignore
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | SPART.py
--------------------------------------------------------------------------------
/.idea/SPART.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/SPART.py:
--------------------------------------------------------------------------------
import os
import re
import sys

# Per-sample fastq lookup tables; the two loops at the bottom fill
# hifi_mix (HiFi) and e (ONT).  The remaining dicts are kept for
# compatibility with the original module-level names.
b = {}
hifi_single = {}
hifi_mix = {}
e = {}
d = {}

# All paths and settings are injected by Snakemake via --configfile.
HiFi_hybrid_all = config["HiFi_reads_merge"]
ONT_all = config["ONT_reads_merge"]
mitochondrion = config["mitochondrion"]
chloroplast = config["chloroplast"]
hic_hybrid_dir = config["hic_dir"]
SPART_dir = config["SPART_dir"]
hic_hybrid_enzyme = config["hic_enzyme"]
hic_enzyme_ligation_site = config["hic_enzyme_ligation_site"]
verkko_fa = config["verkko_assemble"]
pcrfree_hybrid_r1 = config["pcrfree_r1"]
pcrfree_hybrid_r2 = config["pcrfree_r2"]
google_deepvariant_latest_gpu_sif = config["google_deepvariant_latest-gpu_sif"]
W = config["WORKDIR"]
DIR = config["DIR"]
DIRont = config["DIRont"]

# Index every fastq file by its basename up to the ".fastq" suffix;
# values are absolute paths.  DIR holds HiFi reads, DIRont holds ONT reads.
for entry in os.listdir(DIR):
    if ".fastq" in entry:
        hifi_mix[entry.split(".fastq")[0]] = os.path.join(DIR, entry)

for entry in os.listdir(DIRont):
    if ".fastq" in entry:
        e[entry.split(".fastq")[0]] = os.path.join(DIRont, entry)
35 |
# Terminal target: request the merged HiFi+PCR-free bam and the merged ONT bam.
rule final:
    input:
        W+"hybrid_hifi_pcr/hybrid.bam",
        W + "ont_merge/q10l120k.bam"

# Adapter/quality trimming of the merged HiFi reads with fastp.
rule hifi_fastp:
    input:
        HiFi_hybrid_all
    output:
        W+"fastp/hybrid.fq"
    shell:
        "fastp -w 16 -i {input} -o {output}"

# ONT filtering: keep reads with quality >= 10 and length >= 100 kb.
rule ont_fastp:
    input:
        ONT_all
    output:
        W+"fastp/ont.fq"
    shell:
        "fastp -q 10 -l 100000 -w 16 -i {input} -o {output}"

# hifiasm primary assembly of HiFi reads assisted by ultra-long ONT reads;
# the awk step converts the primary-contig GFA to fasta (S-lines only).
rule hifiasm:
    input:
        hifi=W+"fastp/hybrid.fq",
        ont=W+"fastp/ont.fq"
    output:
        W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa"
    params:
        W+"hifiasm_hybrid"
    shell:
        """
        cd {params}
        hifiasm -o hybrid.all.asm --primary -t 96 --ul {input.ont} -k 63 {input.hifi}
        awk '/^S/{{print ">"$2;print $3}}' hybrid.all.asm.p_ctg.gfa > {output}
        """

# Independent ONT assembly with Flye, later used to patch gaps.
# -g 5.4g: expected genome size (wheat-scale); --asm-coverage 80 subsamples
# the longest reads for the initial disjointig stage.
rule flye:
    input:
        W+"fastp/ont.fq"
    output:
        W + "flye/assembly.fasta"
    params:
        W
    shell:
        """
        cd {params}
        flye --nano-hq {input} --read-error 0.1 -g 5.4g --asm-coverage 80 --scaffold --out-dir flye --threads 96 --no-alt-contigs
        """
84 |
85 | rule rm_mt_cp:
86 | input:
87 | hybrid=W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa",
88 | mt=mitochondrion,
89 | cp=chloroplast
90 | output:
91 | W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
92 | params:
93 | W+"hifiasm_hybrid"
94 | shell:
95 | """
96 | cd {params}
97 | minimap2 -t 96 -x asm5 {input.mt} {input.hybrid}> mitochondrion.paf
98 | minimap2 -t 96 -x asm5 {input.cp} {input.hybrid}> chloroplast.paf
99 | python gemma_los.py mitochondrion.paf > mitochondrion.txt
100 | python gemma_los.py chloroplast.paf > chloroplast.txt
101 | seqkit grep -v -f chloroplast.txt {input.hybrid} > wheat_remove_cp.fa
102 | seqkit grep -v -f mitochondrion.txt wheat_remove_cp.fa > {output}
103 | """
104 |
105 | rule hicpro:
106 | input:
107 | hic=hic_hybrid_dir,
108 | ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
109 | output:
110 | W+"hic_hybrid/hic_hybrid.bam"
111 | params:
112 | dir=W+"hic_hybrid",
113 | prefix="hybrid.remove_cp_mt",
114 | spart_dir=SPART_dir,
115 | enzyme=hic_hybrid_enzyme,
116 | LIGATION_SITE=hic_enzyme_ligation_site
117 | shell:
118 | """
119 | cd {params.dir}
120 | ln -s {input.ref} ./
121 | bowtie2-build --large-index --threads 96 {params.prefix}.fa {params.prefix}
122 | samtools faidx {params.prefix}.fa
123 | awk '{{print $1 "\t" $2}}' {params.prefix}.fa.fai > genome_sizes.bed
124 | python ./HiC-Pro/bin/utils/digest_genome.py -r ^{params.enzyme} -o enzyme.bed {params.prefix}.fa
125 | makeblastdb -in {params.prefix}.fa -dbtype nucl -parse_seqids -out {params.prefix}
126 | cp {params.spart_dir}/01_Contig_scaffolding/hicpro_config.txt ./
127 | sed -i 's#^N_CPU = #N_CPU = 96#g' hicpro_config.txt
128 | sed -i 's#^BOWTIE2_IDX_PATH = #BOWTIE2_IDX_PATH = {params.dir}#g' hicpro_config.txt
129 | sed -i 's#^REFERENCE_GENOME = #REFERENCE_GENOME = {params.prefix}#g' hicpro_config.txt
130 | sed -i 's#^GENOME_SIZE = #GENOME_SIZE = {params.dir}/genome_sizes.bed#g' hicpro_config.txt
131 | sed -i 's#^GENOME_FRAGMENT = #GENOME_FRAGMENT = {params.dir}/enzyme.bed#g' hicpro_config.txt
132 | sed -i 's#^GENOME_FRAGMENT = #GENOME_FRAGMENT = {params.dir}/enzyme.bed#g' hicpro_config.txt
133 | HiC-Pro -i {input.hic} -c hicpro_config.txt -o {params.dir}
134 | cd bowtie_results/bwt2
135 | for item in dir {params.dir}/bowtie_results/bwt2/*/*.bwt2pairs.bam; do samtools sort -m 1500M -n -@ 96 $item > $item.bam; done
136 | samtools merge -@ 96 -o {output} {params.dir}/bowtie_results/bwt2/*/*.bwt2pairs.bam.bam
137 | """
138 |
139 | rule yahs:
140 | input:
141 | bam=W+"hic_hybrid/hic_hybrid.bam",
142 | ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
143 | output:
144 | W + "yahs_hybrid/yahs_hybrid.fa"
145 | params:
146 | dir = W + "yahs_hybrid",
147 | prefix = "hybrid_bam",
148 | enzyme = hic_hybrid_enzyme
149 | shell:
150 | """
151 | cd {params.dir}
152 | yahs -e {params.enzyme} {input.ref} {input.bam} -o {params.prefix}
153 | cp {params.dir}/yahs_bam_scaffolds_final.fa ./yahs_hybrid.fa
154 | """
155 |
# Patch gaps in the Hi-C scaffolds with the Flye ONT assembly: wfmash
# produces the alignments, which are symlinked where ragtag.py patch
# expects its cached paf (ragtag_output/ragtag.patch.asm.paf).
rule patch_flye:
    input:
        single_hybrid=W + "yahs_hybrid/yahs_hybrid.fa",
        flye=W + "flye/assembly.fasta"
    output:
        W + "patch_flye/patch_single_hybrid_flye.fa"
    params:
        dir = W + "patch_flye",
        prefix = "single_hybrid_flye",
    shell:
        """
        cd {params.dir}
        wfmash {input.single_hybrid} {input.flye} > {params.prefix}.paf
        mkdir ragtag_output
        cd ragtag_output
        ln -s ../{params.prefix}.paf ragtag.patch.asm.paf
        cd ..
        ragtag.py patch -f 10000 --remove-small {input.single_hybrid} {input.flye}
        cp {params.dir}/ragtag_output/ragtag.patch.fasta {output}
        """

# Second patching round with the verkko assembly; afterwards builds the
# bwa-mem2 index and the meryl repetitive k-mer list (k=27, most-frequent
# 0.02%) consumed by the winnowmap rules below.
rule patch_verkko:
    input:
        single_hybrid_flye=W + "patch_flye/patch_single_hybrid_flye.fa",
        verkko=verkko_fa
    output:
        ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        txt = W + "repetitive_k27.txt"
    params:
        dir = W + "patch_verkko",
        prefix = "single_hybrid_flye_verkko",
    shell:
        """
        cd {params.dir}
        wfmash {input.single_hybrid_flye} {input.verkko} > {params.prefix}.paf
        mkdir ragtag_output
        cd ragtag_output
        ln -s ../{params.prefix}.paf ragtag.patch.asm.paf
        cd ..
        ragtag.py patch -f 10000 --remove-small {input.single_hybrid_flye} {input.verkko}
        cp {params.dir}/ragtag_output/ragtag.patch.fasta {output.ref}
        bwa-mem2.avx512bw index {output.ref}
        meryl count k=27 output merylDB {output.ref}
        meryl print greater-than distinct=0.9998 merylDB > {output.txt}
        """
201 |
# Map HiFi reads to the patched assembly with winnowmap, down-weighting the
# meryl repetitive k-mers.
# NOTE(review): input.fq is the merged fastp output, so every {hifi_mix}
# wildcard value maps the same file — presumably the per-sample path
# hifi_mix[wildcard] was intended; confirm before relying on the per-sample
# sams being distinct.
rule winnowmap_hifi:
    input:
        fq=W+"fastp/hybrid.fq",
        ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        txt = W + "repetitive_k27.txt"
    output:
        sam=W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    benchmark:
        W+"benchmarks/hifi_mix_winnowmap/{hifi_mix}.benchmark.txt"
    shell:
        """
        winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output.sam}
        """

# Convert each per-sample sam to a coordinate-sorted bam (params = .fai used
# as the header template for samtools view -t).
rule winnowmap_hifi_sort:
    input:
        W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    output:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    params:
        W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai"
    benchmark:
        W + "benchmarks/hifi_mix_sort/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# Drop unmapped and secondary alignments (-F 0x104 = UNMAP|SECONDARY).
rule winnowmap_hifi_sort_filter:
    input:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    output:
        W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam"
    benchmark:
        W + "benchmarks/hifi_mix_sort_filter/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -F0x104 -hb {input} > {output}"

# Merge all filtered per-sample HiFi bams (uncompressed, -l 0).
rule winnowmap_hifi_sort_filter_merge:
    input:
        expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix)
    output:
        W+"hybrid/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input}"

# Map PCR-free short reads with bwa-mem2 and coordinate-sort the result.
rule pcr_free:
    input:
        fa=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        r1=pcrfree_hybrid_r1,
        r2=pcrfree_hybrid_r2
    output:
        W+"hybrid_hifi_pcr/pcr.bam"
    shell:
        "bwa-mem2.avx512bw mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 30G -o {output} -"

# Merge the filtered HiFi bams with the PCR-free bam (polishing input).
rule winnowmap_hifi_filter_pcr_merge:
    input:
        hifi=expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix),
        pcr=W+"hybrid_hifi_pcr/pcr.bam"
    output:
        W+"hybrid_hifi_pcr/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid_pcr/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}"

# ONT mapping mirror of winnowmap_hifi (map-ont preset).
# NOTE(review): same merged-fastq-vs-wildcard concern as winnowmap_hifi.
rule winnowmap_ont:
    input:
        fq=W+"fastp/ont.fq",
        ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        txt=W+"repetitive_k27.txt"
    output:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    benchmark:
        W+"benchmarks/ont_winnowmap/{e}.benchmark.txt"
    shell:
        "winnowmap --MD -W {input.txt} -ax map-ont -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

# Coordinate-sort each per-sample ONT sam into bam.
rule winnowmap_ont_sort:
    input:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    output:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    params:
        W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai"
    benchmark:
        W + "benchmarks/ont_sort/{e}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# Drop unmapped and secondary ONT alignments.
rule winnowmap_ont_sort_filter:
    input:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    output:
        W+"ont_filter/{e}_q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_filter/{e}.benchmark.txt"
    shell:
        "samtools view -@ 128 -F0x104 -hb {input} > {output}"

# Merge all filtered ONT bams into the final ONT alignment file.
rule winnowmap_ont_sort_filter_merge:
    input:
        expand(W+"ont_filter/{e}_q10l120k.bam",e=e)
    output:
        W + "ont_merge/q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_merge/benchmark.txt"
    shell:
        "samtools merge -@ 128 -o {output} {input}"
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | 1689602086247
41 |
42 |
43 | 1689602086247
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 | os: ubuntu-20.04
11 | tools:
12 | python: "3.9"
13 | # You can also specify other tool versions:
14 | # nodejs: "19"
15 | # rust: "1.64"
16 | # golang: "1.19"
17 |
18 | # Build documentation in the "docs/" directory with Sphinx
19 | sphinx:
20 | configuration: docs/source/conf.py
21 |
22 | # Optionally build your docs in additional formats such as PDF and ePub
23 | # formats:
24 | # - pdf
25 | # - epub
26 |
27 | # Optional but recommended, declare the Python requirements required
28 | # to build your documentation
29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
30 | python:
31 | install:
32 | - requirements: docs/source/requirements.txt
--------------------------------------------------------------------------------
/00_Contig_screen/chloroplast.txt:
--------------------------------------------------------------------------------
1 | ptg000373l
2 | ptg000680l
3 | ptg000714l
4 | ptg001146l
5 | ptg001324l
6 | ptg001414l
7 | ptg001489l
8 | ptg001497l
9 | ptg001519l
10 | ptg001579l
11 | ptg001661l
12 | ptg001777l
13 | ptg001836l
14 | ptg001902l
15 | ptg001941l
16 | ptg001978l
17 | ptg002103l
18 | ptg002292l
19 | ptg002325l
20 | ptg002335l
21 | ptg002346l
22 | ptg002359l
23 | ptg002374l
24 | ptg002405l
25 | ptg002426l
26 | ptg002434l
27 | ptg002480l
28 | ptg002517l
29 | ptg002537l
30 | ptg002548l
31 | ptg002575l
32 | ptg002592l
33 | ptg002686l
34 | ptg002752l
35 | ptg002928l
36 | ptg002944l
37 | ptg002968l
38 | ptg002991l
39 | ptg003000l
40 | ptg003059l
41 | ptg003087l
42 | ptg003124l
43 | ptg003157l
44 | ptg003172l
45 | ptg003203l
46 | ptg003232l
47 | ptg003240l
48 | ptg003333l
49 | ptg003389l
50 | ptg003431l
51 | ptg003473l
52 | ptg003528l
53 | ptg003569l
54 | ptg003578l
55 | ptg003587l
56 | ptg003666l
57 | ptg003668l
58 | ptg003671l
59 | ptg003676l
60 | ptg003708l
61 | ptg003718l
62 | ptg003726l
63 | ptg003749l
64 | ptg003761l
65 | ptg003815l
66 | ptg003857l
67 | ptg003858l
68 | ptg003861l
69 | ptg003863l
70 | ptg003909l
71 | ptg003919l
72 | ptg003927l
73 | ptg003944l
74 | ptg003992l
75 | ptg003999l
76 | ptg004008l
77 | ptg004051l
78 | ptg004087l
79 | ptg004090l
80 | ptg004151l
81 | ptg004168l
82 | ptg004193l
83 | ptg004211l
84 | ptg004218l
85 | ptg004246l
86 | ptg004247l
87 | ptg004258l
88 | ptg004307l
89 | ptg004308l
90 | ptg004346l
91 | ptg004362l
92 | ptg004370l
93 | ptg004411l
94 | ptg004419l
95 | ptg004433l
96 | ptg004446l
97 | ptg004492l
98 | ptg004523l
99 | ptg004526l
100 | ptg004534l
101 | ptg004546l
102 | ptg004555l
103 | ptg004558l
104 | ptg004569l
105 | ptg004576l
106 | ptg004635l
107 | ptg004654l
108 | ptg004657l
109 | ptg004677l
110 | ptg004683l
111 | ptg004687l
112 | ptg004694l
113 | ptg004714l
114 | ptg004717l
115 | ptg004718l
116 | ptg004724l
117 | ptg004746l
118 | ptg004756l
119 | ptg004796l
120 | ptg004804l
121 | ptg004824l
122 | ptg004861l
123 | ptg004874l
124 | ptg004885l
125 | ptg004916l
126 | ptg004927l
127 | ptg004968l
128 | ptg004987l
129 | ptg004989l
130 | ptg005014l
131 |
--------------------------------------------------------------------------------
/00_Contig_screen/fastp.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Clean raw reads with fastp.
# Args: $1 = ONT fastq, $2 = HiFi fastq.

ont=$1
hifi=$2

# HiFi: default adapter/quality trimming only.
fastp -w 16 -i $hifi -o hifi_clean_data.fastq
# ONT: additionally require quality >= 10 and length >= 100 kb.
fastp -q 10 -l 100000 -w 16 -i $ont -o ont_clean_data.fastq
--------------------------------------------------------------------------------
/00_Contig_screen/flye.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | ont=$1
4 | outdir=$2
5 | threads=$3
6 |
7 |
8 | flye --nano-hq $ont --read-error 0.1 -g 5.4g --asm-coverage 80 --scaffold --out-dir $outdir --threads $threads --no-alt-contigs
--------------------------------------------------------------------------------
/00_Contig_screen/gemma_los.py:
--------------------------------------------------------------------------------
import argparse
import os
import re
import sys


def iter_names_over_threshold(lines, length_threshold):
    """Yield each query name whose cumulative aligned length exceeds the threshold.

    `lines` are PAF records; rows belonging to the same query (column 1) are
    assumed to be grouped together, which is how minimap2 emits them.  The
    aligned length of one row is query_end - query_start (columns 4 and 3).
    Each name is yielded at most once, as soon as its running total passes
    `length_threshold`.  Blank lines are skipped.
    """
    current_name = None      # query currently being accumulated
    total = 0                # cumulative aligned length for current_name
    already_emitted = False  # True once current_name has been yielded
    for raw in lines:
        raw = raw.strip()
        if not raw:
            continue
        fields = raw.split("\t")
        name = fields[0]
        aligned = int(fields[3]) - int(fields[2])
        if name == current_name:
            total += aligned
        else:
            # New query: reset the accumulator.
            current_name = name
            total = aligned
            already_emitted = False
        if not already_emitted and total > length_threshold:
            yield name
            already_emitted = True


def main():
    """Parse command-line arguments and print qualifying query names."""
    parser = argparse.ArgumentParser(
        description="Filter config names based on fixed total length threshold.")
    parser.add_argument("input_file", help="Path to the input paf file")
    parser.add_argument("--length", type=int, required=True,
                        help="Threshold length to aligned printing")
    args = parser.parse_args()

    with open(args.input_file) as file_object:
        for name in iter_names_over_threshold(file_object, args.length):
            print(name)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/00_Contig_screen/hifiasm.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# De novo assembly with hifiasm, assisted by ultra-long ONT reads (--ul).
# Args: $1 = HiFi reads, $2 = ONT reads, $3 = output prefix, $4 = threads.

hifi_reads=$1
ont_reads=$2
pre=$3
thread=$4

# -k 63: large k-mer size used throughout the SPART pipeline.
hifiasm -k 63 -o "$pre".asm -t $thread $hifi_reads --ul $ont_reads
--------------------------------------------------------------------------------
/00_Contig_screen/mitochondrion.txt:
--------------------------------------------------------------------------------
1 | ptg000019l
2 | ptg000176l
3 | ptg000203l
4 | ptg000613l
5 | ptg000791l
6 | ptg000966l
7 | ptg001425l
8 | ptg001436l
9 | ptg001452l
10 | ptg001510l
11 | ptg001591l
12 | ptg001634l
13 | ptg001685l
14 | ptg001703l
15 | ptg001782l
16 | ptg001854l
17 | ptg001887l
18 | ptg001991l
19 | ptg002013l
20 | ptg002149l
21 | ptg002290l
22 | ptg002319l
23 | ptg002341l
24 | ptg002419l
25 | ptg002477l
26 | ptg002763l
27 | ptg003220l
28 | ptg003271l
29 | ptg003365l
30 | ptg003378l
31 | ptg003460l
32 | ptg003535l
33 | ptg003630l
34 | ptg003829l
35 | ptg003845l
36 | ptg003854l
37 | ptg004354l
38 | ptg004503l
39 | ptg004621l
40 | ptg004942l
41 | ptg005046l
42 |
--------------------------------------------------------------------------------
/00_Contig_screen/rm_mt_cp.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Remove mitochondrial and chloroplast contigs from an assembly.
# Args: $1 = mitochondrion fasta, $2 = chloroplast fasta,
#       $3 = assembly fasta, $4 = threads.

mt=$1
cp=$2
ref=$3
threads=$4
# Align the assembly to each organelle reference (asm5: closely related).
minimap2 -t $threads -x asm5 $mt $ref> mitochondrion.paf

minimap2 -t $threads -x asm5 $cp $ref> chloroplast.paf

# List contigs whose cumulative aligned length exceeds 100 bp.
# NOTE: gemma_los.py is resolved relative to the current directory.
python gemma_los.py mitochondrion.paf --length 100 > mitochondrion.txt
python gemma_los.py chloroplast.paf --length 100 > chloroplast.txt

# Drop the listed contigs (seqkit grep -v = exclude by name).
seqkit grep -v -f chloroplast.txt $ref > wheat_remove_cp.fa

seqkit grep -v -f mitochondrion.txt wheat_remove_cp.fa > wheat_remove_cp_mt.fa
--------------------------------------------------------------------------------
/00_Contig_screen/verkko.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Run the verkko assembler on a SLURM cluster.
# Args: $1 = output dir, $2 = HiFi reads, $3 = ONT reads,
#       $4 = threads, $5 = local memory (GB).
#
# Fixes: --nano previously received "$4" (the thread count) instead of the
# ONT reads ($ONT); the duplicated --slurm flag was removed.

output=$1
HiFi=$2
ONT=$3
threads=$4
memory=$5

verkko -d $output --hifi $HiFi --nano $ONT --threads $threads --slurm --local-memory $memory --snakeopts "--max-jobs-per-second 10 --max-status-checks-per-second 0.5 --restart-times 1 --local-cores 128 --jobs 250" --base-k 1001 --window 971 --hifi-coverage 100 --sto-run 128 200 24 --mer-run 128 200 20000 --ovb-run 64 100 24 --ovs-run 16 35 24 --red-run 16 31 24 --mbg-run 32 0 20000 --utg-run 128 240 20000 --spl-run 128 240 20000 --ali-run 23 50 20000 --pop-run 128 240 20000 --utp-run 1 240 200000 --lay-run 1 240 20000 --sub-run 128 240 200000 --par-run 128 240 20000 --cns-run 24 0 20000
--------------------------------------------------------------------------------
/01_Contig_scaffolding/Bionano_DLS_map.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Bionano DLS optical-map hybrid scaffolding.
# Args: $1 = threads, $2 = bnx, $3 = reference cmap, $4 = prefix, $5 = xml,
#       $6 = Bionano tools dir, $7 = cluster xml, $8 = reference fasta,
#       $9 = Bionano cmap, $10 = merge xml, $11 = RefAligner binary.
#
# Fix: positional parameters beyond $9 require braces — "$10" expands as
# "${1}0" (threads followed by a literal 0), so merge_xml and RefAligner
# were silently wrong.

threads=$1
bnx=$2
ref_cmap=$3
prefix=$4
xml=$5
Bio_dir=$6
cluster_xml=$7
ref=$8
bio_camp=$9
merge_xml=${10}
RefAligner=${11}
# Convert the reference to cmap (DLE-1 recognition site CTTAAG).
perl fa2cmap_multi_color.pl -i $ref -e cttaag 1 -o .
# De novo optical-map assembly.
python pipelineCL.py -Tn $threads -i 5 -b $bnx -r $ref_cmap -l $prefix -e w -a $xml -t $Bio_dir -y -z --species-reference other -C $cluster_xml -F 1
# Hybrid scaffolding of sequence + optical map.
perl hybridScaffold.pl -n $ref -b $bio_camp -u CTTAAG -c $merge_xml -r $RefAligner -o bio_hybrid -B 2 -N 2 -f
--------------------------------------------------------------------------------
/01_Contig_scaffolding/HiC-Pro.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Prepare a genome for HiC-Pro and launch it in parallel (-p) mode.
# Args: $1 = reference fasta, $2 = reference prefix, $3 = Hi-C data dir,
#       $4 = HiC-Pro config file, $5 = output dir.

ref=$1
ref_prefix=$2
hicpro_data=$3
hicpro_config=$4
hicpro_outdir=$5

# Bowtie2 index (HiC-Pro maps with bowtie2).
ln -s $ref ./"$ref_prefix".fa
bowtie2-build --large-index --threads 128 "$ref_prefix".fa "$ref_prefix"

# Chromosome-sizes table required by the HiC-Pro config (GENOME_SIZE).
samtools faidx "$ref_prefix".fa
awk '{print $1 "\t" $2}' "$ref_prefix".fa.fai > genome_sizes.bed

# Restriction-fragment bed for DpnII (^GATC) and a blast db of the genome.
python ./HiC-Pro/bin/utils/digest_genome.py -r ^GATC -o wheat_DpnII.bed "$ref_prefix".fa
makeblastdb -in "$ref_prefix".fa -dbtype nucl -parse_seqids -out "$ref_prefix"

# -p: generate cluster submission scripts instead of running locally.
HiC-Pro -i $hicpro_data -c $hicpro_config -o $hicpro_outdir -p
--------------------------------------------------------------------------------
/01_Contig_scaffolding/hicpro_config.txt:
--------------------------------------------------------------------------------
1 | # Please change the variable settings below if necessary
2 |
3 | #########################################################################
4 | ## Paths and Settings - Do not edit !
5 | #########################################################################
6 |
7 | TMP_DIR = tmp
8 | LOGS_DIR = logs
9 | BOWTIE2_OUTPUT_DIR = bowtie_results
10 | MAPC_OUTPUT = hic_results
11 | RAW_DIR = rawdata
12 |
13 | #######################################################################
14 | ## SYSTEM - PBS - Start Editing Here !!
15 | #######################################################################
16 | N_CPU =
17 | LOGFILE = hicpro.log
18 |
19 | JOB_NAME = hicpro-testop
20 | JOB_MEM =
21 | JOB_WALLTIME =
22 | JOB_QUEUE =
23 | JOB_MAIL =
24 |
25 | #########################################################################
26 | ## Data
27 | #########################################################################
28 |
29 | PAIR1_EXT = _R1
30 | PAIR2_EXT = _R2
31 |
32 | #######################################################################
33 | ## Alignment options
34 | #######################################################################
35 |
36 | MIN_MAPQ = 10
37 |
38 | BOWTIE2_IDX_PATH =
39 | BOWTIE2_GLOBAL_OPTIONS = --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder
40 | BOWTIE2_LOCAL_OPTIONS = --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder
41 |
42 | #######################################################################
43 | ## Annotation files
44 | #######################################################################
45 |
46 | REFERENCE_GENOME =
47 | GENOME_SIZE =
48 |
49 | #######################################################################
50 | ## Allele specific
51 | #######################################################################
52 |
53 | ALLELE_SPECIFIC_SNP =
54 |
55 | #######################################################################
56 | ## Capture Hi-C analysis
57 | #######################################################################
58 |
59 | CAPTURE_TARGET =
60 | REPORT_CAPTURE_REPORTER =
61 |
62 | #######################################################################
63 | ## Digestion Hi-C
64 | #######################################################################
65 |
66 | GENOME_FRAGMENT =
67 | LIGATION_SITE = GATCGATC
68 | MIN_FRAG_SIZE = 100
69 | MAX_FRAG_SIZE = 100000
70 | MIN_INSERT_SIZE = 100
71 | MAX_INSERT_SIZE = 1000
72 |
73 | #######################################################################
74 | ## Hi-C processing
75 | #######################################################################
76 |
77 | MIN_CIS_DIST =
78 | GET_ALL_INTERACTION_CLASSES = 1
79 | GET_PROCESS_SAM = 1
80 | RM_SINGLETON = 1
81 | RM_MULTI = 1
82 | RM_DUP = 1
83 |
84 | #######################################################################
85 | ## Contact Maps
86 | #######################################################################
87 |
88 | BIN_SIZE = 100000 500000 1000000 1500000
89 | MATRIX_FORMAT = upper
90 |
91 | #######################################################################
92 | ## ICE Normalization
93 | #######################################################################
94 | MAX_ITER = 100
95 | FILTER_LOW_COUNT_PERC = 0.02
96 | FILTER_HIGH_COUNT_PERC = 0
97 | EPS = 0.1
98 |
--------------------------------------------------------------------------------
/01_Contig_scaffolding/yahs.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Hi-C scaffolding with YaHS.
# Args: $1 = restriction enzyme, $2 = reference fasta,
#       $3 = alignment bam/bed, $4 = output prefix.
#
# Fixes: "profix" typo renamed to prefix, the variable is now actually
# used (was raw $4), and expansions are quoted against spaces.

enzyme=$1
ref=$2
bed=$3
prefix=$4

yahs -e "$enzyme" "$ref" "$bed" -o "$prefix"
--------------------------------------------------------------------------------
/02_Gap_patching/Centromeric_region_analysis.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# CENH3 ChIP-seq centromere detection: trim, map (HISAT2), filter both the
# treatment and the control libraries, then call enriched domains with epic2.
# Args: $1 = workdir, $2 = reference fasta, $3 = prefix, $4/$5 = ChIP R1/R2,
#       $6 = threads, $7/$8 = control R1/R2.
#
# Fix: the control section referenced the undefined variable $profix
# (typo for $prefix), which expanded empty and broke the HISAT2 index
# name and the intersect path.

workdir=$1
FASTA=$2
prefix=$3
CHIP1=$4
CHIP2=$5
threads=$6
CHIP1c=$7
CHIP2c=$8

#treatment
SAM="${workdir}/ref_chip.sam"
BAM="${workdir}/ref_chip.bam"
BAMSORT="${workdir}/ref_chip_sort.bam"
BAMFILTER="${workdir}/ref_chip_sort_filter.bam"
CHIP1TRIM="${workdir}/CHIP_1.trim.fastq"
CHIP2TRIM="${workdir}/CHIP_2.trim.fastq"
BINS="${workdir}/"$prefix".genome.size.100kb"

# 100 kb genome windows (written to the current directory).
faidx $FASTA -i chromsizes | bedtools makewindows -g - -w 100000 | awk -vFS="\t" -vOFS="\t" '{print $1,$2,$3}' | bedtools sort -i - > "$prefix".genome.size.100kb

ln -s ${FASTA} ./"$prefix".fasta
hisat2-build --large-index -a -p $threads ${workdir}/"$prefix".fasta ${workdir}/"$prefix"

# adapter
fastp --in1 ${CHIP1} --in2 ${CHIP2} --out1 ${CHIP1TRIM} --out2 ${CHIP2TRIM} --thread $threads

## RUN HISAT2
hisat2 -p $threads -x "$prefix" -1 ${CHIP1TRIM} -2 ${CHIP2TRIM} -S ${SAM}

## convert to BAM
samtools view -b -S -@ $threads -o ${BAM} ${SAM}

## sort
samtools sort -m 4G -@ $threads -o ${BAMSORT} ${BAM}

## filter (MAPQ >= 30), CSI index for large chromosomes
samtools view -@ $threads -q 30 -o ${BAMFILTER} ${BAMSORT}
samtools index -c -@ $threads ${BAMFILTER}

#control
SAMc="${workdir}/ref_control.sam"
BAMc="${workdir}/ref_control.bam"
BAMSORTc="${workdir}/ref_control_sort.bam"
BAMFILTERc="${workdir}/ref_control_sort_filter.bam"
CHIP1TRIMc="${workdir}/ref_control.trim.fastq"
CHIP2TRIMc="${workdir}/ref_control.trim.fastq"
INTERSECTc="${workdir}/"$prefix".intersect.bed"

# adapter
#fastp --in1 ${CHIP1c} --in2 ${CHIP2c} --out1 ${CHIP1TRIMc} --out2 ${CHIP2TRIMc} --thread 52

## RUN HISAT2
hisat2 -p $threads -x "$prefix" -1 ${CHIP1TRIMc} -2 ${CHIP2TRIMc} -S ${SAMc}

## convert to BAM
samtools view -b -S -@ $threads -o ${BAMc} ${SAMc}

## sort
samtools sort -m 4G -@ $threads -o ${BAMSORTc} ${BAMc}

## filter
samtools view -@ $threads -q 30 -o ${BAMFILTERc} ${BAMSORTc}
samtools index -c -@ $threads ${BAMFILTERc}

# NOTE(review): epic2's --chromsizes appears to be missing its file
# argument here (the next token is -o) — confirm the intended sizes file.
epic2 -t ${BAMFILTER} -c ${BAMFILTERc} --chromsizes -o "CENH3.bed" --bin-size 25000 --mapq 30 --gaps-allowed 4
--------------------------------------------------------------------------------
/02_Gap_patching/chip-seq.py:
--------------------------------------------------------------------------------
import os
import re
import sys

# Merge ChIP-seq enrichment windows into candidate (centromeric) regions.
# Input (sys.argv[1]): tab-separated bed-like file; column 4 is a numeric
# enrichment score.  Pass 1 computes the threshold f = 3 * mean(score);
# pass 2 merges consecutive above-threshold windows per sequence (gap
# <= 500 kb) and prints, for each sequence, its longest merged interval as
#   name <TAB> start <TAB> end <TAB> length
files = sys.argv[1]

a=0      # running sum of column-4 scores
c=0      # row count
diff=0   # length of the longest merged interval for the current sequence
with open(files, 'r') as file:
    for line in file:
        line1 = line.split("\n")
        line2 = line1[0].split("\t")
        a=a+int(line2[3])
        c=c+1
# Enrichment threshold: three times the mean score.
# NOTE(review): an empty input file raises ZeroDivisionError here.
f=3*(a/c)
d=0      # stays 0 until the first above-threshold window opens an interval
with open(files, 'r') as file_object:
    for line in file_object:
        line1 = line.split("\n")
        line2 = line1[0].split("\t")
        if int(line2[3]) > f:
            if d==0:
                # First qualifying window: open the first interval.
                name=line2[0]
                start=line2[1]
                end=line2[2]
                d=d+1
            else:
                if name==line2[0]:
                    if 500000 >= int(line2[1])-int(end):
                        # Same sequence, gap <= 500 kb: extend the interval.
                        end=line2[2]
                    else:
                        # Gap too large: close the interval, keep it if longest.
                        if diff < int(end)-int(start):
                            diff=int(end)-int(start)
                            maxs=start
                            maxe=end
                        #print(name+"\t"+start+"\t"+end)
                        start=line2[1]
                        end=line2[2]
                else:
                    # New sequence: report the previous sequence's longest interval.
                    if diff < int(end) - int(start):
                        diff = int(end) - int(start)
                        maxs = start
                        maxe = end
                    # NOTE(review): if no interval was recorded before the
                    # sequence change, maxs/maxe are unbound -> NameError.
                    print(name + "\t" + maxs + "\t" + maxe + "\t" + str(diff))
                    diff=0
                    name = line2[0]
                    start = line2[1]
                    end = line2[2]
# Flush the last open interval / last sequence.
if diff < int(end) - int(start):
    diff = int(end) - int(start)
    maxs = start
    maxe = end
print(name + "\t" + maxs + "\t" + maxe + "\t" + str(diff))
--------------------------------------------------------------------------------
/02_Gap_patching/paf_filter.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#author :SHOUCHENG LIU
#email :286596224@QQ.com
#last modified :

#-------------------------------------------------------------------------
#please consult DOCUMENTATION for detailed information...
#perl paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 500000 -iden 0.8
#
# Keep PAF records whose alignment reaches within -minlen of a target-contig
# end with identity above -iden, but only for queries that anchor MORE THAN
# ONE target contig (candidate gap-spanning queries).
use strict;
use diagnostics;
use warnings;
use Getopt::Long;

my %opts=();
GetOptions(\%opts,"i:s","minlen:s","iden:s");

my %h=();
open IN, $opts{i} or die "Cannot open file: $opts{i}!\n";
my $o=$opts{i}."_fiter.paf";   # NOTE(review): "fiter" typo kept for compatibility
open OUT,">$o" or die "Cannot create file: $o!\n";
#ragtag.patch.debug.filtered.paf
# Pass 1: for each query (f[0]) record every target contig (f[5]) it hits
# near a contig end — f[6] target length, f[7] target start, f[8] target end,
# f[9]/f[10] = matches / alignment length (identity).
# BUGFIX: the committed file had the <IN> readline stripped (`while(){`),
# which is not runnable Perl; the reads are restored here.
while(<IN>){
	chomp;
	my @f = split /\t/, $_;
	if ($f[8]>($f[6]-$opts{minlen}) or $f[7]<$opts{minlen}) {
		print $f[8]."\t".$f[9]/$f[10]."\n";   # progress/debug output
		if ($f[9]/$f[10]>$opts{iden}) {
			$h{$f[0]}{$f[5]}=1;
		}
	}
}
close IN;
# Pass 2: re-scan the PAF and emit only records whose query hit more than
# one target contig in pass 1.
open IN, $opts{i} or die "Cannot open file: $opts{i}!\n";
while(<IN>){
	chomp;
	my @f = split /\t/, $_;
	if ($f[8]>($f[6]-$opts{minlen}) or $f[7]<$opts{minlen}) {
		print $f[8]."\t".$f[9]/$f[10]."\n";
		if ($f[9]/$f[10]>$opts{iden}) {
			my $le=keys %{$h{$f[0]}};
			print $le."\n";
			if ($le>1) {
				print OUT $_."\n";
			}
		}
	}
}
close IN;
close OUT;
--------------------------------------------------------------------------------
/02_Gap_patching/renameagp.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#author :SHOUCHENG LIU
#email :286596224@QQ.com
#last modified :

#-------------------------------------------------------------------------
#please consult DOCUMENTATION for detailed information...
# Rewrite a RagTag patch AGP so that the gap between contigs -start and -end
# is replaced by the query sequence spanning it, using coordinates from the
# filtered PAF (-i1).
# perl /home/liusc/proj/wheat/code/agp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf -start seq00000000 -end seq00000001 -o test.agp
use strict;
use diagnostics;
use warnings;
use Getopt::Long;
my %opts=();
GetOptions(\%opts,"i:s","i1:s","o:s","start:s","end:s");

if (!$opts{i} or !$opts{o}){
print "----------------------------------------------------------------------
USAGE: perl $0
-i input file
-o out file
----------------------------------------------------------------------\n";
exit;

}
my %h=();
open IN, $opts{i} or die "Cannot open file: $opts{i}!\n";
open IN1, $opts{i1} or die "Cannot open file: $opts{i1}!\n";
open OUT,">$opts{o}" or die "Cannot create file: $opts{o}!\n";
# Scan the AGP for the row of the -start component; cache it, the following
# gap row ("scaffold"), and the -end component row, keyed by component name
# (AGP column 6).
# BUGFIX: the committed file had the <IN> readline operators stripped
# (`while(){`, `split /\t/, ;`), which is not runnable Perl; restored here.
# The two look-ahead rows are split without chomp, exactly as the original
# did, so the last field of those rows keeps its trailing newline.
# NOTE(review): $opts{start} is used as a regex — a contig name that is a
# prefix of another (or contains metacharacters) could match the wrong row.
L:while(<IN>){
#chr8 1 1000000 1 W seq00000000 1 1000000 +
#chr8 1000001 1000100 2 N 100 scaffold yes align_genus
#chr8 1000101 4999101 3 W seq00000001 1 3999001 +
#scf0 1000001 1000999 2 W qseq00000000 10002 11000 +
	chomp;
	if ($_=~ $opts{start}) {
		my @f = split /\t/, $_;

		$h{$f[5]}=[@f];
		@f = split /\t/, <IN>;   # the gap row following -start
		$h{"scaffold"}=[@f];
		@f = split /\t/, <IN>;   # the -end component row
		$h{$f[5]}=[@f];
		last L;
	}
}
my $start;
my $end;
my $len2;
# Walk the PAF and pull the query coordinates flanking the gap from the
# alignments against -start and -end, for either query orientation.
while(<IN1>){
#qseq00000269 12474565 2150891 12471563 + seq00000001 79845467 259191 10579680 9480999 10320756 60
#qseq00000269 12474565 75 2194153 + seq00000000 673241388 671105857 673241383 2029712 2194088 60
	chomp;
	my @f = split /\t/, $_;

	if ($f[4] eq "+") {
		if ($f[5] eq $opts{start}) {
			# shift the -start component end to where the alignment stops
			my $offset=$f[6]-$f[8];
			$h{$f[5]}[2]-=$offset;
			$h{$f[5]}[7]-=$offset;

			$h{"scaffold"}[1]-=$offset;
			$start=$f[3];
			$h{"scaffold"}[1]=$h{$opts{start}}[2]+1;
			$h{"scaffold"}[6]=$f[3]+1;
		}elsif ($f[5] eq $opts{end}){
			$end=$f[2];
			# the gap row becomes a W (sequence) row filled by the query
			$h{"scaffold"}[4]="W";
			$h{"scaffold"}[5]=$f[0];

			$h{"scaffold"}[8]="+";
			$h{$opts{end}}[6]=$f[7]+1;
		}

	}elsif($f[4] eq "-") {
		if ($f[5] eq $opts{start}) {
			my $offset=$f[6]-$f[8];
			$h{$f[5]}[2]-=$offset;
			$h{$f[5]}[7]-=$offset;
			$h{"scaffold"}[1]-=$offset;
			$end=$f[2];
			$h{"scaffold"}[1]=$h{$opts{start}}[2]+1;
			$h{"scaffold"}[7]=$f[2];


		}elsif ($f[5] eq $opts{end}){
			$start=$f[3];
			$h{"scaffold"}[4]="W";
			$h{"scaffold"}[5]=$f[0];

			$h{"scaffold"}[8]="-";
			$h{$f[5]}[6]=$f[7]+1;
			$h{"scaffold"}[6]=$f[3]+1;
		}
	}
}
# Size of the patched gap in query coordinates.
my $gapsize=$end-$start;
print $gapsize."\n";
$h{"scaffold"}[2]=$h{"scaffold"}[1]+$gapsize-1;
$h{"scaffold"}[7]=$h{"scaffold"}[6]+$gapsize-1;
if ($h{"scaffold"}[2]>$h{$opts{start}}[2]) {
	# patch sequence extends past -start: emit start row, filled gap, end row
	$h{$opts{end}}[1]=$h{"scaffold"}[2]+1;
	$h{$opts{end}}[2]=$h{$opts{end}}[1]+($h{$opts{end}}[7]-$h{$opts{end}}[6]);
	my $string = join "\t", @{$h{$opts{start}}};
	print OUT "## agp-version 2.1\n";
	print OUT "# AGP created by RagTag v2.1.0\n";
	print OUT $string."\n";
	$string = join "\t", @{$h{"scaffold"}};
	print OUT $string."\n";
	$string = join "\t", @{$h{$opts{end}}};
	print OUT $string;
}else{
	# contigs now abut: drop the gap row and renumber
	$h{$opts{end}}[1]=$h{$opts{start}}[2]+1;
	$h{$opts{end}}[2]=$h{$opts{end}}[1]+($h{$opts{end}}[7]-$h{$opts{end}}[6]);
	$h{$opts{end}}[3]--;
	my $string = join "\t", @{$h{$opts{start}}};
	print OUT "## agp-version 2.1\n";
	print OUT "# AGP created by RagTag v2.1.0\n";
	print OUT $string."\n";
	$string = join "\t", @{$h{$opts{end}}};
	print OUT $string;
}
close IN;
close IN1;
close OUT;
--------------------------------------------------------------------------------
/02_Gap_patching/wfmash_ragtag.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Gap patching for one region: align query to ref with wfmash, feed the
# alignment to `ragtag.py patch`, then screen the patched assembly for
# telomeres.
# Usage: wfmash_ragtag.sh <query.fasta> <ref.fasta> <region-name>

query=$1
ref=$2
region=$3
mkdir "$region"    # NOTE(review): fails if the directory already exists
cd "$region"
# approximate whole-genome mapping of query against ref
wfmash "$ref" "$query" > "$region".paf
mkdir ragtag_output
cd ragtag_output
# pre-seed RagTag's working dir with the wfmash alignment — presumably so
# RagTag reuses it instead of re-aligning; confirm against RagTag docs
ln -s ../"$region".paf ragtag.patch.asm.paf
cd ..
ln -s "$ref" ref.fasta
ln -s "$query" query.fasta

ragtag.py patch -i 0.99 --remove-small -q 10 --debug -u --aligner minimap2 -t 128 --mm2-params "-x asm20 -I1G -t 128" ref.fasta query.fasta
_submit_telomere.sh ragtag_output/ragtag.patch.fasta #(https://github.com/VGP/vgp-assembly)
--------------------------------------------------------------------------------
/03_Polishing/bwa_winnowmap.py:
--------------------------------------------------------------------------------
import os
import re

# Read-set discovery for the alignment workflow: map file basename (text
# before ".fastq") -> absolute path, one dict per data source.
# NOTE(review): `b` and `hifi_single` are never populated and look like
# unused leftovers; the {hifi_single} wildcard below is driven by `d`.
b={}
hifi_single={}
hifi_mix={}
e={}
d={}
# `config` is injected by Snakemake from --configfile (conf_ck_align.yaml)
W=config["WORKDIR"]
DIR=config["DIR"]          # HiFi hybrid read directory
DIRs=config["DIRs"]        # HiFi single read directory prefix (suffixed 1..3)
DIRont=config["DIRont"]    # ONT read directory




# HiFi "mix" reads: one entry per *.fastq file found in DIR
for dirs in os.listdir(DIR):
    b2 = dirs.split(".fastq")
    if ".fastq" in dirs:
        absPath = os.path.join(DIR, dirs)
        hifi_mix[b2[0]]=absPath
    elif ".bam" in dirs:
        a = 0  # placeholder: .bam entries are deliberately ignored

# HiFi "single" reads are spread over three numbered directories DIRs1..DIRs3
for x in range(1, 4):
    x=str(x)
    for dirs in os.listdir(DIRs+x):
        b2 = dirs.split(".fastq")
        if ".fastq" in dirs:
            absPath = os.path.join(DIRs+x, dirs)
            d[b2[0]]=absPath
        elif ".bam" in dirs:
            a = 0  # placeholder: ignored

# ONT reads
for dirs in os.listdir(DIRont):
    b2 = dirs.split(".fastq")
    if ".fastq" in dirs:
        absPath = os.path.join(DIRont, dirs)
        e[b2[0]]=absPath
    elif ".bam" in dirs:
        a = 0  # placeholder: ignored
42 |
# Terminal targets: merged HiFi+PCR-free BAMs and the merged ONT BAM.
rule final:
    input:
        W+"single_hifi_pcr/hybrid.bam",
        W+"hybrid_hifi_pcr/hybrid.bam",
        W + "ont_merge/q10l120k.bam"



# Map HiFi "mix" reads with winnowmap using the k=27 repetitive-kmer list.
rule minimap_cu_mix:
    input:
        fq=W+"hifi_mix_reads/{hifi_mix}_q40l15k.fastq",
        ref="CS_ISSA.fasta",
        txt="repetitive_k27.txt"
    output:
        W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    benchmark:
        W+"benchmarks/hifi_mix_winnowmap/{hifi_mix}.benchmark.txt"
    shell:
        # meryl count k=27 output merylDB CS_ISSA.fasta
        #meryl print greater-than distinct=0.9998 merylDB > repetitive_k27.txt
        "winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

# Same winnowmap mapping for HiFi "single" reads.
rule minimap_cu_single:
    input:
        fq=W+"hifi_single_reads/{hifi_single}_q40l15k.fastq",
        ref = "CS_ISSA.fasta",
        txt = "repetitive_k27.txt"
    output:
        W+"hifi_single_winnowmap/{hifi_single}_q40l15k.sam"
    benchmark:
        W+"benchmarks/hifi_single_winnowmap/{hifi_single}.benchmark.txt"
    shell:
        # meryl count k=27 output merylDB CS_ISSA.fasta
        #meryl print greater-than distinct=0.9998 merylDB > repetitive_k27.txt
        "winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

# SAM -> coordinate-sorted BAM (params = reference .fai for `view -bt`).
rule hifi_mix_sort:
    input:
        W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    output:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    params:
        "CS_ISSA.fasta.fai"
    benchmark:
        W + "benchmarks/hifi_mix_sort/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# Drop unmapped and secondary alignments (-F 0x104).
rule filter:
    input:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    output:
        W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam"
    benchmark:
        W + "benchmarks/hifi_mix_sort_filter/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -F0x104 -hb {input} > {output}"

# Merge all filtered HiFi-mix BAMs into one.
rule filter_merge:
    input:
        expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix)
    output:
        W+"hybrid/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input}"


#"/home/liusc/lxp/software/bwa-mem2/bwa-mem2.avx512bw index {input}"

# PCR-free Illumina reads (hybrid set) mapped with bwa-mem2; requires a
# prior bwa-mem2 index of the reference (commented command above).
rule pcr_free_hybrid:
    input:
        fa="CS_ISSA.fasta",
        r1="hybrid_1.clean.fq.gz",
        r2="hybrid_2.clean.fq.gz"
    output:
        W+"hybrid_hifi_pcr/pcr.bam"
    shell:
        "bwa-mem2.avx512bw mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 30G -o {output} -"

# PCR-free Illumina reads (single set).
rule pcr_free_single:
    input:
        fa="CS_ISSA.fasta",
        r1="single_1.clean.fq.gz",
        r2="single_2.clean.fq.gz"
    output:
        W+"single_hifi_pcr/pcr.bam"
    shell:
        "bwa-mem2.avx512bw mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 30G -o {output} -"

# Combine HiFi-mix BAMs with the PCR-free BAM (hybrid evidence set).
rule filter_merge_hybrid:
    input:
        hifi=expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix),
        pcr=W+"hybrid_hifi_pcr/pcr.bam"
    output:
        W+"hybrid_hifi_pcr/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid_pcr/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}"

# SAM -> sorted BAM for HiFi-single reads.
# NOTE(review): params references m219.fasta.fai while every other sort rule
# uses CS_ISSA.fasta.fai — looks like a stale path; confirm.
rule hifi_single_sort:
    input:
        W+"hifi_single_winnowmap/{hifi_single}_q40l15k.sam"
    output:
        W+"hifi_single_sort/{hifi_single}_q40l15k.bam"
    params:
        "/data/liusc/lixp/wheat/result/ref/m219/m219.fasta.fai"
    benchmark:
        W + "benchmarks/hifi_single_sort/{hifi_single}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# Drop unmapped/secondary alignments for HiFi-single reads.
rule filter_single:
    input:
        W+"hifi_single_sort/{hifi_single}_q40l15k.bam"
    output:
        W+"hifi_single_sort_filter/{hifi_single}_q40l15k.bam"
    benchmark:
        W + "benchmarks/hifi_single_sort_filter/{hifi_single}.benchmark.txt"
    shell:
        "samtools view -@ 32 -F0x104 -hb {input} > {output}"

# Merge all filtered HiFi-single BAMs.
rule filter_merge_single:
    input:
        expand(W+"hifi_single_sort_filter/{hifi_single}_q40l15k.bam",hifi_single=d)
    output:
        W+"single/single.bam"
    benchmark:
        W + "benchmarks/single/single.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input}"

# Combine HiFi-single BAMs with the PCR-free BAM.
rule filter_merge_single_pcr:
    input:
        hifi=expand(W+"hifi_single_sort_filter/{hifi_single}_q40l15k.bam",hifi_single=d),
        pcr=W+"single_hifi_pcr/pcr.bam"
    output:
        W+"single_hifi_pcr/hybrid.bam"
    benchmark:
        W + "benchmarks/single_pcr/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}"


# Map ONT ultralong reads (map-ont preset).
rule minimap_cu_ont:
    input:
        fq=W+"ont/reads/{e}_q10l120k.fastq",
        ref="CS_ISSA.fasta",
        txt="repetitive_k27.txt"
    output:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    benchmark:
        W+"benchmarks/ont_winnowmap/{e}.benchmark.txt"
    shell:
        "winnowmap --MD -W {input.txt} -ax map-ont -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

# SAM -> sorted BAM for ONT reads.
rule sort:
    input:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    output:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    params:
        "CS_ISSA.fasta.fai"
    benchmark:
        W + "benchmarks/ont_sort/{e}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# Drop unmapped/secondary alignments for ONT reads.
rule filter_ont:
    input:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    output:
        W+"ont_filter/{e}_q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_filter/{e}.benchmark.txt"
    shell:
        "samtools view -@ 128 -F0x104 -hb {input} > {output}"

# Merge all filtered ONT BAMs.
# NOTE(review): uses `merge -o`, which requires a samtools release that
# supports it, unlike the positional form used by the other merge rules.
rule merge:
    input:
        expand(W+"ont_filter/{e}_q10l120k.bam",e=e)
    output:
        W + "ont_merge/q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_merge/benchmark.txt"
    shell:
        "samtools merge -@ 128 -o {output} {input}"

# snakemake -s bwa_winnowmap.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs 40 --rerun-incomplete --restart-times 1 -np
--------------------------------------------------------------------------------
/03_Polishing/callsv_snv.py:
--------------------------------------------------------------------------------
import os
import re

# Variant-calling workflow: SV calls (sniffles + cuteSV, merged by jasmine)
# and small variants (DeepVariant/PEPPER, merged and merfin-validated) are
# combined into a polished consensus fasta.
# NOTE(review): b/d/e are never populated in this file — unused leftovers.
b={}
d={}
e={}
# `config` is injected by Snakemake from --configfile (conf_ck.yaml)
W=config["WORKDIR"]

# Terminal target: the merfin-validated SV+SNV consensus assembly.
rule final:
    input:
        W + "output/SV_SNV/merfin_sv_snv_consensus.fasta"
12 |
# Sniffles SV calls on the HiFi-hybrid alignment.
rule sv_sniffles_hybrid:
    input:
        W+"hybrid/hybrid.bam"
    output:
        W + "output/SV/sniffles_hybrid.vcf"
    shell:
        "sniffles --threads 128 --input {input} --vcf {output}"

# Sniffles SV calls on the HiFi-single alignment.
rule sv_sniffles_single:
    input:
        W+"single/single.bam"
    output:
        W + "output/SV/sniffles_single.vcf"
    shell:
        "sniffles --threads 128 --input {input} --vcf {output}"

# Sniffles SV calls on the merged ONT alignment.
rule sv_sniffles_ont:
    input:
        W + "ont_merge/q10l120k.bam"
    output:
        W + "output/SV/sniffles_ont.vcf"
    shell:
        "sniffles --threads 128 --input {input} --vcf {output}"

#################call SV
# cuteSV with default clustering parameters ({params} is cuteSV's work dir;
# NOTE(review): cuteSV expects that directory to exist — confirm it is
# created beforehand).
rule sv_cutesv_hybrid:
    input:
        bam=W+"hybrid/hybrid.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_hybrid.vcf"
    params:
        W + "output/SV/cutesv_hybrid"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --genotype"

# cuteSV with the author-suggested PacBio CCS clustering parameters.
rule sv_cutesv_hybrid_suggest:
    input:
        bam=W+"hybrid/hybrid.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_hybrid_suggest.vcf"
    params:
        W + "output/SV/cutesv_hybrid_suggest"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 --genotype"

# cuteSV (defaults) on the HiFi-single alignment.
rule sv_cutesv_single:
    input:
        bam=W+"single/single.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_single.vcf"
    params:
        W + "output/SV/cutesv_single"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --genotype"

# cuteSV (CCS-suggested parameters) on the HiFi-single alignment.
rule sv_cutesv_single_suggest:
    input:
        bam=W+"single/single.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_single_suggest.vcf"
    params:
        W + "output/SV/cutesv_single_suggest"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 --genotype"

# cuteSV (defaults) on the ONT alignment.
rule sv_cutesv_ont:
    input:
        bam=W + "ont_merge/q10l120k.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_ont.vcf"
    params:
        W + "output/SV/cutesv_ont"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --genotype"

# cuteSV with the author-suggested ONT clustering parameters.
rule sv_cutesv_ont_suggest:
    input:
        bam=W + "ont_merge/q10l120k.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_ont_suggest.vcf"
    params:
        W + "output/SV/cutesv_ont_suggest"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --max_cluster_bias_INS 1000 --diff_ratio_merging_INS 0.9 --max_cluster_bias_DEL 1000 --diff_ratio_merging_DEL 0.5 --genotype"
103 |
104 | rule hifi_ls:
105 | input:
106 | hifi1=W + "output/SV/sniffles_hybrid.vcf",
107 | hifi2=W + "output/SV/cutesv_hybrid.vcf",
108 | hifi3=W + "output/SV/cutesv_hybrid_suggest.vcf",
109 | hifi4=W + "output/SV/sniffles_single.vcf",
110 | hifi5=W + "output/SV/cutesv_single.vcf",
111 | hifi6=W + "output/SV/cutesv_single_suggest.vcf"
112 | output:
113 | W + "output/SV/hifi_ls.txt"
114 | shell:
115 | "ls {input.hifi1} {input.hifi2} > {output}"
116 |
117 | rule ont_ls:
118 | input:
119 | ont1=W + "output/SV/{chr}/{chr}_q10l120k_cutesv_ont.vcf",
120 | ont2=W + "output/SV/{chr}/{chr}_q10l120k_cutesv_ont_suggest.vcf",
121 | ont3=W + "output/SV/sniffles_ont.vcf"
122 | output:
123 | W + "output/SV/ont_ls.txt"
124 | shell:
125 | "ls {input.ont1} {input.ont2} > {output}"
126 |
127 | rule jasmine_hifi:
128 | input:
129 | W + "output/SV/hifi_ls.txt"
130 | output:
131 | W + "output/SV/jasmine_hifi.vcf"
132 | shell:
133 | "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=128 min_support=1 --output_genotypes file_list={input} out_file={output}"
134 |
135 | rule jasmine_ont:
136 | input:
137 | W + "output/SV/ont_ls.txt"
138 | output:
139 | W + "output/SV/jasmine_ont.vcf"
140 | shell:
141 | "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=128 min_support=1 --output_genotypes file_list={input} out_file={output}"
142 |
143 | rule ls_hifi_ont:
144 | input:
145 | hifi=W + "output/SV/jasmine_hifi.vcf",
146 | ont=W + "output/SV/jasmine_ont.vcf"
147 | output:
148 | W + "output/hifi_ont_ls.txt"
149 | shell:
150 | "ls {input.hifi} {input.ont} > {output}"
151 |
152 | rule jasmine_hifi_ont:
153 | input:
154 | W + "output/SV/hifi_ont_ls.txt"
155 | output:
156 | W + "output/SV/jasmine_hifi_ont.vcf"
157 | shell:
158 | "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=96 min_support=2 --output_genotypes file_list={input} out_file={output}"
159 |
############################call SNV

# Filter PEPPER ONT calls: keep PASS with GQ>25 and DP>10.
# NOTE: the adjacent string literals ""PASS"" concatenate in Python, so the
# shell sees `bcftools view -f PASS ...` — intentional, if unusual.
rule vcf_filter_ont:
    input:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.vcf.gz"
    output:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz"
    shell:
        "bcftools view -f ""PASS"" -e 'FORMAT/GQ<=25 | FORMAT/DP<=10' -Oz {input} > {output}"

# CSI index for the filtered ONT VCF (written next to the input).
rule vcf_filter_ont_index:
    input:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz"
    output:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz.csi"
    shell:
        "bcftools index -c {input}"

# Same PASS/GQ/DP filter for the DeepVariant HiFi calls.
rule vcf_filter_hifi:
    input:
        W + "output/hybrid.vcf"
    output:
        W + "output/hybrid.PASS.gq25.gt10.vcf.gz"
    shell:
        "bcftools view -f ""PASS"" -e 'FORMAT/GQ<=25 | FORMAT/DP<=10' -Oz {input} > {output}"

# CSI index for the filtered HiFi VCF.
rule vcf_filter_hifi_index:
    input:
        W + "output/hybrid.PASS.gq25.gt10.vcf.gz"
    output:
        W + "output/hybrid.PASS.gq25.gt10.vcf.gz.csi"
    shell:
        "bcftools index -c {input}"

# Compare HiFi vs ONT call sets with hap.py (Illumina happy).
rule hap:
    input:
        hifi=W + "output/hybrid.PASS.gq25.gt10.vcf.gz",
        hifi_csi=W + "output/hybrid.PASS.gq25.gt10.vcf.gz.csi",
        ont=W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz",
        ont_csi=W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz.csi",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SNV/HAPPY.vcf.gz",
    params:
        W + "output/SNV/HAPPY"
    shell:
        "python hap.py {input.hifi} {input.ont} -r {input.ref} -o {params} --pass-only --threads 128"

# Merge the HiFi and ONT small-variant sets using the hap.py concordance
# (T2T vcf_merge_t2t.py helper script).
rule vcf_merge_t2t:
    input:
        hifi=W + "output/hybrid.PASS.gq25.gt10.vcf.gz",
        ont=W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz",
        hap=W + "output/SNV/HAPPY.vcf.gz",
    output:
        W + "output/SNV/MERGED_SMALL_VARIANTS.vcf.gz"
    shell:
        "python3 vcf_merge_t2t.py -v1 {input.hifi} -v2 {input.ont} -hv {input.hap} -o {output}"

# Decompress the merged small-variant VCF for merfin.
rule gunzip:
    input:
        W + "output/SNV/MERGED_SMALL_VARIANTS.vcf.gz"
    output:
        W + "output/SNV/MERGED_SMALL_VARIANTS.vcf"
    shell:
        "gunzip -d -c {input} > {output}"
225 |
226 | rule meryl_count:
227 | input:
228 | "CS_ISSA.fasta"
229 | output:
230 | directory(W+"meryl/merylDB_k21"),
231 | params:
232 | k="21",
233 | dir=W+"meryl/merylDB{chr}_k21"
234 | threads: 128
235 | shell:
236 | "/home/liusc/software/meryl-1.4/bin/meryl count k={params.k} threads={threads} {input} output {params.dir}"
237 |
# merfin-validate the merged small variants against read k-mers.
# NOTE(review): readmers db, -peak 88.3 and the lookup table are hard-coded
# for this sample's k-mer histogram; merfin writes {params}.filter.vcf —
# confirm against the merfin docs.
rule merfin_snv:
    input:
        ref="CS_ISSA.fasta",
        seqmers=W+"meryl/merylDB_k21",
        vcf=W + "output/SNV/MERGED_SMALL_VARIANTS.vcf"
    output:
        W + "output/SNV/merfin_snv.filter.vcf"
    params:
        W + "output/SNV/merfin_snv"
    shell:
        "merfin -strict -threads 128 -sequence {input.ref} -seqmers {input.seqmers} -readmers single.hifi40_cspcrfree.k21.gt1.meryl -peak 88.3 -prob lookup_table.txt -vcf {input.vcf} -output {params}"

# merfin-validate the merged SV set.
rule merfin_sv:
    input:
        ref="CS_ISSA.fasta",
        seqmers=W+"meryl/merylDB_k21",
        vcf=W + "output/SV/jasmine_hifi_ont.vcf"
    output:
        W + "output/SV/merfin_sv.filter.vcf"
    params:
        W + "output/SV/merfin_sv"
    shell:
        "merfin -strict -threads 128 -sequence {input.ref} -seqmers {input.seqmers} -readmers single.hifi40_cspcrfree.k21.gt1.meryl -peak 88.3 -prob single.hifi40_cspcrfree.k21/lookup_table.txt -vcf {input.vcf} -output {params}"

# Keep only the first 10 VCF columns (single-sample) of the SV VCF.
rule cut:
    input:
        W + "output/SV/merfin_sv.filter.vcf"
    output:
        W + "output/SV/merfin_sv10.filter.vcf"
    shell:
        "cut -f 1-10 {input} > {output}"

# File list for the combined SV+SNV jasmine merge.
rule ls_sv_snv:
    input:
        snv=W + "output/SNV/merfin_snv.filter.vcf",
        sv=W + "output/SV/merfin_sv10.filter.vcf"
    output:
        W + "output/SV_SNV/lst.txt"
    shell:
        "ls {input.snv} {input.sv} > {output}"
# Merge validated SVs and SNVs into one VCF.
rule jasmine_merge:
    input:
        W + "output/SV_SNV/lst.txt"
    output:
        W + "output/SV_SNV/SV_SNV.vcf"
    shell:
        "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=96 --output_genotypes file_list={input} out_file={output}"
# Final merfin pass over the combined SV+SNV set.
rule merfin_sv_snv:
    input:
        ref="CS_ISSA.fasta",
        seqmers=W+"meryl/merylDB_k21",
        vcf=W + "output/SV_SNV/SV_SNV.vcf"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf"
    params:
        W + "output/SV_SNV/merfin_sv_snv"
    shell:
        "merfin -strict -threads 128 -sequence {input.ref} -seqmers {input.seqmers} -readmers single.hifi40_cspcrfree.k21.gt1.meryl -peak 88.3 -prob single.hifi40_cspcrfree.k21/lookup_table.txt -vcf {input.vcf} -output {params}"

# Keep only the first 10 columns of the final VCF.
rule cut_merfin_sv_snv:
    input:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter10.vcf"
    shell:
        "cut -f 1-10 {input} > {output}"

# bgzip-compress (bcftools -Oz) for consensus calling.
rule view_merfin_sv_snv:
    input:
        W + "output/SV_SNV/merfin_sv_snv.filter10.vcf"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz"
    shell:
        "bcftools view -Oz {input} > {output}"

# CSI index for the compressed VCF.
rule view_merfin_sv_snv_index:
    input:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz.csi"
    shell:
        "bcftools index -c {input}"

# Apply the validated variants to the assembly (haplotype 1) -> polished
# consensus fasta.
rule fasta:
    input:
        vcf=W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz",
        index=W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz.csi",
        ref="CS_ISSA.fasta",
    output:
        W + "output/SV_SNV/merfin_sv_snv_consensus.fasta"
    shell:
        "bcftools consensus -f {input.ref} -H 1 {input.vcf} > {output}"


#snakemake -s callsv_snv.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs 128 --rerun-incomplete --restart-times 1 -np
--------------------------------------------------------------------------------
/03_Polishing/calsv_snv.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Polishing pipeline driver.
# Usage: calsv_snv.sh <workdir> <reference> <threads>
#   1. align reads (snakemake bwa_winnowmap.py)
#   2. call small variants: DeepVariant (HiFi hybrid/single) and
#      PEPPER-Margin-DeepVariant (ONT)
#   3. filter calls and build the polished consensus (snakemake callsv_snv.py)

snakemake -s bwa_winnowmap.py --cluster-config clust_align.json --configfile conf_ck_align.yaml --cluster '{cluster.account}' --jobs 40 --rerun-incomplete --restart-times 1
workdir=$1
input="$workdir"/input
output="$workdir"/output
mkdir -p $input
mkdir -p $output
mkdir -p "$output"/intermediate_results_dir/work
mkdir -p "$output"/intermediate_results_dir/temp
mkdir -p "$output"/pepper_deepvariant_output
ref=$2

# canonical basename: /path/asm.fa -> asm
a=${ref%.*}
b=${a##*/}
# BUGFIX: the original ran `$(basename $ref .fa)` as a bare command (executing
# the basename itself) and copied the reference under its original file name,
# while $ref below expects "$b".fasta; copy straight to the expected name.
cp $ref "$input"/"$b".fasta
threads=$3
ref="$input"/"$b".fasta
mv ont_merge/q10l120k.bam $input
mv ont_merge/q10l120k.bam.csi $input
mv single_hifi_pcr/hybrid.bam $input
mv single_hifi_pcr/hybrid.bam.csi $input
mv hybrid_hifi_pcr/hybrid.bam $input
mv hybrid_hifi_pcr/hybrid.bam.csi $input

####dv
# NOTE(review): --workdir points into intermediate_results_dir_hybrid/temp,
# which is not pre-created above (only intermediate_results_dir/temp is) —
# confirm singularity creates it or add a mkdir.

singularity run --cpus $threads --nv -B /home:/home -B /data:/data -B "$input":"$input" -B "$output":"$output" --workdir "$output"/intermediate_results_dir_hybrid/temp google_deepvariant_latest-gpu.sif /opt/deepvariant/bin/run_deepvariant --model_type "HYBRID_PACBIO_ILLUMINA" --ref "$ref" --reads "$input"/hybrid.bam --output_vcf "$output"/hybrid.vcf --num_shards $threads --intermediate_results_dir "$output"/intermediate_results_dir_hybrid

singularity run --cpus $threads --nv -B /home:/home -B /data:/data -B "$input":"$input" -B "$output":"$output" --workdir "$output"/intermediate_results_dir_single/temp google_deepvariant_latest-gpu.sif /opt/deepvariant/bin/run_deepvariant --model_type "HYBRID_PACBIO_ILLUMINA" --ref "$ref" --reads "$input"/single.bam --output_vcf "$output"/single.vcf --num_shards $threads --intermediate_results_dir "$output"/intermediate_results_dir_single

####pepper dv

docker run --ipc=host --gpus all -v /home:/home -v /data:/data -v "$input":"$input" -v "$output":"$output" kishwars/pepper_deepvariant:r0.8-gpu run_pepper_margin_deepvariant call_variant -b "$input"/q10l120k.bam -f $ref -o "$output"/pepper_deepvariant_output -g -p pep_dv_ont -t $threads --ont_r9_guppy5_sup

####filter & consensus
snakemake -s callsv_snv.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs 128 --rerun-incomplete --restart-times 1
--------------------------------------------------------------------------------
/03_Polishing/clust.json:
--------------------------------------------------------------------------------
1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}'
2 | {
3 | "__default__" :
4 | {
5 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcum256c128Partition",
6 | "jobs" : "59"
}
}
9 | #snakemake -j 999 --cluster-config cluster.json --cluster "{cluster.account} -p {cluster.partition} -n {cluster.n} -t {cluster.time}"
10 | #--cluster "sbatch -A {cluster.account} -q {cluster.queue} -l select={cluster.nodes}:ncpus{cluster.ppn}:mem={cluster.mem} -l walltime={cluster.time}"
11 | #nohup snakemake -s sum.py --cluster-config clust.json --use-conda --cluster '{cluster.account}' --jobs 16 --restart-times 5 --conda-prefix /lustre1/deng_pkuhpc/deng_test/SF/min3/envs/map&
12 |
--------------------------------------------------------------------------------
/03_Polishing/clust_align.json:
--------------------------------------------------------------------------------
1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}'
2 | {
3 | "__default__" :
4 | {
5 | "account" : "sbatch -N 1 -n 1 -c 32 -p tcum256c128Partition",
6 | "jobs" : "59"
7 | },
8 | "filter_merge" :
9 | {
10 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
11 | "jobs" : "59"
12 | },
13 | "merge" :
14 | {
15 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
16 | "jobs" : "59"
17 | },
18 | "filter_merge_flagstat" :
19 | {
20 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
21 | "jobs" : "59"
22 | },
23 | "filter_merge_single" :
24 | {
25 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
26 | "jobs" : "59"
27 | },
28 | "merge_flagstat" :
29 | {
30 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
31 | "jobs" : "59"
32 | },
33 | "filter_merge_single_flagstat" :
34 | {
35 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
36 | "jobs" : "59"
37 | },
38 | "merge_stat" :
39 | {
40 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
41 | "jobs" : "59"
42 | },
43 | "filter_merge_single_pcr" :
44 | {
45 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
46 | "jobs" : "59"
47 | },
48 | "filter_merge_hybrid" :
49 | {
50 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition",
51 | "jobs" : "59"
52 | },
53 | "pcr_free_single" :
54 | {
55 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition",
56 | "jobs" : "59"
57 | },
58 | "pcr_free_hybrid" :
59 | {
60 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition",
61 | "jobs" : "59"
}
}
64 | #snakemake -j 999 --cluster-config cluster.json --cluster "{cluster.account} -p {cluster.partition} -n {cluster.n} -t {cluster.time}"
65 | #--cluster "sbatch -A {cluster.account} -q {cluster.queue} -l select={cluster.nodes}:ncpus{cluster.ppn}:mem={cluster.mem} -l walltime={cluster.time}"
66 | #nohup snakemake -s sum.py --cluster-config clust.json --use-conda --cluster '{cluster.account}' --jobs 16 --restart-times 5 --conda-prefix /lustre1/deng_pkuhpc/deng_test/SF/min3/envs/map&
67 |
--------------------------------------------------------------------------------
/03_Polishing/conf_ck.yaml:
--------------------------------------------------------------------------------
1 |
2 | INDEX:
3 | /lustre1/deng_pkuhpc/deng_test/rf/97103_genome_v2/97103_genome_v2_b2
4 | DIR:
5 | /home/liusc/proj/wheat/rawdata/ont/splitall
6 | WORKDIR:
7 | /data/liusc/lixp/wheat/result/t2tpolish/dv/
8 | REF:
9 | /home/liusc/lxp/xiaomai/iwgsc/IWGSC_RefSeq_Assembliesv2.1/iwgsc_refseqv2.1_assembly.fa
10 | SNP:
11 | /lustre1/deng_pkuhpc/deng_test/projects/watermelon/pub/cucurbit/reseq/watermelon/v2/1_SNP.vcf
12 | DICT:
13 | /lustre1/deng_pkuhpc/deng_test/rf/97103_genome_v2_chr.fa.dict
14 | TEMP:
15 | /gpfs1/deng_pkuhpc/deng_test/watermelon/mk/temp
16 | intervals:
17 | /lustre1/deng_pkuhpc/deng_test/projects/watermelon/snakemake/watermelon.list
18 | snpeff:
19 | /lustre1/deng_pkuhpc/deng_test/SF/conda/share/snpeff-5.0-1/data/97103_genome_v2
20 |
21 |
--------------------------------------------------------------------------------
/03_Polishing/conf_ck_align.yaml:
--------------------------------------------------------------------------------
1 | DIR:
2 | HiFi hybrid path
3 | DIRs:
4 | HiFi single path
5 | DIRont:
6 | ONT path
7 | WORKDIR:
8 | WORK path
9 |
--------------------------------------------------------------------------------
/04_Evaluation/BUSCO.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Assess assembly completeness with BUSCO against the Poales lineage set.
# Usage: BUSCO.sh <genome.fasta> <output_prefix>

geno=$1
profix=$2

# FIX: quote the expansions so genome paths containing spaces survive
# word splitting; -m geno = genome mode, 52 threads.
busco -m geno -i "$geno" -l poales_odb10 -o "$profix" -c 52
--------------------------------------------------------------------------------
/04_Evaluation/bac.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# BLAST one BAC sequence against a per-chromosome nucleotide database and
# keep only the top hit; see the bottom comment for the submission loop.
# Usage: bac.sh <bac.fasta> <chromosome>

item=$1
chr=$2

# Tabular report: query/subject coordinates, identity, length, coverage.
blastn -query $item -db ./"$chr"index -evalue 1e-6 \
    -outfmt "6 qseqid qlen sseqid qstart qend sstart send pident length nident qcovs" \
    -num_threads 128 -out "$1".bed
# The first report line is the best-scoring hit for this BAC.
sed -n '1p' "$1".bed > "$1".txt
mv "$1".txt ./
# Aggregate the per-BAC best hits collected so far.
cat *.fasta.txt > all_bac.bed
#for chr in Chr3A Chr3B Chr5A Chr5D; do makeblastdb -in part_"$chr".fasta -dbtype nucl -parse_seqids -out ./"$chr"index; for item in dir bac.fasta.split/"$chr"/*.fasta; do sbatch --job-name="$chr"blast --partition= --cpus-per-task=128 blast.sh $item $chr; done; done
--------------------------------------------------------------------------------
/04_Evaluation/ltr.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Identify and curate LTR retrotransposons in a genome assembly.
# Usage: ltr.sh <genome.fasta> <prefix>

ref=$1
prefix=$2

# Parallel LTR_FINDER over 1 Mb chunks; -harvest_out emits an
# LTRharvest-format candidate file named <genome_basename>.finder.combine.scn
# in the working directory.
LTR_FINDER_parallel -seq "$ref" -threads 96 -harvest_out -size 1000000

# FIX: the candidate file was hard-coded as m2.1.7.fasta.finder.combine.scn
# (the name produced for a different assembly), so the script only worked
# for that one input.  Derive it from the actual genome instead.
LTR_retriever -threads 96 -genome "$ref" -inharvest "$(basename "$ref")".finder.combine.scn -dnalib clariTeRep.fna -plantprolib protein.fasta
--------------------------------------------------------------------------------
/04_Evaluation/mapping_rates_coverages .sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Mapping rates (samtools flagstat) and per-contig coverage (samtools
# coverage) for the three read-set alignments used in evaluation.
# Usage: mapping_rates_coverages.sh <hybrid.bam> <single.bam> <ont.bam>

hybrid_bam=$1
single_bam=$2
ont_bam=$3

# FIX: the three `samtools coverage` calls were given the literal words
# hybrid_bam/single_bam/ont_bam (the `$` was missing), so they operated on
# non-existent files instead of the BAMs passed on the command line.
samtools flagstat -@ 128 "$hybrid_bam" > hybrid_bam.flagstat
samtools coverage -o hybrid_bam.cov "$hybrid_bam"
samtools flagstat -@ 128 "$single_bam" > single_bam.flagstat
samtools coverage -o single_bam.cov "$single_bam"
samtools flagstat -@ 128 "$ont_bam" > ont_bam.flagstat
samtools coverage -o ont_bam.cov "$ont_bam"
--------------------------------------------------------------------------------
/04_Evaluation/qv.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Estimate assembly consensus quality (QV) with Merqury.
# Usage: qv.sh <query.fasta> <ref.fasta>
# Expects the precomputed k=21 meryl database in the working directory.

query=$1
ref=$2

# FIX: quote the expansions so paths with spaces do not word-split.
merqury.sh single.hifi40_cspcrfree.k21.gt1.meryl "$query" "$ref" t0 > t0.log
--------------------------------------------------------------------------------
/04_Evaluation/synteny.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Self-vs-self protein BLAST followed by MCScanX collinearity analysis.
# Usage: synteny.sh <proteins.fasta> <name> <annotation.gff3>

# FIX: -p so a rerun does not abort when the directories already exist.
mkdir -p blastdb
mkdir -p blastresult
protein=$1
name=$2
gff3=$3

makeblastdb -in "$protein" -dbtype prot -out ./blastdb/"${name}"
blastp -query "$protein" -db ./blastdb/"${name}" -out ./blastresult/"${name}".blast -num_threads 52 -outfmt 6 -evalue 1e-10 -num_alignments 5

# MCScanX .gff input: chromosome, mRNA ID, start, end for every mRNA record.
awk -vFS="\t" -vOFS="\t" '{if($3=="mRNA"){match($9,/ID=([^;]+)/,a);sub(/ID=/,"",a[0]);print $1,a[0],$4,$5}}' ${gff3} > ./blastresult/${name}.gff
# FIX: bail out if the cd fails rather than running MCScanX in the wrong dir.
cd blastresult || exit 1
MCScanX ./${name}
--------------------------------------------------------------------------------
/04_Evaluation/while.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Split reference and query per sequence, then submit one winnowmap job per
# wheat chromosome (1A..7D) via Slurm.
# Usage: while.sh <threads> <slurm_partition> <ref.fasta> <query.fasta>
# NOTE(review): the {1..7} brace range requires bash; confirm /bin/sh is
# bash on the target cluster or change the shebang.

threads=$1
partition=$2
ref=$3
query=$4
seqkit split -i "$ref"
seqkit split -i "$query"
for num in {1..7}
do
    for chr in A B D
    do
        # FIX: was --partition=$Partition — shell variables are case
        # sensitive, so sbatch received an empty --partition=.
        sbatch --job-name="$num""$chr" --partition=$partition --cpus-per-task="$threads" winnowmap.sh "$num""$chr" $ref $query
    done
done
--------------------------------------------------------------------------------
/04_Evaluation/winnowmap.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Align one chromosome of the query assembly against the same chromosome of
# the reference with winnowmap, then report reference regions with no
# alignment coverage.  Invoked per chromosome by while.sh:
#   winnowmap.sh <chr> <ref.fasta> <query.fasta>
#
# NOTE(review): after `cd "$chr"`, "$2".split/ and "$3".split/ resolve
# relative to the new directory — this only works when the caller passes
# absolute paths; confirm against while.sh.
# NOTE(review): assumes `seqkit split -i` produced Chr<chr>.fa inside
# <input>.split/ — seqkit versions differ in output naming; verify.

chr=$1
mkdir "$chr"
cd "$chr"
ref="$2".split/Chr"$chr".fa
query="$3".split/Chr"$chr".fa
samtools faidx $ref
# Chromosome-length table consumed by bedtools complement below.
cut -f 1,2 "$ref".fai > Chr"$chr".sizes

# Collect highly repetitive 27-mers (top 0.02% most frequent) so winnowmap
# can down-weight them via -W.
meryl count k=27 output merylDB_"$chr" ${ref}
meryl print greater-than distinct=0.9998 merylDB_"$chr" > repetitive_"$chr".txt
split_fa ${query} > split.fa

# asm20 preset, MD tags kept, hard-clipping (-H) for smaller SAM output.
winnowmap -W repetitive_"$chr".txt -ax asm20 -K 1500M -k 27 -w 18 -t 52 -H --MD ${ref} split.fa > chr"$chr".sam

# SAM -> PAF, keep positively scored records, merge covered intervals, and
# complement against the chromosome sizes = unaligned reference regions.
k8 paftools.js sam2paf -p chr"$chr".sam > chr"$chr".paf
cat chr"$chr".paf |awk '{if ($12 > 0) print $6"\t"$8"\t"$9}' |bedtools sort -i - |bedtools merge -i - |bedtools complement -i - -g Chr"$chr".sizes > Chr"$chr".bed
--------------------------------------------------------------------------------
/05_Annotation/Snakefile:
--------------------------------------------------------------------------------
1 | # Created on May 09, 2017
2 | #
3 | # Version 1.0
4 | #
5 | # @author: sven.twardziok@posteo.de
6 |
7 |
8 | configfile: "config.yaml"
9 |
10 | import csv
11 |
12 | from Bio import SeqIO
13 | from Bio.Seq import Seq
14 | from Bio.SeqRecord import SeqRecord
15 | from modules import fasta
16 | import os
17 | import re
18 | import linecache
19 |
#####################################################################################################################################
# TransDecoder workflow: predict coding regions on a flair/StringTie
# transcript set (supported by BLASTP and Pfam hmmscan evidence) and lift
# the resulting gene models onto genome coordinates.
# Fixed input file names, expected in the working directory:
#   CS-IAAS_v1.softmask.fasta   soft-masked genome assembly
#   flair.output.isoforms.gtf   transcript models from flair/StringTie
reference="CS-IAAS_v1.softmask.fasta"
flair_stringtie="flair.output.isoforms.gtf"

# Default target: the finished genome-coordinate GFF3.
rule final:
    input:
        "transcripts.genes.gff3"

# Extract transcript (cDNA) sequences from the genome using the GTF.
rule gtf_genome_to_cdna_fasta:
    input:
        gff=flair_stringtie,
        genome=reference
    output:
        "transcripts.fasta"
    threads: 128
    run:
        shell("gtf_genome_to_cdna_fasta.pl {input.gff} {input.genome} > {output}")

# Convert the GTF into an alignment GFF3 (needed by transdecoder_convert).
rule gtf_to_alignment_gff3:
    input:
        flair_stringtie
    output:
        "transcripts.gff3"
    threads: 128
    run:
        shell("gtf_to_alignment_gff3.pl {input} > {output}")

# TransDecoder step 1: extract long open reading frames from transcripts.
rule transdecoder_longorfs:
    input:
        fasta="transcripts.fasta"
    output:
        pep="transcripts.fasta.transdecoder_dir/longest_orfs.pep"
    params:
        executable=config["executables"]["transdecoder"]["longorfs"]
    threads: 128
    run:
        shell("{params.executable} -t {input.fasta}")

# Split the ORF peptides into nbatches part files for parallel homology
# searches (uses modules/fasta.py SplitSeqs).
rule transdecoder_splitfasta:
    input:
        fasta="transcripts.fasta.transdecoder_dir/longest_orfs.pep"
    output:
        fastas=temp(["transcripts.fasta.transdecoder_dir/batches/part_" + str(nbatch) + "/part_" + str(nbatch) + ".fasta"
            for nbatch in range(1, config["transdecoder"]["nbatches"]+1)])
    threads: 1
    run:
        splitfasta = fasta.SplitSeqs(sequences=input.fasta, outdir="transcripts.fasta.transdecoder_dir/batches" , nfiles=config["transdecoder"]["nbatches"])

# BLASTP one peptide batch against the protein database (ORF evidence).
rule transdecoder_blast:
    input:
        fasta="transcripts.fasta.transdecoder_dir/batches/part_{nbatch}/part_{nbatch}.fasta"
    output:
        blp=temp("transcripts.fasta.transdecoder_dir/batches/part_{nbatch}/part_{nbatch}.blp")
    params:
        database = config["data"]["transdecoder"]["blastp"],
        executable = config["executables"]["blastp"],
    threads: 1
    run:
        shell(params.executable + " -max_target_seqs 1 -evalue 1e-05 -db {params.database} -query {input.fasta} -out {output.blp} -outfmt 6")

# Concatenate the per-batch BLAST reports into a single evidence file.
rule transdecoder_blast_combine:
    input:
        blps=lambda wildcards: ["transcripts.fasta.transdecoder_dir/batches/part_" + str(nbatch) + "/part_" + str(nbatch) + ".blp"
            for nbatch in range(1, config["transdecoder"]["nbatches"]+1)]
    output:
        blp="transcripts.fasta.transdecoder_dir/longest_orfs.pep_blastresults.blp"
    threads: 1
    run:
        shell("touch {output.blp}")
        for blp in input.blps:
            shell("cat " + blp + " >> {output.blp}")

# Pfam domain scan of one peptide batch (second evidence source).
rule transdecoder_hmmscan:
    input:
        fasta="transcripts.fasta.transdecoder_dir/batches/part_{nbatch}/part_{nbatch}.fasta"
    output:
        domtblout=temp("transcripts.fasta.transdecoder_dir/batches/part_{nbatch}/part_{nbatch}.domtblout")
    params:
        executable = config["executables"]["hmmscan"],
        pfamhmm = config["data"]["transdecoder"]["pfamhmm"],
        nodes = config["transdecoder"]["hmmscan"]["nodes"],
        memory = config["transdecoder"]["hmmscan"]["memory"],
        job_name = "hmmscanning",
        log = config['transdecoder']['log']
    resources:
        MB = 2000,
        load = 1
    threads: 2
    run:
        shell(params.executable + " --domtblout {output.domtblout} {params.pfamhmm} {input.fasta}")

# Merge the per-batch hmmscan tables, stripping '#' header/comment lines.
rule transdecoder_hmmscan_combine:
    input:
        domtblout=lambda wildcards: ["transcripts.fasta.transdecoder_dir/batches/part_" + str(nbatch) + "/part_" + str(nbatch) + ".domtblout"
            for nbatch in range(1, config["transdecoder"]["nbatches"]+1)]
    output:
        domtblout="transcripts.fasta.transdecoder_dir/longest_orfs.pep_hmmscan.domtblout"
    params:
        nodes = 1,
        memory = "4G",
        job_name = config['transdecoder']['job_name'],
        log = config['transdecoder']['log']
    resources:
        load = 1,
        MB = 2000
    threads: 1
    run:
        shell("touch {output.domtblout}")
        for domtblout in input.domtblout:
            shell("grep -v \"#\" " + domtblout + " >> {output.domtblout}")

# TransDecoder step 2: final coding-region prediction, retaining ORFs with
# Pfam or BLASTP support.
rule transdecoder_predict:
    input:
        fasta = "transcripts.fasta",
        blp = "transcripts.fasta.transdecoder_dir/longest_orfs.pep_blastresults.blp",
        domtblout = "transcripts.fasta.transdecoder_dir/longest_orfs.pep_hmmscan.domtblout"
    output:
        gff3 = "transcripts.fasta.transdecoder.gff3"
    params:
        executable=config["executables"]["transdecoder"]["predict"],
        nodes = config["transdecoder"]["predict"]["nodes"],
        memory = config["transdecoder"]["predict"]["memory"],
        job_name = "predicting",
        log = config['transdecoder']['log']
    resources:
        load = 1
    threads: 128
    run:
        shell("{params.executable} -t {input.fasta} --retain_pfam_hits {input.domtblout} --retain_blastp_hits {input.blp} --cpu {params.nodes}")

# Lift transcript-coordinate ORFs onto genome coordinates.
rule transdecoder_convert:
    input:
        fasta = "transcripts.fasta",
        gff3 = "transcripts.fasta.transdecoder.gff3",
        gtf="transcripts.gff3"
    output:
        gff3 = "transcripts.genes.gff3"
    params:
        executable_gff3=config["executables"]["transdecoder"]["convertgff3"],
        executable_genome=config["executables"]["transdecoder"]["convertgenome"],
        nodes = config["transdecoder"]["convert"]["nodes"],
        memory = config["transdecoder"]["convert"]["memory"],
        job_name = "converting",
        log = config['transdecoder']['log']
    resources:
        load = 1
    threads: 1
    run:
        shell("{params.executable_genome} {input.gff3} {input.gtf} {input.fasta} > {output.gff3}")

# nohup python ~/software/miniconda3/envs/annotation/bin/snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1 &   (append -np for a dry run)
--------------------------------------------------------------------------------
/05_Annotation/clust.json:
--------------------------------------------------------------------------------
1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}'
2 | {
3 | "__default__" :
4 | {
5 | "account" : "sbatch -N 1 -n 1 -c 1 -p fatM4TC96Partition",
6 | "jobs" : "59"
7 | },
8 | "transdecoder_predict" :
9 | {
10 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition",
11 | "jobs" : "59"
12 | },
13 | "gtf_genome_to_cdna_fasta" :
14 | {
15 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition",
16 | "jobs" : "59"
17 | },
18 | "merge_flagstat" :
19 | {
20 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition",
21 | "jobs" : "59"
22 | },
23 | "gtf_to_alignment_gff3" :
24 | {
25 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition",
26 | "jobs" : "59"
27 | },
28 | "transdecoder_longorfs" :
29 | {
30 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition",
31 | "jobs" : "59"
32 | }
33 | }
34 | #snakemake -j 999 --cluster-config cluster.json --cluster "{cluster.account} -p {cluster.partition} -n {cluster.n} -t {cluster.time}"
35 | #--cluster "sbatch -A {cluster.account} -q {cluster.queue} -l select={cluster.nodes}:ncpus{cluster.ppn}:mem={cluster.mem} -l walltime={cluster.time}"
36 | #nohup snakemake -s sum.py --cluster-config clust.json --use-conda --cluster '{cluster.account}' --jobs 16 --restart-times 5 --conda-prefix /lustre1/deng_pkuhpc/deng_test/SF/min3/envs/map&
37 |
--------------------------------------------------------------------------------
/05_Annotation/config.yaml:
--------------------------------------------------------------------------------
1 | # the following section defines all inputs
2 | data:
3 | # add paths to ISOseq data sets. Each data set as a separate line
4 | longnucl:
5 | ds1:
6 | # add paths to reference proteins
7 | refprot:
8 | triticeae:
9 | # add paths to RNAseq data sets. Create a data set entry for different libraries. DS1 is an example for paired-end data; DS2 is single-ended
10 | rnaseq:
11 | ds1: # name of first data set
12 | LIB1: # name of first library
13 | 1:
14 | -
15 | -
16 | 2:
17 | -
18 | -
19 | ds2: # name of second data set
20 | LIB2: # name of second library
21 | 1:
22 | -
23 | -
24 | hisat2db:
25 | gmap:
26 | dbdir:
27 | dbname:
28 | gth:
29 | chr1:
30 | genome:
31 | transdecoder:
32 | pfamhmm: Pfam-A.hmm
33 | blastp: db
34 | cocla:
35 | unimag:
36 | unipoa:
37 | trep:
38 |
39 | # the following section defines all executables and parameters
40 | hisat2:
41 | arguments: -t --dta --no-unal --max-intronlen 50000
42 | memory: 24G
43 | nodes: 8
44 | threads: 8
45 | job_name: hisat2
46 | log: "hisat2.log"
47 | jobs: 4
48 |
49 |
50 | stringtie:
51 | arguments: -m 150 -t -f 0.3
52 | memory: 4G
53 | nodes: 8
54 | threads: 8
55 | job_name: stringtie
56 | log: "stringtie.log"
57 |
58 |
59 | gmap:
60 | arguments: -K 50000
61 | memory: 16G
62 | nodes: 8
63 | threads: 8
64 | job_name: gmap
65 | log: "gmap.log"
66 |
67 | gth:
68 | arguments: -species rice -startcodon -finalstopcodon -gcmaxgapwidth 50000 -gcmincoverage 70 -paralogs -prseedlength 7 -prhdist 4
69 | memory: 5G
70 | nbatches: 100
71 | nodes: 1
72 | threads: 1
73 | job_name: gth
74 | log: "gth.log"
75 |
76 |
77 | transdecoder:
78 | job_name: transdecoder
79 | log: "transdecoder.log"
80 | nbatches: 1000
81 | predict:
82 | nodes: 128
83 | memory: 8G
84 | convert:
85 | nodes: 1
86 | memory: 8G
87 | stringtie:
88 | memory: 8G
89 | nodes: 1
90 | threads: 1
91 | hmmscan:
92 | memory: 2G
93 | nodes: 1
94 | threads: 1
95 | blastp:
96 | memory: 8G
97 | nodes: 1
98 | threads: 1
99 |
100 |
101 | cocla:
102 | nbatches: 100
103 | memory: 1G
104 | nodes: 1
105 | evalue: 10
106 | job_name: cocla
107 | version:
108 | prefix: <"short name of genome">
109 | unipoa_threshold: 0.95 #complete
110 | unimag_threshold: 0.95 #reviewed
111 | repeat_threshold: 0.95 #trep
112 |
113 | executables:
114 | blastp: blastp
115 | cuffcompare: cuffcompare
116 | gffread: gffread
117 | gth: gth
118 | gmap: gmap.sse42
119 | hmmscan: hmmscan
120 | hisat2: hisat2
121 | samtools: samtools
122 | bamtools: bamtools
123 | stringtie: stringtie
124 | transdecoder:
125 | extract:
126 | convertgff3:
127 | convertgenome: cdna_alignment_orf_to_genome_orf.pl
128 | longorfs: TransDecoder.LongOrfs
129 | predict: TransDecoder.Predict
130 |
131 |
--------------------------------------------------------------------------------
/05_Annotation/modules/__pycache__/fasta.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/fasta.cpython-310.pyc
--------------------------------------------------------------------------------
/05_Annotation/modules/__pycache__/fasta.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/fasta.cpython-35.pyc
--------------------------------------------------------------------------------
/05_Annotation/modules/__pycache__/fasta.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/fasta.cpython-39.pyc
--------------------------------------------------------------------------------
/05_Annotation/modules/__pycache__/mygff.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/mygff.cpython-310.pyc
--------------------------------------------------------------------------------
/05_Annotation/modules/__pycache__/mygff.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/mygff.cpython-35.pyc
--------------------------------------------------------------------------------
/05_Annotation/modules/fasta.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 27, 2015
3 |
4 | @author: sven.twardziok
5 |
6 | Version 0.9
7 |
8 | '''
9 |
10 | from Bio import SeqIO
11 | import subprocess, re, itertools, csv
12 |
class SplitSeqs(object):
    """Split a multi-FASTA file into ``nfiles`` part files.

    Creates ``<outdir>/part_<i>/part_<i>.fasta`` for i in 1..nfiles and
    records each path in ``self.fasta_parts`` keyed by ``"part_<i>"``.

    Fixes over the previous implementation:
      * every output handle is closed (the handle of the last part was
        leaked before);
      * records per part uses integer ceiling division (the old
        ``nseqs/nfiles`` was a Python-2 leftover yielding a float);
      * every part file is created even when there are fewer sequences
        than parts, so downstream rules that declare all part files as
        outputs (see the Snakefile) do not fail on missing files.

    :param sequences: path to the input FASTA file
    :param outdir: directory that will hold the part_<i> subdirectories
    :param nfiles: number of parts to produce
    """

    def __init__(self, sequences, outdir, nfiles=500):
        # First pass: count records to derive the per-part batch size.
        with open(sequences, "r") as infile:
            nseqs = sum(1 for _ in SeqIO.parse(infile, "fasta"))
        # Ceiling division, never below one record per part.
        seqsperfile = max(1, -(-nseqs // nfiles))

        # Create all part directories and (empty) part files up front.
        self.fasta_parts = {}
        for i in range(1, nfiles + 1):
            tmpoutdir = "%s/part_%i" % (outdir, i)
            subprocess.call(["mkdir", "-p", tmpoutdir])
            tmpfilename = "%s/part_%i/part_%i.fasta" % (outdir, i, i)
            self.fasta_parts["part_%i" % (i)] = tmpfilename
            open(tmpfilename, "w").close()

        # Second pass: write records in batches of seqsperfile; anything
        # beyond the last boundary stays in the final part.
        with open(sequences, "r") as infile:
            nfile = 1
            counter = 0
            outfasta = open(self.fasta_parts["part_1"], "w")
            try:
                for record in SeqIO.parse(infile, "fasta"):
                    if counter >= seqsperfile and nfile < nfiles:
                        outfasta.close()
                        nfile += 1
                        counter = 0
                        outfasta = open(self.fasta_parts["part_%i" % (nfile)], "w")
                    SeqIO.write(record, outfasta, "fasta")
                    counter += 1
            finally:
                outfasta.close()
52 |
class PrintCdsStats(object):
    """Write a CSV report (id, length, status) classifying CDS sequences.

    Status values: ``complete`` (start and stop codon), ``no start``,
    ``no stop``, ``internal stop`` (in-frame stop before the final codon),
    ``no translation`` (length not a multiple of 3), or ``fragment``
    (neither start nor stop codon).

    :param infasta: path to the input CDS FASTA file
    :param outstats: path of the CSV file to write
    """

    def __init__(self, infasta, outstats):
        fieldnames = ["id", "length", "status"]
        with open(infasta, "r") as fasta_in:
            with open(outstats, "w") as csv_out:
                writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
                writer.writeheader()
                for record in SeqIO.parse(fasta_in, "fasta"):
                    seq = str(record.seq)
                    writer.writerow({
                        "id": record.id,
                        "length": len(seq),
                        "status": self._classify(seq),
                    })

    @staticmethod
    def _classify(seq):
        """Return the completeness label for one CDS string."""
        stop_codons = ("TGA", "TAG", "TAA")
        start_codons = ("ATG",)
        if len(seq) % 3 != 0:
            return "no translation"
        # Any in-frame stop after the first codon and before the last one?
        if any(seq[i:i + 3] in stop_codons for i in range(3, len(seq) - 3, 3)):
            return "internal stop"
        has_start = seq[0:3] in start_codons
        has_stop = seq[(len(seq) - 3):len(seq)] in stop_codons
        if has_start and has_stop:
            return "complete"
        if has_stop and not has_start:
            return "no start"
        if has_start and not has_stop:
            return "no stop"
        # Neither start nor stop codon present.
        return "fragment"
--------------------------------------------------------------------------------
/05_Annotation/modules/mygff.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on May 09, 2017
3 |
4 | Version 1.0
5 |
6 | @author: sven.twardziok@posteo.de
7 | """
8 |
9 | import csv, re, math
10 | from Bio import SeqIO
11 | from Bio.Seq import Seq
12 | from Bio.Alphabet import IUPAC
13 | from Bio.SeqRecord import SeqRecord
14 |
class Feature(object):
    """A single GFF3 feature (gene, mRNA, exon, CDS, UTR or intron).

    Based on the GFF3 specification:
    https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md

    Features are ordered primarily by (seqid, start); at identical
    coordinates the feature type decides, so that a gene sorts before its
    mRNAs, which sort before exons and other sub-features.
    """

    def __lt__(self, other):
        """Defines behavior for the less-than operator, <.

        NOTE(review): the body of this method was corrupted in the copy
        this revision was recovered from (several lines collapsed into an
        invalid statement).  It has been reconstructed as the exact mirror
        of ``__gt__`` below, so that ``a < b`` holds iff the mirrored
        ``b > a`` rules hold — confirm against upstream history.

        :param other: other feature object to compare with
        :type other: object
        """
        if self.seqid<other.seqid or (self.seqid==other.seqid and self.start<other.start):
            return True
        elif self.seqid==other.seqid and self.start==other.start and self.end<other.end:
            # mirror of the __gt__ "end > other.end" branch
            if other.ftype in ["gene"] and self.ftype in ["gene"]:
                return True
            if other.ftype in ["mRNA"] and self.ftype in ["mRNA", "gene"]:
                return True
            if other.ftype in ["exon"] and self.ftype in ["mRNA", "gene", "exon"]:
                return True
            if other.ftype in ["three_prime_UTR", "five_prime_UTR", "CDS", "intron"]:
                return True
            return False
        elif self.seqid==other.seqid and self.start==other.start and self.end>=other.end:
            # container types sort before their children at equal coordinates
            if self.ftype in ["gene"] and other.ftype not in ["gene"]:
                return True
            elif self.ftype in ["mRNA"] and other.ftype not in ["mRNA", "gene"]:
                return True
            elif self.ftype in ["exon"] and other.ftype not in ["exon", "mRNA", "gene"]:
                return True
            else:
                return False
        else:
            return False

    def __gt__(self, other):
        """Defines behavior for the greater-than operator, >.

        :param other: other feature object to compare with
        :type other: object
        """
        if self.seqid>other.seqid or (self.seqid==other.seqid and self.start>other.start):
            return True
        elif self.seqid==other.seqid and self.start==other.start and self.end>other.end:
            if self.ftype in ["gene"] and other.ftype in ["gene"]:
                return True
            if self.ftype in ["mRNA"] and other.ftype in ["mRNA", "gene"]:
                return True
            if self.ftype in ["exon"] and other.ftype in ["mRNA", "gene", "exon"]:
                return True
            if self.ftype in ["three_prime_UTR", "five_prime_UTR", "CDS", "intron"]:
                return True
            # FIX: was an implicit None; equivalent falsy value, explicit.
            return False
        elif self.seqid==other.seqid and self.start==other.start and self.end<=other.end:
            if self.ftype in ["mRNA"] and other.ftype in ["gene"]:
                return True
            if self.ftype in ["exon"] and other.ftype in ["mRNA", "gene"]:
                return True
            if self.ftype in ["three_prime_UTR", "five_prime_UTR", "CDS", "intron"] and other.ftype in ["exon", "mRNA", "gene"]:
                return True
            return False
        else:
            return False

    def __eq__(self, other):
        """Defines behavior for the equality operator, ==.

        Two mRNAs are equal when their coordinates match and their sorted
        CDS children are pairwise equal; other features are equal when
        coordinates and ftype match.

        :param other: other feature object to compare with
        :type other: object
        """
        if self.seqid==other.seqid and self.start==other.start and self.end==other.end:
            # define equality for mRNAs
            if self.ftype=="mRNA" and other.ftype=="mRNA":
                # get all CDSs from both mRNAs
                cdss_self = []
                cdss_other = []
                for feature in self.features:
                    if feature.ftype=="CDS":
                        cdss_self.append(feature)
                for feature in other.features:
                    if feature.ftype=="CDS":
                        cdss_other.append(feature)
                cdss_self = sorted(cdss_self)
                cdss_other = sorted(cdss_other)
                # check if number of CDSs are equal
                if len(cdss_self) == len(cdss_other):
                    # all CDS pairs must match
                    for i in range(0, len(cdss_self)):
                        if cdss_self[i] != cdss_other[i]:
                            return False
                    return True
                # FIX: differing CDS counts fell through returning None.
                return False
            elif self.ftype==other.ftype:
                return True
            else:
                return False
        # FIX: non-overlapping coordinates fell through returning None.
        return False

    def __hash__(self):
        # Keep hashing consistent with identity-relevant attributes so
        # Feature objects can live in sets/dicts.
        return hash((self.seqid, self.start, self.end, self.ftype, self.identifier))

    def __init__(self, seqid, source, ftype, start, end, score, strand, phase):
        """Create a feature object.

        :param seqid: sequence identifier
        :type seqid: string
        :param source: name of source
        :type source: string
        :param ftype: feature type ("exon", "mRNA", "gene", "three_prime_UTR", "five_prime_UTR", "CDS", "intron")
        :type ftype: string
        :param start: start position (1-based, leftmost)
        :type start: int
        :param end: end position (inclusive)
        :type end: int
        :param score: score value
        :type score: int
        :param strand: strand information ("+", "-" or ".")
        :type strand: string
        :param phase: phase information (used for CDS)
        :type phase: string
        """

        # standard fields from gff3 columns
        self.seqid = seqid
        self.source = source
        self.ftype = ftype
        self.start = start
        self.end = end
        self.score = score
        self.strand = strand
        self.phase = phase

        # attributes (GFF3 column 9)
        self.identifier = ""
        self.name = ""
        self.alias = ""
        self.notes = ""
        self.target = ""

        # links between features (parent gene/mRNA, child features)
        self.parent = None
        self.features = []

        # annotation confidence classes (note: "condidence" spelling is
        # kept — it is the attribute name used throughout this module)
        self.primary_confidence_class = ""
        self.secondary_condidence_class = ""

    def getLine(self):
        """Return the feature as a 9-column GFF3 row (list of values)."""
        writeattributes = ""
        # required attributes: genes carry ID, mRNAs ID+Parent, the rest Parent
        if self.ftype=="gene":
            writeattributes = "ID=%s" % (self.identifier)
        elif self.ftype=="mRNA":
            if self.parent is None:
                print("error, no parent for: %s" % (self.identifier))
            else:
                writeattributes = "ID=%s;Parent=%s" % (self.identifier, self.parent.identifier)
        else:
            if self.parent is None:
                print("error, no parent for: %s %s %s %i %i" % (self.seqid, self.source, self.ftype, self.start, self.end))
            else:
                writeattributes = "Parent=%s" % (self.parent.identifier)
        # optional attributes, only emitted when set
        if len(self.name)>0:
            writeattributes += ";Name=%s" % (self.name)
        if len(self.alias)>0:
            writeattributes += ";Alias=%s" % (self.alias)
        if len(self.target)>0:
            writeattributes += ";Target=%s" % (self.target)
        if len(self.notes)>0:
            writeattributes += ";Notes=%s" % (self.notes)
        if len(self.primary_confidence_class)>0:
            writeattributes += ";primary_confidence_class=%s" % (self.primary_confidence_class)
        if len(self.secondary_condidence_class)>0:
            writeattributes += ";secondary_confidence_class=%s" % (self.secondary_condidence_class)

        return [self.seqid, self.source, self.ftype, self.start, self.end, self.score, self.strand, self.phase, writeattributes]
193 |
194 | class GeneAnnotation(object):
195 | """Read specific gff files and returns structured data for plant.annot"""
196 |
197 | def readGff3PlantAnnot(self, path):
198 | """General GFF3 file used in plant.annot pipeline
199 |
200 | :param path: path to gff file
201 | :type path: string
202 |
203 | 0 seqname chrX Chromosome, scaffold or contig name
204 | 1 source name Name of source, e.g. database or software
205 | 2 feature exon "three_prime_UTR", "five_prime_UTR", "mRNA", "exon", "CDS", "gene", "intron"
206 | 3 start 77696957 The leftmost coordinate of this record (where 1 is the leftmost possible coordinate)
207 | 4 end 77712009 The rightmost coordinate of this record, inclusive.
208 | 5 score 0.3221 Some score value
209 | 6 strand + One of "+", "-", "."
210 | 7 frame . Frame for feature (just used for CDS)
211 | 8 attributes (GFF3) ID=XXX;Parent=XXX (ID is only used for genes and mRNAs; Parent is not used for genes)
212 | """
213 |
214 | self.features = []
215 | self.genes = {}
216 | self.mrnas = {}
217 | self.seqids = {}
218 | genes2mrnas = []
219 | mrnas2features = []
220 |
221 | # create features
222 | with open(path, "r") as ingff3:
223 | reader = csv.reader(ingff3, delimiter="\t", quoting = csv.QUOTE_NONE)
224 | for line in reader:
225 | if len(line)==9:
226 | seqid = line[0]
227 | source = line[1]
228 | ftype = line[2]
229 | start = int(line[3])
230 | end = int(line[4])
231 | score = line[5]
232 | strand = line[6]
233 | phase = line[7]
234 | feature = Feature(seqid, source, ftype, start, end, score, strand, phase)
235 | attributesline = line[8]
236 | attributes = {}
237 | for entry in attributesline.split(";"):
238 | matchAttribute = re.match(r"(.*)=(.*)", entry)
239 | if matchAttribute:
240 | attributes[matchAttribute.group(1)] = matchAttribute.group(2)
241 | # add attributes to feature
242 | if "ID" in attributes.keys():
243 | feature.identifier = attributes["ID"]
244 | if "Name" in attributes.keys():
245 | feature.name = attributes["Name"]
246 | if "Alias" in attributes.keys():
247 | feature.alias = attributes["Alias"]
248 | if "Notes" in attributes.keys():
249 | feature.notes = attributes["Notes"]
250 | if "Target" in attributes.keys():
251 | feature.target = attributes["Target"]
252 | if "primary_confidence_class" in attributes.keys():
253 | feature.primary_confidence_class = attributes["primary_confidence_class"]
254 | if "secondary_condidence_class" in attributes.keys():
255 | feature.secondary_condidence_class = attributes["secondary_condidence_class"]
256 | if "primconf" in attributes.keys():
257 | feature.primary_confidence_class = attributes["primconf"] #old version
258 | if "secconf" in attributes.keys():
259 | feature.secondary_condidence_class = attributes["secconf"] #old version
260 | # add gene to seqid and genes
261 | if feature.ftype == "gene":
262 | self.features.append(feature)
263 | if not feature.seqid in self.seqids.keys():
264 | self.seqids[seqid] = []
265 | self.seqids[feature.seqid].append(feature)
266 | self.genes[feature.identifier] = feature
267 | # add mrna to mrnas and mark for gene assignment
268 | elif feature.ftype == "mRNA":
269 | self.features.append(feature)
270 | self.mrnas[feature.identifier] = feature
271 | genes2mrnas.append({"geneid":attributes["Parent"], "mrna":feature})
272 | # mark remaining features for mrna assignment
273 | elif feature.ftype in ["exon", "three_prime_UTR", "five_prime_UTR", "CDS", "intron"]:
274 | self.features.append(feature)
275 | mrnas2features.append({"mrnaid":attributes["Parent"], "feature":feature})
276 |
277 | # assign genes to mrnas
278 | for assignment in genes2mrnas:
279 | geneid = assignment["geneid"]
280 | mrna = assignment["mrna"]
281 | if geneid in self.genes.keys():
282 | gene = self.genes[geneid]
283 | mrna.parent = gene
284 | gene.features.append(mrna)
285 | else:
286 | print("gene missing")
287 |
288 | # assign mrnas to features
289 | for assignment in mrnas2features:
290 | mrnaid = assignment["mrnaid"]
291 | feature = assignment["feature"]
292 | if mrnaid in self.mrnas.keys():
293 | mrna = self.mrnas[mrnaid]
294 | feature.parent = mrna
295 | mrna.features.append(feature)
296 | else:
297 | print("mrna missing")
298 |
299 | # sort features and return
300 | self.features = sorted(self.features)
301 | return(self)
302 |
303 | def combine(self, geneannotations, annoversion="PGSB"):
304 | self.features = []
305 | for geneannotation in geneannotations:
306 | self.features += geneannotation.features
307 | self.features = sorted(self.features)
308 | genecounter = 0
309 | mrnacounter = 0
310 | self.genes = {}
311 | self.mrnas = {}
312 | self.seqids = {}
313 | for feature in self.features:
314 | if feature.ftype=="gene":
315 | genecounter += 1
316 | if not feature.seqid in self.seqids.keys():
317 | self.seqids[feature.seqid] = []
318 | feature.identifier = "%s_gene_%i" % (annoversion, genecounter)
319 | self.genes[feature.identifier] = feature
320 | self.seqids[feature.seqid].append(feature)
321 | if feature.ftype=="mRNA":
322 | mrnacounter += 1
323 | feature.identifier = "%s_mRNA_%i" % (annoversion, mrnacounter)
324 | self.mrnas[feature.identifier] = feature
325 | return(self)
326 |
    def recalcGeneids(self, annoversion="PGSB"):
        """Rebuild gene models from the mRNAs and merge overlapping ones.

        Every mRNA first receives its own fresh gene (inheriting the
        descriptive attributes of its former parent gene); genes whose
        CDS overlap on the same seqid and strand are then merged into
        one. Requires ``self.features`` to be positionally sorted.
        """
        #1) get one new gene for each mrna; import all attributes from former genes
        tmpnewgenes = []
        tmpcounter = 0
        for feature in self.features:
            if feature.ftype == "mRNA":
                tmpcounter += 1
                tmpnewgeneid = "%s_gene_%i" % (annoversion, tmpcounter)
                tmpnewgene = Feature(seqid=feature.seqid, source=feature.source, ftype="gene", start=feature.start, end=feature.end, score=feature.score, strand=feature.strand, phase=feature.phase)
                tmpnewgene.identifier = tmpnewgeneid
                tmpnewgene.name = tmpnewgeneid
                # carry the old gene's descriptive attributes over to the new one
                tmpnewgene.alias = feature.parent.alias
                tmpnewgene.notes = feature.parent.notes
                tmpnewgene.target = feature.parent.target
                tmpnewgene.primary_confidence_class = feature.parent.primary_confidence_class
                # "condidence" is a long-standing attribute-name typo kept for compatibility
                tmpnewgene.secondary_condidence_class = feature.parent.secondary_condidence_class
                feature.parent = tmpnewgene
                tmpnewgene.features = [feature]
                tmpnewgenes.append(tmpnewgene)

        #2) merge genes with overlapping CDS (features need to be sorted)
        opencdss = {}  # seqid -> CDS features whose end we have not yet passed
        removegeneids = set([])  # identifiers of genes emptied by a merge
        for feature in self.features:
            if feature.ftype=="CDS":
                tmpopencdss = []
                opengeneid = "none"
                currentgene = feature.parent.parent
                # if there are no open CDS for current seqid initialze empty array
                if not feature.seqid in opencdss.keys():
                    opencdss[feature.seqid] = []
                # go through all open cds and keep if still open; set new gene to last open cds (gene are same for all open CDS on same strand)
                for opencds in opencdss[feature.seqid]:
                    if opencds.end>=feature.start:
                        tmpopencdss.append(opencds)
                        if opencds.strand==feature.strand:
                            opengene = opencds.parent.parent
                            opengeneid = opengene.identifier
                # set new gene to last open cds gene
                if currentgene.identifier!=opengeneid and opengeneid!="none":
                    tmpstart=math.inf
                    tmpend=0
                    # move every mRNA of the current gene into the open gene
                    for tmpmrna in currentgene.features:
                        tmpmrna.parent = opengene
                        opengene.features.append(tmpmrna)
                        tmpstart = min(tmpstart, tmpmrna.start)
                        tmpend = max(tmpend, tmpmrna.end)
                    # widen the surviving gene so it spans the absorbed mRNAs
                    opengene.start = min(tmpstart, opengene.start)
                    opengene.end = max(tmpend, opengene.end)
                    if currentgene.source!=opengene.source:
                        opengene.source = "multiple"
                    # NOTE(review): this clears an attribute named "mrnas", but child
                    # mRNAs live in .features elsewhere in this class — confirm intent
                    currentgene.mrnas = []
                    removegeneids.add(currentgene.identifier)
                tmpopencdss.append(feature)
                opencdss[feature.seqid] = tmpopencdss

        #3) update features and return object
        newfeatures = []
        newgenes = {}
        newseqids = {}
        # keep all non-gene features; genes come exclusively from tmpnewgenes
        for feature in self.features:
            if feature.ftype!="gene":
                newfeatures.append(feature)
        for gene in tmpnewgenes:
            if not gene.identifier in removegeneids:
                if not gene.seqid in newseqids.keys():
                    newseqids[gene.seqid] = []
                newgenes[gene.identifier] = gene
                newseqids[gene.seqid].append(gene)
                newfeatures.append(gene)
        self.genes = newgenes
        self.seqids = newseqids
        self.features = sorted(newfeatures)
        return(self)
401 |
402 | def collapseMrnas(self):
403 | """
404 | This function removes redundant mRNAs
405 | """
406 |
407 | newfeatures = []
408 | newmrnas = {}
409 | # go through all genes
410 | for geneid in self.genes:
411 | gene = self.genes[geneid]
412 | newfeatures.append(gene)
413 | tmp_keeptranscripts = []
414 | # go through all mRNAs
415 | for mrna1 in gene.features:
416 | isequal = False
417 | # check if there is already equal mRNA in set of new mRNAs
418 | for mrna2 in tmp_keeptranscripts:
419 | if mrna1 == mrna2:
420 | isequal = True
421 | if not isequal:
422 | tmp_keeptranscripts.append(mrna1)
423 | # set new mRNAs
424 | gene.features = tmp_keeptranscripts
425 | # add features to newfeatures (those to keep)
426 | for mrna in tmp_keeptranscripts:
427 | newfeatures.append(mrna)
428 | newmrnas[mrna.identifier] = mrna
429 | newfeatures += mrna.features
430 | self.features = sorted(newfeatures)
431 | self.mrnas = newmrnas
432 | return(self)
433 |
434 | def writeGff3Genes(self, path):
435 | with open(path, "w") as outgff:
436 | writer = csv.writer(outgff, delimiter="\t", quotechar="#", quoting = csv.QUOTE_NONE)
437 | for feature in self.features:
438 | writer.writerow(feature.getLine())
439 |
440 | def printGeneStats(self, path):
441 | with open(path, "w") as outfile:
442 | rowpattern = {"id":"none", "source":"none", "seqid":"none", "start":0, "end":0, "ntranscripts":0, "primconf":""}
443 | variables = ["id", "source", "seqid", "start", "end", "ntranscripts", "primconf"]
444 | writer = csv.DictWriter(outfile, fieldnames=variables)
445 | writer.writeheader()
446 | for geneid in self.genes:
447 | gene = self.genes[geneid]
448 | outdata = dict(rowpattern)
449 | outdata["id"] = geneid
450 | outdata["source"] = gene.source
451 | outdata["seqid"] = gene.seqid
452 | outdata["start"] = gene.start
453 | outdata["end"] = gene.end
454 | outdata["ntranscripts"] = len(gene.features)
455 | outdata["primconf"] = gene.primary_confidence_class
456 | writer.writerow(outdata)
457 |
458 | def printTranscriptsStats(self, path, includetargets=False):
459 | with open(path, "w") as outfile:
460 | rowpattern = {"id":"none", "gene": "none", "source":"none", "seqid":"none", "start":0, "end":0, "bpcdss":0, "ncdss":0, "primconf":"", "secconf":""}
461 | variables = ["id", "gene", "source", "seqid", "start", "end", "bpcdss", "ncdss", "primconf", "secconf"]
462 | if includetargets:
463 | rowpattern["target"] = ""
464 | variables.append("target")
465 | writer = csv.DictWriter(outfile, fieldnames=variables)
466 | writer.writeheader()
467 | for mrnaid in self.mrnas:
468 | mrna = self.mrnas[mrnaid]
469 | outdata = dict(rowpattern)
470 | outdata["id"] = mrnaid
471 | outdata["gene"] = mrna.parent.identifier
472 | outdata["source"] = mrna.source
473 | outdata["seqid"] = mrna.seqid
474 | outdata["start"] = mrna.start
475 | outdata["end"] = mrna.end
476 | outdata["primconf"] = mrna.primary_confidence_class
477 | outdata["secconf"] = mrna.secondary_condidence_class
478 | tmpbpcdss = 0
479 | tmpncdss = 0
480 | for cds in mrna.features:
481 | if cds.ftype=="CDS":
482 | tmpncdss += 1
483 | tmpbpcdss += (cds.end-cds.start)+1
484 | outdata["ncdss"] = tmpncdss
485 | outdata["bpcdss"] = tmpbpcdss
486 | if includetargets:
487 | outdata["target"] = mrna.target
488 | writer.writerow(outdata)
489 |
490 | def getHcGff3Genes(self):
491 | newfeatures = []
492 | newgenes = {}
493 | newseqids = {}
494 | newmrnas = {}
495 | for geneid in self.genes:
496 | gene = self.genes[geneid]
497 | if gene.primary_confidence_class=="HC":
498 | newfeatures.append(gene)
499 | newgenes[gene.identifier] = gene
500 | if not gene.seqid in newseqids:
501 | newseqids[gene.seqid] = []
502 | newseqids[gene.seqid].append(gene)
503 | for mrna in gene.features:
504 | newmrnas[mrna.identifier] = mrna
505 | newfeatures.append(mrna)
506 | newfeatures += mrna.features
507 | newanno = GeneAnnotation()
508 | newanno.features = sorted(newfeatures)
509 | newanno.genes = newgenes
510 | newanno.seqids = newseqids
511 | newanno.mrnas = newmrnas
512 | return newanno
513 |
514 | def getLcGff3Genes(self):
515 | newfeatures = []
516 | newgenes = {}
517 | newseqids = {}
518 | newmrnas = {}
519 | for geneid in self.genes:
520 | gene = self.genes[geneid]
521 | if gene.primary_confidence_class=="LC":
522 | newfeatures.append(gene)
523 | newgenes[gene.identifier] = gene
524 | if not gene.seqid in newseqids:
525 | newseqids[gene.seqid] = []
526 | newseqids[gene.seqid].append(gene)
527 | for mrna in gene.features:
528 | newmrnas[mrna.identifier] = mrna
529 | newfeatures.append(mrna)
530 | newfeatures += mrna.features
531 | newanno = GeneAnnotation()
532 | newanno.features = sorted(newfeatures)
533 | newanno.genes = newgenes
534 | newanno.seqids = newseqids
535 | newanno.mrnas = newmrnas
536 | return newanno
537 |
538 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 liusc
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SPART
2 | [](https://doi.org/10.5281/zenodo.14551115)
3 | [](https://doi.org/10.1038/s41588-025-02137-x)
4 | 
5 |
6 | SPART is a semi-automated pipeline for assembling telomere-to-telomere (T2T) reference sequences.
7 |
8 |
9 | **See [tutorial]( https://spart1.readthedocs.io/en/latest/) for more details.**
10 | ## Table of Contents
11 |
12 | - [Quick install and start](#started)
13 | - [Install](#Install)
14 | - [Dependencies](#Dependencies)
15 | - [Running pipeline with snakemake](#pipe)
16 | - [Output files](#Output)
17 | - [Run step by step](#step)
18 | - [00_Contig screen](#00_Contig)
19 | - [01_Contig scaffolding](#01_Contig)
20 | - [02_Gap patching](#02_Gap)
21 | - [03_Polishing](#03_Polishing)
22 | - [04_Evaluation](#04_Evaluation)
23 | - [05_Annotation](#05_Annotation)
24 |
25 | ## Quick install and start
26 | ### Install
27 | ```sh
28 | git clone https://github.com/liushoucheng/SPART.git
29 | cd SPART
30 | conda env create -f SPART.yaml
31 | conda activate spart
32 | ```
33 | ### Dependencies
34 |
35 | List of tools assumed loadable or accessible with no path are:
36 |
37 | * [Bionano DLS map]( https://bionano.com)
38 |
39 | * [HiC-Pro v3.1.0]( https://github.com/nservant/HiC-Pro)
40 |
41 | * [_submit_telomere.sh]( https://github.com/VGP/vgp-assembly/blob/master/pipeline/telomere/_submit_telomere.sh)
42 |
43 | * [Medaka]( https://anaconda.org/bioconda/medaka)
44 |
45 | * [racon]( https://anaconda.org/bioconda/racon)
46 |
47 | * [hisat2]( https://github.com/DaehwanKimLab/hisat2)
48 |
49 | * [DeepVariant v1.5.0-gpu]( https://github.com/google/deepvariant)
50 |
51 | * [PEPPER-Margin-DeepVariant v0.8-gpu]( https://github.com/kishwarshafin/pepper)
52 |
53 | * [hap.py v0.3.15]( https://github.com/Illumina/hap.py)
54 |
55 | * [vcf_merge_t2t.py](https://github.com/kishwarshafin/T2T_polishing_scripts/blob/master/polishing_merge_script/vcf_merge_t2t.py)
56 |
57 | * [miniprot_GFF_2_EVM_alignment_GFF3.py](https://github.com/EVidenceModeler/EVidenceModeler/blob/master/EvmUtils/misc/miniprot_GFF_2_EVM_alignment_GFF3.py)
58 |
59 | ### Running the pipeline with snakemake can assemble to the chromosome level, but the result may contain gaps whose closure requires the remaining steps to be done manually. (Excludes Verkko, Bionano DLS Map, telomere determination and patching, centromeric region analysis, variant calls and evaluation.)
60 | * [Download the example in SPART/example/]( https://gofile.me/77wE8/Vj6Vlp1LK)
61 | * [Download the digest_genome.py of HiC-Pro in SPART/]( https://github.com/nservant/HiC-Pro/blob/master/bin/utils/digest_genome.py)
62 | ```sh
63 | # Replace SPART_PATH with the current working directory
64 | sed -i "s#^ SPART_PATH# ${PWD}#g" conf_ck.yaml
65 | # HiC enzyme
66 | HiC_enzyme=" GATC"
67 | # Replace hic_sca_enzyme with the value stored in the HiC_enzyme variable
68 | sed -i "s#^ hic_sca_enzyme# ${HiC_enzyme}#g" conf_ck.yaml
69 | # Ligation site sequence used for reads trimming. Depends on the fill in strategy. Example: AAGCTAGCTT
70 | HiC_ligation_site=" GATCGATC"
71 | sed -i "s#^ hic_sca_ligation_site# ${HiC_ligation_site}#g" conf_ck.yaml #Replace hic_sca_ligation_site with the value stored in the HiC_ligation_site variable
72 | # This process uses the centos 7.6 operating system, slurm job scheduling system, please modify your SPART/clust.json according to the cluster situation.
73 | # This process requires the use of HiC-Pro, please add it to the environment before running.
74 | snakemake -s SPART.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs $threads --rerun-incomplete --restart-times 1 -np --rulegraph |dot -Tpng > rule.png #Running pipeline with snakemake
75 | # configfile:The config file can be used to define a dictionary of configuration parameters and their values.
76 | # cluster-config:A JSON or YAML file that defines the wildcards used in 'cluster'for specific rules.
77 | ```
78 |
79 |

80 |
81 |
82 | ### Output files
83 | please see the complete [documentation](https://github.com/liushoucheng/SPART/tree/main/example).
84 |
85 | ## Run step by step
86 |
87 | ### 00_Contig screen
88 | ```sh
89 | HiFi_reads=# file names of HiFi reads
90 | ONT_reads=# file names of Ultra-Long reads
91 | thread=# number of threads
92 | memory=# Specify the upper limit on memory to use
93 | output_prefix=# prefix of output files
94 | mitochondrion=# mitochondrion fasta
95 | chloroplast=# chloroplast fasta
96 | ref=# Sequences of mitochondria and chloroplasts need to be removed
97 | # Fastp :was used to filter adapter sequences, primers and other low quality sequence from raw sequencing reads.
98 | SPART/00_Contig_screen/fastp.sh $HiFi_reads $ONT_reads
99 | # Hifiasm
100 | SPART/00_Contig_screen/hifiasm.sh $HiFi_reads $ONT_reads $output_prefix $thread
101 | # Verkko
102 | SPART/00_Contig_screen/verkko.sh $output_prefix $HiFi_reads $ONT_reads $threads $memory
103 | # Flye
104 | SPART/00_Contig_screen/flye.sh $ONT_reads $output_prefix $threads
105 | # Remove mitochondrion && chloroplast
106 | SPART/00_Contig_screen/rm_mt_cp.sh $mitochondrion $chloroplast $ref $threads
107 | ```
108 | ### 01_Contig scaffolding
109 | ```sh
110 | threads=# Nominal threads per Node, without overloading (non-zero value will override -T -Tp -Te -TJ)
111 | bnx=# Input molecule (.bnx) file, required
112 | ref_cmap=# Reference file (must be .cmap), to compare resulting contigs
113 | prefix=# Location of output files root directory, required, will be created if does not exist; if does exist, will overwrite contents
114 | xml=# Read XML file for parameters
115 | Bio_dir=# Location of executable files (RefAligner and Assembler, required)
116 | cluster_xml=# Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)
117 | ref=# Input NGS FASTA
118 | bio_camp=# Input BioNano CMAP
119 | merge_xml=# Merge configuration file
120 | RefAligner=# RefAligner program
121 | hicpro_data=# input data folder; Must contains a folder per sample with input files
122 | hicpro_config=# configuration file for Hi-C processing
123 | hicpro_outdir=# output folder
124 | enzyme=# restriction enzyme cutting sites
125 | #### Bionano
126 | SPART/01_Contig_scaffolding/Bionano_DLS_map.sh $threads $bnx $ref_cmap $prefix $xml $Bio_dir $cluster_xml $ref $bio_camp $merge_xml $RefAligner
127 | #### Hi-C
128 | # hic-pro
129 | SPART/01_Contig_scaffolding/HiC-Pro.sh $ref $prefix $hicpro_data $hicpro_config $hicpro_outdir
130 | # yahs
131 | SPART/01_Contig_scaffolding/yahs.sh $enzyme $ref $bed/bam/bin $profix
132 | ```
133 | ### 02_Gap patching
134 | ```sh
135 | query=# query fasta file (uncompressed or bgzipped)
136 | ref=# target fasta file (uncompressed or bgzipped)
137 | region=# output directory
138 | SPART/02_Gap_patching/wfmash_ragtag.sh $query $ref $region
139 | ```
140 | #### Manual operation
141 | ```sh
142 | cd ragtag_output
143 | perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5
144 | ```
145 | **Manually edit the ragtag.patch.debug.filtered.paf file. Keep the high-quality contig and preserve the location of the only high-confidence match in ragtag.patch.debug.filtered.paf that matches the sequence at both ends of the gap.**
146 | ```sh
147 | perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf -start seq00000000 -end seq00000001 -o test.agp
148 | ```
149 | **Test.agp is merged into ragtag.patch.agp and fasta is generated.**
150 |
151 | #### e.g.
152 | ```sh
153 | # make joins and fill gaps in target.fa using sequences from query.fa
154 | cd SPART/example
155 | ragtag.py patch -i 0.99 --remove-small -q 10 --debug -u --aligner minimap2 -t 128 --mm2-params "-x asm20 -I1G -t 128" reference1A.fasta query1A.fasta
156 | # filter
157 | cd ragtag_output
158 | perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5
159 | # Manually edit the ragtag.patch.debug.filtered.paf_fiter.paf file. Keep the high-quality contig and preserve the location of the only high-confidence match in ragtag.patch.debug.filtered.paf_fiter.paf that matches the sequence at both ends of the gap.
160 | less ragtag.patch.debug.filtered.paf_fiter.paf
161 | qseq00000000 600453479 27150 3999147 + seq00000001 3972000 4 3971997 2266668 3972018 60
162 | qseq00000000 600453479 4038251 35116708 + seq00000002 597339226 17 31075089 17568679 31079144 60
163 | # gain agp
164 | perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf_fiter.paf -start seq00000001 -end seq00000002 -o test.agp
165 | less -S ragtag.patch.agp
166 | chr1A_RagTag_MOD_MOD 1 2046621 1 W seq00000000 1 2046621 +
167 | chr1A_RagTag_MOD_MOD 2046622 2046821 2 N 200 scaffold yes align_genus
168 | chr1A_RagTag_MOD_MOD 2046822 6018821 3 W seq00000001 1 3972000 +
169 | chr1A_RagTag_MOD_MOD 6018822 6019021 4 N 200 scaffold yes align_genus
170 | chr1A_RagTag_MOD_MOD 6019022 603358247 5 W seq00000002 1 597339226 +
171 | # Test.agp is merged into ragtag.patch.agp and fasta is generated.
172 | less -S ragtag.patch.agp
173 | scf00000000 1 2046621 1 W seq00000000 1 2046621 +
174 | scf00000000 2046622 2046821 2 N 200 scaffold yes align_genus
175 | scf00000000 2046822 6018821 3 W seq00000001 1 3972000 +
176 | scf00000000 6018822 6057905 4 W qseq00000000 3999151 4038234 +
177 | scf00000000 6057906 603397131 5 W seq00000002 1 597339226 +
178 | ragtag_agp2fa.py ragtag.patch.agp ragtag.patch.comps.fasta > ragtag.patch.fasta
179 | ```
180 | #### telomere patching
181 | We used _submit_telomere.sh on ONT reads >100 kb. ONT reads with telomere sequence mapping to this locus based on minimap2 alignments were manually identified. The longest was selected as the template; all others were aligned to it and polished with Medaka:
182 | ```sh
183 | medaka -v -i ONT_tel_reads.fasta -d longest_ont_tel.fasta -o ont_tel_medaka.fasta
184 | ```
185 | Telomere signal in all HiFi reads was identified with the commands:
186 | ```sh
187 | _submit_telomere.sh hifi_reads.fasta
188 | ```
189 | Additional HiFi reads were recruited from a manual analysis. We looked for trimmed tips that could extend. All reads had telomere signal and were aligned to the medaka consensus and polished with Racon with the commands:
190 | ```sh
191 | minimap2 -t16 -ax map-pb ont_tel_medaka.fasta hifi_tel.fasta > medaka.sam
192 | racon hifi_tel.fasta medaka.sam ont_tel_medaka.fasta > racon.fasta
193 | ```
194 | Finally, the polished result was patched into the assembly with ragtag patch or manually patched.
195 | ##### Citation
196 | https://github.com/marbl/CHM13-issues/blob/main/error_detection.md.
197 | #### Centromeric region analysis
198 | ```sh
199 | workdir=# work directory
200 | FASTA=# target fasta file (uncompressed or bgzipped)
201 | prefix=# prefix of output files
202 | CHIP1_treatment=# Treatment (pull-down) file(s).
203 | CHIP2_treatment=# Treatment (pull-down) file(s).
204 | threads=# number of threads
205 | CHIP1_control=# Control (input) file(s)
206 | CHIP2_control=# Control (input) file(s)
207 | SPART/02_Gap_patching/Centromeric_region_analysis.sh $workdir $FASTA $prefix $CHIP1_treatment $CHIP2_treatment $threads $CHIP1_control $CHIP2_control
208 | ```
209 | ### 03_Polishing
210 | ```sh
211 | # Use singularity and docker to download google_deepvariant_latest-gpu.sif and kishwars/pepper_deepvariant:r0.8-gpu respectively and modify the cluster-config and configfile in snakemake
212 | workdir=# work directory
213 | ref=# target fasta file (uncompressed or bgzipped)
214 | threads=# number of threads
215 | SPART/03_Polishing/calsv_snv.sh $workdir $ref $threads
216 | ```
217 | ### 04_Evaluation
218 | ```sh
219 | ref=# target fasta file (uncompressed or bgzipped)
220 | prefix=# prefix of output files
221 | query=# query fasta file (uncompressed or bgzipped)
222 | threads=# number of threads
223 | partition=# your cluster partition
224 | bac_reads=# bac reads
225 | ref_chr=# target chromosome fasta file (uncompressed or bgzipped)
226 | protein=# target protein fasta file
227 | name=# output file name
228 | gff3=# target gff file
229 | #### BUSCO
230 | SPART/04_Evaluation/BUSCO.sh $ref $prefix
231 | #### mapping rates & coverages
232 | SPART/04_Evaluation/mapping_rates_coverages.sh hybrid_bam single_bam ont_bam
233 | #### LTR
234 | SPART/04_Evaluation/ltr.sh $ref $prefix
235 | #### QV
236 | SPART/04_Evaluation/qv.sh $query $ref
237 | #### BACs
238 | SPART/04_Evaluation/bac.sh $bac_reads $ref_chr
239 | ### Addition
240 | SPART/04_Evaluation/while.sh $threads $partition $ref $query
241 | ### Analysis of synteny
242 | SPART/04_Evaluation/synteny.sh $protein $name $gff3
243 | ```
244 | ### 05_Annotation
245 | #### RNA-seq
246 | Detect adapter
247 | ```sh
248 | fastp --detect_adapter_for_pe -w ${threads} -i ${RNAseq1} -I ${RNAseq2} -o ${RNAseq1_clean} -O ${RNAseq2_clean} --json ${output}.json --html ${output}.html
249 | ```
250 | Build genome index
251 | ```sh
252 | STAR --runThreadN ${threads} --runMode genomeGenerate --genomeDir ${Output Dir} --genomeFastaFiles ${genome} --sjdbGTFtagExonParentTranscript Parent --sjdbGTFfile ${annotations} --limitGenomeGenerateRAM 40000000000 --sjdbOverhang 149 --sjdbFileChrStartEnd ${genomic coordinates} --limitSjdbInsertNsj 1854820
253 | ```
254 | Mapping to genome
255 | ```sh
256 | STAR --runThreadN ${threads} --genomeDir ${Output Dir} --readFilesIn ${RNAseq1_clean} ${RNAseq2_clean} --sjdbGTFtagExonParentTranscript Parent --sjdbGTFfile ${annotations} --outFileNamePrefix "$profix" --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --outSAMunmapped Within --outFilterMultimapNmax 20 --outSAMstrandField intronMotif --outFilterMismatchNoverLmax 0.02 --outFilterMismatchNmax 999 --alignIntronMin 20 --alignIntronMax 10000 --alignMatesGapMax 100000 --sjdbScore 1 --genomeLoad NoSharedMemory --outSAMtype BAM SortedByCoordinate --limitSjdbInsertNsj 1854820
257 | ```
258 | Assembly and merge
259 | ```sh
260 | stringtie -j 2 -c 2 -m 150 -f 0.3 -G ${reference annotation} -l rna-seq -t -p ${threads} -l "$profix" -A "$profix"gene_abund.tab -C "$profix"cov_refs.gtf -o "$profix".gtf "$profix"Aligned.sortedByCoord.out.bam
261 | stringtie --merge -p 96 -m 150 -c 10 -G ${reference annotation} -l rna_merge -o rna_all.gtf { gtf_list | strg1.gtf ...}
262 | ```
263 | TransDecoder
264 | ```sh
265 | python snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1
266 | ```
267 | #### ISO-seq
268 | Build genome index
269 | ```sh
270 | minimap2 -t 96 -I 16G -d $mmi $genome
271 | ```
272 | Align && Correct && Collapse
273 | ```sh
274 | flair 123 --mm2_args=-I15g,-axsplice:hq,-uf,-secondary=no -g $genome -r $iso_seq --mm_index $mmi -f $gtf -o flair.output --temp_dir temp_flair --stringent --no_gtf_end_adjustment --check_splice --generate_map --trust_end -t 96 --annotation_reliant generate --junction_bed $stringtie.bed
275 | ```
276 | TransDecoder
277 | ```sh
278 | python snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1
279 | ```
280 | #### Homology protein
281 | miniprot
282 | ```sh
283 | miniprot -t96 -d CS-IAAS_v1.softmask.mpi CS-IAAS_v1.softmask.fasta
284 | miniprot -It96 --gff CS-IAAS_v1.softmask.mpi ${Homology protein} > miniprot.gff3
285 | python miniprot_GFF_2_EVM_alignment_GFF3.py miniprot.gff3 > protein_alignments.gff3
286 | ```
287 | #### Ab initio gene prediction
288 | Braker3
289 | ```sh
290 | ##### RNA-seq && Homology protein
291 | docker run -c ${threads} --user 1000:100 -v /tmp:/tmp -v /home:/home -v /data:/data -v "$PWD":"$PWD" teambraker/braker3:latest braker.pl --workingdir="$PWD" --species=CS-IAAS --softmasking --genome=CS-IAAS_v1.softmask.fasta --addUTR=on --gff3 --nocleanup --bam=rna_seq.bam --prot_seq=${Homology protein} --threads ${threads} --BAMTOOLS_PATH= --AUGUSTUS_BIN_PATH= --JAVA_PATH=
292 | ##### ISO-seq && Homology protein
293 | docker run -c ${threads} --user 1000:100 -v /tmp:/tmp -v /home:/home -v /data:/data -v "$PWD":"$PWD" katharinahoff/playground:devel braker.pl --workingdir="$PWD" --species=CS-IAAS --softmasking --genome=CS-IAAS_v1.softmask.fasta --gff3 --nocleanup --bam=iso_seq.bam --prot_seq=${Homology protein} --threads ${threads} --BAMTOOLS_PATH= --AUGUSTUS_BIN_PATH=
294 | ```
295 | ## contacts
296 | Shoucheng Liu (liusc_work@163.com)
297 | Xiaopeng Li (xiaopeng.li@pku-iaas.edu.cn)
298 |
299 | ## Citating SPART
300 | If you use SPART in your work, please cite:
301 |
302 | Liu, S., Li, K., Dai, X. et al. A telomere-to-telomere genome assembly coupled with multi-omic data provides insights into the evolution of hexaploid bread wheat. Nat Genet (2025).
303 | https://doi.org/10.1038/s41588-025-02137-x
304 |
--------------------------------------------------------------------------------
/SPART.py:
--------------------------------------------------------------------------------
import os
import re
import sys

# Lookup tables mapping sample basename -> absolute fastq path.
b = {}
hifi_single = {}
hifi_mix = {}
e = {}
d = {}
# All paths and options come from the snakemake --configfile (config is
# injected by Snakemake at workflow-parse time).
HiFi_hybrid_all = config["HiFi_reads_merge"]
ONT_all = config["ONT_reads_merge"]
mitochondrion = config["mitochondrion"]
chloroplast = config["chloroplast"]
hic_hybrid_dir = config["hic_dir"]
SPART_dir = config["SPART_dir"]
hic_hybrid_enzyme = config["hic_enzyme"]
hic_enzyme_ligation_site = config["hic_enzyme_ligation_site"]
verkko_fa = config["verkko_assemble"]
pcrfree_hybrid_r1 = config["pcrfree_r1"]
pcrfree_hybrid_r2 = config["pcrfree_r2"]
google_deepvariant_latest_gpu_sif = config["google_deepvariant_latest-gpu_sif"]
W = config["WORKDIR"]
DIR = config["DIR"]
DIRont = config["DIRont"]

# Index HiFi fastq files by the name part before ".fastq".
for entry in os.listdir(DIR):
    if ".fastq" in entry:
        hifi_mix[entry.split(".fastq")[0]] = os.path.join(DIR, entry)

# Index ONT fastq files the same way.
for entry in os.listdir(DIRont):
    if ".fastq" in entry:
        e[entry.split(".fastq")[0]] = os.path.join(DIRont, entry)
35 |
# Target rule: the two files every complete pipeline run must produce.
rule final:
    input:
        W+"hybrid_hifi_pcr/hybrid.bam",
        W + "ont_merge/q10l120k.bam"
40 |
# Clean the merged HiFi reads with fastp (16 worker threads).
rule hifi_fastp:
    input:
        HiFi_hybrid_all
    output:
        W+"fastp/hybrid.fq"
    shell:
        "fastp -w 16 -i {input} -o {output}"
48 |
# Filter ONT reads: keep reads with quality >= 10 and length >= 100 kb.
rule ont_fastp:
    input:
        ONT_all
    output:
        W+"fastp/ont.fq"
    shell:
        "fastp -q 10 -l 100000 -w 16 -i {input} -o {output}"
56 |
# Hybrid assembly with hifiasm (HiFi + ultra-long ONT), then convert the
# primary-contig GFA to FASTA via the S-lines.
rule hifiasm:
    input:
        hifi=W+"fastp/hybrid.fq",
        ont=W+"fastp/ont.fq"
    output:
        W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa"
    params:
        W+"hifiasm_hybrid"
    shell:
        """
        cd {params}
        hifiasm -o hybrid.all.asm --primary -t 96 --ul {input.ont} -k 63 {input.hifi}
        awk '/^S/{{print ">"$2;print $3}}' hybrid.all.asm.p_ctg.gfa > {output}
        """
71 |
# ONT-only assembly with Flye (genome size 5.4 Gb, 80x coverage subset,
# no alternative contigs).
rule flye:
    input:
        W+"fastp/ont.fq"
    output:
        W + "flye/assembly.fasta"
    params:
        W
    shell:
        """
        cd {params}
        flye --nano-hq {input} --read-error 0.1 -g 5.4g --asm-coverage 80 --scaffold --out-dir flye --threads 96 --no-alt-contigs
        """
84 |
# Drop organellar contigs: map the assembly against the mitochondrion and
# chloroplast references with minimap2, list matching contig ids with
# gemma_los.py, then exclude them with seqkit grep -v.
rule rm_mt_cp:
    input:
        hybrid=W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa",
        mt=mitochondrion,
        cp=chloroplast
    output:
        W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
    params:
        dir=W+"hifiasm_hybrid",
        workdir=SPART_dir
    shell:
        """
        cd {params.dir}
        minimap2 -t 96 -x asm5 {input.mt} {input.hybrid}> mitochondrion.paf
        minimap2 -t 96 -x asm5 {input.cp} {input.hybrid}> chloroplast.paf
        python {params.workdir}/gemma_los.py mitochondrion.paf > mitochondrion.txt
        python {params.workdir}/gemma_los.py chloroplast.paf > chloroplast.txt
        seqkit grep -v -f chloroplast.txt {input.hybrid} > wheat_remove_cp.fa
        seqkit grep -v -f mitochondrion.txt wheat_remove_cp.fa > {output}
        """
105 |
# Hi-C read processing with HiC-Pro: build the bowtie2 index, digest the
# genome at the configured enzyme site, patch the bundled HiC-Pro config
# in place with sed, run HiC-Pro, and name-sort the resulting pairs BAM.
rule hicpro:
    input:
        hic=hic_hybrid_dir,
        ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
    output:
        W+"hic_hybrid/hic_hybrid.bam"
    params:
        dir=W+"hic_hybrid",
        prefix="hybrid.remove_cp_mt",
        spart_dir=SPART_dir,
        enzyme=hic_hybrid_enzyme,
        LIGATION_SITE=hic_enzyme_ligation_site
    shell:
        """
        cd {params.dir}
        ln -s {input.ref} ./
        bowtie2-build --large-index --threads 96 {params.prefix}.fa {params.prefix}
        samtools faidx {params.prefix}.fa
        awk '{{print $1 "\t" $2}}' {params.prefix}.fa.fai > genome_sizes.bed
        python {params.spart_dir}/digest_genome.py -r ^{params.enzyme} -o enzyme.bed {params.prefix}.fa
        makeblastdb -in {params.prefix}.fa -dbtype nucl -parse_seqids -out {params.prefix}
        cp {params.spart_dir}/01_Contig_scaffolding/hicpro_config.txt ./
        sed -i 's#^N_CPU = #N_CPU = 128#g' hicpro_config.txt
        sed -i 's#^BOWTIE2_IDX_PATH = #BOWTIE2_IDX_PATH = {params.dir}#g' hicpro_config.txt
        sed -i 's#^REFERENCE_GENOME = #REFERENCE_GENOME = {params.prefix}#g' hicpro_config.txt
        sed -i 's#^GENOME_SIZE = #GENOME_SIZE = {params.dir}/genome_sizes.bed#g' hicpro_config.txt
        sed -i 's#^GENOME_FRAGMENT = #GENOME_FRAGMENT = {params.dir}/enzyme.bed#g' hicpro_config.txt
        HiC-Pro -i {input.hic} -c hicpro_config.txt -o {params.dir}/result
        cd result/bowtie_results/bwt2/sample
        samtools sort -m 1500M -n -@ 96 HiC_hybrid.remove_cp_mt.bwt2pairs.bam > {params.dir}/hic_hybrid.bam
        """
137 |
# Hi-C scaffolding with YaHS on the coordinate-sorted, indexed pairs BAM.
rule yahs:
    input:
        bam=W+"hic_hybrid/hic_hybrid.bam",
        ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
    output:
        W + "yahs_hybrid/yahs_hybrid.fa"
    params:
        dir = W + "yahs_hybrid",
        prefix = "hybrid_bam",
        enzyme = hic_hybrid_enzyme
    shell:
        """
        cd {params.dir}
        samtools faidx {input.ref}
        samtools sort -@ 128 -o hic_hybrid_sort.bam {input.bam}
        samtools index hic_hybrid_sort.bam
        yahs -e {params.enzyme} {input.ref} hic_hybrid_sort.bam -o {params.prefix}
        cp {params.dir}/{params.prefix}_scaffolds_final.fa {output}
        """
157 |
# Gap patching with RagTag using the Flye assembly: a precomputed wfmash
# alignment is symlinked as ragtag.patch.asm.paf so ragtag reuses it
# instead of realigning.
rule patch_flye:
    input:
        single_hybrid=W + "yahs_hybrid/yahs_hybrid.fa",
        flye=W + "flye/assembly.fasta"
    output:
        W + "patch_flye/patch_single_hybrid_flye.fa"
    params:
        dir = W + "patch_flye",
        prefix = "single_hybrid_flye"
    shell:
        """
        cd {params.dir}
        samtools faidx {input.single_hybrid}
        samtools faidx {input.flye}
        wfmash {input.single_hybrid} {input.flye} > {params.prefix}.paf
        mkdir ragtag_output
        cd ragtag_output
        ln -s ../{params.prefix}.paf ragtag.patch.asm.paf
        cd ..
        ragtag.py patch -f 10000 --remove-small {input.single_hybrid} {input.flye}
        cp {params.dir}/ragtag_output/ragtag.patch.fasta {output}
        """
180 |
# Second RagTag patching pass using the Verkko assembly (same symlinked
# wfmash alignment trick as patch_flye); afterwards build the bwa-mem2
# index and the meryl repetitive 27-mer list used by winnowmap.
rule patch_verkko:
    input:
        single_hybrid_flye=W + "patch_flye/patch_single_hybrid_flye.fa",
        verkko=verkko_fa
    output:
        ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        txt = W + "repetitive_k27.txt"
    params:
        dir = W + "patch_verkko",
        prefix = "single_hybrid_flye_verkko",
    shell:
        """
        cd {params.dir}
        samtools faidx {input.single_hybrid_flye}
        samtools faidx {input.verkko}
        wfmash {input.single_hybrid_flye} {input.verkko} > {params.prefix}.paf
        mkdir ragtag_output
        cd ragtag_output
        ln -s ../{params.prefix}.paf ragtag.patch.asm.paf
        cd ..
        ragtag.py patch -f 10000 --remove-small {input.single_hybrid_flye} {input.verkko}
        cp {params.dir}/ragtag_output/ragtag.patch.fasta {output.ref}
        bwa-mem2 index {output.ref}
        meryl count k=27 output merylDB {output.ref}
        meryl print greater-than distinct=0.9998 merylDB > {output.txt}
        """
207 |
# Map HiFi reads to the patched assembly with winnowmap, masking the
# repetitive 27-mers listed in repetitive_k27.txt (-W) and emitting SAM with
# MD tags for downstream polishing.
# NOTE(review): the output uses the {hifi_mix} wildcard but the input fq is
# the fixed fastp/hybrid.fq — every wildcard instance maps the same reads;
# confirm this is intended.
208 | rule winnowmap_hifi:
209 |     input:
210 |         fq=W+"fastp/hybrid.fq",
211 |         ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
212 |         txt = W + "repetitive_k27.txt"
213 |     output:
214 |         sam=W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
215 |     benchmark:
216 |         W+"benchmarks/hifi_mix_winnowmap/{hifi_mix}.benchmark.txt"
217 |     shell:
218 |         """
219 |         winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output.sam}
220 |         """
221 |
# Convert the HiFi SAM to BAM (header reconstructed from the reference .fai
# via -t) and coordinate-sort it.
222 | rule winnowmap_hifi_sort:
223 |     input:
224 |         W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
225 |     output:
226 |         W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
227 |     params:
228 |         W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai"
229 |     benchmark:
230 |         W + "benchmarks/hifi_mix_sort/{hifi_mix}.benchmark.txt"
231 |     shell:
232 |         "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"
233 |
# Drop unmapped (0x4) and secondary (0x100) alignments (-F0x104); supplementary
# alignments are kept.
234 | rule winnowmap_hifi_sort_filter:
235 |     input:
236 |         W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
237 |     output:
238 |         W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam"
239 |     benchmark:
240 |         W + "benchmarks/hifi_mix_sort_filter/{hifi_mix}.benchmark.txt"
241 |     shell:
242 |         "samtools view -@32 -F0x104 -hb {input} > {output}"
243 |
# Merge the filtered per-sample HiFi BAMs into one uncompressed (-l 0) BAM.
244 | rule winnowmap_hifi_sort_filter_merge:
245 |     input:
246 |         expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix)
247 |     output:
248 |         W+"hybrid/hybrid.bam"
249 |     benchmark:
250 |         W + "benchmarks/hybrid/hybrid.benchmark.txt"
251 |     shell:
252 |         "samtools merge -@ 128 -l 0 {output} {input}"
253 |
# Map the PCR-free Illumina read pairs to the patched assembly with bwa-mem2
# and coordinate-sort the result. Relies on the bwa-mem2 index built by rule
# patch_verkko alongside the reference FASTA.
254 | rule pcr_free:
255 |     input:
256 |         fa=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
257 |         r1=pcrfree_hybrid_r1,
258 |         r2=pcrfree_hybrid_r2
259 |     output:
260 |         W+"hybrid_hifi_pcr/pcr.bam"
261 |     shell:
262 |         "bwa-mem2 mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 1G -o {output} -"
263 |
# Merge the filtered HiFi BAMs together with the PCR-free short-read BAM into
# one uncompressed (-l 0) BAM for joint polishing/evaluation.
264 | rule winnowmap_hifi_filter_pcr_merge:
265 |     input:
266 |         hifi=expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix),
267 |         pcr=W+"hybrid_hifi_pcr/pcr.bam"
268 |     output:
269 |         W+"hybrid_hifi_pcr/hybrid.bam"
270 |     benchmark:
271 |         W + "benchmarks/hybrid_pcr/hybrid.benchmark.txt"
272 |     shell:
273 |         "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}"
274 |
# Map ONT reads to the patched assembly with winnowmap (map-ont preset),
# masking the repetitive 27-mers via -W; emits SAM with MD tags.
# NOTE(review): as with winnowmap_hifi, the output uses the {e} wildcard but
# the input fq is the fixed fastp/ont.fq — confirm this is intended.
275 | rule winnowmap_ont:
276 |     input:
277 |         fq=W+"fastp/ont.fq",
278 |         ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
279 |         txt=W+"repetitive_k27.txt"
280 |     output:
281 |         W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
282 |     benchmark:
283 |         W+"benchmarks/ont_winnowmap/{e}.benchmark.txt"
284 |     shell:
285 |         "winnowmap --MD -W {input.txt} -ax map-ont -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"
286 |
# Convert the ONT SAM to BAM (header from the reference .fai via -t) and
# coordinate-sort it.
287 | rule winnowmap_ont_sort:
288 |     input:
289 |         W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
290 |     output:
291 |         W+"ont_sort/{e}/{e}_q10l120k.bam"
292 |     params:
293 |         W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai"
294 |     benchmark:
295 |         W + "benchmarks/ont_sort/{e}.benchmark.txt"
296 |     shell:
297 |         "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"
298 |
# Drop unmapped (0x4) and secondary (0x100) ONT alignments (-F0x104).
299 | rule winnowmap_ont_sort_filter:
300 |     input:
301 |         W+"ont_sort/{e}/{e}_q10l120k.bam"
302 |     output:
303 |         W+"ont_filter/{e}_q10l120k.bam"
304 |     benchmark:
305 |         W + "benchmarks/ont_filter/{e}.benchmark.txt"
306 |     shell:
307 |         "samtools view -@ 128 -F0x104 -hb {input} > {output}"
308 |
# Merge the filtered per-run ONT BAMs into a single BAM. (Unlike the HiFi
# merge, no -l 0 here, so the output is compressed at the default level.)
309 | rule winnowmap_ont_sort_filter_merge:
310 |     input:
311 |         expand(W+"ont_filter/{e}_q10l120k.bam",e=e)
312 |     output:
313 |         W + "ont_merge/q10l120k.bam"
314 |     benchmark:
315 |         W + "benchmarks/ont_merge/benchmark.txt"
316 |     shell:
317 |         "samtools merge -@ 128 {output} {input}"
318 |
--------------------------------------------------------------------------------
/SPART.yaml:
--------------------------------------------------------------------------------
1 | name: spart
2 | channels:
3 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
4 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
5 | - bioconda
6 | - defaults
7 | - etetoolkit
8 | - conda-forge
9 | dependencies:
10 | - _libgcc_mutex=0.1=conda_forge
11 | - _openmp_mutex=4.5=2_gnu
12 | - _r-mutex=1.0.0=anacondar_1
13 | - _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
14 | - abseil-cpp=20211102.0=hd4dd3e8_0
15 | - aioeasywebdav=2.4.0=py39hf3d152e_1001
16 | - aiohttp=3.8.3=py39h5eee18b_0
17 | - aiosignal=1.2.0=pyhd3eb1b0_0
18 | - amply=0.1.6=pyhd8ed1ab_0
19 | - appdirs=1.4.4=pyhd3eb1b0_0
20 | - async-timeout=4.0.2=py39h06a4308_0
21 | - attmap=0.13.2=pyhd8ed1ab_0
22 | - attrs=22.1.0=py39h06a4308_0
23 | - augustus=3.5.0=pl5321h700735d_3
24 | - bamtools=2.5.1=hd03093a_10
25 | - bbmap=39.01=h92535d8_1
26 | - bcftools=1.17=h3cc50cf_1
27 | - bcrypt=3.2.0=py39h5eee18b_1
28 | - bedtools=2.31.0=hf5e1c6e_2
29 | - binutils_impl_linux-64=2.38=h2a08ee3_1
30 | - biopython=1.78=py39h7f8727e_0
31 | - blas=1.0=openblas
32 | - blast=2.14.0=pl5321h6f7f691_2
33 | - boost-cpp=1.78.0=h5adbc97_2
34 | - boto3=1.24.28=py39h06a4308_0
35 | - botocore=1.27.59=py39h06a4308_0
36 | - bottleneck=1.3.5=py39h7deecbd_0
37 | - bowtie2=2.5.1=py39h3321a2d_0
38 | - brotlipy=0.7.0=py39h27cfd23_1003
39 | - busco=5.4.6=pyhdfd78af_0
40 | - bwa-mem2=2.2.1=hd03093a_5
41 | - bwidget=1.9.11=1
42 | - bzip2=1.0.8=h7b6447c_0
43 | - c-ares=1.19.0=h5eee18b_0
44 | - ca-certificates=2023.05.30=h06a4308_0
45 | - cachetools=4.2.2=pyhd3eb1b0_0
46 | - cairo=1.16.0=hb05425b_5
47 | - cdbtools=0.99=hdcf5f25_9
48 | - certifi=2023.5.7=py39h06a4308_0
49 | - cffi=1.15.1=py39h5eee18b_3
50 | - charset-normalizer=2.0.4=pyhd3eb1b0_0
51 | - cigar=0.1.3=pyh864c0ab_1
52 | - coin-or-cbc=2.10.7=hd28fd6d_0
53 | - coin-or-cgl=0.60.5=h3cb4718_0
54 | - coin-or-clp=1.17.7=h4e0f3ec_0
55 | - coin-or-osi=0.108.7=h3cb4718_0
56 | - coin-or-utils=2.11.6=h4e0f3ec_0
57 | - coincbc=2.10.7=0_metapackage
58 | - configargparse=1.4=pyhd3eb1b0_0
59 | - connection_pool=0.0.3=pyhd3deb0d_0
60 | - cryptography=39.0.1=py39h9ce1e76_0
61 | - curl=7.26.0=1
62 | - cutesv=2.0.3=pyhdfd78af_0
63 | - datrie=0.8.2=py39h27cfd23_0
64 | - dbus=1.10.20=0
65 | - defusedxml=0.7.1=pyhd3eb1b0_0
66 | - dendropy=4.6.1=pyhdfd78af_0
67 | - diamond=2.1.8=h43eeafb_0
68 | - docutils=0.18.1=py39h06a4308_3
69 | - dpath=2.1.6=pyha770c72_0
70 | - dropbox=11.36.1=py39h06a4308_0
71 | - entrez-direct=16.2=he881be0_1
72 | - epic2=0.0.52=py39h9f35bd6_6
73 | - exceptiongroup=1.0.4=py39h06a4308_0
74 | - expat=2.5.0=hcb278e6_1
75 | - fastp=0.22.0=h2e03b76_0
76 | - filechunkio=1.8=py_2
77 | - filelock=3.9.0=py39h06a4308_0
78 | - findutils=4.6.0=h166bdaf_1001
79 | - flye=2.9.1=py39h6935b12_0
80 | - font-ttf-dejavu-sans-mono=2.37=0
81 | - font-ttf-inconsolata=2.000=0
82 | - font-ttf-source-code-pro=2.030=0
83 | - font-ttf-ubuntu=0.83=0
84 | - fontconfig=2.14.2=h14ed4e7_0
85 | - fonts-anaconda=1=h8fa9717_0
86 | - fonts-conda-ecosystem=1=hd3eb1b0_0
87 | - freetype=2.12.1=h4a9f257_0
88 | - fribidi=1.0.10=h7b6447c_0
89 | - frozenlist=1.3.3=py39h5eee18b_0
90 | - ftputil=5.0.4=pyhd8ed1ab_0
91 | - gawk=5.1.0=h7b6447c_0
92 | - gcc_impl_linux-64=11.2.0=h1234567_1
93 | - gettext=0.21.1=h27087fc_0
94 | - gfortran_impl_linux-64=11.2.0=h7a446d4_16
95 | - gitdb=4.0.7=pyhd3eb1b0_0
96 | - gitpython=3.1.30=py39h06a4308_0
97 | - glib=2.76.4=hfc55251_0
98 | - glib-tools=2.76.4=hfc55251_0
99 | - gmp=6.2.1=h295c915_3
100 | - google-api-core=2.10.1=py39h06a4308_0
101 | - google-api-python-client=2.93.0=pyhd8ed1ab_0
102 | - google-auth=2.6.0=pyhd3eb1b0_0
103 | - google-auth-httplib2=0.1.0=pyhd8ed1ab_1
104 | - google-cloud-core=2.3.2=py39h06a4308_0
105 | - google-cloud-storage=2.6.0=py39h06a4308_0
106 | - google-crc32c=1.5.0=py39h5eee18b_0
107 | - google-resumable-media=2.4.0=py39h06a4308_0
108 | - googleapis-common-protos=1.56.4=py39h06a4308_0
109 | - graphaligner=1.0.17=hd03093a_0
110 | - graphite2=1.3.14=h295c915_1
111 | - grpc-cpp=1.48.2=h5bf31a4_0
112 | - grpcio=1.48.2=py39h5bf31a4_0
113 | - gsl=2.7=he838d99_0
114 | - gxx_impl_linux-64=11.2.0=h1234567_1
115 | - harfbuzz=6.0.0=h8e241bc_0
116 | - hifiasm=0.19.5=h43eeafb_2
117 | - hmmer=3.1b2=3
118 | - htslib=1.17=h6bc39ce_1
119 | - httplib2=0.22.0=pyhd8ed1ab_0
120 | - humanfriendly=10.0=py39h06a4308_1
121 | - icu=70.1=h27087fc_0
122 | - idna=3.4=py39h06a4308_0
123 | - iniconfig=1.1.1=pyhd3eb1b0_0
124 | - intervaltree=3.1.0=pyhd3eb1b0_0
125 | - irissv=1.0.4=hdfd78af_2
126 | - jasminesv=1.1.5=hdfd78af_0
127 | - jinja2=3.1.2=py39h06a4308_0
128 | - jmespath=0.10.0=pyhd3eb1b0_0
129 | - jpeg=9e=h5eee18b_1
130 | - jsonschema=4.17.3=py39h06a4308_0
131 | - jupyter_core=5.3.0=py39h06a4308_0
132 | - k8=0.2.5=hdcf5f25_4
133 | - kernel-headers_linux-64=3.10.0=h57e8cba_10
134 | - krb5=1.19.4=h568e23c_0
135 | - ld_impl_linux-64=2.38=h1181459_1
136 | - lerc=4.0.0=h27087fc_0
137 | - libblas=3.9.0=17_linux64_openblas
138 | - libcblas=3.9.0=17_linux64_openblas
139 | - libcrc32c=1.1.2=h6a678d5_0
140 | - libcurl=7.88.1=h91b91d3_0
141 | - libdeflate=1.13=h166bdaf_0
142 | - libdivsufsort=2.0.2=h031d066_8
143 | - libedit=3.1.20221030=h5eee18b_0
144 | - libev=4.33=h7f8727e_1
145 | - libexpat=2.5.0=hcb278e6_1
146 | - libffi=3.4.4=h6a678d5_0
147 | - libgcc-devel_linux-64=11.2.0=h1234567_1
148 | - libgcc-ng=13.1.0=he5830b7_0
149 | - libgfortran-ng=13.1.0=h69a702a_0
150 | - libgfortran5=13.1.0=h15d22d2_0
151 | - libglib=2.76.4=hebfc3b9_0
152 | - libgomp=13.1.0=he5830b7_0
153 | - libiconv=1.17=h166bdaf_0
154 | - libidn2=2.3.4=h5eee18b_0
155 | - libjemalloc=5.3.0=hcb278e6_0
156 | - liblapack=3.9.0=17_linux64_openblas
157 | - libnghttp2=1.52.0=ha637b67_1
158 | - libnsl=2.0.0=h5eee18b_0
159 | - libopenblas=0.3.23=pthreads_h80387f5_0
160 | - libpng=1.6.39=h5eee18b_0
161 | - libprotobuf=3.20.3=he621ea3_0
162 | - libsodium=1.0.18=h7b6447c_0
163 | - libsqlite=3.42.0=h2797004_0
164 | - libssh2=1.10.0=h37d81fd_2
165 | - libstdcxx-devel_linux-64=11.2.0=h1234567_1
166 | - libstdcxx-ng=13.1.0=hfd8a6a1_0
167 | - libtiff=4.4.0=h0e0dad5_3
168 | - libunistring=0.9.10=h27cfd23_0
169 | - libuuid=2.38.1=h0b41bf4_0
170 | - libwebp-base=1.2.4=h5eee18b_1
171 | - libxcb=1.15=h7f8727e_0
172 | - libxml2=2.10.3=hca2bb57_4
173 | - libzlib=1.2.13=hd590300_5
174 | - logmuse=0.2.6=pyh8c360ce_0
175 | - lp_solve=5.5.2.5=h14c3975_1001
176 | - lz4-c=1.9.4=h6a678d5_0
177 | - make=4.2.1=h1bed415_1
178 | - markdown-it-py=2.2.0=py39h06a4308_1
179 | - markupsafe=2.1.1=py39h7f8727e_0
180 | - mashmap=3.0.5=h97b747e_0
181 | - mbg=1.0.15=hdcf5f25_2
182 | - mdurl=0.1.0=py39h06a4308_0
183 | - merqury=1.3=hdfd78af_1
184 | - meryl=1.3=hdbdd923_2
185 | - metaeuk=6.a5d39d9=pl5321h6a68c12_3
186 | - metis=5.1.0=hf484d3e_4
187 | - minimap2=2.24=h7132678_1
188 | - mpfr=4.0.2=hb69a4c5_1
189 | - mscorefonts=0.0.1=3
190 | - multidict=6.0.2=py39h5eee18b_0
191 | - mummer=3.23=pl5321hdbdd923_16
192 | - mysql-connector-c=6.1.11=h24aacaa_2
193 | - natsort=7.1.1=pyhd3eb1b0_0
194 | - nbformat=5.7.0=py39h06a4308_0
195 | - ncbi-vdb=3.0.0=pl5321h87f3376_0
196 | - ncurses=6.4=h6a678d5_0
197 | - networkx=2.8.4=py39h06a4308_1
198 | - numexpr=2.8.4=py39hd2a5715_1
199 | - numpy=1.25.0=py39heeff2f4_0
200 | - numpy-base=1.25.0=py39h8a23956_0
201 | - oauth2client=4.1.3=py_0
202 | - openblas=0.3.23=pthreads_h855a84d_0
203 | - openjdk=11.0.13=h87a67e3_0
204 | - openssl=1.1.1u=h7f8727e_0
205 | - packaging=23.0=py39h06a4308_0
206 | - pandas=1.5.3=py39h417a72b_0
207 | - pango=1.50.14=hd33c08f_0
208 | - paramiko=2.8.1=pyhd3eb1b0_0
209 | - parasail-python=1.3.4=py39h4e691d4_0
210 | - pbzip2=1.1.13=0
211 | - pcre=8.45=h295c915_0
212 | - pcre2=10.40=hc3806b6_0
213 | - peppy=0.35.6=pyhd8ed1ab_0
214 | - perl=5.32.1=0_h5eee18b_perl5
215 | - perl-apache-test=1.43=pl5321hdfd78af_0
216 | - perl-app-cpanminus=1.7046=pl5321hd8ed1ab_0
217 | - perl-archive-tar=2.40=pl5321hdfd78af_0
218 | - perl-base=2.23=pl5321hdfd78af_2
219 | - perl-carp=1.38=pl5321hdfd78af_4
220 | - perl-class-load=0.25=pl5321hdfd78af_1
221 | - perl-class-load-xs=0.10=pl5321h9f5acd7_6
222 | - perl-class-method-modifiers=2.13=pl5321hdfd78af_0
223 | - perl-common-sense=3.75=pl5321hdfd78af_0
224 | - perl-compress-raw-bzip2=2.201=pl5321h87f3376_1
225 | - perl-compress-raw-zlib=2.105=pl5321h87f3376_0
226 | - perl-constant=1.33=pl5321hdfd78af_2
227 | - perl-cpan-meta-check=0.014=pl5321hdfd78af_1
228 | - perl-cpan-meta-requirements=2.143=pl5321hdfd78af_0
229 | - perl-data-optlist=0.113=pl5321ha770c72_0
230 | - perl-dbi=1.643=pl5321hec16e2b_1
231 | - perl-devel-globaldestruction=0.14=pl5321hdfd78af_1
232 | - perl-devel-overloadinfo=0.007=pl5321hdfd78af_0
233 | - perl-devel-stacktrace=2.04=pl5321hdfd78af_1
234 | - perl-dist-checkconflicts=0.11=pl5321hdfd78af_3
235 | - perl-encode=3.19=pl5321hec16e2b_1
236 | - perl-eval-closure=0.14=pl5321h9f5acd7_6
237 | - perl-exporter=5.72=pl5321hdfd78af_2
238 | - perl-exporter-tiny=1.002002=pl5321hdfd78af_0
239 | - perl-extutils-makemaker=7.70=pl5321hd8ed1ab_0
240 | - perl-file-path=2.18=pl5321hd8ed1ab_0
241 | - perl-file-temp=0.2304=pl5321hd8ed1ab_0
242 | - perl-file-which=1.24=pl5321hd8ed1ab_0
243 | - perl-getopt-long=2.54=pl5321hdfd78af_0
244 | - perl-inc-latest=0.500=pl5321ha770c72_0
245 | - perl-io-compress=2.201=pl5321hdbdd923_2
246 | - perl-io-zlib=1.14=pl5321hdfd78af_0
247 | - perl-json=4.10=pl5321hdfd78af_0
248 | - perl-json-xs=2.34=pl5321h4ac6f70_6
249 | - perl-list-moreutils=0.430=pl5321hdfd78af_0
250 | - perl-list-moreutils-xs=0.430=pl5321h031d066_2
251 | - perl-module-build=0.4234=pl5321ha770c72_0
252 | - perl-module-implementation=0.09=pl5321hdfd78af_3
253 | - perl-module-metadata=1.000038=pl5321hdfd78af_0
254 | - perl-module-runtime=0.016=pl5321hdfd78af_2
255 | - perl-module-runtime-conflicts=0.003=pl5321hdfd78af_1
256 | - perl-moo=2.005004=pl5321hdfd78af_0
257 | - perl-moose=2.2202=pl5321hec16e2b_0
258 | - perl-mro-compat=0.15=pl5321hdfd78af_0
259 | - perl-package-deprecationmanager=0.17=pl5321hdfd78af_1
260 | - perl-package-stash=0.40=pl5321h87f3376_1
261 | - perl-package-stash-xs=0.30=pl5321h0b41bf4_0
262 | - perl-parallel-forkmanager=2.02=pl5321hdfd78af_1
263 | - perl-params-util=1.102=pl5321h9f5acd7_1
264 | - perl-parent=0.236=pl5321hdfd78af_2
265 | - perl-pathtools=3.75=pl5321hec16e2b_3
266 | - perl-role-tiny=2.002004=pl5321hdfd78af_0
267 | - perl-scalar-list-utils=1.62=pl5321hec16e2b_1
268 | - perl-storable=3.15=pl5321hec16e2b_3
269 | - perl-sub-exporter=0.988=pl5321hdfd78af_0
270 | - perl-sub-exporter-progressive=0.001013=pl5321hdfd78af_1
271 | - perl-sub-identify=0.14=pl5321hec16e2b_2
272 | - perl-sub-install=0.928=pl5321hdfd78af_3
273 | - perl-sub-name=0.21=pl5321hec16e2b_3
274 | - perl-sub-quote=2.006006=pl5321hdfd78af_0
275 | - perl-test-fatal=0.016=pl5321hdfd78af_0
276 | - perl-try-tiny=0.31=pl5321hdfd78af_1
277 | - perl-types-serialiser=1.01=pl5321hdfd78af_0
278 | - perl-version=0.9924=pl5321hec16e2b_2
279 | - perl-xsloader=0.24=pl5321hd8ed1ab_0
280 | - perl-yaml=1.30=pl5321hdfd78af_0
281 | - pip=23.1.2=py39h06a4308_0
282 | - pixman=0.40.0=h7f8727e_1
283 | - plac=1.3.4=pyhd3eb1b0_0
284 | - platformdirs=2.5.2=py39h06a4308_0
285 | - pluggy=1.0.0=py39h06a4308_1
286 | - ply=3.11=py39h06a4308_0
287 | - prettytable=3.5.0=py39h06a4308_0
288 | - prodigal=2.6.3=h031d066_6
289 | - protobuf=3.20.3=py39h6a678d5_0
290 | - psutil=5.9.0=py39h5eee18b_0
291 | - pulp=2.7.0=py39hf3d152e_0
292 | - pyasn1=0.4.8=pyhd3eb1b0_0
293 | - pyasn1-modules=0.2.8=py_0
294 | - pycparser=2.21=pyhd3eb1b0_0
295 | - pygments=2.15.1=py39h06a4308_1
296 | - pynacl=1.5.0=py39h5eee18b_0
297 | - pyopenssl=23.0.0=py39h06a4308_0
298 | - pyparsing=3.0.9=py39h06a4308_0
299 | - pyrsistent=0.18.0=py39heee7806_0
300 | - pysam=0.21.0=py39h9abd093_0
301 | - pysftp=0.2.9=pyhd3eb1b0_1
302 | - pysocks=1.7.1=py39h06a4308_0
303 | - pytest=7.3.1=py39h06a4308_0
304 | - python=3.9.16=h7a1cb2a_2
305 | - python-dateutil=2.8.2=pyhd3eb1b0_0
306 | - python-fastjsonschema=2.16.2=py39h06a4308_0
307 | - python-irodsclient=1.1.8=pyhd8ed1ab_0
308 | - python_abi=3.9=2_cp39
309 | - pytz=2022.7=py39h06a4308_0
310 | - pyvcf=0.6.8=py39hde42818_1002
311 | - pyyaml=6.0=py39h5eee18b_1
312 | - r-argparse=2.1.5=r42h6115d3f_0
313 | - r-base=4.2.2=hb87df5d_1
314 | - r-cli=3.3.0=r42h884c59f_0
315 | - r-colorspace=2.0_3=r42h76d94ec_0
316 | - r-crayon=1.5.1=r42h6115d3f_0
317 | - r-digest=0.6.29=r42h884c59f_0
318 | - r-ellipsis=0.3.2=r42h76d94ec_0
319 | - r-fansi=1.0.3=r42h76d94ec_0
320 | - r-farver=2.1.0=r42h884c59f_0
321 | - r-findpython=1.0.7=r42h6115d3f_0
322 | - r-ggplot2=3.3.6=r42h6115d3f_0
323 | - r-glue=1.6.2=r42h76d94ec_0
324 | - r-gtable=0.3.0=r42h6115d3f_0
325 | - r-isoband=0.2.5=r42h884c59f_0
326 | - r-jsonlite=1.8.0=r42h76d94ec_0
327 | - r-labeling=0.4.2=r42h6115d3f_0
328 | - r-lattice=0.20_45=r42h76d94ec_0
329 | - r-lifecycle=1.0.1=r42h142f84f_0
330 | - r-magrittr=2.0.3=r42h76d94ec_0
331 | - r-mass=7.3_57=r42h76d94ec_0
332 | - r-matrix=1.4_1=r42h76d94ec_0
333 | - r-mgcv=1.8_40=r42h76d94ec_0
334 | - r-munsell=0.5.0=r42h6115d3f_0
335 | - r-nlme=3.1_157=r42h640688f_0
336 | - r-pillar=1.7.0=r42h6115d3f_0
337 | - r-pkgconfig=2.0.3=r42h6115d3f_0
338 | - r-r6=2.5.1=r42h6115d3f_0
339 | - r-rcolorbrewer=1.1_3=r42h6115d3f_0
340 | - r-rlang=1.0.2=r42h884c59f_0
341 | - r-scales=1.2.0=r42h6115d3f_0
342 | - r-tibble=3.1.7=r42h76d94ec_0
343 | - r-utf8=1.2.2=r42h76d94ec_0
344 | - r-vctrs=0.4.1=r42h884c59f_0
345 | - r-viridislite=0.4.0=r42h6115d3f_0
346 | - r-withr=2.5.0=r42h6115d3f_0
347 | - racon=1.5.0=h21ec9f0_2
348 | - ragtag=2.1.0=pyhb7b1952_0
349 | - re2=2022.04.01=h295c915_0
350 | - readline=8.2=h5eee18b_0
351 | - requests=2.29.0=py39h06a4308_0
352 | - reretry=0.11.8=pyhd8ed1ab_0
353 | - rich=13.3.5=py39h06a4308_0
354 | - rsa=4.7.2=pyhd3eb1b0_1
355 | - s3transfer=0.6.0=py39h06a4308_0
356 | - samtools=1.17=hd87286a_1
357 | - scipy=1.11.3=py39heeff2f4_0
358 | - sed=4.8=h7b6447c_0
359 | - sepp=4.4.0=py39_0
360 | - setuptools=67.8.0=py39h06a4308_0
361 | - seqkit=2.5.1=h9ee0642_0
362 | - six=1.16.0=pyhd3eb1b0_1
363 | - slacker=0.14.0=py_0
364 | - smart_open=5.2.1=py39h06a4308_0
365 | - smmap=4.0.0=pyhd3eb1b0_0
366 | - snakemake=7.21.0=hdfd78af_0
367 | - snakemake-minimal=7.21.0=pyhdfd78af_0
368 | - sniffles=2.0.7=pyhdfd78af_0
369 | - sortedcontainers=2.4.0=pyhd3eb1b0_0
370 | - sqlite=3.41.2=h5eee18b_0
371 | - stone=3.3.1=py39h06a4308_0
372 | - stopit=1.1.2=py_0
373 | - suitesparse=5.10.1=he2db622_2
374 | - sysroot_linux-64=2.17=h57e8cba_10
375 | - tabulate=0.8.10=py39h06a4308_0
376 | - tar=1.34=hb2e2bae_1
377 | - tbb=2021.8.0=hdb19cb5_0
378 | - throttler=1.2.1=pyhd8ed1ab_0
379 | - tk=8.6.12=h1ccaba5_0
380 | - tktable=2.10=h14c3975_0
381 | - tomli=2.0.1=py39h06a4308_0
382 | - toposort=1.10=pyhd8ed1ab_0
383 | - traitlets=5.7.1=py39h06a4308_0
384 | - tzdata=2023c=h04d1e81_0
385 | - ubiquerg=0.6.2=pyhd8ed1ab_0
386 | - ucsc-fatotwobit=447=h954228d_0
387 | - ucsc-twobitinfo=447=h954228d_0
388 | - uritemplate=4.1.1=pyhd8ed1ab_0
389 | - urllib3=1.26.16=py39h06a4308_0
390 | - veracitools=0.1.3=py_0
391 | - verkko=1.3.1=h64afbab_0
392 | - wcwidth=0.2.5=pyhd3eb1b0_0
393 | - wfmash=0.10.3=hea8008d_2
394 | - wget=1.21.4=h91b91d3_1
395 | - wheel=0.38.4=py39h06a4308_0
396 | - winnowmap=2.03=h43eeafb_2
397 | - wrapt=1.14.1=py39h5eee18b_0
398 | - xorg-kbproto=1.0.7=h7f98852_1002
399 | - xorg-libice=1.0.10=h7f98852_0
400 | - xorg-libsm=1.2.3=hd9c2040_1000
401 | - xorg-libx11=1.8.6=h8ee46fc_0
402 | - xorg-libxt=1.3.0=hd590300_0
403 | - xorg-xextproto=7.3.0=h0b41bf4_1003
404 | - xorg-xproto=7.0.31=h27cfd23_1007
405 | - xz=5.2.10=h5eee18b_1
406 | - yahs=1.2a.2=he4a0461_2
407 | - yaml=0.2.5=h7b6447c_0
408 | - yarl=1.8.1=py39h5eee18b_0
409 | - yte=1.5.1=pyha770c72_2
410 | - zlib=1.2.13=hd590300_5
411 | - zstd=1.5.5=hc292b87_0
412 |
--------------------------------------------------------------------------------
/clust.json:
--------------------------------------------------------------------------------
1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}'
2 | {
3 | "__default__" :
4 | {
5 | "account" : "sbatch -N 1 -n 1 -c 96 -p Partition",
6 | "jobs" : "59"
7 | },
8 | "hifi_fastp" :
9 | {
10 | "account" : "sbatch -N 1 -n 1 -c 16 -p Partition",
11 | "jobs" : "59"
12 | },
13 | "ont_fastp" :
14 | {
15 | "account" : "sbatch -N 1 -n 1 -c 16 -p Partition",
16 | "jobs" : "59"
17 |     }
18 | }
19 |
--------------------------------------------------------------------------------
/conf_ck.yaml:
--------------------------------------------------------------------------------
1 | HiFi_reads_merge:
2 | SPART_PATH/example/hifi/HiFi.fastq
3 | ONT_reads_merge:
4 | SPART_PATH/example/ont/ONT.fastq
5 | mitochondrion:
6 | SPART_PATH/example/mt/mitochondrion.fasta
7 | chloroplast:
8 | SPART_PATH/example/cp/chloroplast.fasta
9 | hic_dir:
10 | SPART_PATH/example/hic
11 | SPART_dir:
12 | SPART_PATH
13 | hic_enzyme:
14 | GATC
15 | hic_enzyme_ligation_site:
16 | GATCGATC
17 | verkko_assemble:
18 | SPART_PATH/example/verkko/verkko.fasta
19 | pcrfree_r1:
20 | SPART_PATH/example/pcr/PCRFREE_1.fq
21 | pcrfree_r2:
22 | SPART_PATH/example/pcr/PCRFREE_2.fq
23 | google_deepvariant_latest-gpu_sif:
24 | google_deepvariant_latest_gpu_sif
25 | DIR:
26 | SPART_PATH/example/hifi
27 | DIRont:
28 | SPART_PATH/example/ont
29 | WORKDIR:
30 | SPART_PATH/example/test/
31 |
--------------------------------------------------------------------------------
/contacts.md:
--------------------------------------------------------------------------------
1 | Shoucheng Liu (liusc_work@163.com)
2 |
3 | Xiaopeng Li (xiaopeng.li@pku-iaas.edu.cn)
4 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/README.md:
--------------------------------------------------------------------------------
1 | # Install
2 | git clone https://github.com/liushoucheng/SPART.git
3 |
4 | cd SPART
5 |
6 | conda env create -f SPART.yaml
7 |
8 | conda activate spart
9 |
10 | # Dependencies
11 |
12 | List of tools assumed loadable or accessible with no path are:
13 |
14 | * [Bionano DLS map]( https://bionano.com)
15 |
16 | * [HiC-Pro v3.1.0]( https://github.com/nservant/HiC-Pro)
17 |
18 | * [_submit_telomere.sh]( https://github.com/VGP/vgp-assembly/blob/master/pipeline/telomere/_submit_telomere.sh)
19 |
20 | * [Medaka]( https://anaconda.org/bioconda/medaka)
21 |
22 | * [racon]( https://anaconda.org/bioconda/racon)
23 |
24 | * [hisat2]( https://github.com/DaehwanKimLab/hisat2)
25 |
26 | * [DeepVariant v1.5.0-gpu]( https://github.com/google/deepvariant)
27 |
28 | * [PEPPER-Margin-DeepVariant v0.8-gpu]( https://github.com/kishwarshafin/pepper)
29 |
30 | * [hap.py v0.3.15]( https://github.com/Illumina/hap.py)
31 |
32 | * [vcf_merge_t2t.py](https://github.com/kishwarshafin/T2T_polishing_scripts/blob/master/polishing_merge_script/vcf_merge_t2t.py)
33 |
34 | # Running pipeline with snakemake
35 | Exclude Verkko, Bionano DLS Map, Telomere determination and patch, Centromeric region analysis, Variant calls and Evaluation
36 |
37 | sed -i "s#^ SPART_PATH# ${PWD}#g" conf_ck.yaml
38 |
39 | HiC_enzyme=" GATC"
40 |
41 | sed -i "s#^ hic_sca_enzyme# ${HiC_enzyme}#g" conf_ck.yaml
42 |
43 | HiC_ligation_site=" GATCGATC"
44 |
45 | sed -i "s#^ hic_sca_ligation_site# ${HiC_ligation_site}#g" conf_ck.yaml
46 |
47 | snakemake -s SPART.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs $threads --rerun-incomplete --restart-times 1 -np --rulegraph |dot -Tpng > rule.png
48 |
49 | configfile: The config file can be used to define a dictionary of configuration parameters and their values.
50 |
51 | cluster-config: A JSON or YAML file that defines the wildcards used in 'cluster' for specific rules.
52 |
53 |

54 |
55 |
56 | ## Output files
57 | Please see the complete [documentation]( https://github.com/liushoucheng/SPART/tree/main/example).
58 |
59 |
--------------------------------------------------------------------------------
/docs/source/README1.md:
--------------------------------------------------------------------------------
1 | # 00_Contig screen
2 | ## Fastp :was used to filter adapter sequences, primers and other low quality sequence from raw sequencing reads.
3 | SPART/00_Contig_screen/fastp.sh $HiFi_reads $ONT_reads
4 | ## Hifiasm
5 | SPART/00_Contig_screen/hifiasm.sh $HiFi_reads $ONT_reads $output_prefix
6 | ## Verkko
7 | SPART/00_Contig_screen/verkko.sh $output_prefix $HiFi_reads $ONT_reads $threads $memory
8 | ## Flye
9 | SPART/00_Contig_screen/flye.sh $ONT_reads $output_prefix $threads
10 | ## Remove MT & CP
11 | SPART/00_Contig_screen/rm_mt_cp.sh $mitochondrion $chloroplast $ref
12 | # 01_Contig scaffolding
13 | ## Bionano
14 | SPART/01_Contig_scaffolding/Bionano_DLS_map.sh threads bnx ref_cmap prefix xml Bio_dir cluster_xml ref bio_camp merge_xml RefAligner
15 | ## Hi-C
16 | SPART/01_Contig_scaffolding/HiC-Pro.sh ref ref_prefix hicpro_data hicpro_config hicpro_outdir
17 |
18 | SPART/01_Contig_scaffolding/yahs.sh enzyme ref bed/bam/bin profix
19 | # 02_Gap patching
20 | SPART/02_Gap_patching/wfmash_ragtag.sh prefix ref region
21 |
22 | ## Manual operation
23 |
24 | cd ragtag_output
25 |
26 | perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5
27 |
28 | **Manually edit the ragtag.patch.debug.filtered.paf file. Keep the high-quality contig and preserve the location of the only high-confidence match in ragtag.patch.debug.filtered.paf that matches the sequence at both ends of the gap.**
29 |
30 | perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf -start seq00000000 -end seq00000001 -o test.agp
31 |
32 | **test.agp is then merged into ragtag.patch.agp and the FASTA is generated.**
33 |
34 | ## telomere patching
35 | We used _submit_telomere.sh on ONT reads >100 kb. ONT reads with telomere sequence mapping to this locus based on minimap2 alignments were manually identified. The longest was selected as the template; all others were aligned to it and polished with Medaka:
36 |
37 | medaka -v -i ONT_tel_reads.fasta -d longest_ont_tel.fasta -o ont_tel_medaka.fasta
38 |
39 | Telomere signal in all HiFi reads was identified with the commands:
40 |
41 | _submit_telomere.sh hifi_reads.fasta
42 |
43 | Additional HiFi reads were recruited from a manual analysis. We looked for trimmed tips that could extend. All reads had telomere signal and were aligned to the medaka consensus and polished with Racon with the commands:
44 |
45 | minimap2 -t16 -ax map-pb ont_tel_medaka.fasta hifi_tel.fasta > medaka.sam
46 |
47 | racon hifi_tel.fasta medaka.sam ont_tel_medaka.fasta > racon.fasta
48 |
49 | Finally, the polished result was patched into the assembly with ragtag patch or manually patched.
50 | ### Citation
51 | https://github.com/marbl/CHM13-issues/blob/main/error_detection.md.
52 | ## Centromeric region analysis
53 |
54 | SPART/02_Gap_patching/Centromeric_region_analysis.sh workdir FASTA INDEX prefix CHIP1 CHIP2 threads
55 |
56 | # 03_Polishing
57 | SPART/03_Polishing/calsv_snv.sh workdir ref threads
58 | # 04_Evaluation
59 | ## BUSCO
60 | SPART/04_Evaluation/BUSCO.sh ref prefix
61 | ## mapping rates & coverages
62 | SPART/04_Evaluation/mapping_rates_coverages.sh hybrid_bam single_bam ont_bam
63 | ## LTR
64 | SPART/04_Evaluation/ltr.sh ref prefix
65 | ## QV
66 | SPART/04_Evaluation/qv.sh query ref
67 | ## BACs
68 | SPART/04_Evaluation/bac.sh bac_reads ref_chr
69 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | # -- Project information -----------------------------------------------------
7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8 |
9 | project = 'SPART'
10 | copyright = '2023, Shoucheng Liu'
11 | author = 'Shoucheng Liu'
12 |
13 | # -- General configuration ---------------------------------------------------
14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
15 |
16 | extensions = ['recommonmark', 'sphinx_markdown_tables']
17 |
18 | templates_path = ['_templates']
19 | exclude_patterns = []
20 |
21 |
22 |
23 | # -- Options for HTML output -------------------------------------------------
24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
25 |
26 | import sphinx_rtd_theme
27 | html_theme = "sphinx_rtd_theme"
28 | html_theme_path = ['/home/liusc/software/miniconda3/lib/python3.9/site-packages']
29 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. SPART documentation master file, created by
2 | sphinx-quickstart on Thu Oct 12 17:50:51 2023.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | SPART
7 | =================================
8 | SPART, a Semi-automated pipeline for assembling reference sequence of telomere-to-telomere (T2T).
9 |
10 | .. image:: pipeline.jpg
11 |
12 |
13 | Quick install and start
14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15 | .. toctree::
16 | :maxdepth: 2
17 |
18 | README
19 |
20 | Run step by step
21 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
22 | .. toctree::
23 | :maxdepth: 2
24 |
25 | README1
26 |
--------------------------------------------------------------------------------
/docs/source/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/docs/source/pipeline.jpg
--------------------------------------------------------------------------------
/docs/source/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx-autobuild==2021.3.14
2 | sphinx-markdown-tables==0.0.17
3 | sphinx-rtd-theme==1.3.0
4 | sphinxcontrib-applehelp==1.0.7
5 | sphinxcontrib-devhelp==1.0.5
6 | sphinxcontrib-htmlhelp==2.0.4
7 | sphinxcontrib-jquery==4.1
8 | sphinxcontrib-jsmath==1.0.1
9 | sphinxcontrib-qthelp==1.0.6
10 | sphinxcontrib-serializinghtml==1.1.9
11 | recommonmark==0.7.1
12 |
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Running pipeline with snakemake (excludes Verkko, Bionano DLS Map, Telomere determination and patch, Centromeric region analysis, Variant calls and Evaluation):
2 | sed -i "s#^ SPART_PATH# ${PWD}#g" conf_ck.yaml
3 |
4 | snakemake -s SPART.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs $threads --rerun-incomplete --restart-times 1 -np --rulegraph |dot -Tsvg > rule.svg
5 |
6 | **configfile**:The config file can be used to define a dictionary of configuration parameters and their values.
7 |
8 | **cluster-config**:A JSON or YAML file that defines the wildcards used in 'cluster' for specific rules.
9 | # Output files
10 | **Workdir/fastp**:Filtered adapter sequences, primers and other low-quality sequences from raw HiFi and ONT sequencing reads.
11 |
12 | **Workdir/hifiasm_hybrid/hybrid.all.asm.p_ctg.fa**:Hifiasm generated a preliminary contig genome assembly.
13 |
14 | **Workdir/flye/assembly.fasta**:Flye generated the ONT UL reads assembly.
15 |
16 | **Workdir/hifiasm_hybrid/hybrid.remove_cp_mt.fa**:Contigs with at least 50% of their bases covered by alignments were considered to be chloroplast or mitochondria genome sequences and were removed from the assembly.
17 |
18 | **Workdir/hic_hybrid/hic_hybrid.bam**:Hi-C data were classified as valid or invalid interaction pairs.
19 |
20 | **Workdir/yahs_hybrid/yahs_hybrid.fa**:Only valid interaction pairs were retained for subsequent assembly and scaffolding into chromosomes.
21 |
22 | **Workdir/patch_flye/patch_single_hybrid_flye.fa**:Assembly gaps in chromosome scaffolds were directly filled by the corresponding Flye assembly sequences.
23 |
24 | **Workdir/patch_verkko/patch_single_hybrid_flye_verkko.fa**:Assembly gaps in chromosome scaffolds were directly filled by the corresponding Verkko assembly sequences.
25 |
26 | **Workdir/hybrid/hybrid.bam**:Alignment data file between patch_single_hybrid_flye_verkko.fa and HiFi reads.
27 |
28 | **Workdir/hybrid_hifi_pcr/pcr.bam**:Alignment data file between patch_single_hybrid_flye_verkko.fa and PCR-FREE reads.
29 |
30 | **Workdir/hybrid_hifi_pcr/hybrid.bam**:Merged Workdir/hybrid/hybrid.bam and Workdir/hybrid_hifi_pcr/pcr.bam.
31 |
32 | **Workdir/ont_merge/q10l120k.bam**:Alignment data file between patch_single_hybrid_flye_verkko.fa and ONT reads.
33 |
34 |
35 |

36 |
37 |
--------------------------------------------------------------------------------
/example/pcr/PCRFREE_1.fq:
--------------------------------------------------------------------------------
1 | @E100053086L1C001R0020000056/1
2 | ACCAGGAATATCAATGAACCCCATTCTTGCAATTGCTCAGGATACCCTCTTTTAGCTGCTAGGTCTATTTCTTAGTTCAAGATCCCTCTTACTAACTGGAATAAAAGAATTAGTAGATCTGTTCCGCCCAAAATGGGAATGGGCGCTAGG
3 | +
4 | GGFGGGGGGGFGGGGGFGGGFHCFGFGFGGFGGEFFFGGGEGEGFGGGFGEGGGGGEGGEGGG>GGEGFFFGFEGGFFGEFEGFGGFGFGFGGFGGGGFFGGGFGFGGBGGG
5 | @E100053086L1C001R0020000058/1
6 | CAAAGCTAGGATGTCGGGTCTCGTTATGTAGACCTTTAGCCCACTGGCACTTAGATATTGCACTCTCCTCGGCAACTTCATCAAGGTCCTATAACATGTTCCTATAGTAGCTAGGCGGCGCTTTATTGTTTGCCACCTGCCTCATCTTGG
7 | +
8 | FGGFFGGGGFGGGFGGFGFGFGGGGGGGFGGFGGGFFGGFGGFGGGFGGGGGGFGGGGGGGGGGGGGGGGFGFGGGGGGGFGGFGGGGGGGGGGGGGGGGGGGGGGGEGGGGGFGFGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
9 | @E100053086L1C001R0020000070/1
10 | AAAGCACGGCACCTCCGAGTTCTGCACACGTTCGGCTCGGTGACGTCCTCGCCTTCTCGATCCAACAAGAGGGGCGAAGTAGTAGATGAGTTCCGGCAGCACGACGGCGTGGTGACGGTGTTGGTGAAGAACAATCTTCGCAGGGCTTCG
11 | +
12 | FFGGGGGGFGGGGGFGGFFFFGGGFGGFGGFGGEGGFGGFFFFGGFFFGGGFGFGGGGAEGGFGGFGGGFFFG=GFFFGFGGFFGFFEFFFFGGGFFFGFFGGFGDFFFFGFFFCGGGFGFGGFEFFFGFFGGFGGFGFFGEDGFFGFGF
13 | @E100053086L1C001R0020000093/1
14 | TCGAACCCTTGCAGTTGAAGAAAGAGCCTACTCCTGGTACTGTATATTAGTGATAGGGGTGTATACAAAGTACACGTGAATACCAGTCATTGTGCGTGTGTGTATACTATCGACGAACTAGCCCCCAAGCTTTTATAACATACTGGGGGC
15 | +
16 | FFGFFGFGFFFEFFFFFFFFFFFFFGFGFFGFFFFFFFGFGGFGFGFGGFFFFFGFDFGFGFFFGFFFFFFGFGFEFFFFFFGFGFFFGFFGFFFFFGFGFGFGFFGFGGFFEFFFFGFFFFGGFFFFFFFFGFGGFGFGGGGGFFFDFF
17 | @E100053086L1C001R0020000112/1
18 | CCCTAGATATTTAATGACTGATGGTGGTTCACATTTTATTCATGGTGCTTTCCGTAAAATGCTTGCTAAATATGATGTTAATCATAGAATTGCATCTCCTTATCACCCTCAGTCTAGTGGTCAAGTAGAATTGAGTAATAGAGAACTCAA
19 | +
20 | FGFGGFGGHGGGGGGGFFGFGFGFFGFFGGGGGGGGGGGGGGFGFFGGGGGGGGGGGGFFFGGGGFGGGGGGGGGGGGGGGGGGGGFFGGGFGGGGGGGGGGGGGGGGGGGFGGGGFGFFGFGGGGGFGGGGGGGGGGGGDGEFFGGGGG
21 | @E100053086L1C001R0020000131/1
22 | TAAGTACTCTACTACTACTATATTAATTGGAGGCACATATTAGGTTTTATATTTGTTTACGTGTATGCCCCGAGACCTTCCAACTAGACATGTCTTTCTCTCCAGGTCGCACTTTGTATCGTTCATACCACTAGTACAGTAGTACTACTA
23 | +
24 | GGFGFGGGGGGGFGFFGGGGGGGGGFFGG7DG@GGFGGGGGGEFEGGGFGGGGGGFGGGGFGGFGGGGGGGGEDFFGGGGEGFGGGFGGGFGFGGFGGGFGGFGEGAGGGEGGGG*FEGG@FGGEGGGGFGGG5GGGGFGGGGGGGGGFG
25 | @E100053086L1C001R0020000150/1
26 | AGTGATTTTCACGATTAACGAACCCTGGGATGCGTTTACGTGTCGGTCATCAACACTTGTAGTTTTGGCTGATTCTGGCCCGTTTCTTGGACTATTACTCACTGATTTGGGGTCCGTGAGTGAGTTCCATGATTTTCGAACCCCGGGGTG
27 | +
28 | GEFFFFFFGGGFFFFGGFFGGFGGFFFFFEFGGFFFGGGGGG9GFGFGGFGFFGGGGFGFGGGGFGFDFGGEGFGGGFGFGFFFGGGGGAEGGGGGGGGGGGFGFGGGGEFFFFGGEGFGGGFFFGGGGGGGGFGGGFFFGGGG?FFGFG
29 | @E100053086L1C001R0020000158/1
30 | CAGTTCCACAGAGTCATTCCAGGGAAGAAGGCAAAATCACTCGGTCAGATTGCTCTGGATGTGGTTTTTGGTGACTCAAAGAATTTTCGAAAGGAGAAACTGGCGTTTGAAGTGGTAGACTTTCACAATGCCTATCATGCTATCCTCGGC
31 | +
32 | FFGGGGGGGGFFFFGGGGGGGFFEFGFFFGGGGGFFGGGGGGGFFGGFFFGGFGGGGFGGGGFFGGGGGGFGGFGGGGGFFGGGGGGGFGGGFFFGGGDFGGFEGGFGG=G@EGFFFEFGGGGFGGGGFGFGGGFGGFGGGDEGGGGDCF
33 | @E100053086L1C001R0020000169/1
34 | ATGTTCTCGCTCCTGCTCTCTTTTACTGCATTAAGACAACGCGATTCAAACTGTTGTGTGCTACGGTAGTTGAACCCATTTCCTCTGCATGACCTGTCATTGCCACAGTAACTAGATGAAACCCACTAGCATGTGTAGGAGTTGATTGAG
35 | +
36 | GDGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGFGGEGGGGEGGFG>GGGFGGGGEGGGGGFGGGGGGGGGGGGGGGGGGGGGGFGGGDGGGGGF1@GFGCG?DGGFEFGGGFGGG?GGGCGGGGF?F/GGGGFGGFF
37 | @E100053086L1C001R0020000179/1
38 | TAAATAGGACTAGCCACCATAAGGTAGAGGCATCTAGAGACCAACCAGAGAGAGAGAAAAAAGCGAGTGAACTCACCCAAGCAGTTCATCACACCATCTCAAGAACAGCCCCTCGCGAGGCTGTTCTTCCTTTGTACTGTTCACTATCAG
39 | +
40 | GGGGGGGGFFGFGGGGGGGGGGGFFGGFFFGGGFFGGFF>G@GGGGGEGGFGGGGGFGFFFFFGGGFFGFFFGGGGGGGFGGGGFGGGGGGGGGGGGGGGGDGFEGGFGGGGGGGFFEGDGGAFGGGGGGGGGBGFGGGGFGFGG@FCFF
41 |
--------------------------------------------------------------------------------
/example/pcr/PCRFREE_2.fq:
--------------------------------------------------------------------------------
1 | @E100053086L1C001R0020000056/2
2 | ATGGAATGAACTTATAATCTGATGATCGAGTCGATTCCATGATTATAAGTTCATAACCCTAGCGCCCATTCCCATTTTGGGCGGAACAGATCTACTAATTCTTTTATTCCAGTTAGTAAGAGGGATCTTGAACTAAGAAATAGACCTAGC
3 | +
4 | FGHFGGGGGGGIGFGGGGGGGGFGGGGGGGGGGGBFFFGGGGFGFFGGGGGGHGGGGFGGGGGGFGFGGGGGGFGGGGGGFFGGEDGGGGFEHGGGGFGGFGGFGGGGGGGGGEDGGFFGGGGGGGGGGGAGGGHGAEEGGGGGGFHFGG
5 | @E100053086L1C001R0020000058/2
6 | GTTAGCAGATCGGTTTTGGCATGACCTTTGTGCCAGGCCATATCTTTGTTTTCAGCAGCATCATCCCCTTCACTGATTCGACCGACCATCTAGGACAAGTCGAGAATTTTGCTCCTGGACAAATCATTAGGTTTGGCAACTTGGAGTATG
7 | +
8 | GGGGGGGGGGGFFFGGGGFGGGGGGGGGGGFGGGGFFGGGGGGGGGGGGGGGGGGGGGFGGGGGGGFGGGGGGGGFGGGGFGGGFGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGFGGGGFGGGFGGFGGGGGGFGGGGGGFFGGGGG
9 | @E100053086L1C001R0020000070/2
10 | GGGAACCCTGGCGTCTGGCCCTGGAGTCCGAGAAGGACTCTTGCCTTTCGGGTGAAACCGACTTTGTGGAGGCTTTTACTCCAAGTTTCGACCCCAAGGCTCAACATATAAATAGAGGGGTAGGGCTAGCACCCAAGGCACATCAAGAAA
11 | +
12 | FFGFFFGGGGFGFG?GGFFGGEGFEGFFFFFGGFGFFGGGEGGFGGGGFGFFGGBFEFFFEGGGGFEGFEGF9EGGGGFGGGFGECFGGFFFGEG6FFEFEGFFFGGGGGGGGGFFF?FFFFFFFFDFFGGGGGFFF?EGFGFGGFFFEF
13 | @E100053086L1C001R0020000093/2
14 | CATGAAGACTCATGGGTCGGCTCCATCCTGCACCACATATAGGTTGGACCTAGATGCGTTATAGGAAGACCCATGTGGAGGACTCGGCGTACAACACGACGCTACTAGAGTCGCGGAGGACTCTATCCCCACTGGTGATAAGCCGACTAT
15 | +
16 | FGGGFFFFFGFGGFFFFFBFFFGFGFFGFGFFFGFFGFGFFFFFFFFFFFGFFFGGFGFFGFFFFFFFFEGGFGGFFFFFFEFGGFFCFFFFFFGFFGEFFFFGFGFFFFEFDFFADFFFFGEFFFFFFGFGGEEFFDFFFFEEDEFDGF
17 | @E100053086L1C001R0020000112/2
18 | GAGAGGTAAGTGACATGCTTTTCCATAAACCATTTTATACGGAGACATACCCATAGGATTTTTATATGCAGTTCTATAGGCCCATAATGCATCATCAAGTTTCTTGGGCCAATTCTTTCTAGACCTATTAACAGTCTTTTGCAAAATTAA
19 | +
20 | GGGGFBGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGFGGGGGGGGGG?GGGGGGGGGGGGGFGGGGGGGGGGGGG
21 | @E100053086L1C001R0020000131/2
22 | GTTTTTATTTCTAATGAATGCTAGCGTTTCAACTCTTACGACGAAGGAGTGCCAAACACACGGCACGTGCTGGATGTCGCACACGTCAGCGAAAGGAGTGGGACATGAACAGCGTGGTAGAGCGTCTCCACTTCTGGGTCGGAGACCACA
23 | +
24 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGFGGGGG
25 | @E100053086L1C001R0020000150/2
26 | TTCAACAACTATGGTGATCGCTTCGGAACCCCAAAACGGTGAGTAATAGTGCACGAAACGGGTCAGAACTGGCCAAAACTTCGAGTGTTGGTGACCGACACGTAAGCACACATTGGGGTTCGACAACCATGGAAATCGCTTTGTGACCCC
27 | +
28 | GGFFFFGGFGFFFEFGFFFFFFFFFFFEFFFFFFDDFECFFEDFFFFFFFFCFFFFFFFFBCFEFEDFFFFECFFFEEFFFFEFCEFFFGBFFDFFFFFFFFFEEGFFFGFGGFF?FEEFFFDDEFFGFFFCFE@FFFFGEGGFG7FFFG
29 | @E100053086L1C001R0020000158/2
30 | GTTGGCTTCTTGGATTGGAGCAAATCAATCACATCTGCATTTCTCTTGTATTCTTCCAGTTCCACCATGGCCATCTGCTCATCAGCAATTTTGGATCCTTGCTAAACGCACTCGGCTCTTTGCCTATTCCATGTAATTATGATTACGCCC
31 | +
32 | FFFFEFGGEGGGCGGGFFFFFGEFFFGFFFFFGGGGFFGFFGFGFGGFFGGGFGGGGGGGGGFGGGFFGCFGFGFGGFGGGGFGFGGFFFGGEFFGFGGGFFGGFFGFFGFGGFDDGGGGGGFFGFGGGGGGGGGFFGGFGFGGFGBFGG
33 | @E100053086L1C001R0020000169/2
34 | CCAAATCGTGTTCACCACACCACTCTCATTACCGACGTAATCATTTCACTCCAGCCCATCACCCAGATGAACCAGACCTGACACGACTCTAAGCATAGCAGGCATAGCAAGGTAGGAACAACACATACATATGGCTCAATCAACTCCTAC
35 | +
36 | GGGGGGGGGGGGFGGFGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGFGGFGGGGGFGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGFFGGGGGGGFGFFFGFGGGGGGGFGGGGGGGGGGFGGDG
37 | @E100053086L1C001R0020000179/2
38 | TCTGTCAGACCCTTGAGGTTCGAACACTGGGGTGCACACGAAGATCTCTCCCCTACCAGCTCACGTCTCGAAGTCTCGCAAAGATCTAAGCAAGAAAGATGAACACATAAGGGACACGAGATTTATACTAGTTCAGGCCACCATTGTGGT
39 | +
40 | GGGGGGGGGGGGGGGCGFGGGGFGGGGGGFFEGGGGGGFFFGGFGGGGGGGGGGGGGGFGGGGFGGGGGGGFFGGGGGGGGFFFGGGGGGGGGFGGGGGGGGGGGGGGGFFEFFFGGFGGGGGGGGGGGGFGGGGGEFGDGGGGGGGGFG
41 |
--------------------------------------------------------------------------------
/example/rule.svg:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
272 |
--------------------------------------------------------------------------------
/pic/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/pic/pipeline.jpg
--------------------------------------------------------------------------------
/pic/rule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/pic/rule.png
--------------------------------------------------------------------------------