├── .gitignore ├── .idea ├── .gitignore ├── .name ├── SPART.iml ├── SPART.py ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── .readthedocs.yaml ├── 00_Contig_screen ├── chloroplast.paf ├── chloroplast.txt ├── fastp.sh ├── flye.sh ├── gemma_los.py ├── hifiasm.sh ├── mitochondrion.paf ├── mitochondrion.txt ├── rm_mt_cp.sh └── verkko.sh ├── 01_Contig_scaffolding ├── Bionano_DLS_map.sh ├── HiC-Pro.sh ├── hicpro_config.txt └── yahs.sh ├── 02_Gap_patching ├── Centromeric_region_analysis.sh ├── chip-seq.py ├── paf_filter.pl ├── renameagp.pl └── wfmash_ragtag.sh ├── 03_Polishing ├── bwa_winnowmap.py ├── callsv_snv.py ├── calsv_snv.sh ├── clust.json ├── clust_align.json ├── conf_ck.yaml └── conf_ck_align.yaml ├── 04_Evaluation ├── BUSCO.sh ├── bac.sh ├── ltr.sh ├── mapping_rates_coverages .sh ├── qv.sh ├── synteny.sh ├── while.sh └── winnowmap.sh ├── 05_Annotation ├── Snakefile ├── clust.json ├── config.yaml └── modules │ ├── __pycache__ │ ├── fasta.cpython-310.pyc │ ├── fasta.cpython-35.pyc │ ├── fasta.cpython-39.pyc │ ├── mygff.cpython-310.pyc │ └── mygff.cpython-35.pyc │ ├── fasta.py │ └── mygff.py ├── LICENSE ├── README.md ├── SPART.py ├── SPART.yaml ├── clust.json ├── conf_ck.yaml ├── contacts.md ├── docs ├── Makefile ├── make.bat └── source │ ├── README.md │ ├── README1.md │ ├── conf.py │ ├── index.rst │ ├── pipeline.jpg │ └── requirements.txt ├── example ├── README.md ├── cp │ └── chloroplast.fasta ├── mt │ └── mitochondrion.fasta ├── pcr │ ├── PCRFREE_1.fq │ └── PCRFREE_2.fq ├── rule.svg └── verkko │ └── verkko.fasta └── pic ├── pipeline.jpg └── rule.png /.gitignore: -------------------------------------------------------------------------------- 1 | docs/build/ 2 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/.idea/.gitignore -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | SPART.py -------------------------------------------------------------------------------- /.idea/SPART.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/SPART.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | b={} 5 | hifi_single={} 6 | hifi_mix={} 7 | e={} 8 | d={} 9 | HiFi_hybrid_all=config["HiFi_reads_merge"] 10 | ONT_all=config["ONT_reads_merge"] 11 | mitochondrion=config["mitochondrion"] 12 | chloroplast=config["chloroplast"] 13 | hic_hybrid_dir=config["hic_dir"] 14 | SPART_dir=config["SPART_dir"] 15 | hic_hybrid_enzyme=config["hic_enzyme"] 16 | hic_enzyme_ligation_site=config["hic_enzyme_ligation_site"] 17 | verkko_fa=config["verkko_assemble"] 18 | pcrfree_hybrid_r1=config["pcrfree_r1"] 19 | pcrfree_hybrid_r2=config["pcrfree_r2"] 20 | google_deepvariant_latest_gpu_sif=config["google_deepvariant_latest-gpu_sif"] 21 | W=config["WORKDIR"] 22 | DIR=config["DIR"] 23 | DIRont=config["DIRont"] 24 | for dirs in os.listdir(DIR): 25 | b2 = dirs.split(".fastq") 26 | if ".fastq" in dirs: 27 | absPath = os.path.join(DIR, dirs) 28 | hifi_mix[b2[0]]=absPath 29 | 30 | for dirs in os.listdir(DIRont): 31 | b2 = dirs.split(".fastq") 32 | if ".fastq" in dirs: 33 | absPath = os.path.join(DIRont, dirs) 34 | e[b2[0]]=absPath 35 | 36 | rule final: 37 | input: 38 | W+"hybrid_hifi_pcr/hybrid.bam", 39 | W + "ont_merge/q10l120k.bam" 40 | 41 | rule hifi_fastp: 42 | input: 43 | HiFi_hybrid_all 44 | output: 45 | W+"fastp/hybrid.fq" 46 | shell: 47 | "fastp -w 16 -i 
{input} -o {output}" 48 | 49 | rule ont_fastp: 50 | input: 51 | ONT_all 52 | output: 53 | W+"fastp/ont.fq" 54 | shell: 55 | "fastp -q 10 -l 100000 -w 16 -i {input} -o {output}" 56 | 57 | rule hifiasm: 58 | input: 59 | hifi=W+"fastp/hybrid.fq", 60 | ont=W+"fastp/ont.fq" 61 | output: 62 | W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa" 63 | params: 64 | W+"hifiasm_hybrid" 65 | shell: 66 | """ 67 | cd {params} 68 | hifiasm -o hybrid.all.asm --primary -t 96 --ul {input.ont} -k 63 {input.hifi} 69 | awk '/^S/{{print ">"$2;print $3}}' hybrid.all.asm.p_ctg.gfa > {output} 70 | """ 71 | 72 | rule flye: 73 | input: 74 | W+"fastp/ont.fq" 75 | output: 76 | W + "flye/assembly.fasta" 77 | params: 78 | W 79 | shell: 80 | """ 81 | cd {params} 82 | flye --nano-hq {input} --read-error 0.1 -g 5.4g --asm-coverage 80 --scaffold --out-dir flye --threads 96 --no-alt-contigs 83 | """ 84 | 85 | rule rm_mt_cp: 86 | input: 87 | hybrid=W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa", 88 | mt=mitochondrion, 89 | cp=chloroplast 90 | output: 91 | W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa" 92 | params: 93 | W+"hifiasm_hybrid" 94 | shell: 95 | """ 96 | cd {params} 97 | minimap2 -t 96 -x asm5 {input.mt} {input.hybrid}> mitochondrion.paf 98 | minimap2 -t 96 -x asm5 {input.cp} {input.hybrid}> chloroplast.paf 99 | python gemma_los.py mitochondrion.paf > mitochondrion.txt 100 | python gemma_los.py chloroplast.paf > chloroplast.txt 101 | seqkit grep -v -f chloroplast.txt {input.hybrid} > wheat_remove_cp.fa 102 | seqkit grep -v -f mitochondrion.txt wheat_remove_cp.fa > {output} 103 | """ 104 | 105 | rule hicpro: 106 | input: 107 | hic=hic_hybrid_dir, 108 | ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa" 109 | output: 110 | W+"hic_hybrid/hic_hybrid.bam" 111 | params: 112 | dir=W+"hic_hybrid", 113 | prefix="hybrid.remove_cp_mt", 114 | spart_dir=SPART_dir, 115 | enzyme=hic_hybrid_enzyme, 116 | LIGATION_SITE=hic_enzyme_ligation_site 117 | shell: 118 | """ 119 | cd {params.dir} 120 | ln -s {input.ref} ./ 121 | 
bowtie2-build --large-index --threads 96 {params.prefix}.fa {params.prefix} 122 | samtools faidx {params.prefix}.fa 123 | awk '{{print $1 "\t" $2}}' {params.prefix}.fa.fai > genome_sizes.bed 124 | python ./HiC-Pro/bin/utils/digest_genome.py -r ^{params.enzyme} -o enzyme.bed {params.prefix}.fa 125 | makeblastdb -in {params.prefix}.fa -dbtype nucl -parse_seqids -out {params.prefix} 126 | cp {params.spart_dir}/01_Contig_scaffolding/hicpro_config.txt ./ 127 | sed -i 's#^N_CPU = #N_CPU = 96#g' hicpro_config.txt 128 | sed -i 's#^BOWTIE2_IDX_PATH = #BOWTIE2_IDX_PATH = {params.dir}#g' hicpro_config.txt 129 | sed -i 's#^REFERENCE_GENOME = #REFERENCE_GENOME = {params.prefix}#g' hicpro_config.txt 130 | sed -i 's#^GENOME_SIZE = #GENOME_SIZE = {params.dir}/genome_sizes.bed#g' hicpro_config.txt 131 | sed -i 's#^GENOME_FRAGMENT = #GENOME_FRAGMENT = {params.dir}/enzyme.bed#g' hicpro_config.txt 132 | sed -i 's#^GENOME_FRAGMENT = #GENOME_FRAGMENT = {params.dir}/enzyme.bed#g' hicpro_config.txt 133 | HiC-Pro -i {input.hic} -c hicpro_config.txt -o {params.dir} 134 | cd bowtie_results/bwt2 135 | for item in dir {params.dir}/bowtie_results/bwt2/*/*.bwt2pairs.bam; do samtools sort -m 1500M -n -@ 96 $item > $item.bam; done 136 | samtools merge -@ 96 -o {output} {params.dir}/bowtie_results/bwt2/*/*.bwt2pairs.bam.bam 137 | """ 138 | 139 | rule yahs: 140 | input: 141 | bam=W+"hic_hybrid/hic_hybrid.bam", 142 | ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa" 143 | output: 144 | W + "yahs_hybrid/yahs_hybrid.fa" 145 | params: 146 | dir = W + "yahs_hybrid", 147 | prefix = "hybrid_bam", 148 | enzyme = hic_hybrid_enzyme 149 | shell: 150 | """ 151 | cd {params.dir} 152 | yahs -e {params.enzyme} {input.ref} {input.bam} -o {params.prefix} 153 | cp {params.dir}/yahs_bam_scaffolds_final.fa ./yahs_hybrid.fa 154 | """ 155 | 156 | rule patch_flye: 157 | input: 158 | single_hybrid=W + "yahs_hybrid/yahs_hybrid.fa", 159 | flye=W + "flye/assembly.fasta" 160 | output: 161 | W + 
"patch_flye/patch_single_hybrid_flye.fa" 162 | params: 163 | dir = W + "patch_flye", 164 | prefix = "single_hybrid_flye", 165 | shell: 166 | """ 167 | cd {params.dir} 168 | wfmash {input.single_hybrid} {input.flye} > {params.prefix}.paf 169 | mkdir ragtag_output 170 | cd ragtag_output 171 | ln -s ../{params.prefix}.paf ragtag.patch.asm.paf 172 | cd .. 173 | ragtag.py patch -f 10000 --remove-small {input.single_hybrid} {input.flye} 174 | cp {params.dir}/ragtag_output/ragtag.patch.fasta {output} 175 | """ 176 | 177 | rule patch_verkko: 178 | input: 179 | single_hybrid_flye=W + "patch_flye/patch_single_hybrid_flye.fa", 180 | verkko=verkko_fa 181 | output: 182 | ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa", 183 | txt = W + "repetitive_k27.txt" 184 | params: 185 | dir = W + "patch_verkko", 186 | prefix = "single_hybrid_flye_verkko", 187 | shell: 188 | """ 189 | cd {params.dir} 190 | wfmash {input.single_hybrid_flye} {input.verkko} > {params.prefix}.paf 191 | mkdir ragtag_output 192 | cd ragtag_output 193 | ln -s ../{params.prefix}.paf ragtag.patch.asm.paf 194 | cd .. 
195 | ragtag.py patch -f 10000 --remove-small {input.single_hybrid_flye} {input.verkko} 196 | cp {params.dir}/ragtag_output/ragtag.patch.fasta {output.ref} 197 | bwa-mem2.avx512bw index {output.ref} 198 | meryl count k=27 output merylDB {output.ref} 199 | meryl print greater-than distinct=0.9998 merylDB > {output.txt} 200 | """ 201 | 202 | rule winnowmap_hifi: 203 | input: 204 | fq=W+"fastp/hybrid.fq", 205 | ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa", 206 | txt = W + "repetitive_k27.txt" 207 | output: 208 | sam=W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam" 209 | benchmark: 210 | W+"benchmarks/hifi_mix_winnowmap/{hifi_mix}.benchmark.txt" 211 | shell: 212 | """ 213 | winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output.sam} 214 | """ 215 | 216 | rule winnowmap_hifi_sort: 217 | input: 218 | W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam" 219 | output: 220 | W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam" 221 | params: 222 | W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai" 223 | benchmark: 224 | W + "benchmarks/hifi_mix_sort/{hifi_mix}.benchmark.txt" 225 | shell: 226 | "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -" 227 | 228 | rule winnowmap_hifi_sort_filter: 229 | input: 230 | W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam" 231 | output: 232 | W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam" 233 | benchmark: 234 | W + "benchmarks/hifi_mix_sort_filter/{hifi_mix}.benchmark.txt" 235 | shell: 236 | "samtools view -@32 -F0x104 -hb {input} > {output}" 237 | 238 | rule winnowmap_hifi_sort_filter_merge: 239 | input: 240 | expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix) 241 | output: 242 | W+"hybrid/hybrid.bam" 243 | benchmark: 244 | W + "benchmarks/hybrid/hybrid.benchmark.txt" 245 | shell: 246 | "samtools merge -@ 128 -l 0 {output} {input}" 247 | 248 | rule pcr_free: 249 | input: 250 | fa=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa", 251 | 
r1=pcrfree_hybrid_r1, 252 | r2=pcrfree_hybrid_r2 253 | output: 254 | W+"hybrid_hifi_pcr/pcr.bam" 255 | shell: 256 | "bwa-mem2.avx512bw mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 30G -o {output} -" 257 | 258 | rule winnowmap_hifi_filter_pcr_merge: 259 | input: 260 | hifi=expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix), 261 | pcr=W+"hybrid_hifi_pcr/pcr.bam" 262 | output: 263 | W+"hybrid_hifi_pcr/hybrid.bam" 264 | benchmark: 265 | W + "benchmarks/hybrid_pcr/hybrid.benchmark.txt" 266 | shell: 267 | "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}" 268 | 269 | rule winnowmap_ont: 270 | input: 271 | fq=W+"fastp/ont.fq", 272 | ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa", 273 | txt=W+"repetitive_k27.txt" 274 | output: 275 | W+"ont_winnowmap/{e}/{e}_q10l120k.sam" 276 | benchmark: 277 | W+"benchmarks/ont_winnowmap/{e}.benchmark.txt" 278 | shell: 279 | "winnowmap --MD -W {input.txt} -ax map-ont -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}" 280 | 281 | rule winnowmap_ont_sort: 282 | input: 283 | W+"ont_winnowmap/{e}/{e}_q10l120k.sam" 284 | output: 285 | W+"ont_sort/{e}/{e}_q10l120k.bam" 286 | params: 287 | W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai" 288 | benchmark: 289 | W + "benchmarks/ont_sort/{e}.benchmark.txt" 290 | shell: 291 | "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -" 292 | 293 | rule winnowmap_ont_sort_filter: 294 | input: 295 | W+"ont_sort/{e}/{e}_q10l120k.bam" 296 | output: 297 | W+"ont_filter/{e}_q10l120k.bam" 298 | benchmark: 299 | W + "benchmarks/ont_filter/{e}.benchmark.txt" 300 | shell: 301 | "samtools view -@ 128 -F0x104 -hb {input} > {output}" 302 | 303 | rule winnowmap_ont_sort_filter_merge: 304 | input: 305 | expand(W+"ont_filter/{e}_q10l120k.bam",e=e) 306 | output: 307 | W + "ont_merge/q10l120k.bam" 308 | benchmark: 309 | W + "benchmarks/ont_merge/benchmark.txt" 310 | shell: 311 | 
"samtools merge -@ 128 -o {output} {input}" 312 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 13 | 14 | 16 | 17 | 18 | 19 | 20 | 22 | 23 | 24 | 25 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 1689602086247 41 | 46 | 47 | 48 | 49 | 51 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.9" 13 | # You can also 
specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/source/conf.py 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | # formats: 24 | # - pdf 25 | # - epub 26 | 27 | # Optional but recommended, declare the Python requirements required 28 | # to build your documentation 29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 30 | python: 31 | install: 32 | - requirements: docs/source/requirements.txt -------------------------------------------------------------------------------- /00_Contig_screen/chloroplast.txt: -------------------------------------------------------------------------------- 1 | ptg000373l 2 | ptg000680l 3 | ptg000714l 4 | ptg001146l 5 | ptg001324l 6 | ptg001414l 7 | ptg001489l 8 | ptg001497l 9 | ptg001519l 10 | ptg001579l 11 | ptg001661l 12 | ptg001777l 13 | ptg001836l 14 | ptg001902l 15 | ptg001941l 16 | ptg001978l 17 | ptg002103l 18 | ptg002292l 19 | ptg002325l 20 | ptg002335l 21 | ptg002346l 22 | ptg002359l 23 | ptg002374l 24 | ptg002405l 25 | ptg002426l 26 | ptg002434l 27 | ptg002480l 28 | ptg002517l 29 | ptg002537l 30 | ptg002548l 31 | ptg002575l 32 | ptg002592l 33 | ptg002686l 34 | ptg002752l 35 | ptg002928l 36 | ptg002944l 37 | ptg002968l 38 | ptg002991l 39 | ptg003000l 40 | ptg003059l 41 | ptg003087l 42 | ptg003124l 43 | ptg003157l 44 | ptg003172l 45 | ptg003203l 46 | ptg003232l 47 | ptg003240l 48 | ptg003333l 49 | ptg003389l 50 | ptg003431l 51 | ptg003473l 52 | ptg003528l 53 | ptg003569l 54 | ptg003578l 55 | ptg003587l 56 | ptg003666l 57 | ptg003668l 58 | ptg003671l 59 | ptg003676l 60 | ptg003708l 61 | ptg003718l 62 | ptg003726l 63 | ptg003749l 64 | ptg003761l 65 | ptg003815l 66 | ptg003857l 67 | ptg003858l 68 | ptg003861l 69 | ptg003863l 70 | ptg003909l 71 | ptg003919l 72 | ptg003927l 73 | ptg003944l 74 | ptg003992l 75 | ptg003999l 
76 | ptg004008l 77 | ptg004051l 78 | ptg004087l 79 | ptg004090l 80 | ptg004151l 81 | ptg004168l 82 | ptg004193l 83 | ptg004211l 84 | ptg004218l 85 | ptg004246l 86 | ptg004247l 87 | ptg004258l 88 | ptg004307l 89 | ptg004308l 90 | ptg004346l 91 | ptg004362l 92 | ptg004370l 93 | ptg004411l 94 | ptg004419l 95 | ptg004433l 96 | ptg004446l 97 | ptg004492l 98 | ptg004523l 99 | ptg004526l 100 | ptg004534l 101 | ptg004546l 102 | ptg004555l 103 | ptg004558l 104 | ptg004569l 105 | ptg004576l 106 | ptg004635l 107 | ptg004654l 108 | ptg004657l 109 | ptg004677l 110 | ptg004683l 111 | ptg004687l 112 | ptg004694l 113 | ptg004714l 114 | ptg004717l 115 | ptg004718l 116 | ptg004724l 117 | ptg004746l 118 | ptg004756l 119 | ptg004796l 120 | ptg004804l 121 | ptg004824l 122 | ptg004861l 123 | ptg004874l 124 | ptg004885l 125 | ptg004916l 126 | ptg004927l 127 | ptg004968l 128 | ptg004987l 129 | ptg004989l 130 | ptg005014l 131 | -------------------------------------------------------------------------------- /00_Contig_screen/fastp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ont=$1 4 | hifi=$2 5 | 6 | fastp -w 16 -i $hifi -o hifi_clean_data.fastq 7 | fastp -q 10 -l 100000 -w 16 -i $ont -o ont_clean_data.fastq 8 | -------------------------------------------------------------------------------- /00_Contig_screen/flye.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ont=$1 4 | outdir=$2 5 | threads=$3 6 | 7 | 8 | flye --nano-hq $ont --read-error 0.1 -g 5.4g --asm-coverage 80 --scaffold --out-dir $outdir --threads $threads --no-alt-contigs -------------------------------------------------------------------------------- /00_Contig_screen/gemma_los.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import sys 5 | # 参数解析 6 | parser = argparse.ArgumentParser(description="Filter config names 
based on fixed total length threshold.") 7 | parser.add_argument("input_file", help="Path to the input paf file") 8 | parser.add_argument("--length", type=int, required=True, help="Threshold length to aligned printing") 9 | 10 | args = parser.parse_args() 11 | 12 | input_file = args.input_file 13 | length_threshold = args.length 14 | 15 | a = 0 16 | c = 0 17 | 18 | with open(input_file) as file_object: 19 | for line in file_object: 20 | line = line.strip() 21 | if not line: 22 | continue 23 | line2 = line.split("\t") 24 | if a > 0: 25 | if line2[0] == config_name: 26 | d_value += int(line2[3]) - int(line2[2]) 27 | if c == 0 and d_value > length_threshold: 28 | print(config_name) 29 | c = 1 30 | else: 31 | config_name = line2[0] 32 | d_value = int(line2[3]) - int(line2[2]) 33 | c = 0 34 | if d_value > length_threshold: 35 | print(config_name) 36 | c = 1 37 | else: 38 | a += 1 39 | config_name = line2[0] 40 | d_value = int(line2[3]) - int(line2[2]) 41 | if d_value > length_threshold: 42 | print(config_name) 43 | c = 1 44 | -------------------------------------------------------------------------------- /00_Contig_screen/hifiasm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | hifi_reads=$1 4 | ont_reads=$2 5 | pre=$3 6 | thread=$4 7 | 8 | hifiasm -k 63 -o "$pre".asm -t $thread $hifi_reads --ul $ont_reads 9 | -------------------------------------------------------------------------------- /00_Contig_screen/mitochondrion.txt: -------------------------------------------------------------------------------- 1 | ptg000019l 2 | ptg000176l 3 | ptg000203l 4 | ptg000613l 5 | ptg000791l 6 | ptg000966l 7 | ptg001425l 8 | ptg001436l 9 | ptg001452l 10 | ptg001510l 11 | ptg001591l 12 | ptg001634l 13 | ptg001685l 14 | ptg001703l 15 | ptg001782l 16 | ptg001854l 17 | ptg001887l 18 | ptg001991l 19 | ptg002013l 20 | ptg002149l 21 | ptg002290l 22 | ptg002319l 23 | ptg002341l 24 | ptg002419l 25 | ptg002477l 26 | ptg002763l 27 | 
ptg003220l 28 | ptg003271l 29 | ptg003365l 30 | ptg003378l 31 | ptg003460l 32 | ptg003535l 33 | ptg003630l 34 | ptg003829l 35 | ptg003845l 36 | ptg003854l 37 | ptg004354l 38 | ptg004503l 39 | ptg004621l 40 | ptg004942l 41 | ptg005046l 42 | -------------------------------------------------------------------------------- /00_Contig_screen/rm_mt_cp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mt=$1 4 | cp=$2 5 | ref=$3 6 | threads=$4 7 | minimap2 -t $threads -x asm5 $mt $ref> mitochondrion.paf 8 | 9 | minimap2 -t $threads -x asm5 $cp $ref> chloroplast.paf 10 | 11 | python gemma_los.py mitochondrion.paf --length 100 > mitochondrion.txt 12 | python gemma_los.py chloroplast.paf --length 100 > chloroplast.txt 13 | 14 | seqkit grep -v -f chloroplast.txt $ref > wheat_remove_cp.fa 15 | 16 | seqkit grep -v -f mitochondrion.txt wheat_remove_cp.fa > wheat_remove_cp_mt.fa 17 | -------------------------------------------------------------------------------- /00_Contig_screen/verkko.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | 4 | output=$1 5 | HiFi=$2 6 | ONT=$3 7 | threads=$4 8 | memory=$5 9 | 10 | verkko -d $output --hifi $HiFi --nano $4 --threads $threads --slurm --local-memory $memory --snakeopts "--max-jobs-per-second 10 --max-status-checks-per-second 0.5 --restart-times 1 --local-cores 128 --jobs 250" --base-k 1001 --window 971 --hifi-coverage 100 --slurm --sto-run 128 200 24 --mer-run 128 200 20000 --ovb-run 64 100 24 --ovs-run 16 35 24 --red-run 16 31 24 --mbg-run 32 0 20000 --utg-run 128 240 20000 --spl-run 128 240 20000 --ali-run 23 50 20000 --pop-run 128 240 20000 --utp-run 1 240 200000 --lay-run 1 240 20000 --sub-run 128 240 200000 --par-run 128 240 20000 --cns-run 24 0 20000 11 | -------------------------------------------------------------------------------- /01_Contig_scaffolding/Bionano_DLS_map.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | threads=$1 4 | bnx=$2 5 | ref_cmap=$3 6 | prefix=$4 7 | xml=$5 8 | Bio_dir=$6 9 | cluster_xml=$7 10 | ref=$8 11 | bio_camp=$9 12 | merge_xml=$10 13 | RefAligner=$11 14 | perl fa2cmap_multi_color.pl -i $ref -e cttaag 1 -o . 15 | python pipelineCL.py -Tn $threads -i 5 -b $bnx -r $ref_cmap -l $prefix -e w -a $xml -t $Bio_dir -y -z --species-reference other -C $cluster_xml -F 1 16 | perl hybridScaffold.pl -n $ref -b $bio_camp -u CTTAAG -c $merge_xml -r $RefAligner -o bio_hybrid -B 2 -N 2 -f -------------------------------------------------------------------------------- /01_Contig_scaffolding/HiC-Pro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ref=$1 4 | ref_prefix=$2 5 | hicpro_data=$3 6 | hicpro_config=$4 7 | hicpro_outdir=$5 8 | 9 | ln -s $ref ./"$ref_prefix".fa 10 | bowtie2-build --large-index --threads 128 "$ref_prefix".fa "$ref_prefix" 11 | 12 | samtools faidx "$ref_prefix".fa 13 | awk '{print $1 "\t" $2}' "$ref_prefix".fa.fai > genome_sizes.bed 14 | 15 | python ./HiC-Pro/bin/utils/digest_genome.py -r ^GATC -o wheat_DpnII.bed "$ref_prefix".fa 16 | makeblastdb -in "$ref_prefix".fa -dbtype nucl -parse_seqids -out "$ref_prefix" 17 | 18 | HiC-Pro -i $hicpro_data -c $hicpro_config -o $hicpro_outdir -p 19 | -------------------------------------------------------------------------------- /01_Contig_scaffolding/hicpro_config.txt: -------------------------------------------------------------------------------- 1 | # Please change the variable settings below if necessary 2 | 3 | ######################################################################### 4 | ## Paths and Settings - Do not edit ! 
5 | ######################################################################### 6 | 7 | TMP_DIR = tmp 8 | LOGS_DIR = logs 9 | BOWTIE2_OUTPUT_DIR = bowtie_results 10 | MAPC_OUTPUT = hic_results 11 | RAW_DIR = rawdata 12 | 13 | ####################################################################### 14 | ## SYSTEM - PBS - Start Editing Here !! 15 | ####################################################################### 16 | N_CPU = 17 | LOGFILE = hicpro.log 18 | 19 | JOB_NAME = hicpro-testop 20 | JOB_MEM = 21 | JOB_WALLTIME = 22 | JOB_QUEUE = 23 | JOB_MAIL = 24 | 25 | ######################################################################### 26 | ## Data 27 | ######################################################################### 28 | 29 | PAIR1_EXT = _R1 30 | PAIR2_EXT = _R2 31 | 32 | ####################################################################### 33 | ## Alignment options 34 | ####################################################################### 35 | 36 | MIN_MAPQ = 10 37 | 38 | BOWTIE2_IDX_PATH = 39 | BOWTIE2_GLOBAL_OPTIONS = --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder 40 | BOWTIE2_LOCAL_OPTIONS = --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder 41 | 42 | ####################################################################### 43 | ## Annotation files 44 | ####################################################################### 45 | 46 | REFERENCE_GENOME = 47 | GENOME_SIZE = 48 | 49 | ####################################################################### 50 | ## Allele specific 51 | ####################################################################### 52 | 53 | ALLELE_SPECIFIC_SNP = 54 | 55 | ####################################################################### 56 | ## Capture Hi-C analysis 57 | ####################################################################### 58 | 59 | CAPTURE_TARGET = 60 | REPORT_CAPTURE_REPORTER = 61 | 62 | ####################################################################### 63 
| ## Digestion Hi-C 64 | ####################################################################### 65 | 66 | GENOME_FRAGMENT = 67 | LIGATION_SITE = GATCGATC 68 | MIN_FRAG_SIZE = 100 69 | MAX_FRAG_SIZE = 100000 70 | MIN_INSERT_SIZE = 100 71 | MAX_INSERT_SIZE = 1000 72 | 73 | ####################################################################### 74 | ## Hi-C processing 75 | ####################################################################### 76 | 77 | MIN_CIS_DIST = 78 | GET_ALL_INTERACTION_CLASSES = 1 79 | GET_PROCESS_SAM = 1 80 | RM_SINGLETON = 1 81 | RM_MULTI = 1 82 | RM_DUP = 1 83 | 84 | ####################################################################### 85 | ## Contact Maps 86 | ####################################################################### 87 | 88 | BIN_SIZE = 100000 500000 1000000 1500000 89 | MATRIX_FORMAT = upper 90 | 91 | ####################################################################### 92 | ## ICE Normalization 93 | ####################################################################### 94 | MAX_ITER = 100 95 | FILTER_LOW_COUNT_PERC = 0.02 96 | FILTER_HIGH_COUNT_PERC = 0 97 | EPS = 0.1 98 | -------------------------------------------------------------------------------- /01_Contig_scaffolding/yahs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | enzyme=$1 4 | ref=$2 5 | bed=$3 6 | profix=$4 7 | 8 | yahs -e $enzyme $ref $bed -o $4 9 | 10 | -------------------------------------------------------------------------------- /02_Gap_patching/Centromeric_region_analysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | workdir=$1 4 | FASTA=$2 5 | prefix=$3 6 | CHIP1=$4 7 | CHIP2=$5 8 | threads=$6 9 | CHIP1c=$7 10 | CHIP2c=$8 11 | 12 | #treatment 13 | SAM="${workdir}/ref_chip.sam" 14 | BAM="${workdir}/ref_chip.bam" 15 | BAMSORT="${workdir}/ref_chip_sort.bam" 16 | BAMFILTER="${workdir}/ref_chip_sort_filter.bam" 17 | 
CHIP1TRIM="${workdir}/CHIP_1.trim.fastq" 18 | CHIP2TRIM="${workdir}/CHIP_2.trim.fastq" 19 | BINS="${workdir}/"$prefix".genome.size.100kb" 20 | 21 | faidx $FASTA -i chromsizes | bedtools makewindows -g - -w 100000 | awk -vFS="\t" -vOFS="\t" '{print $1,$2,$3}' | bedtools sort -i - > "$prefix".genome.size.100kb 22 | 23 | ln -s ${FASTA} ./"$prefix".fasta 24 | hisat2-build --large-index -a -p $threads ${workdir}/"$prefix".fasta ${workdir}/"$prefix" 25 | 26 | # adapter 27 | fastp --in1 ${CHIP1} --in2 ${CHIP2} --out1 ${CHIP1TRIM} --out2 ${CHIP2TRIM} --thread $threads 28 | 29 | ## RUN HISAT2 30 | hisat2 -p $threads -x "$prefix" -1 ${CHIP1TRIM} -2 ${CHIP2TRIM} -S ${SAM} 31 | 32 | ## convert to BAM 33 | samtools view -b -S -@ $threads -o ${BAM} ${SAM} 34 | 35 | ## sort 36 | samtools sort -m 4G -@ $threads -o ${BAMSORT} ${BAM} 37 | 38 | ## filter 39 | samtools view -@ $threads -q 30 -o ${BAMFILTER} ${BAMSORT} 40 | samtools index -c -@ $threads ${BAMFILTER} 41 | 42 | #control 43 | SAMc="${workdir}/ref_control.sam" 44 | BAMc="${workdir}/ref_control.bam" 45 | BAMSORTc="${workdir}/ref_control_sort.bam" 46 | BAMFILTERc="${workdir}/ref_control_sort_filter.bam" 47 | CHIP1TRIMc="${workdir}/ref_control.trim.fastq" 48 | CHIP2TRIMc="${workdir}/ref_control.trim.fastq" 49 | INTERSECTc="${workdir}/"$profix".intersect.bed" 50 | 51 | # adapter 52 | #fastp --in1 ${CHIP1c} --in2 ${CHIP2c} --out1 ${CHIP1TRIMc} --out2 ${CHIP2TRIMc} --thread 52 53 | 54 | ## RUN HISAT2 55 | hisat2 -p $threads -x "$profix" -1 ${CHIP1TRIMc} -2 ${CHIP2TRIMc} -S ${SAMc} 56 | 57 | ## convert to BAM 58 | samtools view -b -S -@ $threads -o ${BAMc} ${SAMc} 59 | 60 | ## sort 61 | samtools sort -m 4G -@ $threads -o ${BAMSORTc} ${BAMc} 62 | 63 | ## filter 64 | samtools view -@ $threads -q 30 -o ${BAMFILTERc} ${BAMSORTc} 65 | samtools index -c -@ $threads ${BAMFILTERc} 66 | 67 | epic2 -t ${BAMFILTER} -c ${BAMFILTERc} --chromsizes -o "CENH3.bed" --bin-size 25000 --mapq 30 --gaps-allowed 4 68 | 
-------------------------------------------------------------------------------- /02_Gap_patching/chip-seq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | 5 | files = sys.argv[1] 6 | 7 | a=0 8 | c=0 9 | diff=0 10 | with open(files, 'r') as file: 11 | for line in file: 12 | line1 = line.split("\n") 13 | line2 = line1[0].split("\t") 14 | a=a+int(line2[3]) 15 | c=c+1 16 | f=3*(a/c) 17 | d=0 18 | with open(files, 'r') as file_object: 19 | for line in file_object: 20 | line1 = line.split("\n") 21 | line2 = line1[0].split("\t") 22 | if int(line2[3]) > f: 23 | if d==0: 24 | name=line2[0] 25 | start=line2[1] 26 | end=line2[2] 27 | d=d+1 28 | else: 29 | if name==line2[0]: 30 | if 500000 >= int(line2[1])-int(end): 31 | end=line2[2] 32 | else: 33 | if diff < int(end)-int(start): 34 | diff=int(end)-int(start) 35 | maxs=start 36 | maxe=end 37 | #print(name+"\t"+start+"\t"+end) 38 | start=line2[1] 39 | end=line2[2] 40 | else: 41 | if diff < int(end) - int(start): 42 | diff = int(end) - int(start) 43 | maxs = start 44 | maxe = end 45 | print(name + "\t" + maxs + "\t" + maxe + "\t" + str(diff)) 46 | diff=0 47 | name = line2[0] 48 | start = line2[1] 49 | end = line2[2] 50 | if diff < int(end) - int(start): 51 | diff = int(end) - int(start) 52 | maxs = start 53 | maxe = end 54 | print(name + "\t" + maxs + "\t" + maxe + "\t" + str(diff)) -------------------------------------------------------------------------------- /02_Gap_patching/paf_filter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #author :SHOUCHENG LIU 3 | #email :286596224@QQ.com 4 | #last modified : 5 | 6 | #------------------------------------------------------------------------- 7 | #please consult DOCUMENTATION for detailed information... 
# Usage: perl paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 500000 -iden 0.8
# Keep PAF records whose alignment reaches a target-sequence end (within
# -minlen of either boundary), exceeds the -iden identity cutoff, and whose
# query aligns to MORE than one target (candidate gap-spanning alignments).
use strict;
use diagnostics;
use warnings;
use Getopt::Long;

my %opts = ();
GetOptions(\%opts, "i:s", "minlen:s", "iden:s");

my %h = ();
open IN, $opts{i} or die "Cannot open file: $opts{i}!\n";
# NB: the "_fiter" typo is kept so downstream file names do not change.
my $o = $opts{i} . "_fiter.paf";
open OUT, ">$o" or die "Cannot create file: $o!\n";

# PAF columns (0-based): 0 qname, 5 tname, 6 tlen, 7 tstart, 8 tend,
# 9 nmatch, 10 alignment block length.
# Pass 1: for every query, record each target it hits near a target end
# with identity above the cutoff.
# NOTE(review): the <IN> readline operators below were stripped by the
# extraction that produced this dump (the original had bare "while(){")
# and have been restored.
while (<IN>) {
    chomp;
    my @f = split /\t/, $_;
    if ($f[8] > ($f[6] - $opts{minlen}) or $f[7] < $opts{minlen}) {
        print $f[8] . "\t" . $f[9] / $f[10] . "\n";
        if ($f[9] / $f[10] > $opts{iden}) {
            $h{ $f[0] }{ $f[5] } = 1;
        }
    }
}
close IN;

# Pass 2: re-read the file and emit only records whose query hit >1 target.
open IN, $opts{i} or die "Cannot open file: $opts{i}!\n";
while (<IN>) {
    chomp;
    my @f = split /\t/, $_;
    if ($f[8] > ($f[6] - $opts{minlen}) or $f[7] < $opts{minlen}) {
        print $f[8] . "\t" . $f[9] / $f[10] . "\n";
        if ($f[9] / $f[10] > $opts{iden}) {
            my $le = keys %{ $h{ $f[0] } };
            print $le . "\n";
            if ($le > 1) {
                print OUT $_ . "\n";
            }
        }
    }
}
close IN;
close OUT;
use strict;
use diagnostics;
use warnings;
use Getopt::Long;

# Rewrite a RagTag AGP triple (start contig row, gap row, end contig row) so
# the gap ("scaffold") record is replaced by patch-sequence coordinates taken
# from a filtered PAF alignment of the patch query against both flanks.
# Usage: perl renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf \
#        -start seq00000000 -end seq00000001 -o test.agp
my %opts = ();
GetOptions(\%opts, "i:s", "i1:s", "o:s", "start:s", "end:s");

if (!$opts{i} or !$opts{o}) {
    print "----------------------------------------------------------------------
USAGE: perl $0
        -i  input file
        -o  out file
----------------------------------------------------------------------\n";
    exit;
}

my %h = ();
open IN,  $opts{i}  or die "Cannot open file: $opts{i}!\n";
open IN1, $opts{i1} or die "Cannot open file: $opts{i1}!\n";
open OUT, ">$opts{o}" or die "Cannot create file: $opts{o}!\n";

# Grab the three consecutive AGP rows starting at the -start component:
#   chr8  1        1000000  1  W  seq00000000  1  1000000  +
#   chr8  1000001  1000100  2  N  100  scaffold  yes  align_genus
#   chr8  1000101  4999101  3  W  seq00000001  1  3999001  +
# NOTE(review): the <IN> readline operators below were stripped by the
# extraction that produced this dump and have been restored; the gap and end
# rows are deliberately not chomp'ed, matching the original behaviour.
L: while (<IN>) {
    chomp;
    if ($_ =~ $opts{start}) {   # -start is treated as a regex, as before
        my @f = split /\t/, $_;
        $h{ $f[5] } = [@f];           # start contig row, keyed by component id
        @f = split /\t/, <IN>;        # gap row
        $h{"scaffold"} = [@f];
        @f = split /\t/, <IN>;        # end contig row
        $h{ $f[5] } = [@f];
        last L;
    }
}

my $start;
my $end;
my $len2;

# Scan the PAF for patch-query alignments to the -start and -end components
# and shift the AGP coordinates accordingly.  PAF columns (0-based):
# 0 qname, 2 qstart, 3 qend, 4 strand, 5 tname, 6 tlen, 7 tstart, 8 tend.
while (<IN1>) {
    chomp;
    my @f = split /\t/, $_;

    if ($f[4] eq "+") {
        if ($f[5] eq $opts{start}) {
            # Trim the start contig back to where the patch alignment ends.
            my $offset = $f[6] - $f[8];
            $h{ $f[5] }[2] -= $offset;
            $h{ $f[5] }[7] -= $offset;

            $h{"scaffold"}[1] -= $offset;
            $start = $f[3];
            $h{"scaffold"}[1] = $h{ $opts{start} }[2] + 1;
            $h{"scaffold"}[6] = $f[3] + 1;
        } elsif ($f[5] eq $opts{end}) {
            $end = $f[2];
            # Turn the gap (N) row into a real component (W) from the patch.
            $h{"scaffold"}[4] = "W";
            $h{"scaffold"}[5] = $f[0];

            $h{"scaffold"}[8] = "+";
            $h{ $opts{end} }[6] = $f[7] + 1;
        }
    } elsif ($f[4] eq "-") {
        if ($f[5] eq $opts{start}) {
            my $offset = $f[6] - $f[8];
            $h{ $f[5] }[2] -= $offset;
            $h{ $f[5] }[7] -= $offset;
            $h{"scaffold"}[1] -= $offset;
            $end = $f[2];
            $h{"scaffold"}[1] = $h{ $opts{start} }[2] + 1;
            $h{"scaffold"}[7] = $f[2];
        } elsif ($f[5] eq $opts{end}) {
            $start = $f[3];
            $h{"scaffold"}[4] = "W";
            $h{"scaffold"}[5] = $f[0];

            $h{"scaffold"}[8] = "-";
            $h{ $f[5] }[6] = $f[7] + 1;
            $h{"scaffold"}[6] = $f[3] + 1;
        }
    }
}

# Size of the patch segment spanning the former gap.
my $gapsize = $end - $start;
print $gapsize . "\n";
$h{"scaffold"}[2] = $h{"scaffold"}[1] + $gapsize - 1;
$h{"scaffold"}[7] = $h{"scaffold"}[6] + $gapsize - 1;
if ($h{"scaffold"}[2] > $h{ $opts{start} }[2]) {
    # Patch extends past the start contig: emit all three rows.
    $h{ $opts{end} }[1] = $h{"scaffold"}[2] + 1;
    $h{ $opts{end} }[2] = $h{ $opts{end} }[1] + ($h{ $opts{end} }[7] - $h{ $opts{end} }[6]);
    my $string = join "\t", @{ $h{ $opts{start} } };
    print OUT "## agp-version 2.1\n";
    print OUT "# AGP created by RagTag v2.1.0\n";
    print OUT $string . "\n";
    $string = join "\t", @{ $h{"scaffold"} };
    print OUT $string . "\n";
    $string = join "\t", @{ $h{ $opts{end} } };
    print OUT $string;
} else {
    # Patch is absorbed by the start contig: drop the gap row entirely.
    $h{ $opts{end} }[1] = $h{ $opts{start} }[2] + 1;
    $h{ $opts{end} }[2] = $h{ $opts{end} }[1] + ($h{ $opts{end} }[7] - $h{ $opts{end} }[6]);
    $h{ $opts{end} }[3]--;
    my $string = join "\t", @{ $h{ $opts{start} } };
    print OUT "## agp-version 2.1\n";
    print OUT "# AGP created by RagTag v2.1.0\n";
    print OUT $string . "\n";
    $string = join "\t", @{ $h{ $opts{end} } };
    print OUT $string;
}
close IN;
close IN1;
close OUT;
# Snakemake workflow (03_Polishing): align the "mix" (hybrid) and "single"
# HiFi read sets and the ONT reads to the CS_ISSA assembly with winnowmap,
# align PCR-free Illumina reads with bwa-mem2, drop secondary/unmapped/
# duplicate records (-F0x104) and merge everything into the per-dataset BAMs
# consumed by callsv_snv.py.  Run command: see the footer comment.
import os
import re

b={}
hifi_single={}
hifi_mix={}   # sample basename -> fastq path, hybrid (mix) HiFi runs
e={}          # sample basename -> fastq path, ONT runs
d={}          # sample basename -> fastq path, single HiFi runs
W=config["WORKDIR"]      # all outputs are created under this directory
DIR=config["DIR"]        # directory holding the hybrid HiFi *.fastq files
DIRs=config["DIRs"]      # prefix of three numbered dirs (DIRs1..DIRs3) of single HiFi fastqs
DIRont=config["DIRont"]  # directory holding the ONT *.fastq files

# Index every *.fastq by its basename (text before ".fastq"); .bam entries
# are deliberately ignored (the `a = 0` branches are no-ops).
for dirs in os.listdir(DIR):
    b2 = dirs.split(".fastq")
    if ".fastq" in dirs:
        absPath = os.path.join(DIR, dirs)
        hifi_mix[b2[0]]=absPath
    elif ".bam" in dirs:
        a = 0

for x in range(1, 4):
    x=str(x)
    for dirs in os.listdir(DIRs+x):
        b2 = dirs.split(".fastq")
        if ".fastq" in dirs:
            absPath = os.path.join(DIRs+x, dirs)
            d[b2[0]]=absPath
        elif ".bam" in dirs:
            a = 0

for dirs in os.listdir(DIRont):
    b2 = dirs.split(".fastq")
    if ".fastq" in dirs:
        absPath = os.path.join(DIRont, dirs)
        e[b2[0]]=absPath
    elif ".bam" in dirs:
        a = 0

rule final:
    input:
        W+"single_hifi_pcr/hybrid.bam",
        W+"hybrid_hifi_pcr/hybrid.bam",
        W + "ont_merge/q10l120k.bam"

# Winnowmap alignment of the hybrid HiFi reads (repetitive k-mer list built
# once with meryl, see the comments inside the rule).
rule minimap_cu_mix:
    input:
        fq=W+"hifi_mix_reads/{hifi_mix}_q40l15k.fastq",
        ref="CS_ISSA.fasta",
        txt="repetitive_k27.txt"
    output:
        W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    benchmark:
        W+"benchmarks/hifi_mix_winnowmap/{hifi_mix}.benchmark.txt"
    shell:
        # meryl count k=27 output merylDB CS_ISSA.fasta
        # meryl print greater-than distinct=0.9998 merylDB > repetitive_k27.txt
        "winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

rule minimap_cu_single:
    input:
        fq=W+"hifi_single_reads/{hifi_single}_q40l15k.fastq",
        ref = "CS_ISSA.fasta",
        txt = "repetitive_k27.txt"
    output:
        W+"hifi_single_winnowmap/{hifi_single}_q40l15k.sam"
    benchmark:
        W+"benchmarks/hifi_single_winnowmap/{hifi_single}.benchmark.txt"
    shell:
        # meryl count k=27 output merylDB CS_ISSA.fasta
        # meryl print greater-than distinct=0.9998 merylDB > repetitive_k27.txt
        "winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

rule hifi_mix_sort:
    input:
        W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    output:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    params:
        "CS_ISSA.fasta.fai"
    benchmark:
        W + "benchmarks/hifi_mix_sort/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# -F0x104 drops unmapped (0x4) and secondary (0x100) records.
rule filter:
    input:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    output:
        W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam"
    benchmark:
        W + "benchmarks/hifi_mix_sort_filter/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -F0x104 -hb {input} > {output}"

rule filter_merge:
    input:
        expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix)
    output:
        W+"hybrid/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input}"


#"/home/liusc/lxp/software/bwa-mem2/bwa-mem2.avx512bw index {input}"

rule pcr_free_hybrid:
    input:
        fa="CS_ISSA.fasta",
        r1="hybrid_1.clean.fq.gz",
        r2="hybrid_2.clean.fq.gz"
    output:
        W+"hybrid_hifi_pcr/pcr.bam"
    shell:
        "bwa-mem2.avx512bw mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 30G -o {output} -"

rule pcr_free_single:
    input:
        fa="CS_ISSA.fasta",
        r1="single_1.clean.fq.gz",
        r2="single_2.clean.fq.gz"
    output:
        W+"single_hifi_pcr/pcr.bam"
    shell:
        "bwa-mem2.avx512bw mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 30G -o {output} -"

rule filter_merge_hybrid:
    input:
        hifi=expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix),
        pcr=W+"hybrid_hifi_pcr/pcr.bam"
    output:
        W+"hybrid_hifi_pcr/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid_pcr/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}"

rule hifi_single_sort:
    input:
        W+"hifi_single_winnowmap/{hifi_single}_q40l15k.sam"
    output:
        W+"hifi_single_sort/{hifi_single}_q40l15k.bam"
    params:
        # fix: was a stale absolute path ("/data/.../m219/m219.fasta.fai");
        # every other sort rule here sorts against the CS_ISSA index that the
        # reads were actually aligned to.
        "CS_ISSA.fasta.fai"
    benchmark:
        W + "benchmarks/hifi_single_sort/{hifi_single}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

rule filter_single:
    input:
        W+"hifi_single_sort/{hifi_single}_q40l15k.bam"
    output:
        W+"hifi_single_sort_filter/{hifi_single}_q40l15k.bam"
    benchmark:
        W + "benchmarks/hifi_single_sort_filter/{hifi_single}.benchmark.txt"
    shell:
        "samtools view -@ 32 -F0x104 -hb {input} > {output}"

rule filter_merge_single:
    input:
        expand(W+"hifi_single_sort_filter/{hifi_single}_q40l15k.bam",hifi_single=d)
    output:
        W+"single/single.bam"
    benchmark:
        W + "benchmarks/single/single.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input}"

rule filter_merge_single_pcr:
    input:
        hifi=expand(W+"hifi_single_sort_filter/{hifi_single}_q40l15k.bam",hifi_single=d),
        pcr=W+"single_hifi_pcr/pcr.bam"
    output:
        W+"single_hifi_pcr/hybrid.bam"
    benchmark:
        W + "benchmarks/single_pcr/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}"


rule minimap_cu_ont:
    input:
        fq=W+"ont/reads/{e}_q10l120k.fastq",
        ref="CS_ISSA.fasta",
        txt="repetitive_k27.txt"
    output:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    benchmark:
        W+"benchmarks/ont_winnowmap/{e}.benchmark.txt"
    shell:
        "winnowmap --MD -W {input.txt} -ax map-ont -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

rule sort:
    input:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    output:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    params:
        "CS_ISSA.fasta.fai"
    benchmark:
        W + "benchmarks/ont_sort/{e}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

rule filter_ont:
    input:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    output:
        W+"ont_filter/{e}_q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_filter/{e}.benchmark.txt"
    shell:
        "samtools view -@ 128 -F0x104 -hb {input} > {output}"

rule merge:
    input:
        expand(W+"ont_filter/{e}_q10l120k.bam",e=e)
    output:
        W + "ont_merge/q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_merge/benchmark.txt"
    shell:
        "samtools merge -@ 128 -o {output} {input}"

# snakemake -s bwa_winnowmap.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs 40 --rerun-incomplete --restart-times 1 -np
# Snakemake workflow (03_Polishing): call SVs (sniffles + cuteSV on HiFi and
# ONT BAMs, merged with jasmine), filter the DeepVariant/PEPPER small-variant
# calls, validate everything with merfin k-mer evidence, and emit a polished
# consensus FASTA.  Run command: see the footer comment.
import os
import re

b={}
d={}
e={}
W=config["WORKDIR"]  # working directory produced by bwa_winnowmap.py

rule final:
    input:
        W + "output/SV_SNV/merfin_sv_snv_consensus.fasta"

rule sv_sniffles_hybrid:
    input:
        W+"hybrid/hybrid.bam"
    output:
        W + "output/SV/sniffles_hybrid.vcf"
    shell:
        "sniffles --threads 128 --input {input} --vcf {output}"

rule sv_sniffles_single:
    input:
        W+"single/single.bam"
    output:
        W + "output/SV/sniffles_single.vcf"
    shell:
        "sniffles --threads 128 --input {input} --vcf {output}"

rule sv_sniffles_ont:
    input:
        W + "ont_merge/q10l120k.bam"
    output:
        W + "output/SV/sniffles_ont.vcf"
    shell:
        "sniffles --threads 128 --input {input} --vcf {output}"

#################call SV
# cuteSV is run twice per dataset: once with defaults and once with the
# parameter set the cuteSV authors suggest for each technology.
rule sv_cutesv_hybrid:
    input:
        bam=W+"hybrid/hybrid.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_hybrid.vcf"
    params:
        W + "output/SV/cutesv_hybrid"  # cuteSV work_dir (positional)
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --genotype"

rule sv_cutesv_hybrid_suggest:
    input:
        bam=W+"hybrid/hybrid.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_hybrid_suggest.vcf"
    params:
        W + "output/SV/cutesv_hybrid_suggest"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 --genotype"

rule sv_cutesv_single:
    input:
        bam=W+"single/single.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_single.vcf"
    params:
        W + "output/SV/cutesv_single"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --genotype"

rule sv_cutesv_single_suggest:
    input:
        bam=W+"single/single.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_single_suggest.vcf"
    params:
        W + "output/SV/cutesv_single_suggest"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 --genotype"

rule sv_cutesv_ont:
    input:
        bam=W + "ont_merge/q10l120k.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_ont.vcf"
    params:
        W + "output/SV/cutesv_ont"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --genotype"

rule sv_cutesv_ont_suggest:
    input:
        bam=W + "ont_merge/q10l120k.bam",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SV/cutesv_ont_suggest.vcf"
    params:
        W + "output/SV/cutesv_ont_suggest"
    shell:
        "cuteSV --threads 128 {input.bam} {input.ref} {output} {params} --max_cluster_bias_INS 1000 --diff_ratio_merging_INS 0.9 --max_cluster_bias_DEL 1000 --diff_ratio_merging_DEL 0.5 --genotype"

# Build the file lists jasmine consumes.
rule hifi_ls:
    input:
        hifi1=W + "output/SV/sniffles_hybrid.vcf",
        hifi2=W + "output/SV/cutesv_hybrid.vcf",
        hifi3=W + "output/SV/cutesv_hybrid_suggest.vcf",
        hifi4=W + "output/SV/sniffles_single.vcf",
        hifi5=W + "output/SV/cutesv_single.vcf",
        hifi6=W + "output/SV/cutesv_single_suggest.vcf"
    output:
        W + "output/SV/hifi_ls.txt"
    shell:
        # fix: the original listed only hifi1/hifi2, silently dropping the
        # four other declared HiFi call sets from the jasmine merge.
        "ls {input.hifi1} {input.hifi2} {input.hifi3} {input.hifi4} {input.hifi5} {input.hifi6} > {output}"

rule ont_ls:
    input:
        # fix: the original inputs contained {chr} wildcards that cannot be
        # resolved from this rule's wildcard-free output (Snakemake error);
        # point at the actual sv_cutesv_ont* outputs instead.
        ont1=W + "output/SV/cutesv_ont.vcf",
        ont2=W + "output/SV/cutesv_ont_suggest.vcf",
        ont3=W + "output/SV/sniffles_ont.vcf"
    output:
        W + "output/SV/ont_ls.txt"
    shell:
        # fix: also list the declared sniffles_ont call set.
        "ls {input.ont1} {input.ont2} {input.ont3} > {output}"

rule jasmine_hifi:
    input:
        W + "output/SV/hifi_ls.txt"
    output:
        W + "output/SV/jasmine_hifi.vcf"
    shell:
        "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=128 min_support=1 --output_genotypes file_list={input} out_file={output}"

rule jasmine_ont:
    input:
        W + "output/SV/ont_ls.txt"
    output:
        W + "output/SV/jasmine_ont.vcf"
    shell:
        "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=128 min_support=1 --output_genotypes file_list={input} out_file={output}"

rule ls_hifi_ont:
    input:
        hifi=W + "output/SV/jasmine_hifi.vcf",
        ont=W + "output/SV/jasmine_ont.vcf"
    output:
        # fix: was W + "output/hifi_ont_ls.txt", which never matched the
        # input of jasmine_hifi_ont below and broke the DAG.
        W + "output/SV/hifi_ont_ls.txt"
    shell:
        "ls {input.hifi} {input.ont} > {output}"

# min_support=2: keep only SVs supported by both the HiFi and ONT merges.
rule jasmine_hifi_ont:
    input:
        W + "output/SV/hifi_ont_ls.txt"
    output:
        W + "output/SV/jasmine_hifi_ont.vcf"
    shell:
        "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=96 min_support=2 --output_genotypes file_list={input} out_file={output}"

############################call SNV
# Keep PASS records with GQ > 25 and DP > 10 from the PEPPER (ONT) and
# DeepVariant (HiFi hybrid) call sets.
rule vcf_filter_ont:
    input:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.vcf.gz"
    output:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz"
    shell:
        "bcftools view -f PASS -e 'FORMAT/GQ<=25 | FORMAT/DP<=10' -Oz {input} > {output}"

rule vcf_filter_ont_index:
    input:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz"
    output:
        W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz.csi"
    shell:
        "bcftools index -c {input}"

rule vcf_filter_hifi:
    input:
        W + "output/hybrid.vcf"
    output:
        W + "output/hybrid.PASS.gq25.gt10.vcf.gz"
    shell:
        "bcftools view -f PASS -e 'FORMAT/GQ<=25 | FORMAT/DP<=10' -Oz {input} > {output}"

rule vcf_filter_hifi_index:
    input:
        W + "output/hybrid.PASS.gq25.gt10.vcf.gz"
    output:
        W + "output/hybrid.PASS.gq25.gt10.vcf.gz.csi"
    shell:
        "bcftools index -c {input}"

# hap.py comparison of the two small-variant call sets (HiFi as truth).
rule hap:
    input:
        hifi=W + "output/hybrid.PASS.gq25.gt10.vcf.gz",
        hifi_csi=W + "output/hybrid.PASS.gq25.gt10.vcf.gz.csi",
        ont=W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz",
        ont_csi=W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz.csi",
        ref="CS_ISSA.fasta"
    output:
        W + "output/SNV/HAPPY.vcf.gz",
    params:
        W + "output/SNV/HAPPY"
    shell:
        "python hap.py {input.hifi} {input.ont} -r {input.ref} -o {params} --pass-only --threads 128"

rule vcf_merge_t2t:
    input:
        hifi=W + "output/hybrid.PASS.gq25.gt10.vcf.gz",
        ont=W + "output/pepper_deepvariant_output/intermediate_files/PEPPER_VARIANT_FULL.PASS.gq25.gt10.vcf.gz",
        hap=W + "output/SNV/HAPPY.vcf.gz",
    output:
        W + "output/SNV/MERGED_SMALL_VARIANTS.vcf.gz"
    shell:
        "python3 vcf_merge_t2t.py -v1 {input.hifi} -v2 {input.ont} -hv {input.hap} -o {output}"

rule gunzip:
    input:
        W + "output/SNV/MERGED_SMALL_VARIANTS.vcf.gz"
    output:
        W + "output/SNV/MERGED_SMALL_VARIANTS.vcf"
    shell:
        "gunzip -d -c {input} > {output}"

# 21-mer database of the assembly, needed by merfin -seqmers.
rule meryl_count:
    input:
        "CS_ISSA.fasta"
    output:
        directory(W+"meryl/merylDB_k21"),
    params:
        k="21",
        # fix: was W+"meryl/merylDB{chr}_k21" — a {chr} wildcard with no
        # counterpart in the output, so the rule could never be formatted and
        # the database would not match the declared output directory.
        dir=W+"meryl/merylDB_k21"
    threads: 128
    shell:
        "/home/liusc/software/meryl-1.4/bin/meryl count k={params.k} threads={threads} {input} output {params.dir}"

rule merfin_snv:
    input:
        ref="CS_ISSA.fasta",
        seqmers=W+"meryl/merylDB_k21",
        vcf=W + "output/SNV/MERGED_SMALL_VARIANTS.vcf"
    output:
        W + "output/SNV/merfin_snv.filter.vcf"
    params:
        W + "output/SNV/merfin_snv"
    shell:
        # fix: -prob pointed at a bare lookup_table.txt; the two sibling
        # merfin rules use the table inside single.hifi40_cspcrfree.k21/.
        "merfin -strict -threads 128 -sequence {input.ref} -seqmers {input.seqmers} -readmers single.hifi40_cspcrfree.k21.gt1.meryl -peak 88.3 -prob single.hifi40_cspcrfree.k21/lookup_table.txt -vcf {input.vcf} -output {params}"

rule merfin_sv:
    input:
        ref="CS_ISSA.fasta",
        seqmers=W+"meryl/merylDB_k21",
        vcf=W + "output/SV/jasmine_hifi_ont.vcf"
    output:
        W + "output/SV/merfin_sv.filter.vcf"
    params:
        W + "output/SV/merfin_sv"
    shell:
        "merfin -strict -threads 128 -sequence {input.ref} -seqmers {input.seqmers} -readmers single.hifi40_cspcrfree.k21.gt1.meryl -peak 88.3 -prob single.hifi40_cspcrfree.k21/lookup_table.txt -vcf {input.vcf} -output {params}"

# Keep only the 10 standard VCF columns before re-merging with jasmine.
rule cut:
    input:
        W + "output/SV/merfin_sv.filter.vcf"
    output:
        W + "output/SV/merfin_sv10.filter.vcf"
    shell:
        "cut -f 1-10 {input} > {output}"

rule ls_sv_snv:
    input:
        snv=W + "output/SNV/merfin_snv.filter.vcf",
        sv=W + "output/SV/merfin_sv10.filter.vcf"
    output:
        W + "output/SV_SNV/lst.txt"
    shell:
        "ls {input.snv} {input.sv} > {output}"

rule jasmine_merge:
    input:
        W + "output/SV_SNV/lst.txt"
    output:
        W + "output/SV_SNV/SV_SNV.vcf"
    shell:
        "jasmine max_dist=500 min_seq_id=0.3 spec_reads=3 threads=96 --output_genotypes file_list={input} out_file={output}"

rule merfin_sv_snv:
    input:
        ref="CS_ISSA.fasta",
        seqmers=W+"meryl/merylDB_k21",
        vcf=W + "output/SV_SNV/SV_SNV.vcf"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf"
    params:
        W + "output/SV_SNV/merfin_sv_snv"
    shell:
        "merfin -strict -threads 128 -sequence {input.ref} -seqmers {input.seqmers} -readmers single.hifi40_cspcrfree.k21.gt1.meryl -peak 88.3 -prob single.hifi40_cspcrfree.k21/lookup_table.txt -vcf {input.vcf} -output {params}"

rule cut_merfin_sv_snv:
    input:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter10.vcf"
    shell:
        "cut -f 1-10 {input} > {output}"

rule view_merfin_sv_snv:
    input:
        W + "output/SV_SNV/merfin_sv_snv.filter10.vcf"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz"
    shell:
        "bcftools view -Oz {input} > {output}"

rule view_merfin_sv_snv_index:
    input:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz"
    output:
        W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz.csi"
    shell:
        "bcftools index -c {input}"

# Apply the validated variants to the assembly (-H 1: first haplotype).
rule fasta:
    input:
        vcf=W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz",
        index=W + "output/SV_SNV/merfin_sv_snv.filter.vcf.gz.csi",
        ref="CS_ISSA.fasta",
    output:
        W + "output/SV_SNV/merfin_sv_snv_consensus.fasta"
    shell:
        "bcftools consensus -f {input.ref} -H 1 {input.vcf} > {output}"


#snakemake -s callsv_snv.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs 128 --rerun-incomplete --restart-times 1 -np
mv single_hifi_pcr/hybrid.bam.csi $input 24 | mv hybrid_hifi_pcr/hybrid.bam $input 25 | mv hybrid_hifi_pcr/hybrid.bam.csi $input 26 | 27 | ####dv 28 | 29 | singularity run --cpus $threads --nv -B /home:/home -B /data:/data -B "$input":"$input" -B "$output":"$output" --workdir "$output"/intermediate_results_dir_hybrid/temp google_deepvariant_latest-gpu.sif /opt/deepvariant/bin/run_deepvariant --model_type "HYBRID_PACBIO_ILLUMINA" --ref "$ref" --reads "$input"/hybrid.bam --output_vcf "$output"/hybrid.vcf --num_shards $threads --intermediate_results_dir "$output"/intermediate_results_dir_hybrid 30 | 31 | singularity run --cpus $threads --nv -B /home:/home -B /data:/data -B "$input":"$input" -B "$output":"$output" --workdir "$output"/intermediate_results_dir_single/temp google_deepvariant_latest-gpu.sif /opt/deepvariant/bin/run_deepvariant --model_type "HYBRID_PACBIO_ILLUMINA" --ref "$ref" --reads "$input"/single.bam --output_vcf "$output"/single.vcf --num_shards $threads --intermediate_results_dir "$output"/intermediate_results_dir_single 32 | 33 | ####pepper dv 34 | 35 | docker run --ipc=host --gpus all -v /home:/home -v /data:/data -v "$input":"$input" -v "$output":"$output" kishwars/pepper_deepvariant:r0.8-gpu run_pepper_margin_deepvariant call_variant -b "$input"/q10l120k.bam -f $ref -o "$output"/pepper_deepvariant_output -g -p pep_dv_ont -t $threads --ont_r9_guppy5_sup 36 | 37 | ####filter & consensus 38 | snakemake -s callsv_snv.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs 128 --rerun-incomplete --restart-times 1 -------------------------------------------------------------------------------- /03_Polishing/clust.json: -------------------------------------------------------------------------------- 1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}' 2 | { 3 | "__default__" : 4 | { 5 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcum256c128Partition", 6 | "jobs" : "59" 7 | }, 8 | } 9 | #snakemake -j 
999 --cluster-config cluster.json --cluster "{cluster.account} -p {cluster.partition} -n {cluster.n} -t {cluster.time}" 10 | #--cluster "sbatch -A {cluster.account} -q {cluster.queue} -l select={cluster.nodes}:ncpus{cluster.ppn}:mem={cluster.mem} -l walltime={cluster.time}" 11 | #nohup snakemake -s sum.py --cluster-config clust.json --use-conda --cluster '{cluster.account}' --jobs 16 --restart-times 5 --conda-prefix /lustre1/deng_pkuhpc/deng_test/SF/min3/envs/map& 12 | -------------------------------------------------------------------------------- /03_Polishing/clust_align.json: -------------------------------------------------------------------------------- 1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}' 2 | { 3 | "__default__" : 4 | { 5 | "account" : "sbatch -N 1 -n 1 -c 32 -p tcum256c128Partition", 6 | "jobs" : "59" 7 | }, 8 | "filter_merge" : 9 | { 10 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 11 | "jobs" : "59" 12 | }, 13 | "merge" : 14 | { 15 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 16 | "jobs" : "59" 17 | }, 18 | "filter_merge_flagstat" : 19 | { 20 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 21 | "jobs" : "59" 22 | }, 23 | "filter_merge_single" : 24 | { 25 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 26 | "jobs" : "59" 27 | }, 28 | "merge_flagstat" : 29 | { 30 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 31 | "jobs" : "59" 32 | }, 33 | "filter_merge_single_flagstat" : 34 | { 35 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 36 | "jobs" : "59" 37 | }, 38 | "merge_stat" : 39 | { 40 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 41 | "jobs" : "59" 42 | }, 43 | "filter_merge_single_pcr" : 44 | { 45 | "account" : "sbatch -N 1 -n 1 -c 128 -p tcuHm512c128Partition", 46 | "jobs" : "59" 47 | }, 48 | "filter_merge_hybrid" : 49 | { 50 | "account" : "sbatch -N 1 -n 1 -c 128 -p 
tcuHm512c128Partition", 51 | "jobs" : "59" 52 | }, 53 | "pcr_free_single" : 54 | { 55 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition", 56 | "jobs" : "59" 57 | }, 58 | "pcr_free_hybrid" : 59 | { 60 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition", 61 | "jobs" : "59" 62 | }, 63 | } 64 | #snakemake -j 999 --cluster-config cluster.json --cluster "{cluster.account} -p {cluster.partition} -n {cluster.n} -t {cluster.time}" 65 | #--cluster "sbatch -A {cluster.account} -q {cluster.queue} -l select={cluster.nodes}:ncpus{cluster.ppn}:mem={cluster.mem} -l walltime={cluster.time}" 66 | #nohup snakemake -s sum.py --cluster-config clust.json --use-conda --cluster '{cluster.account}' --jobs 16 --restart-times 5 --conda-prefix /lustre1/deng_pkuhpc/deng_test/SF/min3/envs/map& 67 | -------------------------------------------------------------------------------- /03_Polishing/conf_ck.yaml: -------------------------------------------------------------------------------- 1 | 2 | INDEX: 3 | /lustre1/deng_pkuhpc/deng_test/rf/97103_genome_v2/97103_genome_v2_b2 4 | DIR: 5 | /home/liusc/proj/wheat/rawdata/ont/splitall 6 | WORKDIR: 7 | /data/liusc/lixp/wheat/result/t2tpolish/dv/ 8 | REF: 9 | /home/liusc/lxp/xiaomai/iwgsc/IWGSC_RefSeq_Assembliesv2.1/iwgsc_refseqv2.1_assembly.fa 10 | SNP: 11 | /lustre1/deng_pkuhpc/deng_test/projects/watermelon/pub/cucurbit/reseq/watermelon/v2/1_SNP.vcf 12 | DICT: 13 | /lustre1/deng_pkuhpc/deng_test/rf/97103_genome_v2_chr.fa.dict 14 | TEMP: 15 | /gpfs1/deng_pkuhpc/deng_test/watermelon/mk/temp 16 | intervals: 17 | /lustre1/deng_pkuhpc/deng_test/projects/watermelon/snakemake/watermelon.list 18 | snpeff: 19 | /lustre1/deng_pkuhpc/deng_test/SF/conda/share/snpeff-5.0-1/data/97103_genome_v2 20 | 21 | -------------------------------------------------------------------------------- /03_Polishing/conf_ck_align.yaml: -------------------------------------------------------------------------------- 1 | DIR: 2 | HiFi hybrid path 3 | DIRs: 4 | HiFi 
#!/bin/sh
# Map one BAC clone FASTA against a per-chromosome BLAST database and keep
# only its first reported (best e-value) hit; the commented loop at the
# bottom shows how this script is dispatched per chromosome via sbatch.
# Usage: bac.sh <bac.fasta> <chromosome>

item=$1
chr=$2

# Tabular output: query id/len, subject id, coords, identity, length, qcovs.
# (Consistency fix: use $item/"$chr" with quoting instead of mixing $1/$item.)
blastn -query "$item" -db ./"$chr"index -evalue 1e-6 -outfmt "6 qseqid qlen sseqid qstart qend sstart send pident length nident qcovs" -num_threads 128 -out "$item".bed

# Within a query, BLAST reports hits ordered by e-value, so line 1 is the
# best placement of this BAC.
sed -n '1p' "$item".bed > "$item".txt
mv "$item".txt ./

# Collect the per-BAC best hits produced so far into one table.
cat *.fasta.txt > all_bac.bed
#for chr in Chr3A Chr3B Chr5A Chr5D; do makeblastdb -in part_"$chr".fasta -dbtype nucl -parse_seqids -out ./"$chr"index; for item in dir bac.fasta.split/"$chr"/*.fasta; do sbatch --job-name="$chr"blast --partition= --cpus-per-task=128 blast.sh $item $chr; done; done
hybrid_bam.flagstat 10 | samtools coverage -o hybrid_bam.cov hybrid_bam 11 | samtools flagstat -@ 128 $single_bam > single_bam.flagstat 12 | samtools coverage -o single_bam.cov single_bam 13 | samtools flagstat -@ 128 $ont_bam > ont_bam.flagstat 14 | samtools coverage -o ont_bam.cov ont_bam 15 | -------------------------------------------------------------------------------- /04_Evaluation/qv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | query=$1 4 | ref=$2 5 | 6 | merqury.sh single.hifi40_cspcrfree.k21.gt1.meryl $query $ref t0 > t0.log 7 | -------------------------------------------------------------------------------- /04_Evaluation/synteny.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir blastdb 4 | mkdir blastresult 5 | protein=$1 6 | name=$2 7 | gff3=$3 8 | 9 | makeblastdb -in $protein -dbtype prot -out ./blastdb/${name} 10 | blastp -query $protein -db ./blastdb/${name} -out ./blastresult/${name}.blast -num_threads 52 -outfmt 6 -evalue 1e-10 -num_alignments 5 11 | 12 | awk -vFS="\t" -vOFS="\t" '{if($3=="mRNA"){match($9,/ID=([^;]+)/,a);sub(/ID=/,"",a[0]);print $1,a[0],$4,$5}}' ${gff3} > ./blastresult/${name}.gff 13 | cd blastresult 14 | MCScanX ./${name} 15 | -------------------------------------------------------------------------------- /04_Evaluation/while.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | threads=$1 4 | partition=$2 5 | ref=$3 6 | query=$4 7 | seqkit split -i $ref 8 | seqkit split -i $query 9 | for num in {1..7} 10 | do 11 | for chr in A B D 12 | do 13 | sbatch --job-name="$num""$chr" --partition=$Partition --cpus-per-task="$threads" winnowmap.sh "$num""$chr" $ref $query 14 | done 15 | done 16 | -------------------------------------------------------------------------------- /04_Evaluation/winnowmap.sh: 
#!/bin/sh
# Map one chromosome of the query assembly against the reference with
# winnowmap and derive the regions of the reference NOT covered by any
# confident alignment (Chr<chr>.bed).
#
# Usage: winnowmap.sh <chr> <ref_prefix> <query_prefix>
# NOTE(review): <ref_prefix> and <query_prefix> should be absolute paths,
# because the per-chromosome FASTAs are resolved AFTER the cd into the
# per-chromosome work directory — confirm with the caller (while.sh).

set -e                   # stop on the first failing command

chr=$1
mkdir -p "$chr"          # -p so a re-run does not abort on an existing dir
cd "$chr"
ref="$2".split/Chr"$chr".fa
query="$3".split/Chr"$chr".fa

samtools faidx "$ref"
cut -f 1,2 "$ref".fai > Chr"$chr".sizes

# Build a k=27 meryl database and list the most repetitive ~0.02% of k-mers;
# winnowmap down-weights these during seeding (-W).
meryl count k=27 output merylDB_"$chr" "$ref"
meryl print greater-than distinct=0.9998 merylDB_"$chr" > repetitive_"$chr".txt
split_fa "$query" > split.fa

winnowmap -W repetitive_"$chr".txt -ax asm20 -K 1500M -k 27 -w 18 -t 52 -H --MD "$ref" split.fa > chr"$chr".sam

# Convert to PAF (-p: presumably primary alignments only — confirm against
# the installed paftools.js), keep alignments with mapq > 0 (column 12),
# then take the complement of the merged covered intervals = unaligned gaps.
k8 paftools.js sam2paf -p chr"$chr".sam > chr"$chr".paf
cat chr"$chr".paf |awk '{if ($12 > 0) print $6"\t"$8"\t"$9}' |bedtools sort -i - |bedtools merge -i - |bedtools complement -i - -g Chr"$chr".sizes > Chr"$chr".bed
# Run blastp for ONE batch of predicted ORF peptides against the reference
# protein database. The per-batch hits are concatenated by
# transdecoder_blast_combine and fed to TransDecoder.Predict via
# --retain_blastp_hits.
rule transdecoder_blast:
    input:
        # one batch of longest-ORF peptides written by transdecoder_splitfasta
        fasta="transcripts.fasta.transdecoder_dir/batches/part_{nbatch}/part_{nbatch}.fasta"
    output:
        # per-batch tabular result (outfmt 6); temp() removes it after combining
        blp=temp("transcripts.fasta.transdecoder_dir/batches/part_{nbatch}/part_{nbatch}.blp")
    params:
        database = config["data"]["transdecoder"]["blastp"],
        executable = config["executables"]["blastp"],
    threads: 1
    run:
        # NOTE(review): -max_target_seqs 1 does not guarantee the single BEST
        # hit (it is a pre-sort cutoff in BLAST+); acceptable here because the
        # hit is only used as homology evidence, not ranked — confirm intent.
        shell(params.executable + " -max_target_seqs 1 -evalue 1e-05 -db {params.database} -query {input.fasta} -out {output.blp} -outfmt 6")
# Concatenate the per-batch hmmscan domain tables into a single file for
# TransDecoder.Predict (--retain_pfam_hits), stripping the '#' comment lines.
rule transdecoder_hmmscan_combine:
    input:
        domtblout=lambda wildcards: ["transcripts.fasta.transdecoder_dir/batches/part_" + str(nbatch) + "/part_" + str(nbatch) + ".domtblout"
            for nbatch in range(1, config["transdecoder"]["nbatches"]+1)]
    output:
        domtblout="transcripts.fasta.transdecoder_dir/longest_orfs.pep_hmmscan.domtblout"
    params:
        nodes = 1,
        memory = "4G",
        job_name = config['transdecoder']['job_name'],
        log = config['transdecoder']['log']
    resources:
        load = 1,
        MB = 2000
    threads: 1
    run:
        # start from an empty (but existing) output so appends below are clean
        shell("touch {output.domtblout}")
        for domtblout in input.domtblout:
            # BUG FIX: grep exits with status 1 when no lines are selected,
            # i.e. for any batch whose domtblout contains only '#' comment
            # lines (zero Pfam hits) — that non-zero status made shell() raise
            # and aborted the whole workflow. "|| true" keeps such empty
            # batches from failing the combine step.
            shell("grep -v \"#\" " + domtblout + " >> {output.domtblout} || true")
config['transdecoder']['log'] 141 | resources: 142 | load = 1 143 | threads: 128 144 | run: 145 | shell("{params.executable} -t {input.fasta} --retain_pfam_hits {input.domtblout} --retain_blastp_hits {input.blp} --cpu {params.nodes}") 146 | 147 | rule transdecoder_convert: 148 | input: 149 | fasta = "transcripts.fasta", 150 | gff3 = "transcripts.fasta.transdecoder.gff3", 151 | gtf="transcripts.gff3" 152 | output: 153 | gff3 = "transcripts.genes.gff3" 154 | params: 155 | executable_gff3=config["executables"]["transdecoder"]["convertgff3"], 156 | executable_genome=config["executables"]["transdecoder"]["convertgenome"], 157 | nodes = config["transdecoder"]["convert"]["nodes"], 158 | memory = config["transdecoder"]["convert"]["memory"], 159 | job_name = "converting", 160 | log = config['transdecoder']['log'] 161 | resources: 162 | load = 1 163 | threads: 1 164 | run: 165 | shell("{params.executable_genome} {input.gff3} {input.gtf} {input.fasta} > {output.gff3}") 166 | 167 | # nohup python ~/software/miniconda3/envs/annotation/bin/snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1& -np 168 | -------------------------------------------------------------------------------- /05_Annotation/clust.json: -------------------------------------------------------------------------------- 1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}' 2 | { 3 | "__default__" : 4 | { 5 | "account" : "sbatch -N 1 -n 1 -c 1 -p fatM4TC96Partition", 6 | "jobs" : "59" 7 | }, 8 | "transdecoder_predict" : 9 | { 10 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition", 11 | "jobs" : "59" 12 | }, 13 | "gtf_genome_to_cdna_fasta" : 14 | { 15 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition", 16 | "jobs" : "59" 17 | }, 18 | "merge_flagstat" : 19 | { 20 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition", 21 | "jobs" : "59" 22 | }, 23 | 
"gtf_to_alignment_gff3" : 24 | { 25 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition", 26 | "jobs" : "59" 27 | }, 28 | "transdecoder_longorfs" : 29 | { 30 | "account" : "sbatch -N 1 -n 1 -c 96 -p fatM4TC96Partition", 31 | "jobs" : "59" 32 | }, 33 | } 34 | #snakemake -j 999 --cluster-config cluster.json --cluster "{cluster.account} -p {cluster.partition} -n {cluster.n} -t {cluster.time}" 35 | #--cluster "sbatch -A {cluster.account} -q {cluster.queue} -l select={cluster.nodes}:ncpus{cluster.ppn}:mem={cluster.mem} -l walltime={cluster.time}" 36 | #nohup snakemake -s sum.py --cluster-config clust.json --use-conda --cluster '{cluster.account}' --jobs 16 --restart-times 5 --conda-prefix /lustre1/deng_pkuhpc/deng_test/SF/min3/envs/map& 37 | -------------------------------------------------------------------------------- /05_Annotation/config.yaml: -------------------------------------------------------------------------------- 1 | # the following section defines all inputs 2 | data: 3 | # add paths to ISOseq data sets. Each data set as a separate line 4 | longnucl: 5 | ds1: 6 | # add paths to reference proteins 7 | refprot: 8 | triticeae: 9 | # add paths to RNAseq data sets. Create a data set entry for different libaries. 
DS1 is an example for paired-end data; DS2 is single-ended 10 | rnaseq: 11 | ds1: # name of first data set 12 | LIB1: # name of first library 13 | 1: 14 | - 15 | - 16 | 2: 17 | - 18 | - 19 | ds2: # name of second data set 20 | LIB2: # name of second library 21 | 1: 22 | - 23 | - 24 | hisat2db: 25 | gmap: 26 | dbdir: 27 | dbname: 28 | gth: 29 | chr1: 30 | genome: 31 | transdecoder: 32 | pfamhmm: Pfam-A.hmm 33 | blastp: db 34 | cocla: 35 | unimag: 36 | unipoa: 37 | trep: 38 | 39 | # the following section defines all executables and parameters 40 | hisat2: 41 | arguments: -t --dta --no-unal --max-intronlen 50000 42 | memory: 24G 43 | nodes: 8 44 | threads: 8 45 | job_name: hisat2 46 | log: "hisat2.log" 47 | jobs: 4 48 | 49 | 50 | stringtie: 51 | arguments: -m 150 -t -f 0.3 52 | memory: 4G 53 | nodes: 8 54 | threads: 8 55 | job_name: stringtie 56 | log: "stringtie.log" 57 | 58 | 59 | gmap: 60 | arguments: -K 50000 61 | memory: 16G 62 | nodes: 8 63 | threads: 8 64 | job_name: gmap 65 | log: "gmap.log" 66 | 67 | gth: 68 | arguments: -species rice -startcodon -finalstopcodon -gcmaxgapwidth 50000 -gcmincoverage 70 -paralogs -prseedlength 7 -prhdist 4 69 | memory: 5G 70 | nbatches: 100 71 | nodes: 1 72 | threads: 1 73 | job_name: gth 74 | log: "gth.log" 75 | 76 | 77 | transdecoder: 78 | job_name: transdecoder 79 | log: "transdecoder.log" 80 | nbatches: 1000 81 | predict: 82 | nodes: 128 83 | memory: 8G 84 | convert: 85 | nodes: 1 86 | memory: 8G 87 | stringtie: 88 | memory: 8G 89 | nodes: 1 90 | threads: 1 91 | hmmscan: 92 | memory: 2G 93 | nodes: 1 94 | threads: 1 95 | blastp: 96 | memory: 8G 97 | nodes: 1 98 | threads: 1 99 | 100 | 101 | cocla: 102 | nbatches: 100 103 | memory: 1G 104 | nodes: 1 105 | evalue: 10 106 | job_name: cocla 107 | version: 108 | prefix: <"short name of genome"> 109 | unipoa_threshold: 0.95 #complete 110 | unimag_threshold: 0.95 #reviewed 111 | repeat_threshold: 0.95 #trep 112 | 113 | executables: 114 | blastp: blastp 115 | cuffcompare: 
cuffcompare 116 | gffread: gffread 117 | gth: gth 118 | gmap: gmap.sse42 119 | hmmscan: hmmscan 120 | hisat2: hisat2 121 | samtools: samtools 122 | bamtools: bamtools 123 | stringtie: stringtie 124 | transdecoder: 125 | extract: 126 | convertgff3: 127 | convertgenome: cdna_alignment_orf_to_genome_orf.pl 128 | longorfs: TransDecoder.LongOrfs 129 | predict: TransDecoder.Predict 130 | 131 | -------------------------------------------------------------------------------- /05_Annotation/modules/__pycache__/fasta.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/fasta.cpython-310.pyc -------------------------------------------------------------------------------- /05_Annotation/modules/__pycache__/fasta.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/fasta.cpython-35.pyc -------------------------------------------------------------------------------- /05_Annotation/modules/__pycache__/fasta.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/fasta.cpython-39.pyc -------------------------------------------------------------------------------- /05_Annotation/modules/__pycache__/mygff.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/05_Annotation/modules/__pycache__/mygff.cpython-310.pyc -------------------------------------------------------------------------------- 
class SplitSeqs(object):
    """Split a multi-FASTA file into (up to) ``nfiles`` roughly equal parts.

    Part ``i`` is written to ``<outdir>/part_i/part_i.fasta``; the mapping
    ``{"part_i": path}`` of files actually written is kept in
    ``self.fasta_parts``. All ``nfiles`` part directories are created up
    front (original behavior), even if fewer parts end up being written.

    :param sequences: path to the input FASTA file
    :param outdir: directory under which the part_* subdirectories are made
    :param nfiles: number of parts to split into
    """

    def __init__(self, sequences, outdir, nfiles=500):
        # First pass: count the records so we know how many go in each part.
        with open(sequences, "r") as infile:
            nseqs = sum(1 for _ in SeqIO.parse(infile, "fasta"))

        # BUG FIX: the original computed nseqs/nfiles with the Python-2 "/";
        # under Python 3 that is a float, which silently shifted the batch
        # rollover threshold. Use integer ceiling division instead
        # (-(-a // b) avoids importing math).
        seqsperfile = -(-nseqs // nfiles) if nseqs else 0

        # Create every part directory up front, as the original did.
        for i in range(1, nfiles + 1):
            subprocess.call(["mkdir", "-p", "%s/part_%i" % (outdir, i)])

        self.fasta_parts = {}
        nfile = 1          # index of the part currently being filled
        written = 0        # records written into the current part
        outfasta = None    # open handle for the current part, or None
        with open(sequences, "r") as infile:
            for record in SeqIO.parse(infile, "fasta"):
                if outfasta is None:
                    tmpfilename = "%s/part_%i/part_%i.fasta" % (outdir, nfile, nfile)
                    outfasta = open(tmpfilename, "w")
                    self.fasta_parts["part_%i" % (nfile)] = tmpfilename
                SeqIO.write(record, outfasta, "fasta")
                written += 1
                # Roll over to the next part once full — except for the last
                # part, which absorbs any remainder.
                if written >= seqsperfile and nfile < nfiles:
                    outfasta.close()
                    outfasta = None
                    written = 0
                    nfile += 1
        # BUG FIX: the original never closed the final output handle.
        if outfasta is not None:
            outfasta.close()
26 | :type other: object 27 | """ 28 | 29 | if self.seqid=other.end: 41 | if self.ftype in ["gene"] and other.ftype not in ["gene"]: 42 | return True 43 | elif self.ftype in ["mRNA"] and other.ftype not in ["mRNA", "gene"]: 44 | return True 45 | elif self.ftype in ["exon"] and other.ftype not in ["exon", "mRNA", "gene"]: 46 | return True 47 | else: 48 | return False 49 | 50 | def __gt__(self, other): 51 | """Defines behavior for the greater-than operator, > 52 | 53 | :param other: other feature object to compare with 54 | :type other: object 55 | """ 56 | 57 | if self.seqid>other.seqid or (self.seqid==other.seqid and self.start>other.start): 58 | return True 59 | elif self.seqid==other.seqid and self.start==other.start and self.end>other.end: 60 | if self.ftype in ["gene"] and other.ftype in ["gene"]: 61 | return True 62 | if self.ftype in ["mRNA"] and other.ftype in ["mRNA", "gene"]: 63 | return True 64 | if self.ftype in ["exon"] and other.ftype in ["mRNA", "gene", "exon"]: 65 | return True 66 | if self.ftype in ["three_prime_UTR", "five_prime_UTR", "CDS", "intron"]: 67 | return True 68 | elif self.seqid==other.seqid and self.start==other.start and self.end<=other.end: 69 | if self.ftype in ["mRNA"] and other.ftype in ["gene"]: 70 | return True 71 | if self.ftype in ["exon"] and other.ftype in ["mRNA", "gene"]: 72 | return True 73 | if self.ftype in ["three_prime_UTR", "five_prime_UTR", "CDS", "intron"] and other.ftype in ["exon", "mRNA", "gene"]: 74 | return True 75 | else: 76 | return False 77 | 78 | def __eq__(self, other): 79 | """Defines behavior for the equality operator, == 80 | 81 | :param other: other feature object to compare with 82 | :type other: object 83 | """ 84 | 85 | if self.seqid==other.seqid and self.start==other.start and self.end==other.end: 86 | # define equality for mRNAs 87 | if self.ftype=="mRNA" and other.ftype=="mRNA": 88 | # get all CDSs from both mRNAs 89 | cdss_self = [] 90 | cdss_other = [] 91 | for feature in self.features: 92 | if 
    def __init__(self, seqid, source, ftype, start, end, score, strand, phase):
        """Create feature object

        One object per GFF3 record; the nine columns map onto the first
        eight parameters plus the attribute fields filled in later by the
        reader (readGff3PlantAnnot).

        :param seqid: sequence identifier
        :type seqid: string
        :param source: name of source
        :type source: string
        :param ftype: feature type ("exon", "mRNA", "gene", "three_prime_UTR", "five_prime_UTR", "CDS", "intron")
        :type ftype: string
        :param start: start position
        :type start: int
        :param end: end position
        :type end: int
        :param score: score value
        :type score: int
        :param strand: strand information
        :type strand: string ("+", "-" or ".")
        :param phase: phase information
        :type phase: string
        """

        # standard fields from gff3 columns
        self.seqid = seqid
        self.source = source
        self.ftype = ftype
        self.start = start
        self.end = end
        self.score = score
        self.strand = strand
        self.phase = phase

        # attributes (column 9); empty string means "not set"
        self.identifier = ""
        self.name = ""
        self.alias = ""
        self.notes = ""
        self.target = ""


        # links between features: parent is the containing Feature (gene for
        # an mRNA, mRNA for an exon/CDS/UTR); features are the children
        self.parent = None
        self.features = []

        # annotation stuff
        self.primary_confidence_class = ""
        # NOTE(review): "condidence" is misspelled, but the same spelling is
        # used consistently by every reader/writer in this module — renaming
        # it would break them, so it is kept as-is.
        self.secondary_condidence_class = ""
self.secondary_condidence_class = "" 160 | 161 | 162 | def getLine(self): 163 | writeattributes = "" 164 | # required attributes 165 | if self.ftype=="gene": 166 | writeattributes = "ID=%s" % (self.identifier) 167 | elif self.ftype=="mRNA": 168 | if self.parent is None: 169 | print("error, no parent for: %s" % (self.identifier)) 170 | else: 171 | writeattributes = "ID=%s;Parent=%s" % (self.identifier, self.parent.identifier) 172 | else: 173 | if self.parent is None: 174 | print("error, no parent for: %s %s %s %i %i" % (self.seqid, self.source, self.ftype, self.start, self.end)) 175 | else: 176 | writeattributes = "Parent=%s" % (self.parent.identifier) 177 | #optional attributes 178 | if len(self.name)>0: 179 | writeattributes += ";Name=%s" % (self.name) 180 | if len(self.alias)>0: 181 | writeattributes += ";Alias=%s" % (self.alias) 182 | if len(self.target)>0: 183 | writeattributes += ";Target=%s" % (self.target) 184 | if len(self.notes)>0: 185 | writeattributes += ";Notes=%s" % (self.notes) 186 | if len(self.primary_confidence_class)>0: 187 | writeattributes += ";primary_confidence_class=%s" % (self.primary_confidence_class) 188 | if len(self.secondary_condidence_class)>0: 189 | writeattributes += ";secondary_confidence_class=%s" % (self.secondary_condidence_class) 190 | 191 | return [self.seqid, self.source, self.ftype, self.start, self.end, self.score, self.strand, self.phase, writeattributes] 192 | 193 | 194 | class GeneAnnotation(object): 195 | """Read specific gff files and returns structured data for plant.annot""" 196 | 197 | def readGff3PlantAnnot(self, path): 198 | """General GFF3 file used in plant.annot pipeline 199 | 200 | :param path: path to gff file 201 | :type path: string 202 | 203 | 0 seqname chrX Chromosome, scaffold or contig name 204 | 1 source name Name of source, e.g. 
database or software 205 | 2 feature exon "three_prime_UTR", "five_prime_UTR", "mRNA", "exon", "CDS", "gene", "intron" 206 | 3 start 77696957 The leftmost coordinate of this record (where 1 is the leftmost possible coordinate) 207 | 4 end 77712009 The rightmost coordinate of this record, inclusive. 208 | 5 score 0.3221 Some score value 209 | 6 strand + One of "+", "-", "." 210 | 7 frame . Frame for feature (just used for CDS) 211 | 8 attributes (GFF3) ID=XXX;Parent=XXX (ID is only used for genes and mRNAs; Parent is not used for genes) 212 | """ 213 | 214 | self.features = [] 215 | self.genes = {} 216 | self.mrnas = {} 217 | self.seqids = {} 218 | genes2mrnas = [] 219 | mrnas2features = [] 220 | 221 | # create features 222 | with open(path, "r") as ingff3: 223 | reader = csv.reader(ingff3, delimiter="\t", quoting = csv.QUOTE_NONE) 224 | for line in reader: 225 | if len(line)==9: 226 | seqid = line[0] 227 | source = line[1] 228 | ftype = line[2] 229 | start = int(line[3]) 230 | end = int(line[4]) 231 | score = line[5] 232 | strand = line[6] 233 | phase = line[7] 234 | feature = Feature(seqid, source, ftype, start, end, score, strand, phase) 235 | attributesline = line[8] 236 | attributes = {} 237 | for entry in attributesline.split(";"): 238 | matchAttribute = re.match(r"(.*)=(.*)", entry) 239 | if matchAttribute: 240 | attributes[matchAttribute.group(1)] = matchAttribute.group(2) 241 | # add attributes to feature 242 | if "ID" in attributes.keys(): 243 | feature.identifier = attributes["ID"] 244 | if "Name" in attributes.keys(): 245 | feature.name = attributes["Name"] 246 | if "Alias" in attributes.keys(): 247 | feature.alias = attributes["Alias"] 248 | if "Notes" in attributes.keys(): 249 | feature.notes = attributes["Notes"] 250 | if "Target" in attributes.keys(): 251 | feature.target = attributes["Target"] 252 | if "primary_confidence_class" in attributes.keys(): 253 | feature.primary_confidence_class = attributes["primary_confidence_class"] 254 | if 
"secondary_condidence_class" in attributes.keys(): 255 | feature.secondary_condidence_class = attributes["secondary_condidence_class"] 256 | if "primconf" in attributes.keys(): 257 | feature.primary_confidence_class = attributes["primconf"] #old version 258 | if "secconf" in attributes.keys(): 259 | feature.secondary_condidence_class = attributes["secconf"] #old version 260 | # add gene to seqid and genes 261 | if feature.ftype == "gene": 262 | self.features.append(feature) 263 | if not feature.seqid in self.seqids.keys(): 264 | self.seqids[seqid] = [] 265 | self.seqids[feature.seqid].append(feature) 266 | self.genes[feature.identifier] = feature 267 | # add mrna to mrnas and mark for gene assignment 268 | elif feature.ftype == "mRNA": 269 | self.features.append(feature) 270 | self.mrnas[feature.identifier] = feature 271 | genes2mrnas.append({"geneid":attributes["Parent"], "mrna":feature}) 272 | # mark remaining features for mrna assignment 273 | elif feature.ftype in ["exon", "three_prime_UTR", "five_prime_UTR", "CDS", "intron"]: 274 | self.features.append(feature) 275 | mrnas2features.append({"mrnaid":attributes["Parent"], "feature":feature}) 276 | 277 | # assign genes to mrnas 278 | for assignment in genes2mrnas: 279 | geneid = assignment["geneid"] 280 | mrna = assignment["mrna"] 281 | if geneid in self.genes.keys(): 282 | gene = self.genes[geneid] 283 | mrna.parent = gene 284 | gene.features.append(mrna) 285 | else: 286 | print("gene missing") 287 | 288 | # assign mrnas to features 289 | for assignment in mrnas2features: 290 | mrnaid = assignment["mrnaid"] 291 | feature = assignment["feature"] 292 | if mrnaid in self.mrnas.keys(): 293 | mrna = self.mrnas[mrnaid] 294 | feature.parent = mrna 295 | mrna.features.append(feature) 296 | else: 297 | print("mrna missing") 298 | 299 | # sort features and return 300 | self.features = sorted(self.features) 301 | return(self) 302 | 303 | def combine(self, geneannotations, annoversion="PGSB"): 304 | self.features = [] 305 | 
# --- Incomplete fragment preserved from the surrounding dump ---------------
# The chunk starts mid-method: the lines below are the tail of a method whose
# signature lies before the visible region (it concatenates the features of
# several gene annotations and renumbers gene/mRNA identifiers).  Kept
# verbatim as a comment because the method cannot be reconstructed safely.
#     for geneannotation in geneannotations:
#         self.features += geneannotation.features
#     self.features = sorted(self.features)
#     genecounter = 0
#     mrnacounter = 0
#     self.genes = {}
#     self.mrnas = {}
#     self.seqids = {}
#     for feature in self.features:
#         if feature.ftype=="gene":
#             genecounter += 1
#             if not feature.seqid in self.seqids.keys():
#                 self.seqids[feature.seqid] = []
#             feature.identifier = "%s_gene_%i" % (annoversion, genecounter)
#             self.genes[feature.identifier] = feature
#             self.seqids[feature.seqid].append(feature)
#         if feature.ftype=="mRNA":
#             mrnacounter += 1
#             feature.identifier = "%s_mRNA_%i" % (annoversion, mrnacounter)
#             self.mrnas[feature.identifier] = feature
#     return(self)
# ---------------------------------------------------------------------------

def recalcGeneids(self, annoversion="PGSB"):
    """Rebuild gene models: give every mRNA its own gene, then re-merge
    genes whose CDS features overlap on the same strand.

    New genes are numbered "<annoversion>_gene_<n>".  Assumes self.features
    is sorted by position (the open-CDS sweep below depends on it).
    Returns self so calls can be chained.
    """
    # 1) create one new gene per mRNA; copy all attributes from the old gene
    tmpnewgenes = []
    tmpcounter = 0
    for feature in self.features:
        if feature.ftype == "mRNA":
            tmpcounter += 1
            tmpnewgeneid = "%s_gene_%i" % (annoversion, tmpcounter)
            tmpnewgene = Feature(seqid=feature.seqid, source=feature.source, ftype="gene", start=feature.start, end=feature.end, score=feature.score, strand=feature.strand, phase=feature.phase)
            tmpnewgene.identifier = tmpnewgeneid
            tmpnewgene.name = tmpnewgeneid
            tmpnewgene.alias = feature.parent.alias
            tmpnewgene.notes = feature.parent.notes
            tmpnewgene.target = feature.parent.target
            tmpnewgene.primary_confidence_class = feature.parent.primary_confidence_class
            # NOTE: "condidence" is misspelled, but this attribute is spelled
            # that way throughout the module -- do not "fix" it locally.
            tmpnewgene.secondary_condidence_class = feature.parent.secondary_condidence_class
            feature.parent = tmpnewgene
            tmpnewgene.features = [feature]
            tmpnewgenes.append(tmpnewgene)

    # 2) merge genes with overlapping CDS (features need to be sorted)
    opencdss = {}           # seqid -> CDS features whose span is still "open"
    removegeneids = set([])  # identifiers of genes merged into another gene
    for feature in self.features:
        if feature.ftype == "CDS":
            tmpopencdss = []
            opengeneid = "none"
            currentgene = feature.parent.parent
            # if there are no open CDS for current seqid initialize empty array
            if not feature.seqid in opencdss.keys():
                opencdss[feature.seqid] = []
            # go through all open cds and keep if still open; set new gene to
            # last open cds (genes are the same for all open CDS on same strand)
            for opencds in opencdss[feature.seqid]:
                if opencds.end >= feature.start:
                    tmpopencdss.append(opencds)
                    if opencds.strand == feature.strand:
                        opengene = opencds.parent.parent
                        opengeneid = opengene.identifier
            # reassign all mRNAs of the current gene to the last open gene and
            # widen that gene's span to cover them
            if currentgene.identifier != opengeneid and opengeneid != "none":
                tmpstart = math.inf
                tmpend = 0
                for tmpmrna in currentgene.features:
                    tmpmrna.parent = opengene
                    opengene.features.append(tmpmrna)
                    tmpstart = min(tmpstart, tmpmrna.start)
                    tmpend = max(tmpend, tmpmrna.end)
                opengene.start = min(tmpstart, opengene.start)
                opengene.end = max(tmpend, opengene.end)
                if currentgene.source != opengene.source:
                    opengene.source = "multiple"
                currentgene.mrnas = []
                removegeneids.add(currentgene.identifier)
            tmpopencdss.append(feature)
            opencdss[feature.seqid] = tmpopencdss

    # 3) rebuild feature/gene/seqid indexes, dropping the merged-away genes
    newfeatures = []
    newgenes = {}
    newseqids = {}
    for feature in self.features:
        if feature.ftype != "gene":
            newfeatures.append(feature)
    for gene in tmpnewgenes:
        if not gene.identifier in removegeneids:
            if not gene.seqid in newseqids.keys():
                newseqids[gene.seqid] = []
            newgenes[gene.identifier] = gene
            newseqids[gene.seqid].append(gene)
            newfeatures.append(gene)
    self.genes = newgenes
    self.seqids = newseqids
    self.features = sorted(newfeatures)
    return(self)

def collapseMrnas(self):
    """Remove redundant mRNAs.

    Within each gene, keep only the first of any group of mRNAs that compare
    equal (uses the mRNA features' __eq__), then rebuild self.features and
    self.mrnas from the survivors.  Returns self.
    """
    newfeatures = []
    newmrnas = {}
    # go through all genes
    for geneid in self.genes:
        gene = self.genes[geneid]
        newfeatures.append(gene)
        tmp_keeptranscripts = []
        # go through all mRNAs; keep mrna1 only if no already-kept mRNA equals it
        for mrna1 in gene.features:
            isequal = False
            for mrna2 in tmp_keeptranscripts:
                if mrna1 == mrna2:
                    isequal = True
            if not isequal:
                tmp_keeptranscripts.append(mrna1)
        # set new mRNAs
        gene.features = tmp_keeptranscripts
        # add kept mRNAs and their sub-features to the new feature list
        for mrna in tmp_keeptranscripts:
            newfeatures.append(mrna)
            newmrnas[mrna.identifier] = mrna
            newfeatures += mrna.features
    self.features = sorted(newfeatures)
    self.mrnas = newmrnas
    return(self)

def writeGff3Genes(self, path):
    """Write all features to *path* as tab-separated GFF3-style rows.

    Each feature supplies its own columns via getLine().  QUOTE_NONE with
    quotechar "#" keeps the GFF columns unescaped.
    """
    with open(path, "w") as outgff:
        writer = csv.writer(outgff, delimiter="\t", quotechar="#", quoting = csv.QUOTE_NONE)
        for feature in self.features:
            writer.writerow(feature.getLine())

def printGeneStats(self, path):
    """Write one CSV row per gene (id, source, seqid, span, transcript count,
    primary confidence class) to *path*."""
    with open(path, "w") as outfile:
        rowpattern = {"id":"none", "source":"none", "seqid":"none", "start":0, "end":0, "ntranscripts":0, "primconf":""}
        variables = ["id", "source", "seqid", "start", "end", "ntranscripts", "primconf"]
        writer = csv.DictWriter(outfile, fieldnames=variables)
        writer.writeheader()
        for geneid in self.genes:
            gene = self.genes[geneid]
            outdata = dict(rowpattern)
            outdata["id"] = geneid
            outdata["source"] = gene.source
            outdata["seqid"] = gene.seqid
            outdata["start"] = gene.start
            outdata["end"] = gene.end
            outdata["ntranscripts"] = len(gene.features)
            outdata["primconf"] = gene.primary_confidence_class
            writer.writerow(outdata)

def printTranscriptsStats(self, path, includetargets=False):
    """Write one CSV row per mRNA (parent gene, span, CDS count and total CDS
    length in bp, confidence classes) to *path*.

    If *includetargets* is true an extra "target" column with the mRNA's
    target attribute is appended.
    """
    with open(path, "w") as outfile:
        rowpattern = {"id":"none", "gene": "none", "source":"none", "seqid":"none", "start":0, "end":0, "bpcdss":0, "ncdss":0, "primconf":"", "secconf":""}
        variables = ["id", "gene", "source", "seqid", "start", "end", "bpcdss", "ncdss", "primconf", "secconf"]
        if includetargets:
            rowpattern["target"] = ""
            variables.append("target")
        writer = csv.DictWriter(outfile, fieldnames=variables)
        writer.writeheader()
        for mrnaid in self.mrnas:
            mrna = self.mrnas[mrnaid]
            outdata = dict(rowpattern)
            outdata["id"] = mrnaid
            outdata["gene"] = mrna.parent.identifier
            outdata["source"] = mrna.source
            outdata["seqid"] = mrna.seqid
            outdata["start"] = mrna.start
            outdata["end"] = mrna.end
            outdata["primconf"] = mrna.primary_confidence_class
            outdata["secconf"] = mrna.secondary_condidence_class
            # sum CDS count and cumulative CDS length (end - start + 1, GFF
            # coordinates are inclusive) over the mRNA's sub-features
            tmpbpcdss = 0
            tmpncdss = 0
            for cds in mrna.features:
                if cds.ftype == "CDS":
                    tmpncdss += 1
                    tmpbpcdss += (cds.end - cds.start) + 1
            outdata["ncdss"] = tmpncdss
            outdata["bpcdss"] = tmpbpcdss
            if includetargets:
                outdata["target"] = mrna.target
            writer.writerow(outdata)

def _subsetByPrimaryConfidence(self, confclass):
    """Return a new GeneAnnotation holding only the genes whose
    primary_confidence_class equals *confclass*, together with their mRNAs
    and sub-features.  Shared implementation of getHcGff3Genes /
    getLcGff3Genes (the two originals were verbatim duplicates differing
    only in the "HC"/"LC" literal).
    """
    newfeatures = []
    newgenes = {}
    newseqids = {}
    newmrnas = {}
    for geneid in self.genes:
        gene = self.genes[geneid]
        if gene.primary_confidence_class == confclass:
            newfeatures.append(gene)
            newgenes[gene.identifier] = gene
            if not gene.seqid in newseqids:
                newseqids[gene.seqid] = []
            newseqids[gene.seqid].append(gene)
            for mrna in gene.features:
                newmrnas[mrna.identifier] = mrna
                newfeatures.append(mrna)
                newfeatures += mrna.features
    newanno = GeneAnnotation()
    newanno.features = sorted(newfeatures)
    newanno.genes = newgenes
    newanno.seqids = newseqids
    newanno.mrnas = newmrnas
    return newanno

def getHcGff3Genes(self):
    """Return a new GeneAnnotation containing only the high-confidence
    ("HC") genes."""
    return self._subsetByPrimaryConfidence("HC")

def getLcGff3Genes(self):
    """Return a new GeneAnnotation containing only the low-confidence
    ("LC") genes."""
    return self._subsetByPrimaryConfidence("LC")

# --- /LICENSE (file boundary in the original concatenated dump) ------------
# The dump continues with the repository LICENSE file; its head is preserved
# verbatim below (text continues on the following dump line):
#   MIT License
#
#   Copyright (c) 2023 liusc
#
#   Permission is hereby granted, free of charge, to any person obtaining a copy
#   of this software and associated documentation files (the "Software"), to deal
#   in the Software without restriction, including without limitation the rights
#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#   copies of the Software, and to permit persons to whom the Software is
#   furnished to do so, subject to the following conditions:
#
#   The above copyright notice and this permission notice shall be included in all
#   copies or substantial portions of the Software.
#
#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SPART 2 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14551115.svg)](https://doi.org/10.5281/zenodo.14551115) 3 | [![Published in Nat Genet](https://img.shields.io/badge/Published%20in-NatGenet-blue.svg)](https://doi.org/10.1038/s41588-025-02137-x) 4 | ![Citation Badge](https://api.juleskreuer.eu/citation-badge.php?doi=10.1038/s41588-025-02137-x) 5 | 6 | SPART, a Semi-automated pipeline for assembling reference sequence of telomere-to-telomere (T2T). 7 | 8 | 9 | **See [tutorial]( https://spart1.readthedocs.io/en/latest/) for more details.** 10 | ## Table of Contents 11 | 12 | - [Quick install and start](#started) 13 | - [Install](#Install) 14 | - [Dependencies](#Dependencies) 15 | - [Running pipeline with snakemake](#pipe) 16 | - [Output files](#Output) 17 | - [Run step by step](#step) 18 | - [00_Contig screen](#00_Contig) 19 | - [01_Contig scaffolding](#01_Contig) 20 | - [02_Gap patching](#02_Gap) 21 | - [03_Polishing](#03_Polishing) 22 | - [04_Evaluation](#04_Evaluation) 23 | - [05_Annotation](#05_Annotation) 24 | 25 | ## Quick install and start 26 | ### Install 27 | ```sh 28 | git clone https://github.com/liushoucheng/SPART.git 29 | cd SPART 30 | conda env create -f SPART.yaml 31 | conda activate spart 32 | ``` 33 | ### Dependencies 34 | 35 | List of tools assumed loadable or accessible with no path are: 36 | 37 | * [Bionano DLS map]( https://bionano.com) 38 | 39 | * [HiC-Pro v3.1.0]( https://github.com/nservant/HiC-Pro) 40 | 41 | * [_submit_telomere.sh]( 
https://github.com/VGP/vgp-assembly/blob/master/pipeline/telomere/_submit_telomere.sh) 42 | 43 | * [Medaka]( https://anaconda.org/bioconda/medaka) 44 | 45 | * [racon]( https://anaconda.org/bioconda/racon) 46 | 47 | * [hisat2]( https://github.com/DaehwanKimLab/hisat2) 48 | 49 | * [DeepVariant v1.5.0-gpu]( https://github.com/google/deepvariant) 50 | 51 | * [PEPPER-Margin-DeepVariant v0.8-gpu]( https://github.com/kishwarshafin/pepper) 52 | 53 | * [hap.py v0.3.15]( https://github.com/Illumina/hap.py) 54 | 55 | * [vcf_merge_t2t.py](https://github.com/kishwarshafin/T2T_polishing_scripts/blob/master/polishing_merge_script/vcf_merge_t2t.py) 56 | 57 | * [miniprot_GFF_2_EVM_alignment_GFF3.py](https://github.com/EVidenceModeler/EVidenceModeler/blob/master/EvmUtils/misc/miniprot_GFF_2_EVM_alignment_GFF3.py) 58 | 59 | ### Using snakemake to run the pipeline can be assembled to the chromosome level but may contain gaps that require the rest to be done manually.(Exclude Verkko,Bionano DLS Map,Telomere determination and patch,Centromeric region analysis,Variant calls and Evaluation): 60 | * [Download the example in SPART/example/]( https://gofile.me/77wE8/Vj6Vlp1LK) 61 | * [Download the digest_genome.py of HiC-Pro in SPART/]( https://github.com/nservant/HiC-Pro/blob/master/bin/utils/digest_genome.py) 62 | ```sh 63 | # Replace SPART_PATH with the current working directory 64 | sed -i "s#^ SPART_PATH# ${PWD}#g" conf_ck.yaml 65 | # HiC enzyme 66 | HiC_enzyme=" GATC" 67 | # Replace hic_sca_enzyme with the value stored in the HiC_enzyme variable 68 | sed -i "s#^ hic_sca_enzyme# ${HiC_enzyme}#g" conf_ck.yaml 69 | # Ligation site sequence used for reads trimming. Depends on the fill in strategy. 
Example: AAGCTAGCTT 70 | HiC_ligation_site=" GATCGATC" 71 | sed -i "s#^ hic_sca_ligation_site# ${HiC_ligation_site}#g" conf_ck.yaml #Replace hic_sca_ligation_site with the value stored in the HiC_ligation_site variable 72 | # This process uses the centos 7.6 operating system, slurm job scheduling system, please modify your SPART/clust.json according to the cluster situation. 73 | # This process requires the use of HiC-Pro, please add it to the environment before running. 74 | snakemake -s SPART.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs $threads --rerun-incomplete --restart-times 1 -np --rulegraph |dot -Tpng > rule.png #Running pipeline with snakemake 75 | # configfile:The config file can be used to define a dictionary of configuration parameters and their values. 76 | # cluster-config:A JSON or YAML file that defines the wildcards used in 'cluster'for specific rules. 77 | ``` 78 |
79 | 80 |
81 | 82 | ### Output files 83 | please see the complete [documentation](https://github.com/liushoucheng/SPART/tree/main/example). 84 | 85 | ## Run step by step 86 | 87 | ### 00_Contig screen 88 | ```sh 89 | HiFi_reads=# file names of HiFi reads 90 | ONT_reads=# file names of Ultra-Long reads 91 | thread=# number of threads 92 | memory=# Specify the upper limit on memory to use 93 | output_prefix=# prefix of output files 94 | mitochondrion=# mitochondrion fasta 95 | chloroplast=# chloroplast fasta 96 | ref=# Sequences of mitochondria and chloroplasts need to be removed 97 | # Fastp :was used to filter adapter sequences, primers and other low quality sequence from raw sequencing reads. 98 | SPART/00_Contig_screen/fastp.sh $HiFi_reads $ONT_reads 99 | # Hifiasm 100 | SPART/00_Contig_screen/hifiasm.sh $HiFi_reads $ONT_reads $output_prefix $thread 101 | # Verkko 102 | SPART/00_Contig_screen/verkko.sh $output_prefix $HiFi_reads $ONT_reads $threads $memory 103 | # Flye 104 | SPART/00_Contig_screen/flye.sh $ONT_reads $output_prefix $threads 105 | # Remove mitochondrion && chloroplast 106 | SPART/00_Contig_screen/rm_mt_cp.sh $mitochondrion $chloroplast $ref $threads 107 | ``` 108 | ### 01_Contig scaffolding 109 | ```sh 110 | threads=# Nominal threads per Node, without overloading (non-zero value will override -T -Tp -Te -TJ) 111 | bnx=# Input molecule (.bnx) file, required 112 | ref_cmap=# Reference file (must be .cmap), to compare resulting contigs 113 | prefix=# Location of output files root directory, required, will be created if does not exist; if does exist, will overwrite contents 114 | xml=# Read XML file for parameters 115 | Bio_dir=# Location of executable files (RefAligner and Assembler, required) 116 | cluster_xml=# Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent) 117 | ref=# Input NGS FASTA 118 | bio_camp=# Input BioNano CMAP 119 | merge_xml=# Merge configuration file 120 | RefAligner=# RefAligner 
program 121 | hicpro_data=# input data folder; Must contains a folder per sample with input files 122 | hicpro_config=# configuration file for Hi-C processing 123 | hicpro_outdir=# output folder 124 | enzyme=# restriction enzyme cutting sites 125 | #### Bionano 126 | SPART/01_Contig_scaffolding/Bionano_DLS_map.sh $threads $bnx $ref_cmap $prefix $xml $Bio_dir $cluster_xml $ref $bio_camp $merge_xml $RefAligner 127 | #### Hi-C 128 | # hic-pro 129 | SPART/01_Contig_scaffolding/HiC-Pro.sh $ref $prefix $hicpro_data $hicpro_config $hicpro_outdir 130 | # yahs 131 | SPART/01_Contig_scaffolding/yahs.sh $enzyme $ref $bed/bam/bin $profix 132 | ``` 133 | ### 02_Gap patching 134 | ```sh 135 | query=# query fasta file (uncompressed or bgzipped) 136 | ref=# target fasta file (uncompressed or bgzipped) 137 | region=# output directory 138 | SPART/02_Gap_patching/wfmash_ragtag.sh $query $ref $region 139 | ``` 140 | #### Manual operation 141 | ```sh 142 | cd ragtag_output 143 | perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5 144 | ``` 145 | **Manually editing the ragtag.patch.debug.filtered.paf file.Keep the high-quality contig and preserve the location of the only high confidence match in ragtag.patch.debug.filtered.paf that matches the sequence at both ends of the gap.** 146 | ```sh 147 | perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf -start seq00000000 -end seq00000001 -o test.agp 148 | ``` 149 | **Test.agp is merged into ragtag.patch.agp and fasta is generated.** 150 | 151 | #### e.g. 
152 | ```sh 153 | # make joins and fill gaps in target.fa using sequences from query.fa 154 | cd SPART/example 155 | ragtag.py patch -i 0.99 --remove-small -q 10 --debug -u --aligner minimap2 -t 128 --mm2-params "-x asm20 -I1G -t 128" reference1A.fasta query1A.fasta 156 | # filter 157 | cd ragtag_output 158 | perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5 159 | # Manually editing the ragtag.patch.debug.filtered.paf_fiter.paf file.Keep the high-quality contig and preserve the location of the only high confidence match in ragtag.patch.debug.filtered.paf_fiter.paf that matches the sequence at both ends of the gap. 160 | less ragtag.patch.debug.filtered.paf_fiter.paf 161 | qseq00000000 600453479 27150 3999147 + seq00000001 3972000 4 3971997 2266668 3972018 60 162 | qseq00000000 600453479 4038251 35116708 + seq00000002 597339226 17 31075089 17568679 31079144 60 163 | # gain agp 164 | perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf_fiter.paf -start seq00000001 -end seq00000002 -o test.agp 165 | less -S ragtag.patch.agp 166 | chr1A_RagTag_MOD_MOD 1 2046621 1 W seq00000000 1 2046621 + 167 | chr1A_RagTag_MOD_MOD 2046622 2046821 2 N 200 scaffold yes align_genus 168 | chr1A_RagTag_MOD_MOD 2046822 6018821 3 W seq00000001 1 3972000 + 169 | chr1A_RagTag_MOD_MOD 6018822 6019021 4 N 200 scaffold yes align_genus 170 | chr1A_RagTag_MOD_MOD 6019022 603358247 5 W seq00000002 1 597339226 + 171 | # Test.agp is merged into ragtag.patch.agp and fasta is generated. 
172 | less -S ragtag.patch.agp 173 | scf00000000 1 2046621 1 W seq00000000 1 2046621 + 174 | scf00000000 2046622 2046821 2 N 200 scaffold yes align_genus 175 | scf00000000 2046822 6018821 3 W seq00000001 1 3972000 + 176 | scf00000000 6018822 6057905 4 W qseq00000000 3999151 4038234 + 177 | scf00000000 6057906 603397131 5 W seq00000002 1 597339226 + 178 | ragtag_agp2fa.py ragtag.patch.agp ragtag.patch.comps.fasta > ragtag.patch.fasta 179 | ``` 180 | #### telomere patching 181 | We used _submit_telomere.sh in ONT reads >100kb.ONT reads with telomere sequence mapping to this locus based on minimap2 alignments were manually identified. The longest was selected as template , all others aligned to it and polished with Medaka: 182 | ```sh 183 | medaka -v -i ONT_tel_reads.fasta -d longest_ont_tel.fasta -o ont_tel_medaka.fasta 184 | ``` 185 | Telomere signal in all HiFi reads was identified with the commands: 186 | ```sh 187 | _submit_telomere.sh hifi_reads.fasta 188 | ``` 189 | Additional HiFi reads were recruited from a manual analysis. We looked for trimmed tips that could extend. All reads had telomere signal and were aligned to the medaka consensus and polished with Racon with the commands: 190 | ```sh 191 | minimap2 -t16 -ax map-pb ont_tel_medaka.fasta hifi_tel.fasta > medaka.sam 192 | racon hifi_tel.fasta medaka.sam ont_tel_medaka.fasta > racon.fasta 193 | ``` 194 | Finally, the polished result was patched into the assembly with ragtag patch or manually patched. 195 | ##### Citation 196 | https://github.com/marbl/CHM13-issues/blob/main/error_detection.md. 197 | #### Centromeric region analysis 198 | ```sh 199 | workdir=# work directory 200 | FASTA=# target fasta file (uncompressed or bgzipped) 201 | prefix=# prefix of output files 202 | CHIP1_treatment=# Treatment (pull-down) file(s). 203 | CHIP2_treatment=# Treatment (pull-down) file(s). 
204 | threads=# number of threads 205 | CHIP1_control=# Control (input) file(s) 206 | CHIP2_control=# Control (input) file(s) 207 | SPART/02_Gap_patching/Centromeric_region_analysis.sh $workdir $FASTA $prefix $CHIP1_treatment $CHIP2_treatment $threads $CHIP1_control $CHIP2_control 208 | ``` 209 | ### 03_Polishing 210 | ```sh 211 | # Use singularity and docker to download google_deepvariant_latest-gpu.sif and kishwars/pepper_deepvariant:r0.8-gpu respectively and modify the cluster-config and configfile in snakemake 212 | workdir=# work directory 213 | ref=# target fasta file (uncompressed or bgzipped) 214 | threads=# number of threads 215 | SPART/03_Polishing/calsv_snv.sh $workdir $ref $threads 216 | ``` 217 | ### 04_Evaluation 218 | ```sh 219 | ref=# target fasta file (uncompressed or bgzipped) 220 | prefix=# prefix of output files 221 | query=# query fasta file (uncompressed or bgzipped) 222 | threads=# number of threads 223 | partition=# your cluster partition 224 | bac_reads=# bac reads 225 | ref_chr=# target chromosome fasta file (uncompressed or bgzipped) 226 | protein=# target protein fasta file 227 | name=# output file name 228 | gff3# target gff file 229 | #### BUSCO 230 | SPART/04_Evaluation/BUSCO.sh $ref $prefix 231 | #### mapping rates & coverages 232 | SPART/04_Evaluation/mapping_rates_coverages.sh hybrid_bam single_bam ont_bam 233 | #### LTR 234 | SPART/04_Evaluation/ltr.sh $ref $prefix 235 | #### QV 236 | SPART/04_Evaluation/qv.sh $query $ref 237 | #### BACs 238 | SPART/04_Evaluation/bac.sh $bac_reads $ref_chr 239 | ### Addition 240 | SPART/04_Evaluation/while.sh $threads $partition $ref $query 241 | ### Analysis of synteny 242 | SPART/04_Evaluation/synteny.sh $protein $name $gff3 243 | ``` 244 | ### 05_Annotation 245 | #### RNA-seq 246 | Detect adapter 247 | ```sh 248 | fastp --detect_adapter_for_pe -w ${threads} -i ${RNAseq1} -I ${RNAseq2} -o ${RNAseq1_clean} -O ${RNAseq2_clean} --json ${output}.json --html ${output}.html 249 | ``` 250 | Build 
genome index 251 | ```sh 252 | STAR --runThreadN ${threads} --runMode genomeGenerate --genomeDir ${Output Dir} --genomeFastaFiles ${genome} --sjdbGTFtagExonParentTranscript Parent --sjdbGTFfile ${annotations} --limitGenomeGenerateRAM 40000000000 --sjdbOverhang 149 --sjdbFileChrStartEnd ${genomic coordinates} --limitSjdbInsertNsj 1854820 253 | ``` 254 | Mapping to genome 255 | ```sh 256 | STAR --runThreadN ${threads} --genomeDir ${Output Dir} --readFilesIn ${RNAseq1_clean} ${RNAseq2_clean} --sjdbGTFtagExonParentTranscript Parent --sjdbGTFfile ${annotations} --outFileNamePrefix "$profix" --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --outSAMunmapped Within --outFilterMultimapNmax 20 --outSAMstrandField intronMotif --outFilterMismatchNoverLmax 0.02 --outFilterMismatchNmax 999 --alignIntronMin 20 --alignIntronMax 10000 --alignMatesGapMax 100000 --sjdbScore 1 --genomeLoad NoSharedMemory --outSAMtype BAM SortedByCoordinate --limitSjdbInsertNsj 1854820 257 | ``` 258 | Assembly and merge 259 | ```sh 260 | stringtie -j 2 -c 2 -m 150 -f 0.3 -G ${reference annotation} -l rna-seq -t -p ${threads} -l "$profix" -A "$profix"gene_abund.tab -C "$profix"cov_refs.gtf -o "$profix".gtf "$profix"Aligned.sortedByCoord.out.bam 261 | stringtie --merge -p 96 -m 150 -c 10 -G ${reference annotation} -l rna_merge -o rna_all.gtf { gtf_list | strg1.gtf ...} 262 | ``` 263 | TransDecoder 264 | ```sh 265 | python snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1 266 | ``` 267 | #### ISO-seq 268 | Build genome index 269 | ```sh 270 | minimap2 -t 96 -I 16G -d $mmi $genome 271 | ``` 272 | Align && Correct && Collapse 273 | ```sh 274 | flair 123 --mm2_args=-I15g,-axsplice:hq,-uf,-secondary=no -g $genome -r $iso_seq --mm_index $mmi -f $gtf -o flair.output --temp_dir temp_flair --stringent --no_gtf_end_adjustment --check_splice --generate_map --trust_end -t 96 
--annotation_reliant generate --junction_bed $stringtie.bed 275 | ``` 276 | TransDecoder 277 | ```sh 278 | python snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1 279 | ``` 280 | #### Homology protein 281 | miniprot 282 | ```sh 283 | miniprot -t96 -d CS-IAAS_v1.softmask.mpi CS-IAAS_v1.softmask.fasta 284 | miniprot -It96 --gff CS-IAAS_v1.softmask.mpi ${Homology protein} > miniprot.gff3 285 | python miniprot_GFF_2_EVM_alignment_GFF3.py miniprot.gff3 > protein_alignments.gff3 286 | ``` 287 | #### Ab initio gene prediction 288 | Braker3 289 | ```sh 290 | ##### RNA-seq && Homology protein 291 | docker run -c ${threads} --user 1000:100 -v /tmp:/tmp -v /home:/home -v /data:/data -v "$PWD":"$PWD" teambraker/braker3:latest braker.pl --workingdir="$PWD" --species=CS-IAAS --softmasking --genome=CS-IAAS_v1.softmask.fasta --addUTR=on --gff3 --nocleanup --bam=rna_seq.bam --prot_seq=${Homology protein} --threads ${threads} --BAMTOOLS_PATH= --AUGUSTUS_BIN_PATH= --JAVA_PATH= 292 | ##### ISO-seq && Homology protein 293 | docker run -c ${threads} --user 1000:100 -v /tmp:/tmp -v /home:/home -v /data:/data -v "$PWD":"$PWD" katharinahoff/playground:devel braker.pl --workingdir="$PWD" --species=CS-IAAS --softmasking --genome=CS-IAAS_v1.softmask.fasta --gff3 --nocleanup --bam=iso_seq.bam --prot_seq=${Homology protein} --threads ${threads} --BAMTOOLS_PATH= --AUGUSTUS_BIN_PATH= 294 | ``` 295 | ## contacts 296 | Shoucheng Liu (liusc_work@163.com) 297 | Xiaopeng Li (xiaopeng.li@pku-iaas.edu.cn) 298 | 299 | ## Citating SPART 300 | If you use SPART in your work, please cite: 301 | 302 | Liu, S., Li, K., Dai, X. et al. A telomere-to-telomere genome assembly coupled with multi-omic data provides insights into the evolution of hexaploid bread wheat. Nat Genet (2025). 
# --- Preserved from the surrounding concatenated dump ----------------------
# Tail of README.md and the "/SPART.py" file marker that precede this
# Snakefile in the dump:
#   https://doi.org/10.1038/s41588-025-02137-x
# -------- /SPART.py: --------
# ---------------------------------------------------------------------------
# SPART assembly workflow (Snakemake).  All inputs come from the snakemake
# `config` dict (--configfile).  Visible pipeline: fastp read filtering ->
# hifiasm / flye assembly -> organelle contig removal -> HiC-Pro + yahs
# scaffolding -> ragtag patching with flye and verkko -> winnowmap / bwa-mem2
# read mapping, ending in the two merged BAMs requested by `rule final`.
import os
import re   # NOTE(review): not used anywhere in this file -- confirm before removing
import sys  # NOTE(review): not used anywhere in this file -- confirm before removing
b={}            # NOTE(review): never populated or read in this file
hifi_single={}  # NOTE(review): never populated or read in this file
hifi_mix={}     # basename -> absolute path for each HiFi *.fastq* in DIR
e={}            # basename -> absolute path for each ONT *.fastq* in DIRont
d={}            # NOTE(review): never populated or read in this file
# Paths and settings handed in through the snakemake config.
HiFi_hybrid_all=config["HiFi_reads_merge"]   # merged HiFi reads file
ONT_all=config["ONT_reads_merge"]            # merged ONT reads file
mitochondrion=config["mitochondrion"]        # mitochondrion fasta to screen out
chloroplast=config["chloroplast"]            # chloroplast fasta to screen out
hic_hybrid_dir=config["hic_dir"]
SPART_dir=config["SPART_dir"]
hic_hybrid_enzyme=config["hic_enzyme"]
hic_enzyme_ligation_site=config["hic_enzyme_ligation_site"]
verkko_fa=config["verkko_assemble"]
pcrfree_hybrid_r1=config["pcrfree_r1"]
pcrfree_hybrid_r2=config["pcrfree_r2"]
# NOTE(review): read from config but never used by any rule in this file.
google_deepvariant_latest_gpu_sif=config["google_deepvariant_latest-gpu_sif"]
W=config["WORKDIR"]
DIR=config["DIR"]       # directory of individual HiFi fastq files
DIRont=config["DIRont"] # directory of individual ONT fastq files
# Index every *.fastq* file by its basename up to ".fastq"; the keys become
# the {hifi_mix} and {e} wildcards used by the mapping rules below.
for dirs in os.listdir(DIR):
    b2 = dirs.split(".fastq")
    if ".fastq" in dirs:
        absPath = os.path.join(DIR, dirs)
        hifi_mix[b2[0]]=absPath

for dirs in os.listdir(DIRont):
    b2 = dirs.split(".fastq")
    if ".fastq" in dirs:
        absPath = os.path.join(DIRont, dirs)
        e[b2[0]]=absPath

# Top-level target: the merged HiFi+PCR-free BAM and the merged ONT BAM.
rule final:
    input:
        W+"hybrid_hifi_pcr/hybrid.bam",
        W + "ont_merge/q10l120k.bam"

# Filter the merged HiFi reads with fastp (16 worker threads).
rule hifi_fastp:
    input:
        HiFi_hybrid_all
    output:
        W+"fastp/hybrid.fq"
    shell:
        "fastp -w 16 -i {input} -o {output}"

# Filter the merged ONT reads: min quality 10, min length 100 kb.
rule ont_fastp:
    input:
        ONT_all
    output:
        W+"fastp/ont.fq"
    shell:
        "fastp -q 10 -l 100000 -w 16 -i {input} -o {output}"

# Assemble with hifiasm (HiFi + ultra-long ONT), then convert the primary
# contig GFA to fasta via awk over its S-lines.
rule hifiasm:
    input:
        hifi=W+"fastp/hybrid.fq",
        ont=W+"fastp/ont.fq"
    output:
        W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa"
    params:
        W+"hifiasm_hybrid"
    shell:
        """
        cd {params}
        hifiasm -o hybrid.all.asm --primary -t 96 --ul {input.ont} -k 63 {input.hifi}
        awk '/^S/{{print ">"$2;print $3}}' hybrid.all.asm.p_ctg.gfa > {output}
        """

# Independent ONT assembly with flye (used later for gap patching).
rule flye:
    input:
        W+"fastp/ont.fq"
    output:
        W + "flye/assembly.fasta"
    params:
        W
    shell:
        """
        cd {params}
        flye --nano-hq {input} --read-error 0.1 -g 5.4g --asm-coverage 80 --scaffold --out-dir flye --threads 96 --no-alt-contigs
        """

# Screen out organelle contigs: map the assembly against the mitochondrion
# and chloroplast references, list hits with gemma_los.py, and drop them.
rule rm_mt_cp:
    input:
        hybrid=W+"hifiasm_hybrid/hybrid.all.asm.p_ctg.fa",
        mt=mitochondrion,
        cp=chloroplast
    output:
        W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
    params:
        dir=W+"hifiasm_hybrid",
        workdir=SPART_dir
    shell:
        """
        cd {params.dir}
        minimap2 -t 96 -x asm5 {input.mt} {input.hybrid}> mitochondrion.paf
        minimap2 -t 96 -x asm5 {input.cp} {input.hybrid}> chloroplast.paf
        python {params.workdir}/gemma_los.py mitochondrion.paf > mitochondrion.txt
        python {params.workdir}/gemma_los.py chloroplast.paf > chloroplast.txt
        seqkit grep -v -f chloroplast.txt {input.hybrid} > wheat_remove_cp.fa
        seqkit grep -v -f mitochondrion.txt wheat_remove_cp.fa > {output}
        """

# Build indexes, patch the bundled hicpro_config.txt via sed, and run HiC-Pro;
# the name-sorted pairs BAM is the input for yahs.
# NOTE(review): params.LIGATION_SITE is bound but never referenced in the
# shell block -- presumably it was meant to be sed-ed into hicpro_config.txt
# like the other settings; confirm.
rule hicpro:
    input:
        hic=hic_hybrid_dir,
        ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
    output:
        W+"hic_hybrid/hic_hybrid.bam"
    params:
        dir=W+"hic_hybrid",
        prefix="hybrid.remove_cp_mt",
        spart_dir=SPART_dir,
        enzyme=hic_hybrid_enzyme,
        LIGATION_SITE=hic_enzyme_ligation_site
    shell:
        """
        cd {params.dir}
        ln -s {input.ref} ./
        bowtie2-build --large-index --threads 96 {params.prefix}.fa {params.prefix}
        samtools faidx {params.prefix}.fa
        awk '{{print $1 "\t" $2}}' {params.prefix}.fa.fai > genome_sizes.bed
        python {params.spart_dir}/digest_genome.py -r ^{params.enzyme} -o enzyme.bed {params.prefix}.fa
        makeblastdb -in {params.prefix}.fa -dbtype nucl -parse_seqids -out {params.prefix}
        cp {params.spart_dir}/01_Contig_scaffolding/hicpro_config.txt ./
        sed -i 's#^N_CPU = #N_CPU = 128#g' hicpro_config.txt
        sed -i 's#^BOWTIE2_IDX_PATH = #BOWTIE2_IDX_PATH = {params.dir}#g' hicpro_config.txt
        sed -i 's#^REFERENCE_GENOME = #REFERENCE_GENOME = {params.prefix}#g' hicpro_config.txt
        sed -i 's#^GENOME_SIZE = #GENOME_SIZE = {params.dir}/genome_sizes.bed#g' hicpro_config.txt
        sed -i 's#^GENOME_FRAGMENT = #GENOME_FRAGMENT = {params.dir}/enzyme.bed#g' hicpro_config.txt
        HiC-Pro -i {input.hic} -c hicpro_config.txt -o {params.dir}/result
        cd result/bowtie_results/bwt2/sample
        samtools sort -m 1500M -n -@ 96 HiC_hybrid.remove_cp_mt.bwt2pairs.bam > {params.dir}/hic_hybrid.bam
        """

# Hi-C scaffolding with yahs on the coordinate-sorted pairs BAM.
rule yahs:
    input:
        bam=W+"hic_hybrid/hic_hybrid.bam",
        ref=W+"hifiasm_hybrid/hybrid.remove_cp_mt.fa"
    output:
        W + "yahs_hybrid/yahs_hybrid.fa"
    params:
        dir = W + "yahs_hybrid",
        prefix = "hybrid_bam",
        enzyme = hic_hybrid_enzyme
    shell:
        """
        cd {params.dir}
        samtools faidx {input.ref}
        samtools sort -@ 128 -o hic_hybrid_sort.bam {input.bam}
        samtools index hic_hybrid_sort.bam
        yahs -e {params.enzyme} {input.ref} hic_hybrid_sort.bam -o {params.prefix}
        cp {params.dir}/{params.prefix}_scaffolds_final.fa {output}
        """

# Patch scaffold gaps with the flye assembly: wfmash alignments are pre-seeded
# into ragtag_output/ragtag.patch.asm.paf so ragtag reuses them.
rule patch_flye:
    input:
        single_hybrid=W + "yahs_hybrid/yahs_hybrid.fa",
        flye=W + "flye/assembly.fasta"
    output:
        W + "patch_flye/patch_single_hybrid_flye.fa"
    params:
        dir = W + "patch_flye",
        prefix = "single_hybrid_flye"
    shell:
        """
        cd {params.dir}
        samtools faidx {input.single_hybrid}
        samtools faidx {input.flye}
        wfmash {input.single_hybrid} {input.flye} > {params.prefix}.paf
        mkdir ragtag_output
        cd ragtag_output
        ln -s ../{params.prefix}.paf ragtag.patch.asm.paf
        cd ..
        ragtag.py patch -f 10000 --remove-small {input.single_hybrid} {input.flye}
        cp {params.dir}/ragtag_output/ragtag.patch.fasta {output}
        """

# Second patching round with the verkko assembly; also builds the bwa-mem2
# index and the meryl repetitive-kmer list (k=27) used by winnowmap below.
rule patch_verkko:
    input:
        single_hybrid_flye=W + "patch_flye/patch_single_hybrid_flye.fa",
        verkko=verkko_fa
    output:
        ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        txt = W + "repetitive_k27.txt"
    params:
        dir = W + "patch_verkko",
        prefix = "single_hybrid_flye_verkko",
    shell:
        """
        cd {params.dir}
        samtools faidx {input.single_hybrid_flye}
        samtools faidx {input.verkko}
        wfmash {input.single_hybrid_flye} {input.verkko} > {params.prefix}.paf
        mkdir ragtag_output
        cd ragtag_output
        ln -s ../{params.prefix}.paf ragtag.patch.asm.paf
        cd ..
        ragtag.py patch -f 10000 --remove-small {input.single_hybrid_flye} {input.verkko}
        cp {params.dir}/ragtag_output/ragtag.patch.fasta {output.ref}
        bwa-mem2 index {output.ref}
        meryl count k=27 output merylDB {output.ref}
        meryl print greater-than distinct=0.9998 merylDB > {output.txt}
        """

# Map HiFi reads with winnowmap.
# NOTE(review): the input is the single merged fastp output, yet the output
# and benchmark carry a per-file {hifi_mix} wildcard -- every wildcard job
# re-aligns the same merged FASTQ and the later merge rules concatenate N
# identical alignments.  Presumably the input was meant to be the per-file
# path (e.g. lambda wc: hifi_mix[wc.hifi_mix]) -- confirm with the authors.
rule winnowmap_hifi:
    input:
        fq=W+"fastp/hybrid.fq",
        ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        txt = W + "repetitive_k27.txt"
    output:
        sam=W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    benchmark:
        W+"benchmarks/hifi_mix_winnowmap/{hifi_mix}.benchmark.txt"
    shell:
        """
        winnowmap --MD -W {input.txt} -ax map-pb -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output.sam}
        """

# SAM -> coordinate-sorted BAM.
rule winnowmap_hifi_sort:
    input:
        W+"hifi_mix_winnowmap/{hifi_mix}_q40l15k.sam"
    output:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    params:
        W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai"
    benchmark:
        W + "benchmarks/hifi_mix_sort/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# Drop unmapped and secondary alignments (flag mask 0x104).
rule winnowmap_hifi_sort_filter:
    input:
        W+"hifi_mix_sort/{hifi_mix}_q40l15k.bam"
    output:
        W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam"
    benchmark:
        W + "benchmarks/hifi_mix_sort_filter/{hifi_mix}.benchmark.txt"
    shell:
        "samtools view -@32 -F0x104 -hb {input} > {output}"

# Merge all filtered per-{hifi_mix} HiFi BAMs (see NOTE on winnowmap_hifi).
rule winnowmap_hifi_sort_filter_merge:
    input:
        expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix)
    output:
        W+"hybrid/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input}"

# Map the PCR-free Illumina pair with bwa-mem2 and coordinate-sort.
rule pcr_free:
    input:
        fa=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        r1=pcrfree_hybrid_r1,
        r2=pcrfree_hybrid_r2
    output:
        W+"hybrid_hifi_pcr/pcr.bam"
    shell:
        "bwa-mem2 mem -t 96 {input.fa} {input.r1} {input.r2}|samtools view -@ 96 -b -|samtools sort -@ 96 -m 1G -o {output} -"

# Merge filtered HiFi BAMs with the PCR-free BAM (first target of rule final).
rule winnowmap_hifi_filter_pcr_merge:
    input:
        hifi=expand(W+"hifi_mix_sort_filter/{hifi_mix}_q40l15k.bam",hifi_mix=hifi_mix),
        pcr=W+"hybrid_hifi_pcr/pcr.bam"
    output:
        W+"hybrid_hifi_pcr/hybrid.bam"
    benchmark:
        W + "benchmarks/hybrid_pcr/hybrid.benchmark.txt"
    shell:
        "samtools merge -@ 128 -l 0 {output} {input.hifi} {input.pcr}"

# Map ONT reads with winnowmap.
# NOTE(review): same wildcard issue as winnowmap_hifi -- the input is the
# single merged fastp ONT file while the output is per-{e}; confirm whether
# the input should be the per-file path e[wildcards.e].
rule winnowmap_ont:
    input:
        fq=W+"fastp/ont.fq",
        ref=W + "patch_verkko/patch_single_hybrid_flye_verkko.fa",
        txt=W+"repetitive_k27.txt"
    output:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    benchmark:
        W+"benchmarks/ont_winnowmap/{e}.benchmark.txt"
    shell:
        "winnowmap --MD -W {input.txt} -ax map-ont -H -K 1500M -k 27 -w27 -t32 {input.ref} {input.fq} > {output}"

# SAM -> coordinate-sorted BAM.
rule winnowmap_ont_sort:
    input:
        W+"ont_winnowmap/{e}/{e}_q10l120k.sam"
    output:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    params:
        W + "patch_verkko/patch_single_hybrid_flye_verkko.fa.fai"
    benchmark:
        W + "benchmarks/ont_sort/{e}.benchmark.txt"
    shell:
        "samtools view -@32 -bt {params} {input}|samtools sort -@32 -m1500M -O bam -o {output} -"

# Drop unmapped and secondary alignments (flag mask 0x104).
rule winnowmap_ont_sort_filter:
    input:
        W+"ont_sort/{e}/{e}_q10l120k.bam"
    output:
        W+"ont_filter/{e}_q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_filter/{e}.benchmark.txt"
    shell:
        "samtools view -@ 128 -F0x104 -hb {input} > {output}"

# Merge all filtered per-{e} ONT BAMs (second target of rule final).
rule winnowmap_ont_sort_filter_merge:
    input:
        expand(W+"ont_filter/{e}_q10l120k.bam",e=e)
    output:
        W + "ont_merge/q10l120k.bam"
    benchmark:
        W + "benchmarks/ont_merge/benchmark.txt"
    shell:
        "samtools merge -@ 128 {output} {input}"

# --- Preserved from the dump -----------------------------------------------
# The "/SPART.yaml" file marker and the head of the conda environment spec
# that follows this Snakefile (text continues on the next dump line):
# -------- /SPART.yaml: --------
# name: Spart
# channels:
#   - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
#   - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
#   - bioconda
#   - defaults
#   - etetoolkit
#   - conda-forge
# dependencies:
#   - _libgcc_mutex=0.1=conda_forge
#   - _openmp_mutex=4.5=2_gnu
#   - _r-mutex=1.0.0=anacondar_1
#   - _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
#   - abseil-cpp=20211102.0=hd4dd3e8_0
#   - aioeasywebdav=2.4.0=py39hf3d152e_1001
#   - aiohttp=3.8.3=py39h5eee18b_0
#   - aiosignal=1.2.0=pyhd3eb1b0_0
#   - amply=0.1.6=pyhd8ed1ab_0
#   - appdirs=1.4.4=pyhd3eb1b0_0
#   - async-timeout=4.0.2=py39h06a4308_0
#   - attmap=0.13.2=pyhd8ed1ab_0
#   - attrs=22.1.0=py39h06a4308_0
#   - augustus=3.5.0=pl5321h700735d_3
#   - bamtools=2.5.1=hd03093a_10
#   - bbmap=39.01=h92535d8_1
#   - bcftools=1.17=h3cc50cf_1
#   - bcrypt=3.2.0=py39h5eee18b_1
#   - bedtools=2.31.0=hf5e1c6e_2
#   - (entry continues on the next dump line)
binutils_impl_linux-64=2.38=h2a08ee3_1 30 | - biopython=1.78=py39h7f8727e_0 31 | - blas=1.0=openblas 32 | - blast=2.14.0=pl5321h6f7f691_2 33 | - boost-cpp=1.78.0=h5adbc97_2 34 | - boto3=1.24.28=py39h06a4308_0 35 | - botocore=1.27.59=py39h06a4308_0 36 | - bottleneck=1.3.5=py39h7deecbd_0 37 | - bowtie2=2.5.1=py39h3321a2d_0 38 | - brotlipy=0.7.0=py39h27cfd23_1003 39 | - busco=5.4.6=pyhdfd78af_0 40 | - bwa-mem2=2.2.1=hd03093a_5 41 | - bwidget=1.9.11=1 42 | - bzip2=1.0.8=h7b6447c_0 43 | - c-ares=1.19.0=h5eee18b_0 44 | - ca-certificates=2023.05.30=h06a4308_0 45 | - cachetools=4.2.2=pyhd3eb1b0_0 46 | - cairo=1.16.0=hb05425b_5 47 | - cdbtools=0.99=hdcf5f25_9 48 | - certifi=2023.5.7=py39h06a4308_0 49 | - cffi=1.15.1=py39h5eee18b_3 50 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 51 | - cigar=0.1.3=pyh864c0ab_1 52 | - coin-or-cbc=2.10.7=hd28fd6d_0 53 | - coin-or-cgl=0.60.5=h3cb4718_0 54 | - coin-or-clp=1.17.7=h4e0f3ec_0 55 | - coin-or-osi=0.108.7=h3cb4718_0 56 | - coin-or-utils=2.11.6=h4e0f3ec_0 57 | - coincbc=2.10.7=0_metapackage 58 | - configargparse=1.4=pyhd3eb1b0_0 59 | - connection_pool=0.0.3=pyhd3deb0d_0 60 | - cryptography=39.0.1=py39h9ce1e76_0 61 | - curl=7.26.0=1 62 | - cutesv=2.0.3=pyhdfd78af_0 63 | - datrie=0.8.2=py39h27cfd23_0 64 | - dbus=1.10.20=0 65 | - defusedxml=0.7.1=pyhd3eb1b0_0 66 | - dendropy=4.6.1=pyhdfd78af_0 67 | - diamond=2.1.8=h43eeafb_0 68 | - docutils=0.18.1=py39h06a4308_3 69 | - dpath=2.1.6=pyha770c72_0 70 | - dropbox=11.36.1=py39h06a4308_0 71 | - entrez-direct=16.2=he881be0_1 72 | - epic2=0.0.52=py39h9f35bd6_6 73 | - exceptiongroup=1.0.4=py39h06a4308_0 74 | - expat=2.5.0=hcb278e6_1 75 | - fastp=0.22.0=h2e03b76_0 76 | - filechunkio=1.8=py_2 77 | - filelock=3.9.0=py39h06a4308_0 78 | - findutils=4.6.0=h166bdaf_1001 79 | - flye=2.9.1=py39h6935b12_0 80 | - font-ttf-dejavu-sans-mono=2.37=0 81 | - font-ttf-inconsolata=2.000=0 82 | - font-ttf-source-code-pro=2.030=0 83 | - font-ttf-ubuntu=0.83=0 84 | - fontconfig=2.14.2=h14ed4e7_0 85 | - 
fonts-anaconda=1=h8fa9717_0 86 | - fonts-conda-ecosystem=1=hd3eb1b0_0 87 | - freetype=2.12.1=h4a9f257_0 88 | - fribidi=1.0.10=h7b6447c_0 89 | - frozenlist=1.3.3=py39h5eee18b_0 90 | - ftputil=5.0.4=pyhd8ed1ab_0 91 | - gawk=5.1.0=h7b6447c_0 92 | - gcc_impl_linux-64=11.2.0=h1234567_1 93 | - gettext=0.21.1=h27087fc_0 94 | - gfortran_impl_linux-64=11.2.0=h7a446d4_16 95 | - gitdb=4.0.7=pyhd3eb1b0_0 96 | - gitpython=3.1.30=py39h06a4308_0 97 | - glib=2.76.4=hfc55251_0 98 | - glib-tools=2.76.4=hfc55251_0 99 | - gmp=6.2.1=h295c915_3 100 | - google-api-core=2.10.1=py39h06a4308_0 101 | - google-api-python-client=2.93.0=pyhd8ed1ab_0 102 | - google-auth=2.6.0=pyhd3eb1b0_0 103 | - google-auth-httplib2=0.1.0=pyhd8ed1ab_1 104 | - google-cloud-core=2.3.2=py39h06a4308_0 105 | - google-cloud-storage=2.6.0=py39h06a4308_0 106 | - google-crc32c=1.5.0=py39h5eee18b_0 107 | - google-resumable-media=2.4.0=py39h06a4308_0 108 | - googleapis-common-protos=1.56.4=py39h06a4308_0 109 | - graphaligner=1.0.17=hd03093a_0 110 | - graphite2=1.3.14=h295c915_1 111 | - grpc-cpp=1.48.2=h5bf31a4_0 112 | - grpcio=1.48.2=py39h5bf31a4_0 113 | - gsl=2.7=he838d99_0 114 | - gxx_impl_linux-64=11.2.0=h1234567_1 115 | - harfbuzz=6.0.0=h8e241bc_0 116 | - hifiasm=0.19.5=h43eeafb_2 117 | - hmmer=3.1b2=3 118 | - htslib=1.17=h6bc39ce_1 119 | - httplib2=0.22.0=pyhd8ed1ab_0 120 | - humanfriendly=10.0=py39h06a4308_1 121 | - icu=70.1=h27087fc_0 122 | - idna=3.4=py39h06a4308_0 123 | - iniconfig=1.1.1=pyhd3eb1b0_0 124 | - intervaltree=3.1.0=pyhd3eb1b0_0 125 | - irissv=1.0.4=hdfd78af_2 126 | - jasminesv=1.1.5=hdfd78af_0 127 | - jinja2=3.1.2=py39h06a4308_0 128 | - jmespath=0.10.0=pyhd3eb1b0_0 129 | - jpeg=9e=h5eee18b_1 130 | - jsonschema=4.17.3=py39h06a4308_0 131 | - jupyter_core=5.3.0=py39h06a4308_0 132 | - k8=0.2.5=hdcf5f25_4 133 | - kernel-headers_linux-64=3.10.0=h57e8cba_10 134 | - krb5=1.19.4=h568e23c_0 135 | - ld_impl_linux-64=2.38=h1181459_1 136 | - lerc=4.0.0=h27087fc_0 137 | - libblas=3.9.0=17_linux64_openblas 138 | - 
libcblas=3.9.0=17_linux64_openblas 139 | - libcrc32c=1.1.2=h6a678d5_0 140 | - libcurl=7.88.1=h91b91d3_0 141 | - libdeflate=1.13=h166bdaf_0 142 | - libdivsufsort=2.0.2=h031d066_8 143 | - libedit=3.1.20221030=h5eee18b_0 144 | - libev=4.33=h7f8727e_1 145 | - libexpat=2.5.0=hcb278e6_1 146 | - libffi=3.4.4=h6a678d5_0 147 | - libgcc-devel_linux-64=11.2.0=h1234567_1 148 | - libgcc-ng=13.1.0=he5830b7_0 149 | - libgfortran-ng=13.1.0=h69a702a_0 150 | - libgfortran5=13.1.0=h15d22d2_0 151 | - libglib=2.76.4=hebfc3b9_0 152 | - libgomp=13.1.0=he5830b7_0 153 | - libiconv=1.17=h166bdaf_0 154 | - libidn2=2.3.4=h5eee18b_0 155 | - libjemalloc=5.3.0=hcb278e6_0 156 | - liblapack=3.9.0=17_linux64_openblas 157 | - libnghttp2=1.52.0=ha637b67_1 158 | - libnsl=2.0.0=h5eee18b_0 159 | - libopenblas=0.3.23=pthreads_h80387f5_0 160 | - libpng=1.6.39=h5eee18b_0 161 | - libprotobuf=3.20.3=he621ea3_0 162 | - libsodium=1.0.18=h7b6447c_0 163 | - libsqlite=3.42.0=h2797004_0 164 | - libssh2=1.10.0=h37d81fd_2 165 | - libstdcxx-devel_linux-64=11.2.0=h1234567_1 166 | - libstdcxx-ng=13.1.0=hfd8a6a1_0 167 | - libtiff=4.4.0=h0e0dad5_3 168 | - libunistring=0.9.10=h27cfd23_0 169 | - libuuid=2.38.1=h0b41bf4_0 170 | - libwebp-base=1.2.4=h5eee18b_1 171 | - libxcb=1.15=h7f8727e_0 172 | - libxml2=2.10.3=hca2bb57_4 173 | - libzlib=1.2.13=hd590300_5 174 | - logmuse=0.2.6=pyh8c360ce_0 175 | - lp_solve=5.5.2.5=h14c3975_1001 176 | - lz4-c=1.9.4=h6a678d5_0 177 | - make=4.2.1=h1bed415_1 178 | - markdown-it-py=2.2.0=py39h06a4308_1 179 | - markupsafe=2.1.1=py39h7f8727e_0 180 | - mashmap=3.0.5=h97b747e_0 181 | - mbg=1.0.15=hdcf5f25_2 182 | - mdurl=0.1.0=py39h06a4308_0 183 | - merqury=1.3=hdfd78af_1 184 | - meryl=1.3=hdbdd923_2 185 | - metaeuk=6.a5d39d9=pl5321h6a68c12_3 186 | - metis=5.1.0=hf484d3e_4 187 | - minimap2=2.24=h7132678_1 188 | - mpfr=4.0.2=hb69a4c5_1 189 | - mscorefonts=0.0.1=3 190 | - multidict=6.0.2=py39h5eee18b_0 191 | - mummer=3.23=pl5321hdbdd923_16 192 | - mysql-connector-c=6.1.11=h24aacaa_2 193 | - 
natsort=7.1.1=pyhd3eb1b0_0 194 | - nbformat=5.7.0=py39h06a4308_0 195 | - ncbi-vdb=3.0.0=pl5321h87f3376_0 196 | - ncurses=6.4=h6a678d5_0 197 | - networkx=2.8.4=py39h06a4308_1 198 | - numexpr=2.8.4=py39hd2a5715_1 199 | - numpy=1.25.0=py39heeff2f4_0 200 | - numpy-base=1.25.0=py39h8a23956_0 201 | - oauth2client=4.1.3=py_0 202 | - openblas=0.3.23=pthreads_h855a84d_0 203 | - openjdk=11.0.13=h87a67e3_0 204 | - openssl=1.1.1u=h7f8727e_0 205 | - packaging=23.0=py39h06a4308_0 206 | - pandas=1.5.3=py39h417a72b_0 207 | - pango=1.50.14=hd33c08f_0 208 | - paramiko=2.8.1=pyhd3eb1b0_0 209 | - parasail-python=1.3.4=py39h4e691d4_0 210 | - pbzip2=1.1.13=0 211 | - pcre=8.45=h295c915_0 212 | - pcre2=10.40=hc3806b6_0 213 | - peppy=0.35.6=pyhd8ed1ab_0 214 | - perl=5.32.1=0_h5eee18b_perl5 215 | - perl-apache-test=1.43=pl5321hdfd78af_0 216 | - perl-app-cpanminus=1.7046=pl5321hd8ed1ab_0 217 | - perl-archive-tar=2.40=pl5321hdfd78af_0 218 | - perl-base=2.23=pl5321hdfd78af_2 219 | - perl-carp=1.38=pl5321hdfd78af_4 220 | - perl-class-load=0.25=pl5321hdfd78af_1 221 | - perl-class-load-xs=0.10=pl5321h9f5acd7_6 222 | - perl-class-method-modifiers=2.13=pl5321hdfd78af_0 223 | - perl-common-sense=3.75=pl5321hdfd78af_0 224 | - perl-compress-raw-bzip2=2.201=pl5321h87f3376_1 225 | - perl-compress-raw-zlib=2.105=pl5321h87f3376_0 226 | - perl-constant=1.33=pl5321hdfd78af_2 227 | - perl-cpan-meta-check=0.014=pl5321hdfd78af_1 228 | - perl-cpan-meta-requirements=2.143=pl5321hdfd78af_0 229 | - perl-data-optlist=0.113=pl5321ha770c72_0 230 | - perl-dbi=1.643=pl5321hec16e2b_1 231 | - perl-devel-globaldestruction=0.14=pl5321hdfd78af_1 232 | - perl-devel-overloadinfo=0.007=pl5321hdfd78af_0 233 | - perl-devel-stacktrace=2.04=pl5321hdfd78af_1 234 | - perl-dist-checkconflicts=0.11=pl5321hdfd78af_3 235 | - perl-encode=3.19=pl5321hec16e2b_1 236 | - perl-eval-closure=0.14=pl5321h9f5acd7_6 237 | - perl-exporter=5.72=pl5321hdfd78af_2 238 | - perl-exporter-tiny=1.002002=pl5321hdfd78af_0 239 | - 
perl-extutils-makemaker=7.70=pl5321hd8ed1ab_0 240 | - perl-file-path=2.18=pl5321hd8ed1ab_0 241 | - perl-file-temp=0.2304=pl5321hd8ed1ab_0 242 | - perl-file-which=1.24=pl5321hd8ed1ab_0 243 | - perl-getopt-long=2.54=pl5321hdfd78af_0 244 | - perl-inc-latest=0.500=pl5321ha770c72_0 245 | - perl-io-compress=2.201=pl5321hdbdd923_2 246 | - perl-io-zlib=1.14=pl5321hdfd78af_0 247 | - perl-json=4.10=pl5321hdfd78af_0 248 | - perl-json-xs=2.34=pl5321h4ac6f70_6 249 | - perl-list-moreutils=0.430=pl5321hdfd78af_0 250 | - perl-list-moreutils-xs=0.430=pl5321h031d066_2 251 | - perl-module-build=0.4234=pl5321ha770c72_0 252 | - perl-module-implementation=0.09=pl5321hdfd78af_3 253 | - perl-module-metadata=1.000038=pl5321hdfd78af_0 254 | - perl-module-runtime=0.016=pl5321hdfd78af_2 255 | - perl-module-runtime-conflicts=0.003=pl5321hdfd78af_1 256 | - perl-moo=2.005004=pl5321hdfd78af_0 257 | - perl-moose=2.2202=pl5321hec16e2b_0 258 | - perl-mro-compat=0.15=pl5321hdfd78af_0 259 | - perl-package-deprecationmanager=0.17=pl5321hdfd78af_1 260 | - perl-package-stash=0.40=pl5321h87f3376_1 261 | - perl-package-stash-xs=0.30=pl5321h0b41bf4_0 262 | - perl-parallel-forkmanager=2.02=pl5321hdfd78af_1 263 | - perl-params-util=1.102=pl5321h9f5acd7_1 264 | - perl-parent=0.236=pl5321hdfd78af_2 265 | - perl-pathtools=3.75=pl5321hec16e2b_3 266 | - perl-role-tiny=2.002004=pl5321hdfd78af_0 267 | - perl-scalar-list-utils=1.62=pl5321hec16e2b_1 268 | - perl-storable=3.15=pl5321hec16e2b_3 269 | - perl-sub-exporter=0.988=pl5321hdfd78af_0 270 | - perl-sub-exporter-progressive=0.001013=pl5321hdfd78af_1 271 | - perl-sub-identify=0.14=pl5321hec16e2b_2 272 | - perl-sub-install=0.928=pl5321hdfd78af_3 273 | - perl-sub-name=0.21=pl5321hec16e2b_3 274 | - perl-sub-quote=2.006006=pl5321hdfd78af_0 275 | - perl-test-fatal=0.016=pl5321hdfd78af_0 276 | - perl-try-tiny=0.31=pl5321hdfd78af_1 277 | - perl-types-serialiser=1.01=pl5321hdfd78af_0 278 | - perl-version=0.9924=pl5321hec16e2b_2 279 | - perl-xsloader=0.24=pl5321hd8ed1ab_0 
280 | - perl-yaml=1.30=pl5321hdfd78af_0 281 | - pip=23.1.2=py39h06a4308_0 282 | - pixman=0.40.0=h7f8727e_1 283 | - plac=1.3.4=pyhd3eb1b0_0 284 | - platformdirs=2.5.2=py39h06a4308_0 285 | - pluggy=1.0.0=py39h06a4308_1 286 | - ply=3.11=py39h06a4308_0 287 | - prettytable=3.5.0=py39h06a4308_0 288 | - prodigal=2.6.3=h031d066_6 289 | - protobuf=3.20.3=py39h6a678d5_0 290 | - psutil=5.9.0=py39h5eee18b_0 291 | - pulp=2.7.0=py39hf3d152e_0 292 | - pyasn1=0.4.8=pyhd3eb1b0_0 293 | - pyasn1-modules=0.2.8=py_0 294 | - pycparser=2.21=pyhd3eb1b0_0 295 | - pygments=2.15.1=py39h06a4308_1 296 | - pynacl=1.5.0=py39h5eee18b_0 297 | - pyopenssl=23.0.0=py39h06a4308_0 298 | - pyparsing=3.0.9=py39h06a4308_0 299 | - pyrsistent=0.18.0=py39heee7806_0 300 | - pysam=0.21.0=py39h9abd093_0 301 | - pysftp=0.2.9=pyhd3eb1b0_1 302 | - pysocks=1.7.1=py39h06a4308_0 303 | - pytest=7.3.1=py39h06a4308_0 304 | - python=3.9.16=h7a1cb2a_2 305 | - python-dateutil=2.8.2=pyhd3eb1b0_0 306 | - python-fastjsonschema=2.16.2=py39h06a4308_0 307 | - python-irodsclient=1.1.8=pyhd8ed1ab_0 308 | - python_abi=3.9=2_cp39 309 | - pytz=2022.7=py39h06a4308_0 310 | - pyvcf=0.6.8=py39hde42818_1002 311 | - pyyaml=6.0=py39h5eee18b_1 312 | - r-argparse=2.1.5=r42h6115d3f_0 313 | - r-base=4.2.2=hb87df5d_1 314 | - r-cli=3.3.0=r42h884c59f_0 315 | - r-colorspace=2.0_3=r42h76d94ec_0 316 | - r-crayon=1.5.1=r42h6115d3f_0 317 | - r-digest=0.6.29=r42h884c59f_0 318 | - r-ellipsis=0.3.2=r42h76d94ec_0 319 | - r-fansi=1.0.3=r42h76d94ec_0 320 | - r-farver=2.1.0=r42h884c59f_0 321 | - r-findpython=1.0.7=r42h6115d3f_0 322 | - r-ggplot2=3.3.6=r42h6115d3f_0 323 | - r-glue=1.6.2=r42h76d94ec_0 324 | - r-gtable=0.3.0=r42h6115d3f_0 325 | - r-isoband=0.2.5=r42h884c59f_0 326 | - r-jsonlite=1.8.0=r42h76d94ec_0 327 | - r-labeling=0.4.2=r42h6115d3f_0 328 | - r-lattice=0.20_45=r42h76d94ec_0 329 | - r-lifecycle=1.0.1=r42h142f84f_0 330 | - r-magrittr=2.0.3=r42h76d94ec_0 331 | - r-mass=7.3_57=r42h76d94ec_0 332 | - r-matrix=1.4_1=r42h76d94ec_0 333 | - 
r-mgcv=1.8_40=r42h76d94ec_0 334 | - r-munsell=0.5.0=r42h6115d3f_0 335 | - r-nlme=3.1_157=r42h640688f_0 336 | - r-pillar=1.7.0=r42h6115d3f_0 337 | - r-pkgconfig=2.0.3=r42h6115d3f_0 338 | - r-r6=2.5.1=r42h6115d3f_0 339 | - r-rcolorbrewer=1.1_3=r42h6115d3f_0 340 | - r-rlang=1.0.2=r42h884c59f_0 341 | - r-scales=1.2.0=r42h6115d3f_0 342 | - r-tibble=3.1.7=r42h76d94ec_0 343 | - r-utf8=1.2.2=r42h76d94ec_0 344 | - r-vctrs=0.4.1=r42h884c59f_0 345 | - r-viridislite=0.4.0=r42h6115d3f_0 346 | - r-withr=2.5.0=r42h6115d3f_0 347 | - racon=1.5.0=h21ec9f0_2 348 | - ragtag=2.1.0=pyhb7b1952_0 349 | - re2=2022.04.01=h295c915_0 350 | - readline=8.2=h5eee18b_0 351 | - requests=2.29.0=py39h06a4308_0 352 | - reretry=0.11.8=pyhd8ed1ab_0 353 | - rich=13.3.5=py39h06a4308_0 354 | - rsa=4.7.2=pyhd3eb1b0_1 355 | - s3transfer=0.6.0=py39h06a4308_0 356 | - samtools=1.17=hd87286a_1 357 | - scipy=1.11.3=py39heeff2f4_0 358 | - sed=4.8=h7b6447c_0 359 | - sepp=4.4.0=py39_0 360 | - setuptools=67.8.0=py39h06a4308_0 361 | - seqkit=2.5.1=h9ee0642_0 362 | - six=1.16.0=pyhd3eb1b0_1 363 | - slacker=0.14.0=py_0 364 | - smart_open=5.2.1=py39h06a4308_0 365 | - smmap=4.0.0=pyhd3eb1b0_0 366 | - snakemake=7.21.0=hdfd78af_0 367 | - snakemake-minimal=7.21.0=pyhdfd78af_0 368 | - sniffles=2.0.7=pyhdfd78af_0 369 | - sortedcontainers=2.4.0=pyhd3eb1b0_0 370 | - sqlite=3.41.2=h5eee18b_0 371 | - stone=3.3.1=py39h06a4308_0 372 | - stopit=1.1.2=py_0 373 | - suitesparse=5.10.1=he2db622_2 374 | - sysroot_linux-64=2.17=h57e8cba_10 375 | - tabulate=0.8.10=py39h06a4308_0 376 | - tar=1.34=hb2e2bae_1 377 | - tbb=2021.8.0=hdb19cb5_0 378 | - throttler=1.2.1=pyhd8ed1ab_0 379 | - tk=8.6.12=h1ccaba5_0 380 | - tktable=2.10=h14c3975_0 381 | - tomli=2.0.1=py39h06a4308_0 382 | - toposort=1.10=pyhd8ed1ab_0 383 | - traitlets=5.7.1=py39h06a4308_0 384 | - tzdata=2023c=h04d1e81_0 385 | - ubiquerg=0.6.2=pyhd8ed1ab_0 386 | - ucsc-fatotwobit=447=h954228d_0 387 | - ucsc-twobitinfo=447=h954228d_0 388 | - uritemplate=4.1.1=pyhd8ed1ab_0 389 | - 
urllib3=1.26.16=py39h06a4308_0 390 | - veracitools=0.1.3=py_0 391 | - verkko=1.3.1=h64afbab_0 392 | - wcwidth=0.2.5=pyhd3eb1b0_0 393 | - wfmash=0.10.3=hea8008d_2 394 | - wget=1.21.4=h91b91d3_1 395 | - wheel=0.38.4=py39h06a4308_0 396 | - winnowmap=2.03=h43eeafb_2 397 | - wrapt=1.14.1=py39h5eee18b_0 398 | - xorg-kbproto=1.0.7=h7f98852_1002 399 | - xorg-libice=1.0.10=h7f98852_0 400 | - xorg-libsm=1.2.3=hd9c2040_1000 401 | - xorg-libx11=1.8.6=h8ee46fc_0 402 | - xorg-libxt=1.3.0=hd590300_0 403 | - xorg-xextproto=7.3.0=h0b41bf4_1003 404 | - xorg-xproto=7.0.31=h27cfd23_1007 405 | - xz=5.2.10=h5eee18b_1 406 | - yahs=1.2a.2=he4a0461_2 407 | - yaml=0.2.5=h7b6447c_0 408 | - yarl=1.8.1=py39h5eee18b_0 409 | - yte=1.5.1=pyha770c72_2 410 | - zlib=1.2.13=hd590300_5 411 | - zstd=1.5.5=hc292b87_0 412 | -------------------------------------------------------------------------------- /clust.json: -------------------------------------------------------------------------------- 1 | #snakemake --cluster-config clust.json --cluster '{cluster.account}' 2 | { 3 | "__default__" : 4 | { 5 | "account" : "sbatch -N 1 -n 1 -c 96 -p Partition", 6 | "jobs" : "59" 7 | }, 8 | "hifi_fastp" : 9 | { 10 | "account" : "sbatch -N 1 -n 1 -c 16 -p Partition", 11 | "jobs" : "59" 12 | }, 13 | "ont_fastp" : 14 | { 15 | "account" : "sbatch -N 1 -n 1 -c 16 -p Partition", 16 | "jobs" : "59" 17 | }, 18 | } 19 | -------------------------------------------------------------------------------- /conf_ck.yaml: -------------------------------------------------------------------------------- 1 | HiFi_reads_merge: 2 | SPART_PATH/example/hifi/HiFi.fastq 3 | ONT_reads_merge: 4 | SPART_PATH/example/ont/ONT.fastq 5 | mitochondrion: 6 | SPART_PATH/example/mt/mitochondrion.fasta 7 | chloroplast: 8 | SPART_PATH/example/cp/chloroplast.fasta 9 | hic_dir: 10 | SPART_PATH/example/hic 11 | SPART_dir: 12 | SPART_PATH 13 | hic_enzyme: 14 | GATC 15 | hic_enzyme_ligation_site: 16 | GATCGATC 17 | verkko_assemble: 18 | 
SPART_PATH/example/verkko/verkko.fasta 19 | pcrfree_r1: 20 | SPART_PATH/example/pcr/PCRFREE_1.fq 21 | pcrfree_r2: 22 | SPART_PATH/example/pcr/PCRFREE_2.fq 23 | google_deepvariant_latest-gpu_sif: 24 | google_deepvariant_latest_gpu_sif 25 | DIR: 26 | SPART_PATH/example/hifi 27 | DIRont: 28 | SPART_PATH/example/ont 29 | WORKDIR: 30 | SPART_PATH/example/test/ 31 | -------------------------------------------------------------------------------- /contacts.md: -------------------------------------------------------------------------------- 1 | Shoucheng Liu (liusc_work@163.com) 2 | 3 | Xiaopeng Li (xiaopeng.li@pku-iaas.edu.cn) 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/README.md: -------------------------------------------------------------------------------- 1 | # Install 2 | git clone https://github.com/liushoucheng/SPART.git 3 | 4 | cd SPART 5 | 6 | conda env create -f SPART.yaml 7 | 8 | conda activate spart 9 | 10 | # Dependencies 11 | 12 | List of tools assumed loadable or accessible with no path are: 13 | 14 | * [Bionano DLS map]( https://bionano.com) 15 | 16 | * [HiC-Pro v3.1.0]( https://github.com/nservant/HiC-Pro) 17 | 18 | * [_submit_telomere.sh]( https://github.com/VGP/vgp-assembly/blob/master/pipeline/telomere/_submit_telomere.sh) 19 | 20 | * [Medaka]( https://anaconda.org/bioconda/medaka) 21 | 22 | * [racon]( https://anaconda.org/bioconda/racon) 23 | 24 | * [hisat2]( https://github.com/DaehwanKimLab/hisat2) 25 | 26 | * [DeepVariant v1.5.0-gpu]( https://github.com/google/deepvariant) 27 | 28 | * [PEPPER-Margin-DeepVariant v0.8-gpu]( https://github.com/kishwarshafin/pepper) 29 | 30 | * [hap.py v0.3.15]( https://github.com/Illumina/hap.py) 31 | 32 | * [vcf_merge_t2t.py](https://github.com/kishwarshafin/T2T_polishing_scripts/blob/master/polishing_merge_script/vcf_merge_t2t.py) 33 | 34 | # Running pipeline with snakemake 35 | Exclude Verkko,Bionano DLS Map,Telomere determination and patch,Centromeric 
region analysis,Variant calls and Evaluation 36 | 37 | sed -i "s#^ SPART_PATH# ${PWD}#g" conf_ck.yaml 38 | 39 | HiC_enzyme=" GATC" 40 | 41 | sed -i "s#^ hic_sca_enzyme# ${HiC_enzyme}#g" conf_ck.yaml 42 | 43 | HiC_ligation_site=" GATCGATC" 44 | 45 | sed -i "s#^ hic_sca_ligation_site# ${HiC_ligation_site}#g" conf_ck.yaml 46 | 47 | snakemake -s SPART.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs $threads --rerun-incomplete --restart-times 1 -np --rulegraph |dot -Tpng > rule.png 48 | 49 | configfile:The config file can be used to define a dictionary of configuration parameters and their values. 50 | 51 | cluster-config:A JSON or YAML file that defines the wildcards used in 'cluster'for specific rules. 52 |
53 | 54 |
55 | 56 | ## Output files 57 | please see the complete [documentation]( https://github.com/liushoucheng/SPART/tree/main/exmple). 58 | 59 | -------------------------------------------------------------------------------- /docs/source/README1.md: -------------------------------------------------------------------------------- 1 | # 00_Contig screen 2 | ## Fastp :was used to filter adapter sequences, primers and other low quality sequence from raw sequencing reads. 3 | SPART/00_Contig_screen/fastp.sh $HiFi_reads $ONT_reads 4 | ## Hifiasm 5 | SPART/00_Contig_screen/hifiasm.sh $HiFi_reads $ONT_reads $output_prefix 6 | ## Verkko 7 | SPART/00_Contig_screen/verkko.sh $output_prefix $HiFi_reads $ONT_reads $threads $memory 8 | ## Flye 9 | SPART/00_Contig_screen/flye.sh $ONT_reads $output_prefix $threads 10 | ## Remove MT & CP 11 | SPART/00_Contig_screen/rm_mt_cp.sh $mitochondrion $chloroplast $ref 12 | # 01_Contig scaffolding 13 | ## Bionano 14 | SPART/01_Contig_scaffolding/Bionano_DLS_map.sh threads bnx ref_cmap prefix xml Bio_dir cluster_xml ref bio_camp merge_xml RefAligner 15 | ## Hi-C 16 | SPART/01_Contig_scaffolding/HiC-Pro.sh ref ref_prefix hicpro_data hicpro_config hicpro_outdir 17 | 18 | SPART/01_Contig_scaffolding/yahs.sh enzyme ref bed/bam/bin profix 19 | # 02_Gap patching 20 | SPART/02_Gap_patching/wfmash_ragtag.sh prefix ref region 21 | 22 | ## Manual operation 23 | 24 | cd ragtag_output 25 | 26 | perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5 27 | 28 | **Manually editing the ragtag.patch.debug.filtered.paf file.Keep the high-quality contig and preserve the location of the only high confidence match in ragtag.patch.debug.filtered.paf that matches the sequence at both ends of the gap.** 29 | 30 | perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf -start seq00000000 -end seq00000001 -o test.agp 31 | 32 | **Test.agp is merged into ragtag.patch.agp and fasta is 
generated.** 33 | 34 | ## telomere patching 35 | We used _submit_telomere.sh in ONT reads >100kb.ONT reads with telomere sequence mapping to this locus based on minimap2 alignments were manually identified. The longest was selected as template , all others aligned to it and polished with Medaka: 36 | 37 | medaka -v -i ONT_tel_reads.fasta -d longest_ont_tel.fasta -o ont_tel_medaka.fasta 38 | 39 | Telomere signal in all HiFi reads was identified with the commands: 40 | 41 | _submit_telomere.sh hifi_reads.fasta 42 | 43 | Additional HiFi reads were recruited from a manual analysis. We looked for trimmed tips that could extend. All reads had telomere signal and were aligned to the medaka consensus and polished with Racon with the commands: 44 | 45 | minimap2 -t16 -ax map-pb ont_tel_medaka.fasta hifi_tel.fasta > medaka.sam 46 | 47 | racon hifi_tel.fasta medaka.sam ont_tel_medaka.fasta > racon.fasta 48 | 49 | Finally, the polished result was patched into the assembly with ragtag patch or manually patched. 50 | ### Citation 51 | https://github.com/marbl/CHM13-issues/blob/main/error_detection.md. 52 | ## Centromeric region analysis 53 | 54 | SPART/02_Gap_patching/Centromeric_region_analysis.sh workdir FASTA INDEX prefix CHIP1 CHIP2 threads 55 | 56 | # 03_Polishing 57 | SPART/03_Polishing/calsv_snv.sh workdir ref threads 58 | # 04_Evaluation 59 | ## BUSCO 60 | SPART/04_Evaluation/BUSCO.sh ref prefix 61 | ## mapping rates & coverages 62 | SPART/04_Evaluation/mapping_rates_coverages.sh hybrid_bam single_bam ont_bam 63 | ## LTR 64 | SPART/04_Evaluation/ltr.sh ref prefix 65 | ## QV 66 | SPART/04_Evaluation/qv.sh query ref 67 | ## BACs 68 | SPART/04_Evaluation/bac.sh bac_reads ref_chr 69 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'SPART' 10 | copyright = '2023, Shoucheng Liu' 11 | author = 'Shoucheng Liu' 12 | 13 | # -- General configuration --------------------------------------------------- 14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 15 | 16 | extensions = ['recommonmark', 'sphinx_markdown_tables'] 17 | 18 | templates_path = ['_templates'] 19 | exclude_patterns = [] 20 | 21 | 22 | 23 | # -- Options for HTML output ------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 25 | 26 | import sphinx_rtd_theme 27 | html_theme = "sphinx_rtd_theme" 28 | html_theme_path = ['/home/liusc/software/miniconda3/lib/python3.9/site-packages'] 29 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. SPART documentation master file, created by 2 | sphinx-quickstart on Thu Oct 12 17:50:51 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | SPART 7 | ================================= 8 | SPART, a Semi-automated pipeline for assembling reference sequence of telomere-to-telomere (T2T). 9 | 10 | .. image:: pipeline.jpg 11 | 12 | 13 | Quick install and start 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .. toctree:: 16 | :maxdepth: 2 17 | 18 | README 19 | 20 | Run step by step 21 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 22 | .. 
toctree:: 23 | :maxdepth: 2 24 | 25 | README1 26 | -------------------------------------------------------------------------------- /docs/source/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/docs/source/pipeline.jpg -------------------------------------------------------------------------------- /docs/source/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-autobuild==2021.3.14 2 | sphinx-markdown-tables==0.0.17 3 | sphinx-rtd-theme==1.3.0 4 | sphinxcontrib-applehelp==1.0.7 5 | sphinxcontrib-devhelp==1.0.5 6 | sphinxcontrib-htmlhelp==2.0.4 7 | sphinxcontrib-jquery==4.1 8 | sphinxcontrib-jsmath==1.0.1 9 | sphinxcontrib-qthelp==1.0.6 10 | sphinxcontrib-serializinghtml==1.1.9 11 | recommonmark==0.7.1 12 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Running pipeline with snakemake(Exclude Verkko,Bionano DLS Map,Telomere determination and patch,Centromeric region analysis,Variant calls and Evaluation): 2 | sed -i "s#^ SPART_PATH# ${PWD}#g" conf_ck.yaml 3 | 4 | snakemake -s SPART.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs $threads --rerun-incomplete --restart-times 1 -np --rulegraph |dot -Tsvg > rule.svg 5 | 6 | **configfile**:The config file can be used to define a dictionary of configuration parameters and their values. 7 | 8 | **cluster-config**:A JSON or YAML file that defines the wildcards used in 'cluster'for specific rules. 9 | # Output files 10 | **Workdir/fastp**:Filtered adapter sequences, primers and other low quality sequence from raw HiFi and ONT sequencing reads. 
11 | 12 | **Workdir/hifiasm_hybrid/hybrid.all.asm.p_ctg.fa**:Hifiasm generated a preliminary contig genome assembly. 13 | 14 | **Workdir/flye/assembly.fasta**:Flye generated the ONT UL reads assembly. 15 | 16 | **Workdir/hifiasm_hybrid/hybrid.remove_cp_mt.fa**:Contigs with at least 50% of their bases covered by alignments were considered to be chloroplast or mitochondria genome sequences and were removed from the assembly. 17 | 18 | **Workdir/hic_hybrid/hic_hybrid.bam**:Hi-C data were classified as valid or invalid interaction pairs. 19 | 20 | **Workdir/yahs_hybrid/yahs_hybrid.fa**:Only valid interaction pairs were retained for subsequent assembly and scaffolding into chromosomes. 21 | 22 | **Workdir/patch_flye/patch_single_hybrid_flye.fa**:Assembly gaps in chromosome scaffolds were directly filled by the corresponding Flye. 23 | 24 | **Workdir/patch_verkko/patch_single_hybrid_flye_verkko.fa**:Assembly gaps in chromosome scaffolds were directly filled by the corresponding Verkko. 25 | 26 | **Workdir/hybrid/hybrid.bam**:Alignment data file between patch_single_hybrid_flye_verkko.fa and HiFi reads. 27 | 28 | **Workdir/hybrid_hifi_pcr/pcr.bam**:Alignment data file between patch_single_hybrid_flye_verkko.fa and PCR-FREE reads. 29 | 30 | **Workdir/hybrid_hifi_pcr/hybrid.bam**:Merged Workdir/hybrid/hybrid.bam and Workdir/hybrid_hifi_pcr/pcr.bam. 31 | 32 | **Workdir/ont_merge/q10l120k.bam**:Alignment data file between patch_single_hybrid_flye_verkko.fa and ONT reads. 33 | 34 |
35 | 36 |
37 | -------------------------------------------------------------------------------- /example/pcr/PCRFREE_1.fq: -------------------------------------------------------------------------------- 1 | @E100053086L1C001R0020000056/1 2 | ACCAGGAATATCAATGAACCCCATTCTTGCAATTGCTCAGGATACCCTCTTTTAGCTGCTAGGTCTATTTCTTAGTTCAAGATCCCTCTTACTAACTGGAATAAAAGAATTAGTAGATCTGTTCCGCCCAAAATGGGAATGGGCGCTAGG 3 | + 4 | GGFGGGGGGGFGGGGGFGGGFHCFGFGFGGFGGEFFFGGGEGEGFGGGFGEGGGGGEGGEGGG>GGEGFFFGFEGGFFGEFEGFGGFGFGFGGFGGGGFFGGGFGFGGBGGG 5 | @E100053086L1C001R0020000058/1 6 | CAAAGCTAGGATGTCGGGTCTCGTTATGTAGACCTTTAGCCCACTGGCACTTAGATATTGCACTCTCCTCGGCAACTTCATCAAGGTCCTATAACATGTTCCTATAGTAGCTAGGCGGCGCTTTATTGTTTGCCACCTGCCTCATCTTGG 7 | + 8 | FGGFFGGGGFGGGFGGFGFGFGGGGGGGFGGFGGGFFGGFGGFGGGFGGGGGGFGGGGGGGGGGGGGGGGFGFGGGGGGGFGGFGGGGGGGGGGGGGGGGGGGGGGGEGGGGGFGFGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG 9 | @E100053086L1C001R0020000070/1 10 | AAAGCACGGCACCTCCGAGTTCTGCACACGTTCGGCTCGGTGACGTCCTCGCCTTCTCGATCCAACAAGAGGGGCGAAGTAGTAGATGAGTTCCGGCAGCACGACGGCGTGGTGACGGTGTTGGTGAAGAACAATCTTCGCAGGGCTTCG 11 | + 12 | FFGGGGGGFGGGGGFGGFFFFGGGFGGFGGFGGEGGFGGFFFFGGFFFGGGFGFGGGGAEGGFGGFGGGFFFG=GFFFGFGGFFGFFEFFFFGGGFFFGFFGGFGDFFFFGFFFCGGGFGFGGFEFFFGFFGGFGGFGFFGEDGFFGFGF 13 | @E100053086L1C001R0020000093/1 14 | TCGAACCCTTGCAGTTGAAGAAAGAGCCTACTCCTGGTACTGTATATTAGTGATAGGGGTGTATACAAAGTACACGTGAATACCAGTCATTGTGCGTGTGTGTATACTATCGACGAACTAGCCCCCAAGCTTTTATAACATACTGGGGGC 15 | + 16 | FFGFFGFGFFFEFFFFFFFFFFFFFGFGFFGFFFFFFFGFGGFGFGFGGFFFFFGFDFGFGFFFGFFFFFFGFGFEFFFFFFGFGFFFGFFGFFFFFGFGFGFGFFGFGGFFEFFFFGFFFFGGFFFFFFFFGFGGFGFGGGGGFFFDFF 17 | @E100053086L1C001R0020000112/1 18 | CCCTAGATATTTAATGACTGATGGTGGTTCACATTTTATTCATGGTGCTTTCCGTAAAATGCTTGCTAAATATGATGTTAATCATAGAATTGCATCTCCTTATCACCCTCAGTCTAGTGGTCAAGTAGAATTGAGTAATAGAGAACTCAA 19 | + 20 | FGFGGFGGHGGGGGGGFFGFGFGFFGFFGGGGGGGGGGGGGGFGFFGGGGGGGGGGGGFFFGGGGFGGGGGGGGGGGGGGGGGGGGFFGGGFGGGGGGGGGGGGGGGGGGGFGGGGFGFFGFGGGGGFGGGGGGGGGGGGDGEFFGGGGG 21 | @E100053086L1C001R0020000131/1 22 | 
TAAGTACTCTACTACTACTATATTAATTGGAGGCACATATTAGGTTTTATATTTGTTTACGTGTATGCCCCGAGACCTTCCAACTAGACATGTCTTTCTCTCCAGGTCGCACTTTGTATCGTTCATACCACTAGTACAGTAGTACTACTA 23 | + 24 | GGFGFGGGGGGGFGFFGGGGGGGGGFFGG7DG@GGFGGGGGGEFEGGGFGGGGGGFGGGGFGGFGGGGGGGGEDFFGGGGEGFGGGFGGGFGFGGFGGGFGGFGEGAGGGEGGGG*FEGG@FGGEGGGGFGGG5GGGGFGGGGGGGGGFG 25 | @E100053086L1C001R0020000150/1 26 | AGTGATTTTCACGATTAACGAACCCTGGGATGCGTTTACGTGTCGGTCATCAACACTTGTAGTTTTGGCTGATTCTGGCCCGTTTCTTGGACTATTACTCACTGATTTGGGGTCCGTGAGTGAGTTCCATGATTTTCGAACCCCGGGGTG 27 | + 28 | GEFFFFFFGGGFFFFGGFFGGFGGFFFFFEFGGFFFGGGGGG9GFGFGGFGFFGGGGFGFGGGGFGFDFGGEGFGGGFGFGFFFGGGGGAEGGGGGGGGGGGFGFGGGGEFFFFGGEGFGGGFFFGGGGGGGGFGGGFFFGGGG?FFGFG 29 | @E100053086L1C001R0020000158/1 30 | CAGTTCCACAGAGTCATTCCAGGGAAGAAGGCAAAATCACTCGGTCAGATTGCTCTGGATGTGGTTTTTGGTGACTCAAAGAATTTTCGAAAGGAGAAACTGGCGTTTGAAGTGGTAGACTTTCACAATGCCTATCATGCTATCCTCGGC 31 | + 32 | FFGGGGGGGGFFFFGGGGGGGFFEFGFFFGGGGGFFGGGGGGGFFGGFFFGGFGGGGFGGGGFFGGGGGGFGGFGGGGGFFGGGGGGGFGGGFFFGGGDFGGFEGGFGG=G@EGFFFEFGGGGFGGGGFGFGGGFGGFGGGDEGGGGDCF 33 | @E100053086L1C001R0020000169/1 34 | ATGTTCTCGCTCCTGCTCTCTTTTACTGCATTAAGACAACGCGATTCAAACTGTTGTGTGCTACGGTAGTTGAACCCATTTCCTCTGCATGACCTGTCATTGCCACAGTAACTAGATGAAACCCACTAGCATGTGTAGGAGTTGATTGAG 35 | + 36 | GDGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGFGGEGGGGEGGFG>GGGFGGGGEGGGGGFGGGGGGGGGGGGGGGGGGGGGGFGGGDGGGGGF1@GFGCG?DGGFEFGGGFGGG?GGGCGGGGF?F/GGGGFGGFF 37 | @E100053086L1C001R0020000179/1 38 | TAAATAGGACTAGCCACCATAAGGTAGAGGCATCTAGAGACCAACCAGAGAGAGAGAAAAAAGCGAGTGAACTCACCCAAGCAGTTCATCACACCATCTCAAGAACAGCCCCTCGCGAGGCTGTTCTTCCTTTGTACTGTTCACTATCAG 39 | + 40 | GGGGGGGGFFGFGGGGGGGGGGGFFGGFFFGGGFFGGFF>G@GGGGGEGGFGGGGGFGFFFFFGGGFFGFFFGGGGGGGFGGGGFGGGGGGGGGGGGGGGGDGFEGGFGGGGGGGFFEGDGGAFGGGGGGGGGBGFGGGGFGFGG@FCFF 41 | -------------------------------------------------------------------------------- /example/pcr/PCRFREE_2.fq: -------------------------------------------------------------------------------- 1 | @E100053086L1C001R0020000056/2 2 | 
ATGGAATGAACTTATAATCTGATGATCGAGTCGATTCCATGATTATAAGTTCATAACCCTAGCGCCCATTCCCATTTTGGGCGGAACAGATCTACTAATTCTTTTATTCCAGTTAGTAAGAGGGATCTTGAACTAAGAAATAGACCTAGC 3 | + 4 | FGHFGGGGGGGIGFGGGGGGGGFGGGGGGGGGGGBFFFGGGGFGFFGGGGGGHGGGGFGGGGGGFGFGGGGGGFGGGGGGFFGGEDGGGGFEHGGGGFGGFGGFGGGGGGGGGEDGGFFGGGGGGGGGGGAGGGHGAEEGGGGGGFHFGG 5 | @E100053086L1C001R0020000058/2 6 | GTTAGCAGATCGGTTTTGGCATGACCTTTGTGCCAGGCCATATCTTTGTTTTCAGCAGCATCATCCCCTTCACTGATTCGACCGACCATCTAGGACAAGTCGAGAATTTTGCTCCTGGACAAATCATTAGGTTTGGCAACTTGGAGTATG 7 | + 8 | GGGGGGGGGGGFFFGGGGFGGGGGGGGGGGFGGGGFFGGGGGGGGGGGGGGGGGGGGGFGGGGGGGFGGGGGGGGFGGGGFGGGFGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGFGGGGFGGGFGGFGGGGGGFGGGGGGFFGGGGG 9 | @E100053086L1C001R0020000070/2 10 | GGGAACCCTGGCGTCTGGCCCTGGAGTCCGAGAAGGACTCTTGCCTTTCGGGTGAAACCGACTTTGTGGAGGCTTTTACTCCAAGTTTCGACCCCAAGGCTCAACATATAAATAGAGGGGTAGGGCTAGCACCCAAGGCACATCAAGAAA 11 | + 12 | FFGFFFGGGGFGFG?GGFFGGEGFEGFFFFFGGFGFFGGGEGGFGGGGFGFFGGBFEFFFEGGGGFEGFEGF9EGGGGFGGGFGECFGGFFFGEG6FFEFEGFFFGGGGGGGGGFFF?FFFFFFFFDFFGGGGGFFF?EGFGFGGFFFEF 13 | @E100053086L1C001R0020000093/2 14 | CATGAAGACTCATGGGTCGGCTCCATCCTGCACCACATATAGGTTGGACCTAGATGCGTTATAGGAAGACCCATGTGGAGGACTCGGCGTACAACACGACGCTACTAGAGTCGCGGAGGACTCTATCCCCACTGGTGATAAGCCGACTAT 15 | + 16 | FGGGFFFFFGFGGFFFFFBFFFGFGFFGFGFFFGFFGFGFFFFFFFFFFFGFFFGGFGFFGFFFFFFFFEGGFGGFFFFFFEFGGFFCFFFFFFGFFGEFFFFGFGFFFFEFDFFADFFFFGEFFFFFFGFGGEEFFDFFFFEEDEFDGF 17 | @E100053086L1C001R0020000112/2 18 | GAGAGGTAAGTGACATGCTTTTCCATAAACCATTTTATACGGAGACATACCCATAGGATTTTTATATGCAGTTCTATAGGCCCATAATGCATCATCAAGTTTCTTGGGCCAATTCTTTCTAGACCTATTAACAGTCTTTTGCAAAATTAA 19 | + 20 | GGGGFBGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGFGGGGGGGGGG?GGGGGGGGGGGGGFGGGGGGGGGGGGG 21 | @E100053086L1C001R0020000131/2 22 | GTTTTTATTTCTAATGAATGCTAGCGTTTCAACTCTTACGACGAAGGAGTGCCAAACACACGGCACGTGCTGGATGTCGCACACGTCAGCGAAAGGAGTGGGACATGAACAGCGTGGTAGAGCGTCTCCACTTCTGGGTCGGAGACCACA 23 | + 24 | 
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGFGGGGG 25 | @E100053086L1C001R0020000150/2 26 | TTCAACAACTATGGTGATCGCTTCGGAACCCCAAAACGGTGAGTAATAGTGCACGAAACGGGTCAGAACTGGCCAAAACTTCGAGTGTTGGTGACCGACACGTAAGCACACATTGGGGTTCGACAACCATGGAAATCGCTTTGTGACCCC 27 | + 28 | GGFFFFGGFGFFFEFGFFFFFFFFFFFEFFFFFFDDFECFFEDFFFFFFFFCFFFFFFFFBCFEFEDFFFFECFFFEEFFFFEFCEFFFGBFFDFFFFFFFFFEEGFFFGFGGFF?FEEFFFDDEFFGFFFCFE@FFFFGEGGFG7FFFG 29 | @E100053086L1C001R0020000158/2 30 | GTTGGCTTCTTGGATTGGAGCAAATCAATCACATCTGCATTTCTCTTGTATTCTTCCAGTTCCACCATGGCCATCTGCTCATCAGCAATTTTGGATCCTTGCTAAACGCACTCGGCTCTTTGCCTATTCCATGTAATTATGATTACGCCC 31 | + 32 | FFFFEFGGEGGGCGGGFFFFFGEFFFGFFFFFGGGGFFGFFGFGFGGFFGGGFGGGGGGGGGFGGGFFGCFGFGFGGFGGGGFGFGGFFFGGEFFGFGGGFFGGFFGFFGFGGFDDGGGGGGFFGFGGGGGGGGGFFGGFGFGGFGBFGG 33 | @E100053086L1C001R0020000169/2 34 | CCAAATCGTGTTCACCACACCACTCTCATTACCGACGTAATCATTTCACTCCAGCCCATCACCCAGATGAACCAGACCTGACACGACTCTAAGCATAGCAGGCATAGCAAGGTAGGAACAACACATACATATGGCTCAATCAACTCCTAC 35 | + 36 | GGGGGGGGGGGGFGGFGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGFGGFGGGGGFGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGFFGGGGGGGFGFFFGFGGGGGGGFGGGGGGGGGGFGGDG 37 | @E100053086L1C001R0020000179/2 38 | TCTGTCAGACCCTTGAGGTTCGAACACTGGGGTGCACACGAAGATCTCTCCCCTACCAGCTCACGTCTCGAAGTCTCGCAAAGATCTAAGCAAGAAAGATGAACACATAAGGGACACGAGATTTATACTAGTTCAGGCCACCATTGTGGT 39 | + 40 | GGGGGGGGGGGGGGGCGFGGGGFGGGGGGFFEGGGGGGFFFGGFGGGGGGGGGGGGGGFGGGGFGGGGGGGFFGGGGGGGGFFFGGGGGGGGGFGGGGGGGGGGGGGGGFFEFFFGGFGGGGGGGGGGGGFGGGGGEFGDGGGGGGGGFG 41 | -------------------------------------------------------------------------------- /example/rule.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | snakemake_dag 11 | 12 | 13 | 14 | 0 15 | 16 | final 17 | 18 | 19 | 20 | 1 21 | 22 | minimap_hifi_filter_pcr_merge 23 | 24 | 25 | 26 | 1->0 27 | 28 | 29 | 30 | 31 | 32 | 2 33 | 34 | minimap_hifi_sort_filter 35 | 36 
| 37 | 38 | 2->1 39 | 40 | 41 | 42 | 43 | 44 | 3 45 | 46 | minimap_hifi_sort 47 | 48 | 49 | 50 | 3->2 51 | 52 | 53 | 54 | 55 | 56 | 4 57 | 58 | minimap_hifi 59 | 60 | 61 | 62 | 4->3 63 | 64 | 65 | 66 | 67 | 68 | 5 69 | 70 | hifi_fastp 71 | 72 | 73 | 74 | 5->4 75 | 76 | 77 | 78 | 79 | 80 | 11 81 | 82 | hifiasm 83 | 84 | 85 | 86 | 5->11 87 | 88 | 89 | 90 | 91 | 92 | 6 93 | 94 | patch_verkko 95 | 96 | 97 | 98 | 6->4 99 | 100 | 101 | 102 | 103 | 104 | 14 105 | 106 | pcr_free 107 | 108 | 109 | 110 | 6->14 111 | 112 | 113 | 114 | 115 | 116 | 18 117 | 118 | minimap_ont 119 | 120 | 121 | 122 | 6->18 123 | 124 | 125 | 126 | 127 | 128 | 7 129 | 130 | patch_flye 131 | 132 | 133 | 134 | 7->6 135 | 136 | 137 | 138 | 139 | 140 | 8 141 | 142 | yahs 143 | 144 | 145 | 146 | 8->7 147 | 148 | 149 | 150 | 151 | 152 | 9 153 | 154 | hicpro 155 | 156 | 157 | 158 | 9->8 159 | 160 | 161 | 162 | 163 | 164 | 10 165 | 166 | rm_mt_cp 167 | 168 | 169 | 170 | 10->8 171 | 172 | 173 | 174 | 175 | 176 | 10->9 177 | 178 | 179 | 180 | 181 | 182 | 11->10 183 | 184 | 185 | 186 | 187 | 188 | 12 189 | 190 | ont_fastp 191 | 192 | 193 | 194 | 12->11 195 | 196 | 197 | 198 | 199 | 200 | 13 201 | 202 | flye 203 | 204 | 205 | 206 | 12->13 207 | 208 | 209 | 210 | 211 | 212 | 12->18 213 | 214 | 215 | 216 | 217 | 218 | 13->7 219 | 220 | 221 | 222 | 223 | 224 | 14->1 225 | 226 | 227 | 228 | 229 | 230 | 15 231 | 232 | minimap_ont_sort_filter_merge 233 | 234 | 235 | 236 | 15->0 237 | 238 | 239 | 240 | 241 | 242 | 16 243 | 244 | minimap_ont_sort_filter 245 | 246 | 247 | 248 | 16->15 249 | 250 | 251 | 252 | 253 | 254 | 17 255 | 256 | minimap_ont_sort 257 | 258 | 259 | 260 | 17->16 261 | 262 | 263 | 264 | 265 | 266 | 18->17 267 | 268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /pic/pipeline.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/pic/pipeline.jpg -------------------------------------------------------------------------------- /pic/rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liushoucheng/SPART/8b2d0cf778a446c1e24dbdc772e914262cff5930/pic/rule.png --------------------------------------------------------------------------------