├── .gitignore ├── README.md ├── Snakefile ├── bin ├── filter_sites.py ├── group_pileup.py ├── join_pileup.py └── select_sites.py ├── config.yaml ├── data ├── test1_R1.fq.gz ├── test1_R2.fq.gz ├── test2_R1.fq.gz ├── test2_R2.fq.gz ├── test3_R1.fq.gz └── test3_R2.fq.gz └── docs └── flow.svg /.gitignore: -------------------------------------------------------------------------------- 1 | .snakemake/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.11046885.svg)](https://doi.org/10.5281/zenodo.11046885) 2 | 3 | # m5C-UBSseq 4 | 5 | ## Changelog 6 | 7 | - 4/23/2024: rewrite code using polars 8 | 9 | ## workflow 10 | 11 | [![](./docs/flow.svg)](https://github.com/y9c/m5C-UBSseq) 12 | 13 | ## Citation 14 | 15 | - cite this software 16 | 17 | ```BibTex 18 | @software{chang_y_2024_11046885, 19 | author = {Chang Y}, 20 | title = {y9c/m5C-UBSseq: V0.1}, 21 | publisher = {Zenodo}, 22 | version = {v0.1}, 23 | doi = {10.5281/zenodo.11046885}, 24 | url = {https://doi.org/10.5281/zenodo.11046885} 25 | } 26 | ``` 27 | 28 | - cite the method 29 | 30 | ```BibTex 31 | @article{dai_ultrafast_2024, 32 | title = {Ultrafast bisulfite sequencing detection of 5-methylcytosine in {DNA} and {RNA}}, 33 | url = {https://www.nature.com/articles/s41587-023-02034-w}, 34 | doi = {10.1038/s41587-023-02034-w}, 35 | author = {Dai, Qing and Ye, Chang and Irkliyenko, Iryna and Wang, Yiding and Sun, Hui-Lung and Gao, Yun and Liu, Yushuai and Beadell, Alana and Perea, José and Goel, Ajay and He, Chuan}, 36 | date = {2024-01-02}, 37 | } 38 | ``` 39 | 40 |   41 | 42 |

48 | Copyright © 2021-present Chang Y
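## Usage

A minimal way to launch the pipeline, assuming Snakemake >= 8 is installed and the tool/reference paths in `config.yaml` have been edited for your system (this invocation is an illustration, not part of the original documentation):

```bash
# run from the repository root; outputs are written into ./workspace
snakemake -s Snakefile --cores 16
```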

52 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | from snakemake.utils import min_version 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | min_version("8.0") 6 | 7 | 8 | configfile: Path(workflow.basedir) / "config.yaml" 9 | 10 | 11 | workdir: "workspace" 12 | 13 | 14 | BIN = config["path"] 15 | REF = config["reference"] 16 | 17 | CUSTOMIZED_GENES = [os.path.expanduser(i) for i in config.get("customized_genes", [])] 18 | WITH_UMI = config.get("library", "") in ["INLINE", "TAKARAV3"] 19 | MARKDUP = config.get("markdup", False) 20 | 21 | 22 | SAMPLE2DATA = defaultdict(dict) 23 | SAMPLE2LIB = defaultdict(dict) 24 | GROUP2SAMPLE = defaultdict(list) 25 | for s, v in config["samples"].items(): 26 | if v.get("treated", True): 27 | # set default group as sample name, if not specified 28 | GROUP2SAMPLE[v.get("group", s)].append(s) 29 | SAMPLE2LIB[s] = v.get("library", config.get("library", "")) 30 | for i, v2 in enumerate(v.get("data", []), 1): 31 | r = f"run{i}" 32 | SAMPLE2DATA[s][r] = {k3: os.path.expanduser(v3) for k3, v3 in v2.items()} 33 | 34 | 35 | INTERNALDIR = Path("internal_files") 36 | TEMPDIR = Path(".tmp") 37 | 38 | if os.environ.get("TMPDIR") is None: 39 | os.environ["TMPDIR"] = str(TEMPDIR) 40 | 41 | 42 | envvars: 43 | "TMPDIR", 44 | 45 | 46 | rule all: 47 | input: 48 | expand("report_reads/mapped/{sample}.tsv", sample=SAMPLE2DATA.keys()), 49 | [ 50 | ( 51 | INTERNALDIR / f"discarded_reads/{sample}_{rn}_R1.unmapped.fq.gz" 52 | if len(v) == 1 53 | else [ 54 | INTERNALDIR / f"discarded_reads/{sample}_{rn}_R1.unmapped.fq.gz", 55 | INTERNALDIR / f"discarded_reads/{sample}_{rn}_R2.unmapped.fq.gz", 56 | ] 57 | ) 58 | for sample, v in SAMPLE2DATA.items() 59 | for rn, v2 in v.items() 60 | ], 61 | expand( 62 | "detected_sites/filtered/{sample}.{ref}.tsv", 63 | sample=SAMPLE2DATA.keys(), 64 | ref=["genes", "genome"], 65 | ), 66 | 67 | 68 | rule cutadapt_SE: 69 | input: 70 | lambda wildcards: SAMPLE2DATA[wildcards.sample][wildcards.rn].get("R1", "/"), 71 | output: 72 | fastq_cut=temp(TEMPDIR / "cut_adapter_SE/{sample}_{rn}_R1.fq.gz"), 73 | fastq_tooshort=INTERNALDIR / "discarded_reads/{sample}_{rn}_R1.tooshort.fq.gz", 74 | fastq_untrimmed=INTERNALDIR / "discarded_reads/{sample}_{rn}_R1.untrimmed.fq.gz", 75 | report="report_reads/trimming/{sample}_{rn}.json", 76 | params: 77 | library=lambda wildcards: SAMPLE2LIB[wildcards.sample], 78 | threads: 20 79 | shell: 80 | """ 81 | cutseq -t {threads} -A {params.library} -m 20 --trim-polyA --ensure-inline-barcode --auto-rc -o {output.fastq_cut} -s {output.fastq_tooshort} -u {output.fastq_untrimmed} --json-file {output.report} {input} 82 | """ 83 | 84 | 85 | rule cutadapt_PE: 86 | input: 87 | lambda wildcards: SAMPLE2DATA[wildcards.sample][wildcards.rn].get("R1", "/"), 88 | lambda wildcards: SAMPLE2DATA[wildcards.sample][wildcards.rn].get("R2", "/"), 89 | output: 90 | fastq_cut=[ 91 | temp(TEMPDIR / "cut_adapter_PE/{sample}_{rn}_R1.fq.gz"), 92 | temp(TEMPDIR / "cut_adapter_PE/{sample}_{rn}_R2.fq.gz"), 93 | ], 94 | fastq_tooshort=[ 95 | INTERNALDIR / "discarded_reads/{sample}_{rn}_R1.tooshort.fq.gz", 96 | INTERNALDIR / "discarded_reads/{sample}_{rn}_R2.tooshort.fq.gz", 97 | ], 98 | fastq_untrimmed=[ 99 | INTERNALDIR / "discarded_reads/{sample}_{rn}_R1.untrimmed.fq.gz", 100 | INTERNALDIR / "discarded_reads/{sample}_{rn}_R2.untrimmed.fq.gz", 101 | ], 102 | 
report="report_reads/trimming/{sample}_{rn}.report", 103 | params: 104 | library=lambda wildcards: SAMPLE2LIB[wildcards.sample], 105 | threads: 20 106 | shell: 107 | """ 108 | cutseq -t {threads} -A {params.library} -m 20 --trim-polyA --ensure-inline-barcode --auto-rc -o {output.fastq_cut} -s {output.fastq_tooshort} -u {output.fastq_untrimmed} --json-file {output.report} {input} 109 | """ 110 | 111 | 112 | rule prepare_genes_index: 113 | input: 114 | CUSTOMIZED_GENES, 115 | output: 116 | fa="prepared_genes/genes.fa", 117 | index="prepared_genes/genes.3n.CT.1.ht2", 118 | params: 119 | index="prepared_genes/genes", 120 | threads: 12 121 | shell: 122 | """ 123 | cat {input} >{output.fa} 124 | rm -f `dirname {output.index}`/`basename {output.index} ".CT.1.ht2"`.*.ht2 125 | ~/tools/hisat2/hisat-3n-build -p 12 --base-change C,T {output.fa} {params.index} 126 | """ 127 | 128 | 129 | rule build_gene_index: 130 | input: 131 | fa="prepared_genes/genes.fa", 132 | output: 133 | fai="prepared_genes/genes.fa.fai", 134 | shell: 135 | """ 136 | samtools faidx {output.fa} 137 | """ 138 | 139 | 140 | rule generate_saf_gene: 141 | input: 142 | fai="prepared_genes/genes.fa.fai", 143 | output: 144 | saf="prepared_genes/genes.saf", 145 | shell: 146 | """ 147 | awk 'BEGIN{{OFS="\\t"}}{{print $1,$1,0,$2,"+"}}' {input} > {output} 148 | """ 149 | 150 | 151 | # Mapping (SE mapping mode) 152 | 153 | 154 | rule hisat2_3n_mapping_contamination_SE: 155 | input: 156 | TEMPDIR / "cut_adapter_SE/{sample}_{rn}_R1.fq.gz", 157 | output: 158 | mapped=temp(TEMPDIR / "mapping_unsorted_SE/{sample}_{rn}.contamination.bam"), 159 | unmapped=temp(TEMPDIR / "mapping_discarded_SE/{sample}_{rn}.contamination.bam"), 160 | summary="report_reads/mapping/{sample}_{rn}.contamination.summary", 161 | params: 162 | index=REF["contamination"]["hisat3n"], 163 | threads: 24 164 | shell: 165 | """ 166 | {BIN[hisat3n]} --index {params.index} -p {threads} --summary-file {output.summary} --new-summary -q -U {input[0]} --directional-mapping --base-change C,T --mp 8,2 --no-spliced-alignment | \ 167 | {BIN[samtools]} view -@ {threads} -e '!flag.unmap' -O BAM -U {output.unmapped} -o {output.mapped} 168 | """ 169 | 170 | 171 | rule hisat2_3n_mapping_genes_SE: 172 | input: 173 | TEMPDIR / "unmapped_internal_SE/{sample}_{rn}_R1.contamination.fq.gz", 174 | "prepared_genes/genes.3n.CT.1.ht2" if CUSTOMIZED_GENES else [], 175 | output: 176 | mapped=temp(TEMPDIR / "mapping_unsorted_SE/{sample}_{rn}.genes.bam"), 177 | unmapped=temp(TEMPDIR / "mapping_discarded_SE/{sample}_{rn}.genes.bam"), 178 | summary="report_reads/mapping/{sample}_{rn}.genes.summary", 179 | params: 180 | index=( 181 | REF["genes"]["hisat3n"] if not CUSTOMIZED_GENES else "prepared_genes/genes" 182 | ), 183 | threads: 24 184 | shell: 185 | """ 186 | {BIN[hisat3n]} --index {params.index} -p {threads} --summary-file {output.summary} --new-summary -q -U {input[0]} --directional-mapping --all --norc --base-change C,T --mp 8,2 --no-spliced-alignment | \ 187 | {BIN[samtools]} view -@ {threads} -e '!flag.unmap' -O BAM -U {output.unmapped} -o {output.mapped} 188 | """ 189 | 190 | 191 | rule hisat2_3n_mapping_genome_SE: 192 | input: 193 | TEMPDIR / "unmapped_internal_SE/{sample}_{rn}_R1.genes.fq.gz", 194 | output: 195 | mapped=temp(TEMPDIR / "mapping_unsorted_SE/{sample}_{rn}.genome.bam"), 196 | unmapped=temp(TEMPDIR / "mapping_discarded_SE/{sample}_{rn}.genome.bam"), 197 | summary="report_reads/mapping/{sample}_{rn}.genome.summary", 198 | params: 199 | index=REF["genome"]["hisat3n"], 200 | threads: 24 
201 | shell: 202 | """ 203 | {BIN[hisat3n]} --index {params.index} -p {threads} --summary-file {output.summary} --new-summary -q -U {input[0]} --directional-mapping --base-change C,T --pen-noncansplice 20 --mp 4,1 | \ 204 | {BIN[samtools]} view -@ {threads} -e '!flag.unmap' -O BAM -U {output.unmapped} -o {output.mapped} 205 | """ 206 | 207 | 208 | rule extract_unmap_bam_internal_SE: 209 | input: 210 | TEMPDIR / "mapping_discarded_SE/{sample}_{rn}.{reftype}.bam", 211 | output: 212 | temp(TEMPDIR / "unmapped_internal_SE/{sample}_{rn}_R1.{reftype}.fq.gz"), 213 | threads: 4 214 | shell: 215 | """ 216 | {BIN[samtools]} fastq -@ {threads} -0 {output} {input} 217 | """ 218 | 219 | 220 | # Mapping (PE mapping mode) 221 | 222 | 223 | rule hisat2_3n_mapping_contamination_PE: 224 | input: 225 | TEMPDIR / "cut_adapter_PE/{sample}_{rn}_R1.fq.gz", 226 | TEMPDIR / "cut_adapter_PE/{sample}_{rn}_R2.fq.gz", 227 | output: 228 | mapped=temp(TEMPDIR / "mapping_unsorted_PE/{sample}_{rn}.contamination.bam"), 229 | unmapped=temp(TEMPDIR / "mapping_discarded_PE/{sample}_{rn}.contamination.bam"), 230 | summary="report_reads/mapping/{sample}_{rn}.contamination.summary", 231 | params: 232 | index=REF["contamination"]["hisat3n"], 233 | threads: 24 234 | shell: 235 | """ 236 | {BIN[hisat3n]} --index {params.index} -p {threads} --summary-file {output.summary} --new-summary -q -1 {input[0]} -2 {input[1]} --directional-mapping --base-change C,T --mp 8,2 --no-spliced-alignment | \ 237 | {BIN[samtools]} view -@ {threads} -e 'flag.proper_pair && !flag.unmap && !flag.munmap' -O BAM -U {output.unmapped} -o {output.mapped} 238 | """ 239 | 240 | 241 | rule hisat2_3n_mapping_genes_PE: 242 | input: 243 | TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R1.contamination.fq.gz", 244 | TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R2.contamination.fq.gz", 245 | "prepared_genes/genes.3n.CT.1.ht2" if CUSTOMIZED_GENES else [], 246 | output: 247 | mapped=temp(TEMPDIR / "mapping_unsorted_PE/{sample}_{rn}.genes.bam"), 248 | unmapped=temp(TEMPDIR / "mapping_discarded_PE/{sample}_{rn}.genes.bam"), 249 | summary="report_reads/mapping/{sample}_{rn}.genes.summary", 250 | params: 251 | index=( 252 | REF["genes"]["hisat3n"] if not CUSTOMIZED_GENES else "prepared_genes/genes" 253 | ), 254 | threads: 24 255 | shell: 256 | """ 257 | {BIN[hisat3n]} --index {params.index} -p {threads} --summary-file {output.summary} --new-summary -q -1 {input[0]} -2 {input[1]} --directional-mapping --all --norc --base-change C,T --mp 8,2 --no-spliced-alignment | \ 258 | {BIN[samtools]} view -@ {threads} -e 'flag.proper_pair && !flag.unmap && !flag.munmap' -O BAM -U {output.unmapped} -o {output.mapped} 259 | """ 260 | 261 | 262 | rule hisat2_3n_mapping_genome_PE: 263 | input: 264 | TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R1.genes.fq.gz", 265 | TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R2.genes.fq.gz", 266 | output: 267 | mapped=temp(TEMPDIR / "mapping_unsorted_PE/{sample}_{rn}.genome.bam"), 268 | unmapped=temp(TEMPDIR / "mapping_discarded_PE/{sample}_{rn}.genome.bam"), 269 | summary="report_reads/mapping/{sample}_{rn}.genome.summary", 270 | params: 271 | index=REF["genome"]["hisat3n"], 272 | threads: 24 273 | shell: 274 | """ 275 | {BIN[hisat3n]} --index {params.index} -p {threads} --summary-file {output.summary} --new-summary -q -1 {input[0]} -2 {input[1]} --directional-mapping --base-change C,T --pen-noncansplice 20 --mp 4,1 | \ 276 | {BIN[samtools]} view -@ {threads} -e 'flag.proper_pair && !flag.unmap && !flag.munmap' -O BAM -U {output.unmapped} -o 
{output.mapped} 277 | """ 278 | 279 | 280 | rule extract_unmap_bam_internal_PE: 281 | input: 282 | TEMPDIR / "mapping_discarded_PE/{sample}_{rn}.{reftype}.bam", 283 | output: 284 | r1=temp(TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R1.{reftype}.fq.gz"), 285 | r2=temp(TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R2.{reftype}.fq.gz"), 286 | threads: 4 287 | shell: 288 | """ 289 | {BIN[samtools]} fastq -@ {threads} -1 {output.r1} -2 {output.r2} -0 /dev/null -s /dev/null -n {input} 290 | """ 291 | 292 | 293 | ruleorder: extract_unmap_bam_final_PE > extract_unmap_bam_final_SE 294 | 295 | 296 | rule extract_unmap_bam_final_SE: 297 | input: 298 | r1=TEMPDIR / "unmapped_internal_SE/{sample}_{rn}_R1.genome.fq.gz", 299 | output: 300 | r1=INTERNALDIR / "discarded_reads/{sample}_{rn}_R1.unmapped.fq.gz", 301 | threads: 4 302 | shell: 303 | """ 304 | mv {input.r1} {output.r1} 305 | """ 306 | 307 | 308 | rule extract_unmap_bam_final_PE: 309 | input: 310 | r1=TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R1.genome.fq.gz", 311 | r2=TEMPDIR / "unmapped_internal_PE/{sample}_{rn}_R2.genome.fq.gz", 312 | output: 313 | r1=INTERNALDIR / "discarded_reads/{sample}_{rn}_R1.unmapped.fq.gz", 314 | r2=INTERNALDIR / "discarded_reads/{sample}_{rn}_R2.unmapped.fq.gz", 315 | threads: 4 316 | shell: 317 | """ 318 | mv {input.r1} {output.r1} 319 | mv {input.r2} {output.r2} 320 | """ 321 | 322 | 323 | rule hisat2_3n_sort: 324 | input: 325 | lambda wildcards: TEMPDIR 326 | / ( 327 | "mapping_unsorted_SE/{sample}_{rn}.{ref}.bam" 328 | if len(SAMPLE2DATA[wildcards.sample][wildcards.rn]) == 1 329 | else "mapping_unsorted_PE/{sample}_{rn}.{ref}.bam" 330 | ), 331 | output: 332 | INTERNALDIR / "run_sorted/{sample}_{rn}.{ref}.bam", 333 | threads: 16 334 | shell: 335 | """ 336 | {BIN[samtools]} sort -@ {threads} --write-index -m 3G -O BAM -o {output} {input} 337 | """ 338 | 339 | 340 | # combine mapping results (multi run) 341 | 342 | 343 | rule combine_runs: 344 | input: 345 | lambda wildcards: [ 346 | INTERNALDIR / f"run_sorted/{wildcards.sample}_{r}.{wildcards.ref}.bam" 347 | for r in SAMPLE2DATA[wildcards.sample] 348 | ], 349 | output: 350 | temp(TEMPDIR / "combined_mapping/{sample}.{ref}.bam"), 351 | params: 352 | path_samtools=config["path"]["samtools"], 353 | threads: 8 354 | shell: 355 | "{params.path_samtools} merge -@ {threads} -o {output} {input}" 356 | 357 | 358 | rule stat_mapping_number: 359 | input: 360 | bam=lambda wildcards: [ 361 | TEMPDIR / f"combined_mapping/{wildcards.sample}.{ref}.bam" 362 | for ref in ["contamination", "genes", "genome"] 363 | ], 364 | output: 365 | tsv="report_reads/mapped/{sample}.tsv", 366 | params: 367 | refs=["contamination", "genes", "genome"], 368 | threads: 4 369 | shell: 370 | """ 371 | paste <(echo {params.refs} | tr " " "\\n") <(echo {input.bam} | tr " " "\\n") | while read ref file; do 372 | {BIN[samtools]} view -@ {threads} -F 3980 -c $file | awk -v ref="$ref" '{{FS="\\t";OFS="\\t"}}NR==1{{print ref,$1}}' >> {output} 373 | done 374 | """ 375 | 376 | 377 | rule dedup_mapping: 378 | input: 379 | bam=TEMPDIR / "combined_mapping/{sample}.{ref}.bam", 380 | output: 381 | bam=INTERNALDIR / "aligned_bam/{sample}.{ref}.bam", 382 | txt="report_reads/dedup/{sample}.{ref}.log", 383 | params: 384 | tmp=os.environ["TMPDIR"], 385 | threads: 20 386 | run: 387 | if WITH_UMI: 388 | shell( 389 | """ 390 | /software/java-15.0.2-el8-x86_64/bin/java -server -Xms8G -Xmx40G -Xss100M -Djava.io.tmpdir={params.tmp} -jar {BIN[umicollapse]} bam \ 391 | -t 2 -T {threads} --data naive --merge avgqual 
--two-pass -i {input.bam} -o {output.bam} >{output.txt} 392 | """ 393 | ) 394 | elif MARKDUP: 395 | shell( 396 | """ 397 | ~/tools/jdk8u322-b06-jre/bin/java -Xmx36G -jar ~/tools/gatk-4.2.5.0/gatk-package-4.2.5.0-local.jar MarkDuplicates \ 398 | -I {input} -O {output.bam} -M {output.txt} --DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES --REMOVE_DUPLICATES true --VALIDATION_STRINGENCY SILENT --TMP_DIR {params.tmp} 399 | """ 400 | ) 401 | else: 402 | shell( 403 | """ 404 | cp {input.bam} {output.bam} 405 | touch {output.txt} 406 | """ 407 | ) 408 | 409 | 410 | rule dedup_index: 411 | input: 412 | bam=INTERNALDIR / "aligned_bam/{sample}.{ref}.bam", 413 | output: 414 | bai=INTERNALDIR / "aligned_bam/{sample}.{ref}.bam.bai", 415 | threads: 6 416 | shell: 417 | """ 418 | {BIN[samtools]} index -@ {threads} {input} 419 | """ 420 | 421 | 422 | # call mutation 423 | 424 | 425 | rule hisat2_3n_calling_unfiltered_unique: 426 | input: 427 | INTERNALDIR / "aligned_bam/{sample}.{ref}.bam", 428 | output: 429 | temp(TEMPDIR / "unfiltered_unique/{sample}.{ref}.tsv.gz"), 430 | params: 431 | fa=lambda wildcards: ( 432 | REF[wildcards.ref]["fa"] 433 | if wildcards.ref != "genes" or not CUSTOMIZED_GENES 434 | else "prepared_genes/genes.fa" 435 | ), 436 | threads: 16 437 | shell: 438 | """ 439 | {BIN[samtools]} view -e "rlen<100000" -h {input} | {BIN[hisat3ntable]} -p {threads} -u --alignments - --ref {params.fa} --output-name /dev/stdout --base-change C,T | cut -f 1,2,3,5,7 | bgzip -@ {threads} -c > {output} 440 | """ 441 | 442 | 443 | rule hisat2_3n_calling_unfiltered_multi: 444 | input: 445 | INTERNALDIR / "aligned_bam/{sample}.{ref}.bam", 446 | output: 447 | temp(TEMPDIR / "unfiltered_multi/{sample}.{ref}.tsv.gz"), 448 | params: 449 | fa=lambda wildcards: ( 450 | REF[wildcards.ref]["fa"] 451 | if wildcards.ref != "genes" or not CUSTOMIZED_GENES 452 | else "prepared_genes/genes.fa" 453 | ), 454 | threads: 16 455 | shell: 456 | """ 457 | {BIN[samtools]} view -e "rlen<100000" -h {input} | {BIN[hisat3ntable]} -p {threads} -m --alignments - --ref {params.fa} --output-name /dev/stdout --base-change C,T | cut -f 1,2,3,5,7 | bgzip -@ {threads} -c > {output} 458 | """ 459 | 460 | 461 | rule hisat2_3n_filtering: 462 | input: 463 | INTERNALDIR / "aligned_bam/{sample}.{ref}.bam", 464 | output: 465 | temp(TEMPDIR / "hisat_converted/{sample}.{ref}.bam"), 466 | threads: 4 467 | shell: 468 | """ 469 | {BIN[samtools]} view -@ {threads} -e "[XM] * 20 <= (qlen-sclen) && [Zf] <= 3 && 3 * [Zf] <= [Zf] + [Yf]" {input} -O BAM -o {output} 470 | """ 471 | 472 | 473 | rule hisat2_3n_calling_filtered_unqiue: 474 | input: 475 | TEMPDIR / "hisat_converted/{sample}.{ref}.bam", 476 | output: 477 | temp(TEMPDIR / "filtered_unique/{sample}.{ref}.tsv.gz"), 478 | params: 479 | fa=lambda wildcards: ( 480 | REF[wildcards.ref]["fa"] 481 | if wildcards.ref != "genes" or not CUSTOMIZED_GENES 482 | else "prepared_genes/genes.fa" 483 | ), 484 | threads: 16 485 | shell: 486 | """ 487 | {BIN[samtools]} view -e "rlen<100000" -h {input} | {BIN[hisat3ntable]} -p {threads} -u --alignments - --ref {params.fa} --output-name /dev/stdout --base-change C,T | cut -f 1,2,3,5,7 | bgzip -@ {threads} -c > {output} 488 | """ 489 | 490 | 491 | rule hisat2_3n_calling_filtered_multi: 492 | input: 493 | TEMPDIR / "hisat_converted/{sample}.{ref}.bam", 494 | output: 495 | temp(TEMPDIR / "filtered_multi/{sample}.{ref}.tsv.gz"), 496 | params: 497 | fa=lambda wildcards: ( 498 | REF[wildcards.ref]["fa"] 499 | if wildcards.ref != "genes" or not CUSTOMIZED_GENES 500 | else 
"prepared_genes/genes.fa" 501 | ), 502 | threads: 16 503 | shell: 504 | """ 505 | {BIN[samtools]} view -e "rlen<100000" -h {input} | {BIN[hisat3ntable]} -p {threads} -m --alignments - --ref {params.fa} --output-name /dev/stdout --base-change C,T | cut -f 1,2,3,5,7 | bgzip -@ {threads} -c > {output} 506 | """ 507 | 508 | 509 | rule join_pileup: 510 | input: 511 | lambda wildcards: [ 512 | TEMPDIR / f"{t}/{wildcards.sample}.{wildcards.ref}.tsv.gz" 513 | for t in [ 514 | "unfiltered_unique", 515 | "unfiltered_multi", 516 | "filtered_unique", 517 | "filtered_multi", 518 | ] 519 | ], 520 | output: 521 | INTERNALDIR / "count_sites/{sample}.{ref}.arrow", 522 | threads: 6 523 | shell: 524 | """ 525 | {BIN[join_pileup.py]} -i {input} -o {output} 526 | """ 527 | 528 | 529 | rule group_pileup: 530 | input: 531 | lambda wildcards: [ 532 | INTERNALDIR / f"count_sites/{sample}.{wildcards.ref}.arrow" 533 | for sample in GROUP2SAMPLE[wildcards.group] 534 | ], 535 | output: 536 | INTERNALDIR / "group_sites/{group}.{ref}.arrow", 537 | threads: 6 538 | shell: 539 | """ 540 | {BIN[group_pileup.py]} -i {input} -o {output} 541 | """ 542 | 543 | 544 | rule combined_select_sites: 545 | input: 546 | expand( 547 | INTERNALDIR / "group_sites/{group}.{{ref}}.arrow", 548 | group=GROUP2SAMPLE.keys(), 549 | ), 550 | output: 551 | "detected_sites/prefilter/{ref}.tsv", 552 | shell: 553 | """ 554 | {BIN[select_sites.py]} -i {input} -o {output} 555 | """ 556 | 557 | 558 | rule stat_sample_background: 559 | input: 560 | site=INTERNALDIR / "count_sites/{sample}.{ref}.arrow", 561 | mask="detected_sites/prefilter/{ref}.tsv", 562 | output: 563 | background="detected_sites/background/{sample}.{ref}.tsv", 564 | filtered="detected_sites/filtered/{sample}.{ref}.tsv", 565 | threads: 2 566 | shell: 567 | """ 568 | {BIN[filter_sites.py]} -i {input.site} -m {input.mask} -b {output.background} -o {output.filtered} 569 | """ 570 | -------------------------------------------------------------------------------- /bin/filter_sites.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2021 Ye Chang yech1990@gmail.com 5 | # Distributed under terms of the MIT license. 
6 | # 7 | # Created: 2021-10-06 01:53 8 | 9 | 10 | import argparse 11 | 12 | import polars as pl 13 | from scipy.stats import binomtest 14 | 15 | arg_parser = argparse.ArgumentParser() 16 | arg_parser.add_argument("-i", "--input-file", help="Input site file") 17 | arg_parser.add_argument("-m", "--mask-file", help="mask file") 18 | arg_parser.add_argument("-b", "--background-file", help="background file") 19 | arg_parser.add_argument("-o", "--output-file", help="output file") 20 | 21 | 22 | args = arg_parser.parse_args() 23 | 24 | df_site = ( 25 | pl.read_ipc(args.input_file) 26 | .with_columns( 27 | u=pl.col("unconvertedBaseCount_filtered_uniq"), 28 | d=pl.col("convertedBaseCount_filtered_uniq") 29 | + pl.col("unconvertedBaseCount_filtered_uniq"), 30 | ) 31 | .with_columns(ur=pl.col("u") / pl.col("d")) 32 | ) 33 | 34 | df_pre = pl.read_csv( 35 | args.mask_file, 36 | separator="\t", 37 | has_header=False, 38 | new_columns=["ref", "pos", "strand"], 39 | dtypes={"ref": pl.Utf8, "pos": pl.Int64, "strand": pl.Utf8}, 40 | ) 41 | 42 | bg_ratio = ( 43 | df_site.join(df_pre, on=["ref", "pos", "strand"], how="anti") 44 | .get_column("ur") 45 | .drop_nans() 46 | .mean() 47 | ) 48 | with open(args.background_file, "w") as f: 49 | f.write(f"{bg_ratio}\n") 50 | 51 | 52 | def testp(successes, trials, p): 53 | if successes == 0 or trials == 0: 54 | return 1.0 55 | return binomtest(successes, trials, p, alternative="greater").pvalue 56 | 57 | 58 | df_filter = ( 59 | df_pre.join(df_site, on=["ref", "pos", "strand"], how="left") 60 | .with_columns(pl.col("u").fill_null(strategy="zero")) 61 | .with_columns(pl.col("d").fill_null(strategy="zero")) 62 | .select(["ref", "pos", "strand", "u", "d", "ur"]) 63 | .with_columns( 64 | pval=pl.struct(["u", "d"]).map_elements( 65 | lambda x: testp(x["u"], x["d"], bg_ratio) 66 | ) 67 | ) 68 | .with_columns( 69 | passed=(pl.col("pval") < 0.001) 70 | & (pl.col("u") >= 2) 71 | & (pl.col("d") >= 10) 72 | & (pl.col("ur") > 0.02) 73 | ) 74 | ) 75 | 76 | 77 | df_filter.write_csv(args.output_file, separator="\t", include_header=True) 78 | -------------------------------------------------------------------------------- /bin/group_pileup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2024 Ye Chang yech1990@gmail.com 5 | # Distributed under terms of the GNU license. 
6 | # 7 | # Created: 2024-02-09 13:41 8 | 9 | 10 | import polars as pl 11 | 12 | 13 | def import_df(file_name, suffix): 14 | count_cols = [ 15 | "convertedBaseCount_unfiltered_uniq", 16 | "unconvertedBaseCount_unfiltered_uniq", 17 | "convertedBaseCount_unfiltered_multi", 18 | "unconvertedBaseCount_unfiltered_multi", 19 | "convertedBaseCount_filtered_uniq", 20 | "unconvertedBaseCount_filtered_uniq", 21 | "convertedBaseCount_filtered_multi", 22 | "unconvertedBaseCount_filtered_multi", 23 | ] 24 | df = pl.read_ipc(file_name).rename({col: col + "_" + suffix for col in count_cols}) 25 | return df 26 | 27 | 28 | def combine_files(*files): 29 | samples = [f.split("/")[-1].rsplit(".", 2)[0] for f in files] 30 | f = files[0] 31 | s = samples[0] 32 | df_com = import_df(f, s) 33 | for f, s in zip(files[1:], samples[1:]): 34 | df = import_df(f, s) 35 | df_com = df_com.join(df, on=["ref", "pos", "strand"], how="outer_coalesce") 36 | 37 | df_com = ( 38 | df_com.with_columns( 39 | u=pl.sum_horizontal( 40 | f"unconvertedBaseCount_filtered_uniq_{s}" for s in samples 41 | ), 42 | d=pl.sum_horizontal( 43 | f"{t}_filtered_uniq_{s}" 44 | for s in samples 45 | for t in ["convertedBaseCount", "unconvertedBaseCount"] 46 | ), 47 | _t=pl.sum_horizontal( 48 | f"{t1}_unfiltered_{t2}_{s}" 49 | for s in samples 50 | for t1 in ["convertedBaseCount", "unconvertedBaseCount"] 51 | for t2 in ["uniq", "multi"] 52 | ), 53 | ) 54 | .with_columns( 55 | # ur: unconverted ratio 56 | ur=pl.col("u") / pl.col("d"), 57 | # mr: multiple mapping ratio 58 | mr=pl.sum_horizontal( 59 | f"{t}_unfiltered_multi_{s}" 60 | for s in samples 61 | for t in ["convertedBaseCount", "unconvertedBaseCount"] 62 | ) 63 | / pl.col("_t"), 64 | # cr: cluster ratio 65 | cr=1 66 | - pl.sum_horizontal( 67 | f"{t1}_filtered_{t2}_{s}" 68 | for s in samples 69 | for t1 in ["convertedBaseCount", "unconvertedBaseCount"] 70 | for t2 in ["uniq", "multi"] 71 | ) 72 | / pl.col("_t"), 73 | ) 74 | .with_columns( 75 | [ 76 | pl.col(f"unconvertedBaseCount_filtered_uniq_{s}").alias(f"u_{s}") 77 | for s in samples 78 | ] 79 | + [ 80 | ( 81 | pl.col(f"unconvertedBaseCount_filtered_uniq_{s}") 82 | + pl.col(f"convertedBaseCount_filtered_uniq_{s}") 83 | ).alias(f"d_{s}") 84 | for s in samples 85 | ] 86 | ) 87 | .select( 88 | ["ref", "pos", "strand", "u", "d", "ur", "mr", "cr"] 89 | + [f"{t}_{s}" for s in samples for t in ["u", "d"]] 90 | ) 91 | .fill_null(0) 92 | ) 93 | 94 | return df_com 95 | 96 | 97 | if __name__ == "__main__": 98 | import argparse 99 | 100 | arg_parser = argparse.ArgumentParser() 101 | arg_parser.add_argument( 102 | "-i", 103 | "--input-files", 104 | nargs="+", 105 | required=True, 106 | help="Multiple input files to be combined", 107 | ) 108 | arg_parser.add_argument("-o", "--output-file", help="output file") 109 | args = arg_parser.parse_args() 110 | 111 | # Write the combined DataFrame to a CSV file 112 | combine_files(*args.input_files).write_ipc(args.output_file, compression="lz4") 113 | -------------------------------------------------------------------------------- /bin/join_pileup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2024 Ye Chang yech1990@gmail.com 5 | # Distributed under terms of the GNU license. 
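# Purpose: join the four hisat-3n-table pileups of one sample
# (unfiltered/filtered x uniq/multi) on ref/pos/strand into a single
# Arrow IPC table; missing counts are filled with 0.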
6 | # 7 | # Created: 2024-02-09 13:41 8 | 9 | 10 | import polars as pl 11 | 12 | 13 | def import_df(file_name, suffix): 14 | df = pl.read_csv( 15 | file_name, 16 | separator="\t", 17 | columns=["ref", "pos", "strand", "convertedBaseCount", "unconvertedBaseCount"], 18 | dtypes={ 19 | "ref": pl.Utf8, 20 | "pos": pl.Int64, 21 | "strand": pl.Utf8, 22 | "convertedBaseCount": pl.Int64, 23 | "unconvertedBaseCount": pl.Int64, 24 | }, 25 | ) 26 | df = df.rename( 27 | { 28 | "convertedBaseCount": "convertedBaseCount_" + suffix, 29 | "unconvertedBaseCount": "unconvertedBaseCount_" + suffix, 30 | } 31 | ) 32 | return df 33 | 34 | 35 | def combine_files(*files): 36 | suffixes = [ 37 | "unfiltered_uniq", 38 | "unfiltered_multi", 39 | "filtered_uniq", 40 | "filtered_multi", 41 | ] 42 | f = files[0] 43 | s = suffixes[0] 44 | df_com = import_df(f, s) 45 | for f, s in zip(files[1:], suffixes[1:]): 46 | df = import_df(f, s) 47 | df_com = df_com.join(df, on=["ref", "pos", "strand"], how="outer_coalesce") 48 | return df_com.fill_null(0) 49 | 50 | 51 | if __name__ == "__main__": 52 | import argparse 53 | 54 | arg_parser = argparse.ArgumentParser() 55 | arg_parser.add_argument( 56 | "-i", 57 | "--input-files", 58 | nargs=4, 59 | required=True, 60 | help="4 input files: unfiltered_uniq, unfiltered_multi, filtered_uniq, filtered_multi", 61 | ) 62 | arg_parser.add_argument("-o", "--output-file", help="output file") 63 | args = arg_parser.parse_args() 64 | 65 | # Write the combined DataFrame to a CSV file 66 | combine_files(*args.input_files).write_ipc(args.output_file, compression="lz4") 67 | -------------------------------------------------------------------------------- /bin/select_sites.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2021 Ye Chang yech1990@gmail.com 5 | # Distributed under terms of the MIT license. 6 | # 7 | # Created: 2021-10-06 01:53 8 | 9 | """pre-filter sites. 10 | 11 | - V3: select sites with both unique and multi mapped reads. 12 | change pandas into polars to speed up. 13 | - V4: read combined file directly. 
14 | """ 15 | 16 | import argparse 17 | import sys 18 | 19 | import polars as pl 20 | 21 | arg_parser = argparse.ArgumentParser() 22 | arg_parser.add_argument( 23 | "-i", 24 | "--input-files", 25 | nargs="+", 26 | required=True, 27 | help="Multiple input files to be combined", 28 | ) 29 | arg_parser.add_argument("-o", "--output-file", help="output file") 30 | 31 | args = arg_parser.parse_args() 32 | 33 | TOTAL_DEPTH = 20 34 | TOTAL_SUPPORT = 3 35 | AVERAGE_UNC_RATIO = 0.02 36 | AVERAGE_CLU_RATIO = 0.5 37 | AVERAGE_MUL_RATIO = 0.2 38 | 39 | 40 | dfs = [] 41 | for f in args.input_files: 42 | df = ( 43 | pl.read_ipc(f) 44 | .filter( 45 | (pl.col("d") >= TOTAL_DEPTH) 46 | & (pl.col("u") >= TOTAL_SUPPORT) 47 | & (pl.col("ur") >= AVERAGE_UNC_RATIO) 48 | & (pl.col("cr") < AVERAGE_CLU_RATIO) 49 | & (pl.col("mr") < AVERAGE_MUL_RATIO) 50 | ) 51 | .select(["ref", "pos", "strand"]) 52 | ) 53 | print(f"Read data for {f}...", file=sys.stderr) 54 | dfs.append(df) 55 | 56 | pl.concat(dfs, how="vertical").unique(maintain_order=True).write_csv( 57 | args.output_file, separator="\t", include_header=False 58 | ) 59 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # tools and scripts 2 | path: 3 | samtools: /path/to/samtools 4 | hisat3n: /path/to/hisat-3n 5 | hisat3ntable: /path/to/hisat-3n-table 6 | join_pileup.py: ../bin/join_pileup.py 7 | group_pileup.py: ../bin/group_pileup.py 8 | select_sites.py: ../bin/select_sites.py 9 | filter_sites.py: ../bin/filter_sites.py 10 | 11 | # global config 12 | # 13 | # prepare genes index 14 | # premap to rRNA, tRNA and other small RNA 15 | # If study virus, then also premap to virus genome 16 | # customized_genes: 17 | # - a.fa 18 | # - b.fa 19 | # library: INLINE 20 | # makedup: false 21 | 22 | # reference genome and index 23 | reference: 24 | contamination: 25 | fa: ~/reference/genome/contamination/contamination.fa 26 | hisat3n: ~/reference/index/hisat3n/contamination/contamination 27 | genes: 28 | fa: ~/reference/genome/Homo_sapiens.GRCh38.sncRNA.fa 29 | hisat3n: ~/reference/index/hisat3n/Homo_sapiens.GRCh38.sncRNA/Homo_sapiens.GRCh38.sncRNA 30 | genome: 31 | fa: ~/reference/genome/Homo_sapiens.GRCh38.genome.fa 32 | hisat3n: ~/reference/index/hisat3n/Homo_sapiens.GRCh38.genome/Homo_sapiens.GRCh38.genome 33 | 34 | # Sample name should be indentical and listed in the 2nd level of the yaml file 35 | # Each sample will be analysis seperately, but 36 | # samples sharing the same group id will be regared as biological replicates and combined in the comparing step 37 | samples: 38 | CONTROL-rep1: 39 | data: 40 | - R1: ../data/test1_R1.fq.gz 41 | R2: ../data/test1_R2.fq.gz 42 | group: CONTROL 43 | DRUG-rep1: 44 | data: 45 | - R1: ../data/test2_R1.fq.gz 46 | group: DRUG 47 | DRUG-rep2: 48 | data: 49 | - R1: ../data/test3_R1.fq.gz 50 | R2: ../data/test3_R2.fq.gz 51 | group: DRUG 52 | -------------------------------------------------------------------------------- /data/test1_R1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y9c/m5C-UBSseq/9f5e719edba39ac0ae0b74ec126ec7f8fe6de2bf/data/test1_R1.fq.gz -------------------------------------------------------------------------------- /data/test1_R2.fq.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/y9c/m5C-UBSseq/9f5e719edba39ac0ae0b74ec126ec7f8fe6de2bf/data/test1_R2.fq.gz -------------------------------------------------------------------------------- /data/test2_R1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y9c/m5C-UBSseq/9f5e719edba39ac0ae0b74ec126ec7f8fe6de2bf/data/test2_R1.fq.gz -------------------------------------------------------------------------------- /data/test2_R2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y9c/m5C-UBSseq/9f5e719edba39ac0ae0b74ec126ec7f8fe6de2bf/data/test2_R2.fq.gz -------------------------------------------------------------------------------- /data/test3_R1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y9c/m5C-UBSseq/9f5e719edba39ac0ae0b74ec126ec7f8fe6de2bf/data/test3_R1.fq.gz -------------------------------------------------------------------------------- /data/test3_R2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y9c/m5C-UBSseq/9f5e719edba39ac0ae0b74ec126ec7f8fe6de2bf/data/test3_R2.fq.gz -------------------------------------------------------------------------------- /docs/flow.svg: -------------------------------------------------------------------------------- [flow.svg: Snakemake rule DAG of the workflow: cutadapt (SE/PE) -> hisat-3n mapping to contamination/genes/genome -> unmapped-read extraction -> sort -> combine_runs -> stat_mapping_number / dedup_mapping -> hisat-3n filtering and calling (unfiltered/filtered x unique/multi) -> join_pileup -> group_pileup -> combined_select_sites -> stat_sample_background -> filtered sites] --------------------------------------------------------------------------------