├── test ├── BioBits ├── FqRecords ├── AtriaTest.jl ├── runtests.jl └── trimmer_and_benchmark.jl ├── docs ├── logo.png ├── Figure 2 Speed.png ├── Figure 1 Simulation Accuracy2.png ├── 4.Development_notes.md ├── 1.1.Release_installation_guide.md ├── 1.2.Install_from_source.md └── 5.Accuracy_and_speed_benchmark.md ├── .gitignore ├── src ├── AtriaTest │ ├── FqRecords │ │ ├── runtests.jl │ │ └── primer_match.jl │ ├── BioBits │ │ ├── runtests.jl │ │ ├── get_seq.jl │ │ ├── algorithm_basis.jl │ │ ├── bit_match.jl │ │ └── biosequences_safety.jl │ ├── AtriaTest.jl │ └── trimmer_and_benchmark.jl ├── atria ├── Benchmark │ ├── Benchmark.jl │ ├── rand_trim.jl │ ├── read_stats.jl │ ├── read_simulation.jl │ └── read_simulation_primer.jl ├── atria_profile ├── BioBits │ ├── BioBits.jl │ ├── biosequences_safety.jl │ ├── insert_size_decision.jl │ └── get_seq.jl ├── Trimmer │ ├── Trimmer.jl │ ├── thread_trim.jl │ ├── wrapper_detect_adapter_se.jl │ ├── markdown_help.jl │ └── wrapper_detect_adapter_pe.jl ├── FqRecords │ ├── adapter_match_se.jl │ ├── FqRecords.jl │ ├── copy.jl │ ├── interface.jl │ ├── quality.jl │ ├── basic_io.jl │ ├── pcr_dedup.jl │ ├── util.jl │ ├── consensus.jl │ ├── check_and_trim.jl │ └── thread_output.jl └── Atria.jl ├── benchmark ├── atria-simulate-main.bash ├── real-data-time.bash ├── aln2len.pl ├── atria-simulate.bash ├── replicates-stats.jl ├── art-simulate-main.bash ├── real-data-rnaseq.bash ├── real-data-human.bash ├── atria-similate-for-atria-only.bash ├── evalTrimming.pl ├── time_stats.jl ├── time_stats_plot.R └── trimming-functions.bash ├── adapter.known.txt ├── Project.toml ├── README.md ├── LICENSE.md └── CHANGELOG.md /test/BioBits: -------------------------------------------------------------------------------- 1 | ../src/AtriaTest/BioBits -------------------------------------------------------------------------------- /test/FqRecords: -------------------------------------------------------------------------------- 1 | ../src/AtriaTest/FqRecords 
-------------------------------------------------------------------------------- /test/AtriaTest.jl: -------------------------------------------------------------------------------- 1 | ../src/AtriaTest/AtriaTest.jl -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | 2 | using Atria 3 | 4 | Atria.test_atria() 5 | -------------------------------------------------------------------------------- /test/trimmer_and_benchmark.jl: -------------------------------------------------------------------------------- 1 | ../src/AtriaTest/trimmer_and_benchmark.jl -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cihga39871/Atria/HEAD/docs/logo.png -------------------------------------------------------------------------------- /docs/Figure 2 Speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cihga39871/Atria/HEAD/docs/Figure 2 Speed.png -------------------------------------------------------------------------------- /docs/Figure 1 Simulation Accuracy2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cihga39871/Atria/HEAD/docs/Figure 1 Simulation Accuracy2.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.cov 2 | *.jl.*.cov 3 | *.jl.mem 4 | deps/deps.jl 5 | .Rproj.user 6 | .*DS_Store 7 | nohup.out 8 | bin/* 9 | lib*/* 10 | tmp*/* 11 | .vscode/ 12 | app*/ 13 | app 14 | /atria-* 15 | Atria* 16 | Manifest.toml 17 | -------------------------------------------------------------------------------- 
# Run the FqRecords unit tests as one named test set.
#
# The test set is labelled "FqRecords" (the module under test) so that the
# test summary does not show a second group called "BioBits", which is the
# label already used by `test_bio_bits`.
#
# `test_fq_records_basis` is expected to come from the included
# "fq_records.jl"; `test_primer_match` is currently disabled.
@noinline function test_fq_records()
    @testset "FqRecords" begin
        test_fq_records_basis()
        # test_primer_match()
    end
end
# Run every BioBits test group inside a single "BioBits" test set.
# The groups run in the listed order; each entry is a zero-argument function
# defined by one of the test files included in this directory's runtests.jl.
@noinline function test_bio_bits()
    bio_bits_groups = (
        test_algorithm_basis,
        test_biosequences_safety,
        test_get_seq,
        test_bit_match,
    )
    @testset "BioBits" begin
        foreach(run_group -> run_group(), bio_bits_groups)
    end
end

# BioBits: low-level sequence helpers used by the rest of Atria.
# The module re-exports BioSymbols and BioSequences, then pulls in three
# source files and exports the names listed after each `include`.
module BioBits

using Reexport

# Re-exported so that `using BioBits` also brings these packages' names in.
@reexport using BioSymbols
@reexport using BioSequences

# NOTE(review): the include/export of `bitsafe!` / `isbitsafe` is disabled
# here, yet `bitsafe!` is still called elsewhere in the package (FqRecord
# constructor, tests). Presumably it is now provided by another file or
# package — confirm where it is defined.
# include("biosequences_safety.jl")
# export bitsafe!,
# isbitsafe

# Names exported from get_seq.jl.
include("get_seq.jl")
export N2gap,
SeqHead,
SeqHeadSet,
get_pointer,
get_unsafe_index_of_last_bitseq,
unsafe_bitseq,
bin

# Names exported from bit_match.jl.
include("bit_match.jl")
export MatchRes,
bitwise_scan,
_bitwise_scan_fullseq,
bitwise_scan_rc!,
bitwise_scan_rc

# Names exported from insert_size_decision.jl.
include("insert_size_decision.jl")
export insert_size_decision,
insert_size_decision_separate,
is_false_positive,
one_bp_check

end
# Entry point of the Atria test suite.
# Runs the BioBits, FqRecords and Trimmer/Benchmark groups, in that order,
# inside one "Atria" test set, and evaluates to `true` once the test set
# has finished without throwing.
@noinline function test_atria()
    suites = (test_bio_bits, test_fq_records, test_trimmer_and_benchmark)
    @testset "Atria" begin
        for run_suite in suites
            run_suite()
        end
    end
    return true
end

# Checks the platform and memory-layout facts the bit-level algorithms rely on:
# 64-bit native integers, the field layout of `LongDNA{4}`, and the way bases
# are packed into 64-bit words.
@noinline function test_algorithm_basis()
    @testset "algorithm_basis" begin

        # Native integer types must be the 64-bit ones.
        @test UInt === UInt64
        @test Int === Int64

        # Fixed-width unsigned integers have the expected size in bytes.
        for (T, nbytes) in ((UInt64, 8), (UInt32, 4), (UInt16, 2), (UInt8, 1))
            @test sizeof(T) == nbytes
        end

        dna_seq = dna"ANATATATATATATGGANNNNATATATNNNGGGG"

        # `dna"..."` yields the 4-bit long DNA sequence type.
        @test typeof(dna_seq) === LongDNA{4}
        @test typeof(dna_seq) === LongSequence{DNAAlphabet{4}}

        # The sequence keeps its payload in `data` and its length in `len`.
        @test typeof(dna_seq.data) === Vector{UInt64}
        @test typeof(dna_seq.len) === UInt

        # Each base takes one nibble, first base in the lowest nibble:
        # A = 0x1, G = 0x4, T = 0x8, N = 0xf, sixteen bases per word.
        expected_words = UInt64[0x44818181818181f1,
                                0x44fff818181ffff1,
                                0x0000000000000044]
        @test dna_seq.data == expected_words

        data_ptr = pointer(dna_seq.data)
        # Element-indexed load: second 64-bit word.
        @test unsafe_load(data_ptr, 2) == 0x44fff818181ffff1
        # Byte-offset load: pointer arithmetic moves by one byte, so the value
        # is bytes 1..8 of the buffer (little-endian byte order).
        @test unsafe_load(data_ptr + 1) == 0xf144818181818181

    end
end
"""
    adapter_match_se(adapter1_seqheadset::SeqHeadSet, r1::FqRecord,
                     kmer_tolerance::Int64, trim_score::Float64)

Scan the single-end read `r1` for one adapter and return how many leading
bases of the read should be kept.

The adapter is located with `bitwise_scan`, then `compute_prob_and_score!`
is called on the 16 positions `idx .. idx + 15` of the read (presumably this
fills in the match score — confirm in quality.jl).

Returns `idx - 1` clamped to be non-negative when the match score is strictly
greater than `trim_score`; otherwise returns `typemax(Int64)`, meaning
"do not trim".
"""
@inline function adapter_match_se(adapter1_seqheadset::SeqHeadSet,
                                  r1::FqRecord,
                                  kmer_tolerance::Int64,
                                  trim_score::Float64)
    hit = bitwise_scan(adapter1_seqheadset, r1.seq, 1, kmer_tolerance)
    compute_prob_and_score!(hit, r1, hit.idx, hit.idx + 15)

    # Not convincing enough: keep the whole read.
    hit.score > trim_score || return typemax(Int64)

    # The insert ends right before the adapter start; this can be -1.
    insert_size = hit.idx - 1
    if insert_size < 0
        return 0
    else
        return insert_size
    end
end

"""
    adapter_match_se(adapter1_seqheadsets::Vector{SeqHeadSet}, r1::FqRecord,
                     kmer_tolerance::Int64, trim_score::Float64)

Try every adapter in `adapter1_seqheadsets` against `r1` and return the
smallest number of bases to keep reported by the single-adapter method.
Returns `typemax(Int64)` when no adapter asks for trimming. The search stops
early once a result of 0 is found, since nothing can be smaller.
"""
@inline function adapter_match_se(adapter1_seqheadsets::Vector{SeqHeadSet},
                                  r1::FqRecord,
                                  kmer_tolerance::Int64,
                                  trim_score::Float64)
    best = typemax(Int64)  # no trim
    for seqheadset in adapter1_seqheadsets
        candidate = adapter_match_se(seqheadset, r1, kmer_tolerance, trim_score)
        candidate < best || continue
        best = candidate
        best == 0 && break
    end
    best
end
"8dfed614-e22c-5e08-85e1-65c5234f0b40" 27 | 28 | [compat] 29 | ArgParse = "1" 30 | BioSequences = "= 3.1.6" 31 | BioSymbols = "5" 32 | CSV = "^0.10" 33 | DataFrames = "1" 34 | DataStructures = "^0.18" 35 | JSON = "^0.21" 36 | PackageCompiler = "2" 37 | PrettyTables = "2" 38 | julia = "1.8" 39 | -------------------------------------------------------------------------------- /src/FqRecords/FqRecords.jl: -------------------------------------------------------------------------------- 1 | 2 | module FqRecords 3 | 4 | export FqRecord, 5 | qualpval, 6 | qualprob, 7 | update_prob_from_qual, 8 | probsum, 9 | probmean, 10 | copyto!, 11 | safe_copyto!, 12 | fqreadrecord, 13 | fqreadrecord!, 14 | fqwriterecord, 15 | check_identifier, 16 | throw_identifier_error, 17 | iscomplement, 18 | load_fqs_threads!, 19 | read_chunks!, 20 | StringChunk2FqRecord!, 21 | chunk_sizes, 22 | get_ideal_inbyte_sizes, 23 | get_ideal_inbyte_sizes!, 24 | write_fqs_threads!, 25 | isinreadlength!, 26 | count_N, 27 | isnotmuchN!, 28 | front_trim!, 29 | tail_trim!, 30 | tail_N_trim!, 31 | tail_low_qual_trim!, 32 | qualitymatch, 33 | seq_complexity, 34 | polyX_tail_scan, 35 | pe_consensus! 36 | 37 | using Reexport 38 | 39 | @reexport using Base.Threads 40 | using ..BioBits 41 | using ..BioBits.BioSymbols 42 | using ..BioBits.BioSequences 43 | 44 | include("interface.jl") 45 | export TrimStats 46 | 47 | include("quality.jl") 48 | export compute_prob_and_score! 

# Checks `bitwise_scan` on one query sequence against several subjects.
# Every case calls `bitwise_scan(query, subject, 1, tolerance)` and compares
# the result with `MatchRes(first, second, NaN, NaN)`.
@noinline function test_bit_match()
    @testset "bit match" begin
        query = dna"ACCCGGTCAGTACGTCAGTACGCAGTAGTGTA" |> bitsafe!

        # (subject, tolerance, expected first MatchRes field, expected second field)
        # to speed up, bitwise_scan does not handle tail match well.
        cases = (
            (dna"NNNACCCGGTCAGTACGTCAGTACGCAGTAGTGTA"  |> bitsafe!, 0,  4, 16),
            (dna"NNNNACCCGGTCAGTACGTCAGTACGCAGTAGTGTA" |> bitsafe!, 0,  5, 16),
            (dna"GGTCAGTACGTCAGTACGCAGTAGTGTANNNNACCC" |> bitsafe!, 0, 33,  4),
            (dna"GGTCAGTACGTCAGTACGCAGTAGTGTANNNNCCC"  |> bitsafe!, 0, 31,  4),  # actually best is 32,4
            (dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTTCCC"  |> bitsafe!, 5, 32,  3),
            (dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTTACCC" |> bitsafe!, 5, 33,  4),
            (dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTTAC"   |> bitsafe!, 5, 33,  2),
            (dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTAC"    |> bitsafe!, 5, 32,  1),  # actually best is 32,2
            (dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTACC"   |> bitsafe!, 5, 31,  2),  # actually best is 32,3
        )

        for (subject, tolerance, first_field, second_field) in cases
            @test bitwise_scan(query, subject, 1, tolerance) == MatchRes(first_field, second_field, NaN, NaN)
        end
    end
end

2 | Atria Logo 3 |

4 | 5 | # Atria 6 | 7 | ![](https://img.shields.io/github/downloads/cihga39871/Atria/total) 8 | 9 | Atria is designed to trim adapters and low-quality bases of next-generation sequencing data. It infers the insert DNA precisely by integrating both adapter information and reverse-complementary properties of paired-end reads within a delicate decision tree. It can also remove duplicated sequences due to PCR amplification. 10 | 11 | If you use Atria, please cite the paper: 12 | > Jiacheng Chuan, Aiguo Zhou, Lawrence Richard Hale, Miao He, Xiang Li, Atria: an ultra-fast and accurate trimmer for adapter and quality trimming, Gigabyte, 1, 2021 https://doi.org/10.46471/gigabyte.31 13 | 14 | ## Features 15 | 16 | - FAST, even for compressed fastqs 17 | - Highly accurate Illumina adapter trimming 18 | - Paired-end consensus calling 19 | - Quality trimming 20 | - Poly X tail trimming 21 | - Hard clip 3' and 5' ends 22 | - N tail trimming 23 | - Filtering reads by the number of N bases 24 | - Filtering reads by length 25 | - Filtering reads by read complexity 26 | - Remove PCR duplicates (dedup) 27 | 28 | ## Contents 29 | 30 | 1. Installation guide 31 | 32 | 1.1 [Release installation guide](docs/1.1.Release_installation_guide.md) 33 | 34 | 1.2 [Install from source](docs/1.2.Install_from_source.md) 35 | 36 | 2. **[Atria trimming methods and usages](docs/2.Atria_trimming_methods_and_usages.md)** 37 | 38 | 3. [Benchmark toolkit](docs/3.Benchmark_toolkit.md) 39 | 40 | 4. [Atria development notes](docs/4.Development_notes.md) 41 | 42 | 5. 

"""
    copy(r::FqRecord)

Return a new `FqRecord` whose `id`, `seq`, `des`, `qual` and `prob` are
independent copies of the corresponding fields of `r`.
"""
@inline function Base.copy(r::FqRecord)
    FqRecord(copy(r.id), copy(r.seq), copy(r.des), copy(r.qual), copy(r.prob))
end

"""
    safe_copyto!(dest::Vector{UInt8}, src::AbstractArray)

Resize `dest` to `length(src)` and copy all of `src` into it.
"""
@inline function safe_copyto!(dest::Vector{UInt8}, src::T) where T <: AbstractArray
    resize!(dest, length(src))
    copyto!(dest, src)
end

"""
    safe_copyto!(dest::Vector{T}, src::Vector{T}, src_offset, N)

Resize `dest` to `N` elements and fill it with `src[src_offset : src_offset + N - 1]`.

Only the destination size is made safe here: the copy itself uses
`unsafe_copyto!`, so the caller must guarantee that the source range lies
inside `src`.
"""
@inline function safe_copyto!(dest::Vector{T}, src::Vector{T}, src_offset, N) where T
    resize!(dest, N)
    unsafe_copyto!(dest, 1, src, src_offset, N)
end

"""
    safe_copyto!(dest::LongDNA{4}, src::Vector{UInt8}, src_offset, N)

Resize `dest` to `N` bases and encode `N` bytes of `src`, starting at
`src_offset`, into it.
"""
@inline function safe_copyto!(dest::LongDNA{4}, src::Vector{UInt8}, src_offset, N)
    resize!(dest, N)
    # BioSequences.encode_chunks!(dest, 1, src, src_offset, N)
    @inbounds copyto!(dest, 1, src, src_offset, N)
end

"""
    safe_copyto!(dest::LongDNA{4}, src::Vector{UInt8})

Replace the content of `dest` with the sequence encoded from all of `src`.
"""
@inline function safe_copyto!(dest::LongDNA{4}, src::Vector{UInt8})
    copy!(dest, src)
end

"""
    safe_copyto!(dest::FqRecord, src::FqRecord)

Overwrite every field of `dest` with the content of `src`, reusing the
buffers already owned by `dest`.

The sequence is copied at storage level: the packed `data` words are copied
wholesale (so any padding words present in `src` are carried over) and the
length field `len` is then set to match. `LongDNA{4}` exposes exactly the
fields `data` and `len`; it has no `part` or `shared` field.
"""
@inline function safe_copyto!(dest::FqRecord, src::FqRecord)
    safe_copyto!(dest.id, src.id, 1, length(src.id))

    safe_copyto!(dest.seq.data, src.seq.data, 1, length(src.seq.data))
    dest.seq.len = src.seq.len

    safe_copyto!(dest.des, src.des, 1, length(src.des))
    safe_copyto!(dest.qual, src.qual, 1, length(src.qual))
    safe_copyto!(dest.prob, src.prob, 1, length(src.prob))

end
#!/usr/bin/env bash
#
# Time every trimmer on real datasets.
#
# For each dataset the script changes into its working directory, sets the
# globals read by the helper functions sourced from trimming-functions.bash
# (r1, r2, a1, a2, bwa_ref), and calls run_all_trimmer.

atria=/home/jiacheng/projects/atria

. "$atria/benchmark/trimming-functions.bash"

# Run each trimmer with 8 threads, appending its stderr to a shared log in the
# current directory, then summarise the log with pasteTimeOutput.
run_all_trimmer() {
	local _time_log=stderr-simple-time.log

	rm -f "$_time_log"
	run_atria 8 2>> "$_time_log"
	run_atria_consensus 8 2>> "$_time_log"
	run_adapterremoval 8 2>> "$_time_log"
	run_skewer 8 2>> "$_time_log"
	run_trim_galore 8 2>> "$_time_log"
	run_trimmomatic 8 2>> "$_time_log"
	run_ktrim 8 2>> "$_time_log"
	# Compress the Ktrim output files in place; pigz's stderr is not logged.
	pigz -f Ktrim/ktrim.read1.fq Ktrim/ktrim.read2.fq
	run_fastp 8 2>> "$_time_log"
	run_seqpurge 8 2>> "$_time_log"
	run_atropos 8 2>> "$_time_log"
	pasteTimeOutput "$_time_log" > time_benchmark-simple_time.txt
}


####### human data

working_dir=~/analysis/atria-benchmark/ERR4695159
# Stop rather than delete logs and run the trimmers in the wrong directory.
cd "$working_dir" || exit 1

r1=ERR4695159_1.fastq.gz
r2=ERR4695159_2.fastq.gz
a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
bwa_ref="$(pwd)/genomes/hg38.fasta.gz"

run_all_trimmer


######## SRR330569: RNA-seq D. simulans

working_dir=~/analysis/atria-benchmark/SRR330569
cd "$working_dir" || exit 1

r1=SRR330569.3_1.fastq.gz
r2=SRR330569.3_2.fastq.gz
a1=AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCG
a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGAT
bwa_ref="$(pwd)/genomes/dsim-all-chromosome-r2.02.fasta"

run_all_trimmer


##### Ensifer spp associated with Medicago whole genome sequencing
# working_dir=~/analysis/atria-benchmark/SRR7243169
# cd $working_dir
#
# r1=SRR7243169_1.fastq.gz
# r2=SRR7243169_2.fastq.gz
# a1=CTGTCTCTTATACACATCT
# a2=CTGTCTCTTATACACATCT
# bwa_ref=`pwd`/genomes/Pseudomonas.sp.Z003-0.4C.fasta
#
# run_all_trimmer
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | The Atria Software includes code from the following projects, which have their own licenses: 12 | 13 | - [The Julia Language](https://github.com/JuliaLang/julia/blob/master/LICENSE.md) [MIT License] 14 | - [ArgParse.jl](https://github.com/carlobaldassi/ArgParse.jl/blob/master/LICENSE.md) (Parsing command-line arguments in Julia) [MIT License] 15 | - [BioSequences.jl](https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE) (Biological sequences in Julia) [MIT License] 16 | - [BioSymbols.jl](https://github.com/BioJulia/BioSymbols.jl/blob/master/LICENSE) (Nucleic and amino acid primitive types in Julia) [MIT License] 17 | - [DataStructures.jl](https://github.com/JuliaCollections/DataStructures.jl/blob/master/License.md) (Writing ordered dictionary) [MIT License] 18 | - [JSON.jl](https://github.com/JuliaIO/JSON.jl/blob/master/LICENSE.md) (IO of JSON files in Julia) [MIT "Expat" License] 19 | -------------------------------------------------------------------------------- /src/AtriaTest/FqRecords/primer_match.jl: -------------------------------------------------------------------------------- 1 | #= 2 | @noinline function test_primer_match() 3 | @testset "Primer Match" begin 4 | 5 | args2=["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz"] 6 | args = parsing_args(args2) 7 | op = PEOptions(args) 8 | 9 | r1= fqreadrecord("@M03737:51:000000000-KTBYH:1:1101:21125:2738 1:N:0:GTATCGTCGT+CACCTGTT 10 | 
ACCGATGAAGAACGCAGCGAAATGCGATACGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACATTGCGCCCGCCAGTATTCTGGCGGGCATGCCCGTTCGAGCGTCATTTCAACCCTCAAGCCCTGCTTGGTGTTGGGGACCGGCTCAGCGGGTGCGGGCTTCGGCCCGTCCCGTGCCGCCCCCGAAATGGATCGGCGGTCTCGTCGCAGCCTTCTTTGCGTAGTAACATACCACCTCGCAACAGGAGCGCGGCGCGGCCACTGCCGTAAAACGCCCAACTTTT 11 | + 12 | CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCEGGGGGGGGGGGGGGGFGGGGGGDG:FGGGFEFFFFFFFFFGFGD<@E;AFBFFF=FFFG@FEFFFFFFF?BFFFFFFFBFF:D>B>?F?61186ADF9;BFFB09A<") 13 | 14 | r2 = fqreadrecord("@M03737:51:000000000-KTBYH:1:1101:21125:2738 2:N:0:GTATCGTCGT+CACCTGTT 15 | CTTATTGATATGCTTAAGTTCAGCGGGTATTCCTACCTGATTCGAGGTCAACTCTAAAAAGTTGGGCGTTTTACGGCAGTGGCCGCGCCGCGCTCCTGTTGCGAGGTGGTATGTTACTACGCAAAGAAGGCTGCGACGAGACCGCCGATCCATTTCGGGGGCGGCACGGGACGGGCCGAAGCCCGCACCCGCTGAGCCGGTCCCCAACACCAAGCAGGGCTTGAGGGTTGAAATGACGCTCGAACGGGCATGCCCGCCAGAATACTGGCGGGCGCAAGGGGGGTTCAAAGGTTCGAAGAA 16 | + 17 | CCCCCGGGGGGGGGGGGGGGGGGGEDDFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG3BGFFGGGDGGFFFFEBFFFFF:BFF>?4:>12868?FBFBDB253-((41<:1122((.9:??696-4:)).)-4,43((342900600((,,62)-(,)43((4,(") 18 | 19 | primer1 = dna"AHCGATGAAGAACRYAG" 20 | primer2 = dna"CTTATTGATATGCTTAAGTTCAG" 21 | ps = PrimerSet(primer1, primer2) 22 | 23 | init_seq_rc = true 24 | r1_seq_rc = LongDNA{4}() 25 | r2_seq_rc = LongDNA{4}() 26 | 27 | 28 | end 29 | end 30 | =# -------------------------------------------------------------------------------- /src/FqRecords/interface.jl: -------------------------------------------------------------------------------- 1 | 2 | struct FqRecord 3 | id::Vector{UInt8} 4 | seq::LongDNA{4} 5 | des::Vector{UInt8} 6 | qual::Vector{UInt8} 7 | prob::Vector{Float64} 8 | function FqRecord(id::Vector{UInt8}, seq::LongDNA{4}, des::Vector{UInt8}, 
# Thirteen integer counters, one per trimming/filtering step.
# Every field is declared `@atomic`, so reads and writes must go through
# `@atomic` (or the ordered `getproperty`/`setproperty!` forms).
# NOTE(review): each counter presumably holds the number of reads affected by
# the step it is named after — confirm against the code that increments them.
mutable struct TrimStats
    @atomic polyG::Int
    @atomic polyT::Int
    @atomic polyA::Int
    @atomic polyC::Int
    @atomic complexity_filtered::Int
    @atomic hard_clip_after::Int
    @atomic tail_low_qual_trim::Int
    @atomic tail_N_trim::Int
    @atomic length_filtered::Int
    @atomic max_n_filtered::Int
    @atomic pcr_dedup_removed::Int
    @atomic quality_trim::Int
    @atomic adapter_trim::Int
end

# Zero-argument constructor: every counter starts at 0.
TrimStats() = TrimStats(ntuple(_ -> 0, fieldcount(TrimStats))...)

# Reset every counter of `t` to 0 with a sequentially consistent atomic store
# (the same operation `@atomic t.field = 0` performs).
# Evaluates to 0, the value stored by the final reset.
function Base.empty!(t::TrimStats)
    for counter in fieldnames(TrimStats)
        setproperty!(t, counter, 0, :sequentially_consistent)
    end
    return 0
end
-------------------------------------------------------------------------------- 1 | # Atria 2 | 3 | ## Release Installation Guide 4 | 5 | Atria is written in [Julia Language](https://julialang.org/) v1.9 and works on 64-bit Linux and OSX systems. 6 | 7 | The generic binaries do not require any special installation steps, but you will need to ensure that your system can find the `atria` executable, and the `pigz` and `pbzip2` commands for compression/decompression. 8 | 9 | ### Linux 10 | 11 | 12 | #### Dependency 13 | 14 | `pigz` and `pbzip2` are required. 15 | 16 | If you use Ubuntu, try `sudo apt install pigz pbzip2`. You can also download them from [pigz's official site](https://zlib.net/pigz/) and [pbzip2's official site](http://compression.ca/pbzip2/). 17 | 18 | #### Atria 19 | 20 | First, extract the `.linux.tar.gz` file downloaded from the [release page](https://github.com/cihga39871/Atria/releases/) to a folder on your computer: 21 | 22 | ```bash 23 | tar -zxf Atria-VERSION-linux.tar.gz 24 | ``` 25 | 26 | Atria is extracted to `Atria-VERSION` directory. To run Atria, you can do any of the following: 27 | 28 | - Create a symbolic link to `atria` inside a folder which is on your system `PATH` (recommended) 29 | - Add Atria's bin folder to your system `PATH` environment variable 30 | - Invoke the `atria` executable by using its full path, as in `~/Atria/bin/atria` 31 | 32 | For example, to create a symbolic link to `atria` inside the `/usr/local/bin` folder, you can do the following (use the absolute path of the extracted directory): 33 | 34 | ```bash 35 | sudo ln -s /absolute/path/to/Atria-VERSION/bin/atria /usr/local/bin/atria 36 | ``` 37 | 38 | ### Mac OS 39 | 40 | > Sorry, we no longer provide binary files for Mac OS (x86 or M-series). You need to [install Atria from source](./1.2.Install_from_source.md). It is easy to follow and doesn't take much time. 41 | 42 | #### Dependency 43 | 44 | `pigz` and `pbzip2` are required. 45 | 46 | If you use [Homebrew](https://brew.sh/), try `brew install pigz; brew install pbzip2`.
You can also download it from [pigz's official site](https://zlib.net/pigz/) and [pbzip2's official site](http://compression.ca/pbzip2/). 47 | 48 | #### Atria 49 | 50 | Sorry, we no longer provide Atria release for OSX (x86 or M-series). Please [install Atria from source](./1.2.Install_from_source.md). 51 | -------------------------------------------------------------------------------- /benchmark/aln2len.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # This code is part of Skewer (https://sourceforge.net/projects/skewer/). The License: 4 | # 5 | # The MIT License (MIT) 6 | # 7 | # Copyright (c) 2013-2014 by Hongshan Jiang 8 | # 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy 10 | # of this software and associated documentation files (the "Software"), to deal 11 | # in the Software without restriction, including without limitation the rights 12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | # copies of the Software, and to permit persons to whom the Software is 14 | # furnished to do so, subject to the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be included in all 17 | # copies or substantial portions of the Software. 18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | # SOFTWARE. 
use strict;

# Usage: aln2len.pl file.aln > lengths.tab
#
# Reads an alignment file made of records that start with a ">" header line
# followed by sequence lines. For each record it prints "<id>\t<length>", where:
#   - <id> is the second tab-separated column of the header, cut at the first "/";
#   - <length> is the length of the first sequence line minus the number of "-"
#     characters found in the same span of the second sequence line.

if(@ARGV != 1){
	print STDERR "Usage: $0 file.aln > lengths.tab\n";
	exit(1);
}
my ($aln_file) = @ARGV;

my ($line, $no);
my @columns;
my ($id, $len);
$no = -1;	# -1 until the first ">" header has been seen

# 3-arg open with a lexical handle: the file name is never parsed as an open mode.
open(my $aln, '<', $aln_file) or die("Can not open $aln_file for reading\n");
while($line = <$aln>){
	chomp($line);
	if($line =~ /^>/){
		@columns = split(/\t/, $line);
		@columns = split(/\//, $columns[1]);
		$id = $columns[0];
		$no = 0;
		next;
	}
	next if($no < 0);	# skip anything before the first header
	$no++;
	if($no == 1){ # first sequence
		$len = length($line);
		next;
	}
	if($no == 2){ # second sequence
		# count "-" within the first $len characters of the second sequence
		my $del = (substr($line, 0, $len) =~ tr/-//);
		$len -= $del;
		print "$id\t$len\n";
	}
}
close $aln;

exit(0);
#!/usr/bin/env bash
#
# Usage: atria-simulate.bash [ADAPTER_LENGTH]
#
# Simulates paired-end reads with `atria simulate`, runs the trimmers defined in
# $atria/benchmark/trimming-functions.bash on them, and starts one
# `Atria.Benchmark.julia_wrapper_readstat` job per output directory.
#
# The environment variable `atria` must point to the Atria repository.
# ADAPTER_LENGTH is honoured only when it is smaller than 33; otherwise 33 is used.

adapter_length=33

# The -n guard matters: inside [[ ]], an empty operand of -lt evaluates to 0,
# so a missing argument would otherwise overwrite adapter_length with "".
if [[ -n ${1:-} && $1 -lt 33 ]]
then
    adapter_length=$1
fi

working_dir=~/analysis/atria-benchmark/atria_simulate

mkdir -p "$working_dir"
cd "$working_dir" || exit 1

# -p so that re-running with the same adapter length does not fail
mkdir -p "adapter_length_$adapter_length"
cd "adapter_length_$adapter_length" || exit 1

# select first adapter_length bp of adapters
a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT

a1=${a1:0:$adapter_length}
a2=${a2:0:$adapter_length}

#### simulate data with different indels
# docs: https://github.com/cihga39871/Atria/blob/master/docs/3.Benchmark_toolkit.md#data-simulation
# NOTE(review): the option spelling `--subsitution-rate` is kept as-is; confirm it
# against the `atria simulate` argument table before "correcting" it.
atria simulate --prefix reads_diff_indel --adapter1 $a1 --adapter2 $a2 --repeat 30000 --subsitution-rate 0.001 0.002 0.003 0.004 0.005 --insertion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 --deletion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 -s 100 -i $(seq 66 2 120)

# r1/r2 are not used below in this script; presumably read by the sourced
# trimming functions -- TODO confirm.
r1="reads_diff_indel.R1.fastq"
r2="reads_diff_indel.R2.fastq"

# load trimming functions
. "$atria/benchmark/trimming-functions.bash"

rm -f stderr.log
run_atria 16 2>> stderr.log
run_adapterremoval 16 2>> stderr.log
run_skewer 16 2>> stderr.log
run_trim_galore 16 2>> stderr.log
run_trimmomatic 16 2>> stderr.log
# run_ktrim 16 2>> stderr.log  # ktrim fails to output validate fastq
run_fastp 1 2>> stderr.log
run_atropos 16 2>> stderr.log
run_seqpurge 1 2>> stderr.log
run_cutadapt 16 2>> stderr.log

pigz -d SeqPurge/*gz

mv AdapterRemoval-3/adapterremoval.pair1.truncated AdapterRemoval-3/adapterremoval.pair1.fq
mv AdapterRemoval-3/adapterremoval.pair2.truncated AdapterRemoval-3/adapterremoval.pair2.fq

# cat Trimmomatic/out-pair1.unpaired.fq >> Trimmomatic/out-pair1.paired.fq
# cat Trimmomatic/out-pair2.unpaired.fq >> Trimmomatic/out-pair2.paired.fq
# rm Trimmomatic/out-pair1.unpaired.fq Trimmomatic/out-pair2.unpaired.fq

# `ll` is an interactive-shell alias and is not defined in a script; use ls -l.
ls -l */*fastq */*fq

# one background read-stat job per output directory
for i in *
do
    if [[ -d $i ]]
    then
        julia -L "$atria/src/Atria.jl" -e "Atria.Benchmark.julia_wrapper_readstat(ARGS)" "$i"/*.f*q &
    fi
done

# atria readstat Cutadapt/out.cutadapt.R*.fq

ps -x | grep -c "Atria.Benchmark.julia_wrapper_readstat"


### Adapter length 16, 20, 24, 28, 33
# atria statplot -i */*r12.stat.tsv
## single end
"""
    processing_reads!(r1s, isgoods, n_reads)

Single-end variant: call `read_processing!` on each of the first `n_reads`
records of `r1s` and store the result at the same index of `isgoods`.
`isgoods` is grown to `n_reads` entries when it is shorter. Returns `nothing`.

NOTE(review): the literal `1` passed to `read_processing!` occupies the position
where the commented-out threaded variants pass a thread id -- confirm its meaning
against `read_processing!`.
"""
function processing_reads!(r1s::Vector{FqRecord}, isgoods::Vector{Bool}, n_reads::Int)
    length(isgoods) >= n_reads || resize!(isgoods, n_reads)
    idx = 1
    while idx <= n_reads
        @inbounds isgoods[idx] = read_processing!(r1s[idx]::FqRecord, 1)
        idx += 1
    end
    return nothing
end
using Statistics

# Merge replicate result files cell by cell.
#
# Every FILE is split into "cells" (runs of word characters, digits, `.`, `-`, `:`)
# and "specials" (everything between cells). Cells that are numeric in ALL files
# are replaced by "mean ± std"; other cells and all specials are taken from the
# first file. The result is written to "stats.<basename of FILE1>" in the current
# directory.

if isempty(ARGS)
    println("""
    Usage: $(@__FILE__) FILE1 FILE2...

    Format of FILEs has to be the same. The numeric values in the same position of FILEs are replaced with their mean and standard deviation.
    """)
    exit()
end

contents = map(x -> read(x, String), ARGS)
cells = map(x -> split(x, r"[^\w\d\.\-\:]+"), contents)

# hcat needs equally long columns; fail with a readable message instead of a
# DimensionMismatch when the files do not share the same layout.
if length(unique(length.(cells))) != 1
    error("Input files do not have the same format: numbers of cells are " * join(length.(cells), ", "))
end

cell_matrix = String.(hcat(cells...))

# Local predicate (no method is added to Base.isnumeric for Base types).
# A cell is numeric when it parses as Float64 or looks like a clock time such as 1:02:03.5
function is_numeric_cell(x::AbstractString)
    tryparse(Float64, x) !== nothing || occursin(r"^(\d+:)+\d+(\.\d+)?$", x)
end

# Seconds per field of D:H:M:S, indexed from the rightmost field.
const TIME_FIELD_SECONDS = (1.0, 60.0, 3600.0, 24 * 3600.0)

# Parse a plain number, or a time written as [[D:]H:]M:S[.MS] converted to seconds.
function parse_numeric(x::AbstractString)
    value = tryparse(Float64, x)
    value === nothing || return value
    # parse time as D:H:M:S.MS
    fields = reverse!(split(x, ":"))
    length(fields) <= length(TIME_FIELD_SECONDS) || error("Failed to parse $x as the time format D:H:M:S")
    seconds = 0.0
    for (i, field) in enumerate(fields)
        seconds += TIME_FIELD_SECONDS[i] * parse(Float64, field)
    end
    seconds
end

# Print integral values without a trailing ".0".
format_plain(x::Float64) = (isinteger(x) && abs(x) < 1e15) ? string(round(Int, x)) : string(x)

# Format replicate values as "mean ± std".
# - One replicate, identical replicates, or a non-finite std: there is no spread
#   to derive a precision from, so the mean is printed unrounded with "± 0".
#   (Previously a single input file crashed in floor(Int, log10(NaN)), and
#   identical non-integer replicates were rounded to an integer.)
# - Otherwise std is shown with 2 significant digits and the mean with matching decimals.
function mean_std_string(vals::AbstractVector{Float64})
    m = Statistics.mean(vals)
    sd = length(vals) > 1 ? Statistics.std(vals) : 0.0
    if !isfinite(sd) || sd == 0
        return format_plain(m) * " ± 0"
    end
    digit = -(floor(Int, log10(sd)) - 1)
    if digit <= 0
        string(round(Int, m)) * " ± " * string(round(Int, sd))
    else
        string(round(m, digits=digit)) * " ± " * string(round(sd, sigdigits=2))
    end
end

# One output string per cell position (= per row of cell_matrix).
mean_std_strings = map(eachrow(cell_matrix)) do row
    all(is_numeric_cell, row) ? mean_std_string(parse_numeric.(row)) : row[1]
end

# The separators come from the first file only.
specials = split(contents[1], r"[\w\d\.\-\:]+")

N = min(length(specials), length(mean_std_strings))

# If the file starts with a separator, the first cell is "" and cells lead the
# interleaving; otherwise the first special is "" and specials lead.
cells_lead = mean_std_strings[1] == ""
result = sprint() do io
    for i = 1:N
        if cells_lead
            print(io, mean_std_strings[i], specials[i])
        else
            print(io, specials[i], mean_std_strings[i])
        end
    end
end

filename = "stats." * basename(ARGS[1])
write(filename, result)

@info "Done" Output=filename
# Lookup tables indexed by (Phred score + 1), covering Phred 0..50:
#   qualpval_table[i] = 10^(-(i-1)/10)   error probability
#   qualprob_table[i] = 1 - error probability
const qualpval_table = map(q -> 10 ^ (-q/10), 0:50)
const qualprob_table = 1.0 .- qualpval_table

# Shared index computation for the two lookups below.
# Returns the 1-based table index for quality byte `Q`, clamped to the last entry.
# Throws (via `error`) when the index would be below 1.
@inline function _qual_table_index(Q, quality_offset)
    q = Q - quality_offset + 1
    q <= 0 && error("Input quality < 0 detected. Wrong --quality-format FORMAT or the input file is truncated.")
    q > 51 ? 51 : q
end

"""
    qualpval(Q, quality_offset)::Float64

Error probability for quality byte `Q`, looked up at table index
`Q - quality_offset + 1` (clamped to Phred 50). With `quality_offset = 33`,
`'!'` (byte 33) maps to Phred 0, i.e. an error probability of 1.0.
Throws an error when `Q < quality_offset`.

NOTE(review): the previous docstring said callers must pass "the real quality
offset - 1" (e.g. 32 for Illumina 1.8). That does not match this arithmetic nor
the `quality_offset=33` defaults used by callers in this package -- confirm which
convention the command-line layer passes.
"""
@inline function qualpval(Q, quality_offset)::Float64
    @inbounds qualpval_table[_qual_table_index(Q, quality_offset)]
end

"""
    qualprob(Q, quality_offset)::Float64

Probability that the base call is correct, i.e. `1 - qualpval(Q, quality_offset)`,
read from `qualprob_table`. Same index rule, clamping and error as `qualpval`.
"""
@inline function qualprob(Q, quality_offset)::Float64
    @inbounds qualprob_table[_qual_table_index(Q, quality_offset)]
end
"""
    update_prob_from_qual(r::FqRecord; quality_offset=33)

Resize `r.prob` to the length of `r.qual` and recompute every entry with
`qualprob(qual_byte, quality_offset)`. Returns `nothing`.
"""
@inline function update_prob_from_qual(r::FqRecord; quality_offset::Int64=33)::Nothing
    quals = r.qual
    probs = r.prob
    resize!(probs, length(quals))
    @inbounds for i in eachindex(quals)
        probs[i] = qualprob(quals[i], quality_offset)
    end
    return nothing
end


"""
    probsum(r::FqRecord, from, to)::Float64

Sum of `r.prob[from:to]`; the range is first clamped to `1:length(r.prob)`.
An empty range gives `0.0`.
"""
@inline function probsum(r::FqRecord, from::Int64, to::Int64)::Float64
    probs = r.prob
    lo = from < 1 ? 1 : from
    hi = to > length(probs) ? length(probs) : to

    total = 0.0
    @inbounds for i in lo:hi
        total += probs[i]
    end
    total
end

"""
    probmean(r::FqRecord, from, to)::Float64

Mean of `r.prob[from:to]` after clamping the range to `1:length(r.prob)`.
Returns `0.0` when the clamped range is empty.
"""
@inline function probmean(r::FqRecord, from::Int64, to::Int64)::Float64
    probs = r.prob
    lo = from < 1 ? 1 : from
    hi = to > length(probs) ? length(probs) : to
    count = hi - lo + 1
    count <= 0 && return 0.0

    total = 0.0
    @inbounds for i in lo:hi
        total += probs[i]
    end
    @fastmath total/count
end



"""
    compute_prob_and_score!(match_res, r, r_start, r_end; min_prob=0.75)
    compute_prob_and_score!(match_res, r1, r1_start, r1_end, r2, r2_start, r2_end; min_prob=0.75)

Set `match_res.prob` and `match_res.score`.

Single-read form: `prob` is the mean probability of `r` over `r_start:r_end`,
floored at `min_prob`. Paired form: each read's mean is floored at `min_prob`
and the two are multiplied. In both forms `score = ncompatible * prob`.
"""
@inline function compute_prob_and_score!(match_res::MatchRes, r::FqRecord, r_start::Int, r_end::Int; min_prob::Float64 = 0.75)
    floored = max(probmean(r, r_start, r_end), min_prob)
    match_res.prob = floored
    match_res.score = @fastmath match_res.ncompatible * match_res.prob
end
@inline function compute_prob_and_score!(match_res::MatchRes, r1::FqRecord, r1_start::Int, r1_end::Int, r2::FqRecord, r2_start::Int, r2_end::Int; min_prob::Float64 = 0.75)
    floored1 = max(probmean(r1, r1_start, r1_end), min_prob)
    floored2 = max(probmean(r2, r2_start, r2_end), min_prob)
    match_res.prob = @fastmath floored1 * floored2
    match_res.score = @fastmath match_res.ncompatible * match_res.prob
end
# https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz

# Simulation program information
# ART (Skewer modified version)
# ART: a next-generation sequencing read simulator. Bioinformatics. 2012 Feb 15; 28(4): 593–594.
# Simulated from a real public sequence: SRR7243169.1

# The environment variable `atria` must point to the Atria repository.

working_dir=~/analysis/atria-benchmark/art_simulate

mkdir -p "$working_dir"
# stop here rather than download and write results into an unrelated directory
cd "$working_dir" || exit 1

# download genome
mkdir -p genomes   # -p: do not fail when the script is re-run
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz -O genomes/TAIR10.1.fasta.gz
pigz -d genomes/TAIR10.1.fasta.gz

### simulate data
# Download from https://sourceforge.net/projects/skewer/files/Simulator/
cd "$working_dir/ART/art_profiler_illumina" || exit 1

# download real data to generate profiles
fastq-dump --split-files --origfmt SRR330569.3
# generate profiles
./Illumina_readprofile_art profile_SRR330569 . fastq

# three benchmark replicates
for i in $(seq 1 3)
do
    echo "Replicate $i ----------------------------------"
    bash "$atria/benchmark/art-simulate-run-bench.bash"
done

cd "$working_dir" || exit 1

# mean ± std over the replicates, one summary per trimmer (writes stats.summary.<Trimmer>)
for trimmer in AdapterRemoval Atria Skewer TrimGalore Trimmomatic Ktrim Fastp Atropos SeqPurge
do
    julia "$atria/benchmark/replicates-stats.jl" replicate_*/summary.$trimmer
done

awk 'BEGIN {print "Trimmer\tTP\tFP_ft\tFP_ot\tFN_fr\tFN_ut\tTN\tPPV\tSen.\tSpec.\tmCC" }; NR%3==2{FNR="\t"; sub("stats.summary.", "", FILENAME); print FILENAME"\t"$0}' stats.summary.* > performance_stats.df.txt

julia "$atria/benchmark/replicates-stats.jl" replicate_*/time_benchmark.df.txt
julia "$atria/benchmark/replicates-stats.jl" replicate_*/time_benchmark_gz.df.txt

julia "$atria/benchmark/replicates-stats.jl" replicate_*/time_benchmark.new.df.txt
julia "$atria/benchmark/replicates-stats.jl" replicate_*/time_benchmark.new_gz.df.txt

Rscript "$atria/benchmark/time_stats_plot.R" -i stats.time_benchmark.df.txt stats.time_benchmark_gz.df.txt -o time_stats_plot.html
# Tests for `bitsafe!` / `isbitsafe` and for `resize!`, `reverse_complement` and
# `reverse_complement!` on `LongDNA{4}` sequences.
#
# Reading aid: LongDNA{4} stores 16 nucleotides per UInt64 word. Every
# `length(x.data)` assertion below equals "words needed for the sequence + 1"
# (0 nt -> 1, 9 nt -> 2, 16 nt -> 2, 40 nt -> 4, 100 nt -> 8), which is presumably
# the spare word that makes a sequence "bitsafe" -- confirm against `bitsafe!`.
@noinline function test_biosequences_safety()

    @testset "biosequences safety" begin
        @testset "bitsafe" begin
            s1 = dna""
            s2 = dna"NASTTGGTTATCNNNN"
            # 40-nt sequence built directly from three raw words; not bitsafe until processed
            s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))

            bitsafe!(s1)
            @test length(s1) == 0
            @test length(s1.data) == 1
            # @test s1.data[1] == 0x0

            bitsafe!(s2)
            @test s2.data[1] == 0xffff28188448861f
            @test length(s2.data) == 2

            # s3 was never passed through bitsafe!, s1 and s2 were
            @test !isbitsafe(s3)
            @test isbitsafe(s1)
            @test isbitsafe(s2)
        end

        @testset "bitsafe resize" begin
            # shrinking must leave the sequence bitsafe
            s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
            resize!(s3, 5)
            @test isbitsafe(s3)

            # growing to 100 nt (7 data words) must end with 8 words
            s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
            resize!(s3, 100)
            @test length(s3.data) == 8
            @test isbitsafe(s3)

            # resizing a subsequence (nt 4..36) back to 40 nt
            s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
            s3 = s3[4:36]
            resize!(s3, 40)
            @test s3.data[1] == 0x1424214824181422
            @test s3.data[2] == 0x1420ff0000084128
            # OR-mask: only the lowest 4 bits (one nucleotide) of word 3 are checked
            @test s3.data[3] | 0xfffffffffffffff0 == 0xfffffffffffffff8
            @test length(s3.data) == 4
        end

        @testset "bitsafe reverse complement" begin
            s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
            s3_rc = reverse_complement(s3)
            # expected words of the reverse complement; kept for reference, not used by any assertion below
            true_s3_rc_data = [0x00000ff042814821,
                               0x8241284242814821,
                               0x0000000081844281,
                               0x0000000000000000]
            @test s3_rc.data[1] == 0x00000ff042814821
            @test s3_rc.data[2] == 0x8241284242814821
            # OR-mask: only the low 32 bits (8 nucleotides) of word 3 are checked
            @test s3_rc.data[3] | 0xffffffff00000000 == 0xffffffff81844281

            # same expectations after the input has been made bitsafe
            bitsafe!(s3)
            s3_rc = reverse_complement(s3)
            @test s3_rc.data[1] == 0x00000ff042814821
            @test s3_rc.data[2] == 0x8241284242814821
            @test s3_rc.data[3] | 0xffffffff00000000 == 0xffffffff81844281
            @test length(s3.data) == 4

            # in-place reverse complement of a 9-nt subsequence
            s4 = s3[17:25]
            s4_rc = reverse_complement!(s4)
            # OR-mask: only the low 36 bits (9 nucleotides) are checked
            @test s4_rc.data[1] | 0xfffffff000000000 == 0xfffffff428148210
            @test length(s4.data) == 2
        end
    end
end
"""
    println(io::IO, r::FqRecord)

Write the four fields of `r` (id, sequence, description, quality) to `io`,
one per line. The byte fields are copied before conversion to `String`, so the
record is left untouched.
"""
function Base.println(io::IO, r::FqRecord)
    # write to the requested stream (the `io` argument was previously ignored
    # and everything went to stdout)
    println(io, String(copy(r.id)))
    println(io, r.seq)
    println(io, String(copy(r.des)))
    println(io, String(copy(r.qual)))
end

Base.print(io::IO, r::FqRecord) = println(io, r)
Base.display(r::FqRecord) = println(stdout, r)
Base.show(r::FqRecord) = println(stdout, r)


# Remove spaces and tabs that directly follow a newline (lets tests write
# indented multi-line FASTQ literals).
remove_blank(s::AbstractString) = replace(s, r"\n[ \t]*" => "\n")

"""
    fqreadrecord(s::IO    ; quality_offset=33)::FqRecord
    fqreadrecord(s::String; quality_offset=33)::FqRecord

Read four newline-terminated lines from `s` and build a new `FqRecord`.
The `String` method first strips leading blanks of each line with `remove_blank`.

It is very slow and not recommended. See also `load_fqs_threads!`.
"""
function fqreadrecord(s::IO; quality_offset=33)::FqRecord
    # 0x0a is \n
    # not compatible with \r\n line endings
    id   = readuntil(s, 0x0a, keep=false)::Vector{UInt8}
    seq  = LongDNA{4}(readuntil(s, 0x0a, keep=false))::LongDNA{4}
    des  = readuntil(s, 0x0a, keep=false)::Vector{UInt8}
    qual = readuntil(s, 0x0a, keep=false)::Vector{UInt8}
    FqRecord(id, seq, des, qual; quality_offset=quality_offset)::FqRecord
end
fqreadrecord(s::String; quality_offset=33)::FqRecord = fqreadrecord(IOBuffer(remove_blank(s)), quality_offset=quality_offset)

"""
    fqreadrecord!(r::FqRecord, s::IO; quality_offset=33)

Read the next record from `s` into the existing record `r`, reusing its buffers.
The quality line is read as exactly `length(r.seq)` bytes; unless the stream is
then at end-of-file, the next byte must be a newline, otherwise an error is thrown.
Returns `nothing`.

It is very slow and not recommended. See also `load_fqs_threads!`.
"""
function fqreadrecord!(r::FqRecord, s::IO; quality_offset=33)
    safe_copyto!(r.id, readuntil(s, 0x0a, keep=false)::Vector{UInt8})
    safe_copyto!(r.seq, readuntil(s, 0x0a, keep=false)::Vector{UInt8})
    bitsafe!(r.seq)
    safe_copyto!(r.des, readuntil(s, 0x0a, keep=false)::Vector{UInt8})
    # quality has to be as long as the sequence: read exactly that many bytes
    resize!(r.qual, length(r.seq))
    readfill!(s, r.qual)

    resize!(r.prob, length(r.seq))
    @inbounds for (i, q) in enumerate(r.qual)
        r.prob[i] = qualprob(q, quality_offset)
    end

    eof(s) && return
    # String(copy(...)) shows the read name; string(r.id) would print "UInt8[0x40, ...]"
    read(s, UInt8) == 0xa || error("FASTQ is not valid: the lengths of sequence and quality are not the same for $(String(copy(r.id))): $s")
    return
end


"""
    fqwriterecord(io::IO, r::FqRecord)

Write `r` to `io` as four lines. A record with an empty sequence is written with
the placeholder sequence `N` and the placeholder quality `!`.
"""
function fqwriterecord(io::IO, r::FqRecord)
    no_seq = isempty(r.seq::LongDNA{4})

    write(io, r.id::Vector{UInt8})
    write(io, '\n')
    if no_seq
        write(io, 'N')
    else
        print(io, r.seq::LongDNA{4})  # no write method for LongDNA{4}
    end
    write(io, '\n')
    write(io, r.des::Vector{UInt8})
    write(io, '\n')
    if no_seq
        write(io, '!')
    else
        write(io, r.qual::Vector{UInt8})
    end
    write(io, '\n')
end
It is recommended to build Atria using Julia v1.8.5 because it is 3-20% faster than v1.9. 24 | 25 | ```bash 26 | juliaup add 1.8 27 | juliaup default 1.8 28 | ``` 29 | 30 | ##### Pigz and Pbzip2 31 | 32 | Pigz and Pbzip2 are parallel Gzip/Bzip2 command-line tools required by Atria. You can install them with [Homebrew](https://brew.sh/): 33 | 34 | ```bash 35 | brew install pigz 36 | brew install pbzip2 37 | ``` 38 | 39 | > If you do not use Homebrew, you can also download them from [pigz's official site](https://zlib.net/pigz/) and [pbzip2](https://pkgs.org/download/pbzip2). 40 | 41 | #### Atria 42 | 43 | Download the Atria git repository: 44 | 45 | ```bash 46 | git clone https://github.com/cihga39871/Atria.git 47 | ``` 48 | 49 | Go to `Atria` directory, and run `build_atria.jl` with Julia: 50 | 51 | ```bash 52 | cd Atria 53 | julia build_atria.jl 54 | ``` 55 | 56 | After installation, Atria is available at `./app-*/bin/atria`. Link `atria` into a directory on your `PATH` (the link target must be an absolute path, otherwise the link will be broken): 57 | 58 | ```bash 59 | sudo ln -s "$PWD"/app-*/bin/atria /usr/local/bin 60 | ``` 61 | 62 | ### Linux 63 | 64 | #### Prerequisite 65 | 66 | Linux, 64-bit system. 67 | 68 | 1. Download Julia package manager 69 | 70 | ```bash 71 | curl -fsSL https://install.julialang.org | sh 72 | ``` 73 | 74 | 2. Download and select Julia version v1.8.5. It is recommended to build Atria using Julia v1.8.5 because it is 3-20% faster than v1.9. 75 | 76 | ```bash 77 | juliaup add 1.8 78 | juliaup default 1.8 79 | ``` 80 | 81 | Then, download `pigz` and `pbzip2` (compression/decompression tools used by Atria). 82 | 83 | If you use `apt` package manager (Ubuntu/Debian), try `sudo apt install pigz pbzip2`. 84 | If you use `yum` package manager (CentOS), try `sudo yum install pigz pbzip2`. 85 | 86 | You can also download them from [pigz's official site](https://zlib.net/pigz/) and [pbzip2](https://pkgs.org/download/pbzip2).
87 | 88 | #### Atria 89 | 90 | Download the Atria git repository: 91 | 92 | ```bash 93 | git clone https://github.com/cihga39871/Atria.git 94 | ``` 95 | 96 | Go to `Atria` directory, and run `build_atria.jl` with Julia: 97 | 98 | ```bash 99 | cd Atria 100 | julia build_atria.jl 101 | ``` 102 | 103 | After installation, Atria is available at `./app-*/bin/atria`. Link `atria` into a directory on your `PATH` (the link target must be an absolute path, otherwise the link will be broken): 104 | 105 | ```bash 106 | sudo ln -s "$PWD"/app-*/bin/atria /usr/local/bin 107 | ``` 108 | -------------------------------------------------------------------------------- /docs/5.Accuracy_and_speed_benchmark.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This section evaluates trimming accuracy regarding different read properties, including adapter presence or absence, base error, and adapter length. To achieve the goal, Atria integrates a benchmarking toolkit for read simulation and trimming analysis. 4 | 5 | The details are described in the [Atria paper](https://gigabytejournal.com/articles/31). 6 | 7 | ## Trimmers 8 | 9 | - Atria v3.0.0 10 | - AdapterRemoval v2.3.1 11 | - Skewer v0.2.2 12 | - Fastp v0.21.0 13 | - Ktrim v1.2.1 14 | - Atropos v1.1.29 15 | - SeqPurge v2012_12 16 | - Trim Galore v0.6.5 17 | - Trimmomatic v0.39 18 | - Cutadapt v2.8 (#3) 19 | 20 | ## Data 21 | 22 | Twenty-one million read pairs were simulated with a uniform read length (100 bp), different error profiles, adapter length, and original insert sizes. 23 | 24 | The baseline error profile comprises a 0.1% substitution rate, 0.001% insertion rate, and 0.001% deletion rate, inspired by an Illumina error profile analysis. 1x, 2x, 3x, 4x, and 5x baseline error profile, and 66 to 120 even insert sizes are chosen. 25 | 26 | In this way, the reads with the least insert size have full lengths of adapters.
The reads with 66-98 original insert sizes contain adapters, and the reads with 100-120 original insert sizes are free from adapter contamination, except for a few reads with a 100 bp insert size containing indels. In each condition combination, 30 thousand read pairs were simulated to avoid random errors. 27 | 28 | ## Results 29 | 30 | ![Figure 1](https://github.com/cihga39871/Atria/raw/master/docs/Figure%201%20Simulation%20Accuracy2.png) 31 | 32 | **Figure 1 Adapter trimming accuracy on adapter presence and absence, different base errors, and adapter lengths** ([Interactive plots can be downloaded here](https://github.com/cihga39871/Atria/raw/master/docs/Figure%201%20Simulation%20Accuracy.html)) 33 | 34 | A1, B1, and C1 are statistics for reads with adapter contamination, while A2, B2, and C2 are for reads without adapters. 35 | 36 | A1 and A2 show the accumulated rates of accurate trim, one bp over trim, one bp under trim, multiple bp over trim, and multiple bp under trim. 37 | 38 | B1 and B2 show the trimming accuracy on different error profiles. 39 | 40 | C1 and C2 show the trimming accuracy on different adapter lengths. 41 | 42 | > Ktrim threw an error when processing simulated fastq files. Its accuracy was benchmarked using other methods in the [Atria paper](https://gigabytejournal.com/articles/31). 43 | 44 | 45 | 46 | ![Figure 2](https://github.com/cihga39871/Atria/raw/master/docs/Figure%202%20Speed.png) 47 | 48 | **Figure 2 Benchmark of adapter-trimming speed for uncompressed and compressed files on different threading options** ([Interactive plots can be downloaded here](https://github.com/cihga39871/Atria/raw/master/docs/Figure%202%20Speed.html)) 49 | 50 | The simulated paired-end data with a 100 bp read length was trimmed in both uncompressed and compressed format using up to 32 threads. Speed is the ratio of the number of bases to elapsed time (wall time). SeqPurge does not support uncompressed outputs, so it is not shown in the uncompressed benchmark.
"""
    julia_wrapper_randtrim(ARGS)

Entry point of `atria randtrim`. `ARGS` is a list of FASTQ paths taken pairwise
(R1, R2, R1, R2, ...); each pair is passed to `peReadRandomTrim_main`.
Prints the help page instead when `-h`/`--help` is present, when no argument is
given, or when the number of arguments is odd. Always returns `0`.
"""
function julia_wrapper_randtrim(ARGS)

    help_page = """
    usage: atria randtrim [-h] R1_FASTQ R2_FASTQ

    positional arguments:
      R?_FASTQ    input fastqs. caution: raw fastq has to be
                  generated by `atria simulate`.

    optional arguments:
      -h, --help  show this help message and exit
    """

    if "-h" in ARGS || "--help" in ARGS || length(ARGS) == 0 || length(ARGS) % 2 == 1
        println(help_page)
        return 0
    end

    time0 = time()

    npair = length(ARGS)÷2

    for i in 1:npair
        peReadRandomTrim_main(ARGS[2i-1], ARGS[2i])
    end

    @info "read random trim: all done" elapsed=time() - time0
    return 0
end

"""
    random_trim!(r::FqRecord)
    random_trim!(r1::FqRecord, r2::FqRecord)

Truncate the sequence and quality of `r` to a uniformly random length in
`0:length(r.seq)`. The second space-separated token of the read id is expected
to be `TRUE=<length>`; it is rewritten to the new length when the read becomes
shorter than that value.

The two-record method trims exactly one of the records, chosen with probability 0.5.
"""
@inline function random_trim!(r::FqRecord)

    nremain = rand(0:length(r.seq))

    splitted = split(String(copy(r.id)), " ")
    # splitted[2] looks like "TRUE=<n>": skip the 5-character prefix
    true_length = parse(Int64, splitted[2][6:end])

    if nremain < true_length
        splitted[2] = "TRUE=$nremain"
        # Vector{UInt8}(::String) copies the code units, so non-ASCII id bytes
        # survive (a Char -> UInt8 conversion would throw on them)
        safe_copyto!(r.id, Vector{UInt8}(join(splitted, " ")))
    end

    resize!(r.seq, nremain)
    resize!(r.qual, nremain)
end

@inline function random_trim!(r1::FqRecord, r2::FqRecord)
    if rand() < 0.5
        random_trim!(r1)
    else
        random_trim!(r2)
    end
end


"""
    peReadRandomTrim_main(file1::String, file2::String)

Read the paired FASTQ files record by record, randomly trim one mate of every
pair (`random_trim!`), and write the results next to the inputs with `randtrim.`
inserted before the file extension. If either input is not a file, a warning is
logged and nothing is done. Returns `nothing`.

NOTE(review): the output-name pattern strips a trailing `.gz`, but inputs are
opened with plain `open`, so gzip-compressed inputs are presumably not supported
here -- confirm.
"""
function peReadRandomTrim_main(file1::String, file2::String)
    @info "read random trim: start" file1 file2

    for input in [file1,file2]
        if !isfile(input)
            @warn "read random trim: input FASTQ file not valid: skip" FILE=input _module=nothing _group=nothing _id=nothing _file=nothing
            return nothing
        end
    end

    outfile1 = replace(file1, r"(fastq$|fq$|[^.]*)(\.gz)?$"i => s"randtrim.\1", count=1)
    outfile2 = replace(file2, r"(fastq$|fq$|[^.]*)(\.gz)?$"i => s"randtrim.\1", count=1)

    # every stream opened so far is closed in `finally`, even if a later
    # open or a malformed record throws
    opened = IOStream[]
    try
        io1 = open(file1, "r");        push!(opened, io1)
        io2 = open(file2, "r");        push!(opened, io2)
        io1out = open(outfile1, "w");  push!(opened, io1out)
        io2out = open(outfile2, "w");  push!(opened, io2out)

        r1 = FqRecord()
        r2 = FqRecord()

        #================== Read iteration ====================#
        while !eof(io1::IO)
            if eof(io2::IO)
                # R2 ran out first: stop instead of pairing R1 with empty records
                @warn "read random trim: R2 has fewer records than R1: stop" file1 file2
                break
            end
            # read record
            fqreadrecord!(r1::FqRecord, io1::IO)
            fqreadrecord!(r2::FqRecord, io2::IO)

            random_trim!(r1::FqRecord, r2::FqRecord)

            fqwriterecord(io1out::IO, r1::FqRecord)
            fqwriterecord(io2out::IO, r2::FqRecord)
        end
    finally
        #================== Close files ====================#
        foreach(close, opened)
    end
    return nothing
end
PrettyTables 5 | 6 | # using ArgParse 7 | # using BioSymbols 8 | # using BioSequences 9 | # using Printf 10 | # using JSON 11 | # using Statistics 12 | # using DelimitedFiles 13 | # using Distributed 14 | # using Base.Threads 15 | # using Logging 16 | # using DataStructures 17 | # using Markdown 18 | # using PrettyTables 19 | 20 | using Reexport 21 | 22 | include(joinpath("BioBits", "BioBits.jl")) 23 | @reexport using .BioBits 24 | 25 | include(joinpath("FqRecords", "FqRecords.jl")) 26 | @reexport using .FqRecords 27 | 28 | include(joinpath("Trimmer", "Trimmer.jl")) 29 | @reexport using .Trimmer 30 | 31 | include(joinpath("Benchmark", "Benchmark.jl")) 32 | @reexport using .Benchmark 33 | 34 | include(joinpath("AtriaTest", "AtriaTest.jl")) 35 | @reexport using .AtriaTest 36 | 37 | 38 | function julia_main()::Cint 39 | 40 | help_programs = """ 41 | Available programs: 42 | atria Pair-end trimming software (default) 43 | simulate Generate artificial pair-end reads 44 | randtrim Randomly trim R1 or R2 at a random position 45 | readstat Collect trimming statistics 46 | (reads should be generated by `atria simulate`) 47 | statplot Plot trimming statistics 48 | (`Rscript` in PATH required) 49 | test Test Atria program 50 | p | prog Show this program list 51 | """ 52 | 53 | if length(ARGS)::Int64 >= 1 54 | if ARGS[1] in ["prog", "p"] 55 | println(help_programs) 56 | elseif ARGS[1] in ("atria", "Atria") 57 | if "--detect-adapter" in ARGS 58 | if "-R" in ARGS || "--read2" in ARGS 59 | julia_wrapper_detect_adapter_pe(ARGS[2:end]) 60 | else 61 | julia_wrapper_detect_adapter_se(ARGS[2:end]) 62 | end 63 | elseif "-R" in ARGS || "--read2" in ARGS 64 | # paired-end 65 | julia_wrapper_atria_pe(ARGS[2:end]::Vector{String}) 66 | else 67 | julia_wrapper_atria_se(ARGS[2:end]::Vector{String}) 68 | end 69 | elseif ARGS[1] == "simulate" 70 | julia_wrapper_simulate(ARGS[2:end]::Vector{String}) 71 | elseif ARGS[1] == "randtrim" 72 | julia_wrapper_randtrim(ARGS[2:end]::Vector{String}) 73 | 
elseif ARGS[1] == "readstat" 74 | julia_wrapper_readstat(ARGS[2:end]::Vector{String}) 75 | elseif ARGS[1] == "statplot" 76 | julia_wrapper_rscript(statplot_code, ARGS[2:end]::Vector{String}) 77 | elseif ARGS[1] == "test" 78 | test_atria() 79 | else 80 | if "--detect-adapter" in ARGS 81 | if "-R" in ARGS || "--read2" in ARGS 82 | julia_wrapper_detect_adapter_pe(ARGS) 83 | else 84 | julia_wrapper_detect_adapter_se(ARGS) 85 | end 86 | elseif "-R" in ARGS || "--read2" in ARGS 87 | # paired-end 88 | julia_wrapper_atria_pe(ARGS::Vector{String}) 89 | else 90 | julia_wrapper_atria_se(ARGS::Vector{String}) 91 | end 92 | end 93 | else 94 | atria_markdown_help() 95 | end 96 | return 0 97 | end 98 | 99 | 100 | 101 | end # module end 102 | -------------------------------------------------------------------------------- /src/BioBits/biosequences_safety.jl: -------------------------------------------------------------------------------- 1 | 2 | #= 3 | Some functions, such as BioSequences._orphan!, Base.resize!, and 4 | BioSequences.reverse_complement! were modified from BioSequences package 5 | developped by BioJulia. Those functions have their own license: 6 | 7 | MIT License 8 | 9 | Copyright (c) 2018: BioJulia. 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining 12 | a copy of this software and associated documentation files (the 13 | "Software"), to deal in the Software without restriction, including 14 | without limitation the rights to use, copy, modify, merge, publish, 15 | distribute, sublicense, and/or sell copies of the Software, and to 16 | permit persons to whom the Software is furnished to do so, subject to 17 | the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be 20 | included in all copies or substantial portions of the Software. 
21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 25 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 27 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 28 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 29 | =# 30 | 31 | @inline function isbitsafe(seq::LongDNA{4}) 32 | unsafe_isbitsafe(seq) && 33 | seq.data[end] == 0x0000000000000000 && 34 | if length(seq.data) > 1 && seq.len % 16 != 0x0000000000000000 35 | (seq.data[end-1] >> (seq.len % 16 * 4) == 0x0000000000000000) 36 | else 37 | true 38 | end 39 | end 40 | 41 | @inline function unsafe_isbitsafe(seq::LongDNA{4}) 42 | length(seq.data) == cld(seq.len, 16) + 1 43 | end 44 | 45 | """ 46 | unsafe_extra_bits_to_zeros!(seq::LongDNA{4}) 47 | 48 | Caution: use only in bitsafe seq! 49 | """ 50 | @inline function unsafe_extra_bits_to_zeros!(seq::LongDNA{4}) 51 | if !isempty(seq) 52 | remain = (seq.len % 16) 53 | @inbounds if remain != 0 54 | seq.data[end-1] &= ~(0xffffffffffffffff << (remain * 4)) 55 | end 56 | end 57 | @inbounds seq.data[end] = 0x0000000000000000 58 | return seq 59 | end 60 | 61 | """ 62 | bitsafe!(seq::LongDNA{4}) 63 | 64 | Resize `seq.data` to allow loading a pointer `Ptr{UInt64}` safely at the end of `seq`. 65 | 66 | Caution: bitsafe LongDNA{4} may not be compatible on all BioSequences functions, especially those do in-place replacement. 67 | """ 68 | @inline function bitsafe!(seq::LongDNA{4}) 69 | if !unsafe_isbitsafe(seq) 70 | resize!(seq.data, cld(seq.len, 16) + 1) 71 | end 72 | unsafe_extra_bits_to_zeros!(seq) 73 | end 74 | 75 | """ 76 | resize!(seq::LongDNA{4}, size::Int[, force::Bool=false]) 77 | 78 | It overrides `resize!` in BioSequences. Resize a biological sequence `seq`, to a given `size`. 
The underlying data is bitsafe. 79 | """ 80 | @inline function Base.resize!(seq::LongSequence{A}, size::Int, force::Bool=false) where {A} 81 | if size < 0 82 | throw(ArgumentError("size must be non-negative")) 83 | else 84 | if force | (BioSequences.seq_data_len(A, size) > BioSequences.seq_data_len(A, length(seq))) 85 | resize!(seq.data, BioSequences.seq_data_len(A, size)) 86 | end 87 | seq.len = size 88 | bitsafe!(seq) 89 | end 90 | end 91 | 92 | function BioSequences.reverse_complement!(seq::LongSequence{<:NucleicAcidAlphabet}) 93 | pred = x -> BioSequences.complement_bitpar(x, Alphabet(seq)) 94 | BioSequences.reverse_data!(pred, seq.data, BioSequences.seq_data_len(seq) % UInt, BioSequences.BitsPerSymbol(seq)) 95 | BioSequences.zero_offset!(seq) 96 | bitsafe!(seq) 97 | end 98 | 99 | function BioSequences.reverse_complement(seq::LongSequence{<:NucleicAcidAlphabet}) 100 | cp = typeof(seq)(undef, unsigned(length(seq))) 101 | pred = x -> BioSequences.complement_bitpar(x, Alphabet(seq)) 102 | BioSequences.reverse_data_copy!(pred, cp.data, seq.data, BioSequences.seq_data_len(seq) % UInt, BioSequences.BitsPerSymbol(seq)) 103 | BioSequences.zero_offset!(cp) 104 | bitsafe!(cp) 105 | end 106 | -------------------------------------------------------------------------------- /benchmark/real-data-rnaseq.bash: -------------------------------------------------------------------------------- 1 | 2 | ##### SRR330569: RNA-seq D. 
simulans 3 | working_dir=~/analysis/atria-benchmark/SRR330569 4 | cd $working_dir 5 | 6 | r1=SRR330569.3_1.fastq.gz 7 | r2=SRR330569.3_2.fastq.gz 8 | a1=AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCG 9 | a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGAT 10 | bwa_ref=`pwd`/genomes/dsim-all-chromosome-r2.02.fasta 11 | 12 | # download reference 13 | mkdir genomes 14 | # wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/754/195/GCF_000754195.2_ASM75419v2/GCF_000754195.2_ASM75419v2_genomic.fna.gz -O $bwa_ref_genbank.gz 15 | wget ftp://ftp.flybase.net/genomes/Drosophila_simulans/dsim_r2.02_FB2020_03/fasta/dsim-all-chromosome-r2.02.fasta.gz -O $bwa_ref.gz 16 | 17 | gzip -d $bwa_ref.gz 18 | 19 | # build reference 20 | hisat2-build $bwa_ref $bwa_ref-hisat2 21 | 22 | 23 | ##### Pipelines 24 | 25 | . $atria/benchmark/trimming-functions.bash 26 | 27 | rm -f stderr.log 28 | run_atria 8 2>> stderr.log 29 | 30 | run_atria_consensus 8 2>> stderr.log 31 | 32 | run_adapterremoval 8 2>> stderr.log 33 | 34 | run_skewer 8 2>> stderr.log 35 | 36 | run_trim_galore 8 2>> stderr.log 37 | 38 | run_trimmomatic 8 2>> stderr.log 39 | 40 | run_ktrim 8 2>> stderr.log 41 | pigz Ktrim/ktrim.read1.fq Ktrim/ktrim.read2.fq 42 | 43 | run_fastp 8 2>> stderr.log 44 | 45 | run_seqpurge 8 2>> stderr.log 46 | 47 | run_atropos 8 2>> stderr.log 48 | 49 | # mapping without qualtrim 50 | mkdir -p trimmed 51 | ln -s ../AdapterRemoval-3/adapterremoval.pair1.truncated.gz trimmed/adapterremoval.R1.fastq.gz 52 | ln -s ../AdapterRemoval-3/adapterremoval.pair2.truncated.gz trimmed/adapterremoval.R2.fastq.gz 53 | 54 | ln -s ../Atria/${r1/.fastq*/}.atria.fastq.gz trimmed/atria.R1.fastq.gz 55 | ln -s ../Atria/${r2/.fastq*/}.atria.fastq.gz trimmed/atria.R2.fastq.gz 56 | 57 | ln -s ../Atria-consensus/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R1.fastq.gz 58 | ln -s ../Atria-consensus/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R2.fastq.gz 59 | 60 | ln -s ../Skewer/Skewer-trimmed-pair1.fastq.gz 
trimmed/Skewer.R1.fastq.gz 61 | ln -s ../Skewer/Skewer-trimmed-pair2.fastq.gz trimmed/Skewer.R2.fastq.gz 62 | 63 | ln -s ../TrimGalore/${r1/.fastq*/}_val_1.fq.gz trimmed/trimgalore.R1.fastq.gz 64 | ln -s ../TrimGalore/${r2/.fastq*/}_val_2.fq.gz trimmed/trimgalore.R2.fastq.gz 65 | 66 | ln -s ../Trimmomatic/out-pair1.paired.fq.gz trimmed/trimmomatic.R1.fastq.gz 67 | ln -s ../Trimmomatic/out-pair2.paired.fq.gz trimmed/trimmomatic.R2.fastq.gz 68 | 69 | ln -s ../Ktrim/ktrim.read1.fq.gz trimmed/ktrim.R1.fastq.gz 70 | ln -s ../Ktrim/ktrim.read2.fq.gz trimmed/ktrim.R2.fastq.gz 71 | 72 | ln -s ../fastp/out.fastp.r1.fq.gz trimmed/fastp.R1.fastq.gz 73 | ln -s ../fastp/out.fastp.r2.fq.gz trimmed/fastp.R2.fastq.gz 74 | 75 | ln -s ../SeqPurge/SRR330569.3_1.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R1.fastq.gz 76 | ln -s ../SeqPurge/SRR330569.3_2.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R2.fastq.gz 77 | 78 | ln -s ../Atropos/SRR330569.3_1.fastq.gz.atropos.fq.gz trimmed/atropos.R1.fastq.gz 79 | ln -s ../Atropos/SRR330569.3_2.fastq.gz.atropos.fq.gz trimmed/atropos.R2.fastq.gz 80 | 81 | 82 | # mapping after qualtrim 83 | QSCORE=15 84 | time atria -r trimmed/*.R1.fastq.gz -R trimmed/*.R2.fastq.gz -t 5 -p 6 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier 85 | rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq* 86 | 87 | # time atria -r trimmed/ktrim.R1.fastq.gz trimmed/fastp.R1.fastq.gz trimmed/seqpurge.R1.fastq.gz trimmed/atropos.R1.fastq.gz -R trimmed/ktrim.R2.fastq.gz trimmed/fastp.R2.fastq.gz trimmed/seqpurge.R2.fastq.gz trimmed/atropos.R2.fastq.gz -t 7 -p 4 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier 88 | # rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq* 89 | 90 | # trimmed*/{atropos,fastp,ktrim,seqpurge}.R1*fastq.gz 91 | for i in trimmed*/*.R1*fastq.gz 92 | do 
echo $i
mapping_hisat2 $i ${i/.R1/.R2}

samtools stats $i.hisat2.sam > $i.hisat2.sam.samtools-stats &
pigz $i.hisat2.sam
done 2>&1 | tee mapping.log


cd trimmed
pasteSamtoolsStats *samtools-stats
cd ..


cd trimmed-qualtrim
pasteSamtoolsStats *samtools-stats
cd ..

--------------------------------------------------------------------------------
/benchmark/real-data-human.bash:
--------------------------------------------------------------------------------

##### ERR4695159: human paired-end reads (reference: GRCh38 / hg38)
working_dir=~/analysis/atria-benchmark/ERR4695159
cd $working_dir

r1=ERR4695159_1.fastq.gz
r2=ERR4695159_2.fastq.gz
a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
# Path of the *uncompressed* reference. The archive is downloaded to
# "$bwa_ref.gz" and unpacked in place, mirroring real-data-rnaseq.bash.
# (Previously the variable itself ended in .gz, so `gzip -d $bwa_ref.gz`
# looked for a non-existent hg38.fasta.gz.gz.)
# NOTE(review): the bowtie2 index prefix is now hg38.fasta-bowtie2; confirm that
# mapping_bowtie2 in trimming-functions.bash derives the prefix from $bwa_ref.
bwa_ref=`pwd`/genomes/hg38.fasta

# download reference
mkdir -p genomes
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz -O $bwa_ref.gz

gzip -d $bwa_ref.gz

# build reference
bowtie2-build $bwa_ref $bwa_ref-bowtie2


##### Pipelines

# $atria must point to the Atria repository root before running this script.
.
$atria/benchmark/trimming-functions.bash 25 | 26 | rm -f stderr.log 27 | run_atria 8 2>> stderr.log 28 | 29 | run_atria_consensus 8 2>> stderr.log 30 | 31 | run_adapterremoval 8 2>> stderr.log 32 | 33 | run_skewer 8 2>> stderr.log 34 | 35 | run_trim_galore 8 2>> stderr.log 36 | 37 | run_trimmomatic 8 2>> stderr.log 38 | 39 | run_ktrim 8 2>> stderr.log 40 | pigz Ktrim/ktrim.read1.fq Ktrim/ktrim.read2.fq 41 | 42 | run_fastp 8 2>> stderr.log 43 | 44 | run_seqpurge 8 2>> stderr.log 45 | 46 | run_atropos 8 2>> stderr.log 47 | 48 | # mapping without qualtrim 49 | mkdir -p trimmed 50 | ln -s ../AdapterRemoval-3/adapterremoval.pair1.truncated.gz trimmed/adapterremoval.R1.fastq.gz 51 | ln -s ../AdapterRemoval-3/adapterremoval.pair2.truncated.gz trimmed/adapterremoval.R2.fastq.gz 52 | 53 | ln -s ../Atria/${r1/.fastq*/}.atria.fastq.gz trimmed/atria.R1.fastq.gz 54 | ln -s ../Atria/${r2/.fastq*/}.atria.fastq.gz trimmed/atria.R2.fastq.gz 55 | 56 | ln -s ../Atria-consensus/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R1.fastq.gz 57 | ln -s ../Atria-consensus/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R2.fastq.gz 58 | 59 | ln -s ../Skewer/Skewer-trimmed-pair1.fastq.gz trimmed/Skewer.R1.fastq.gz 60 | ln -s ../Skewer/Skewer-trimmed-pair2.fastq.gz trimmed/Skewer.R2.fastq.gz 61 | 62 | ln -s ../TrimGalore/${r1/.fastq*/}_val_1.fq.gz trimmed/trimgalore.R1.fastq.gz 63 | ln -s ../TrimGalore/${r2/.fastq*/}_val_2.fq.gz trimmed/trimgalore.R2.fastq.gz 64 | 65 | ln -s ../Trimmomatic/out-pair1.paired.fq.gz trimmed/trimmomatic.R1.fastq.gz 66 | ln -s ../Trimmomatic/out-pair2.paired.fq.gz trimmed/trimmomatic.R2.fastq.gz 67 | 68 | ln -s ../Ktrim/ktrim.read1.fq.gz trimmed/ktrim.R1.fastq.gz 69 | ln -s ../Ktrim/ktrim.read2.fq.gz trimmed/ktrim.R2.fastq.gz 70 | 71 | ln -s ../fastp/out.fastp.r1.fq.gz trimmed/fastp.R1.fastq.gz 72 | ln -s ../fastp/out.fastp.r2.fq.gz trimmed/fastp.R2.fastq.gz 73 | 74 | ln -s ../SeqPurge/ERR4695159_1.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R1.fastq.gz 75 
| ln -s ../SeqPurge/ERR4695159_2.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R2.fastq.gz 76 | 77 | ln -s ../Atropos/ERR4695159_1.fastq.gz.atropos.fq.gz trimmed/atropos.R1.fastq.gz 78 | ln -s ../Atropos/ERR4695159_2.fastq.gz.atropos.fq.gz trimmed/atropos.R2.fastq.gz 79 | 80 | # ln -s ../Atria-v3.1.4/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-v3.1.4.R1.fastq.gz 81 | # ln -s ../Atria-v3.1.4/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-v3.1.4.R2.fastq.gz 82 | 83 | # ln -s ../Atria-consensus-v3.1.4/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-consensus-v3.1.4.R1.fastq.gz 84 | # ln -s ../Atria-consensus-v3.1.4/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-consensus-v3.1.4.R2.fastq.gz 85 | 86 | # mapping after qualtrim 87 | QSCORE=15 88 | time atria -r trimmed/*.R1.fastq.gz -R trimmed/*.R2.fastq.gz -t 5 -p 6 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier 89 | rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq* 90 | 91 | # time atria -r trimmed/ktrim.R1.fastq.gz trimmed/fastp.R1.fastq.gz trimmed/seqpurge.R1.fastq.gz trimmed/atropos.R1.fastq.gz -R trimmed/ktrim.R2.fastq.gz trimmed/fastp.R2.fastq.gz trimmed/seqpurge.R2.fastq.gz trimmed/atropos.R2.fastq.gz -t 7 -p 4 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier 92 | # rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq* 93 | 94 | # trimmed*/{atropos,fastp,ktrim,seqpurge}.R1*fastq.gz 95 | for i in trimmed*/*.R1*fastq.gz 96 | do 97 | echo $i 98 | mapping_bowtie2 $i ${i/.R1/.R2} 99 | 100 | samtools stats $i.bowtie2.bam > $i.bowtie2.bam.samtools-stats & 101 | done 2>&1 | tee mapping.log 102 | 103 | 104 | cd trimmed 105 | pasteSamtoolsStats *samtools-stats 106 | cd .. 107 | 108 | 109 | cd trimmed-qualtrim 110 | pasteSamtoolsStats *samtools-stats 111 | cd .. 
--------------------------------------------------------------------------------
/src/BioBits/insert_size_decision.jl:
--------------------------------------------------------------------------------

"""
    insert_size_decision(a_insert_size, a_score, b_insert_size, b_score; insert_size_diff = 0)

Merge two `(insert_size, score)` candidates into one.

- Equal insert sizes: keep that size and sum the scores.
- Sizes differing by at most `insert_size_diff`: keep the smaller size and sum the scores.
- Otherwise: keep the candidate with the strictly higher score (`b` wins ties).

Returns `(insert_size, score)`.
"""
@inline function insert_size_decision(a_insert_size::Int64, a_score::Float64, b_insert_size::Int64, b_score::Float64; insert_size_diff::Int64 = 0)
    if a_insert_size == b_insert_size
        insert_size = a_insert_size
        score = a_score + b_score
    elseif abs(a_insert_size - b_insert_size) <= insert_size_diff
        insert_size = min(a_insert_size, b_insert_size)
        score = a_score + b_score
    elseif a_score > b_score
        insert_size = a_insert_size
        score = a_score
    else
        insert_size = b_insert_size
        score = b_score
    end
    insert_size, score
end


"""
    insert_size_decision_separate(a_insert_size, a_score, b_insert_size, b_score; insert_size_diff = 0)

Like `insert_size_decision`, but returns an insert size for each side.

When the two sizes differ by at most `insert_size_diff`, both are returned
unchanged with the summed score. Otherwise both sizes are set to the one
belonging to the strictly higher score (`b` wins ties) and that score is kept.

Returns `(a_insert_size, b_insert_size, score)`.
"""
@inline function insert_size_decision_separate(a_insert_size::Int64, a_score::Float64, b_insert_size::Int64, b_score::Float64; insert_size_diff::Int64 = 0)
    if abs(a_insert_size - b_insert_size) <= insert_size_diff
        score = a_score + b_score
        # insert sizes not changed
    # NOTE: remove the following elseif because
    # elseif abs(a_score - b_score) < score_diff  0 <= score_diff <= 3 get the highest result.
    #     score = (a_score + b_score) / 2
    #     # choose the min insert size for both a and b
    #     if a_insert_size > b_insert_size
    #         a_insert_size = b_insert_size
    #     else
    #         b_insert_size = a_insert_size
    #     end
    elseif a_score > b_score
        score = a_score
        b_insert_size = a_insert_size
    else
        score = b_score
        a_insert_size = b_insert_size
    end
    a_insert_size, b_insert_size, score
end

"""
    is_false_positive(r1_adapter_insert_size, r1_pe_insert_size, r1_length,
                      r2_adapter_insert_size, r2_pe_insert_size, r2_length;
                      insert_size_diff = 0, tail_length = 8)::Bool

Return `true` when, for at least one read, the adapter-based insert size is
either `-1` or greater than `length - tail_length` while the paired-end insert
size is below `length - tail_length`, and for neither read do the two insert
sizes agree within `insert_size_diff`.

Returns `false` immediately when a read is shorter than the larger of the other
read's two insert sizes.
"""
@inline function is_false_positive(r1_adapter_insert_size::Int64, r1_pe_insert_size::Int64, r1_length::Int64, r2_adapter_insert_size::Int64, r2_pe_insert_size::Int64, r2_length::Int64; insert_size_diff::Int64 = 0, tail_length::Int64 = 8)::Bool

    # skip running this function when length are different.
    if r1_length < max(r2_adapter_insert_size, r2_pe_insert_size) ||
        r2_length < max(r1_adapter_insert_size, r1_pe_insert_size)
        return false
    end

    r1_adapter_error = (r1_adapter_insert_size > r1_length - tail_length) | (r1_adapter_insert_size == -1)
    r1_pe_error = r1_pe_insert_size < r1_length - tail_length
    r1_error = r1_adapter_error & r1_pe_error

    r2_adapter_error = (r2_adapter_insert_size > r2_length - tail_length) | (r2_adapter_insert_size == -1)
    r2_pe_error = r2_pe_insert_size < r2_length - tail_length
    r2_error = r2_adapter_error & r2_pe_error

    r1_adapter_inrange = abs(r1_adapter_insert_size - r1_pe_insert_size) <= insert_size_diff
    r2_adapter_inrange = abs(r2_adapter_insert_size - r2_pe_insert_size) <= insert_size_diff

    not_false_positive = r1_adapter_inrange | r2_adapter_inrange

    (r1_error | r2_error) & !not_false_positive
end

"""
    one_bp_check(r::LongDNA{4}, a::LongDNA{4}, nremain::Int64, length_to_check::Int64)

v3.0.0: When finishing matching, Atria might have 1 bp offset because of insert size decision. Check 1 bp offset of reads at adapter (`a`) position (`nremain + 1`) to adapter is necessary.

The first `length_to_check` bases of `a` are compared with `r` starting at
`nremain`, `nremain + 1` and `nremain + 2`; the start with the most identical
bases decides the result (`nremain - 1`, `nremain` or `nremain + 1`), with ties
resolved in favour of the unshifted position, then the left shift.

`nremain` is returned unchanged when `nremain >= length(r) - 3` or when
`nremain < 0`. The left shift is not considered when `nremain == 0`, because it
would start at position 0 of `r`.

Return best nremain::Int64.
"""
@inline function one_bp_check(r::LongDNA{4}, a::LongDNA{4}, nremain::Int64, length_to_check::Int64)
    n = length(r)
    if nremain >= n - 3  ## no need to check adapter when no adapter.
        return nremain
    end
    if nremain < 0
        # `nremain + 1` would not be a valid 1-based position of `r`, and
        # `unsafe_seq_identity` performs no bounds checks.
        return nremain
    end
    nmatch = unsafe_seq_identity(r, a, nremain + 1, 1, length_to_check)
    # Position 0 does not exist: give the left shift a score that can never
    # win instead of reading `r[0]` under `@inbounds`.
    nmatch_left = nremain >= 1 ? unsafe_seq_identity(r, a, nremain, 1, length_to_check) : -1
    nmatch_right = unsafe_seq_identity(r, a, nremain + 2, 1, length_to_check)
    if nmatch >= nmatch_left
        if nmatch >= nmatch_right
            nremain
        else
            nremain + 1
        end
    else
        if nmatch_left >= nmatch_right
            nremain - 1
        else
            nremain + 1
        end
    end
end

"""
    unsafe_seq_identity(a::LongDNA{4}, b::LongDNA{4}, ia::Int64, ib::Int64, max_check::Int64; max_a::Int64 = length(a))

Count identical symbols between `a` starting at `ia` and `b` starting at `ib`.
Comparison stops after `max_check` positions or once the index into `a`
exceeds `max_a`.

Unsafe: element access is `@inbounds`, `ia`/`ib` are not checked against the
lower bound, and the index into `b` is never checked against `length(b)`.
Callers must pass valid start positions and a `max_check` that stays within `b`.
"""
@inline function unsafe_seq_identity(a::LongDNA{4}, b::LongDNA{4}, ia::Int64, ib::Int64, max_check::Int64; max_a::Int64 = length(a))
    nmatch = 0
    ncheck = 0
    @inbounds while ncheck < max_check && ia <= max_a
        if a[ia] === b[ib]
            nmatch += 1
        end
        ncheck +=1
        ia += 1
        ib += 1
    end
    nmatch
end

--------------------------------------------------------------------------------
/benchmark/atria-similate-for-atria-only.bash:
--------------------------------------------------------------------------------

cd ~/analysis/atria-benchmark/julia1.8.5-atria4.0.0

a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT

r1="reads_diff_indel.R1.fastq"
r2="reads_diff_indel.R2.fastq"

# r1="reads_diff_indel.R1.fastq.gz"
# r2="reads_diff_indel.R2.fastq.gz"

atria_old=atria
atria_new=/home/jc/projects/Atria-jl1.8/app-4.0.0_2023-11-10T09-58/bin/atria


.
$atria/benchmark/trimming-functions.bash 18 | 19 | # atria simulate --prefix reads_diff_indel --adapter1 $a1 --adapter2 $a2 --repeat 300000 --subsitution-rate 0.001 0.002 0.003 0.004 0.005 --insertion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 --deletion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 -s 100 -i `seq 66 2 120` 20 | atria simulate --prefix reads_diff_indel --adapter1 $a1 --adapter2 $a2 --repeat 300000 --subsitution-rate 0.001 0.003 0.005 --insertion-rate 1.0e-5 3.0e-5 5.0e-5 --deletion-rate 1.0e-5 3.0e-5 5.0e-5 -s 100 -i `seq 78 2 108` 21 | 22 | NUM_READS=`wc -l reads_diff_indel.R1.fastq | cut -f1 -d" "` 23 | NUM_BASES=`echo "$NUM_READS / 4 * 200" | bc` 24 | echo NUM_BASES=$NUM_BASES 25 | 26 | 27 | run_atria_src(){ 28 | local num_threads=1 29 | local outdir=Atria-src 30 | if [[ $1 ]]; then 31 | num_threads=$1 32 | fi 33 | if [[ $2 ]]; then 34 | outdir=$2 35 | fi 36 | export JULIA_NUM_THREADS=$num_threads 37 | $time $atria/src/atria --no-consensus -t $num_threads -r $r1 -R $r2 -o $outdir --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration --adapter1 $a1 --adapter2 $a2 --force 38 | } 39 | 40 | run_atria(){ 41 | local num_threads=1 42 | if [[ $1 ]]; then 43 | num_threads=$1 44 | fi 45 | $time -v $atria_old --no-consensus -t $num_threads \ 46 | -r $r1 -R $r2 \ 47 | -o Atria-old \ 48 | --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \ 49 | --adapter1 $a1 --adapter2 $a2 --force 50 | } 51 | 52 | run_atria_consensus(){ 53 | local num_threads=1 54 | if [[ $1 ]]; then 55 | num_threads=$1 56 | fi 57 | $time -v $atria_old -t $num_threads \ 58 | -r $r1 -R $r2 \ 59 | -o Atria-consensus-old \ 60 | --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \ 61 | --adapter1 $a1 --adapter2 $a2 --force 62 | } 63 | 64 | run_atria_new(){ 65 | local num_threads=1 66 | if [[ $1 ]]; then 67 | num_threads=$1 68 | fi 69 | $time -v $atria_new --no-consensus -t $num_threads \ 70 | -r $r1 -R $r2 \ 71 | -o Atria-new \ 72 | --no-tail-n-trim 
--max-n=-1 --no-quality-trim --no-length-filtration \ 73 | --adapter1 $a1 --adapter2 $a2 --force 74 | } 75 | 76 | run_atria_consensus_new(){ 77 | local num_threads=1 78 | if [[ $1 ]]; then 79 | num_threads=$1 80 | fi 81 | $time -v $atria_new \ 82 | -r $r1 -R $r2 \ 83 | -o Atria-consensus-new \ 84 | --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \ 85 | --adapter1 $a1 --adapter2 $a2 -t $num_threads --force 86 | } 87 | 88 | 89 | echo "" 2> stderr.base.log 90 | run_atria 1 2>> stderr.base.log 91 | run_atria 2 2>> stderr.base.log 92 | run_atria 4 2>> stderr.base.log 93 | run_atria 8 2>> stderr.base.log 94 | run_atria 16 2>> stderr.base.log 95 | 96 | run_atria_consensus 1 2>> stderr.base.log 97 | run_atria_consensus 2 2>> stderr.base.log 98 | run_atria_consensus 4 2>> stderr.base.log 99 | run_atria_consensus 8 2>> stderr.base.log 100 | run_atria_consensus 16 2>> stderr.base.log 101 | 102 | echo "" 2> stderr.dev.log 103 | 104 | 105 | run_atria_new 1 2>> stderr.dev.log 106 | run_atria_new 2 2>> stderr.dev.log 107 | run_atria_new 4 2>> stderr.dev.log 108 | run_atria_new 8 2>> stderr.dev.log 109 | run_atria_new 16 2>> stderr.dev.log 110 | 111 | run_atria_consensus_new 1 2>> stderr.dev.log 112 | run_atria_consensus_new 2 2>> stderr.dev.log 113 | run_atria_consensus_new 4 2>> stderr.dev.log 114 | run_atria_consensus_new 8 2>> stderr.dev.log 115 | run_atria_consensus_new 16 2>> stderr.dev.log 116 | 117 | # run_atria 16 2>> stderr.log 118 | # run_atria_new 16 2>> stderr.log 119 | # run_atria_consensus 16 2>> stderr.log 120 | # run_atria_consensus_new 16 2>> stderr.log 121 | 122 | ll */*fastq 123 | 124 | for i in * 125 | do 126 | if [[ -d $i ]] 127 | then 128 | atria readstat $i/*.f*q & 129 | fi 130 | done 131 | wait 132 | 133 | atria statplot -i auto -l DIR -F 134 | 135 | cat stderr.base.log stderr.dev.log > std_all.log 136 | pasteTimeOutput std_all.log > time_benchmark.txt 137 | $atria/benchmark/time_stats.jl time_benchmark.txt $NUM_BASES > 
time_benchmark.df.txt 138 | wps time_benchmark.df.txt & -------------------------------------------------------------------------------- /src/Trimmer/wrapper_detect_adapter_se.jl: -------------------------------------------------------------------------------- 1 | 2 | # f_procs(x::String) = x == "-p" || x == "--procs" 3 | 4 | function julia_wrapper_detect_adapter_se(ARGS::Vector{String}; exit_after_help = true) 5 | 6 | args = parsing_args(ARGS; exit_after_help = exit_after_help) 7 | 8 | if args === nothing # ARGS is ["-h"] 9 | return 0 10 | end 11 | args_range_test(args) 12 | 13 | 14 | #================== Arguments ====================# 15 | 16 | nthread = args["threads" ] 17 | max_chunk_size = 2 ^ args["log2-chunk-size"] 18 | 19 | #================== Main function and common variables ====================# 20 | 21 | in1bytes = Vector{UInt8}(undef, max_chunk_size) 22 | 23 | # number of jobs to boxing FqRecord from UInt8 Vector 24 | njobs = nthread * 5 25 | vr1s = ntuple(_ -> Vector{FqRecord}(), njobs) 26 | 27 | r1s = Vector{FqRecord}() 28 | 29 | 30 | #================== Iteration for files ====================# 31 | 32 | append!(args["read1"], args["read2"]) 33 | 34 | for file1 in args["read1"] 35 | 36 | #===== file names =====# 37 | 38 | isingzip = occursin(r"\.gz$"i, file1) 39 | isinbzip2 = occursin(r"\.bz2$"i, file1) 40 | 41 | #===== file IO =====# 42 | 43 | if isingzip 44 | io1 = open(`pigz -p$nthread -cd $file1`, write=false) 45 | elseif isinbzip2 46 | io1 = open(`pbzip2 -p$nthread -cd $file1`, write=false) 47 | else 48 | io1 = open(file1, "r") 49 | end 50 | 51 | 52 | #================== Renew variables for read processing ====================# 53 | 54 | # clear common variables 55 | empty!(r1s) 56 | 57 | n_reads = 0 58 | n_r1 = 0 59 | in1bytes_nremain = 0 60 | task_r1s_unbox = Threads.@spawn 1 61 | 62 | #================== File processing ====================# 63 | 64 | # the first cycle to generate compiled code? 
65 | function cycle_wrapper_detect_adapter() 66 | 67 | if typeof(io1) <: IOStream # not compressed 68 | (n_r1, r1s, ncopied) = load_fqs_threads!(io1, in1bytes, vr1s, r1s, task_r1s_unbox; remove_first_n = n_reads, njobs=njobs) 69 | else # gziped 70 | (n_r1, r1s, in1bytes_nremain, ncopied) = load_fqs_threads!( 71 | io1, 72 | in1bytes, 73 | in1bytes_nremain, 74 | vr1s, 75 | r1s, 76 | task_r1s_unbox; 77 | remove_first_n = n_reads, 78 | njobs = njobs 79 | ) 80 | end 81 | 82 | top5, headers = detect_adapter_threads!(r1s) 83 | 84 | adapter_frequency = top5[1,2] / n_r1 85 | if adapter_frequency < 0.0004 86 | @info "$file1:\n No adapter detected in the first $n_r1 reads." 87 | else 88 | adapter_table = pretty_table(String, top5, header = headers) 89 | @info "$file1:\n Top 5 adapters detected in the first $n_r1 reads:\n$adapter_table" 90 | end 91 | end 92 | 93 | cycle_wrapper_detect_adapter() 94 | 95 | #================== Close files ====================# 96 | 97 | close(io1) 98 | end 99 | 100 | println(""" 101 | _________________________________ 102 | 103 | Single-end Adapter Detection Note: 104 | 105 | Atria detects adapter sequences using a known adapter file. Adapter sequences are truncated to 16-bp, which are accurate enough for trimming. From experiments of many popular trimmers, increasing adapter length from 16 to 33 does not increase accuracy (Figure 4C of https://doi.org/10.46471/gigabyte.31). 106 | 107 | Adapter detection is the last choice because its accuracy is highly based on your data. If your data has been trimmed, the remaining adapters may not be enough for accurate guessing. Also, Atria cannot find adapters not listed in the known adapter file. We suggest using adapter detection only when you cannot find the actual adapter sequence. 108 | 109 | Besides, Atria does not automatically trim auto-detected adapters. It is your responsibility to check whether the detected adapters are real. 
110 | 111 | Those rules can be used to check the adapter results: 112 | 113 | (1) An Illumina sequence file only has ONE adapter sequence. 114 | 115 | (2) In the same batch of NGS experiments, all single-end samples should have the SAME adapter sequence. The most prevalent adapters might be true for all your single-end data. 116 | _________________________________ 117 | 118 | """) 119 | 120 | return 0 121 | end # func 122 | -------------------------------------------------------------------------------- /benchmark/evalTrimming.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # This code is part of Skewer (https://sourceforge.net/projects/skewer/). The License: 4 | # 5 | # The MIT License (MIT) 6 | # 7 | # Copyright (c) 2013-2014 by Hongshan Jiang 8 | # 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy 10 | # of this software and associated documentation files (the "Software"), to deal 11 | # in the Software without restriction, including without limitation the rights 12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | # copies of the Software, and to permit persons to whom the Software is 14 | # furnished to do so, subject to the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be included in all 17 | # copies or substantial portions of the Software. 18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | # SOFTWARE. 
use strict;

# Score trimmed FASTQ file(s) against the expected post-trimming read lengths
# listed in a tab-separated "id<TAB>length" file, and print a confusion-matrix
# summary (TP, FP, FN, TN, PPV, sensitivity, specificity, mCC) to STDOUT.

if(@ARGV < 3){
    print STDERR "Usage: $0 fullLen lengths.tab file1.fastq [file2.fastq [lengths2.tab]] > summary\n";
    exit(1);
}

my ($full_len, $tab_file, $file1, $file2, $tab_file2) = @ARGV;
# Package-level counters; calcMetrics() accumulates into them across both passes.
our ($tp, $tn, $fp, $fp2, $fn, $fn2) = (0, 0, 0, 0, 0, 0);

open(TAB, "<$tab_file") or die("Can not open $tab_file for reading\n");

if(!open(IN, "<$file1")){
    close TAB;
    die("Can not open $file1 for reading\n");
}

my $id = &calcMetrics(\*TAB, \*IN);
die("read $id is in $file1 but not in $tab_file\n") if(defined $id);

close IN;
if(defined $file2){
    my $bUseTab2 = 0;
    if(defined $tab_file2){
        if(open(TAB2, "<$tab_file2")){
            $bUseTab2 = 1;
        }
        else{
            print STDERR "Warning: Can not open $tab_file2 for reading, using $tab_file instead\n";
        }
    }

    if(open(IN, "<$file2")){
        my $fh = $bUseTab2 ? \*TAB2 : \*TAB;
        my $fname = $bUseTab2 ? $tab_file2 : $tab_file;
        # The first pass read TAB to end-of-file; rewind it before reusing it,
        # otherwise calcMetrics() returns immediately and file2 is never scored.
        seek(TAB, 0, 0) unless $bUseTab2;
        $id = &calcMetrics($fh, \*IN);
        die("read $id is in $file2 but not in $fname\n") if(defined $id);
        close IN;
    }
    else{
        print STDERR "Warning: Can not open $file2 for reading, using information in $file1 only\n";
    }
    if($bUseTab2){
        close TAB2;
    }
}

close TAB;

# Every ratio is guarded so that an empty denominator yields 0 instead of a
# fatal "Illegal division by zero".
my $ppv = ($tp+$fp+$fp2+$fn2) > 0 ? $tp/($tp+$fp+$fp2+$fn2) : 0;
my $sen = ($tp+$fn+$fp2+$fn2) > 0 ? $tp/($tp+$fn+$fp2+$fn2) : 0;
my $spec = ($tn+$fp) > 0 ? $tn/($tn+$fp) : 0;
my $dom = sqrt(($tp+($fp+$fp2))*($tp+($fn+$fn2))*($tn+($fp+$fp2))*($tn+($fn+$fn2)));
my $cc = ($dom > 0) ? (($tp * $tn - ($fp+$fp2) * ($fn+$fn2)) / $dom) : 0;
my $fpr = (1 - $spec);
print "TP\tFP_ft\tFP_ot\tFN_fr\tFN_ut\tTN\tPPV\tSen.\tSpec.\tmCC\n";
print "$tp\t$fp\t$fp2\t$fn\t$fn2\t$tn\t$ppv\t$sen\t$spec\t$cc\n";
print "(FPR, TPR) = ($fpr, $sen)\n";

exit(0);

# calcMetrics($fh_tab, $fh_file)
#
# Walk the expected-length table ($fh_tab, "id<TAB>len" per line) and the
# FASTQ ($fh_file, 4 lines per record) in parallel and update the global
# counters $tp/$tn/$fp/$fp2/$fn/$fn2.
#
# Returns undef when the table is exhausted normally, or the id of a FASTQ
# read that could not be found in the remainder of the table.
sub calcMetrics
{
    my ($fh_tab, $fh_file) = @_;
    our ($tp, $tn, $fp, $fp2, $fn, $fn2);

    my $line;
    my ($id, $len);
    my ($id2, $len2, $seq);
    while($line = <$fh_tab>){
        chomp($line);
        ($id, $len) = split(/\t/, $line);

        # Read one FASTQ record: header, sequence, then skip '+' and quality.
        # At end of file these reads yield undef, so $id2 never equals $id and
        # the remaining table rows fall through the "read missing" loop below.
        $id2 = <$fh_file>; chomp($id2);
        $seq = <$fh_file>; chomp($seq);
        <$fh_file>; <$fh_file>;
        # Drop the leading '@' and anything from the first '/' on (e.g. "/1").
        ($id2) = split(/\//, substr($id2,1));
        # Table rows with no matching FASTQ record: the read is absent from the output.
        while($id2 ne $id){
            if($len == 0){
                $tp++;
            }
            else{
                if($len == $full_len){
                    $fp++;
                }
                else{
                    $fp2++;
                }
            }
            if(!($line=<$fh_tab>)){
                return $id2;
            }
            chomp($line);
            ($id, $len) = split(/\t/, $line);
        }
        $len2 = length($seq);
        if($len == $len2){
            if($len == $full_len){
                $tn++;
            }
            else{
                $tp++;
            }
        }
        else{ # $len != $len2
            if($len < $len2){
                if($len2 == $full_len){
                    $fn++;
                }
                else{
                    $fn2++;
                }
            }
            else{ # $len > $len2
                if($len == $full_len){
                    $fp++;
                }
                else{
                    $fp2++;
                }
            }
        }
    }
    return undef;
}
# Tally kept per sequence hash: how many reads shared the hash (`count`, an
# atomic field) and an identifier string (`id`).
mutable struct DupCount
    @atomic count::Int
    id::String
end

const empty_id = ""
DupCount(count::Int) = DupCount(count, empty_id)

"""
    write_pcr_dedup_count(out_pcr_dedup_count, dup_dict) -> Int

Write a tab-separated `count`/`id` table of every entry in `dup_dict` whose
count exceeds 1 to the file `out_pcr_dedup_count`, and return the sum of those
counts.
"""
function write_pcr_dedup_count(out_pcr_dedup_count::AbstractString, dup_dict::Dict{Vector{UInt64}, DupCount})
    total = 0
    open(out_pcr_dedup_count, "w+") do io
        println(io, "count\tid")
        for entry in values(dup_dict)
            entry.count > 1 || continue
            println(io, "$(entry.count)\t$(entry.id)")
            total += entry.count
        end
    end
    total
end

"""
    get_dup_count(dup_dict::Dict) -> Int

Sum the `count` of every value in `dup_dict` whose count exceeds 1.
"""
function get_dup_count(dup_dict::Dict)
    total = 0
    for entry in values(dup_dict)
        entry.count > 1 && (total += entry.count)
    end
    total
end

"""
    write_pcr_hash_collision(out_pcr_hash_collision, hash_collision_dict)

For every hash that maps to more than one distinct sequence pair, write the
number of pairs followed by one tab-indented line per pair.
"""
function write_pcr_hash_collision(out_pcr_hash_collision::AbstractString, hash_collision_dict::Dict{Vector{UInt64}, Set{Tuple{LongDNA{4},LongDNA{4}}}})
    open(out_pcr_hash_collision, "w+") do io
        for pairs_with_same_hash in values(hash_collision_dict)
            npairs = length(pairs_with_same_hash)
            npairs > 1 || continue
            println(io, "\n", npairs)
            for (seq1, seq2) in pairs_with_same_hash
                println(io, "\t", seq1, "\t", seq2)
            end
        end
    end
end

# Lookup table from a 4-bit DNA code (index = code + 1) to a 2-bit code.
# A, T and G get their own codes; every other code (C, ambiguity, gap) maps to 0b01.
function alphabet_dna_2bit()
    table = fill(0x01, 16)  # unknown to C
    for (nt, code) in ((DNA_A, 0b00), (DNA_T, 0b11), (DNA_G, 0b10))
        table[reinterpret(UInt8, nt) + 1] = code
    end
    table
end
const ALPHABET_DNA_2BIT = alphabet_dna_2bit()

# Lookup table from one byte holding two 4-bit DNA codes (index = byte + 1) to
# the two corresponding 2-bit codes packed into the low nibble.
# Only bytes made of two one-hot codes get a dedicated entry; every other byte
# keeps the default "CC", i.e. one ambiguous base also turns its neighbour into C.
function alphabet_2dna()
    table = fill((0b01 << 2) | 0b01, 256)  # unknown to CC
    one_hot = (0b0001, 0b0010, 0b0100, 0b1000)
    for hi in one_hot, lo in one_hot
        hi2 = ALPHABET_DNA_2BIT[hi + 1]
        lo2 = ALPHABET_DNA_2BIT[lo + 1]
        byte = (hi << 4) | lo
        table[byte + 1] = (hi2 << 2) | lo2
    end
    table
end
const ALPHABET_2DNA = alphabet_2dna()
# Over all 64-bit words of a 4-bit-encoded buffer, return
# (number of set bits, number of set bits under the mask 0x2222…, the "C" bit of each base).
function _hash_dna_bit_counts(words)
    nbits = 0
    nc = 0
    for w in words
        nbits += count_ones(w)
        nc += count_ones(w & 0x2222222222222222)
    end
    nbits, nc
end

# Translate `npacked` pairs of source bytes (4 bases) through ALPHABET_2DNA and
# store each result as one byte at `dest[dest_offset + i]`.
function _hash_dna_pack!(dest, dest_offset::Int, src, npacked::Int)
    @inbounds for i in 1:npacked
        lo = ALPHABET_2DNA[src[2i-1] + 1]
        hi = ALPHABET_2DNA[src[2i] + 1]
        dest[dest_offset + i] = hi << 4 | lo
    end
    dest
end

"""
    hash_dna(s1::LongDNA{4})
    hash_dna(s1::LongDNA{4}, s2::LongDNA{4})

Hash DNA to `[num_bits_in_it; ::LongDNA{2}.data]`. Ambiguous/Gap DNA converts to C.

The first `UInt64` of the result stores two `UInt32` counters (set bits and set
"C" bits of the 4-bit data); the remaining words hold the bases re-encoded at
2 bits each. With two sequences the counters cover both, and the packed words
of `s2` follow those of `s1`.
"""
function hash_dna(s1::LongDNA{4})
    nbases = length(s1)
    hashed = zeros(UInt64, 1 + BioSequences.seq_data_len(DNAAlphabet{2}, nbases))

    nbits, nc = _hash_dna_bit_counts(s1.data)

    header = reinterpret(reshape, UInt32, hashed)
    @inbounds header[1] = UInt32(nbits)
    @inbounds header[2] = UInt32(nc)

    # Packed bases start after the 8 header bytes.
    hashed_bytes = reinterpret(reshape, UInt8, hashed)
    s1_bytes = reinterpret(reshape, UInt8, s1.data)
    _hash_dna_pack!(hashed_bytes, 8, s1_bytes, cld(nbases, 4))

    hashed
end

function hash_dna(s1::LongDNA{4}, s2::LongDNA{4})
    nbases1 = length(s1)
    nbases2 = length(s2)
    nwords1 = BioSequences.seq_data_len(DNAAlphabet{2}, nbases1)
    nwords2 = BioSequences.seq_data_len(DNAAlphabet{2}, nbases2)
    hashed = zeros(UInt64, 1 + nwords1 + nwords2)

    nbits1, nc1 = _hash_dna_bit_counts(s1.data)
    nbits2, nc2 = _hash_dna_bit_counts(s2.data)

    header = reinterpret(reshape, UInt32, hashed)
    @inbounds header[1] = UInt32(nbits1 + nbits2)
    @inbounds header[2] = UInt32(nc1 + nc2)

    hashed_bytes = reinterpret(reshape, UInt8, hashed)

    # s1 goes right after the 8 header bytes ...
    s1_bytes = reinterpret(reshape, UInt8, s1.data)
    offset = 8
    _hash_dna_pack!(hashed_bytes, offset, s1_bytes, cld(nbases1, 4))

    # ... and s2 starts on the word boundary following s1's words.
    s2_bytes = reinterpret(reshape, UInt8, s2.data)
    offset += nwords1 * 8
    _hash_dna_pack!(hashed_bytes, offset, s2_bytes, cld(nbases2, 4))

    hashed
end
"""
    parse_numeric(x::String) -> Float64

Parse `x` as, in order of preference: a plain number, a percentage ending in
`%` (returned as a fraction), or a colon-separated duration `D:H:M:S`
(returned in seconds). Throws if none of the formats applies.
"""
function parse_numeric(x::String)
    plain = tryparse(Float64, x)
    isnothing(plain) || return plain

    # parse percentage
    if occursin(r"\%$", x)
        pct = tryparse(Float64, x[1:end-1])
        isnothing(pct) || return pct/100
    end

    # parse time as D:H:M:S.MS, least significant field first
    fields = reverse!(split(x, ":"))
    seconds_per_unit = (1, 60, 3600, 24 * 3600)
    total = parse(Float64, fields[1])
    for (i, field) in enumerate(fields)
        i == 1 && continue
        i > 4 && error("Failed to parse $x as the time format D:H:M:S")
        total += seconds_per_unit[i] * parse(Float64, field)
    end
    total
end

const THREADS_STR = ["-threads", "-thread", "-cores", "-t"]

"""
    get_threads(x; THREADS_STR=THREADS_STR) -> Int

Extract the thread count from command line `x` by looking for each option name
in `THREADS_STR` followed by optional `=`/spaces and digits. When several
options match, the last one in `THREADS_STR` order wins; the default is 1.
A warning reporting the chosen value is always logged.
"""
function get_threads(x; THREADS_STR=THREADS_STR)
    nthreads = 1
    for option in THREADS_STR
        hit = match(Regex("$option[= ]*([\\d]+)"), x)
        hit === nothing && continue
        if length(hit.captures) == 1
            nthreads = parse(Int, hit.captures[1])
        end
    end
    @warn "Set threads == $nthreads for command: $x"
    nthreads
end
# Load the pasted GNU time report (no header row).
df = CSV.File(ARGS[1], header=false) |> DataFrame
NUM_BASES = length(ARGS) >= 2 ? parse(Int, ARGS[2]) : 1

const USERTIME_STR = "User time (seconds): "
const SYSTEMTIME_STR = "System time (seconds): "
const CPU_STR = "Percent of CPU this job got: "
const ELAPSEDTIME_STR = "Elapsed (wall clock) time (h:mm:ss or m:ss): "
const MEMORY_STR = "Maximum resident set size (kbytes): "

# Locate, in the first row, the first string cell containing `label`;
# the result is used below as a column selector.
locate_field(label) = findfirst(cell -> cell isa AbstractString && occursin(label, cell), df[1,1:end])
# Remove `label` from every cell of column `col` and parse what is left.
field_values(col, label) = parse_numeric.(replace.(df[!, col], label => ""))

USERTIME = locate_field(USERTIME_STR)
SYSTEMTIME = locate_field(SYSTEMTIME_STR)
CPU = locate_field(CPU_STR)
ELAPSEDTIME = locate_field(ELAPSEDTIME_STR)
MEMORY = locate_field(MEMORY_STR)

usertimes = field_values(USERTIME, USERTIME_STR)
systemtimes = field_values(SYSTEMTIME, SYSTEMTIME_STR)
# cpus = parse_numeric.(replace.(df[!, CPU], CPU_STR=>""))
elapsedtimes = field_values(ELAPSEDTIME, ELAPSEDTIME_STR)
memories = field_values(MEMORY, MEMORY_STR)
threads = get_threads.(df[!,1])

if length(ARGS) == 3
    #=
    stderr.pigz.log compensates for GNU time not accounting for the pigz
    subprocesses spawned by Julia. The file is the result of
        /usr/bin/time pigz -p 8 -c Atria-consensus/*atria.fq 1>/dev/null 2>> stderr.pigz.log
        /usr/bin/time pigz -p 8 -c Atria/*atria.fq 1>/dev/null 2>> stderr.pigz.log
        /usr/bin/time pigz -cd $r1 $r2 > /dev/null 2>> stderr.pigz.log
    so entry 1 = compression (consensus run), 2 = compression (no consensus),
    3 = decompression.
    =#
    pigz_time_file = ARGS[3]
    usertimes_pigz = parse_numeric.(readlines(pipeline(`grep -oE "[0-9\.\:]+user" $pigz_time_file`, `sed 's/user//'`)))
    systemtimes_pigz = parse_numeric.(readlines(pipeline(`grep -oE "[0-9\.\:]+system" $pigz_time_file`, `sed 's/system//'`)))

    rows_atria = map(cmd -> occursin(r"atria", cmd), df[!,1])
    rows_atria_no_consensus = map(cmd -> occursin(r"atria .*--no-consensus", cmd), df[!,1])
    rows_atria_consensus = rows_atria .⊻ rows_atria_no_consensus

    # add decompressing time
    usertimes[rows_atria] .+= usertimes_pigz[3]
    systemtimes[rows_atria] .+= systemtimes_pigz[3]

    # add compressing time
    usertimes[rows_atria_no_consensus] .+= usertimes_pigz[2]
    systemtimes[rows_atria_no_consensus] .+= systemtimes_pigz[2]

    usertimes[rows_atria_consensus] .+= usertimes_pigz[1]
    systemtimes[rows_atria_consensus] .+= systemtimes_pigz[1]
end

cpus = @. (usertimes + systemtimes) / elapsedtimes
efficiencies = @. NUM_BASES / elapsedtimes / cpus / 10^6 # M Bases/s/CPU
speeds = NUM_BASES ./ elapsedtimes / 10^6 # M Bases/s

dfout = DataFrame(
    "Threads" => threads,
    "Command" => df[!,1],
    "Efficiency (M Bases/s/CPU)" => efficiencies,
    "Speed (M Bases/s)" => speeds,
    "UserTime (s)" => usertimes,
    "SystemTime (s)" => systemtimes,
    "CPU" => cpus,
    "ElapsedTime (s)" => elapsedtimes,
    "MaxMemory (kB)" => memories
)

sort!(dfout, :Threads)

# Result goes to stdout as a tab-separated table.
CSV.write(stdout, dfout; delim='\t')
"""
    check_identifier(r1_id::Vector{UInt8}, r2_id::Vector{UInt8})::Bool
    check_identifier(r1::FqRecord, r2::FqRecord)::Bool

Check whether two FASTQ identifiers name the same read.

The read name of R1 is everything before its first `' '` or `'/'`. R2 matches
when it starts with that name and the name is immediately followed by the end
of R2's identifier or by a `' '`/`'/'` delimiter, so `read1/1` matches
`read1/2` but not `read10/2`. If R1 contains no delimiter, the two identifiers
must be byte-for-byte equal.
"""
@inline function check_identifier(r1_id::Vector{UInt8}, r2_id::Vector{UInt8})::Bool
    is_delim(x) = x == 0x20 || x == 0x2f  # ' ' or '/'

    stop1 = findfirst(is_delim, r1_id)
    if isnothing(stop1)
        return r1_id == r2_id
    end

    name_len = stop1 - 1  # do not count ' ' or '/'
    r2_len = length(r2_id)
    r2_len < name_len && return false

    @inbounds for i in 1:name_len
        r1_id[i] != r2_id[i] && return false
    end

    # R2's name has to end where R1's does; otherwise R1's name is merely a
    # prefix of a longer, different R2 name.
    r2_len == name_len || is_delim(r2_id[name_len + 1])
end

@inline function check_identifier(r1::FqRecord, r2::FqRecord)::Bool
    check_identifier(r1.id, r2.id)
end

# Raise an error showing both identifiers. The ids are copied before being
# turned into Strings so the records' byte buffers are left intact.
@noinline function throw_identifier_error(r1::FqRecord, r2::FqRecord)
    error("Identifiers of r1 and r2 are not the same!\n  R1: $(String(copy(r1.id)))\n  R2: $(String(copy(r2.id)))")
end
57 | # BioSequences: longsequences/copying.jl 58 | # @noinline function BioSequences.throw_encode_error(A::BioSequences.Alphabet, src::AbstractArray{UInt8}, soff::Integer) 59 | # for i in 1:div(64, BioSequences.bits_per_symbol(A)) 60 | # index = soff + i - 1 61 | # sym = src[index] 62 | # if BioSequences.ascii_encode(A, sym) & 0x80 == 0x80 63 | # # find the context around the error: one previous line and the current line 64 | # nsrc = length(src) 65 | # context_start = soff + i - 2 66 | # context_previous_line = true 67 | # context_end = soff + i 68 | # while context_start > 1 69 | # char = src[context_start] 70 | # if char == 0x0a # \n 71 | # if context_previous_line 72 | # context_previous_line = false 73 | # else 74 | # context_start += 1 75 | # break 76 | # end 77 | # elseif soff - context_start > 300 + 300 * !context_previous_line 78 | # break 79 | # end 80 | # context_start -= 1 81 | # end 82 | # while context_end < nsrc 83 | # char = src[context_end] 84 | # if char == 0x0a # \n or \r 85 | # context_end -= 1 86 | # break 87 | # elseif context_end - soff > 100 88 | # break 89 | # end 90 | # context_end += 1 91 | # end 92 | # context = String(copy(src[context_start:context_end])) 93 | 94 | # repr_char = if sym in UInt8('\a'):UInt8('\r') || sym in UInt8(' '):UInt8('~') 95 | # " (char '$(Char(sym))')" 96 | # else 97 | # "" 98 | # end 99 | 100 | # error("Cannot encode byte $(repr(sym))$(repr_char) at index $(index) to $A. Is the input file valid? Does the disk have bad sections? 
# `true` when `b` is identical to the complement of `a`.
@inline function iscomplement(a::DNA, b::DNA)
    BioSequences.complement(a) === b
end


# codes modified from Julia Base

# Write one byte to `s` through `ios_putc` directly, without going through
# Base's `write(::IOStream, ...)`. Returns the C call's result as an `Int`.
function write_no_lock(s::IOStream, b::UInt8)
    Int(ccall(:ios_putc, Cint, (Cint, Ptr{Cvoid}), b, s.ios))
end

# Write the whole byte vector `a` to `s`; returns the number of bytes written.
# The length is converted to `UInt` (not `UInt64`) so that it matches the
# `nb::UInt` method below on 32-bit platforms as well.
function write_no_lock(s::IOStream, a::Vector{UInt8})
    GC.@preserve a unsafe_write_no_lock(s, pointer(a), UInt(sizeof(a)))
end

# """
#     unsafe_write_no_lock(io::IO, ref, nbytes::UInt)
#
# Copy `nbytes` from `ref` (converted to a pointer) into the `IO` object.
#
# It is recommended that subtypes `T<:IO` override the following method signature
# to provide more efficient implementations:
# `unsafe_write_no_lock(s::T, p::Ptr{UInt8}, n::UInt)`
# """
# function unsafe_write_no_lock(s::IO, p::Ptr{UInt8}, n::UInt)
#     written::Int = 0
#     for i = 1:n
#         written += write(s, unsafe_load(p, i))
#     end
#     return written
# end

# Copy `nb` bytes starting at `p` into `s` through `ios_write`. The caller must
# keep the memory behind `p` alive for the duration of the call.
@inline function unsafe_write_no_lock(s::IOStream, p::Ptr{UInt8}, nb::UInt)
    Int(ccall(:ios_write, Csize_t, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), s.ios, p, nb))
end

# write(io::AbstractPipe, byte::UInt8) = write(Base.pipe_writer(io)::IO, byte)
"""
    pe_consensus!(r1::FqRecord, r2::FqRecord, r2_seq_rc::LongDNA{4}, insert_size::Int64; min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)

Paired-end consensus calling for read pairs with adapters trimmed. Return `is_consensused::Bool`.

`r2_seq_rc` is modified in place (trimmed/resized to the overlapped region).
Returns `false` without touching the reads when there is no overlap or the
overlap has too many mismatches (controlled by `min_ratio_mismatch`).
Otherwise, at every non-complementary position the base, quality and
probability of the read with the higher probability overwrite those of the
other read when the difference exceeds `prob_diff`, and `true` is returned.
"""
function pe_consensus!(r1::FqRecord, r2::FqRecord, r2_seq_rc::LongDNA{4}, insert_size::Int64; min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)

    r1_seq = r1.seq
    r2_seq = r2.seq
    r1_length = length(r1_seq)
    r2_length = length(r2_seq_rc)

    # get the overlapped region
    if r2_length < insert_size
        r1_i = insert_size - r2_length + 1
        # check 8 bit alignment
        if r1_i % 2 == 1
            r2_seq_rc.len = r2_length
        else
            r1_i += 1
            deleteat!(r2_seq_rc, 1)
            BioSequences.unsafe_extra_bits_to_zeros!(r2_seq_rc)  # deleteat is not bitsafe, so have to use it.
            # r2_seq_rc.part = 2:r2_length
        end
    else
        deleteat!(r2_seq_rc, 1:r2_length-insert_size)
        BioSequences.unsafe_extra_bits_to_zeros!(r2_seq_rc)  # deleteat is not bitsafe, so have to use it.
        # r2_seq_rc.part = (r2_length-insert_size+1):r2_length
        r1_i = 1
    end
    length_overlap = min(length(r2_seq_rc), r1_length - r1_i + 1)
    # Every exit of this method returns a plain Bool, as documented.
    length_overlap <= 0 && return false

    # align r2_seq_rc.data and r1_seq.data
    if length(r2_seq_rc) != length_overlap  # when r1 length < insert size
        resize!(r2_seq_rc, length_overlap)
    end

    # Ptr{UInt32}: scan 8 bases each time
    p1 = get_pointer(0x0000000000000000, r1_seq)
    p2_rc = get_pointer(0x0000000000000000, r2_seq_rc)

    p1_offset = r1_i ÷ 2  # r1_i cannot be even, so r1_i -1 not necessary
    p2_rc_offset = 0
    offset_max = cld(length_overlap, 2)
    # the start of ncompatible, minus the score of extra tail match
    num_ones = length_overlap - cld(offset_max,8)*16

    max_num_ones = floor(Int, (1+min_ratio_mismatch) * length_overlap)

    # NOTE(review): the threshold is tested before each load, so the
    # contribution of the last loaded word is never compared against
    # max_num_ones; confirm this is intended.
    while p2_rc_offset <= offset_max

        if num_ones > max_num_ones
            return false
        end

        bit1 = unsafe_load(p1 + p1_offset)
        num_ambiguous_bits = count_ones(bit1) - 16
        bit2 = N2gap(unsafe_load(p2_rc + p2_rc_offset))
        num_ones += count_ones(bit1|bit2) - num_ambiguous_bits

        p1_offset += 8
        p2_rc_offset += 8
    end

    # ratio_mismatch = (num_ones - length_overlap) / length_overlap
    # ratio_mismatch > min_ratio_mismatch && return false

    # equals to num_ones > (1+min_ratio_mismatch) * length_overlap && return false
    # see max_num_ones


    # start consensus calling
    r1_end = min(r1_length, insert_size)
    r2_i = insert_size - r1_i + 1

    r1_qual = r1.qual
    r2_qual = r2.qual

    r1_prob = r1.prob
    r2_prob = r2.prob

    # Walk r1 forwards and r2 backwards over the overlapped region.
    @inbounds while r1_i <= r1_end
        a = r1_seq[r1_i]
        b = r2_seq[r2_i]
        # if !((a | b) in (DNA_W, DNA_S))  # not complement
        if !iscomplement(a, b)  # not complement
            a_prob = r1_prob[r1_i]
            b_prob = r2_prob[r2_i]
            if a_prob - b_prob > prob_diff
                # modify b to a
                r2_seq[r2_i] = complement(a)
                r2_qual[r2_i] = r1_qual[r1_i]
                r2_prob[r2_i] = a_prob
            elseif b_prob - a_prob > prob_diff
                # modify a to b
                r1_seq[r1_i] = complement(b)
                r1_qual[r1_i] = r2_qual[r2_i]
                r1_prob[r1_i] = b_prob
            end
        end
        r1_i += 1
        r2_i -= 1
    end
    return true
end
"""
    pe_consensus!(r1::FqRecord, r2::FqRecord, r1_seq_rc::LongDNA{4}, r2_seq_rc::LongDNA{4}; kmer_tolerance::Int64 = 2, overlap_score::Float64 = 0.0, min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)

Paired-end consensus calling for read pairs without adapters. Check whether the read pair has an overlapped region first. Return `is_consensused::Bool`.

The overlap is located with `bitwise_scan` in both directions. The pair is
rejected (`false`) when the two scans disagree on the overlap length or, if
`overlap_score > 0`, when the best overlap score is below `overlap_score`.
Otherwise the insert size is derived from the overlap and the work is
delegated to the insert-size method of `pe_consensus!`.
"""
function pe_consensus!(r1::FqRecord, r2::FqRecord, r1_seq_rc::LongDNA{4}, r2_seq_rc::LongDNA{4}; kmer_tolerance::Int64 = 2, overlap_score::Float64 = 0.0, min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)

    r1_seq = r1.seq
    r2_seq = r2.seq
    r1_length = length(r1_seq)
    r2_length = length(r2_seq)

    # r1_overlap_from, r1_overlap_nmatch, ...
    r1_ms = bitwise_scan(r2_seq_rc, r1_seq, 1, kmer_tolerance)
    # r2_overlap_from, r2_overlap_nmatch, ...
    r2_ms = bitwise_scan(r1_seq_rc, r2_seq, 1, kmer_tolerance)

    # r1_overlap_from == 0 && return false
    # r2_overlap_from == 0 && return false

    # Both scans must agree on how many bases overlap.
    r1_overlap_nbase = r1_length - r1_ms.idx + 1
    r2_overlap_nbase = r2_length - r2_ms.idx + 1
    # Return a plain Bool, as documented and as the delegated method does.
    r1_overlap_nbase != r2_overlap_nbase && return false

    # Only compute the score when a threshold is actually requested.
    if overlap_score > 0
        r1_overlap_prob = probmean(r1, r1_ms.idx, r1_ms.idx + 15)
        r2_overlap_prob = probmean(r2, r2_ms.idx, r2_ms.idx + 15)

        max_overlap_score = max(r1_ms.ncompatible * r1_overlap_prob, r2_ms.ncompatible * r2_overlap_prob)
        max_overlap_score < overlap_score && return false
    end

    insert_size = r1_length + r2_length - r1_overlap_nbase

    pe_consensus!(r1, r2, r2_seq_rc, insert_size; min_ratio_mismatch = min_ratio_mismatch, prob_diff = prob_diff)
end
11 | 12 | ## v4.1.3 13 | 14 | - Fix: when the paired end files are compressed, read chunks did not resize, which led to excessive copying, and the number of copies might accumulate round by round. 15 | 16 | ## v4.1.2 17 | 18 | - Fix: do not throw error if input paired end files are empty when doing `--detect-adapter`. 19 | 20 | ## v4.1.1 21 | 22 | - Fix: `-z NUM -Z NUM` error when length to trim < 0. 23 | 24 | ## v4.1.0 25 | 26 | - Change: --length-range default changed from 50:500 to 30:999999. 27 | - Feature: HardClipEnd: new process to hard remove the last N bases. 28 | - Change: names in processing order (--order -O) changed. 29 | - Feature: PCRDedup: remove PCR duplicates from fastq files. The entire paired sequence is compared and hashed. This method requires a large amount of memory because it stores hashes of reads. To enable, use `--pcr-dedup`. 30 | - Feature: processing stats are recorded in the json file. 31 | - Fix: `polyX_tail_scan` algorithm is now more precise, and trailing Ns also count. 32 | 33 | ## v4.0.3 34 | 35 | - Fix: `--order` or `-O` option should accept multiple arguments. 36 | 37 | ## v4.0.2 38 | 39 | - Fix: `--detect-adapter` for paired reads: refer to index 1 of empty vector when no adapter is found. 40 | 41 | ## v4.0.1 42 | 43 | - Fix: dep cihga39871/BioSequences.jl: detailed error message if input files' line break is '\r\n'. 44 | 45 | ## v4.0.0 46 | 47 | - Optimize: algorithm: now the non-overtrim rate for reads without adapters is higher. 48 | - Feature: re-write trimming to allow trimming multiple adapters at the same time. This change is adjusted for metabarcoding data. 49 | - Feature: hard-clip: hard-clip arguments are now separate for r1 and r2. This change is adjusted for metabarcoding data. Remove `-C --clip-after -c --clip5`; add `-b --clip-after-r1 -B --clip-after-r2 -e --clip5-r1 -E --clip5-r2`. 50 | - Optimize: --detect-adapter for paired-end reads now guesses adapters from pair information, rather than the existing adapter pool. 
51 | - Feature: users can customize order of processing: `-O | --order`. 52 | 53 | ## v3.2.2-1 54 | 55 | - Fix: undef error of is_concensused when enabling --stat (thanks to kalavattam, #3) 56 | 57 | ## v3.2.2 58 | 59 | - Optimize: speed up for threads <= 2. 60 | - Fix: `atria test` should not depend on source files. 61 | 62 | ## v3.2.1 63 | 64 | - Feature: automatically skip completed analyses. Use --force or -f to disable the feature. 65 | 66 | ## v3.2.0 67 | 68 | - Remove multi-proc mode since it is unstable. 69 | 70 | ## v3.1.4 71 | 72 | - Logging: new logging for versions and sample completion. 73 | - Fix v3.1.3: multi-proc mode: Julia v1.8.1 does not allow assigning new ARGS, and add `-t nthread` in `julia_args`. 74 | - Fix v3.1.3: pe-consensus: error when `insert_size = -1`; fix trimming when `insert_size = -1`. 75 | - Benchmarked `iscomplement` in Atria v3.1.2 against that in BioSequences, and found it is good to stick with BioSequences. 76 | 77 | ## v3.1.3 78 | 79 | - Compatible: Julia v1.8 and BioSequences v3.1.0. 80 | - Fix: quality offset not changed in some places when providing a different --quality-format. 81 | - Fix: use `Base.invokelatest` to bypass world age for functions evaluated at run time. 82 | - Docs: update. 83 | 84 | ## v3.1.2 85 | 86 | - Fix: optimize output file names if ending with .bz2. 87 | 88 | ## v3.1.1 89 | 90 | - Fix: when reporting an encode error, report the previous and current lines instead of the whole chunk of data. 91 | 92 | ## v3.1.0 93 | 94 | - New feature: `--detect-adapter` for adapter determination. 95 | - Fix: when input is an empty compressed fastq, atria exits with error because `read_chunks!(::IO, ...)` should return 4 elements, but returned 2. 96 | 97 | ## v3.0.3 98 | 99 | - Fix v3.0.2: `will_eof` should be true when unknown. 100 | - Do not resize chunk sizes before cycle 1 when inputs are compressed and their uncompressed sizes cannot be determined. Just assume the data were not trimmed before. 
101 | 102 | ## v3.0.2 103 | 104 | - Fix uncompressed_size1 not defined on gzipped single-end input (#2). 105 | 106 | ## v3.0.1 107 | 108 | - Avoid locking `IOStream` when writing fastq in thread_output.jl: replace `write(::IOStream, ...)` with `write_no_lock(::IOStream, ...)`. It is slightly faster. 109 | - Speed optimization for consensus calling: overwrite `BioSequences.complement(::DNA)` (1.40X), and define `iscomplement(::DNA, ::DNA)` (1.79X). 110 | - Other minor parallel implementations. 111 | 112 | ## v3.0.0 113 | 114 | - If users choose to trim adapter, check 1 bp offset of adapter sequences. It is because Atria might have 1 bp error in some cases. 115 | 116 | ## v2.1.2 117 | 118 | - Parameter optimization using `atria simulate`: --trim-score-pe 19->10, --tail-length 8->12. 119 | - Development of Atria simulation methods. 120 | 121 | ## v2.1.1 122 | 123 | - Fixing wrapper_single_end.jl: cannot trim true adapter position at index of -1. 124 | 125 | ## v2.1.0 126 | 127 | - If a r1/2 adapter is found, but the region of r2/1 is missing or its quality is too low (mean prob < 0.6), skip PE check and just trim like single-end. With this, trim_score does not need to compensate for the situation, so raise the default trim-score-pe (10->19). 128 | 129 | ## v2.0.0 130 | 131 | - Supporting low-complexity filtration. 132 | - Supporting polyX tail trimming. 133 | - Supporting single-end fastq. 134 | - Supporting bzip2 compression/decompression. 135 | - Supporting non standardized gzip compression files. 136 | - Optimizing default parameters. (r1-r2-diff 0->0, trim-score-pe 8->10, score-diff removed, kmer-n-match 8->9) 137 | - Robustness optimization: the lower bound of match probability is set to 0.75 because a match probability lower than 0.75 is an outlier and strongly affects the trim score. 138 | 139 | ## v1.1.1 140 | 141 | - Performance optimization: adapter and PE trimming: following v1.1.0-1, if the loosened match's nmatch > trim_score, replace the old one. 
142 | 143 | ## v1.1.0 144 | 145 | - Performance optimization: adapter and PE trimming: if no adapters were matched, the number of errors of PE match is loosen. 146 | - Performance optimization: consensus calling: new arg `--kmer-tolerance-consensus 2->10`; optimized arg `--min-ratio-mismatch 0.2->0.28`. 147 | - Speed optimization: check `overlap_score > 0` before computing score (`pe_consensus!`). 148 | 149 | ## v1.0.3 150 | 151 | - More detailed error output when encoding a non-nucleotide character (`throw_encode_error(...)`). 152 | - Following symbolic link before checking file size for non-Windows platforms (`check_filesize(::String)`). 153 | - When run in multi-file parallel mode, write stdout and stderr to a 'stdlog' file (`julia_wrapper_atria(...)`). 154 | - Add option `--check-identifier` to check whether the identifiers of r1 and r2 are the same. 155 | 156 | ## v1.0.2 157 | 158 | - First mature version of Atria. 159 | -------------------------------------------------------------------------------- /src/AtriaTest/trimmer_and_benchmark.jl: -------------------------------------------------------------------------------- 1 | @noinline function test_trimmer_and_benchmark() 2 | @testset "trimmer_and_benchmark" begin 3 | 4 | pwd_backup = pwd() 5 | 6 | tmp_path = tempname() 7 | mkpath(tmp_path) 8 | cd(tmp_path) 9 | 10 | try 11 | args = Trimmer.parsing_args(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq"]) 12 | logjson = Trimmer.OrderedDict() 13 | logjson["version"] = Trimmer.OrderedDict( 14 | "julia-version" => VERSION, 15 | "atria-version" => "9.9.9" 16 | ) 17 | logjson["arguments"] = sort!(Trimmer.OrderedDict(args)) 18 | fio = open("json","w+") 19 | Trimmer.JSON.print(fio, sort!(logjson), 4) 20 | close(fio) 21 | 22 | Trimmer.Distributed.addprocs(1) 23 | var = ["var"] 24 | @eval Trimmer.Distributed.@everywhere var = $var 25 | Trimmer.Distributed.pmap(+,[1,2],[4,5]) 26 | 27 | 28 | Benchmark.julia_wrapper_simulate(["-o" ,"peReadSimulated", "-x", 
"2000"]) 29 | Benchmark.julia_wrapper_simulate(["-h"], exit_after_help=false) 30 | 31 | Benchmark.julia_wrapper_randtrim(["peReadSimulated.R1.fastq", "peReadSimulated.R2.fastq"]) 32 | Benchmark.julia_wrapper_randtrim(["-h"]) 33 | 34 | if Sys.iswindows() 35 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.fastq", "-R", "peReadSimulated.R2.randtrim.fastq", "-e", "8", "-E", "8", "--compress", "gz", "-f"]) 36 | else 37 | 38 | @info "rand trim - gz" 39 | 40 | run(`pigz --keep peReadSimulated.R1.randtrim.fastq`) 41 | run(`pigz --keep peReadSimulated.R2.randtrim.fastq`) 42 | 43 | julia_wrapper_atria_se(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz", "-f"]) 44 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz", "--check-identifier", "-f"]) 45 | 46 | @info "rand trim - gz - check paired ID" 47 | 48 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.atria.fastq.gz", "-R", "peReadSimulated.R2.randtrim.atria.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz", "--check-identifier", "-f"]) 49 | 50 | @info "rand trim - gz - detect adapter" 51 | 52 | julia_wrapper_detect_adapter_se(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz"]) 53 | julia_wrapper_detect_adapter_pe(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz"]) 54 | 55 | @info "rand trim - bzip" 56 | 57 | run(`pbzip2 peReadSimulated.R1.randtrim.fastq`) 58 | run(`pbzip2 peReadSimulated.R2.randtrim.fastq`) 59 | 60 | julia_wrapper_atria_se(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "bz2", "-f"]) 61 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "bz2", "--check-identifier", "-f"]) 62 | 63 | 
end 64 | 65 | @info "normal trim - all filters" 66 | 67 | julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq", "--polyG", "--enable-complexity-filtration", "--pcr-dedup", "-f", "--log2-chunk-size", "24"]) 68 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "--polyG", "--enable-complexity-filtration", "--pcr-dedup", "-f", "--stats", "--log2-chunk-size", "24"]) 69 | 70 | @info "normal trim - PCR Dedup with counts" 71 | 72 | julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq", "-O", "PCRDedup", "--pcr-dedup", "--pcr-dedup-count", "-f", "--log2-chunk-size", "24"]) 73 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "-O", "PCRDedup", "--pcr-dedup", "--pcr-dedup-count", "-f", "--stats", "--log2-chunk-size", "24"]) 74 | 75 | @info "normal trim - all filters - check ID pair" 76 | 77 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.atria.fastq", "-R", "peReadSimulated.R2.atria.fastq", "--polyG", "--enable-complexity-filtration", "-f", "--stats", "--log2-chunk-size", "24", "--check-identifier"]) 78 | 79 | 80 | @info "normal trim - all filters - multiple adapters" 81 | 82 | julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq", "--polyG", "--enable-complexity-filtration", "-a", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", "CTGTCTCTTATACACATCT", "-f"]) 83 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "--polyG", "--enable-complexity-filtration", "-f", "-a", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", "CTGTCTCTTATACACATCT", "-A", "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT", "CTGTCTCTTATACACATCT"]) 84 | 85 | @info "normal trim - all filters - check ID pair" 86 | 87 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.atria.fastq", "-R", "peReadSimulated.R2.atria.fastq", "--polyG", "--enable-complexity-filtration", "-f", "-a", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", "CTGTCTCTTATACACATCT", "-A", "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT", "CTGTCTCTTATACACATCT", 
"--log2-chunk-size", "24", "--check-identifier"]) 88 | 89 | @info "normal trim - skip finished" 90 | 91 | julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "--polyG", "--enable-complexity-filtration"]) 92 | julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq", "--polyG", "--enable-complexity-filtration"]) 93 | 94 | @info "normal trim - detect adapter" 95 | 96 | julia_wrapper_detect_adapter_se(["-r", "peReadSimulated.R1.fastq", "--polyG", "--enable-complexity-filtration"]) 97 | 98 | @info "trimmer's help page" 99 | 100 | julia_wrapper_atria_pe(["-h"], exit_after_help=false) 101 | julia_wrapper_atria_se(["-h"], exit_after_help=false) 102 | atria_markdown_help() 103 | 104 | @info "read stat" 105 | 106 | julia_wrapper_readstat(["-h"]) 107 | julia_wrapper_readstat(["peReadSimulated.R1.atria.fastq", "peReadSimulated.R2.atria.fastq"]) 108 | 109 | Rscript_check_package = """ 110 | if (is.na(packageDescription("argparse")[1])) writeLines("R package 'argparse' not found. Please run `install.packages('argparse')` in R session.") 111 | if (is.na(packageDescription("plotly")[1])) writeLines("R package 'plotly' not found. Please run `install.packages('plotly')` in R session.") 112 | if (is.na(packageDescription("ggsci")[1])) writeLines("R package 'ggsci' not found. Please run `install.packages('ggsci')` in R session.") 113 | """ 114 | 115 | julia_wrapper_rscript(Rscript_check_package, ["-h"]) 116 | 117 | 118 | ARGS_old = deepcopy(ARGS) 119 | empty!(ARGS) 120 | push!(ARGS, "prog") 121 | Atria.julia_main() 122 | 123 | empty!(ARGS) 124 | append!(ARGS, ARGS_old) 125 | 126 | @info "Precompiling/test passed without errors." 127 | 128 | catch e 129 | @error "Precompiling/test failed!" 
exception=e 130 | cd(pwd_backup) 131 | rm(tmp_path, recursive=true, force=true) 132 | rethrow(e) 133 | finally 134 | cd(pwd_backup) 135 | rm(tmp_path, recursive=true, force=true) 136 | end 137 | end 138 | end -------------------------------------------------------------------------------- /src/Trimmer/markdown_help.jl: -------------------------------------------------------------------------------- 1 | 2 | const atria_markdown_help_text = md""" 3 | # Atria $atria_version 4 | 5 | An ultra-fast and accurate adapter and quality trimming software designed for paired-end sequencing data. 6 | 7 | If you use Atria, please cite 8 | > Jiacheng Chuan, Aiguo Zhou, Lawrence Richard Hale, Miao He, Xiang Li, Atria: an ultra-fast and accurate trimmer for adapter and quality trimming, Gigabyte, 1, 2021 https://doi.org/10.46471/gigabyte.31 9 | 10 | Github: https://github.com/cihga39871/Atria 11 | 12 | ## Usage 13 | 14 | Try `atria -h` or `atria --help` for more information. 15 | 16 | ### Input and Output 17 | 18 | The input files should be paired-end FastQ(.gz|.bz2) files (in the same order), or single-end fastqs: 19 | 20 | 1. Read 1 files: `-r XXXX_R1.fastq YYYY_R1.fastq.gz ...` 21 | 22 | 2. Read 2 files (optional): `-R XXXX_R2.fastq YYYY_R2.fastq.gz ...` 23 | 24 | Output all files to a directory: `-o PATH` or `--output-dir PATH`. Default is the current directory. 25 | 26 | Atria skips completed analysis by default. Use `-f` or `--force` to disable the feature. 27 | 28 | ### Order of processing 29 | 30 | Order of trimming and filtration processing methods. Unlisted process will not be done. See default for process names. 
31 | 32 | - `--order PROCESS...` or `-O PROCESS...`: default: 33 | 34 | - CheckIdentifier 35 | - PolyG 36 | - PolyT 37 | - PolyA 38 | - PolyC 39 | - LengthFilter 40 | - AdapterTrim 41 | - HardClipEndR1 42 | - HardClipEndR2 43 | - HardClipAfterR1 44 | - HardClipAfterR2 45 | - HardClipFrontR1 46 | - HardClipFrontR2 47 | - QualityTrim 48 | - TailNTrim 49 | - MaxNFilter 50 | - LengthFilter 51 | - ComplexityFilter 52 | - PCRDedup 53 | 54 | 55 | ### Poly X Tail Trimming (PolyG / PolyT / PolyA / PolyC) 56 | 57 | Remove poly-X tails. Suggest to enable `--polyG` for Illumina NextSeq/NovaSeq data. 58 | 59 | - Enable: `--polyG`, `--polyT`, `--polyA`, and/or `--polyC` (default: disabled) 60 | 61 | - Trim poly X tail if length > INT: `--poly-length 10` 62 | 63 | ### Adapter Trimming (AdapterTrim) 64 | 65 | Multiple adapter pairs are allowed from Atria v4. 66 | 67 | - Read 1 adapter(s) (after DNA insert): `-a SEQ...` or `--adapter1 SEQ...` (default: AGATCGGAAGAGCACACGTCTGAACTCCAGTCA) 68 | 69 | - Read 2 adapter(s) (after DNA insert): `-A SEQ...` or `--adapter2 SEQ...` (default: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT) (if paired-end) 70 | 71 | - Disable: `--no-adapter-trim` 72 | 73 | - `--detect-adapter` if you do not know adapter sequences. 74 | >Atria does not trim detected adapters automatically, please check results first. 75 | 76 | #### Paired-end Consensus Calling 77 | 78 | The overlapped regions of read pairs are checked and corrected. **It is available only when input files are paired-end and Adapter Trimming is on.** 79 | 80 | - Disable: `--no-consensus` 81 | 82 | ### Hard Clip End (HardClipEndR1 / HardClipEndR2) 83 | 84 | Remove the last INT bases from 3' end (tail). 
85 | 86 | - Number of bases to remove in read 1: `-z INT` or `--clip3-r1 INT` (default: disabled) 87 | 88 | - Number of bases to remove in read 2: `-Z INT` or `--clip3-r2 INT` (default: disabled) 89 | 90 | ### Hard Clip After N Bases (HardClipAfterR1 / HardClipAfterR2) 91 | 92 | Resize reads to a fixed length by discarding extra bases in 3' end (tail). 93 | 94 | - Number of bases to keep in read 1: `-b INT` or `--clip-after-r1 INT` (default: disabled) 95 | 96 | - Number of bases to keep in read 2: `-B INT` or `--clip-after-r2 INT` (default: disabled) 97 | 98 | ### Hard Clip Front (HardClipFrontR1 / HardClipFrontR2) 99 | 100 | Remove the first INT bases from 5' end (front). 101 | 102 | - Number of bases to remove in read 1: `-e INT` or `--clip5-r1 INT` (default: disabled) 103 | 104 | - Number of bases to remove in read 2: `-E INT` or `--clip5-r2 INT` (default: disabled) 105 | 106 | ### Quality Trimming (QualityTrim) 107 | 108 | Trim low-quality tails. Trimming read tails when the average quality of bases in a sliding window is low. 109 | 110 | - Average quality threshold: `-q 20` or `--quality-score 20` (default: 20) 111 | 112 | - Sliding window length: `--quality-kmer 5` (default: 5) 113 | 114 | - FastQ quality format: `--quality-format Illumina1.8`, or `--quality-format 33` (default: 33, i.e. Illumina1.8) 115 | 116 | - Disable: `--no-quality-trim` 117 | 118 | ### Tail N Trimming (TailNTrim) 119 | 120 | Trim N tails. 121 | 122 | - Disable: `--no-tail-n-trim` 123 | 124 | ### Max N Filtration (MaxNFilter) 125 | 126 | Discard a read pair if the number of N in one read is greater than a certain amount. N tails are ignored if Tail N Trimming is on. 127 | 128 | - Number of N allowed in each read: `-n 15` or `--max-n 15` (default: 15) 129 | 130 | - Disable: `-n -1` or `--max-n -1` 131 | 132 | ### Length Filtration (LengthFilter) 133 | 134 | Filter read pair length in a range. 
135 | 136 | - Read length range: `--length-range 30:999999` (default: 30:999999) 137 | 138 | - Disable: `--no-length-filtration` 139 | 140 | ### Complexity Filtration (ComplexityFilter) 141 | 142 | Discard reads with low complexity. Complexity is the percentage of base that is different from its next base. 143 | 144 | - Enable: `--enable-complexity-filtration` (default: disabled) 145 | 146 | - Complexity threshold: `--min-complexity 0.3` (default: 0.3) 147 | 148 | ### Remove PCR duplicates 149 | 150 | Only write unique sequences (dedup). Paired reads are only considered identical if both reads are duplicates to both reads in a previous pair. 151 | 152 | > Dedup uses LARGE memory to store all unique sequences. 153 | 154 | - Enable: `--pcr-dedup`. 155 | 156 | - Also write a count table of PCR duplicates: `--pcr-dedup-count`. 157 | 158 | ### Parallel computing 159 | 160 | - Use INT threads: `-t 8` or `--threads 8` (default: 8) 161 | 162 | - If memory is not sufficient, use `--log2-chunk-size INT` where INT is from 23 to 25. Memory usage reduces exponentially as it decreases. 163 | 164 | Try `atria -h` or `atria --help` for more information. 165 | """ 166 | 167 | function atria_markdown_help() 168 | println(stderr) 169 | show(stderr, "text/plain", atria_markdown_help_text) 170 | println(stderr) 171 | end 172 | 173 | 174 | #= Future supports 175 | ================== 176 | 177 | ### UMI (Unique Molecular Identifier) 178 | 179 | Trim and extract UMI to the first part of read names, so they can be presented in BAM records after mapping. 180 | 181 | - Enable and specify UMI location(s): `--umi LOC...`, and LOC can be: 182 | + `INDEX1`: the R1 index is UMI. 183 | + `INDEX2`: the R2 index is UMI. 184 | + `READ1`: the head of read1 is UMI. 185 | + `READ2`: the head of read2 is UMI. 186 | (default: disabled) 187 | 188 | - If UMI locations contain `READ1` and/or `READ2`: 189 | + UMI length argument `--umi-len INT` is required. 
190 | + Skip several bases after UMI: `--umi-skip INT` (default: 0) 191 | 192 | ### Primer Trimming 193 | 194 | Trim primers from 5' and 3' ends (default: no primer trimming) 195 | 196 | - Directly provide primer sequence(s): 197 | + `-m SEQ...` or `--primer1 SEQ...`: primer(s) at 5' end of read 1, and their reverse complement appended to 3' end of read 2. 198 | 199 | + `-M SEQ...` or `--primer2 SEQ...`: primer(s) at 5' end of read 2, and their reverse complement appended to 3' end of read 1. 200 | 201 | - Or provide a primer table: `-P FILE` or `--primers FILE`. Format of primer table: 202 | + Each line is a primer set. 203 | + Columns are primer1, primer2, primer name. 204 | + Delimiter is TAB (`\t`). 205 | + No header line; lines starting with `#` are ignored. 206 | 207 | 208 | =# -------------------------------------------------------------------------------- /src/BioBits/get_seq.jl: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | N2gap(bit::T) where T <: Union{UInt8, UInt16, UInt32, UInt64, UInt128} 4 | 5 | Convert N (1111) to gap (0000) in biological `bit`. 
6 | """ 7 | function N2gap end 8 | 9 | for T in (UInt8, UInt16, UInt32, UInt64, UInt128) 10 | @eval @inline function N2gap(bit::$T) 11 | nbase_1 = $(sizeof(T) * 2 - 1) 12 | 13 | N_bit = $(convert(T, 0b1111)) 14 | if bit & N_bit == N_bit 15 | bit &= ~N_bit 16 | end 17 | for i in 1:nbase_1 18 | N_bit = $(convert(T, 0b1111)) << (4*i) 19 | if bit & N_bit == N_bit 20 | bit &= ~N_bit 21 | end 22 | end 23 | bit 24 | end 25 | end 26 | 27 | 28 | struct SeqHead{T} 29 | a::T 30 | b::T 31 | function SeqHead{T}(a::T, b::T) where T <: Union{UInt8, UInt16, UInt32, UInt64} 32 | new(a, b) 33 | end 34 | end 35 | 36 | @inline function SeqHead(a::T, b::T) where T <: Union{UInt8, UInt16, UInt32, UInt64} 37 | SeqHead{T}(a,b) 38 | end 39 | 40 | """ 41 | SeqHead(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32, UInt64} 42 | 43 | # Fields 44 | 45 | - `a::T`: the bits of sequence from index 1. 46 | 47 | - `b::T`: the bits of sequence from index 2. 48 | 49 | # Argument 50 | 51 | - `seq::LongDNA{4}`: the seq has to be `bitsafe!`. 
52 | """ 53 | function SeqHead(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32} 54 | bit = seq.data[1] 55 | a = unsafe_trunc(T, bit) 56 | b = unsafe_trunc(T, bit >> 4) 57 | SeqHead{T}(a, b) 58 | end 59 | function SeqHead(::UInt64, seq::LongDNA{4}) 60 | p = pointer(seq.data) 61 | a = unsafe_load(p) 62 | if length(seq) > 0 63 | c = unsafe_load(p+1) 64 | b = (a >> 4) | (c << 4) 65 | else 66 | b = a >> 4 67 | end 68 | SeqHead{UInt64}(a, b) 69 | end 70 | for T in (UInt8, UInt16, UInt32, UInt64) 71 | @eval @inline SeqHead{$T}(seq::LongDNA{4}) = SeqHead($(typemin(T)), seq) 72 | end 73 | 74 | 75 | struct SeqHeadSet 76 | s64::SeqHead{UInt64} 77 | s32::SeqHead{UInt32} 78 | s16::SeqHead{UInt16} 79 | s8::SeqHead{UInt8} 80 | function SeqHeadSet(seq::LongDNA{4}) 81 | bitsafe!(seq) 82 | s64 = SeqHead{UInt64}(seq) 83 | s32 = SeqHead{UInt32}(seq) 84 | s16 = SeqHead{UInt16}(seq) 85 | s8 = SeqHead{UInt8}(seq) 86 | new(s64, s32, s16, s8) 87 | end 88 | end 89 | function SeqHeadSet(seq::AbstractString) 90 | SeqHeadSet(LongDNA{4}(seq)) 91 | end 92 | 93 | function BioSequences.LongDNA{4}(s::SeqHeadSet) 94 | LongDNA{4}([s.s64.a], 0x0000000000000010) # 16 % UInt64 95 | end 96 | 97 | """ 98 | TruncSeq(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32, UInt64} 99 | 100 | # Fields 101 | 102 | - `a::T`: the bits of sequence from index 1. 103 | 104 | - `b::T`: the bits of sequence from index 2. 105 | 106 | - `a1::T`: the bits of `seq[1]`. 107 | 108 | # Argument 109 | 110 | - `seq::LongDNA{4}`: the seq has to be `bitsafe!`. 
111 | """ 112 | struct TruncSeq{T} 113 | a::T 114 | b::T 115 | a1::T 116 | function TruncSeq{T}(a::T, b::T, a1::T) where T <: Union{UInt8, UInt16, UInt32, UInt64} 117 | new(a, b, a1) 118 | end 119 | end 120 | 121 | for T in (UInt8, UInt16, UInt32, UInt64) 122 | @eval @inline TruncSeq(a::$T, b::$T, a1::$T) = TruncSeq{$T}(a,b,a1) 123 | end 124 | 125 | for T in (UInt8, UInt16, UInt32) 126 | @eval @inline function TruncSeq(::$T, seq::LongDNA{4}) 127 | bit = seq.data[1] #|> N2gap 128 | a = unsafe_trunc($T, bit) 129 | b = unsafe_trunc($T, bit >> 4) 130 | a1 = $(T(0b1111)) & a 131 | TruncSeq{$T}(a, b, a1) # `$T` must be interpolated: the loop variable `T` is not bound when the generated method runs 132 | end 133 | end 134 | function TruncSeq(::UInt64, seq::LongDNA{4}) 135 | p = pointer(seq.data) 136 | a = unsafe_load(p) #|> N2gap 137 | c = unsafe_load(p+1) #|> N2gap 138 | if length(seq) > 0 139 | c = unsafe_load(p+1) 140 | b = (a >> 4) | (c << 4) 141 | else 142 | b = a >> 4 143 | end 144 | a1 = 0x000000000000000f & a 145 | TruncSeq{UInt64}(a, b, a1) 146 | end 147 | 148 | 149 | for T in (UInt8, UInt16, UInt32, UInt64) 150 | @eval @inline TruncSeq{$T}(seq::LongDNA{4}) = TruncSeq($(typemin(T)), seq) 151 | end 152 | 153 | """ 154 | get_pointer(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32, UInt64} 155 | """ 156 | @inline get_pointer(::UInt64, seq::LongDNA{4}) = pointer(seq.data) 157 | for T in (UInt8, UInt16, UInt32) 158 | @eval @inline get_pointer(::$T, seq::LongDNA{4}) = 159 | Core.bitcast($(Ptr{T}), pointer(seq.data)) 160 | end 161 | 162 | 163 | """ 164 | get_unsafe_index_of_last_bitseq(::T, seq::LongDNA{4}) 165 | get_unsafe_index_of_last_bitseq(::T, seq.len::Int64) 166 | get_unsafe_index_of_last_bitseq(::T, seq.len::UInt64) 167 | 168 | - `::T` is one of UInt8, UInt16, UInt32, UInt64. 169 | 170 | Get the index of the last full-long bitseq. It is unsafe because the returned index can be negative. 
171 | """ 172 | function get_unsafe_index_of_last_bitseq end 173 | 174 | for T in (UInt8, UInt16, UInt32, UInt64) 175 | @eval @inline get_unsafe_index_of_last_bitseq(::$T, seq::LongDNA{4}) = 176 | (seq.len % Int64) - $(sizeof(T) * 2 - 2) 177 | @eval @inline get_unsafe_index_of_last_bitseq(::$T, seq_len::Int64) = 178 | seq_len - $(sizeof(T) * 2 - 2) 179 | @eval @inline get_unsafe_index_of_last_bitseq(::$T, seq_len::UInt64) = 180 | (seq_len % Int64) - $(sizeof(T) * 2 - 2) 181 | end 182 | 183 | """ 184 | unsafe_bitseq(seq_data_ptr::Ptr{T}, idx::Int) => bitseq 185 | unsafe_bitseq(seq_data_ptr::Ptr{T}, idx::Int, max_idx::Int) => bitseq, num_base_extracted 186 | 187 | - `seq_data_ptr::Ptr{T}`: the pointer to `(seq::LongDNA{4}).data`. `Ptr{T}` can be converted to `Ptr` of `UInt8`, `UInt16`, `UInt32`, or `UInt64`. 188 | 189 | - `idx`: nucleotide index of `(seq::LongDNA{4}).data`. 190 | 191 | - `max_idx`: should be equal to `(seq::LongDNA{4}).len`. Change bits after it to 0. It does not mask bits if `max_idx` < `(seq::LongDNA{4}).len`, but affects num_base_extracted. 192 | 193 | # Caution 194 | 195 | When `idx` is even, the bitseq will always start from 0b0000, because it simply shift 4 bits from `idx - 1`. 
196 | """ 197 | function unsafe_bitseq end 198 | 199 | for T in (UInt8, UInt16, UInt32, UInt64) 200 | @eval @inline function unsafe_bitseq(seq_data_ptr::Ptr{$T}, idx::Int) 201 | idx_c = idx - 1 202 | bitseq = unsafe_load(seq_data_ptr + idx_c ÷ 2) 203 | access_by_shift = idx_c % 2 == 1 204 | if access_by_shift 205 | # odd 0-based offset: the wanted base is the high nibble of the first loaded byte, 206 | # so drop the low nibble (the previous base); the top 4 bits of the result become 0. 207 | bitseq >>= 0x04 208 | end 209 | return bitseq 210 | end 211 | end 212 | 213 | 214 | for T in (UInt8, UInt16, UInt32, UInt64) 215 | @eval @inline function unsafe_bitseq(seq_data_ptr::Ptr{$T}, idx::Int, max_idx::Int) 216 | idx_c = idx - 1 217 | bitseq = unsafe_load(seq_data_ptr + idx_c ÷ 2) 218 | access_by_shift = idx_c % 2 == 1 219 | if access_by_shift 220 | # odd 0-based offset: drop the low nibble (the previous base) 221 | bitseq >>= 0x04 222 | end 223 | 224 | nbase = $(sizeof(T) * 2) 225 | idx_stop = idx_c + nbase 226 | nbase_overflow = idx_stop - max_idx 227 | if nbase_overflow > 0 228 | # bases past max_idx are not masked here; they are assumed to be already zeroed by bitsafe! 229 | # bitseq &= ($(typemax(T)) >> (nbase_overflow * 4)) 230 | num_base_extracted = nbase - nbase_overflow 231 | else 232 | num_base_extracted = access_by_shift ? 
nbase - 1 : nbase 233 | end 234 | bitseq, num_base_extracted 235 | end 236 | end 237 | 238 | 239 | function bin(x) 240 | replace(bitstring(x), r"(....)" => s"\1 ") 241 | end 242 | -------------------------------------------------------------------------------- /benchmark/time_stats_plot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | if (is.na(packageDescription("plotly")[1])) install.packages("plotly") 4 | library(plotly, quietly = T, warn.conflicts = F) 5 | if (is.na(packageDescription("argparse")[1])) install.packages("argparse") 6 | library(argparse, quietly = T) 7 | 8 | parser <- ArgumentParser(description='Plot time stats (speed vs threads/CPU)') 9 | parser$add_argument('-i', '--input', dest='input', metavar='FILE', type='character', 10 | required=TRUE, nargs='+', 11 | help='[REQUIRED] input time stats tables generated from time_stats.jl (1st=ungz, 2nd=gz') 12 | parser$add_argument('-o', '--output', dest='out', metavar='PLOT', type='character', 13 | default="time_stats_plot.html", 14 | help='output html heatmap file (default: time_stats_plot.html)') 15 | 16 | args <- parser$parse_args() 17 | 18 | if (FALSE){ 19 | setwd("~/analysis/atria-benchmark/simulate") 20 | args <- parser$parse_args(c("-i", "stats.time_benchmark3.df.txt", "stats.time_benchmark_gz3.df.txt", "-o", "time_stats_plot2.html")) 21 | } 22 | 23 | wrapper <- function(input_path, show_legend, is_gz){ 24 | 25 | input <- read.delim(input_path, header=TRUE, as.is=TRUE) 26 | 27 | input$Trimmer <- "" 28 | input$Trimmer[grepl("atria", input$Command)] <- "Atria (consensus)" 29 | input$Trimmer[grepl("atria --no-consensus", input$Command)] <- "Atria" 30 | input$Trimmer[grepl("AdapterRemoval", input$Command)] <- "AdapterRemoval" 31 | input$Trimmer[grepl("skewer", input$Command)] <- "Skewer" 32 | input$Trimmer[grepl("trim_galore", input$Command)] <- "Trim Galore" 33 | input$Trimmer[grepl("trimmomatic", input$Command)] <- "Trimmomatic" 34 | 
input$Trimmer[grepl("ktrim", input$Command)] <- "Ktrim" 35 | input$Trimmer[grepl("atropos", input$Command)] <- "Atropos" 36 | input$Trimmer[grepl("fastp", input$Command)] <- "Fastp" 37 | input$Trimmer[grepl("SeqPurge", input$Command)] <- "SeqPurge" 38 | 39 | input_labels <- c("Atria (consensus)", 40 | "Atria", 41 | "AdapterRemoval", 42 | "Skewer", 43 | "Trim Galore", 44 | "Trimmomatic", 45 | "Ktrim", 46 | "Atropos", 47 | "Fastp", 48 | "SeqPurge" 49 | ) 50 | 51 | input$Trimmer <- factor(input$Trimmer, input_labels) 52 | 53 | # input$Command <- NULL 54 | 55 | input_value = input 56 | for (j in 1:ncol(input)){ 57 | for (i in 1:nrow(input)){ 58 | input_value[i,j] <- sub(" ±.*", "", input[i,j]) 59 | } 60 | if (!any(is.na(as.numeric(input_value[,j])))) { 61 | if (class(input_value[,j]) != "factor") { 62 | input_value[,j] <- as.numeric(input_value[,j]) 63 | } 64 | } 65 | } 66 | 67 | input_sd = input 68 | for (j in 1:ncol(input)){ 69 | for (i in 1:nrow(input)){ 70 | input_sd[i,j] <- sub(".*± ", "", input[i,j]) 71 | } 72 | if (!any(is.na(as.numeric(input_sd[,j])))) { 73 | if (class(input_value[,j]) != "factor") { 74 | input_sd[,j] <- as.numeric(input_sd[,j]) 75 | if (all(input_sd[,j] == input_value[,j])) { 76 | input_sd[,j] <- 0 77 | } 78 | } 79 | } 80 | } 81 | 82 | input_high = input_value 83 | for (j in 1:ncol(input)){ 84 | for (i in 1:nrow(input)){ 85 | if (is.numeric(input_sd[i,j])){ 86 | input_high[i,j] = input_value[i,j] + input_sd[i,j] 87 | } 88 | } 89 | } 90 | input_low = input_value 91 | for (j in 1:ncol(input)){ 92 | for (i in 1:nrow(input)){ 93 | if (is.numeric(input_sd[i,j])){ 94 | input_low[i,j] = input_value[i,j] - input_sd[i,j] 95 | } 96 | } 97 | } 98 | 99 | 100 | if (sum(input_sd$Speed..M.Bases.s.) == 0){ 101 | speed_error_y_array = NULL 102 | efficiency_error_y_array = NULL 103 | } else { 104 | speed_error_y_array <- input_sd$Speed..M.Bases.s. 105 | efficiency_error_y_array <- input_sd$Efficiency..M.Bases.s.CPU. 
106 | } 107 | # writeLines(as.character(show_legend)) 108 | if (is_gz){ 109 | gz_title = " for Compressed Files" 110 | } else { 111 | gz_title = "" 112 | } 113 | x_tick_vals = unique(input_value$Threads) 114 | 115 | fig_speed <- plot_ly(x=input_value$Threads, y=input_value$Speed..M.Bases.s., color=input_value$Trimmer, legendgroup=input_value$Trimmer, error_y = list(array=speed_error_y_array), showlegend=show_legend) %>% 116 | add_lines(line = list(shape = "spline" )) %>% 117 | add_markers(showlegend = FALSE) %>% 118 | layout( 119 | xaxis = list( 120 | title = "Threads Assigned", 121 | tickvals = x_tick_vals 122 | ), yaxis = list( 123 | title = paste0("Speed", gz_title, "\n(M Bases / Second)") 124 | )) 125 | fig_speed 126 | 127 | fig_efficiency <- plot_ly(x=input_value$Threads, y=input_value$Efficiency..M.Bases.s.CPU., color=input_value$Trimmer, legendgroup=input_value$Trimmer, error_y = list(array=efficiency_error_y_array), showlegend=FALSE) %>% 128 | add_lines(line = list(shape = "spline")) %>% 129 | add_markers(showlegend = FALSE) %>% 130 | layout( 131 | xaxis = list( 132 | title = "Threads Assigned", 133 | tickvals = x_tick_vals 134 | ), yaxis = list( 135 | title = paste0("Efficiency", gz_title, "\n(M Bases / Second / CPU Usage)") 136 | )) 137 | fig_efficiency 138 | 139 | fig_speed_vs_realCPU <- plot_ly(x=input_value$CPU, y=input_value$Speed..M.Bases.s., color=input_value$Trimmer, legendgroup=input_value$Trimmer, error_y = list(array=speed_error_y_array), showlegend=FALSE) %>% 140 | add_trace(line = list(shape = "spline")) %>% 141 | add_markers(showlegend = FALSE) %>% 142 | layout( 143 | xaxis = list( 144 | title = "Real Average CPU Usage", 145 | tickvals = x_tick_vals 146 | ), yaxis = list( 147 | title = paste0("Speed", gz_title, "\n(M Bases / Second)") 148 | )) 149 | fig_speed_vs_realCPU # y is Speed, so the error bars come from the speed SD, not the efficiency SD 150 | 151 | return(list( 152 | input_value = input_value, 153 | input_sd = input_sd, 154 | input_high = input_high, 155 | input_low = input_low, 156 | fig_speed = 
fig_speed, 157 | fig_efficiency = fig_efficiency, 158 | fig_speed_vs_realCPU = fig_speed_vs_realCPU 159 | )) 160 | } 161 | 162 | stat_1 <- wrapper(args$input[1], T, F) 163 | stat_2 <- wrapper(args$input[2], T, T) 164 | 165 | 166 | p <- subplot(stat_1$fig_speed %>% layout(legend = list(orientation='h', 167 | y=1.3, 168 | bgcolor=rgb(0,0,0,0))), 169 | stat_1$fig_speed_vs_realCPU, 170 | stat_2$fig_speed, 171 | stat_2$fig_speed_vs_realCPU, 172 | nrows=2, shareX = T, shareY = T) 173 | 174 | writeLines(sprintf("Output plot: %s", args$out)) 175 | htmlwidgets::saveWidget(as_widget(p), args$out) 176 | 177 | plogx <- subplot(stat_1$fig_speed %>% layout(xaxis = list(type='log'), 178 | legend = list(orientation='h', 179 | y=1.3, 180 | bgcolor=rgb(0,0,0,0))), 181 | stat_1$fig_speed_vs_realCPU %>% layout(xaxis = list(type='log')), 182 | stat_2$fig_speed %>% layout(xaxis = list(type='log')), 183 | stat_2$fig_speed_vs_realCPU %>% layout(xaxis = list(type='log')), 184 | nrows=2, shareX = T, shareY = T) 185 | plogx 186 | 187 | outlogx = sub(".html$", ".logx.html", args$out) 188 | writeLines(sprintf("Output logx plot: %s", outlogx)) 189 | htmlwidgets::saveWidget(as_widget(plogx), outlogx) 190 | -------------------------------------------------------------------------------- /benchmark/trimming-functions.bash: -------------------------------------------------------------------------------- 1 | #!bash 2 | export JULIA_NUM_THREADS=16 3 | 4 | time=/usr/bin/time 5 | ls $time 2>/dev/null 6 | if [[ $? 
> 0 ]] 7 | then 8 | time=/export/home/CFIA-ACIA/chuanj/.local/bin/time 9 | fi 10 | 11 | run_atria(){ 12 | local num_threads=1 13 | if [[ $1 ]]; then 14 | num_threads=$1 15 | fi 16 | $time -v atria --no-consensus \ 17 | -r $r1 -R $r2 \ 18 | -o Atria \ 19 | --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \ 20 | --adapter1 $a1 --adapter2 $a2 --threads $num_threads 21 | } 22 | 23 | run_atria_consensus(){ 24 | local num_threads=1 25 | if [[ $1 ]]; then 26 | num_threads=$1 27 | fi 28 | $time -v atria \ 29 | -r $r1 -R $r2 \ 30 | -o Atria-consensus \ 31 | --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \ 32 | --adapter1 $a1 --adapter2 $a2 --threads $num_threads 33 | } 34 | 35 | run_adapterremoval() { 36 | local num_threads=1 37 | if [[ $1 ]]; then 38 | num_threads=$1 39 | fi 40 | local err=3 41 | local folder="AdapterRemoval-$err" 42 | mkdir -p "$folder" 43 | if [[ $r1 = *gz ]]; then 44 | $time -v AdapterRemoval --file1 $r1 --file2 $r2 \ 45 | --basename "$folder"/adapterremoval \ 46 | --adapter1 $a1 --adapter2 $a2 \ 47 | --mm $err --minlength 0 --threads $num_threads --gzip 48 | else 49 | $time -v AdapterRemoval --file1 $r1 --file2 $r2 \ 50 | --basename "$folder"/adapterremoval \ 51 | --adapter1 $a1 --adapter2 $a2 \ 52 | --mm $err --minlength 0 --threads $num_threads 53 | fi 54 | } 55 | 56 | run_skewer(){ 57 | local num_threads=1 58 | if [[ $1 ]]; then 59 | num_threads=$1 60 | fi 61 | local OUTDIR="Skewer" 62 | mkdir -p $OUTDIR 63 | if [[ $r1 = *gz ]]; then 64 | $time -v skewer --quiet \ 65 | -x $a1 -y $a2 -m pe \ 66 | -l 0 -o $OUTDIR/$OUTDIR $r1 $r2 --threads $num_threads --compress 67 | else 68 | $time -v skewer --quiet \ 69 | -x $a1 -y $a2 -m pe \ 70 | -l 0 -o $OUTDIR/$OUTDIR $r1 $r2 --threads $num_threads 71 | fi 72 | } 73 | 74 | run_trim_galore(){ 75 | local num_threads=1 76 | if [[ $1 ]]; then 77 | num_threads=$1 78 | fi 79 | local OUTDIR="TrimGalore" 80 | mkdir -p $OUTDIR 81 | $time -v trim_galore --cores $num_threads \ 82 | 
--quality 0 \ 83 | -o $OUTDIR \ 84 | --adapter $a1 \ 85 | --adapter2 $a2 \ 86 | -e 0.1 --stringency 1 \ 87 | --max_n 100 --length 0 \ 88 | --paired $r1 $r2 89 | } 90 | 91 | run_trimmomatic(){ 92 | local num_threads=1 93 | if [[ $1 ]]; then 94 | num_threads=$1 95 | fi 96 | local OUTDIR="Trimmomatic" 97 | mkdir -p $OUTDIR 98 | rm -f adapters.fa 99 | echo '>TruSeq3/1' >> adapters.fa 100 | echo $a1 >> adapters.fa 101 | echo '>TruSeq3/2' >> adapters.fa 102 | echo $a2 >> adapters.fa 103 | output=$OUTDIR/out 104 | if [[ $r1 = *gz ]]; then 105 | local isgz=.gz 106 | else 107 | local isgz= 108 | fi 109 | 110 | $time -v java -jar /usr/software/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $num_threads -phred33 $r1 $r2 $output-pair1.paired.fq$isgz $output-pair1.unpaired.fq$isgz $output-pair2.paired.fq$isgz $output-pair2.unpaired.fq$isgz ILLUMINACLIP:adapters.fa:2:30:10:1:TRUE:keepBothReads MINLEN:1 111 | } 112 | 113 | run_ktrim(){ 114 | local num_threads=1 115 | if [[ $1 ]]; then 116 | num_threads=$1 117 | fi 118 | local OUTDIR="Ktrim" 119 | mkdir -p $OUTDIR 120 | $time -v ktrim -1 $r1 -2 $r2 -t $num_threads -p 33 -q 1 -s 10 -a $a1 -b $a2 -o Ktrim/ktrim 121 | } 122 | 123 | run_fastp(){ 124 | local num_threads=1 125 | if [[ $1 ]]; then 126 | num_threads=$1 127 | fi 128 | local OUTDIR="fastp" 129 | output=$OUTDIR/out.fastp 130 | if [[ $r1 = *gz ]]; then 131 | local isgz=.gz 132 | else 133 | local isgz= 134 | fi 135 | mkdir -p $OUTDIR 136 | $time -v fastp --in1 $r1 --in2 $r2 --out1 $output.r1.fq$isgz --out2 $output.r2.fq$isgz \ 137 | -z 6 --adapter_sequence $a1 --adapter_sequence_r2 $a2 --disable_trim_poly_g --disable_quality_filtering --disable_length_filtering --thread $num_threads 138 | } 139 | 140 | run_seqpurge() { 141 | local num_threads=1 142 | if [[ $1 ]]; then 143 | num_threads=$1 144 | fi 145 | local folder=SeqPurge 146 | mkdir -p "$folder" 147 | # output always gziped 148 | $time -v SeqPurge -in1 $r1 -in2 $r2 -out1 "$folder"/$r1.seqpurge.fq.gz -out2 
"$folder"/$r2.seqpurge.fq.gz \ 149 | -a1 $a1 -a2 $a2 -mep 0.1 \ 150 | -qcut 0 -min_len 0 -summary "$folder"/seqpurge.summary -threads $num_threads 151 | } 152 | 153 | run_cutadapt() { 154 | local num_threads=1 155 | if [[ $1 ]]; then 156 | num_threads=$1 157 | fi 158 | local OUTDIR="Cutadapt" 159 | output=$OUTDIR/out.cutadapt 160 | if [[ $r1 = *gz ]]; then 161 | local isgz=.gz 162 | else 163 | local isgz= 164 | fi 165 | mkdir -p "$OUTDIR" 166 | $time -v cutadapt -j $num_threads -a $a1 -A $a2 -o $output.R1.fq$isgz -p $output.R2.fq$isgz $r1 $r2 167 | } 168 | 169 | run_atropos() { 170 | # Atropos 1.1.29 with Python 3.8.5 171 | local num_threads=1 172 | if [[ $1 ]]; then 173 | num_threads=$1 174 | fi 175 | if [[ $r1 = *gz ]]; then 176 | local isgz=.gz 177 | else 178 | local isgz= 179 | fi 180 | local folder="Atropos" 181 | mkdir -p "$folder" 182 | if [[ $num_threads == 1 ]] 183 | then 184 | $time -v atropos trim -a $a1 -A $a2 \ 185 | -o "$folder"/$r1.atropos.fq$isgz -p "$folder"/$r2.atropos.fq$isgz -pe1 $r1 -pe2 $r2 \ 186 | --aligner insert -e 0.1 187 | else 188 | $time -v atropos trim -a $a1 -A $a2 \ 189 | -o "$folder"/$r1.atropos.fq$isgz -p "$folder"/$r2.atropos.fq$isgz -pe1 $r1 -pe2 $r2 \ 190 | --aligner insert -e 0.1 --threads $num_threads --preserve-order 191 | fi 192 | } 193 | 194 | mapping() { 195 | bwa mem -v 1 -t 25 $bwa_ref $1 $2 |\ 196 | samtools view -@ 10 -b -o $1.bam 197 | } 198 | 199 | 200 | mapping_bowtie2(){ 201 | bowtie2 --maxins 800 --threads 25 -x $bwa_ref-bowtie2 -1 $1 -2 $2 2> $1.bowtie2.stat |\ 202 | samtools view -@ 10 -b -o $1.bowtie2.bam 203 | } 204 | mapping_hisat2(){ 205 | hisat2 --threads 25 -x $bwa_ref-hisat2 -1 $1 -2 $2 -S $1.hisat2.sam 2> $1.hisat2.stat 206 | } 207 | qualtrim(){ 208 | local DIR=`dirname "$1"`/../trimmed-qualtrim 209 | if [[ $1 = *gz ]] 210 | then 211 | local num_threads=20 212 | local gzext=.gz 213 | else 214 | local num_threads=8 215 | local gzext= 216 | fi 217 | if [[ $3 ]] 218 | then 219 | local num_threads=$3 220 | 
# Summarise bowtie2 alignment statistics found under */*qualtrim/.
#
# Arguments:
#   $1  quality-score tag (optional). When empty, the glob below matches the
#       stat files of every quality setting.
#
# Prints two tab-aligned tables separated by a blank line: first the
# "aligned concordantly exactly 1 time" rows, then the
# "aligned 0 times concordantly or discordantly" rows.
bowtie2stat(){
    local QSCORE=$1
    local printed= pattern

    for pattern in \
        "aligned concordantly exactly 1 time" \
        "aligned 0 times concordantly or discordantly"
    do
        # Blank separator line before every table except the first.
        [[ $printed ]] && echo
        printed=1
        grep -v Warning */*qualtrim/*qual$QSCORE*bowtie2.stat | sed 's#/[^:]*#\t#' | grep "$pattern" | column -ts$'\t'
    done
}
"""
    julia_wrapper_detect_adapter_pe(ARGS::Vector{String}; exit_after_help = true)

Command-line entry point for paired-end adapter detection.

Parses `ARGS` with `parsing_args`, then for every `read1`/`read2` file pair:
opens both files (through `pigz` or `pbzip2` when the R1 file name ends in
`.gz` / `.bz2`), loads a single batch of reads with `load_fqs_threads!`, runs
`check_pe_match` on it, prints the per-file result and appends it to the
adapter detection summary.

After all pairs are processed, the summary is written as a tab-delimited file
named `atria_adapter_detect_summary.<timestamp>.txt` inside the output
directory, and an explanatory note is printed.

Returns `0`, including the early return taken when `parsing_args` returns
`nothing`.
"""
function julia_wrapper_detect_adapter_pe(ARGS::Vector{String}; exit_after_help = true)

    time_program_initializing = time()

    args = parsing_args(ARGS; exit_after_help = exit_after_help)

    if args === nothing # ARGS is ["-h"]
        return 0
    end
    args_range_test(args)

    nthread = args["threads"]
    outdir = args["output-dir"]

    nfile = length(args["read1"])
    file_range = 1:nfile

    #================== Arguments ====================#

    max_chunk_size = 2 ^ args["log2-chunk-size"]

    # NOTE: TruncSeq has some unknown accuracy problems.
    kmer_tolerance = args["kmer-tolerance" ]
    kmer_n_match = args["kmer-n-match" ]

    # quality
    quality_offset = Trimmer.get_quality_offset(args["quality-format"])


    mkpath(outdir)


    #================== Main function and common variables ====================#

    # Raw byte buffers for R1 / R2; reused (and resized) for every file pair.
    in1bytes = Vector{UInt8}(undef, max_chunk_size)
    in2bytes = Vector{UInt8}(undef, max_chunk_size)

    # number of jobs to boxing FqRecord from UInt8 Vector
    njobs = nthread * 10
    vr1s = ntuple(_ -> Vector{FqRecord}(), njobs)
    vr2s = ntuple(_ -> Vector{FqRecord}(), njobs)

    r1s = Vector{FqRecord}()
    r2s = Vector{FqRecord}()

    # From here on this variable holds the elapsed initialisation time, not a timestamp.
    time_program_initializing = time() - time_program_initializing

    adapter_detection_summary = init_adapter_detection_summary()
    #================== Iteration for paired files ====================#
    for filenum in file_range
        # filenum = 1
        time_file_initializing = time()


        #===== file names =====#

        file1 = args["read1"][filenum]
        file2 = args["read2"][filenum]

        # check whether this sample is processed before

        # Compression is decided from the R1 file name only; R2 is opened the same way.
        isingzip = occursin(r"\.gz$"i, file1)
        isinbzip2 = occursin(r"\.bz2$"i, file1)


        #===== file IO =====#
        # Each of the two decompressor processes gets half of the threads (rounded up).
        halfthread = cld(nthread, 2)
        if isingzip
            io1 = open(`pigz -p$halfthread -cd $file1`, write=false)
            io2 = open(`pigz -p$halfthread -cd $file2`, write=false)
        elseif isinbzip2
            io1 = open(`pbzip2 -p$halfthread -cd $file1`, write=false)
            io2 = open(`pbzip2 -p$halfthread -cd $file2`, write=false)
        else
            io1 = open(file1, "r")
            io2 = open(file2, "r")
        end
        # NOTE(review): io1/io2 are only closed at the bottom of this loop body;
        # there is no try/finally, so an exception raised in between leaves them open.

        #================== Renew variables for read processing ====================#


        # setting chunk size for file 1 and file2
        chunk_size1, chunk_size2, uncompressed_size1, uncompressed_size2 = chunk_sizes(file1, file2, max_chunk_size)
        if (uncompressed_size1 == -1 || uncompressed_size2 == -1) && (isingzip || isinbzip2)
            # file is gzip but uncompressed size not known.
            # do not resize. just assume R1/2 is the original data, which means insert size is evenly-distributed.
            chunk_size1 = length(in1bytes)
            chunk_size2 = length(in2bytes)
        else
            resize!(in1bytes, chunk_size1)
            resize!(in2bytes, chunk_size2)
        end

        # clear common variables
        empty!(r1s)
        empty!(r2s)

        # Per-file counters; `cycle_wrapper` below reads and reassigns them.
        n_reads = 0
        n_r1 = 0
        n_r2 = 0
        nbatch = 0
        total_read_copied_in_loading = 0
        total_n_bytes_read1 = 0
        total_n_bytes_read2 = 0
        in1bytes_nremain = 0
        in2bytes_nremain = 0

        #================== File processing ====================#
        # Trivial tasks (they just evaluate `1`) handed to load_fqs_threads! below
        # as its task arguments.
        task_r1s_unbox = Threads.@spawn 1
        task_r2s_unbox = Threads.@spawn 1

        # the first cycle to generate compiled code?
        # Loads one batch of reads from io1/io2, runs adapter detection on it and
        # records the result. It is called exactly once per file pair.
        function cycle_wrapper()
            nbatch += 1
            n_r1_before = length(r1s) - n_reads
            n_r2_before = length(r2s) - n_reads

            if typeof(io1) <: IOStream # not compressed
                length(in1bytes) == chunk_size1 || resize!(in1bytes, chunk_size1)
                length(in2bytes) == chunk_size2 || resize!(in2bytes, chunk_size2)
                (n_r1, n_r2, r1s, r2s, ncopied) = load_fqs_threads!(io1, io2, in1bytes, in2bytes, vr1s, vr2s, r1s, r2s, task_r1s_unbox, task_r2s_unbox; remove_first_n = n_reads, njobs = njobs, quality_offset = quality_offset)
            else # gziped
                total_n_bytes_read1 += length(in1bytes) # will read INT in this batch
                total_n_bytes_read2 += length(in2bytes) # will read INT in this batch
                # EOF is predicted by comparing the bytes requested so far with the
                # uncompressed size reported by chunk_sizes.
                will_eof1 = total_n_bytes_read1 >= uncompressed_size1
                will_eof2 = total_n_bytes_read2 >= uncompressed_size2
                (n_r1, n_r2, r1s, r2s, in1bytes_nremain, in2bytes_nremain, ncopied) = load_fqs_threads!(
                    io1, io2,
                    in1bytes, in2bytes, in1bytes_nremain, in2bytes_nremain,
                    vr1s, vr2s, r1s, r2s, task_r1s_unbox, task_r2s_unbox;
                    will_eof1 = will_eof1, will_eof2 = will_eof2,
                    in1bytes_resize_before_read = chunk_size1,
                    in2bytes_resize_before_read = chunk_size2,
                    remove_first_n = n_reads, quality_offset = quality_offset,
                    njobs = njobs
                )
            end

            # Only reads present in both mates count as loaded pairs.
            n_reads = min(n_r1, n_r2)
            total_read_copied_in_loading += ncopied

            # it only get the sizes, did not change the sizes. Size changing is done in the "Read" part.
            chunk_size1, chunk_size2 = get_ideal_inbyte_sizes(in1bytes, in2bytes, n_r1, n_r2, n_r1_before, n_r2_before, max_chunk_size, chunk_size1, chunk_size2)

            # check_fq_ids(r1s::Vector{FqRecord}, r2s::Vector{FqRecord}, n_reads::Int)::nothing

            # processing reads
            # Note: detection runs with `kmer_tolerance + 1`, i.e. one more than the
            # user-supplied value.
            r1_stats, r2_stats = check_pe_match(r1s, r2s; kmer_tolerance = kmer_tolerance + 1, kmer_n_match = kmer_n_match, occurance = 0.0004)

            show_paired_adapter_result(file1, r1_stats, n_reads)
            show_paired_adapter_result(file2, r2_stats, n_reads)
            push_adapter_detection_summary!(adapter_detection_summary, file1, r1_stats, file2, r2_stats)
        end

        cycle_wrapper()

        #================== Close files ====================#

        close(io1)
        close(io2)
    end

    # Make the timestamp file-name friendly by replacing 'T', ':' and '.' with '-'.
    timestamp = replace(string(now()), r"[T:\.]" => "-")
    adapter_detection_summary_file = joinpath(outdir, "atria_adapter_detect_summary.$timestamp.txt")
    CSV.write(adapter_detection_summary_file, adapter_detection_summary, delim = '\t')
    println("""
    _________________________________

    Summary of detected adapters is saved to $adapter_detection_summary_file

    _________________________________

    Paired-end Adapter Detection Note:

    Atria detects adapter sequences using paired-end information. Adapter sequences are truncated to 16-bp, which are accurate enough for trimming. From experiments of many popular trimmers, increasing adapter length from 16 to 33 does not increase accuracy (Figure 4C of https://doi.org/10.46471/gigabyte.31).

    Adapter detection is the last choice because its accuracy is highly based on your data. If your data has been trimmed, the remaining adapters may not be enough for accurate guessing. We suggest using adapter detection only when you cannot find the actual adapter sequence.

    Besides, Atria does not automatically trim auto-detected adapters. It is your responsibility to check whether the detected adapters are real.

    Those rules can be used to check the adapter results:

    (1) An Illumina sequence file only has ONE adapter sequence.

    (2) In the same batch of NGS experiments, all R1 samples should have the SAME adapter sequence, so do R2 samples. The most prevalent adapters of R1 and R2 might be true for all your data.
    _________________________________

    Summary of detected adapters is saved to $adapter_detection_summary_file
    _________________________________

    """)

    return 0
end # func
"""
    front_trim!(r::FqRecord, ntrim::Int64)

Remove the first `ntrim` positions of `r.seq`, `r.qual` and `r.prob` in place.

- `ntrim <= 0`: the record is left untouched.
- `0 < ntrim < length(r.seq)`: positions `1:ntrim` are deleted from all three fields.
- `ntrim >= length(r.seq)`: all three fields are resized to length 0.
"""
@inline function front_trim!(r::FqRecord, ntrim::Int64)::Nothing
    ntrim <= 0 && return nothing

    if ntrim >= length(r.seq)
        # Nothing survives the trim: empty every field.
        resize!(r.seq, 0)
        resize!(r.qual, 0)
        resize!(r.prob, 0)
    else
        head = 1:ntrim
        deleteat!(r.seq, head)
        deleteat!(r.qual, head)
        deleteat!(r.prob, head)
    end
    return nothing
end
"""
    qualitymatch(r::FqRecord, q0::UInt8, qn::UInt64, n::Int64)::Int64

Locate the quality-trimming position of a read with a sliding window.

# ARGUMENTS
1. `r::FqRecord`: the FastQ record; only `r.qual` is read.
2. `q0::UInt8`: per-base quality threshold, already offset-adjusted (e.g. +33 for Illumina 1.9+).
3. `qn::UInt64`: threshold for the sum of a whole window, i.e. the adjusted quality score times `n`.
4. `n::Int64`: sliding window length.

# ALGORITHM
1. Find the first base whose quality is below `q0`. If there is none, nothing is trimmed.
2. Starting at that base, slide a window of `n` qualities towards the tail until a
   window's sum drops below `qn` (skipped when fewer than `n` bases remain).
3. From where step 2 stopped, find the first base whose quality is below `q0`.

Returns the number of leading bases to keep (the index of the base found in
step 3, minus one), or `-1` when no trimming is needed.
"""
@inline function qualitymatch(r::FqRecord, q0::UInt8, qn::UInt64, n::Int64)::Int64
    quals = r.qual
    nqual = length(quals)
    span = n - 1                      # offset from a window's first to its last base
    last_window_start = nqual - span  # last position where a full window still fits

    # Step 1: first base below the per-base threshold.
    pos = 1
    while pos <= nqual && @inbounds(quals[pos]) >= q0
        pos += 1
    end
    pos > nqual && return -1

    # Step 2: slide the window, but only if a full window fits at `pos`.
    if pos <= last_window_start
        window_sum = UInt64(@inbounds quals[pos])
        for k in (pos + 1):(pos + span)
            window_sum += @inbounds quals[k]
        end

        if window_sum >= qn
            pos += 1
            while pos <= last_window_start
                # Shift the window by one base: add the new last, drop the old first.
                window_sum += @inbounds quals[pos + span]
                window_sum -= @inbounds quals[pos - 1]
                window_sum < qn && break
                pos += 1
            end
        end
    end

    # Step 3: first low-quality base at or after the failing window.
    while pos <= nqual
        @inbounds(quals[pos]) < q0 && return pos - 1
        pos += 1
    end

    return -1
end
"""
    polyX_tail_scan(a::DNA, b::LongDNA{4}, allowed_mismatch_per_16mer::Int64; until::Int64 = 1)
    polyX_tail_scan(a::DNA, b::FqRecord, allowed_mismatch_per_16mer::Int64; until::Int64 = 1)

Scan `b` from its 3' end towards position `until` for a run of base `a`
(a poly-X tail), tolerating some mismatches.

Returns `(best_idx, n_polyX_length)`, where `best_idx` is the position at which
the tail starts and `n_polyX_length == length(b) - best_idx + 1`.
Returns `(0, 0)` when no tail is found.

The `FqRecord` method forwards to the `LongDNA{4}` method using `b.seq`.
"""
@inline function polyX_tail_scan(a::DNA, b::LongDNA{4}, allowed_mismatch_per_16mer::Int64; until::Int64 = 1)
    best_idx = 0
    N = length(b)
    n = N
    n_mismatch = 0
    allowed_mismatch = allowed_mismatch_per_16mer
    # Phase 1: walk from the last base towards `until`. A position counts as a
    # match when its bit pattern contains all bits of `a`, so ambiguous symbols
    # that include `a` match too. Stop once the mismatch count exceeds the budget.
    while n >= until
        if @inbounds(b[n]) & a == a # ambiguous DNA is true
            best_idx = n
        else
            n_mismatch += 1
            if n_mismatch > allowed_mismatch
                break
            end
        end
        n -= 1
        # NOTE(review): this re-assigns the same value every 16 bases while
        # `n_mismatch` is never reset, so the budget behaves as a total rather
        # than a per-16-mer allowance -- confirm the intent.
        if (N-n) % 16 == 0
            allowed_mismatch = allowed_mismatch_per_16mer
        end
    end

    # check if can elongate
    # Phase 2: step one position further and keep scanning, this time requiring
    # an exact match (`===`). `n_mismatch` carries over from phase 1.

    best_idx2 = 0
    n -= 1
    while n >= until
        if @inbounds(b[n]) === a
            best_idx2 = n
        else
            n_mismatch += 1
            if n_mismatch > allowed_mismatch
                break
            end
        end
        n -= 1
        if (N-n) % 16 == 0
            allowed_mismatch = allowed_mismatch_per_16mer
        end
    end
    if best_idx2 > 0 # found
        best_idx = best_idx2
    elseif best_idx == 0 # not found
        return 0,0
    end

    # reverse check
    # Phase 3: walk forward from just after `best_idx`, counting exact matches
    # (the base at `best_idx` is pre-counted) and mismatches. The walk ends at
    # the first match that follows a mismatch, when the match count exceeds
    # `allowed_mismatch_per_16mer`, or at the end of the sequence.
    n_r_match = 1
    n_r_mismatch = 0
    n = best_idx + 1
    in_mismatch_region = false
    while n_r_match <= allowed_mismatch_per_16mer && n <= N
        if @inbounds(b[n]) === a
            if in_mismatch_region
                break
            end
            n_r_match += 1
        else
            in_mismatch_region = true
            n_r_mismatch += 1
        end
        n += 1
    end
    # If the start of the tail holds at least as many mismatches as matches,
    # move the tail start forward to where the walk stopped.
    if n_r_mismatch >= n_r_match # revert
        best_idx = n
    end
    if best_idx > N # occurs when very poor match, best_idx2 > 0 and best_idx == 0
        return 0,0
    end
    n_polyX_length = N - best_idx + 1
    return best_idx, n_polyX_length
end

# Convenience method: scan the sequence stored in a FqRecord.
@inline polyX_tail_scan(a::DNA, b::FqRecord, allowed_mismatch_per_16mer::Int64; until::Int64 = 1) = polyX_tail_scan(a, b.seq, allowed_mismatch_per_16mer; until = until)
"""
    bytes_tmp1 = Vector{UInt8}(undef, 67108864) # 2^26

Used for writebytes(io1out::CodecZlibIO, outr1s, range_filter, bytes_tmp1)
"""
bytes_tmp1 = Vector{UInt8}(undef, 67108864) # 2^26
bytes_tmp2 = Vector{UInt8}(undef, 67108864) # 2^26

"""
    write_fqs_threads!(io1out::IO, io2out::IO,
        outr1s::Vector{Vector{UInt8}}, outr2s::Vector{Vector{UInt8}},
        r1s::Vector{FqRecord}, r2s::Vector{FqRecord},
        n_reads::Int, range_filter, task_write1, task_write2)

    write_fqs_threads!(io1out::IO,
        outr1s::Vector{Vector{UInt8}},
        r1s::Vector{FqRecord},
        n_reads::Int, range_filter, task_write1)

Schedule the conversion and writing of FASTQ reads on background tasks.

For each mate, an "unbox" task converts the first `n_reads` records of `r1s`
(`r2s`) into the byte buffers `outr1s` (`outr2s`), and a "write" task then
passes those buffers and `range_filter` to `writebytes`.
`task_write1` / `task_write2` are the write tasks of the previous batch; the
buffers are not touched until they have finished.

- Paired-end methods return `(unbox1, unbox2, write1, write2)` and never block
  the caller.
- Single-end methods wait for `task_write1` in the caller before spawning and
  return `(unbox1, write1)`.
- `IOStream` outputs are written record by record; any other `IO` goes through
  the module-level scratch buffers `bytes_tmp1` / `bytes_tmp2`.
"""
function write_fqs_threads!(io1out::IOStream, io2out::IOStream,
    outr1s::Vector{Vector{UInt8}}, outr2s::Vector{Vector{UInt8}},
    r1s::Vector{FqRecord}, r2s::Vector{FqRecord},
    n_reads::Int, range_filter, task_write1, task_write2)

    # R1: convert once the previous R1 write is done, then write.
    unbox1 = Threads.@spawn begin
        wait(task_write1)
        FqRecord2StringVec!(outr1s, r1s, n_reads)
    end
    write1 = Threads.@spawn begin
        wait(unbox1)
        writebytes(io1out, outr1s, range_filter)
    end

    # R2: same pipeline, independent of R1.
    unbox2 = Threads.@spawn begin
        wait(task_write2)
        FqRecord2StringVec!(outr2s, r2s, n_reads)
    end
    write2 = Threads.@spawn begin
        wait(unbox2)
        writebytes(io2out, outr2s, range_filter)
    end

    return unbox1, unbox2, write1, write2
end

function write_fqs_threads!(io1out::IO, io2out::IO,
    outr1s::Vector{Vector{UInt8}}, outr2s::Vector{Vector{UInt8}},
    r1s::Vector{FqRecord}, r2s::Vector{FqRecord},
    n_reads::Int, range_filter, task_write1, task_write2)

    unbox1 = Threads.@spawn begin
        wait(task_write1)
        FqRecord2StringVec!(outr1s, r1s, n_reads)
    end
    write1 = Threads.@spawn begin
        wait(unbox1)
        writebytes(io1out, outr1s, range_filter, bytes_tmp1)
    end

    unbox2 = Threads.@spawn begin
        wait(task_write2)
        FqRecord2StringVec!(outr2s, r2s, n_reads)
    end
    write2 = Threads.@spawn begin
        wait(unbox2)
        writebytes(io2out, outr2s, range_filter, bytes_tmp2)
    end

    return unbox1, unbox2, write1, write2
end

function write_fqs_threads!(io1out::IOStream,
    outr1s::Vector{Vector{UInt8}},
    r1s::Vector{FqRecord},
    n_reads::Int, range_filter, task_write1)

    # Single-end: the previous write is awaited synchronously.
    wait(task_write1)
    unbox1 = Threads.@spawn FqRecord2StringVec!(outr1s, r1s, n_reads)
    write1 = Threads.@spawn begin
        wait(unbox1)
        writebytes(io1out, outr1s, range_filter)
    end
    return unbox1, write1
end

function write_fqs_threads!(io1out::IO,
    outr1s::Vector{Vector{UInt8}},
    r1s::Vector{FqRecord},
    n_reads::Int, range_filter, task_write1)

    wait(task_write1)
    unbox1 = Threads.@spawn FqRecord2StringVec!(outr1s, r1s, n_reads)
    write1 = Threads.@spawn begin
        wait(unbox1)
        writebytes(io1out, outr1s, range_filter, bytes_tmp1)
    end
    return unbox1, write1
end
"""
    FqRecord2StringVec!(out::Vector{UInt8}, r::FqRecord)

Empty `out`, then serialise `r` into it as the four FASTQ lines
(`id`, sequence, `des`, quality), each terminated by `\\n`.

If the sequence is empty, a single `N` is written as the sequence and a single
`!` as the quality, so the record stays a valid FASTQ entry.
"""
@inline function FqRecord2StringVec!(out::Vector{UInt8}, r::FqRecord)::Nothing
    empty!(out)
    append!(out, r.id::Vector{UInt8})
    push!(out, 0x0a)                           # \n

    seq_is_empty = r.seq.len == 0x0000000000000000
    if seq_is_empty
        push!(out, 0x4e)                       # N
    else
        r_seq = r.seq
        offset = length(out)
        # Grow once, then fill in place instead of pushing base by base.
        resize!(out, offset + length(r_seq))
        @inbounds for (i, base) in enumerate(r_seq)
            out[offset + i] = UInt8(convert(Char, base))
        end
    end
    push!(out, 0x0a)

    append!(out, r.des::Vector{UInt8})
    push!(out, 0x0a)

    if seq_is_empty
        push!(out, 0x21)                       # !
    else
        append!(out, r.qual::Vector{UInt8})
    end
    push!(out, 0x0a)
    nothing
end


"""
    FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, stop::Int)
    FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, reads_range::UnitRange)

Serialise several reads into per-read byte buffers.

- `outrs`: the vector of byte buffers, modified in place. The `stop` method
  grows it with empty buffers when it holds fewer than `stop` entries.

- `rs`: the vector of reads to be converted.

- `stop::Int`: convert `rs[1:stop]`. The work is split into chunks of 3072
  reads, each converted on its own task; the call returns once all are done.

- `reads_range::UnitRange`: convert only the reads in this range, serially and
  without bounds checks. The caller must guarantee the range is valid for both
  `outrs` and `rs`.

Throws a `BoundsError` (after logging) when `rs` holds fewer than `stop` reads.
"""
@inline function FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, stop::Int)::Nothing
    n_outrs = length(outrs)
    if n_outrs < stop
        # make outrs larger
        append!(outrs,
            Vector{UInt8}[Base.StringVector(0) for i = 1:(stop-n_outrs)])
    end
    if length(rs) < stop
        @error "length(rs) < stop" length(rs) stop
        # The range method below indexes with @inbounds; continuing would read
        # past the end of `rs`, so fail loudly instead.
        throw(BoundsError(rs, stop))
    end
    @sync for reads_start in 1:3072:stop
        reads_end = min(reads_start + 3071, stop)
        reads_range = reads_start:reads_end
        Threads.@spawn FqRecord2StringVec!(outrs, rs, reads_range)
    end
    nothing
end

@inline function FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, reads_range::UnitRange)::Nothing
    @inbounds for i in reads_range
        FqRecord2StringVec!(outrs[i], rs[i])
    end
    nothing
end
"""
    fqwriterecord!(io::IO, outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, stop::Int)

Serialise `rs[1:stop]` into the byte buffers `outrs` and write those `stop`
buffers to `io`.

`outrs` is scratch space that is overwritten (and grown if needed); entries
beyond `stop` may hold data from earlier calls and are not written.
"""
@inline function fqwriterecord!(io::IO, outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, stop::Int)
    FqRecord2StringVec!(outrs, rs, stop)
    # Forward `stop`: every `writebytes` method takes the number of buffers (or
    # a filter) as third argument, and only the first `stop` buffers are fresh.
    writebytes(io, outrs, stop)
end
"""
    fastq_parser(r::FqRecord)

Extract the simulation metadata stored in the header of a read and compare it
with the read's current length.

The header must consist of at least 7 space-separated fields, the first of
which contains `@PeReadSimulator`, e.g.

    @PeReadSimulator2:1:1 TRUE=80 INSERT_SIZE=80 ERROR_RATE=0.00102 SEQ_LENGTH=100 ERROR_INSERT=0 ERROR_ADAPTER=0 SUB=0.001 INS=1.0e-5 DEL=1.0e-5

Otherwise an error is logged and the process exits with status 3.

Returns the tuple `(seq_id, seq_length, insert_size, error_rate, error_insert,
error_adapter, true_length, trimmed_length, delta_length, is_trim_successful)`,
where `delta_length = true_length - trimmed_length` and a read whose sequence
is exactly `N` counts as having length 0.
"""
@inline function fastq_parser(r::FqRecord)
    # Split a copy of the id so the record's own id buffer is left intact.
    fields = split(String(copy(r.id)), " ")

    # read validate
    header_ok = length(fields) >= 7 && occursin("@PeReadSimulator", fields[1])
    if !header_ok
        @error "read simulation stats: read format invalid: reads should be simulated by peReadSimulator and read headers should be intact." invalid_header=String(r.id) _module=nothing _group=nothing _id=nothing _file=nothing
        exit(3)
    end

    # Each field is `KEY=value`; `from` is the index of the first value character.
    int_at(k, from) = parse(Int64, fields[k][from:end])

    seq_id        = fields[1]
    true_length   = int_at(2, 6)                          # TRUE=
    insert_size   = int_at(3, 13)                         # INSERT_SIZE=
    error_rate    = parse(Float64, fields[4][12:end])     # ERROR_RATE=
    seq_length    = int_at(5, 12)                         # SEQ_LENGTH=
    error_insert  = int_at(6, 14)                         # ERROR_INSERT=
    error_adapter = int_at(7, 15)                         # ERROR_ADAPTER=

    # compatible with Atria: a lone "N" stands for a fully trimmed read
    trimmed_length = r.seq == dna"N" ? 0 : length(r.seq)

    delta_length = true_length - trimmed_length
    is_trim_successful = trimmed_length == true_length
    return (seq_id, seq_length, insert_size, error_rate, error_insert, error_adapter, true_length, trimmed_length, delta_length, is_trim_successful)
end
# Copy one FASTQ record (four lines) from `src` to `dest`.
function _copy_fastq_record(dest::IO, src::IO)
    for _ in 1:4
        println(dest, readline(src))
    end
    return nothing
end

"""
    peReadSimulatorStats_main(r1::String, r2::String)

Compute trimming statistics for a pair of FASTQ files.

The two files are interleaved record by record (R1, R2, R1, R2, ...; once one
file is exhausted the rest of the other is appended) into the temporary file
`r1 * ".r12"`, which is then processed by the single-file method
`peReadSimulatorStats_main(::String)`.

Both inputs and the temporary file are closed even if an error occurs, and the
temporary file is removed afterwards whether or not the statistics step
succeeds. Returns `nothing`.
"""
function peReadSimulatorStats_main(r1::String, r2::String)
    tmp_file = r1 * ".r12"

    # The do-block form of `open` closes each stream on exit, including on error.
    open(r1, "r") do io1
        open(r2, "r") do io2
            open(tmp_file, "w+") do io_out
                while !eof(io1) || !eof(io2)
                    eof(io1) || _copy_fastq_record(io_out, io1)
                    eof(io2) || _copy_fastq_record(io_out, io2)
                end
            end
        end
    end

    try
        peReadSimulatorStats_main(tmp_file)
    finally
        # Do not leave the interleaved copy behind when the stats step throws.
        rm(tmp_file; force = true)
    end
    return nothing
end
end 164 | 165 | # table = fastq_parser(input::String) 166 | # generate stat-detail.tsv 167 | stat_detail = open(input * ".stat-detail.tsv", "w+") 168 | 169 | stat_detail_header = "seq_id\tseq_length\tinsert_size\terror_rate\terror_insert\terror_adapter\ttrue_length\ttrimmed_length\tdelta_length\tis_trim_successful" 170 | println(stat_detail, stat_detail_header) 171 | 172 | stat_summary = open(input * ".stat.tsv", "w+") 173 | stat_summary_header = "seq_length\tinsert_size\terror_rate\trepeat\tprecision\trate_overtrim\trate_undertrim\tdeviation\tdeviation_overtrim\tdeviation_undertrim\trate_precision_in1\trate_overtrim_gt1\trate_undertrim_gt1" 174 | println(stat_summary, stat_summary_header) 175 | 176 | ### first read 177 | fqreadrecord!(r::FqRecord, io::IO) 178 | 179 | read_stat = fastq_parser(r) 180 | println(stat_detail, join(read_stat, "\t")) 181 | 182 | (seq_id, seq_length, insert_size, error_rate, error_insert, error_adapter, true_length, trimmed_length, delta_length, is_trim_successful) = read_stat 183 | 184 | # identifier 185 | current_seq_length = seq_length 186 | current_insert_size = insert_size 187 | current_error_rate = error_rate 188 | 189 | # stats 190 | n_repeat = 1 191 | 192 | overtrim_deviations = Vector{Int64}() 193 | undertrim_deviations = Vector{Int64}() 194 | 195 | if delta_length > 0 196 | push!(overtrim_deviations, delta_length) 197 | elseif delta_length < 0 198 | push!(undertrim_deviations, delta_length) 199 | end 200 | 201 | ### other reads 202 | while !eof(io) 203 | fqreadrecord!(r::FqRecord, io::IO) 204 | 205 | read_stat = fastq_parser(r) 206 | println(stat_detail, join(read_stat, "\t")) 207 | 208 | (seq_id, seq_length, insert_size, error_rate, error_insert, error_adapter, true_length, trimmed_length, delta_length, is_trim_successful) = read_stat 209 | 210 | # check identifier 211 | if current_seq_length == seq_length && current_insert_size == insert_size && current_error_rate == error_rate 212 | ### same identifier: append 213 | n_repeat 
+= 1 214 | 215 | if delta_length > 0 216 | push!(overtrim_deviations, delta_length) 217 | elseif delta_length < 0 218 | push!(undertrim_deviations, delta_length) 219 | end 220 | else 221 | ### new identifier: compute stats; refresh variables 222 | # compute stats 223 | stats_results = stats(n_repeat, overtrim_deviations, undertrim_deviations) 224 | stats_results_string = join(Any[current_seq_length, current_insert_size, current_error_rate, n_repeat, stats_results...], "\t") 225 | println(stat_summary, stats_results_string) 226 | 227 | # refresh variables 228 | current_seq_length = seq_length 229 | current_insert_size = insert_size 230 | current_error_rate = error_rate 231 | 232 | n_repeat = 1 233 | 234 | overtrim_deviations = Vector{Int64}() 235 | undertrim_deviations = Vector{Int64}() 236 | 237 | if delta_length > 0 238 | push!(overtrim_deviations, delta_length) 239 | elseif delta_length < 0 240 | push!(undertrim_deviations, delta_length) 241 | end 242 | end 243 | end 244 | 245 | ### compute stats for the last 246 | stats_results = stats(n_repeat, overtrim_deviations, undertrim_deviations) 247 | stats_results_string = join(Any[current_seq_length, current_insert_size, current_error_rate, n_repeat, stats_results...], "\t") 248 | println(stat_summary, stats_results_string) 249 | 250 | ### closing 251 | close(io) 252 | close(stat_detail) 253 | close(stat_summary) 254 | 255 | @info "read simulation stats: output" detail="$input.stat-detail.tsv" summary="$input.stat.tsv" 256 | end 257 | -------------------------------------------------------------------------------- /src/Benchmark/read_simulation.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | # using ArgParse 4 | 5 | function parsing_args_simulate(args; exit_after_help = true) 6 | settings = ArgParseSettings(exit_after_help = exit_after_help) 7 | 8 | add_arg_group!(settings, "output") 9 | @add_arg_table! 
"""
    parsing_args_simulate(args; exit_after_help = true)

Declare the command-line options of the adapter read simulator and parse
`args` with them.

Options are split into an "output" group (`--prefix`) and a "simulation" group
(`--repeat`, `--adapter1`, `--adapter2`, `--seq-length`, `--insert-size-range`,
`--subsitution-rate`, `--insertion-rate`, `--deletion-rate`).
`exit_after_help` is forwarded to `ArgParseSettings`.

Return the result of `parse_args(args, settings)`. The caller in this file
indexes it by long option name without the leading dashes and also checks it
against `nothing`.
"""
function parsing_args_simulate(args; exit_after_help = true)
    settings = ArgParseSettings(exit_after_help = exit_after_help)

    add_arg_group!(settings, "output")
    @add_arg_table! settings begin
        "--prefix", "-o"
            help = "prefix of output fastq files"
            metavar = "PREF"
            default = "read_simulation"
    end

    add_arg_group!(settings, "simulation")
    @add_arg_table! settings begin
        "--repeat", "-x"
            help = "repeat times for each case"
            default = 30000
            arg_type = Int64
        "--adapter1", "-a"
            help = "read 1 adapter"
            metavar = "SEQ"
            default = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
        "--adapter2", "-A"
            help = "read 2 adapter"
            metavar = "SEQ"
            default = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
        "--seq-length", "-s"
            help = "a given sequence length; simulated sequence length might be 1 base more than the value because of simulated phasing error"
            default = 100
            arg_type = Int64
        "--insert-size-range", "-i"
            help = "range of insert size"
            nargs = '+'
            arg_type = Int64
            default = [80:2:120;]  # 80, 82, ..., 120
        # The three rate options below are consumed position-wise by the caller,
        # so they must receive the same number of values.
        "--subsitution-rate", "-S"
            help = "subsitution rate per base. it is random for each base. error type includs mismatch"
            nargs = '+'
            arg_type = Float64
            default = [0.001:0.001:0.005;]
        "--insertion-rate", "-I"
            help = "insertion rate; number of arg should be the same as --subsitution-rate"
            nargs = '+'
            arg_type = Float64
            default = [1.0e-5:1.0e-5:5.0e-5;]
        "--deletion-rate", "-D"
            help = "deletion rate; number of arg should be the same as --subsitution-rate"
            nargs = '+'
            arg_type = Float64
            default = [1.0e-5:1.0e-5:5.0e-5;]
    end
    return parse_args(args, settings)
end
"""
    simulate_insert(insert_size::Int64)

Return a random `String` of `insert_size` characters drawn uniformly from
`A`, `T`, `C`, `G`.
"""
@inline function simulate_insert(insert_size::Int64)
    # Build the string from the vector directly instead of splatting every
    # character into `string(...)`.
    String(rand(['A', 'T', 'C', 'G'], insert_size))
end

"""
    simulate_error(base::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)

Apply at most one random sequencing error to the one-letter string `base`.

Return `(base::String, iserror::Int64)`:

- substitution (probability `sub_rate`): one of the other three bases, `1`;
- insertion (`insert_rate`): `base` followed by one random base, `1`;
- deletion (`del_rate`): `""`, `1`;
- otherwise: `base` unchanged, `0`.
"""
@inline function simulate_error(base::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
    bases = ["A", "T", "C", "G"]

    randfloat = rand()
    if randfloat <= sub_rate
        ## subsitution
        idx_base = findfirst(==(base), bases)
        # Draw among the first three bases; if the draw hits the original base,
        # use the fourth one ("G") instead so the result always differs.
        idx_sub = rand(1:3)
        if idx_base == idx_sub
            return "G", 1
        else
            return bases[idx_sub], 1
        end
    end

    randfloat -= sub_rate
    if randfloat <= insert_rate
        ## insert
        return base * rand(bases), 1
    end

    randfloat -= insert_rate
    if randfloat <= del_rate
        ## deletion
        return "", 1
    end

    ## no error
    return base, 0
end

"""
    simulate_read(insert::String, adapter::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)

Simulate one read: `insert` with random errors, followed by `adapter` with
random errors, followed by random bases, stopping as soon as the read reaches
`seq_length`.

Return `res, true_insert_size, nerror_insert, nerror_adapter`, where
`true_insert_size` is the length of the insert-derived prefix of `res` (a base
inserted after the last insert base is not counted) and the two counters are
the numbers of error events in the insert and adapter parts.
"""
@inline function simulate_read(insert::String, adapter::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)
    res = ""
    nerror_insert = 0
    nerror_adapter = 0

    # simulate insert (iterate characters rather than byte ranges)
    the_base = ""
    for ch in insert
        the_base, iserror = simulate_error(string(ch), sub_rate, insert_rate, del_rate)
        n_the_base = length(the_base)
        res *= the_base
        nerror_insert += iserror
        current_insert_size = length(res)
        if current_insert_size >= seq_length
            if n_the_base <= 1
                return res, current_insert_size, nerror_insert, nerror_adapter
            elseif n_the_base == 2  # error with insert. Inserted part is not of real DNA fragment!
                return res[1:end-1], current_insert_size - 1, nerror_insert, nerror_adapter
            else
                @error "Bugs at simulate_read()" the_base iserror length(res) _module=nothing _group=nothing _id=nothing _file=nothing
            end
        end
    end
    # A base inserted after the last insert base stays in `res` but is not part
    # of the real DNA fragment.
    true_insert_size = length(the_base) == 2 ? length(res) - 1 : length(res)

    # simulate adapter
    for ch in adapter
        the_base, iserror = simulate_error(string(ch), sub_rate, insert_rate, del_rate)
        res *= the_base
        nerror_adapter += iserror
        if length(res) >= seq_length
            return res, true_insert_size, nerror_insert, nerror_adapter
        end
    end

    # pad with random bases up to seq_length
    nrandom = seq_length - length(res)
    res *= simulate_insert(nrandom)

    return res, true_insert_size, nerror_insert, nerror_adapter
end

"""
    complement_char(c::Char)

Return the complement of `A`/`T`/`C`/`G`; any other character maps to `N`.
"""
@inline function complement_char(c::Char)
    if c == 'A'
        'T'
    elseif c == 'T'
        'A'
    elseif c == 'C'
        'G'
    elseif c == 'G'
        'C'
    else
        'N'
    end
end

"""
    reverse_complement(s::String)

Return the reverse complement of `s` as a `String` (see `complement_char`).
"""
@inline function reverse_complement(s::String)
    # `reverse` walks characters, so this does not rely on byte indexing and
    # avoids splatting the whole sequence into `string(...)`.
    map(complement_char, reverse(s))
end

"""
    writeseq(io::IO, header::String, seq::String; error_rate=0.0001)

Write one four-line FASTQ record to `io`. Every base gets the same quality
character: `'J'` when `error_rate < 0.0001`, otherwise the character whose code
is `round(-10 * log10(error_rate)) + 33`.
"""
function writeseq(io::IO, header::String, seq::String; error_rate=0.0001)
    println(io, header)
    println(io, seq)
    println(io, "+")
    qual_char = if error_rate < 0.0001
        'J'
    else
        Char(round(Int, -10 * log10(error_rate)) + 33)
    end
    println(io, qual_char ^ length(seq))
end

"""
    julia_wrapper_simulate(ARGS; exit_after_help = true)

Command-line entry point of the adapter read simulator.

Parse `ARGS` with `parsing_args_simulate` and, for every non-negative insert
size × error-rate set × repeat, write one simulated read pair to
`<prefix>.R1.fastq` and `<prefix>.R2.fastq`. Read 2 is simulated from the
reverse complement of the read 1 insert.

Return `0`. Throw an `ErrorException` when the three rate options have
different lengths or when a summed error rate exceeds 1; the arguments are
validated before any output file is created, and the output files are closed
on every exit path.
"""
function julia_wrapper_simulate(ARGS; exit_after_help = true)
    time0 = time()

    if length(ARGS) == 0
        parsing_args_simulate(["-h"], exit_after_help = exit_after_help)
        return 0
    end
    args = parsing_args_simulate(ARGS, exit_after_help = exit_after_help)
    args === nothing && return 0

    r1 = args["prefix"] * ".R1.fastq"
    r2 = args["prefix"] * ".R2.fastq"

    adapter1 = args["adapter1"]
    adapter2 = args["adapter2"]
    repeat_times = args["repeat"]
    seq_length = args["seq-length"]
    insert_sizes = args["insert-size-range"]

    insert_rates = args["insertion-rate"]
    deletion_rates = args["deletion-rate"]
    subsitution_rates = args["subsitution-rate"]

    # Validate before touching the file system so a bad command line does not
    # truncate existing output files.
    length(insert_rates) == length(deletion_rates) == length(subsitution_rates) ||
        error("ArgumentError: the numbers of args of --subsitution-rate, --insertion-rate, and --deletion-rate should be the same. Abort.")

    error_rates = insert_rates .+ deletion_rates .+ subsitution_rates

    any(error_rates .> 1) &&
        error("ArgumentError: any dot sums of --subsitution-rate, --insertion-rate, and --deletion-rate should be less than one. Abort.")

    @info "read simulation: output files" r1 r2

    open(r1, "w+") do r1_io
        open(r2, "w+") do r2_io
            read_id = 0
            for insert_size in insert_sizes
                insert_size >= 0 || continue  # negative insert sizes are skipped
                for (i_rate, error_rate) in enumerate(error_rates)
                    insert_rate = insert_rates[i_rate]
                    del_rate = deletion_rates[i_rate]
                    sub_rate = subsitution_rates[i_rate]

                    for rep in 1:repeat_times
                        read_id += 1
                        insert = simulate_insert(insert_size)
                        r1_seq, r1_true_insert_size, r1_nerror_insert, r1_nerror_adapter = simulate_read(insert, adapter1, sub_rate, insert_rate, del_rate, seq_length)
                        r2_seq, r2_true_insert_size, r2_nerror_insert, r2_nerror_adapter = simulate_read(reverse_complement(insert), adapter2, sub_rate, insert_rate, del_rate, seq_length)

                        r1_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r1_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r1_nerror_insert ERROR_ADAPTER=$r1_nerror_adapter SUB=$sub_rate INS=$insert_rate DEL=$del_rate"
                        r2_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r2_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r2_nerror_insert ERROR_ADAPTER=$r2_nerror_adapter SUB=$sub_rate INS=$insert_rate DEL=$del_rate"

                        writeseq(r1_io, r1_header, r1_seq, error_rate=error_rate)
                        writeseq(r2_io, r2_header, r2_seq, error_rate=error_rate)
                    end
                end
            end
        end
    end

    @info "read simulation: all done" elapsed=time() - time0
    return 0
end
"""
    parsing_args_simulate(args; exit_after_help = true)

Declare the command-line options of the primer read simulator and parse `args`
with them. The options match the adapter simulator except that the adapter
options are replaced by `--primer1` / `--primer2`.

Return the result of `parse_args(args, settings)`.
"""
function parsing_args_simulate(args; exit_after_help = true)
    settings = ArgParseSettings(exit_after_help = exit_after_help)

    add_arg_group!(settings, "output")
    @add_arg_table! settings begin
        "--prefix", "-o"
            help = "prefix of output fastq files"
            metavar = "PREF"
            default = "read_simulation"
    end

    add_arg_group!(settings, "simulation")
    @add_arg_table! settings begin
        "--repeat", "-x"
            help = "repeat times for each case"
            default = 30000
            arg_type = Int64
        # The default read 1 primer contains the letters H, R and Y in addition
        # to A/C/G/T.
        "--primer1", "-a"
            help = "read 1 primer"
            metavar = "SEQ"
            default = "AHCGATGAAGAACRYAG"
        "--primer2", "-A"
            help = "read 2 primer"
            metavar = "SEQ"
            default = "CTTATTGATATGCTTAAGTTCAG"
        "--seq-length", "-s"
            help = "a given sequence length; simulated sequence length might be 1 base more than the value because of simulated phasing error"
            default = 100
            arg_type = Int64
        "--insert-size-range", "-i"
            help = "range of insert size"
            nargs = '+'
            arg_type = Int64
            default = [80:2:120;]
        "--subsitution-rate", "-S"
            help = "subsitution rate per base. it is random for each base. error type includs mismatch"
            nargs = '+'
            arg_type = Float64
            default = [0.001:0.001:0.005;]
        "--insertion-rate", "-I"
            help = "insertion rate; number of arg should be the same as --subsitution-rate"
            nargs = '+'
            arg_type = Float64
            default = [1.0e-5:1.0e-5:5.0e-5;]
        "--deletion-rate", "-D"
            help = "deletion rate; number of arg should be the same as --subsitution-rate"
            nargs = '+'
            arg_type = Float64
            default = [1.0e-5:1.0e-5:5.0e-5;]
    end
    return parse_args(args, settings)
end


"""
    simulate_insert(insert_size::Int64)

Return the result of `randdnaseq(insert_size)`.
"""
@inline function simulate_insert(insert_size::Int64)
    randdnaseq(insert_size)
end

"""
    simulate_error(base::DNA, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)

Apply at most one random error to `base` and return `(result, iserror::Int64)`.

The substitution, deletion and no-error branches return a single `DNA` symbol
(`DNA_Gap` for a deletion); the insertion branch returns `base * rand(bases)`.
"""
@inline function simulate_error(base::DNA, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
    bases = [DNA_A, DNA_T, DNA_C, DNA_G]

    randfloat = rand()
    if randfloat <= sub_rate
        ## subsitution
        idx_base = findfirst(x -> x == base, bases)
        idx_sub = rand(1:3)
        if idx_base == idx_sub
            return DNA_G, 1  # bases[4] is DNA_G: used when the draw equals the original base
        else
            return bases[idx_sub], 1
        end
    else
        randfloat -= sub_rate
        if randfloat <= insert_rate
            ## insert
            # NOTE(review): `*` is applied to two `DNA` symbols here; confirm a
            # method exists and what type it returns.
            res = base * rand(bases)
            return res, 1
        else
            randfloat -= insert_rate
            if randfloat <= del_rate
                ## deletion
                return DNA_Gap, 1
            end
            ## no error
            return base, 0
        end
    end
end

"""
    simulate_read(insert::String, primer_head::LongDNA{4}, primer_tail::LongDNA{4}, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)

Return `res, true_insert_size, nerror_insert, nerror_primer`.

NOTE(review): this function looks unfinished. The body mirrors the adapter
simulator's `simulate_read`, but it reads `primer` and `nerror_primer`, which
are not assigned anywhere in this function, and never uses `primer_head`,
`primer_tail`, `nerror_primer_head` or `nerror_primer_tail`.
"""
@inline function simulate_read(insert::String, primer_head::LongDNA{4}, primer_tail::LongDNA{4}, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)
    # NOTE(review): this binds `res` to the type `LongDNA{4}`, not to an empty
    # sequence (`LongDNA{4}()`); `res *= ...` below operates on that type object.
    res = LongDNA{4}
    ninsert = length(insert)
    nerror_insert = 0
    nerror_primer_head = 0
    nerror_primer_tail = 0





    # simulate insert
    the_base = ""
    for i in 1:ninsert
        # NOTE(review): `insert[i:i]` is a `String` slice, while the only
        # `simulate_error` method visible in this file takes `base::DNA`.
        the_base, iserror = simulate_error(insert[i:i], sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
        n_the_base = length(the_base)
        res *= the_base
        nerror_insert += iserror
        current_insert_size = length(res)
        if current_insert_size >= seq_length
            if n_the_base <= 1
                return res, current_insert_size, nerror_insert, nerror_primer
            elseif n_the_base == 2  # error with insert. Inserted part is not of real DNA fragment!
                return res[1:end-1], current_insert_size - 1, nerror_insert, nerror_primer
            else
                @error "Bugs at simulate_read()" the_base iserror length(res) _module=nothing _group=nothing _id=nothing _file=nothing
            end
        end
    end
    if length(the_base) == 2  # error with insert. Inserted part is not of real DNA fragment!
        true_insert_size = length(res) - 1
    else
        true_insert_size = length(res)
    end

    # NOTE(review): `primer` is not a parameter or local of this function.
    nprimer = length(primer)
    for i in 1:nprimer
        the_base, iserror = simulate_error(primer[i:i], sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
        res *= the_base
        nerror_primer += iserror
        if length(res) >= seq_length
            return res, true_insert_size, nerror_insert, nerror_primer
        end
    end

    # pad with random bases up to seq_length
    nrandom = seq_length - length(res)
    res *= simulate_insert(nrandom)

    return res, true_insert_size, nerror_insert, nerror_primer
end

"""
    complement_char(c::Char)

Return the complement of `A`/`T`/`C`/`G`; any other character maps to `N`.
"""
@inline function complement_char(c::Char)
    if c == 'A'
        'T'
    elseif c == 'T'
        'A'
    elseif c == 'C'
        'G'
    elseif c == 'G'
        'C'
    else
        'N'
    end
end

"""
    reverse_complement(s::String)

Return the reverse complement of `s`, built by indexing `s` from `lastindex(s)`
down to 1 and mapping each character through `complement_char`.
"""
@inline function reverse_complement(s::String)
    ns = lastindex(s::String)
    char_vec = map(x -> complement_char(s[x]), ns:-1:1)
    string(char_vec...)
end

"""
    writeseq(io::IO, header::String, seq::String; error_rate=0.0001)

Write one four-line FASTQ record to `io`. Every base gets the same quality
character: `'J'` when `error_rate < 0.0001`, otherwise the character whose code
is `round(-10 * log10(error_rate)) + 33`.
"""
function writeseq(io::IO, header::String, seq::String; error_rate=0.0001)
    println(io, header)
    println(io, seq)
    println(io, "+")
    qual_char = if error_rate < 0.0001
        'J'
    else
        Char(round(Int, -10 * log10(error_rate)) + 33)
    end
    println(io, qual_char ^ length(seq))
end

"""
    julia_wrapper_simulate(ARGS; exit_after_help = true)

Command-line entry point of the primer read simulator: parse `ARGS`, then for
every non-negative insert size × error-rate set × repeat write one read pair
to `<prefix>.R1.fastq` and `<prefix>.R2.fastq`. Return `0`.

NOTE(review): the simulation loop reads `primer1_rc` and `primer2_rc`, which
are never assigned in this function.
"""
function julia_wrapper_simulate(ARGS; exit_after_help = true)
    time0 = time()

    if length(ARGS) == 0
        parsing_args_simulate(["-h"], exit_after_help = exit_after_help)
        return 0
    end
    args = parsing_args_simulate(ARGS, exit_after_help = exit_after_help)
    args === nothing && return 0

    r1 = args["prefix"] * ".R1.fastq"
    r2 = args["prefix"] * ".R2.fastq"

    # NOTE(review): the files are opened before the arguments are validated and
    # are only closed at the end of the normal path.
    r1_io = open(r1, "w+")
    r2_io = open(r2, "w+")


    @info "read simulation: output files" r1 r2

    primer1 = args["primer1"]
    primer2 = args["primer2"]
    repeat_times = args["repeat"]
    seq_length = args["seq-length"]
    insert_sizes = args["insert-size-range"]

    insert_rates = args["insertion-rate"]
    deletion_rates = args["deletion-rate"]
    subsitution_rates = args["subsitution-rate"]

    # The three rate vectors are combined element-wise, so they must have the same length.
    length(insert_rates) == length(deletion_rates) == length(subsitution_rates) ||
        error("ArgumentError: the numbers of args of --subsitution-rate, --insertion-rate, and --deletion-rate should be the same. Abort.")

    error_rates = insert_rates .+ deletion_rates .+ subsitution_rates

    any(error_rates .> 1) &&
        error("ArgumentError: any dot sums of --subsitution-rate, --insertion-rate, and --deletion-rate should be less than one. Abort.")


    # Not read again in this function.
    read_pair_count = repeat_times * length(insert_sizes) * length(error_rates)

    read_id = 0
    for insert_size in insert_sizes
        if insert_size >= 0
            for (i_rate, error_rate) in enumerate(error_rates)
                insert_rate = insert_rates[i_rate]
                del_rate = deletion_rates[i_rate]
                sub_rate = subsitution_rates[i_rate]

                for rep in 1:repeat_times
                    read_id += 1
                    insert = simulate_insert(insert_size::Int64)
                    # NOTE(review): `insert` comes from `randdnaseq` and `primer1`/`primer2`
                    # come straight from the parsed arguments, while `simulate_read` is
                    # declared as (insert::String, ::LongDNA{4}, ::LongDNA{4}, ...) and
                    # `reverse_complement` as (s::String); confirm the intended types.
                    r1_seq, r1_true_insert_size, r1_nerror_insert, r1_nerror_primer = simulate_read(insert, primer1, primer2_rc, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)
                    r2_seq, r2_true_insert_size, r2_nerror_insert, r2_nerror_primer = simulate_read(reverse_complement(insert), primer2, primer1_rc, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)

                    # The seventh header field is written as `ERROR_primer=`.
                    r1_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r1_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r1_nerror_insert ERROR_primer=$r1_nerror_primer SUB=$sub_rate INS=$insert_rate DEL=$del_rate"
                    r2_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r2_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r2_nerror_insert ERROR_primer=$r2_nerror_primer SUB=$sub_rate INS=$insert_rate DEL=$del_rate"

                    writeseq(r1_io, r1_header, r1_seq, error_rate=error_rate)
                    writeseq(r2_io, r2_header, r2_seq, error_rate=error_rate)
                end
            end
        end
    end
    close(r1_io)
    close(r2_io)
    @info "read simulation: all done" elapsed=time() - time0
    return 0
end