├── test
    ├── BioBits
    ├── FqRecords
    ├── AtriaTest.jl
    ├── runtests.jl
    └── trimmer_and_benchmark.jl
├── docs
    ├── logo.png
    ├── Figure 2 Speed.png
    ├── Figure 1 Simulation Accuracy2.png
    ├── 4.Development_notes.md
    ├── 1.1.Release_installation_guide.md
    ├── 1.2.Install_from_source.md
    └── 5.Accuracy_and_speed_benchmark.md
├── .gitignore
├── src
    ├── AtriaTest
    │   ├── FqRecords
    │   │   ├── runtests.jl
    │   │   └── primer_match.jl
    │   ├── BioBits
    │   │   ├── runtests.jl
    │   │   ├── get_seq.jl
    │   │   ├── algorithm_basis.jl
    │   │   ├── bit_match.jl
    │   │   └── biosequences_safety.jl
    │   ├── AtriaTest.jl
    │   └── trimmer_and_benchmark.jl
    ├── atria
    ├── Benchmark
    │   ├── Benchmark.jl
    │   ├── rand_trim.jl
    │   ├── read_stats.jl
    │   ├── read_simulation.jl
    │   └── read_simulation_primer.jl
    ├── atria_profile
    ├── BioBits
    │   ├── BioBits.jl
    │   ├── biosequences_safety.jl
    │   ├── insert_size_decision.jl
    │   └── get_seq.jl
    ├── Trimmer
    │   ├── Trimmer.jl
    │   ├── thread_trim.jl
    │   ├── wrapper_detect_adapter_se.jl
    │   ├── markdown_help.jl
    │   └── wrapper_detect_adapter_pe.jl
    ├── FqRecords
    │   ├── adapter_match_se.jl
    │   ├── FqRecords.jl
    │   ├── copy.jl
    │   ├── interface.jl
    │   ├── quality.jl
    │   ├── basic_io.jl
    │   ├── pcr_dedup.jl
    │   ├── util.jl
    │   ├── consensus.jl
    │   ├── check_and_trim.jl
    │   └── thread_output.jl
    └── Atria.jl
├── benchmark
    ├── atria-simulate-main.bash
    ├── real-data-time.bash
    ├── aln2len.pl
    ├── atria-simulate.bash
    ├── replicates-stats.jl
    ├── art-simulate-main.bash
    ├── real-data-rnaseq.bash
    ├── real-data-human.bash
    ├── atria-similate-for-atria-only.bash
    ├── evalTrimming.pl
    ├── time_stats.jl
    ├── time_stats_plot.R
    └── trimming-functions.bash
├── adapter.known.txt
├── Project.toml
├── README.md
├── LICENSE.md
└── CHANGELOG.md


/test/BioBits:
--------------------------------------------------------------------------------
1 | ../src/AtriaTest/BioBits


--------------------------------------------------------------------------------
/test/FqRecords:
--------------------------------------------------------------------------------
1 | ../src/AtriaTest/FqRecords


--------------------------------------------------------------------------------
/test/AtriaTest.jl:
--------------------------------------------------------------------------------
1 | ../src/AtriaTest/AtriaTest.jl


--------------------------------------------------------------------------------
/test/runtests.jl:
--------------------------------------------------------------------------------
1 | # Package test entry point: load Atria and run the test suite it ships with.
2 | using Atria
3 | 
4 | Atria.test_atria()
5 | 


--------------------------------------------------------------------------------
/test/trimmer_and_benchmark.jl:
--------------------------------------------------------------------------------
1 | ../src/AtriaTest/trimmer_and_benchmark.jl


--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cihga39871/Atria/HEAD/docs/logo.png


--------------------------------------------------------------------------------
/docs/Figure 2 Speed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cihga39871/Atria/HEAD/docs/Figure 2 Speed.png


--------------------------------------------------------------------------------
/docs/Figure 1 Simulation Accuracy2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cihga39871/Atria/HEAD/docs/Figure 1 Simulation Accuracy2.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.jl.cov
 2 | *.jl.*.cov
 3 | *.jl.mem
 4 | deps/deps.jl
 5 | .Rproj.user
 6 | .*DS_Store
 7 | nohup.out
 8 | bin/*
 9 | lib*/*
10 | tmp*/*
11 | .vscode/
12 | app*/
13 | app
14 | /atria-*
15 | Atria*
16 | Manifest.toml
17 | 


--------------------------------------------------------------------------------
/src/AtriaTest/FqRecords/runtests.jl:
--------------------------------------------------------------------------------
 1 | include("fq_records.jl")
 2 | include("primer_match.jl")
 3 | 
 4 | """
 5 |     test_fq_records()
 6 | 
 7 | Run the FqRecords test group inside its own `@testset`.
 8 | """
 9 | @noinline function test_fq_records()
10 |     # The label names this group so its results are not confused with the
11 |     # separate "BioBits" group in the test summary.
12 |     @testset "FqRecords" begin
13 |         test_fq_records_basis()
14 |         # test_primer_match()  # disabled: primer_match.jl is entirely commented out
15 |     end
16 | end
17 | 


--------------------------------------------------------------------------------
/src/atria:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env julia
 2 | # Development launcher: runs Atria from the source tree instead of a compiled binary.
 3 | @info "Atria without precompilation. It may take a while to precompile."
 4 | 
 5 | using Pkg
 6 | # Activate the project one directory above this script and install its dependencies.
 7 | Pkg.activate(dirname(@__DIR__))
 8 | Pkg.instantiate()
 9 | # Load the top-level module from the file sitting next to this script.
10 | include(joinpath(@__DIR__, "Atria.jl"))
11 | # Call Atria's `julia_main` entry point.
12 | Atria.julia_main()
13 | 


--------------------------------------------------------------------------------
/docs/4.Development_notes.md:
--------------------------------------------------------------------------------
 1 | ## Development
 2 | 
 3 | ### Run `Atria` directly (development only)
 4 | Running the launcher script directly is an easy way to debug Atria without building binaries (`$atria` stands for the path to your Atria source checkout):
 5 | 
 6 | ```sh
 7 | # replace ARGS... with Atria arguments
 8 | julia -O3 -i --check-bounds=yes --color=yes $atria/src/atria ARGS...
 9 | ```
10 | 


--------------------------------------------------------------------------------
/benchmark/atria-simulate-main.bash:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Run the Atria read simulation for several adapter lengths, then plot the statistics.
 3 | #
 4 | # Requires:
 5 | #   $atria  - path to the Atria source tree (provides benchmark/atria-simulate.bash)
 6 | #   atria   - the Atria executable on PATH (used for `statplot`)
 7 | 
 8 | # Fail early with a clear message instead of trying to run /benchmark/atria-simulate.bash.
 9 | : "${atria:?set \$atria to the Atria source directory}"
10 | 
11 | for i in 16 20 24 28 33
12 | do
13 |     # The banner is printed three times, presumably so it stands out in long logs.
14 |     echo "Start: adapter length $i"
15 |     echo "Start: adapter length $i"
16 |     echo "Start: adapter length $i"
17 |     bash "$atria/benchmark/atria-simulate.bash" "$i"
18 | done
19 | 
20 | working_dir=~/analysis/atria-benchmark/atria_simulate
21 | 
22 | # Do not run statplot from the wrong directory when the results directory is missing.
23 | cd "$working_dir" || exit 1
24 | 
25 | atria statplot -i auto -l DIR2
26 | 


--------------------------------------------------------------------------------
/src/AtriaTest/BioBits/runtests.jl:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # include("insert_size_decision.jl")
 4 | include("algorithm_basis.jl")
 5 | include("biosequences_safety.jl")
 6 | include("get_seq.jl")
 7 | include("bit_match.jl")
 8 | # Run the four BioBits test groups included above inside one "BioBits" testset.
 9 | @noinline function test_bio_bits()
10 |     @testset "BioBits" begin
11 |         test_algorithm_basis()
12 |         test_biosequences_safety()
13 |         test_get_seq()
14 |         test_bit_match()
15 |     end
16 | end
17 | 


--------------------------------------------------------------------------------
/src/Benchmark/Benchmark.jl:
--------------------------------------------------------------------------------
 1 | # Benchmark submodule: exports the wrapper entry points listed below; the included files presumably define them.
 2 | module Benchmark
 3 | 
 4 | export julia_wrapper_simulate,
 5 | julia_wrapper_randtrim,
 6 | julia_wrapper_readstat,
 7 | julia_wrapper_rscript,
 8 | statplot_code
 9 | 
10 | using ArgParse
11 | using Statistics
12 | 
13 | using ..BioBits.BioSymbols
14 | using ..BioBits.BioSequences
15 | using ..FqRecords
16 | 
17 | include("read_simulation.jl")
18 | include("rand_trim.jl")
19 | include("read_stats.jl")
20 | include("external_code.jl")
21 | 
22 | end
23 | 


--------------------------------------------------------------------------------
/src/atria_profile:
--------------------------------------------------------------------------------
 1 | #! julia -i
 2 | # Profiling helper for development: loads Atria from source and profiles `julia_main` on fixed arguments.
 3 | @info "Atria without precompilation. It may take a while to precompile."
 4 | 
 5 | include(joinpath(@__DIR__, "Atria.jl"))
 6 | # NOTE(review): nothing loaded here defines `@profview` while the next line stays commented out; presumably the editor/REPL session supplies it — confirm.
 7 | # using ProfileView
 8 | # Replace the command-line arguments with a fixed paired-end run (machine-specific absolute paths).
 9 | empty!(ARGS)
10 | append!(ARGS, ["-r", "/home/jc/analysis/atria-benchmark/julia1.8.5/reads_diff_indel.R1.fastq", "-R", "/home/jc/analysis/atria-benchmark/julia1.8.5/reads_diff_indel.R2.fastq", "-t", "8", "-o", "/home/jc/analysis/atria-benchmark/julia1.8.5/outprofile", "-f"])
11 | # Profiled twice; presumably the first run absorbs compilation time — confirm.
12 | @profview Atria.julia_main()
13 | @profview Atria.julia_main()
14 | 


--------------------------------------------------------------------------------
/src/BioBits/BioBits.jl:
--------------------------------------------------------------------------------
 1 | # BioBits submodule: re-exports BioSymbols/BioSequences and exports the helper names listed after each include.
 2 | module BioBits
 3 | 
 4 | using Reexport
 5 | 
 6 | @reexport using BioSymbols
 7 | @reexport using BioSequences
 8 | 
 9 | # include("biosequences_safety.jl")
10 | # export bitsafe!, 
11 | # isbitsafe
12 | # NOTE(review): `bitsafe!` is still used by the FqRecord constructor and by the tests although its include/export is disabled above; presumably defined elsewhere — confirm.
13 | include("get_seq.jl")
14 | export N2gap,
15 | SeqHead,
16 | SeqHeadSet,
17 | get_pointer,
18 | get_unsafe_index_of_last_bitseq,
19 | unsafe_bitseq,
20 | bin
21 | 
22 | include("bit_match.jl")
23 | export MatchRes, 
24 | bitwise_scan,
25 | _bitwise_scan_fullseq,
26 | bitwise_scan_rc!,
27 | bitwise_scan_rc
28 | 
29 | include("insert_size_decision.jl")
30 | export insert_size_decision,
31 | insert_size_decision_separate,
32 | is_false_positive,
33 | one_bp_check
34 | 
35 | end
36 | 


--------------------------------------------------------------------------------
/src/AtriaTest/AtriaTest.jl:
--------------------------------------------------------------------------------
 1 | # AtriaTest submodule: bundles the test files and exports `test_atria`, which runs all of them.
 2 | module AtriaTest
 3 | export test_atria
 4 | 
 5 | using Test
 6 | 
 7 | using ..BioBits
 8 | using ..BioBits.BioSymbols
 9 | using ..BioBits.BioSequences
10 | using ..FqRecords
11 | using ..Trimmer
12 | using ..Benchmark
13 | using ...Atria
14 | 
15 | #=
16 | using Test
17 | using .Atria
18 | using .Atria.BioBits
19 | using .Atria.BioBits.BioSymbols
20 | using .Atria.BioBits.BioSequences
21 | using .Atria.FqRecords
22 | using .Atria.Trimmer
23 | using .Atria.Benchmark
24 | =#
25 | 
26 | include(joinpath("BioBits", "runtests.jl"))
27 | include(joinpath("FqRecords", "runtests.jl"))
28 | include("trimmer_and_benchmark.jl")
29 | # Run the three test groups under one "Atria" testset; returns `true` if execution reaches the end.
30 | @noinline function test_atria()
31 |     @testset "Atria" begin
32 |         test_bio_bits()
33 |         test_fq_records()
34 |         test_trimmer_and_benchmark()
35 |     end
36 |     true
37 | end
38 | 
39 | end
40 | 


--------------------------------------------------------------------------------
/adapter.known.txt:
--------------------------------------------------------------------------------
 1 | AAGTCGGAGGCCAAGC
 2 | AAGTCGGATCGTAGCC
 3 | AATGATACGGCGACCA
 4 | ACACTCTTTCCCTACA
 5 | AGATCGGAAGAGCACA
 6 | AGATCGGAAGAGCGGT
 7 | AGATCGGAAGAGCGTC
 8 | AGATCGGAAGAGCTCG
 9 | CAAGCAGAAGACGGCA
10 | CCACTACGCCTCCGCT
11 | CCGACAGGTTCAGAGT
12 | CCGAGCCCACGAGACA
13 | CCGAGCCCACGAGACC
14 | CCGAGCCCACGAGACG
15 | CCGAGCCCACGAGACT
16 | CGACAGGTTCAGAGTT
17 | CGGTCTCGGCATTCCT
18 | CTAATACGACTCACTA
19 | CTGAGCGGGCTGGCAA
20 | CTGATGGCGCGAGGGA
21 | CTGCCCCGGGTTCCTC
22 | CTGTCTCTTATACACA
23 | GACGCTGCCGACGAAC
24 | GACGCTGCCGACGAAG
25 | GACGCTGCCGACGAAT
26 | GACGCTGCCGACGACG
27 | GACGCTGCCGACGACT
28 | GACGCTGCCGACGAGC
29 | GACGCTGCCGACGATA
30 | GACGCTGCCGACGATC
31 | GATCGGAAGAGCACAC
32 | GATCGGAAGAGCGGTT
33 | GATCGGAAGAGCGTCG
34 | GATCGGAAGAGCTCGT
35 | GATCGTCGGACTGTAG
36 | GTCTCGTGGGCTCGGA
37 | GTGACTGGAGTTCAGA
38 | TACACTCTTTCCCTAC
39 | TCGGACTGTAGAACTC
40 | TCGTCGGCAGCGTCAG
41 | TGGAATTCTCGGGTGC
42 | 


--------------------------------------------------------------------------------
/src/AtriaTest/BioBits/get_seq.jl:
--------------------------------------------------------------------------------
 1 | # Exercises N2gap, SeqHeadSet, get_pointer and unsafe_bitseq on small fixed sequences.
 2 | @noinline function test_get_seq()
 3 |     @testset "get seq" begin
 4 |         a = dna"NNNNATCGNNSANNNNNNNNNNNN" |> bitsafe!
 5 |         # N2gap applied to each data word turns every N into a gap; other symbols (including S) are kept.
 6 |         a.data = N2gap.(a.data)
 7 |         @test a == dna"----ATCG--SA------------"
 8 |         # Smoke checks: SeqHeadSet is only constructed here, for a 21-base and for an empty sequence.
 9 |         a = dna"ATCGACTGCGTACGTACGTAC" |> bitsafe!
10 |         SeqHeadSet(a)
11 | 
12 |         b = dna"" |> bitsafe!
13 |         SeqHeadSet(b)
14 |         # The integer type of the first argument sets the load width; expected words hold one 4-bit code per base, lowest nibble first (A=1, T=8, C=2, G=4).
15 |         pa = get_pointer(0x00, a)
16 |         @test unsafe_load(pa) == 0x81
17 | 
18 |         pa = get_pointer(0x0000, a)
19 |         @test unsafe_load(pa) == 0x4281
20 | 
21 |         pa = get_pointer(0x00000000, a)
22 |         @test unsafe_load(pa) == 0x48214281
23 | 
24 |         pa = get_pointer(0x0000000000000000, a)
25 |         @test unsafe_load(pa) == 0x1842184248214281
26 |         # unsafe_bitseq(pa, i) reads the word starting at base i (index 2 is the same word shifted by one 4-bit base); the 3-argument form presumably also returns the count of valid bases — confirm.
27 |         @test unsafe_bitseq(pa, 1) == 0x1842184248214281
28 |         @test unsafe_bitseq(pa, 2) == 0x0184218424821428
29 |         @test unsafe_bitseq(pa, 21, 21) == (0x0000000000000002, 1)
30 |     end
31 | end


--------------------------------------------------------------------------------
/src/AtriaTest/BioBits/algorithm_basis.jl:
--------------------------------------------------------------------------------
 1 | # Pins the platform integer sizes and the LongDNA{4} memory layout (testset "algorithm_basis").
 2 | @noinline function test_algorithm_basis()
 3 |     @testset "algorithm_basis" begin
 4 |         # 64-bit platform and fixed-width integer sizes.
 5 |         @test UInt === UInt64
 6 |         @test Int === Int64
 7 |         @test sizeof(UInt64) == 8
 8 |         @test sizeof(UInt32) == 4
 9 |         @test sizeof(UInt16) == 2
10 |         @test sizeof(UInt8) == 1
11 |     
12 |         # A 34-base sequence at 4 bits per base, so its data spans three UInt64 words.
13 |         seq = dna"ANATATATATATATGGANNNNATATATNNNGGGG"
14 |     
15 |         @test typeof(seq) === LongDNA{4}
16 |         @test typeof(seq) === LongSequence{DNAAlphabet{4}}
17 |     
18 |         @test typeof(seq.data) === Array{UInt64,1}
19 |         @test typeof(seq.len) === UInt
20 |     
21 |         @test seq.data == UInt64[0x44818181818181f1,
22 |                                  0x44fff818181ffff1,
23 |                                  0x0000000000000044]
24 |         # unsafe_load(p, 2) indexes by element; p + 1 advances one byte, so that load straddles words 1 and 2.
25 |         p_seq = pointer(seq.data)
26 |         @test unsafe_load(p_seq, 2) == 0x44fff818181ffff1
27 |         @test unsafe_load(p_seq + 1) == 0xf144818181818181
28 |     
29 |     end    
30 | end
31 | 


--------------------------------------------------------------------------------
/src/Trimmer/Trimmer.jl:
--------------------------------------------------------------------------------
 1 | # Trimmer submodule: re-exports its dependencies plus the sibling BioBits/FqRecords modules, records the package version and loads the implementation files.
 2 | module Trimmer
 3 | 
 4 | export julia_wrapper_atria_pe,
 5 | julia_wrapper_atria_se,
 6 | julia_wrapper_detect_adapter_se,
 7 | julia_wrapper_detect_adapter_pe,
 8 | sub_procs, sub_procs_single_end,
 9 | atria_markdown_help,
10 | processing_reads!,
11 | processing_reads_range!,
12 | processing_reads_threads!,
13 | parsing_args,
14 | args_range_test,
15 | get_quality_offset,
16 | get_length_range,
17 | f_procs
18 | 
19 | using Reexport
20 | 
21 | @reexport using ArgParse
22 | @reexport using BioSymbols
23 | @reexport using BioSequences
24 | @reexport using Distributed
25 | @reexport using Logging
26 | @reexport using JSON
27 | @reexport using DataStructures
28 | @reexport using Printf
29 | @reexport using Markdown
30 | @reexport using PrettyTables
31 | @reexport using DataFrames
32 | @reexport using CSV
33 | @reexport using Dates
34 | @reexport using Statistics
35 | 
36 | @reexport using ..BioBits
37 | @reexport using ..FqRecords
38 | # Version string "v<version>" taken from `Pkg.project()` when this file is evaluated. NOTE(review): that is the active project, presumably Atria's own — confirm.
39 | using Pkg
40 | const atria_version = @eval($(string("v", Pkg.project().version)))
41 | 
42 | include("markdown_help.jl")
43 | include("args.jl")
44 | include("thread_trim.jl")
45 | include("wrapper_pe.jl")
46 | include("wrapper_se.jl")
47 | include("detect_adapter.jl")
48 | include("wrapper_detect_adapter_se.jl")
49 | include("wrapper_detect_adapter_pe.jl")
50 | end
51 | 


--------------------------------------------------------------------------------
/src/FqRecords/adapter_match_se.jl:
--------------------------------------------------------------------------------
 1 | 
 2 | # Scan read `r1` for one adapter and report the insert size to keep.
 3 | # Returns the insert size (never below 0) when the match score exceeds `trim_score`,
 4 | # otherwise typemax(Int64), which callers treat as "no trim".
 5 | @inline function adapter_match_se(adapter1_seqheadset::SeqHeadSet,
 6 |                            r1::FqRecord,
 7 |                            kmer_tolerance::Int64,
 8 |                            trim_score::Float64)
 9 |     hit = bitwise_scan(adapter1_seqheadset, r1.seq, 1, kmer_tolerance)
10 |     # Score the window from the hit index to 15 positions past it.
11 |     compute_prob_and_score!(hit, r1, hit.idx, hit.idx + 15)
12 | 
13 |     # Score not above the threshold: report the "no trim" sentinel.
14 |     hit.score > trim_score || return typemax(Int64)
15 | 
16 |     # Everything before the hit index is kept as the insert.
17 |     # idx - 1 can be -1, in which case nothing is kept.
18 |     insert_size = hit.idx - 1
19 |     return insert_size >= 0 ? insert_size : 0
20 | end
21 | # Try each adapter in turn against `r1` and return the smallest result of the single-adapter method.
22 | @inline function adapter_match_se(adapter1_seqheadsets::Vector{SeqHeadSet},
23 |                            r1::FqRecord,
24 |                            kmer_tolerance::Int64,
25 |                            trim_score::Float64)
26 |     # Start from the "no trim" sentinel so any real match replaces it.
27 |     best = typemax(Int64)
28 |     for seqheadset in adapter1_seqheadsets
29 |         candidate = adapter_match_se(seqheadset, r1, kmer_tolerance, trim_score)
30 |         # Only a strictly smaller value replaces the current best.
31 |         candidate < best || continue
32 |         best = candidate
33 | 
34 |         # 0 cannot be improved on, so stop scanning the remaining adapters.
35 |         best == 0 && break
36 |     end
37 | 
38 |     return best
39 | end


--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
 1 | name = "Atria"
 2 | uuid = "226cbef3-b485-431c-85c2-d8bd8da14025"
 3 | authors = ["Jiacheng Chuan <jiacheng_chuan@outlook.com>"]
 4 | version = "4.1.4"
 5 | 
 6 | [deps]
 7 | ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
 8 | BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 9 | BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
10 | CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
11 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
12 | DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
13 | Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
14 | DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
15 | Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
16 | JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
17 | Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
18 | Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
19 | PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
20 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
21 | PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
22 | Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
23 | Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
24 | Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
25 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
26 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
27 | 
28 | [compat]
29 | ArgParse = "1"
30 | BioSequences = "= 3.1.6"
31 | BioSymbols = "5"
32 | CSV = "^0.10"
33 | DataFrames = "1"
34 | DataStructures = "^0.18"
35 | JSON = "^0.21"
36 | PackageCompiler = "2"
37 | PrettyTables = "2"
38 | julia = "1.8"
39 | 


--------------------------------------------------------------------------------
/src/FqRecords/FqRecords.jl:
--------------------------------------------------------------------------------
 1 | # FqRecords submodule: the FqRecord type plus the reading, writing, trimming, adapter-matching and dedup names exported below, loaded from the included files.
 2 | module FqRecords
 3 | 
 4 | export FqRecord,
 5 | qualpval,
 6 | qualprob,
 7 | update_prob_from_qual,
 8 | probsum,
 9 | probmean,
10 | copyto!,
11 | safe_copyto!,
12 | fqreadrecord,
13 | fqreadrecord!,
14 | fqwriterecord,
15 | check_identifier,
16 | throw_identifier_error,
17 | iscomplement,
18 | load_fqs_threads!,
19 | read_chunks!,
20 | StringChunk2FqRecord!,
21 | chunk_sizes,
22 | get_ideal_inbyte_sizes,
23 | get_ideal_inbyte_sizes!,
24 | write_fqs_threads!,
25 | isinreadlength!,
26 | count_N,
27 | isnotmuchN!,
28 | front_trim!,
29 | tail_trim!,
30 | tail_N_trim!,
31 | tail_low_qual_trim!,
32 | qualitymatch,
33 | seq_complexity,
34 | polyX_tail_scan,
35 | pe_consensus!
36 | 
37 | using Reexport
38 | 
39 | @reexport using Base.Threads
40 | using ..BioBits
41 | using ..BioBits.BioSymbols
42 | using ..BioBits.BioSequences
43 | 
44 | include("interface.jl")
45 | export TrimStats
46 | 
47 | include("quality.jl")
48 | export compute_prob_and_score!
49 | 
50 | include("copy.jl")
51 | include("basic_io.jl")
52 | include("util.jl")
53 | include("consensus.jl")
54 | 
55 | include("thread_input.jl")
56 | include("thread_output.jl")
57 | 
58 | include("check_and_trim.jl")
59 | 
60 | include("adapter_match_se.jl")
61 | export adapter_match_se
62 | 
63 | include("adapter_match_pe.jl")
64 | export adapter_match_and_trim_pe!,
65 | adapter_match_pe, 
66 | PEOptions, AdapterPERes
67 | 
68 | include("pcr_dedup.jl")
69 | export DupCount, get_dup_count, pcr_dedup, write_pcr_dedup_count, write_pcr_hash_collision,
70 | hash_dna
71 | 
72 | end
73 | 


--------------------------------------------------------------------------------
/src/AtriaTest/BioBits/bit_match.jl:
--------------------------------------------------------------------------------
 1 | # Pins the MatchRes values bitwise_scan returns when query `a` is scanned against targets `b`..`j`.
 2 | @noinline function test_bit_match()
 3 |     @testset "bit match" begin
 4 |         a = dna"ACCCGGTCAGTACGTCAGTACGCAGTAGTGTA" |> bitsafe!
 5 |         b = dna"NNNACCCGGTCAGTACGTCAGTACGCAGTAGTGTA" |> bitsafe!
 6 |         c = dna"NNNNACCCGGTCAGTACGTCAGTACGCAGTAGTGTA" |> bitsafe!
 7 |         d = dna"GGTCAGTACGTCAGTACGCAGTAGTGTANNNNACCC" |> bitsafe!
 8 |         e = dna"GGTCAGTACGTCAGTACGCAGTAGTGTANNNNCCC" |> bitsafe!
 9 |         f = dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTTCCC" |> bitsafe!
10 |         g = dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTTACCC" |> bitsafe!
11 |         h = dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTTAC" |> bitsafe!
12 |         i = dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTAC" |> bitsafe!
13 |         j = dna"GGTCAGTACGTCAGTACGCAGTAGTGTATTTACC" |> bitsafe!
14 |     
15 |         # For speed, bitwise_scan does not always find the best match at the tail; the per-case notes give the true optimum.
16 |         @test bitwise_scan(a, b, 1, 0) == MatchRes(4, 16, NaN, NaN)
17 |         @test bitwise_scan(a, c, 1, 0) == MatchRes(5, 16, NaN, NaN)
18 |         @test bitwise_scan(a, d, 1, 0) == MatchRes(33, 4, NaN, NaN)
19 |         @test bitwise_scan(a, e, 1, 0) == MatchRes(31, 4, NaN, NaN) # actually best is 32,4
20 |         @test bitwise_scan(a, f, 1, 5) == MatchRes(32, 3, NaN, NaN)
21 |         @test bitwise_scan(a, g, 1, 5) == MatchRes(33, 4, NaN, NaN)
22 |         @test bitwise_scan(a, h, 1, 5) == MatchRes(33, 2, NaN, NaN)
23 |         @test bitwise_scan(a, i, 1, 5) == MatchRes(32, 1, NaN, NaN) # actually best is 32,2
24 |         @test bitwise_scan(a, j, 1, 5) == MatchRes(31, 2, NaN, NaN) # actually best is 32,3
25 |     end
26 | end
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <p align="center">
 2 |     <img src="docs/logo.png" alt="Atria Logo" width="210" height="auto">
 3 | </p>
 4 | 
 5 | # Atria
 6 | 
 7 | ![](https://img.shields.io/github/downloads/cihga39871/Atria/total)
 8 |  
 9 | Atria is designed to trim adapters and low-quality bases from next-generation sequencing data. It infers the insert DNA precisely by integrating both adapter information and the reverse-complementary properties of paired-end reads within a delicate decision tree. It can also remove duplicate sequences arising from PCR amplification.
10 | 
11 | If you use Atria, please cite the paper:
12 | > Jiacheng Chuan, Aiguo Zhou, Lawrence Richard Hale, Miao He, Xiang Li, Atria: an ultra-fast and accurate trimmer for adapter and quality trimming, Gigabyte, 1, 2021  https://doi.org/10.46471/gigabyte.31
13 | 
14 | ## Features
15 | 
16 | - FAST, even for compressed fastqs
17 | - Highly accurate Illumina adapter trimming
18 | - Paired-end consensus calling
19 | - Quality trimming
20 | - Poly X tail trimming
21 | - Hard clip 3' and 5' ends
22 | - N tail trimming
23 | - Filtering reads by the number of N bases
24 | - Filtering reads by length
25 | - Filtering reads by read complexity
26 | - Remove PCR duplicates (dedup)
27 | 
28 | ## Contents
29 | 
30 | 1. Installation guide
31 | 
32 |    1.1 [Release installation guide](docs/1.1.Release_installation_guide.md)
33 | 
34 |    1.2 [Install from source](docs/1.2.Install_from_source.md)
35 | 
36 | 2. **[Atria trimming methods and usages](docs/2.Atria_trimming_methods_and_usages.md)**
37 | 
38 | 3. [Benchmark toolkit](docs/3.Benchmark_toolkit.md)
39 | 
40 | 4. [Atria development notes](docs/4.Development_notes.md)
41 | 
42 | 5. **[Accuracy and speed benchmark](docs/5.Accuracy_and_speed_benchmark.md)**
43 | 


--------------------------------------------------------------------------------
/src/FqRecords/copy.jl:
--------------------------------------------------------------------------------
 1 | # Build a new FqRecord whose id/des/qual/prob buffers and sequence are fresh copies of `r`'s.
 2 | @inline function Base.copy(r::FqRecord)
 3 |     n_id = length(r.id)
 4 |     n_des = length(r.des)
 5 |     n_qual = length(r.qual)
 6 |     n_prob = length(r.prob)
 7 | 
 8 |     # Allocate each destination at the source length, then fill it from the source.
 9 |     new_id = copyto!(Vector{UInt8}(undef, n_id), 1, r.id, 1, n_id)
10 |     new_des = copyto!(Vector{UInt8}(undef, n_des), 1, r.des, 1, n_des)
11 |     new_qual = copyto!(Vector{UInt8}(undef, n_qual), 1, r.qual, 1, n_qual)
12 |     new_prob = copyto!(Vector{Float64}(undef, n_prob), 1, r.prob, 1, n_prob)
13 |     new_seq = copy(r.seq)
14 | 
15 |     return FqRecord(new_id, new_seq, new_des, new_qual, new_prob)
16 | end
17 | # Resize `dest` to the length of `src`, then copy every element of `src` into it; returns `dest`.
18 | @inline function safe_copyto!(dest::Vector{UInt8}, src::T) where T <: AbstractArray
19 |     n = length(src)
20 |     return copyto!(resize!(dest, n), src)
21 | end
22 | # Resize `dest` to `N`, then copy `N` elements of `src` starting at `src_offset`; the copy itself is not bounds-checked.
23 | @inline function safe_copyto!(dest::Vector{T}, src::Vector{T}, src_offset, N) where T <: Any
24 |     resized = resize!(dest, N)
25 |     return unsafe_copyto!(resized, 1, src, src_offset, N)
26 | end
27 | # Byte-vector -> LongDNA{4} variants: the ranged form resizes `dest` to `N` first, the whole-vector form delegates to `copy!`.
28 | @inline function safe_copyto!(dest::LongDNA{4}, src::Vector{UInt8}, src_offset, N)
29 |     resize!(dest, N)
30 |     # Alternative kept for reference: BioSequences.encode_chunks!(dest, 1, src, src_offset, N)
31 |     @inbounds copyto!(dest, 1, src, src_offset, N)
32 | end
33 | @inline function safe_copyto!(dest::LongDNA{4}, src::Vector{UInt8})
34 |     copy!(dest, src)  # no explicit resize here; presumably copy! sizes `dest` itself — confirm
35 | end
36 | 
37 | """
38 |     safe_copyto!(dest::FqRecord, src::FqRecord)
39 | 
40 | Overwrite every buffer of `dest` with the contents of `src`, resizing as needed.
41 | The value of the final buffer copy (`dest.prob`) is what gets returned.
42 | """
43 | @inline function safe_copyto!(dest::FqRecord, src::FqRecord)
44 |     safe_copyto!(dest.id, src.id, 1, length(src.id))
45 | 
46 |     # Copy the raw encoded words, then the base count. LongSequence in the pinned
47 |     # BioSequences 3.x has only the fields `data` and `len`; the old `part` and
48 |     # `shared` fields no longer exist, so assigning them would throw.
49 |     safe_copyto!(dest.seq.data, src.seq.data, 1, length(src.seq.data))
50 |     dest.seq.len = src.seq.len
51 | 
52 |     safe_copyto!(dest.des, src.des, 1, length(src.des))
53 |     safe_copyto!(dest.qual, src.qual, 1, length(src.qual))
54 |     safe_copyto!(dest.prob, src.prob, 1, length(src.prob))
55 | end
56 | 


--------------------------------------------------------------------------------
/benchmark/real-data-time.bash:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Time every trimmer on two real paired-end datasets and tabulate the results.
 3 | #
 4 | # The run_* helpers and pasteTimeOutput are sourced from trimming-functions.bash;
 5 | # they presumably read the globals r1, r2, a1, a2 and bwa_ref set below (confirm there).
 6 | 
 7 | # Path to the Atria source tree; an exported $atria takes precedence over the default.
 8 | atria=${atria:-/home/jiacheng/projects/atria}
 9 | 
10 | . "$atria/benchmark/trimming-functions.bash" || exit 1
11 | 
12 | # Run each trimmer in the current directory with argument 8 (presumably the thread
13 | # count), collecting every stderr stream in one log that is summarised at the end.
14 | run_all_trimmer() {
15 |     rm -f stderr-simple-time.log
16 |     run_atria 8 2>> stderr-simple-time.log
17 |     run_atria_consensus 8 2>> stderr-simple-time.log
18 |     run_adapterremoval 8 2>> stderr-simple-time.log
19 |     run_skewer 8 2>> stderr-simple-time.log
20 |     run_trim_galore 8 2>> stderr-simple-time.log
21 |     run_trimmomatic 8 2>> stderr-simple-time.log
22 |     run_ktrim 8 2>> stderr-simple-time.log
23 |     pigz -f Ktrim/ktrim.read1.fq Ktrim/ktrim.read2.fq
24 |     run_fastp 8 2>> stderr-simple-time.log
25 |     run_seqpurge 8 2>> stderr-simple-time.log
26 |     run_atropos 8 2>> stderr-simple-time.log
27 |     pasteTimeOutput stderr-simple-time.log > time_benchmark-simple_time.txt
28 | }
29 | 
30 | 
31 | ####### human data
32 | 
33 | working_dir=~/analysis/atria-benchmark/ERR4695159
34 | # Abort rather than benchmark (and delete the log) in the wrong directory.
35 | cd "$working_dir" || exit 1
36 | 
37 | r1=ERR4695159_1.fastq.gz
38 | r2=ERR4695159_2.fastq.gz
39 | a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
40 | a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
41 | bwa_ref="$(pwd)/genomes/hg38.fasta.gz"
42 | 
43 | run_all_trimmer
44 | 
45 | 
46 | ######## SRR330569: RNA-seq D. simulans
47 | 
48 | working_dir=~/analysis/atria-benchmark/SRR330569
49 | cd "$working_dir" || exit 1
50 | 
51 | r1=SRR330569.3_1.fastq.gz
52 | r2=SRR330569.3_2.fastq.gz
53 | a1=AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCG
54 | a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGAT
55 | bwa_ref="$(pwd)/genomes/dsim-all-chromosome-r2.02.fasta"
56 | 
57 | run_all_trimmer
58 | 
59 | 
60 | ##### Ensifer spp associated with Medicago whole genome sequencing
61 | # working_dir=~/analysis/atria-benchmark/SRR7243169
62 | # cd $working_dir
63 | #
64 | # r1=SRR7243169_1.fastq.gz
65 | # r2=SRR7243169_2.fastq.gz
66 | # a1=CTGTCTCTTATACACATCT
67 | # a2=CTGTCTCTTATACACATCT
68 | # bwa_ref=`pwd`/genomes/Pseudomonas.sp.Z003-0.4C.fasta
69 | #
70 | # run_all_trimmer
71 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The Atria Software is licensed under the MIT License:
 2 | 
 3 | > Copyright 2021 Jiacheng Chuan
 4 | >
 5 | > Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 6 | > 
 7 | > The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 8 | >
 9 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 
11 | The Atria Software includes code from the following projects, which have their own licenses:
12 | 
13 | - [The Julia Language](https://github.com/JuliaLang/julia/blob/master/LICENSE.md) [MIT License]
14 | - [ArgParse.jl](https://github.com/carlobaldassi/ArgParse.jl/blob/master/LICENSE.md) (Parsing command-line arguments in Julia) [MIT License]
15 | - [BioSequences.jl](https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE) (Biological sequences in Julia) [MIT License]
16 | - [BioSymbols.jl](https://github.com/BioJulia/BioSymbols.jl/blob/master/LICENSE) (Nucleic and amino acid primitive types in Julia) [MIT License]
17 | - [DataStructures.jl](https://github.com/JuliaCollections/DataStructures.jl/blob/master/License.md) (Writing ordered dictionary) [MIT License]
18 | - [JSON.jl](https://github.com/JuliaIO/JSON.jl/blob/master/LICENSE.md) (IO of JSON files in Julia) [MIT "Expat" License]
19 | 


--------------------------------------------------------------------------------
/src/AtriaTest/FqRecords/primer_match.jl:
--------------------------------------------------------------------------------
 1 | #=
 2 | @noinline function test_primer_match()
 3 |     @testset "Primer Match" begin
 4 | 
 5 |         args2=["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz"]
 6 |         args = parsing_args(args2)
 7 |         op = PEOptions(args)
 8 | 
 9 |         r1= fqreadrecord("@M03737:51:000000000-KTBYH:1:1101:21125:2738 1:N:0:GTATCGTCGT+CACCTGTT
10 | ACCGATGAAGAACGCAGCGAAATGCGATACGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACATTGCGCCCGCCAGTATTCTGGCGGGCATGCCCGTTCGAGCGTCATTTCAACCCTCAAGCCCTGCTTGGTGTTGGGGACCGGCTCAGCGGGTGCGGGCTTCGGCCCGTCCCGTGCCGCCCCCGAAATGGATCGGCGGTCTCGTCGCAGCCTTCTTTGCGTAGTAACATACCACCTCGCAACAGGAGCGCGGCGCGGCCACTGCCGTAAAACGCCCAACTTTT
11 | +
12 | CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCEGGGGGGGGGGGGGGGFGGGGGGDG:FGGGFEFFFFFFFFFGFGD<@E;AFBFFF=FFFG@FEFFFFFFF?BFFFFFFFBFF:D>B>?F?61186ADF9;BFFB09A<")
13 |         
14 |         r2 = fqreadrecord("@M03737:51:000000000-KTBYH:1:1101:21125:2738 2:N:0:GTATCGTCGT+CACCTGTT
15 | CTTATTGATATGCTTAAGTTCAGCGGGTATTCCTACCTGATTCGAGGTCAACTCTAAAAAGTTGGGCGTTTTACGGCAGTGGCCGCGCCGCGCTCCTGTTGCGAGGTGGTATGTTACTACGCAAAGAAGGCTGCGACGAGACCGCCGATCCATTTCGGGGGCGGCACGGGACGGGCCGAAGCCCGCACCCGCTGAGCCGGTCCCCAACACCAAGCAGGGCTTGAGGGTTGAAATGACGCTCGAACGGGCATGCCCGCCAGAATACTGGCGGGCGCAAGGGGGGTTCAAAGGTTCGAAGAA
16 | +
17 | CCCCCGGGGGGGGGGGGGGGGGGGEDDFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG3BGFFGGGDGGFFFFEBFFFFF:BFF>?4:>12868?FBFBDB253-((41<:1122((.9:??696-4:)).)-4,43((342900600((,,62)-(,)43((4,(")
18 | 
19 |         primer1 = dna"AHCGATGAAGAACRYAG"
20 |         primer2 = dna"CTTATTGATATGCTTAAGTTCAG"
21 |         ps = PrimerSet(primer1, primer2)
22 | 
23 |         init_seq_rc = true
24 |         r1_seq_rc = LongDNA{4}()
25 |         r2_seq_rc = LongDNA{4}()
26 | 
27 |         
28 |     end
29 | end
30 | =#


--------------------------------------------------------------------------------
/src/FqRecords/interface.jl:
--------------------------------------------------------------------------------
 1 | 
 2 | struct FqRecord
 3 |     id::Vector{UInt8}
 4 |     seq::LongDNA{4}
 5 |     des::Vector{UInt8}
 6 |     qual::Vector{UInt8}
 7 |     prob::Vector{Float64}
 8 |     function FqRecord(id::Vector{UInt8}, seq::LongDNA{4}, des::Vector{UInt8}, qual::Vector{UInt8}, prob::Vector{Float64})
 9 |         new(id::Vector{UInt8}, seq::LongDNA{4} |> bitsafe!, des::Vector{UInt8}, qual::Vector{UInt8}, prob::Vector{Float64})
10 |     end
11 | end
# Empty record: every field is a fresh zero-length container.
@inline function FqRecord()
    return FqRecord(UInt8[], LongDNA{4}(), UInt8[], UInt8[], Float64[])
end

# Build a record from the four FASTQ parts; `prob` is derived from `qual` with `qualprob`.
@inline function FqRecord(id::Vector{UInt8}, seq::LongDNA{4}, des::Vector{UInt8}, qual::Vector{UInt8}; quality_offset=33)
    prob = Float64[qualprob(q, quality_offset) for q in qual]
    return FqRecord(id, seq, des, qual, prob)
end
17 | 
# Two records are equal when id, sequence, description and quality bytes all match.
# `prob` is not compared.
@inline Base.:(==)(r1::FqRecord, r2::FqRecord) =
    r1.id == r2.id && r1.seq == r2.seq && r1.des == r2.des && r1.qual == r2.qual

# `hash` must agree with `==`, otherwise equal records land in different `Dict`/`Set`
# slots. It mixes exactly the fields that `==` compares.
@inline Base.hash(r::FqRecord, h::UInt) =
    hash(r.qual, hash(r.des, hash(r.seq, hash(r.id, hash(:FqRecord, h)))))
20 | 
# A record is empty only when id, sequence, description and quality are all empty.
@inline function Base.isempty(r::FqRecord)::Bool
    isempty(r.id) || return false
    isempty(r.seq) || return false
    isempty(r.des) || return false
    return isempty(r.qual)
end
24 | 
"""
    TrimStats()

Mutable group of integer counters. Every field is declared `@atomic`, so it must be read
and written with `@atomic`. `TrimStats()` starts all counters at zero.

NOTE(review): each field presumably counts the reads affected by the step it is named
after; confirm against the code that increments them.
"""
mutable struct TrimStats
    @atomic polyG::Int
    @atomic polyT::Int
    @atomic polyA::Int
    @atomic polyC::Int
    @atomic complexity_filtered::Int
    @atomic hard_clip_after::Int
    @atomic tail_low_qual_trim::Int
    @atomic tail_N_trim::Int
    @atomic length_filtered::Int
    @atomic max_n_filtered::Int
    @atomic pcr_dedup_removed::Int
    @atomic quality_trim::Int
    @atomic adapter_trim::Int
end

# One zero per declared field.
TrimStats() = TrimStats(ntuple(_ -> 0, fieldcount(TrimStats))...)
41 | 
"""
    empty!(t::TrimStats) -> t

Reset every counter of `t` to zero and return `t`, as `Base.empty!` does for collections.

Each field is stored with sequentially consistent ordering, which is what
`@atomic t.field = 0` does. Iterating over `fieldnames` means a counter added to
`TrimStats` later is reset too.
"""
function Base.empty!(t::TrimStats)
    for name in fieldnames(TrimStats)
        setproperty!(t, name, 0, :sequentially_consistent)
    end
    return t
end


--------------------------------------------------------------------------------
/docs/1.1.Release_installation_guide.md:
--------------------------------------------------------------------------------
 1 | # Atria
 2 | 
 3 | ## Release Installation Guide
 4 | 
 5 | Atria is written in [Julia Language](https://julialang.org/) v1.9 and works on 64-bit Linux and OSX systems.
 6 | 
 7 | The generic binaries do not require any special installation steps, but you will need to ensure that your system can find the `atria` executable, as well as the `pigz` and `pbzip2` commands used for compression/decompression.
 8 | 
 9 | ### Linux
10 | 
11 | 
12 | #### Dependency
13 | 
14 | `pigz` and `pbzip2` are required.
15 | 
16 | If you use Ubuntu, try `sudo apt install pigz pbzip2`. You can also download them from [pigz's official site](https://zlib.net/pigz/) and [pbzip2's official site](http://compression.ca/pbzip2/).
17 | 
18 | #### Atria
19 | 
20 | First, extract the `.linux.tar.gz` file downloaded from the [release page](https://github.com/cihga39871/Atria/releases/) to a folder on your computer:
21 | 
22 | ```bash
23 | tar -zxf Atria-VERSION-linux.tar.gz
24 | ```
25 | 
26 | Atria is extracted to `Atria-VERSION` directory. To run Atria, you can do any of the following:
27 | 
28 | - Create a symbolic link to `atria` inside a folder which is on your system `PATH` (recommended)
29 | - Add Atria's bin folder to your system `PATH` environment variable
30 | - Invoke the `atria` executable by using its full path, as in `~/Atria/bin/atria`
31 | 
32 | For example, to create a symbolic link to `atria` inside the `/usr/local/bin` folder, you can do the following:
33 | 
34 | ```bash
35 | sudo ln -s <where_you_extracted_the_Atria_archive>/bin/atria /usr/local/bin/atria
36 | ```
37 | 
38 | ### Mac OS
39 | 
40 | > Sorry, we no longer provide binary files for Mac OS (x86 or M-series). You need to [install Atria from source](./1.2.Install_from_source.md). It is easy to follow and doesn't take much time.
41 | 
42 | #### Dependency
43 | 
44 | `pigz` and `pbzip2` are required.
45 | 
46 | If you use [Homebrew](https://brew.sh/), try `brew install pigz; brew install pbzip2`. You can also download them from [pigz's official site](https://zlib.net/pigz/) and [pbzip2's official site](http://compression.ca/pbzip2/).
47 | 
48 | #### Atria
49 | 
50 | Sorry, we no longer provide Atria release for OSX (x86 or M-series). Please [install Atria from source](./1.2.Install_from_source.md).
51 | 


--------------------------------------------------------------------------------
/benchmark/aln2len.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # This code is part of Skewer (https://sourceforge.net/projects/skewer/). The License:
 4 | #
 5 | # The MIT License (MIT)
 6 | #
 7 | # Copyright (c) 2013-2014 by Hongshan Jiang
 8 | #
 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy
10 | # of this software and associated documentation files (the "Software"), to deal
11 | # in the Software without restriction, including without limitation the rights
12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | # copies of the Software, and to permit persons to whom the Software is
14 | # furnished to do so, subject to the following conditions:
15 | #
16 | # The above copyright notice and this permission notice shall be included in all
17 | # copies or substantial portions of the Software.
18 | #
19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | # SOFTWARE.
26 | 
use strict;

# Usage: aln2len.pl file.aln > lengths.tab
#
# For every record (a header line starting with '>' followed by two sequence lines)
# print "<id>\t<len>", where <len> is the length of the first sequence line minus the
# number of '-' characters found in the same-length prefix of the second sequence line.
# <id> is the second tab-separated column of the header, cut at the first '/'.

if (@ARGV != 1) {
    print STDERR "Usage: $0 file.aln > lengths.tab\n";
    exit(1);
}
my ($aln_file) = @ARGV;

# Three-argument open with a lexical handle: the file name is never parsed for mode or
# pipe characters, and the handle is scoped to this script.
open(my $aln, '<', $aln_file) or die("Can not open $aln_file for reading: $!\n");

my ($id, $len);
my $no = -1;    # lines seen since the last header; -1 until the first header appears
while (defined(my $line = <$aln>)) {
    chomp($line);
    if ($line =~ /^>/) {
        my @columns = split(/\t/, $line);
        ($id) = split(/\//, $columns[1]);
        $no = 0;
        next;
    }
    next if ($no < 0);
    $no++;
    if ($no == 1) {    # first sequence
        $len = length($line);
    }
    elsif ($no == 2) {    # second sequence
        my $aligned = substr($line, 0, $len);
        my $del = ($aligned =~ tr/-//);    # count gap characters
        $len -= $del;
        print "$id\t$len\n";
    }
}
close($aln);

exit(0);
71 | 


--------------------------------------------------------------------------------
/benchmark/atria-simulate.bash:
--------------------------------------------------------------------------------
 1 | #! bash
 2 | 
 3 | adapter_length=33
 4 | 
 5 | if [[ $1 -lt 33 ]]
 6 | then
 7 |     adapter_length=$1
 8 | fi
 9 | 
10 | working_dir=~/analysis/atria-benchmark/atria_simulate
11 | 
12 | mkdir -p $working_dir
13 | cd $working_dir
14 | 
15 | mkdir adapter_length_$adapter_length
16 | cd adapter_length_$adapter_length
17 | 
18 | # select first adapter_length bp of adapters
19 | a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
20 | a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
21 | 
22 | a1=${a1:0:$adapter_length}
23 | a2=${a2:0:$adapter_length}
24 | 
25 | #### simulate data with different indels
26 | # docs: https://github.com/cihga39871/Atria/blob/master/docs/3.Benchmark_toolkit.md#data-simulation
27 | atria simulate --prefix reads_diff_indel --adapter1 $a1 --adapter2 $a2 --repeat 30000 --subsitution-rate 0.001 0.002 0.003 0.004 0.005 --insertion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 --deletion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 -s 100 -i `seq 66 2 120`
28 | 
29 | r1="reads_diff_indel.R1.fastq"
30 | r2="reads_diff_indel.R2.fastq"
31 | 
32 | # load trimming functions
33 | . $atria/benchmark/trimming-functions.bash
34 | 
35 | rm -f stderr.log
36 | run_atria 16 2>> stderr.log
37 | run_adapterremoval 16 2>> stderr.log
38 | run_skewer 16 2>> stderr.log
39 | run_trim_galore 16 2>> stderr.log
40 | run_trimmomatic 16 2>> stderr.log
41 | # run_ktrim 16 2>> stderr.log # ktrim fails to output validate fastq
42 | run_fastp 1 2>> stderr.log
43 | run_atropos 16 2>> stderr.log
44 | run_seqpurge 1 2>> stderr.log
45 | run_cutadapt 16 2>> stderr.log
46 | 
47 | pigz -d SeqPurge/*gz
48 | 
49 | mv AdapterRemoval-3/adapterremoval.pair1.truncated AdapterRemoval-3/adapterremoval.pair1.fq
50 | mv AdapterRemoval-3/adapterremoval.pair2.truncated AdapterRemoval-3/adapterremoval.pair2.fq
51 | 
52 | # cat Trimmomatic/out-pair1.unpaired.fq >> Trimmomatic/out-pair1.paired.fq
53 | # cat Trimmomatic/out-pair2.unpaired.fq >> Trimmomatic/out-pair2.paired.fq
54 | # rm Trimmomatic/out-pair1.unpaired.fq Trimmomatic/out-pair2.unpaired.fq
55 | 
56 | ll */*fastq */*fq
57 | 
58 | for i in *
59 | do
60 |     if [[ -d $i ]]
61 |     then
62 |         julia -L $atria/src/Atria.jl -e "Atria.Benchmark.julia_wrapper_readstat(ARGS)" $i/*.f*q &
63 |     fi
64 | done
65 | 
66 | # atria readstat Cutadapt/out.cutadapt.R*.fq
67 | 
68 | ps -x | grep -c "Atria.Benchmark.julia_wrapper_readstat"
69 | 
70 | 
71 | ### Adapter length 16, 20, 24, 28, 33
72 | # atria statplot -i */*r12.stat.tsv
73 | 


--------------------------------------------------------------------------------
/src/Trimmer/thread_trim.jl:
--------------------------------------------------------------------------------
 1 | 
 2 | function processing_reads!(r1s::Vector{FqRecord}, r2s::Vector{FqRecord}, isgoods::Vector{Bool}, n_reads::Int)
 3 |     if length(isgoods) < n_reads
 4 |         resize!(isgoods, n_reads)
 5 |     end
 6 |     for i in 1:n_reads
 7 |         @inbounds isgoods[i] = read_processing!(r1s[i]::FqRecord, r2s[i]::FqRecord, 1)
 8 |     end
 9 |     nothing
10 | end
11 | 
12 | # function processing_reads_range!(r1s::Vector{FqRecord}, r2s::Vector{FqRecord}, isgoods::Vector{Bool}, reads_range::UnitRange{Int64})
13 | #     this_threadid = Threads.threadid()
14 | #     for i in reads_range
15 | #         @inbounds isgoods[i] = read_processing!(r1s[i], r2s[i], this_threadid)
16 | #     end
17 | #     nothing
18 | # end
19 | 
20 | # function processing_reads_threads!(r1s::Vector{FqRecord}, r2s::Vector{FqRecord}, isgoods::Vector{Bool}, n_reads::Int)
21 | #     if length(isgoods) < n_reads
22 | #         resize!(isgoods, n_reads)
23 | #     end
24 | #     # split reads to N reads per batch
25 | #     @sync for reads_start in 1:256:n_reads
26 | #         reads_end = min(reads_start + 255, n_reads)
27 | #         reads_range = reads_start:reads_end
28 | 
29 | #         Threads.@spawn processing_reads_range!(r1s::Vector{FqRecord}, r2s::Vector{FqRecord}, isgoods::Vector{Bool}, reads_range)
30 | #     end
31 | #     nothing
32 | # end
33 | 
34 | ## single end
# Single-end: run `read_processing!` on the first `n_reads` reads and store each result
# in `isgoods`, growing `isgoods` first when it is too short.
function processing_reads!(r1s::Vector{FqRecord}, isgoods::Vector{Bool}, n_reads::Int)
    length(isgoods) >= n_reads || resize!(isgoods, n_reads)
    @inbounds for i in 1:n_reads
        r1 = r1s[i]::FqRecord
        isgoods[i] = read_processing!(r1, 1)
    end
    return nothing
end
44 | 
45 | # function processing_reads_range!(r1s::Vector{FqRecord}, isgoods::Vector{Bool}, reads_range::UnitRange{Int64})
46 | #     this_threadid = Threads.threadid()
47 | #     for i in reads_range
48 | #         @inbounds isgoods[i] = read_processing!(r1s[i], this_threadid)
49 | #     end
50 | #     nothing
51 | # end
52 | 
53 | # function processing_reads_threads!(r1s::Vector{FqRecord}, isgoods::Vector{Bool}, n_reads::Int)
54 | #     if length(isgoods) < n_reads
55 | #         resize!(isgoods, n_reads)
56 | #     end
57 | #     # split reads to N reads per batch
58 | #     @sync for reads_start in 1:512:n_reads
59 | #         reads_end = min(reads_start + 511, n_reads)
60 | #         reads_range = reads_start:reads_end
61 | 
62 | #         Threads.@spawn processing_reads_range!(r1s::Vector{FqRecord}, isgoods::Vector{Bool}, reads_range)
63 | #     end
64 | #     nothing
65 | # end
66 | 


--------------------------------------------------------------------------------
/benchmark/replicates-stats.jl:
--------------------------------------------------------------------------------
 1 | 
 2 | using Statistics
 3 | 
 4 | if isempty(ARGS)
 5 |     println("""
 6 |     Usage: $(@__FILE__) FILE1 FILE2...
 7 | 
 8 |     Format of FILEs has to be the same. The numeric values in the same position of FILEs are replaced with their mean and standard deviation.
 9 |     """)
10 |     exit()
11 | end
12 | 
13 | contents = map(x -> read(x, String), ARGS)
14 | cells = map(x -> split(x, r"[^\w\d\.\-\:]+"), contents)
15 | 
16 | cell_matrix = String.(hcat(cells...))
17 | 
18 | function Base.isnumeric(x::String)
19 |     isfloat = !(tryparse(Float64, x) === nothing)
20 |     istime = occursin(r"^(\d+:)+\d+(\.\d+)?$", x)
21 |     isfloat | istime
22 | end
23 | 
24 | function Base.isnumeric(x::AbstractArray)
25 |     all(isnumeric.(x))
26 | end
27 | 
28 | function parse_numeric(x::String)
29 |     float = tryparse(Float64, x)
30 |     if !isnothing(float)
31 |         return float
32 |     end
33 |     # parse time as D:H:M:S.MS
34 |     xs = split(x, ":") |> reverse!
35 |     second = parse(Float64, xs[1])
36 |     for i in 2:length(xs)
37 |         if i == 2
38 |             second += 60 * parse(Float64, xs[i])
39 |         elseif i == 3
40 |             second += 3600 * parse(Float64, xs[i])
41 |         elseif i == 4
42 |             second += 24 * 3600 * parse(Float64, xs[i])
43 |         else
44 |             error("Failed to parse $x as the time format D:H:M:S")
45 |         end
46 |     end
47 |     second
48 | end
49 | 
50 | 
51 | mean_std_strings = map(eachrow(cell_matrix)) do vec
52 |     if isnumeric(vec)
53 |         vals = parse_numeric.(vec)
54 |         std = Statistics.std(vals)
55 |         if std != 0
56 |             digit = -(floor(Int, log10(std)) - 1)
57 |         else
58 |             digit = 0
59 |         end
60 |         if digit <= 0
61 |             std_string = round(Int, std) |> string
62 |             mean_string = round(Int, Statistics.mean(vals)) |> string
63 |         else
64 |             std_string = round(std, sigdigits=2) |> string
65 |             mean_string = round(Statistics.mean(vals), digits=digit) |> string
66 |         end
67 |         mean_string * " ± " * std_string
68 |     else
69 |         vec[1]
70 |     end
71 | end
72 | 
73 | specials = split(contents[1], r"[\w\d\.\-\:]+")
74 | 
75 | N = min(length(specials), length(mean_std_strings))
76 | result = ""
77 | 
78 | if mean_std_strings[1] == ""
79 |     for i = 1:N
80 |         global result
81 |         result *= mean_std_strings[i] * specials[i]
82 |     end
83 | else
84 |     for i = 1:N
85 |         global result
86 |         result *= specials[i] * mean_std_strings[i]
87 |     end
88 | end
89 | 
90 | filename = "stats." * basename(ARGS[1])
91 | write(filename, result)
92 | 
93 | @info "Done" Output=filename
94 | 


--------------------------------------------------------------------------------
/src/FqRecords/quality.jl:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | const qualpval_table = map(q -> 10 ^ (-q/10), 0:50)
 4 | const qualprob_table = 1.0 .- qualpval_table
 5 | 
 6 | """
 7 | The quality offset used in FqRecords should be the real quality offset - 1, such as Illumina 1.8 => 33-1
 8 | """
 9 | @inline function qualpval(Q, quality_offset)::Float64
10 |     q = Q - quality_offset + 1
11 |     q <= 0 && error("Input quality < 0 detected. Wrong --quality-format FORMAT or the input file in truncated.")
12 |     @inbounds qualpval_table[q > 51 ? 51 : q]
13 | end
14 | 
"""
    qualprob(Q, quality_offset)::Float64

Return the probability `1 - 10^(-q/10)` that a base call is correct for the encoded
quality value `Q`, where `q = Q - quality_offset` is the Phred score.

`quality_offset` is the real encoding offset (for example 33). The `+ 1` needed for
1-based table indexing is applied here, so callers must not subtract 1 themselves.
Scores above 50 use the value for score 50.

Throws an `ErrorException` when `Q` is smaller than `quality_offset`.
"""
@inline function qualprob(Q, quality_offset)::Float64
    index = Q - quality_offset + 1  # 1-based position in the lookup table
    index <= 0 && error("Input quality < 0 detected. Wrong --quality-format FORMAT or the input file is truncated.")
    return @inbounds qualprob_table[min(index, lastindex(qualprob_table))]
end
23 | 
24 | 
# Recompute `r.prob` from `r.qual`: resize `r.prob` to the quality length and fill it with
# `qualprob` of each quality byte. Mutates `r` and returns `nothing`.
@inline function update_prob_from_qual(r::FqRecord; quality_offset::Int64=33)::Nothing
    quals = r.qual
    probs = r.prob
    resize!(probs, length(quals))
    @inbounds for i in eachindex(quals)
        probs[i] = qualprob(quals[i], quality_offset)
    end
    return nothing
end
32 | 
33 | 
34 | 
# Sum of `r.prob[from:to]`. The range is clamped to the valid indices of `r.prob`, and an
# empty range sums to 0.0.
@inline function probsum(r::FqRecord, from::Int64, to::Int64)::Float64
    probs = r.prob
    first_idx = max(from, 1)
    last_idx = min(to, length(probs))

    total = 0.0
    @inbounds for i in first_idx:last_idx
        total += probs[i]
    end
    return total
end
48 | 
# Mean of `r.prob[from:to]`. The range is clamped to the valid indices of `r.prob`, and an
# empty range yields 0.0.
@inline function probmean(r::FqRecord, from::Int64, to::Int64)::Float64
    probs = r.prob
    first_idx = max(from, 1)
    last_idx = min(to, length(probs))
    n = last_idx - first_idx + 1
    n <= 0 && return 0.0

    total = 0.0
    @inbounds for i in first_idx:last_idx
        total += probs[i]
    end
    return @fastmath total / n
end
64 | 
65 | 
66 | 
# Single read: store in `match_res.prob` the mean probability of `r[r_start:r_end]`, floored
# at `min_prob`, then set `match_res.score = ncompatible * prob`.
@inline function compute_prob_and_score!(match_res::MatchRes, r::FqRecord, r_start::Int, r_end::Int; min_prob::Float64 = 0.75)
    mean_prob = probmean(r, r_start, r_end)
    match_res.prob = max(mean_prob, min_prob)
    return match_res.score = @fastmath match_res.ncompatible * match_res.prob
end

# Read pair: same as above, but `match_res.prob` is the product of the two floored means.
@inline function compute_prob_and_score!(match_res::MatchRes, r1::FqRecord, r1_start::Int, r1_end::Int, r2::FqRecord, r2_start::Int, r2_end::Int; min_prob::Float64 = 0.75)
    floored1 = max(probmean(r1, r1_start, r1_end), min_prob)
    floored2 = max(probmean(r2, r2_start, r2_end), min_prob)
    match_res.prob = @fastmath floored1 * floored2
    return match_res.score = @fastmath match_res.ncompatible * match_res.prob
end


--------------------------------------------------------------------------------
/benchmark/art-simulate-main.bash:
--------------------------------------------------------------------------------
 1 | 
 2 | # Genome information
 3 | # Arabidopsis thaliana (thale cress) reference genome TAIR10.1
 4 | # https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz
 5 | 
 6 | # Simulation program information
 7 | # ART (Skewer modified version)
 8 | # ART: a next-generation sequencing read simulator. Bioinformatics. 2012 Feb 15; 28(4): 593–594.
 9 | # Simulated from a real public sequence: SRR7243169.1
10 | 
11 | 
working_dir=~/analysis/atria-benchmark/art_simulate

mkdir -p "$working_dir"
# Stop rather than download and simulate in whatever directory we happen to be in.
cd "$working_dir" || exit 1

# download genome
# -p: re-running the script must not fail on an existing directory
mkdir -p genomes
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz -O genomes/TAIR10.1.fasta.gz
pigz -d genomes/TAIR10.1.fasta.gz

### simulate data
# Download from https://sourceforge.net/projects/skewer/files/Simulator/
cd "$working_dir/ART/art_profiler_illumina" || exit 1

# download real data to generate profiles
fastq-dump --split-files --origfmt SRR330569.3
# generate profiles
./Illumina_readprofile_art profile_SRR330569 . fastq

for i in 1 2 3
do
    echo "Replicate $i ----------------------------------"
    bash $atria/benchmark/art-simulate-run-bench.bash
done

cd "$working_dir" || exit 1

# mean ± std of the per-replicate accuracy summaries, one trimmer at a time
for trimmer in AdapterRemoval Atria Skewer TrimGalore Trimmomatic Ktrim Fastp Atropos SeqPurge
do
    julia $atria/benchmark/replicates-stats.jl replicate_*/summary.$trimmer
done

awk 'BEGIN {print "Trimmer\tTP\tFP_ft\tFP_ot\tFN_fr\tFN_ut\tTN\tPPV\tSen.\tSpec.\tmCC" }; NR%3==2{FNR="\t"; sub("stats.summary.", "", FILENAME); print FILENAME"\t"$0}' stats.summary.* > performance_stats.df.txt

julia $atria/benchmark/replicates-stats.jl replicate_*/time_benchmark.df.txt
julia $atria/benchmark/replicates-stats.jl replicate_*/time_benchmark_gz.df.txt

julia $atria/benchmark/replicates-stats.jl replicate_*/time_benchmark.new.df.txt
julia $atria/benchmark/replicates-stats.jl replicate_*/time_benchmark.new_gz.df.txt

Rscript $atria/benchmark/time_stats_plot.R -i stats.time_benchmark.df.txt stats.time_benchmark_gz.df.txt -o time_stats_plot.html
58 | 


--------------------------------------------------------------------------------
/src/AtriaTest/BioBits/biosequences_safety.jl:
--------------------------------------------------------------------------------
 1 | @noinline function test_biosequences_safety()
 2 |         
 3 |     @testset "biosequences safety" begin
 4 |         @testset "bitsafe" begin
 5 |             s1 = dna""
 6 |             s2 = dna"NASTTGGTTATCNNNN"
 7 |             s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
 8 | 
 9 |             bitsafe!(s1)
10 |             @test length(s1) == 0
11 |             @test length(s1.data) == 1
12 |             # @test s1.data[1] == 0x0
13 | 
14 |             bitsafe!(s2)
15 |             @test s2.data[1] == 0xffff28188448861f
16 |             @test length(s2.data) == 2
17 | 
18 |             @test !isbitsafe(s3)
19 |             @test isbitsafe(s1)
20 |             @test isbitsafe(s2)
21 |         end
22 | 
23 |         @testset "bitsafe resize" begin
24 |             s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
25 |             resize!(s3, 5)
26 |             @test isbitsafe(s3)
27 | 
28 |             s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
29 |             resize!(s3, 100)
30 |             @test length(s3.data) == 8
31 |             @test isbitsafe(s3)
32 | 
33 |             s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
34 |             s3 = s3[4:36]
35 |             resize!(s3, 40)
36 |             @test s3.data[1] == 0x1424214824181422
37 |             @test s3.data[2] == 0x1420ff0000084128
38 |             @test s3.data[3] |  0xfffffffffffffff0 == 0xfffffffffffffff8
39 |             @test length(s3.data) == 4
40 |         end
41 | 
42 |         @testset "bitsafe reverse complement" begin
43 |             s3 = LongDNA{4}([0x4214824181422181, 0x0ff0000084128142, 0x0000000084128142], UInt(40))
44 |             s3_rc = reverse_complement(s3)
45 |             true_s3_rc_data = [0x00000ff042814821,
46 |                             0x8241284242814821,
47 |                             0x0000000081844281,
48 |                             0x0000000000000000]
49 |             @test s3_rc.data[1] == 0x00000ff042814821
50 |             @test s3_rc.data[2] == 0x8241284242814821
51 |             @test s3_rc.data[3] |  0xffffffff00000000 == 0xffffffff81844281
52 | 
53 |             bitsafe!(s3)
54 |             s3_rc = reverse_complement(s3)
55 |             @test s3_rc.data[1] == 0x00000ff042814821
56 |             @test s3_rc.data[2] == 0x8241284242814821
57 |             @test s3_rc.data[3] |  0xffffffff00000000 == 0xffffffff81844281
58 |             @test length(s3.data) == 4
59 | 
60 |             s4 = s3[17:25]
61 |             s4_rc = reverse_complement!(s4)
62 |             @test s4_rc.data[1] |  0xfffffff000000000 == 0xfffffff428148210
63 |             @test length(s4.data) == 2
64 |         end
65 |     end
66 | end
67 | 


--------------------------------------------------------------------------------
/src/FqRecords/basic_io.jl:
--------------------------------------------------------------------------------
 1 | 
 2 | function Base.println(io::IO, r::FqRecord)
 3 |     println(String(copy(r.id)))
 4 |     println(r.seq)
 5 |     println(String(copy(r.des)))
 6 |     println(String(copy(r.qual)))
 7 | end
 8 | 
 9 | Base.print(io::IO, r::FqRecord) = println(io, r)
10 | Base.display(r::FqRecord) = println(stdout, r)
11 | Base.show(r::FqRecord) = println(stdout, r)
12 | 
13 | 
# Strip spaces and tabs that directly follow a newline, i.e. the indentation of every line
# after the first.
function remove_blank(s::AbstractString)
    return replace(s, r"\n[ \t]*" => "\n")
end
15 | 
"""
    fqreadrecord(s::IO    ; quality_offset=33)::FqRecord
    fqreadrecord(s::String; quality_offset=33)::FqRecord

Read one FASTQ record (four `\\n`-terminated lines) and return it as a new `FqRecord`.
The `String` method first removes the blanks that follow each newline (`remove_blank`),
so indented multi-line literals can be used.

Slow; not recommended for bulk reading. See also `load_fqs_threads!`.
"""
function fqreadrecord(s::IO; quality_offset=33)::FqRecord
    # Lines end at 0x0a (\n); \r\n line endings are not supported.
    next_line() = readuntil(s, 0x0a, keep=false)::Vector{UInt8}

    id = next_line()
    seq = LongDNA{4}(next_line())::LongDNA{4}
    des = next_line()
    qual = next_line()
    return FqRecord(id, seq, des, qual; quality_offset=quality_offset)
end

function fqreadrecord(s::String; quality_offset=33)::FqRecord
    buffer = IOBuffer(remove_blank(s))
    return fqreadrecord(buffer, quality_offset=quality_offset)
end
33 | 
"""
    fqreadrecord!(r::FqRecord, s::IO; quality_offset=33)

Read one FASTQ record from `s` into the existing record `r`, reusing its buffers, and
refresh `r.prob` from the new quality bytes.

Exactly `length(r.seq)` quality bytes are read. Unless `s` is at its end, the next byte
must be a newline; otherwise an error reports that the sequence and quality lengths
differ. This check runs before the probabilities are computed, so a short quality line is
reported as a length mismatch rather than as an invalid quality value.

Slow; not recommended for bulk reading. See also `load_fqs_threads!`.
"""
function fqreadrecord!(r::FqRecord, s::IO; quality_offset=33)
    safe_copyto!(r.id, readuntil(s, 0x0a, keep=false)::Vector{UInt8})
    safe_copyto!(r.seq, readuntil(s, 0x0a, keep=false)::Vector{UInt8})
    bitsafe!(r.seq)
    safe_copyto!(r.des, readuntil(s, 0x0a, keep=false)::Vector{UInt8})

    nbase = length(r.seq)
    resize!(r.qual, nbase)
    readfill!(s, r.qual)  # presumably fills r.qual with the next nbase bytes of s

    if !eof(s) && read(s, UInt8) != 0x0a
        # Show the id as text; string(r.id) would print a byte-vector literal.
        error("FASTQ is not valid: the lengths of sequence and quality are not the same for $(String(copy(r.id))): $s")
    end

    resize!(r.prob, nbase)
    @inbounds for (i, q) in enumerate(r.qual)
        r.prob[i] = qualprob(q, quality_offset)
    end
    return
end
56 | 
57 | 
# Write `r` to `io` as a four-line FASTQ record. A record whose sequence is empty is
# written with the placeholder sequence "N" and quality "!".
function fqwriterecord(io::IO, r::FqRecord)
    seq_is_empty = isempty(r.seq::LongDNA{4})

    write(io, r.id::Vector{UInt8})
    write(io, '\n')
    if seq_is_empty
        write(io, 'N')
    else
        print(io, r.seq::LongDNA{4}) # no write method for LongDNA{4}
    end
    write(io, '\n')
    write(io, r.des::Vector{UInt8})
    write(io, '\n')
    if seq_is_empty
        write(io, '!')
    else
        write(io, r.qual::Vector{UInt8})
    end
    return write(io, '\n')
end
79 | 


--------------------------------------------------------------------------------
/docs/1.2.Install_from_source.md:
--------------------------------------------------------------------------------
  1 | # Atria
  2 | 
  3 | ## Install from source
  4 | 
  5 | Atria is tested in [Julia Language](https://julialang.org/) v1.8 and v1.9.
  6 | 
  7 | It is recommended to build Atria using Julia v1.8.5 because it is 3-20% faster than v1.9.
  8 | 
  9 | ### Mac OS
 10 | 
 11 | #### Prerequisite
 12 | 
 13 | Mac OS 10.8 or higher, 64-bit system.
 14 | 
 15 | ##### Julia
 16 | 
 17 | 1. Install Juliaup, the Julia version manager
 18 | 
 19 | ```bash
 20 | curl -fsSL https://install.julialang.org | sh
 21 | ```
 22 | 
 23 | 2. Download and select Julia version v1.8.5. It is recommended to build Atria using Julia v1.8.5 because it is 3-20% faster than v1.9.
 24 | 
 25 | ```bash
 26 | juliaup add 1.8
 27 | juliaup default 1.8
 28 | ```
 29 | 
 30 | ##### Pigz and Pbzip2
 31 | 
 32 | Pigz and Pbzip2 are parallel Gzip/Bzip2 commandline tools required in Atria. You can install with [Homebrew](https://brew.sh/):
 33 | 
 34 | ```bash
 35 | brew install pigz
 36 | brew install pbzip2
 37 | ```
 38 | 
 39 | > If you do not use Homebrew, you can also download them from [pigz's official site](https://zlib.net/pigz/) and [pbzip2](https://pkgs.org/download/pbzip2).
 40 | 
 41 | #### Atria
 42 | 
 43 | Download the Atria git repository:
 44 | 
 45 | ```bash
 46 | git clone https://github.com/cihga39871/Atria.git
 47 | ```
 48 | 
 49 | Go to `Atria` directory, and run `build_atria.jl` with Julia:
 50 | 
 51 | ```bash
 52 | cd Atria
 53 | julia build_atria.jl
 54 | ```
 55 | 
 56 | After installation, Atria is available at `./app-*/bin/atria`. Link `atria` into a directory on your `PATH`:
 57 | 
 58 | ```bash
 59 | sudo ln -s "$PWD"/app-*/bin/atria /usr/local/bin
 60 | ```
 61 | 
 62 | ### Linux
 63 | 
 64 | #### Prerequisite
 65 | 
 66 | Linux, 64-bit system.
 67 | 
 68 | 1. Install Juliaup, the Julia version manager
 69 | 
 70 | ```bash
 71 | curl -fsSL https://install.julialang.org | sh
 72 | ```
 73 | 
 74 | 2. Download and select Julia version v1.8.5. It is recommended to build Atria using Julia v1.8.5 because it is 3-20% faster than v1.9.
 75 | 
 76 | ```bash
 77 | juliaup add 1.8
 78 | juliaup default 1.8
 79 | ```
 80 | 
 81 | Then, install `pigz` and `pbzip2` (parallel compression/decompression tools used by Atria).
 82 | 
 83 | If you use `apt` package manager (Ubuntu/Debian), try `sudo apt install pigz pbzip2`.
 84 | If you use `yum` package manager (CentOS), try `sudo yum install pigz pbzip2`.
 85 | 
 86 | You can also download them from [pigz's official site](https://zlib.net/pigz/) and [pbzip2](https://pkgs.org/download/pbzip2).
 87 | 
 88 | #### Atria
 89 | 
 90 | Download the Atria git repository:
 91 | 
 92 | ```bash
 93 | git clone https://github.com/cihga39871/Atria.git
 94 | ```
 95 | 
 96 | Go to `Atria` directory, and run `build_atria.jl` with Julia:
 97 | 
 98 | ```bash
 99 | cd Atria
100 | julia build_atria.jl
101 | ```
102 | 
103 | After installation, Atria is available at `./app-*/bin/atria`. Link `atria` into a directory on your `PATH`:
104 | 
105 | ```bash
106 | sudo ln -s "$PWD"/app-*/bin/atria /usr/local/bin
107 | ```
108 | 


--------------------------------------------------------------------------------
/docs/5.Accuracy_and_speed_benchmark.md:
--------------------------------------------------------------------------------
 1 | ## Introduction
 2 | 
 3 | This section evaluates trimming accuracy regarding different read properties, including adapter presence or absence, base error, and adapter length. To achieve the goal, Atria integrates a benchmarking toolkit for read simulation and trimming analysis.
 4 | 
 5 | The details are described in the [Atria paper](https://gigabytejournal.com/articles/31). 
 6 | 
 7 | ## Trimmers
 8 | 
 9 | - Atria v3.0.0
10 | - AdapterRemoval v2.3.1
11 | - Skewer v0.2.2
12 | - Fastp v0.21.0
13 | - Ktrim v1.2.1
14 | - Atropos v1.1.29
15 | - SeqPurge v2012_12
16 | - Trim Galore v0.6.5
17 | - Trimmomatic v0.39
18 | - Cutadapt v2.8 (#3)
19 | 
20 | ## Data
21 | 
22 | Twenty-one million read pairs were simulated with a uniform read length (100 bp), different error profiles, adapter length, and original insert sizes. 
23 | 
 24 | The baseline error profile comprises a 0.1% substitution rate, 0.001% insertion rate, and 0.001% deletion rate, inspired by an Illumina error profile analysis. Error profiles of 1x, 2x, 3x, 4x, and 5x the baseline, and even insert sizes from 66 to 120 bp, were chosen. 
25 | 
 26 | In this way, the reads with the smallest insert size contain full-length adapters. The reads with 66-98 bp original insert sizes contain adapters, and the reads with 100-120 bp original insert sizes are free from adapter contamination, except for a few reads with a 100 bp insert size that contain indels. In each condition combination, 30 thousand read pairs were simulated to avoid random errors. 
27 | 
28 | ## Results
29 | 
30 | ![Figure 1](https://github.com/cihga39871/Atria/raw/master/docs/Figure%201%20Simulation%20Accuracy2.png)
31 | 
32 | **Figure 1 Adapter trimming accuracy on adapter presence and absence, different base errors, and adapter lengths** ([Interactive plots can be downloaded here](https://github.com/cihga39871/Atria/raw/master/docs/Figure%201%20Simulation%20Accuracy.html))
33 | 
34 | A1, B1, and C1 are statistics for reads with adapter contamination, while A2, B2, C2 for reads without adapters.
35 | 
36 | A1 and A2 show the accumulated rates of accurate trim, one bp over trim, one bp under trim, multiple bp over trim, and multiple bp under trim. 
37 | 
38 | B1 and B2 show the trimming accuracy on different error profiles.
39 | 
40 | C1 and C2 show the trimming accuracy on different adapter lengths. 
41 | 
 42 | > Ktrim threw an error when processing simulated fastq files. Its accuracy was benchmarked using other methods in the [Atria paper](https://gigabytejournal.com/articles/31). 
43 | 
44 | 
45 | 
46 | ![Figure 2](https://github.com/cihga39871/Atria/raw/master/docs/Figure%202%20Speed.png)
47 | 
48 | **Figure 2 Benchmark of adapter-trimming speed for uncompressed and compressed files on different threading options** ([Interactive plots can be downloaded here](https://github.com/cihga39871/Atria/raw/master/docs/Figure%202%20Speed.html))
49 | 
 50 | The simulated paired-end data with a 100 bp read length was trimmed in both uncompressed and compressed formats using up to 32 threads. Speed is the ratio of the number of bases to elapsed time (wall time). SeqPurge does not support uncompressed outputs, so it is not shown in the uncompressed benchmark. When trimming compressed data, the speed of AdapterRemoval, Skewer, Fastp, Atropos, and Trimmomatic stayed constant when the number of threads increased from 4 to 32, so we only benchmarked those trimmers using 1, 2, and 4 threads. Ktrim does not support compressed output, so it is not shown in the compressed benchmark.
51 | 
52 | 


--------------------------------------------------------------------------------
/src/Benchmark/rand_trim.jl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env julia
  2 | 
  3 | # using BioSymbols
  4 | # using BioSequences
  5 | #
  6 | #
  7 | # include("apiBioFqRecords.jl")
  8 | 
  9 | function julia_wrapper_randtrim(ARGS)
 10 | 
 11 |     help_page = """
 12 |     usage: atria randtrim [-h] R1_FASTQ R2_FASTQ
 13 | 
 14 |     positional arguments:
 15 |       R?_FASTQ      input fastqs. caution: raw fastq has to be
 16 |                     generated by `atria simulate`.
 17 | 
 18 |     optional arguments:
 19 |       -h, --help  show this help message and exit
 20 |     """
 21 | 
 22 |     if "-h" in ARGS || "--help" in ARGS || length(ARGS) == 0 || length(ARGS) % 2 == 1
 23 |         println(help_page)
 24 |         return 0
 25 |     end
 26 | 
 27 |     time0 = time()
 28 | 
 29 |     npair = length(ARGS)÷2
 30 | 
 31 |     for i in 1:npair
 32 |         peReadRandomTrim_main(ARGS[2i-1], ARGS[2i])
 33 |     end
 34 | 
 35 |     @info "read random trim: all done" elapsed=time() - time0
 36 |     return 0
 37 | end
 38 | 
 39 | @inline function random_trim!(r::FqRecord)
 40 | 
 41 |     nremain = rand(0:length(r.seq))
 42 | 
 43 |     splitted = split(String(copy(r.id)), " ")
 44 |     # seq_id = splitted[1]
 45 |     true_length = parse(Int64, splitted[2][6:end])
 46 |     # insert_size = parse(Int64, splitted[3][13:end])
 47 |     # error_rate = parse(Float64, splitted[4][12:end])
 48 |     # seq_length = parse(Int64, splitted[5][12:end])
 49 |     # error_insert = parse(Int64, splitted[6][14:end])
 50 |     # error_adapter = parse(Int64, splitted[7][15:end])
 51 | 
 52 |     if nremain < true_length
 53 |         splitted[2] = "TRUE=$nremain"
 54 |         safe_copyto!(r.id, map(UInt8, collect(join(splitted, " "))))
 55 |     end
 56 | 
 57 |     resize!(r.seq, nremain)
 58 |     # r.qual = view(r.qual, 1:nremain)
 59 |     resize!(r.qual, nremain)
 60 | end
 61 | 
 62 | @inline function random_trim!(r1::FqRecord, r2::FqRecord)
 63 |     if rand() < 0.5
 64 |         random_trim!(r1)
 65 |     else
 66 |         random_trim!(r2)
 67 |     end
 68 | end
 69 | 
 70 | 
 71 | function peReadRandomTrim_main(file1::String, file2::String)
 72 |     @info "read random trim: start" file1 file2
 73 | 
 74 |     for input in [file1,file2]
 75 |         if !isfile(input)
 76 |             @warn "read random trim: input FASTQ file not valid: skip" FILE=input _module=nothing _group=nothing _id=nothing _file=nothing
 77 |             return nothing
 78 |         end
 79 |     end
 80 | 
 81 |     outfile1 = joinpath(replace(file1, r"(fastq$|fq$|[^.]*)(\.gz)?$"i => s"randtrim.\1", count=1))
 82 |     outfile2 = joinpath(replace(file2, r"(fastq$|fq$|[^.]*)(\.gz)?$"i => s"randtrim.\1", count=1))
 83 | 
 84 |     io1 = open(file1, "r")
 85 |     io2 = open(file2, "r")
 86 |     io1out = open(outfile1, "w")
 87 |     io2out = open(outfile2, "w")
 88 | 
 89 |     r1 = FqRecord()
 90 |     r2 = FqRecord()
 91 | 
 92 |     #================== Read iteration ====================#
 93 |     # @label start_loop
 94 |     # eof(io1::IO) && @goto stop_loop
 95 |     while !eof(io1::IO)
 96 |         # read record
 97 |         fqreadrecord!(r1::FqRecord, io1::IO)
 98 |         fqreadrecord!(r2::FqRecord, io2::IO)
 99 | 
100 |         random_trim!(r1::FqRecord, r2::FqRecord)
101 | 
102 |         fqwriterecord(io1out::IO, r1::FqRecord)
103 |         fqwriterecord(io2out::IO, r2::FqRecord)
104 | 
105 |         # @goto start_loop
106 |     end
107 | 
108 |     @label stop_loop
109 | 
110 |     #================== Close files ====================#
111 | 
112 |     close(io1)
113 |     close(io2)
114 |     close(io1out)
115 |     close(io2out)
116 | end
117 | 


--------------------------------------------------------------------------------
/src/Atria.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | module Atria
  3 | 
  4 | # add ArgParse BioSymbols BioSequences Printf JSON Statistics DelimitedFiles Distributed Logging DataStructures Markdown PrettyTables
  5 | 
  6 | # using ArgParse
  7 | # using BioSymbols
  8 | # using BioSequences
  9 | # using Printf
 10 | # using JSON
 11 | # using Statistics
 12 | # using DelimitedFiles
 13 | # using Distributed
 14 | # using Base.Threads
 15 | # using Logging
 16 | # using DataStructures
 17 | # using Markdown
 18 | # using PrettyTables
 19 | 
 20 | using Reexport
 21 | 
 22 | include(joinpath("BioBits", "BioBits.jl"))
 23 | @reexport using .BioBits
 24 | 
 25 | include(joinpath("FqRecords", "FqRecords.jl"))
 26 | @reexport using .FqRecords
 27 | 
 28 | include(joinpath("Trimmer", "Trimmer.jl"))
 29 | @reexport using .Trimmer
 30 | 
 31 | include(joinpath("Benchmark", "Benchmark.jl"))
 32 | @reexport using .Benchmark
 33 | 
 34 | include(joinpath("AtriaTest", "AtriaTest.jl"))
 35 | @reexport using .AtriaTest
 36 | 
 37 | 
 38 | function julia_main()::Cint
 39 | 
 40 |     help_programs = """
 41 |     Available programs:
 42 |         atria       Pair-end trimming software (default)
 43 |         simulate    Generate artificial pair-end reads
 44 |         randtrim    Randomly trim R1 or R2 at a random position
 45 |         readstat    Collect trimming statistics
 46 |                         (reads should be generated by `atria simulate`)
 47 |         statplot    Plot trimming statistics
 48 |                         (`Rscript` in PATH required)
 49 |         test        Test Atria program
 50 |         p | prog    Show this program list
 51 |     """
 52 | 
 53 |     if length(ARGS)::Int64 >= 1
 54 |         if ARGS[1] in ["prog", "p"]
 55 |             println(help_programs)
 56 |         elseif ARGS[1] in ("atria", "Atria")
 57 |             if "--detect-adapter" in ARGS
 58 |                 if "-R" in ARGS || "--read2" in ARGS
 59 |                     julia_wrapper_detect_adapter_pe(ARGS[2:end])
 60 |                 else
 61 |                     julia_wrapper_detect_adapter_se(ARGS[2:end])
 62 |                 end
 63 |             elseif "-R" in ARGS || "--read2" in ARGS
 64 |                 # paired-end
 65 |                 julia_wrapper_atria_pe(ARGS[2:end]::Vector{String})
 66 |             else
 67 |                 julia_wrapper_atria_se(ARGS[2:end]::Vector{String})
 68 |             end
 69 |         elseif ARGS[1] == "simulate"
 70 |             julia_wrapper_simulate(ARGS[2:end]::Vector{String})
 71 |         elseif ARGS[1] == "randtrim"
 72 |             julia_wrapper_randtrim(ARGS[2:end]::Vector{String})
 73 |         elseif ARGS[1] == "readstat"
 74 |             julia_wrapper_readstat(ARGS[2:end]::Vector{String})
 75 |         elseif ARGS[1] == "statplot"
 76 |             julia_wrapper_rscript(statplot_code, ARGS[2:end]::Vector{String})
 77 |         elseif ARGS[1] == "test"
 78 |             test_atria()
 79 |         else
 80 |             if "--detect-adapter" in ARGS
 81 |                 if "-R" in ARGS || "--read2" in ARGS
 82 |                     julia_wrapper_detect_adapter_pe(ARGS)
 83 |                 else
 84 |                     julia_wrapper_detect_adapter_se(ARGS)
 85 |                 end
 86 |             elseif "-R" in ARGS || "--read2" in ARGS
 87 |                 # paired-end
 88 |                 julia_wrapper_atria_pe(ARGS::Vector{String})
 89 |             else
 90 |                 julia_wrapper_atria_se(ARGS::Vector{String})
 91 |             end
 92 |         end
 93 |     else
 94 |         atria_markdown_help()
 95 |     end
 96 |     return 0
 97 | end
 98 | 
 99 | 
100 | 
101 | end  # module end
102 | 


--------------------------------------------------------------------------------
/src/BioBits/biosequences_safety.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | #=
  3 | Some functions, such as BioSequences._orphan!, Base.resize!, and
  4 | BioSequences.reverse_complement! were modified from BioSequences package
  5 | developped by BioJulia. Those functions have their own license:
  6 | 
  7 | MIT License
  8 | 
  9 | Copyright (c) 2018: BioJulia.
 10 | 
 11 | Permission is hereby granted, free of charge, to any person obtaining
 12 | a copy of this software and associated documentation files (the
 13 | "Software"), to deal in the Software without restriction, including
 14 | without limitation the rights to use, copy, modify, merge, publish,
 15 | distribute, sublicense, and/or sell copies of the Software, and to
 16 | permit persons to whom the Software is furnished to do so, subject to
 17 | the following conditions:
 18 | 
 19 | The above copyright notice and this permission notice shall be
 20 | included in all copies or substantial portions of the Software.
 21 | 
 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 23 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 24 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 25 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 27 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 28 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 29 | =#
 30 | 
 31 | @inline function isbitsafe(seq::LongDNA{4})
 32 |     unsafe_isbitsafe(seq) && 
 33 |     seq.data[end] == 0x0000000000000000 &&
 34 |     if length(seq.data) > 1 && seq.len % 16 != 0x0000000000000000
 35 |         (seq.data[end-1] >> (seq.len % 16 * 4) == 0x0000000000000000)
 36 |     else
 37 |         true
 38 |     end
 39 | end
 40 | 
 41 | @inline function unsafe_isbitsafe(seq::LongDNA{4})
 42 |     length(seq.data) == cld(seq.len, 16) + 1
 43 | end
 44 | 
 45 | """
 46 |     unsafe_extra_bits_to_zeros!(seq::LongDNA{4})
 47 | 
 48 | Caution: use only in bitsafe seq!
 49 | """
 50 | @inline function unsafe_extra_bits_to_zeros!(seq::LongDNA{4})
 51 |     if !isempty(seq)
 52 |         remain = (seq.len % 16)
 53 |         @inbounds if remain != 0
 54 |             seq.data[end-1] &= ~(0xffffffffffffffff << (remain * 4))
 55 |         end
 56 |     end
 57 |     @inbounds seq.data[end] = 0x0000000000000000
 58 |     return seq
 59 | end
 60 | 
 61 | """
 62 |     bitsafe!(seq::LongDNA{4})
 63 | 
 64 | Resize `seq.data` to allow loading a pointer `Ptr{UInt64}` safely at the end of `seq`.
 65 | 
 66 | Caution: bitsafe LongDNA{4} may not be compatible on all BioSequences functions, especially those do in-place replacement.
 67 | """
 68 | @inline function bitsafe!(seq::LongDNA{4})
 69 |     if !unsafe_isbitsafe(seq)
 70 |         resize!(seq.data, cld(seq.len, 16) + 1)
 71 |     end
 72 |     unsafe_extra_bits_to_zeros!(seq)
 73 | end
 74 | 
 75 | """
 76 |     resize!(seq::LongDNA{4}, size::Int[, force::Bool=false])
 77 | 
 78 | It overrides `resize!` in BioSequences. Resize a biological sequence `seq`, to a given `size`. The underlying data is bitsafe.
 79 | """
 80 | @inline function Base.resize!(seq::LongSequence{A}, size::Int, force::Bool=false) where {A}
 81 |     if size < 0
 82 |         throw(ArgumentError("size must be non-negative"))
 83 |     else
 84 |         if force | (BioSequences.seq_data_len(A, size) > BioSequences.seq_data_len(A, length(seq)))
 85 |             resize!(seq.data, BioSequences.seq_data_len(A, size))
 86 |         end
 87 |         seq.len = size
 88 |         bitsafe!(seq)
 89 |     end
 90 | end
 91 | 
 92 | function BioSequences.reverse_complement!(seq::LongSequence{<:NucleicAcidAlphabet})
 93 |     pred = x -> BioSequences.complement_bitpar(x, Alphabet(seq))
 94 |     BioSequences.reverse_data!(pred, seq.data, BioSequences.seq_data_len(seq) % UInt, BioSequences.BitsPerSymbol(seq))
 95 |     BioSequences.zero_offset!(seq)
 96 |     bitsafe!(seq)
 97 | end
 98 | 
 99 | function BioSequences.reverse_complement(seq::LongSequence{<:NucleicAcidAlphabet})
100 |     cp = typeof(seq)(undef, unsigned(length(seq)))
101 |     pred = x -> BioSequences.complement_bitpar(x, Alphabet(seq))
102 |     BioSequences.reverse_data_copy!(pred, cp.data, seq.data, BioSequences.seq_data_len(seq) % UInt, BioSequences.BitsPerSymbol(seq))
103 |     BioSequences.zero_offset!(cp)
104 |     bitsafe!(cp)
105 | end
106 | 


--------------------------------------------------------------------------------
/benchmark/real-data-rnaseq.bash:
--------------------------------------------------------------------------------
  1 | 
  2 | ##### SRR330569: RNA-seq D. simulans
  3 | working_dir=~/analysis/atria-benchmark/SRR330569
  4 | cd $working_dir
  5 | 
  6 | r1=SRR330569.3_1.fastq.gz
  7 | r2=SRR330569.3_2.fastq.gz
  8 | a1=AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCG
  9 | a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGAT
 10 | bwa_ref=`pwd`/genomes/dsim-all-chromosome-r2.02.fasta
 11 | 
 12 | # download reference
 13 | mkdir genomes
 14 | # wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/754/195/GCF_000754195.2_ASM75419v2/GCF_000754195.2_ASM75419v2_genomic.fna.gz -O $bwa_ref_genbank.gz
 15 | wget ftp://ftp.flybase.net/genomes/Drosophila_simulans/dsim_r2.02_FB2020_03/fasta/dsim-all-chromosome-r2.02.fasta.gz -O $bwa_ref.gz
 16 | 
 17 | gzip -d $bwa_ref.gz
 18 | 
 19 | # build reference
 20 | hisat2-build $bwa_ref $bwa_ref-hisat2
 21 | 
 22 | 
##### Pipelines

# Load the helper functions used below (run_*, mapping_hisat2, pasteSamtoolsStats
# are presumably defined there -- they are not defined in this script).
# NOTE(review): $atria is not set in this script; it must come from the environment.
. $atria/benchmark/trimming-functions.bash

# Start from a fresh stderr log; every trimmer appends its stderr to it.
# NOTE(review): the argument 8 is presumably the thread count -- confirm in
# trimming-functions.bash.
rm -f stderr.log
run_atria 8 2>> stderr.log

run_atria_consensus 8 2>> stderr.log

run_adapterremoval 8 2>> stderr.log

run_skewer 8 2>> stderr.log

run_trim_galore 8 2>> stderr.log

run_trimmomatic 8 2>> stderr.log

run_ktrim 8 2>> stderr.log
# Compress the Ktrim outputs so the .fq.gz links created below resolve.
pigz Ktrim/ktrim.read1.fq Ktrim/ktrim.read2.fq

run_fastp 8 2>> stderr.log

run_seqpurge 8 2>> stderr.log

run_atropos  8 2>> stderr.log

# mapping without qualtrim
# Give every trimmer's output a uniform name (<trimmer>.R1/R2.fastq.gz) inside
# trimmed/ via symlinks; link targets are relative to the trimmed/ directory.
mkdir -p trimmed
ln -s ../AdapterRemoval-3/adapterremoval.pair1.truncated.gz trimmed/adapterremoval.R1.fastq.gz
ln -s ../AdapterRemoval-3/adapterremoval.pair2.truncated.gz trimmed/adapterremoval.R2.fastq.gz

# ${r1/.fastq*/} strips everything from ".fastq" onwards from the input file name.
ln -s ../Atria/${r1/.fastq*/}.atria.fastq.gz trimmed/atria.R1.fastq.gz
ln -s ../Atria/${r2/.fastq*/}.atria.fastq.gz trimmed/atria.R2.fastq.gz

ln -s ../Atria-consensus/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R1.fastq.gz
ln -s ../Atria-consensus/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R2.fastq.gz

ln -s ../Skewer/Skewer-trimmed-pair1.fastq.gz trimmed/Skewer.R1.fastq.gz
ln -s ../Skewer/Skewer-trimmed-pair2.fastq.gz trimmed/Skewer.R2.fastq.gz

ln -s ../TrimGalore/${r1/.fastq*/}_val_1.fq.gz trimmed/trimgalore.R1.fastq.gz
ln -s ../TrimGalore/${r2/.fastq*/}_val_2.fq.gz trimmed/trimgalore.R2.fastq.gz

ln -s ../Trimmomatic/out-pair1.paired.fq.gz trimmed/trimmomatic.R1.fastq.gz
ln -s ../Trimmomatic/out-pair2.paired.fq.gz trimmed/trimmomatic.R2.fastq.gz

ln -s ../Ktrim/ktrim.read1.fq.gz trimmed/ktrim.R1.fastq.gz
ln -s ../Ktrim/ktrim.read2.fq.gz trimmed/ktrim.R2.fastq.gz

ln -s ../fastp/out.fastp.r1.fq.gz trimmed/fastp.R1.fastq.gz
ln -s ../fastp/out.fastp.r2.fq.gz trimmed/fastp.R2.fastq.gz

ln -s ../SeqPurge/SRR330569.3_1.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R1.fastq.gz
ln -s ../SeqPurge/SRR330569.3_2.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R2.fastq.gz

ln -s ../Atropos/SRR330569.3_1.fastq.gz.atropos.fq.gz trimmed/atropos.R1.fastq.gz
ln -s ../Atropos/SRR330569.3_2.fastq.gz.atropos.fq.gz trimmed/atropos.R2.fastq.gz


# mapping after qualtrim
# Run Atria on all adapter-trimmed files with adapter trimming, tail-N trimming
# and length filtration switched off and a quality score of $QSCORE, writing to
# trimmed-qualtrim/.
QSCORE=15
time atria -r trimmed/*.R1.fastq.gz -R trimmed/*.R2.fastq.gz -t 5 -p 6 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier
# Rename the outputs so the quality threshold is part of the file name.
rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq*

# time atria -r trimmed/ktrim.R1.fastq.gz trimmed/fastp.R1.fastq.gz trimmed/seqpurge.R1.fastq.gz trimmed/atropos.R1.fastq.gz -R trimmed/ktrim.R2.fastq.gz trimmed/fastp.R2.fastq.gz trimmed/seqpurge.R2.fastq.gz trimmed/atropos.R2.fastq.gz -t 7 -p 4 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier
# rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq*

# trimmed*/{atropos,fastp,ktrim,seqpurge}.R1*fastq.gz
# Map every R1/R2 pair in trimmed/ and trimmed-qualtrim/; the R2 name is derived
# from the R1 name. All loop output is also copied to mapping.log.
for i in trimmed*/*.R1*fastq.gz
do
	echo $i
	mapping_hisat2 $i ${i/.R1/.R2}

	# samtools stats runs in the background while pigz compresses the same SAM file.
	# NOTE(review): no explicit `wait` is visible before the stats are pasted
	# below -- confirm the background jobs have finished by then.
	samtools stats $i.hisat2.sam > $i.hisat2.sam.samtools-stats &
	pigz $i.hisat2.sam
done 2>&1 | tee mapping.log


# Combine the per-sample samtools statistics in each output directory.
cd trimmed
pasteSamtoolsStats *samtools-stats
cd ..


cd trimmed-qualtrim
pasteSamtoolsStats *samtools-stats
cd ..
109 | 


--------------------------------------------------------------------------------
/benchmark/real-data-human.bash:
--------------------------------------------------------------------------------
  1 | 
  2 | ##### SRR330569: RNA-seq D. simulans
  3 | working_dir=~/analysis/atria-benchmark/ERR4695159
  4 | cd $working_dir
  5 | 
  6 | r1=ERR4695159_1.fastq.gz
  7 | r2=ERR4695159_2.fastq.gz
  8 | a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
  9 | a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
 10 | bwa_ref=`pwd`/genomes/hg38.fasta.gz
 11 | 
 12 | # download reference
 13 | mkdir genomes
 14 | wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz -O $bwa_ref
 15 | 
 16 | gzip -d $bwa_ref.gz
 17 | 
 18 | # build reference
 19 | bowtie2-build $bwa_ref $bwa_ref-bowtie2
 20 | 
 21 | 
##### Pipelines

# Load the helper functions used below (run_*, mapping_bowtie2, pasteSamtoolsStats
# are presumably defined there -- they are not defined in this script).
# NOTE(review): $atria is not set in this script; it must come from the environment.
. $atria/benchmark/trimming-functions.bash

# Start from a fresh stderr log; every trimmer appends its stderr to it.
# NOTE(review): the argument 8 is presumably the thread count -- confirm in
# trimming-functions.bash.
rm -f stderr.log
run_atria 8 2>> stderr.log

run_atria_consensus 8 2>> stderr.log

run_adapterremoval 8 2>> stderr.log

run_skewer 8 2>> stderr.log

run_trim_galore 8 2>> stderr.log

run_trimmomatic 8 2>> stderr.log

run_ktrim 8 2>> stderr.log
# Compress the Ktrim outputs so the .fq.gz links created below resolve.
pigz Ktrim/ktrim.read1.fq Ktrim/ktrim.read2.fq

run_fastp 8 2>> stderr.log

run_seqpurge 8 2>> stderr.log

run_atropos 8 2>> stderr.log

# mapping without qualtrim
# Give every trimmer's output a uniform name (<trimmer>.R1/R2.fastq.gz) inside
# trimmed/ via symlinks; link targets are relative to the trimmed/ directory.
mkdir -p trimmed
ln -s ../AdapterRemoval-3/adapterremoval.pair1.truncated.gz trimmed/adapterremoval.R1.fastq.gz
ln -s ../AdapterRemoval-3/adapterremoval.pair2.truncated.gz trimmed/adapterremoval.R2.fastq.gz

# ${r1/.fastq*/} strips everything from ".fastq" onwards from the input file name.
ln -s ../Atria/${r1/.fastq*/}.atria.fastq.gz trimmed/atria.R1.fastq.gz
ln -s ../Atria/${r2/.fastq*/}.atria.fastq.gz trimmed/atria.R2.fastq.gz

ln -s ../Atria-consensus/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R1.fastq.gz
ln -s ../Atria-consensus/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-consensus.R2.fastq.gz

ln -s ../Skewer/Skewer-trimmed-pair1.fastq.gz trimmed/Skewer.R1.fastq.gz
ln -s ../Skewer/Skewer-trimmed-pair2.fastq.gz trimmed/Skewer.R2.fastq.gz

ln -s ../TrimGalore/${r1/.fastq*/}_val_1.fq.gz trimmed/trimgalore.R1.fastq.gz
ln -s ../TrimGalore/${r2/.fastq*/}_val_2.fq.gz trimmed/trimgalore.R2.fastq.gz

ln -s ../Trimmomatic/out-pair1.paired.fq.gz trimmed/trimmomatic.R1.fastq.gz
ln -s ../Trimmomatic/out-pair2.paired.fq.gz trimmed/trimmomatic.R2.fastq.gz

ln -s ../Ktrim/ktrim.read1.fq.gz trimmed/ktrim.R1.fastq.gz
ln -s ../Ktrim/ktrim.read2.fq.gz trimmed/ktrim.R2.fastq.gz

ln -s ../fastp/out.fastp.r1.fq.gz trimmed/fastp.R1.fastq.gz
ln -s ../fastp/out.fastp.r2.fq.gz trimmed/fastp.R2.fastq.gz

ln -s ../SeqPurge/ERR4695159_1.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R1.fastq.gz
ln -s ../SeqPurge/ERR4695159_2.fastq.gz.seqpurge.fq.gz trimmed/seqpurge.R2.fastq.gz

ln -s ../Atropos/ERR4695159_1.fastq.gz.atropos.fq.gz trimmed/atropos.R1.fastq.gz
ln -s ../Atropos/ERR4695159_2.fastq.gz.atropos.fq.gz trimmed/atropos.R2.fastq.gz

# ln -s ../Atria-v3.1.4/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-v3.1.4.R1.fastq.gz
# ln -s ../Atria-v3.1.4/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-v3.1.4.R2.fastq.gz

# ln -s ../Atria-consensus-v3.1.4/${r1/.fastq*/}.atria.fastq.gz trimmed/atria-consensus-v3.1.4.R1.fastq.gz
# ln -s ../Atria-consensus-v3.1.4/${r2/.fastq*/}.atria.fastq.gz trimmed/atria-consensus-v3.1.4.R2.fastq.gz

# mapping after qualtrim
# Run Atria on all adapter-trimmed files with adapter trimming, tail-N trimming
# and length filtration switched off and a quality score of $QSCORE, writing to
# trimmed-qualtrim/.
QSCORE=15
time atria -r trimmed/*.R1.fastq.gz -R trimmed/*.R2.fastq.gz -t 5 -p 6 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier
# Rename the outputs so the quality threshold is part of the file name.
rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq*

# time atria -r trimmed/ktrim.R1.fastq.gz trimmed/fastp.R1.fastq.gz trimmed/seqpurge.R1.fastq.gz trimmed/atropos.R1.fastq.gz -R trimmed/ktrim.R2.fastq.gz trimmed/fastp.R2.fastq.gz trimmed/seqpurge.R2.fastq.gz trimmed/atropos.R2.fastq.gz -t 7 -p 4 -o trimmed-qualtrim --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration --quality-score $QSCORE --check-identifier
# rename --force "s/atria.fastq/qual$QSCORE.fastq/" trimmed-qualtrim/*fastq*

# trimmed*/{atropos,fastp,ktrim,seqpurge}.R1*fastq.gz
# Map every R1/R2 pair in trimmed/ and trimmed-qualtrim/; the R2 name is derived
# from the R1 name. All loop output is also copied to mapping.log.
for i in trimmed*/*.R1*fastq.gz
do
	echo $i
	mapping_bowtie2 $i ${i/.R1/.R2}

	# samtools stats is started in the background for each BAM.
	# NOTE(review): no explicit `wait` is visible before the stats are pasted
	# below -- confirm the background jobs have finished by then.
	samtools stats $i.bowtie2.bam > $i.bowtie2.bam.samtools-stats &
done 2>&1 | tee mapping.log


# Combine the per-sample samtools statistics in each output directory.
cd trimmed
pasteSamtoolsStats *samtools-stats
cd ..


cd trimmed-qualtrim
pasteSamtoolsStats *samtools-stats
cd ..
112 | 


--------------------------------------------------------------------------------
/src/BioBits/insert_size_decision.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | @inline function insert_size_decision(a_insert_size::Int64, a_score::Float64, b_insert_size::Int64, b_score::Float64; insert_size_diff::Int64 = 0)
  3 |     if a_insert_size == b_insert_size
  4 |         insert_size = a_insert_size
  5 |         score = a_score + b_score
  6 |     elseif abs(a_insert_size - b_insert_size) <= insert_size_diff
  7 |         insert_size = min(a_insert_size, b_insert_size)
  8 |         score = a_score + b_score
  9 |     elseif a_score > b_score
 10 |         insert_size = a_insert_size
 11 |         score = a_score
 12 |     else
 13 |         insert_size = b_insert_size
 14 |         score = b_score
 15 |     end
 16 |     insert_size, score
 17 | end
 18 | 
 19 | 
 20 | @inline function insert_size_decision_separate(a_insert_size::Int64, a_score::Float64, b_insert_size::Int64, b_score::Float64; insert_size_diff::Int64 = 0)
 21 |     if abs(a_insert_size - b_insert_size) <= insert_size_diff
 22 |         score = a_score + b_score
 23 |         # insert sizes not changed
 24 |     # NOTE: remove the following elseif because
 25 |     # elseif abs(a_score - b_score) < score_diff 0 <= score_diff <= 3 get the highest result.
 26 |     #     score = (a_score + b_score) / 2
 27 |     #     # choose the min insert size for both a and b
 28 |     #     if a_insert_size > b_insert_size
 29 |     #         a_insert_size = b_insert_size
 30 |     #     else
 31 |     #         b_insert_size = a_insert_size
 32 |     #     end
 33 |     elseif a_score > b_score
 34 |         score = a_score
 35 |         b_insert_size = a_insert_size
 36 |     else
 37 |         score = b_score
 38 |         a_insert_size = b_insert_size
 39 |     end
 40 |     a_insert_size, b_insert_size, score
 41 | end
 42 | 
 43 | @inline function is_false_positive(r1_adapter_insert_size::Int64, r1_pe_insert_size::Int64, r1_length::Int64, r2_adapter_insert_size::Int64, r2_pe_insert_size::Int64, r2_length::Int64; insert_size_diff::Int64 = 0, tail_length::Int64 = 8)::Bool
 44 | 
 45 |     # skip running this function when length are different.
 46 |     if r1_length < max(r2_adapter_insert_size, r2_pe_insert_size) ||
 47 |         r2_length < max(r1_adapter_insert_size, r1_pe_insert_size)
 48 |         return false
 49 |     end
 50 | 
 51 |     r1_adapter_error = (r1_adapter_insert_size > r1_length - tail_length) | (r1_adapter_insert_size == -1)
 52 |     r1_pe_error = r1_pe_insert_size < r1_length - tail_length
 53 |     r1_error = r1_adapter_error & r1_pe_error
 54 | 
 55 |     r2_adapter_error = (r2_adapter_insert_size > r2_length - tail_length) | (r2_adapter_insert_size == -1)
 56 |     r2_pe_error = r2_pe_insert_size < r2_length - tail_length
 57 |     r2_error = r2_adapter_error & r2_pe_error
 58 | 
 59 |     r1_adapter_inrange = abs(r1_adapter_insert_size - r1_pe_insert_size) <= insert_size_diff
 60 |     r2_adapter_inrange = abs(r2_adapter_insert_size - r2_pe_insert_size) <= insert_size_diff
 61 | 
 62 |     not_false_positive = r1_adapter_inrange | r2_adapter_inrange
 63 | 
 64 |     (r1_error | r2_error) & !not_false_positive
 65 | end
 66 | 
 67 | """
 68 |     one_bp_check(r::LongDNA{4}, a::LongDNA{4}, nremain::Int64, length_to_check::Int64)
 69 | 
 70 | v3.0.0: When finishing matching, Atria might have 1 bp offset because of insert size decision. Check 1 bp offset of reads at adapter (`a`) position (`nremain + 1`) to adapter is necessary.
 71 | 
 72 | Return best nremain::Int64.
 73 | """
 74 | @inline function one_bp_check(r::LongDNA{4}, a::LongDNA{4}, nremain::Int64, length_to_check::Int64)
 75 |     n = length(r)
 76 |     if nremain >= n - 3 ## no need to check adapter when no adapter.
 77 |         return nremain
 78 |     end
 79 |     nmatch = unsafe_seq_identity(r, a, nremain + 1, 1, length_to_check)
 80 |     nmatch_left = unsafe_seq_identity(r, a, nremain, 1, length_to_check)
 81 |     nmatch_right = unsafe_seq_identity(r, a, nremain + 2, 1, length_to_check)
 82 |     if nmatch >= nmatch_left
 83 |         if nmatch >= nmatch_right
 84 |             nremain
 85 |         else
 86 |             nremain + 1
 87 |         end
 88 |     else
 89 |         if nmatch_left >= nmatch_right
 90 |             nremain - 1
 91 |         else
 92 |             nremain + 1
 93 |         end
 94 |     end
 95 | end
 96 | 
 97 | @inline function unsafe_seq_identity(a::LongDNA{4}, b::LongDNA{4}, ia::Int64, ib::Int64, max_check::Int64; max_a::Int64 = length(a))
 98 |     nmatch = 0
 99 |     ncheck = 0
100 |     @inbounds while ncheck < max_check && ia <= max_a
101 |         if a[ia] === b[ib]
102 |             nmatch += 1
103 |         end
104 |         ncheck +=1
105 |         ia += 1
106 |         ib += 1
107 |     end
108 |     nmatch
109 | end
110 | 


--------------------------------------------------------------------------------
/benchmark/atria-similate-for-atria-only.bash:
--------------------------------------------------------------------------------
  1 | 
  2 | cd ~/analysis/atria-benchmark/julia1.8.5-atria4.0.0
  3 | 
  4 | a1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
  5 | a2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
  6 | 
  7 | r1="reads_diff_indel.R1.fastq"
  8 | r2="reads_diff_indel.R2.fastq"
  9 | 
 10 | # r1="reads_diff_indel.R1.fastq.gz"
 11 | # r2="reads_diff_indel.R2.fastq.gz"
 12 | 
 13 | atria_old=atria
 14 | atria_new=/home/jc/projects/Atria-jl1.8/app-4.0.0_2023-11-10T09-58/bin/atria
 15 | 
 16 | 
 17 | . $atria/benchmark/trimming-functions.bash
 18 | 
 19 | # atria simulate --prefix reads_diff_indel --adapter1 $a1 --adapter2 $a2 --repeat 300000 --subsitution-rate 0.001 0.002 0.003 0.004 0.005 --insertion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 --deletion-rate 1.0e-5 2.0e-5 3.0e-5 4.0e-5 5.0e-5 -s 100 -i `seq 66 2 120`
 20 | atria simulate --prefix reads_diff_indel --adapter1 $a1 --adapter2 $a2 --repeat 300000 --subsitution-rate 0.001 0.003 0.005 --insertion-rate 1.0e-5 3.0e-5 5.0e-5 --deletion-rate 1.0e-5 3.0e-5 5.0e-5 -s 100 -i `seq 78 2 108`
 21 | 
 22 | NUM_READS=`wc -l reads_diff_indel.R1.fastq | cut -f1 -d" "`
 23 | NUM_BASES=`echo "$NUM_READS / 4 * 200" | bc`
 24 | echo NUM_BASES=$NUM_BASES
 25 | 
 26 | 
 27 | run_atria_src(){
 28 |     local num_threads=1
 29 |     local outdir=Atria-src
 30 |     if [[ $1 ]]; then
 31 |         num_threads=$1
 32 |     fi
 33 |     if [[ $2 ]]; then
 34 |         outdir=$2
 35 |     fi
 36 | 	export JULIA_NUM_THREADS=$num_threads
 37 | 	$time $atria/src/atria --no-consensus -t $num_threads -r $r1 -R $r2 -o $outdir --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration --adapter1 $a1 --adapter2 $a2 --force
 38 | }
 39 | 
 40 | run_atria(){
 41 |     local num_threads=1
 42 |     if [[ $1 ]]; then
 43 |         num_threads=$1
 44 |     fi
 45 |     $time -v $atria_old --no-consensus -t $num_threads \
 46 |         -r $r1 -R $r2 \
 47 |         -o Atria-old \
 48 |         --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \
 49 |         --adapter1 $a1 --adapter2 $a2 --force
 50 | }
 51 | 
 52 | run_atria_consensus(){
 53 |     local num_threads=1
 54 |     if [[ $1 ]]; then
 55 |         num_threads=$1
 56 |     fi
 57 |     $time -v $atria_old -t $num_threads \
 58 |         -r $r1 -R $r2 \
 59 |         -o Atria-consensus-old \
 60 |         --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \
 61 |         --adapter1 $a1 --adapter2 $a2 --force
 62 | }
 63 | 
 64 | run_atria_new(){
 65 |     local num_threads=1
 66 |     if [[ $1 ]]; then
 67 |         num_threads=$1
 68 |     fi
 69 |     $time -v $atria_new --no-consensus -t $num_threads \
 70 |         -r $r1 -R $r2 \
 71 |         -o Atria-new \
 72 |         --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \
 73 |         --adapter1 $a1 --adapter2 $a2 --force
 74 | }
 75 | 
 76 | run_atria_consensus_new(){
 77 |     local num_threads=1
 78 |     if [[ $1 ]]; then
 79 |         num_threads=$1
 80 |     fi
 81 |     $time -v $atria_new \
 82 |         -r $r1 -R $r2 \
 83 |         -o Atria-consensus-new \
 84 |         --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \
 85 |         --adapter1 $a1 --adapter2 $a2 -t $num_threads --force
 86 | }
 87 | 
 88 | 
 89 | echo "" 2> stderr.base.log
 90 | run_atria 1 2>> stderr.base.log
 91 | run_atria 2 2>> stderr.base.log
 92 | run_atria 4 2>> stderr.base.log
 93 | run_atria 8 2>> stderr.base.log
 94 | run_atria 16 2>> stderr.base.log
 95 | 
 96 | run_atria_consensus 1 2>> stderr.base.log
 97 | run_atria_consensus 2 2>> stderr.base.log
 98 | run_atria_consensus 4 2>> stderr.base.log
 99 | run_atria_consensus 8 2>> stderr.base.log
100 | run_atria_consensus 16 2>> stderr.base.log
101 | 
102 | echo "" 2> stderr.dev.log
103 | 
104 | 
105 | run_atria_new 1 2>> stderr.dev.log
106 | run_atria_new 2 2>> stderr.dev.log
107 | run_atria_new 4 2>> stderr.dev.log
108 | run_atria_new 8 2>> stderr.dev.log
109 | run_atria_new 16 2>> stderr.dev.log
110 | 
111 | run_atria_consensus_new 1 2>> stderr.dev.log
112 | run_atria_consensus_new 2 2>> stderr.dev.log
113 | run_atria_consensus_new 4 2>> stderr.dev.log
114 | run_atria_consensus_new 8 2>> stderr.dev.log
115 | run_atria_consensus_new 16 2>> stderr.dev.log
116 | 
117 | # run_atria 16 2>> stderr.log
118 | # run_atria_new 16 2>> stderr.log
119 | # run_atria_consensus 16 2>> stderr.log
120 | # run_atria_consensus_new 16 2>> stderr.log
121 | 
122 | ll */*fastq
123 | 
124 | for i in *
125 | do
126 |     if [[ -d $i ]]
127 |     then
128 |         atria readstat $i/*.f*q &
129 |     fi
130 | done
131 | wait
132 | 
133 | atria statplot -i auto -l DIR -F
134 | 
135 | cat stderr.base.log stderr.dev.log > std_all.log
136 | pasteTimeOutput std_all.log > time_benchmark.txt
137 | $atria/benchmark/time_stats.jl time_benchmark.txt $NUM_BASES > time_benchmark.df.txt
138 | wps time_benchmark.df.txt & 


--------------------------------------------------------------------------------
/src/Trimmer/wrapper_detect_adapter_se.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | # f_procs(x::String) = x == "-p" || x == "--procs"
  3 | 
  4 | function julia_wrapper_detect_adapter_se(ARGS::Vector{String}; exit_after_help = true)
  5 | 
  6 |     args = parsing_args(ARGS; exit_after_help = exit_after_help)
  7 | 
  8 |     if args === nothing  # ARGS is ["-h"]
  9 |         return 0
 10 |     end
 11 |     args_range_test(args)
 12 | 
 13 | 
 14 |     #================== Arguments ====================#
 15 | 
 16 |     nthread                  =  args["threads"            ]
 17 |     max_chunk_size           =  2 ^ args["log2-chunk-size"]
 18 | 
 19 |     #================== Main function and common variables ====================#
 20 | 
 21 |     in1bytes = Vector{UInt8}(undef, max_chunk_size)
 22 | 
 23 |     # number of jobs to boxing FqRecord from UInt8 Vector
 24 |     njobs = nthread * 5
 25 |     vr1s = ntuple(_ -> Vector{FqRecord}(), njobs)
 26 | 
 27 |     r1s = Vector{FqRecord}()
 28 | 
 29 | 
 30 |     #================== Iteration for files ====================#
 31 | 
 32 |     append!(args["read1"], args["read2"])
 33 | 
 34 |     for file1 in args["read1"]
 35 | 
 36 |         #===== file names =====#
 37 | 
 38 |         isingzip = occursin(r"\.gz$"i, file1)
 39 |         isinbzip2 = occursin(r"\.bz2$"i, file1)
 40 | 
 41 |         #===== file IO =====#
 42 | 
 43 |         if isingzip
 44 |             io1 = open(`pigz -p$nthread -cd $file1`, write=false)
 45 |         elseif isinbzip2
 46 |             io1 = open(`pbzip2 -p$nthread -cd $file1`, write=false)
 47 |         else
 48 |             io1 = open(file1, "r")
 49 |         end
 50 | 
 51 | 
 52 |         #================== Renew variables for read processing ====================#
 53 | 
 54 |         # clear common variables
 55 |         empty!(r1s)
 56 | 
 57 |         n_reads = 0
 58 |         n_r1 = 0
 59 |         in1bytes_nremain = 0
 60 |         task_r1s_unbox = Threads.@spawn 1
 61 |         
 62 |         #================== File processing ====================#
 63 | 
 64 |         # the first cycle to generate compiled code?
 65 |         function cycle_wrapper_detect_adapter()
 66 | 
 67 |             if typeof(io1) <: IOStream  # not compressed
 68 |                 (n_r1, r1s, ncopied) = load_fqs_threads!(io1, in1bytes, vr1s, r1s, task_r1s_unbox; remove_first_n = n_reads, njobs=njobs)
 69 |             else  # gziped
 70 |                 (n_r1, r1s, in1bytes_nremain, ncopied) = load_fqs_threads!(
 71 |                     io1,
 72 |                     in1bytes,
 73 |                     in1bytes_nremain,
 74 |                     vr1s,
 75 |                     r1s,
 76 |                     task_r1s_unbox;
 77 |                     remove_first_n = n_reads,
 78 |                     njobs = njobs
 79 |                 )
 80 |             end
 81 | 
 82 |             top5, headers = detect_adapter_threads!(r1s)
 83 | 
 84 |             adapter_frequency = top5[1,2] / n_r1
 85 |             if adapter_frequency < 0.0004
 86 |                 @info "$file1:\n No adapter detected in the first $n_r1 reads."
 87 |             else
 88 |                 adapter_table = pretty_table(String, top5, header = headers)
 89 |                 @info "$file1:\n Top 5 adapters detected in the first $n_r1 reads:\n$adapter_table"
 90 |             end
 91 |         end
 92 | 
 93 |         cycle_wrapper_detect_adapter()
 94 | 
 95 |         #================== Close files ====================#
 96 | 
 97 |         close(io1)
 98 |     end
 99 | 
100 |     println("""
101 |     _________________________________
102 | 
103 |     Single-end Adapter Detection Note: 
104 |     
105 |     Atria detects adapter sequences using a known adapter file. Adapter sequences are truncated to 16-bp, which are accurate enough for trimming. From experiments of many popular trimmers, increasing adapter length from 16 to 33 does not increase accuracy (Figure 4C of https://doi.org/10.46471/gigabyte.31).
106 | 
107 |     Adapter detection is the last choice because its accuracy is highly based on your data. If your data has been trimmed, the remaining adapters may not be enough for accurate guessing. Also, Atria cannot find adapters not listed in the known adapter file. We suggest using adapter detection only when you cannot find the actual adapter sequence.
108 | 
109 |     Besides, Atria does not automatically trim auto-detected adapters. It is your responsibility to check whether the detected adapters are real.
110 |     
111 |     Those rules can be used to check the adapter results: 
112 |     
113 |     (1) An Illumina sequence file only has ONE adapter sequence. 
114 |     
115 |     (2) In the same batch of NGS experiments, all single-end samples should have the SAME adapter sequence. The most prevalent adapters might be true for all your single-end data.
116 |     _________________________________
117 | 
118 |     """)
119 | 
120 |     return 0
121 | end # func
122 | 


--------------------------------------------------------------------------------
/benchmark/evalTrimming.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | # This code is part of Skewer (https://sourceforge.net/projects/skewer/). The License:
  4 | #
  5 | # The MIT License (MIT)
  6 | #
  7 | # Copyright (c) 2013-2014 by Hongshan Jiang
  8 | #
  9 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 10 | # of this software and associated documentation files (the "Software"), to deal
 11 | # in the Software without restriction, including without limitation the rights
 12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 13 | # copies of the Software, and to permit persons to whom the Software is
 14 | # furnished to do so, subject to the following conditions:
 15 | #
 16 | # The above copyright notice and this permission notice shall be included in all
 17 | # copies or substantial portions of the Software.
 18 | #
 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 25 | # SOFTWARE.
 26 | 
 27 | use strict;
 28 | 
 29 | if(@ARGV < 3){
 30 | 	print STDERR "Usage: $0 fullLen lengths.tab file1.fastq [file2.fastq [lengths2.tab]] > summary\n";
 31 | 	exit(1);
 32 | }
 33 | 
 34 | my ($full_len, $tab_file, $file1, $file2, $tab_file2) = @ARGV;
 35 | our ($tp, $tn, $fp, $fp2, $fn, $fn2) = (0, 0, 0, 0, 0, 0);
 36 | 
 37 | open(TAB, "<$tab_file") or die("Can not open $tab_file for reading\n");
 38 | 
 39 | if(!open(IN, "<$file1")){
 40 | 	close TAB;
 41 | 	die("Can not open $file1 for reading\n");
 42 | }
 43 | 
 44 | my $id = &calcMetrics(\*TAB, \*IN);
 45 | die("read $id is in $file1 but not in $tab_file\n") if(defined $id);
 46 | 
 47 | close IN;
 48 | if(defined $file2){
 49 | 	my $bUseTab2;
 50 | 	if(defined $tab_file2){
 51 | 		if(open(TAB2, "<$tab_file2")){
 52 | 			$bUseTab2 = 1;
 53 | 		}
 54 | 		else{
 55 | 			print STDERR "Warning: Can not $tab_file2 for reading, using $tab_file instead\n";
 56 | 			$bUseTab2 = 0;
 57 | 		}
 58 | 	}
 59 | 	else{
 60 | 		$bUseTab2 = 0;
 61 | 	}
 62 | 	#
 63 | 	if(open(IN, "<$file2")){
 64 | 		my $fh = $bUseTab2 ? \*TAB2 : \*TAB;
 65 | 		my $fname = $bUseTab2 ? $tab_file2 : $tab_file;
 66 | 	    $id = &calcMetrics($fh, \*IN);
 67 | 		die("read $id is in $file2 but not in $fname\n") if(defined $id);
 68 | 	}
 69 | 	else{
 70 | 		print STDERR "Warning: Can not $file2 for reading, using information in $file1 only\n";
 71 | 	}
 72 | 	if($bUseTab2){
 73 | 		close TAB2;
 74 | 	}
 75 | }
 76 | 
 77 | close TAB;
 78 | 
 79 | my $ppv = ($tp+$fp+$fp2+$fn2) > 0 ? $tp/($tp+$fp+$fp2+$fn2) : 0;
 80 | my $sen = ($tp+$fn+$fp2+$fn2) > 0 ? $tp/($tp+$fn+$fp2+$fn2) : 0;
 81 | my $spec = $tn/($tn+$fp);
 82 | my $dom = sqrt(($tp+($fp+$fp2))*($tp+($fn+$fn2))*($tn+($fp+$fp2))*($tn+($fn+$fn2)));
 83 | my $cc = ($dom > 0) ? (($tp * $tn - ($fp+$fp2) * ($fn+$fn2)) / $dom) : 0;
 84 | my $fpr = (1 - $spec);
 85 | print "TP\tFP_ft\tFP_ot\tFN_fr\tFN_ut\tTN\tPPV\tSen.\tSpec.\tmCC\n";
 86 | print "$tp\t$fp\t$fp2\t$fn\t$fn2\t$tn\t$ppv\t$sen\t$spec\t$cc\n";
 87 | print "(FPR, TPR) = ($fpr, $sen)\n";
 88 | 
 89 | exit(0);
 90 | 
 91 | sub calcMetrics
 92 | {
 93 |     my ($fh_tab, $fh_file) = @_;
 94 | 	our ($tp, $tn, $fp, $fp2, $fn, $fn2);
 95 | 
 96 |     my $line;
 97 |     my ($id, $len);
 98 |     my ($id2, $len2, $seq);
 99 |     while($line = <$fh_tab>){
100 |         chomp($line);
101 |        ($id, $len) = split(/\t/, $line);
102 | 
103 |        $id2 = <$fh_file>; chomp($id2);
104 |        $seq = <$fh_file>; chomp($seq);
105 |        <$fh_file>; <$fh_file>;
106 |        ($id2) = split(/\//, substr($id2,1));
107 |        while($id2 ne $id){
108 |            if($len == 0){
109 |                $tp++;
110 |            }
111 |            else{
112 | 			   if($len == $full_len){
113 | 	               $fp++;
114 | 			   }
115 | 			   else{
116 | 				   $fp2++;
117 | 			   }
118 |            }
119 | 		   if(!($line=<$fh_tab>)){
120 | 			   return $id2;
121 | 	           #die("read $id2 is $file1 but not in $tab_file\n");
122 | 		   }
123 |            chomp($line);
124 |            ($id, $len) = split(/\t/, $line);
125 |        }
126 |        $len2 = length($seq);
127 |        if($len == $len2){
128 |            if($len == $full_len){
129 |                $tn++;
130 |            }
131 |            else{
132 |                $tp++;
133 |            }
134 |        }
135 |        else{ # $len != $len2
136 | 		   if($len < $len2){
137 | 			   if($len2 == $full_len){
138 | 				   $fn++;
139 | 			   }
140 | 			   else{
141 | 				   $fn2++;
142 | 			   }
143 | 		   }
144 | 		   else{ # $len > $len2
145 | 			   if($len == $full_len){
146 | 				   $fp++;
147 | 			   }
148 | 			   else{
149 | 				   $fp2++;
150 | 			   }
151 | 		   }
152 |        }
153 |     }
154 | 	return undef;
155 | }
156 | 


--------------------------------------------------------------------------------
/src/FqRecords/pcr_dedup.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | mutable struct DupCount
  3 |     @atomic count::Int
  4 |     id::String
  5 | end
  6 | 
  7 | const empty_id = ""
  8 | DupCount(count::Int) = DupCount(count, empty_id)
  9 | 
 10 | function write_pcr_dedup_count(out_pcr_dedup_count::AbstractString, dup_dict::Dict{Vector{UInt64}, DupCount})
 11 |     dup_count = 0
 12 |     open(out_pcr_dedup_count, "w+") do io
 13 |         println(io, "count\tid")
 14 |         for v in values(dup_dict)
 15 |             if v.count > 1
 16 |                 @inbounds println(io, "$(v.count)\t$(v.id)")
 17 |                 dup_count += v.count
 18 |             end
 19 |         end
 20 |     end
 21 |     dup_count
 22 | end
 23 | 
 24 | function get_dup_count(dup_dict::Dict)
 25 |     dup_count = 0
 26 |     for v in values(dup_dict)
 27 |         if v.count > 1
 28 |             dup_count += v.count
 29 |         end
 30 |     end
 31 |     dup_count
 32 | end
 33 | 
 34 | function write_pcr_hash_collision(out_pcr_hash_collision::AbstractString, hash_collision_dict::Dict{Vector{UInt64}, Set{Tuple{LongDNA{4},LongDNA{4}}}})
 35 |     open(out_pcr_hash_collision, "w+") do io
 36 |         for s in values(hash_collision_dict)
 37 |             if length(s) > 1
 38 |                 println(io, "\n", length(s))
 39 |                 for (s1,s2) in values(s)
 40 |                     println(io, "\t", s1, "\t", s2)
 41 |                 end
 42 |             end
 43 |         end
 44 |     end
 45 | end
 46 | 
 47 | function alphabet_dna_2bit()
 48 |     ab = Vector{UInt8}(undef, 16)
 49 |     fill!(ab, 0x01)  # unknown to C
 50 |     ab[reinterpret(UInt8, DNA_A)+1] = 0b00
 51 |     ab[reinterpret(UInt8, DNA_T)+1] = 0b11
 52 |     ab[reinterpret(UInt8, DNA_G)+1] = 0b10
 53 |     ab
 54 | end
 55 | const ALPHABET_DNA_2BIT = alphabet_dna_2bit()
 56 | 
 57 | function alphabet_2dna()
 58 |     ab = Vector{UInt8}(undef, 256)
 59 |     fill!(ab, 0b01 << 2 | 0b01)  # unknown to CC
 60 |     for x in (0b0001, 0b0010, 0b0100, 0b1000)
 61 |         x2 = ALPHABET_DNA_2BIT[x+1]
 62 |         for y in (0b0001, 0b0010, 0b0100, 0b1000)
 63 |             y2 = ALPHABET_DNA_2BIT[y+1]
 64 |             double_dna_8bit = x << 4 | y
 65 |             ab[double_dna_8bit+1] = x2 << 2 | y2
 66 |         end
 67 |     end
 68 |     ab
 69 | end
 70 | const ALPHABET_2DNA = alphabet_2dna()
 71 | 
 72 | 
 73 | """
 74 |     hash_dna(s1::LongDNA{4})
 75 | 
 76 | Hash DNA to `[num_bits_in_it; ::LongDNA{2}.data]`. Ambiguous/Gap DNA converts to C.
 77 | """
 78 | function hash_dna(s1::LongDNA{4})
 79 |     global ALPHABET_2DNA
 80 | 
 81 |     len = length(s1)
 82 |     data = zeros(UInt64, 1 + BioSequences.seq_data_len(DNAAlphabet{2}, len))
 83 |     
 84 |     count_bits = 0
 85 |     count_c = 0
 86 |     for x in s1.data
 87 |         count_bits += count_ones(x)
 88 |         count_c += count_ones(x & 0x2222222222222222)
 89 |     end
 90 | 
 91 |     dt_32 = reinterpret(reshape, UInt32, data)
 92 |     @inbounds dt_32[1] = UInt32(count_bits)
 93 |     @inbounds dt_32[2] = UInt32(count_c)
 94 | 
 95 |     dt_re = reinterpret(reshape, UInt8, data)
 96 | 
 97 |     s1_re = reinterpret(reshape, UInt8, s1.data)
 98 |     dt_re_offset = 8
 99 | 
100 |     double_dna_size = cld(len, 4)
101 |     @inbounds @simd for i in 1:double_dna_size
102 |         double_dna1 = ALPHABET_2DNA[s1_re[2i-1] + 1]
103 |         double_dna2 = ALPHABET_2DNA[s1_re[2i] + 1]
104 | 
105 |         dt_re[dt_re_offset + i] = double_dna2 << 4 | double_dna1
106 |     end
107 |     data
108 | end
109 | 
# Paired variant of `hash_dna`: the header word covers both sequences (set-bit
# count and count under mask 0x2222222222222222, summed over `s1` and `s2`);
# the re-encoded `s1` follows, and `s2` starts on the next 64-bit word
# boundary after it.
function hash_dna(s1::LongDNA{4}, s2::LongDNA{4})
    len1 = length(s1)
    len2 = length(s2)
    data_len1 = BioSequences.seq_data_len(DNAAlphabet{2}, len1)
    data_len2 = BioSequences.seq_data_len(DNAAlphabet{2}, len2)
    data = zeros(UInt64, 1 + data_len1 + data_len2)

    # ---- header word ----
    count_bits = 0
    count_c = 0
    for words in (s1.data, s2.data)
        for w in words
            count_bits += count_ones(w)
            count_c += count_ones(w & 0x2222222222222222)
        end
    end

    header = reinterpret(reshape, UInt32, data)
    @inbounds header[1] = UInt32(count_bits)
    @inbounds header[2] = UInt32(count_c)

    dest = reinterpret(reshape, UInt8, data)

    # ---- payload of s1, right after the 8 header bytes ----
    src1 = reinterpret(reshape, UInt8, s1.data)
    dest_offset = 8

    nbytes_out1 = cld(len1, 4)
    @inbounds for i in 1:nbytes_out1
        lo = ALPHABET_2DNA[src1[2i-1] + 1]
        hi = ALPHABET_2DNA[src1[2i] + 1]

        dest[dest_offset + i] = hi << 4 | lo
    end

    # ---- payload of s2, after the words reserved for s1 ----
    src2 = reinterpret(reshape, UInt8, s2.data)
    dest_offset += data_len1 * 8

    nbytes_out2 = cld(len2, 4)
    @inbounds for i in 1:nbytes_out2
        lo = ALPHABET_2DNA[src2[2i-1] + 1]
        hi = ALPHABET_2DNA[src2[2i] + 1]

        dest[dest_offset + i] = hi << 4 | lo
    end
    data
end


--------------------------------------------------------------------------------
/benchmark/time_stats.jl:
--------------------------------------------------------------------------------
  1 | #!julia
  2 | 
  3 | using DataFrames, CSV
  4 | 
  5 | if isempty(ARGS) || !isfile(ARGS[1])
  6 |     println("""
  7 |     Usage: $(@__FILE__) time_benchmark.txt num_bases [stderr.pigz.log]
  8 | 
  9 |     time_benchmark.txt is the result of `pasteTimeOutput` (see simulate-run-bench.bash);
 10 |     num_bases is the number of bases processed.
 11 |     stderr.pigz.log compensate the bug of GNU TIME which cannot stat the subprocess (pigz) of Julia. The file is the result of ```
 12 |         /usr/bin/time pigz -p 8 -c Atria-consensus/*atria.fq 1>/dev/null 2>> stderr.pigz.log
 13 |         /usr/bin/time pigz -p 8 -c Atria/*atria.fq 1>/dev/null 2>> stderr.pigz.log
 14 |         /usr/bin/time pigz -cd \$r1 \$r2 > /dev/null 2>> stderr.pigz.log
 15 |     ```
 16 | 
 17 |     Result output to stdout.
 18 |     """)
 19 |     exit()
 20 | end
 21 | 
 22 | function parse_numeric(x::String)
 23 |     float = tryparse(Float64, x)
 24 |     if !isnothing(float)
 25 |         return float
 26 |     end
 27 |     # parse percentage
 28 |     if occursin(r"\%$", x)
 29 |         float = tryparse(Float64, x[1:end-1])
 30 |         if !isnothing(float)
 31 |             return float/100
 32 |         end
 33 |     end
 34 |     # parse time as D:H:M:S.MS
 35 |     xs = split(x, ":") |> reverse!
 36 |     second = parse(Float64, xs[1])
 37 |     for i in 2:length(xs)
 38 |         if i == 2
 39 |             second += 60 * parse(Float64, xs[i])
 40 |         elseif i == 3
 41 |             second += 3600 * parse(Float64, xs[i])
 42 |         elseif i == 4
 43 |             second += 24 * 3600 * parse(Float64, xs[i])
 44 |         else
 45 |             error("Failed to parse $x as the time format D:H:M:S")
 46 |         end
 47 |     end
 48 |     second
 49 | end
 50 | 
 51 | const THREADS_STR = ["-threads", "-thread", "-cores", "-t"]
 52 | function get_threads(x; THREADS_STR=THREADS_STR)
 53 |     thread = 1
 54 |     for thread_str in THREADS_STR
 55 |         m = match(Regex("$thread_str[= ]*([\\d]+)"), x)
 56 |         isnothing(m) && continue
 57 |         if length(m.captures) == 1
 58 |             thread = parse(Int, m.captures[1])
 59 |         end
 60 |     end
 61 |     @warn "Set threads == $thread for command: $x"
 62 |     thread
 63 | end
 64 | 
 65 | df = CSV.File(ARGS[1], header=false) |> DataFrame
 66 | NUM_BASES = length(ARGS) >= 2 ? parse(Int, ARGS[2]) : 1
 67 | 
 68 | const USERTIME_STR = "User time (seconds): "
 69 | const SYSTEMTIME_STR = "System time (seconds): "
 70 | const CPU_STR = "Percent of CPU this job got: "
 71 | const ELAPSEDTIME_STR = "Elapsed (wall clock) time (h:mm:ss or m:ss): "
 72 | const MEMORY_STR = "Maximum resident set size (kbytes): "
 73 | 
 74 | USERTIME = findfirst(x -> typeof(x)<:AbstractString && occursin(USERTIME_STR, x), df[1,1:end])
 75 | SYSTEMTIME = findfirst(x -> typeof(x)<:AbstractString && occursin(SYSTEMTIME_STR, x), df[1,1:end])
 76 | CPU = findfirst(x -> typeof(x)<:AbstractString && occursin(CPU_STR, x), df[1,1:end])
 77 | ELAPSEDTIME = findfirst(x -> typeof(x)<:AbstractString && occursin(ELAPSEDTIME_STR, x), df[1,1:end])
 78 | MEMORY = findfirst(x -> typeof(x)<:AbstractString && occursin(MEMORY_STR, x), df[1,1:end])
 79 | 
 80 | usertimes = parse_numeric.(replace.(df[!, USERTIME], USERTIME_STR=>""))
 81 | systemtimes = parse_numeric.(replace.(df[!, SYSTEMTIME], SYSTEMTIME_STR=>""))
 82 | # cpus = parse_numeric.(replace.(df[!, CPU], CPU_STR=>""))
 83 | elapsedtimes = parse_numeric.(replace.(df[!, ELAPSEDTIME], ELAPSEDTIME_STR=>""))
 84 | memories = parse_numeric.(replace.(df[!, MEMORY], MEMORY_STR=>""))
 85 | threads = get_threads.(df[!,1])
 86 | 
 87 | if length(ARGS) == 3
 88 |     #=
 89 |     stderr.pigz.log compensate the bug of GNU TIME which cannot stat the subprocess (pigz) of Julia. The file is the result of ```
 90 |         /usr/bin/time pigz -p 8 -c Atria-consensus/*atria.fq 1>/dev/null 2>> stderr.pigz.log
 91 |         /usr/bin/time pigz -p 8 -c Atria/*atria.fq 1>/dev/null 2>> stderr.pigz.log
 92 |         /usr/bin/time pigz -cd $r1 $r2 > /dev/null 2>> stderr.pigz.log
 93 |     ```
 94 |     =#
 95 |     pigz_time_file = ARGS[3]
 96 |     usertimes_pigz = parse_numeric.(readlines(pipeline(`grep -oE "[0-9\.\:]+user" $pigz_time_file`, `sed 's/user//'`)))
 97 |     systemtimes_pigz = parse_numeric.(readlines(pipeline(`grep -oE "[0-9\.\:]+system" $pigz_time_file`, `sed 's/system//'`)))
 98 | 
 99 |     rows_atria = map(x -> occursin(r"atria", x), df[!,1])
100 |     rows_atria_no_consensus = map(x -> occursin(r"atria .*--no-consensus", x), df[!,1])
101 |     rows_atria_consensus = rows_atria .⊻ rows_atria_no_consensus
102 | 
103 |     # add decompressing time
104 |     usertimes[rows_atria] .+= usertimes_pigz[3]
105 |     systemtimes[rows_atria] .+= systemtimes_pigz[3]
106 | 
107 |     # add compressing time
108 |     usertimes[rows_atria_no_consensus] .+= usertimes_pigz[2]
109 |     systemtimes[rows_atria_no_consensus] .+= systemtimes_pigz[2]
110 | 
111 |     usertimes[rows_atria_consensus] .+= usertimes_pigz[1]
112 |     systemtimes[rows_atria_consensus] .+= systemtimes_pigz[1]
113 | end
114 | 
115 | cpus = @. (usertimes + systemtimes) / elapsedtimes
116 | efficiencies = @. NUM_BASES / elapsedtimes / cpus / 10^6 # M Bases/s/CPU
117 | speeds = NUM_BASES ./ elapsedtimes / 10^6 # M Bases/s
118 | 
119 | dfout = DataFrame(
120 |     "Threads" => threads,
121 |     "Command" => df[!,1],
122 |     "Efficiency (M Bases/s/CPU)" => efficiencies,
123 |     "Speed (M Bases/s)" => speeds,
124 |     "UserTime (s)" => usertimes,
125 |     "SystemTime (s)" => systemtimes,
126 |     "CPU" => cpus,
127 |     "ElapsedTime (s)" => elapsedtimes,
128 |     "MaxMemory (kB)" => memories
129 | )
130 | 
131 | sort!(dfout, :Threads)
132 | 
133 | CSV.write(stdout, dfout; delim='\t')
134 | 


--------------------------------------------------------------------------------
/src/FqRecords/util.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | #=
Some functions, such as BioSequences.throw_encode_error,
were modified from the BioSequences package developed by BioJulia.
  5 | Those functions have their own license:
  6 | 
  7 | MIT License
  8 | 
  9 | Copyright (c) 2018: BioJulia.
 10 | 
 11 | Permission is hereby granted, free of charge, to any person obtaining
 12 | a copy of this software and associated documentation files (the
 13 | "Software"), to deal in the Software without restriction, including
 14 | without limitation the rights to use, copy, modify, merge, publish,
 15 | distribute, sublicense, and/or sell copies of the Software, and to
 16 | permit persons to whom the Software is furnished to do so, subject to
 17 | the following conditions:
 18 | 
 19 | The above copyright notice and this permission notice shall be
 20 | included in all copies or substantial portions of the Software.
 21 | 
 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 23 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 24 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 25 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 27 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 28 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 29 | =#
 30 | 
 31 | @inline function check_identifier(r1_id::Vector{UInt8}, r2_id::Vector{UInt8})::Bool
 32 |     stop1 = findfirst(x -> x == 0x20 || x == 0x2f, r1_id)  # ' ' or '/'
 33 |     if isnothing(stop1)
 34 |         return r1_id == r2_id
 35 |     end
 36 |     stop1 -= 1  # do not count ' ' or '/'
 37 |     if length(r2_id) < stop1
 38 |         return false
 39 |     end
 40 |     @inbounds for i in 1:stop1
 41 |         if r1_id[i] != r2_id[i]
 42 |             return false
 43 |         end
 44 |     end
 45 |     true
 46 | end
 47 | 
 48 | @inline function check_identifier(r1::FqRecord, r2::FqRecord)::Bool
 49 |     check_identifier(r1.id, r2.id)
 50 | end
 51 | 
 52 | @noinline function throw_identifier_error(r1::FqRecord, r2::FqRecord)
 53 |     error("Identifiers of r1 and r2 are not the same!\n   R1: $(String(copy(r1.id)))\n   R2: $(String(copy(r2.id)))")
 54 | end
 55 | 
 56 | # only modify the error message.
 57 | # BioSequences: longsequences/copying.jl
 58 | # @noinline function BioSequences.throw_encode_error(A::BioSequences.Alphabet, src::AbstractArray{UInt8}, soff::Integer)
 59 | #     for i in 1:div(64, BioSequences.bits_per_symbol(A))
 60 | #         index = soff + i - 1
 61 | #         sym = src[index]
 62 | #         if BioSequences.ascii_encode(A, sym) & 0x80 == 0x80
 63 | #             # find the context around the error: one previous line and the current line
 64 | #             nsrc = length(src)
 65 | #             context_start = soff + i - 2
 66 | #             context_previous_line = true
 67 | #             context_end = soff + i
 68 | #             while context_start > 1
 69 | #                 char = src[context_start]
 70 | #                 if char == 0x0a # \n
 71 | #                     if context_previous_line
 72 | #                         context_previous_line = false
 73 | #                     else
 74 | #                         context_start += 1
 75 | #                         break
 76 | #                     end
 77 | #                 elseif soff - context_start > 300 + 300 * !context_previous_line
 78 | #                     break
 79 | #                 end
 80 | #                 context_start -= 1
 81 | #             end
 82 | #             while context_end < nsrc
 83 | #                 char = src[context_end]
 84 | #                 if char == 0x0a # \n or \r
 85 | #                     context_end -= 1
 86 | #                     break
 87 | #                 elseif context_end - soff > 100
 88 | #                     break
 89 | #                 end
 90 | #                 context_end += 1
 91 | #             end
 92 | #             context = String(copy(src[context_start:context_end]))
 93 |         
 94 | #             repr_char = if sym in UInt8('\a'):UInt8('\r') || sym in UInt8(' '):UInt8('~')
 95 | #                 " (char '$(Char(sym))')"
 96 | #             else
 97 | #                 ""
 98 | #             end
 99 | 
100 | #             error("Cannot encode byte $(repr(sym))$(repr_char) at index $(index) to $A. Is the input file valid? Does the disk have bad sections? The error is found in the following context:\n\n$context\n")
101 | #         end
102 | #     end
103 | #     @assert false "Expected error in encoding"
104 | # end
105 | 
# `true` when `b` is identical to the complement of `a`.
@inline iscomplement(a::DNA, b::DNA) = b === BioSequences.complement(a)
109 | 
110 | 
111 | # codes modified from Julia Base
112 | 
"""
    write_no_lock(s::IOStream, b::UInt8)
    write_no_lock(s::IOStream, a::Vector{UInt8})

Write a single byte or a whole byte vector to `s` by calling the `ios_*` C
routines directly, i.e. without the stream lock taken by `Base.write`.
Returns the result of the C call converted to `Int`.
"""
function write_no_lock(s::IOStream, b::UInt8)
    Int(ccall(:ios_putc, Cint, (Cint, Ptr{Cvoid}), b, s.ios))
end
function write_no_lock(s::IOStream, a::Vector{UInt8})
    # `UInt` (not `UInt64`) so that the `nb::UInt` method of
    # `unsafe_write_no_lock` is also found on 32-bit builds.
    GC.@preserve a unsafe_write_no_lock(s, pointer(a), UInt(sizeof(a)))
end
119 | # """
120 | #     unsafe_write_no_lock(io::IO, ref, nbytes::UInt)
121 | #
122 | # Copy `nbytes` from `ref` (converted to a pointer) into the `IO` object.
123 | #
124 | # It is recommended that subtypes `T<:IO` override the following method signature
125 | # to provide more efficient implementations:
126 | # `unsafe_write_no_lock(s::T, p::Ptr{UInt8}, n::UInt)`
127 | # """
128 | # function unsafe_write_no_lock(s::IO, p::Ptr{UInt8}, n::UInt)
129 | #     written::Int = 0
130 | #     for i = 1:n
131 | #         written += write(s, unsafe_load(p, i))
132 | #     end
133 | #     return written
134 | # end
# Copy `nb` bytes starting at `p` into `s` through `ios_write`; returns the
# number reported by the C routine as an `Int`. The caller must keep the
# memory behind `p` alive for the duration of the call.
@inline function unsafe_write_no_lock(s::IOStream, p::Ptr{UInt8}, nb::UInt)
    nwritten = ccall(:ios_write, Csize_t, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), s.ios, p, nb)
    return Int(nwritten)
end
138 | 
139 | # write(io::AbstractPipe, byte::UInt8) = write(Base.pipe_writer(io)::IO, byte)
140 | 


--------------------------------------------------------------------------------
/src/FqRecords/consensus.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | """
  3 |     pe_consensus!(r1::FqRecord, r2::FqRecord, r2_seq_rc::LongDNA{4}, insert_size::Int64; min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)
  4 | 
  5 | Paired-end consensus calling for read pairs with adapters trimmed. Return `is_consensused::Bool`.
  6 | """
  7 | function pe_consensus!(r1::FqRecord, r2::FqRecord, r2_seq_rc::LongDNA{4}, insert_size::Int64; min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)
  8 | 
  9 |     r1_seq = r1.seq
 10 |     r2_seq = r2.seq
 11 |     r1_length = length(r1_seq)
 12 |     r2_length = length(r2_seq_rc)
 13 | 
 14 |     # get the overlapped region
 15 |     if r2_length < insert_size
 16 |         r1_i = insert_size - r2_length + 1
 17 |         # check 8 bit alignment
 18 |         if r1_i % 2 == 1
 19 |             r2_seq_rc.len = r2_length
 20 |         else
 21 |             r1_i += 1
 22 |             deleteat!(r2_seq_rc, 1)
 23 |             BioSequences.unsafe_extra_bits_to_zeros!(r2_seq_rc)  # deleteat is not bitsafe, so have to use it.
 24 |             # r2_seq_rc.part = 2:r2_length
 25 |         end
 26 |     else
 27 |         deleteat!(r2_seq_rc, 1:r2_length-insert_size)
 28 |         BioSequences.unsafe_extra_bits_to_zeros!(r2_seq_rc)  # deleteat is not bitsafe, so have to use it.
 29 |         # r2_seq_rc.part = (r2_length-insert_size+1):r2_length
 30 |         r1_i = 1
 31 |     end
 32 |     length_overlap = min(length(r2_seq_rc), r1_length - r1_i + 1)
 33 |     length_overlap <= 0 && return false, 0.0
 34 | 
 35 |     # align r2_seq_rc.data and r1_seq.data
 36 |     if length(r2_seq_rc) != length_overlap  # when r1 length < insert size
 37 |         resize!(r2_seq_rc, length_overlap)
 38 |     end
 39 | 
 40 |     # Ptr{UInt32}: scan 8 bases each time
 41 |     p1 = get_pointer(0x0000000000000000, r1_seq)
 42 |     p2_rc = get_pointer(0x0000000000000000, r2_seq_rc)
 43 | 
 44 |     p1_offset = r1_i ÷ 2  # r1_i cannot be even, so r1_i -1 not necessary
 45 |     p2_rc_offset = 0
 46 |     offset_max = cld(length_overlap, 2)
 47 |     # the start of ncompatible, minus the score of extra tail match
 48 |     num_ones = length_overlap - cld(offset_max,8)*16
 49 | 
 50 |     max_num_ones = floor(Int, (1+min_ratio_mismatch) * length_overlap)
 51 | 
 52 |     while p2_rc_offset <= offset_max
 53 | 
 54 |         if num_ones > max_num_ones
 55 |             return false
 56 |         end
 57 | 
 58 |         # global ncompatible
 59 |         # global p1_offset
 60 |         # global p2_rc_offset
 61 |         # global num_ones
 62 |         bit1 = unsafe_load(p1 + p1_offset)
 63 |         num_ambiguous_bits = count_ones(bit1) - 16
 64 |         bit2 = N2gap(unsafe_load(p2_rc + p2_rc_offset))
 65 |         num_ones += count_ones(bit1|bit2) - num_ambiguous_bits
 66 | 
 67 |         p1_offset += 8
 68 |         p2_rc_offset += 8
 69 |     end
 70 | 
 71 |     # ratio_mismatch = (num_ones - length_overlap) / length_overlap
 72 |     # ratio_mismatch > min_ratio_mismatch && return false
 73 |     
 74 |     # equals to num_ones > (1-min_ratio_mismatch) * length_overlap && return false
 75 |     # see max_num_ones
 76 | 
 77 | 
 78 |     # start comsensus calling
 79 |     r1_end = min(r1_length, insert_size)
 80 |     r2_i = insert_size - r1_i + 1
 81 | 
 82 |     r1_qual = r1.qual
 83 |     r2_qual = r2.qual
 84 | 
 85 |     r1_prob = r1.prob
 86 |     r2_prob = r2.prob
 87 | 
 88 |     @inbounds while r1_i <= r1_end
 89 |         a = r1_seq[r1_i]
 90 |         b = r2_seq[r2_i]
 91 |         # if !((a | b) in (DNA_W, DNA_S)) # not complement
 92 |         if !iscomplement(a, b) # not complement
 93 |             a_prob = r1_prob[r1_i]
 94 |             b_prob = r2_prob[r2_i]
 95 |             if a_prob - b_prob > prob_diff
 96 |                 # modify b to a
 97 |                 r2_seq[r2_i] = complement(a)
 98 |                 r2_qual[r2_i] = r1_qual[r1_i]
 99 |                 r2_prob[r2_i] = a_prob
100 |             elseif b_prob - a_prob > prob_diff
101 |                 r1_seq[r1_i] = complement(b)
102 |                 r1_qual[r1_i] = r2_qual[r2_i]
103 |                 r1_prob[r1_i] = b_prob
104 |             end
105 |         end
106 |         r1_i += 1
107 |         r2_i -= 1
108 |     end
109 |     return true
110 | end
111 | 
"""
    pe_consensus!(r1::FqRecord, r2::FqRecord, r1_seq_rc::LongDNA{4}, r2_seq_rc::LongDNA{4}; kmer_tolerance::Int64 = 2, overlap_score::Float64 = 0.0, min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)

Paired-end consensus calling for read pairs without adapters. Check whether the read pair has an overlapped region first. Return `is_consensused::Bool`.

The overlap is located with `bitwise_scan` in both directions. `false` is returned when the
two scans disagree on the overlap length, or (only if `overlap_score > 0`) when the best
overlap score is below `overlap_score`. Otherwise the insert size is derived from the
overlap and the call is forwarded to the insert-size based `pe_consensus!` method.
"""
function pe_consensus!(r1::FqRecord, r2::FqRecord, r1_seq_rc::LongDNA{4}, r2_seq_rc::LongDNA{4}; kmer_tolerance::Int64 = 2, overlap_score::Float64 = 0.0, min_ratio_mismatch::Float64 = 0.2, prob_diff::Float64 = 0.0)

    r1_seq = r1.seq
    r2_seq = r2.seq
    r1_length = length(r1_seq)
    r2_length = length(r2_seq)

    # where the head of the other read's reverse complement is found in each read
    r1_ms = bitwise_scan(r2_seq_rc, r1_seq, 1, kmer_tolerance)
    r2_ms = bitwise_scan(r1_seq_rc, r2_seq, 1, kmer_tolerance)

    # Both scans must agree on the number of overlapping bases. Return a plain Bool, as
    # documented (these exits previously returned the tuple `(false, -1.0)`).
    r1_overlap_nbase = r1_length - r1_ms.idx + 1
    r2_overlap_nbase = r2_length - r2_ms.idx + 1
    r1_overlap_nbase != r2_overlap_nbase && return false

    # the score is only computed when a threshold was requested
    if overlap_score > 0
        r1_overlap_prob = probmean(r1, r1_ms.idx, r1_ms.idx + 15)
        r2_overlap_prob = probmean(r2, r2_ms.idx, r2_ms.idx + 15)

        max_overlap_score = max(r1_ms.ncompatible * r1_overlap_prob, r2_ms.ncompatible * r2_overlap_prob)
        max_overlap_score < overlap_score && return false
    end

    insert_size = r1_length + r2_length - r1_overlap_nbase

    return pe_consensus!(r1, r2, r2_seq_rc, insert_size; min_ratio_mismatch = min_ratio_mismatch, prob_diff = prob_diff)
end
148 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Atria Change Log
  2 | 
  3 | ## TODO
  4 | 
  5 | - Feature: multiple primer trimming.
  6 | - Feature: UMI trimming.
  7 | 
  8 | ## v4.1.4
  9 | 
 10 | - Change: compressed file: better infer inbyte size when `file` does not output actual original file size.
 11 | 
 12 | ## v4.1.3
 13 | 
 14 | - Fix: when the paired end files are compressed, read chunks did not resize, which led to excessive copy, and copy number might accumulate round by round.
 15 | 
 16 | ## v4.1.2
 17 | 
 18 | - Fix: do not throw error if input paired end files are empty when doing `--detect-adapter`.
 19 | 
 20 | ## v4.1.1
 21 | 
 22 | - Fix: `-z NUM -Z NUM` error when length to trim < 0.
 23 | 
 24 | ## v4.1.0
 25 | 
 26 | - Change: --length-range default change from 50:500 to 30:999999.
 27 | - Feature: HardClipEnd: new process to hard remove the last N bases.
 28 | - Change: names in processing order (--order -O) changed.
 29 | - Feature: PCRDedup: remove PCR duplicates from fastq files. The entire paired sequence is compared and hashed. This method requires a large amount of memory because it stores hashes of reads. To enable, use `--pcr-dedup`.
 30 | - Feature: processing stats are recorded in the json file.
 31 | - Fix: `polyX_tail_scan` algorithm now is more precise, and tailing Ns also count.
 32 | 
 33 | ## v4.0.3
 34 | 
 35 | - Fix: `--order` or `-O` option should accept multiple arguments.
 36 | 
 37 | ## v4.0.2
 38 | 
 39 | - Fix: `--detect-adapter` for paired reads: refer to index 1 of empty vector when no adapter is found.
 40 | 
 41 | ## v4.0.1
 42 | 
 43 | - Fix: dep cihga39871/BioSequences.jl: detailed error message if input files' line break is '\r\n'.
 44 | 
 45 | ## v4.0.0
 46 | 
 47 | - Optimize: algorithm: the non-overtrim rate for reads without adapters is now higher.
 48 | - Feature: re-write trimming to allow trimming multiple adapters at the same time. This change is adjusted for metabarcoding data.
 49 | - Feature: hard-clip: hard-clip arguments are now specified separately for r1 and r2. This change is adjusted for metabarcoding data. Remove `-C --clip-after -c --clip5`; add `-b --clip-after-r1 -B --clip-after-r2 -e --clip5-r1 -E --clip5-r2`.
 50 | - Optimize: --detect-adapter for paired-end reads now guess adapters from pair information, rather than the existing adapter pool.
 51 | - Feature: users can customize order of processing: `-O | --order`.
 52 | 
 53 | ## v3.2.2-1
 54 | 
 55 | - Fix: undef error of is_concensused when enabling --stat (thanks to kalavattam, #3)
 56 | 
 57 | ## v3.2.2
 58 | 
 59 | - Optimize: speed up for threads <= 2.
 60 | - Fix: `atria test` should not depend on source files.
 61 | 
 62 | ## v3.2.1
 63 | 
 64 | - Feature: automatically skip completed analyses. Use --force or -f to disable the feature.
 65 | 
 66 | ## v3.2.0
 67 | 
 68 | - Remove multi-proc mode since it is unstable.
 69 | 
 70 | ## v3.1.4
 71 | 
 72 | - Logging: new logging for versions and sample completion.
 73 | - Fix v3.1.3: multi-proc mode: Julia v1.8.1 does not allow assigning new ARGS, and add `-t nthread` in `julia_args`.
 74 | - Fix v3.1.3: pe-consensus: error when `insert_size = -1`; fix trimming when `insert_size = -1`.
 75 | - Benchmark `iscomplement` in Atria v3.1.2 and that in BioSequences, and found it is good to stick to BioSequences.
 76 | 
 77 | ## v3.1.3
 78 | 
 79 | - Compatible: Julia v1.8 and BioSequences v3.1.0.
 80 | - Fix: quality offset not changed in some places when providing a different --quality-format.
 81 | - Fix: use `Base.invokelatest` to bypass world age for functions evaluated at run time.
 82 | - Docs: update.
 83 | 
 84 | ## v3.1.2
 85 | 
 86 | - Fix: optimize output file names if ending with .bz2.
 87 | 
 88 | ## v3.1.1
 89 | 
 90 | - Fix: when reporting an encode error, report the previous and current lines instead of the whole chunk of data.
 91 | 
 92 | ## v3.1.0
 93 | 
 94 | - New feature: `--detect-adapter` for adapter determination.
 95 | - Fix: when input is an empty compressed fastq, atria exits with error because `read_chunks!(::IO, ...)` should return 4 elements, but returned 2.
 96 | 
 97 | ## v3.0.3
 98 | 
 99 | - Fix v3.0.2: `will_eof` should be true when unknown.
100 | - Do not resize chunk sizes before cycle 1 when inputs are compressed and cannot determine uncompressed sizes. Just assume data are not trimmed before.
101 | 
102 | ## v3.0.2
103 | 
104 | - Fix uncompressed_size1 not defined on gzipped single-end input (#2).
105 | 
106 | ## v3.0.1
107 | 
108 | - Avoid locking `IOStream` when writing fastq in thread_output.jl: replace `write(::IOStream, ...)` with `write_no_lock(::IOStream, ...)`. It is slightly faster.
109 | - Speed optimization for consensus calling: overwrite `BioSequences.complement(::DNA)` (1.40X), and define `iscomplement(::DNA, ::DNA)` (1.79X).
110 | - Other minor parallel implementations.
111 | 
112 | ## v3.0.0
113 | 
114 | - If users choose to trim adapter, check 1 bp offset of adapter sequences. It is because Atria might have 1 bp error in some cases.
115 | 
116 | ## v2.1.2
117 | 
118 | - Parameter optimization using `atria simulate`: --trim-score-pe 19->10, --tail-length 8->12.
119 | - Development of Atria simulation methods.
120 | 
121 | ## v2.1.1
122 | 
123 | - Fixing wrapper_single_end.jl: cannot trim true adapter position at index of -1.
124 | 
125 | ## v2.1.0
126 | 
127 | - If an r1/2 adapter is found, but the region of r2/1 is missing or its quality is too low (mean prob < 0.6), skip the PE check and just trim like single-end. With this, trim_score does not need to compensate for the situation, so raise the default trim-score-pe (10->19).
128 | 
129 | ## v2.0.0
130 | 
131 | - Supporting low-complexity filtration.
132 | - Supporting polyX tail trimming.
133 | - Supporting single-end fastq.
134 | - Supporting bzip2 compression/decompression.
135 | - Supporting non standardized gzip compression files.
136 | - Optimizing default parameters. (r1-r2-diff 0->0, trim-score-pe 8->10, score-diff removed, kmer-n-match 8->9)
137 | - Robustness optimization: the lower bound of match probability is set to 0.75 because a match probability lower than 0.75 is an outlier and strongly affects the trim score.
138 | 
139 | ## v1.1.1
140 | 
141 | - Performance optimization: adapter and PE trimming: following v1.1.0-1, if the loosened match's nmatch > trim_score, replace the old one.
142 | 
143 | ## v1.1.0
144 | 
145 | - Performance optimization: adapter and PE trimming: if no adapters were matched, the number of errors allowed in the PE match is loosened.
146 | - Performance optimization: consensus calling: new arg `--kmer-tolerance-consensus 2->10`; optimized arg `--min-ratio-mismatch 0.2->0.28`.
147 | - Speed optimization: check `overlap_score > 0` before computing score (`pe_consensus!`).
148 | 
149 | ## v1.0.3
150 | 
151 | - More detailed error output when encoding a non-nucleotide character (`throw_encode_error(...)`).
152 | - Following symbolic link before checking file size for non-Windows platforms (`check_filesize(::String)`).
153 | - When run in multi-file parallel mode, write stdout and stderr to a 'stdlog' file (`julia_wrapper_atria(...)`).
154 | - Add option `--check-identifier` to check whether the identifiers of r1 and r2 are the same.
155 | 
156 | ## v1.0.2
157 | 
158 | - First mature version of Atria.
159 | 


--------------------------------------------------------------------------------
/src/AtriaTest/trimmer_and_benchmark.jl:
--------------------------------------------------------------------------------
  1 | @noinline function test_trimmer_and_benchmark()
  2 | @testset "trimmer_and_benchmark" begin
  3 | 
  4 |     pwd_backup = pwd()
  5 | 
  6 |     tmp_path = tempname()
  7 |     mkpath(tmp_path)
  8 |     cd(tmp_path)
  9 | 
 10 |     try
 11 |         args = Trimmer.parsing_args(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq"])
 12 |         logjson = Trimmer.OrderedDict()
 13 |         logjson["version"] = Trimmer.OrderedDict(
 14 |             "julia-version" => VERSION,
 15 |             "atria-version" => "9.9.9"
 16 |         )
 17 |         logjson["arguments"] = sort!(Trimmer.OrderedDict(args))
 18 |         fio = open("json","w+")
 19 |         Trimmer.JSON.print(fio, sort!(logjson), 4)
 20 |         close(fio)
 21 | 
 22 |         Trimmer.Distributed.addprocs(1)
 23 |         var = ["var"]
 24 |         @eval Trimmer.Distributed.@everywhere var = $var
 25 |         Trimmer.Distributed.pmap(+,[1,2],[4,5])
 26 | 
 27 | 
 28 |         Benchmark.julia_wrapper_simulate(["-o" ,"peReadSimulated", "-x", "2000"])
 29 |         Benchmark.julia_wrapper_simulate(["-h"], exit_after_help=false)
 30 | 
 31 |         Benchmark.julia_wrapper_randtrim(["peReadSimulated.R1.fastq", "peReadSimulated.R2.fastq"])
 32 |         Benchmark.julia_wrapper_randtrim(["-h"])
 33 | 
 34 |         if Sys.iswindows()
 35 |             julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.fastq", "-R", "peReadSimulated.R2.randtrim.fastq", "-e", "8", "-E", "8", "--compress", "gz", "-f"])
 36 |         else
 37 |             
 38 |             @info "rand trim - gz"
 39 | 
 40 |             run(`pigz --keep peReadSimulated.R1.randtrim.fastq`)
 41 |             run(`pigz --keep peReadSimulated.R2.randtrim.fastq`)
 42 | 
 43 |             julia_wrapper_atria_se(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz", "-f"])
 44 |             julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz", "--check-identifier", "-f"])
 45 | 
 46 |             @info "rand trim - gz - check paired ID" 
 47 | 
 48 |             julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.atria.fastq.gz", "-R", "peReadSimulated.R2.randtrim.atria.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz", "--check-identifier", "-f"])
 49 | 
 50 |             @info "rand trim - gz - detect adapter" 
 51 | 
 52 |             julia_wrapper_detect_adapter_se(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz"])
 53 |             julia_wrapper_detect_adapter_pe(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "gz"])
 54 | 
 55 |             @info "rand trim - bzip" 
 56 | 
 57 |             run(`pbzip2 peReadSimulated.R1.randtrim.fastq`)
 58 |             run(`pbzip2 peReadSimulated.R2.randtrim.fastq`)
 59 | 
 60 |             julia_wrapper_atria_se(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "bz2", "-f"])
 61 |             julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.randtrim.fastq.gz", "-R", "peReadSimulated.R2.randtrim.fastq.gz", "-e", "8", "-E", "8", "--compress", "bz2", "--check-identifier", "-f"])
 62 | 
 63 |         end
 64 | 
 65 |         @info "normal trim - all filters"
 66 | 
 67 |         julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq",  "--polyG", "--enable-complexity-filtration", "--pcr-dedup", "-f", "--log2-chunk-size", "24"])
 68 |         julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "--polyG", "--enable-complexity-filtration", "--pcr-dedup", "-f", "--stats", "--log2-chunk-size", "24"])
 69 | 
 70 |         @info "normal trim - PCR Dedup with counts"
 71 | 
 72 |         julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq", "-O", "PCRDedup", "--pcr-dedup", "--pcr-dedup-count", "-f", "--log2-chunk-size", "24"])
 73 |         julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "-O", "PCRDedup", "--pcr-dedup", "--pcr-dedup-count", "-f", "--stats", "--log2-chunk-size", "24"])
 74 | 
 75 |         @info "normal trim - all filters - check ID pair"
 76 | 
 77 |         julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.atria.fastq", "-R", "peReadSimulated.R2.atria.fastq", "--polyG", "--enable-complexity-filtration", "-f", "--stats", "--log2-chunk-size", "24", "--check-identifier"])
 78 | 
 79 |         
 80 |         @info "normal trim - all filters - multiple adapters"
 81 | 
 82 |         julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq",  "--polyG", "--enable-complexity-filtration", "-a", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", "CTGTCTCTTATACACATCT", "-f"])
 83 |         julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "--polyG", "--enable-complexity-filtration", "-f", "-a", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", "CTGTCTCTTATACACATCT", "-A", "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT", "CTGTCTCTTATACACATCT"])
 84 | 
 85 |         @info "normal trim - all filters - check ID pair"
 86 | 
 87 |         julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.atria.fastq", "-R", "peReadSimulated.R2.atria.fastq", "--polyG", "--enable-complexity-filtration", "-f", "-a", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", "CTGTCTCTTATACACATCT", "-A", "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT", "CTGTCTCTTATACACATCT", "--log2-chunk-size", "24", "--check-identifier"])
 88 | 
 89 |         @info "normal trim - skip finished"
 90 | 
 91 |         julia_wrapper_atria_pe(["-r", "peReadSimulated.R1.fastq", "-R", "peReadSimulated.R2.fastq", "--polyG", "--enable-complexity-filtration"])
 92 |         julia_wrapper_atria_se(["-r", "peReadSimulated.R1.fastq",  "--polyG", "--enable-complexity-filtration"])
 93 | 
 94 |         @info "normal trim - detect adapter"
 95 | 
 96 |         julia_wrapper_detect_adapter_se(["-r", "peReadSimulated.R1.fastq",  "--polyG", "--enable-complexity-filtration"])
 97 | 
 98 |         @info "trimmer's help page"
 99 | 
100 |         julia_wrapper_atria_pe(["-h"], exit_after_help=false)
101 |         julia_wrapper_atria_se(["-h"], exit_after_help=false)
102 |         atria_markdown_help()
103 | 
104 |         @info "read stat"
105 | 
106 |         julia_wrapper_readstat(["-h"])
107 |         julia_wrapper_readstat(["peReadSimulated.R1.atria.fastq", "peReadSimulated.R2.atria.fastq"])
108 | 
109 |         Rscript_check_package = """
110 |         if (is.na(packageDescription("argparse")[1])) writeLines("R package 'argparse' not found. Please run `install.packages('argparse')` in R session.")
111 |         if (is.na(packageDescription("plotly")[1])) writeLines("R package 'plotly' not found. Please run `install.packages('plotly')` in R session.")
112 |         if (is.na(packageDescription("ggsci")[1])) writeLines("R package 'ggsci' not found. Please run `install.packages('ggsci')` in R session.")
113 |         """
114 | 
115 |         julia_wrapper_rscript(Rscript_check_package, ["-h"])
116 | 
117 | 
118 |         ARGS_old = deepcopy(ARGS)
119 |         empty!(ARGS)
120 |         push!(ARGS, "prog")
121 |         Atria.julia_main()
122 | 
123 |         empty!(ARGS)
124 |         append!(ARGS, ARGS_old)
125 | 
126 |         @info "Precompiling/test passed without errors."
127 | 
128 |     catch e
129 |         @error "Precompiling/test failed!" exception=e
130 |         cd(pwd_backup)
131 |         rm(tmp_path, recursive=true, force=true)
132 |         rethrow(e)
133 |     finally
134 |         cd(pwd_backup)
135 |         rm(tmp_path, recursive=true, force=true)
136 |     end
137 | end
138 | end


--------------------------------------------------------------------------------
/src/Trimmer/markdown_help.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | const atria_markdown_help_text = md"""
  3 | # Atria $atria_version
  4 | 
  5 | An ultra-fast and accurate adapter and quality trimming software designed for paired-end sequencing data.
  6 | 
  7 | If you use Atria, please cite
  8 | > Jiacheng Chuan, Aiguo Zhou, Lawrence Richard Hale, Miao He, Xiang Li, Atria: an ultra-fast and accurate trimmer for adapter and quality trimming, Gigabyte, 1, 2021 https://doi.org/10.46471/gigabyte.31
  9 | 
 10 | Github: https://github.com/cihga39871/Atria
 11 | 
 12 | ## Usage
 13 | 
 14 | Try `atria -h` or `atria --help` for more information.
 15 | 
 16 | ### Input and Output
 17 | 
 18 | The input files should be paired-end FastQ(.gz|.bz2) files (in the same order), or single-end fastqs:
 19 | 
 20 | 1. Read 1 files: `-r XXXX_R1.fastq YYYY_R1.fastq.gz ...`
 21 | 
 22 | 2. Read 2 files (optional): `-R XXXX_R2.fastq YYYY_R2.fastq.gz ...`
 23 | 
 24 | Output all files to a directory: `-o PATH` or `--output-dir PATH`. Default is the current directory.
 25 | 
 26 | Atria skips completed analysis by default. Use `-f` or `--force` to disable the feature.
 27 | 
 28 | ### Order of processing
 29 | 
 30 | Order of trimming and filtration processing methods. Unlisted process will not be done. See default for process names.
 31 | 
 32 | - `--order PROCESS...` or `-O PROCESS...`: default:  
 33 | 
 34 |    - CheckIdentifier
 35 |    - PolyG
 36 |    - PolyT
 37 |    - PolyA
 38 |    - PolyC
 39 |    - LengthFilter
 40 |    - AdapterTrim
 41 |    - HardClipEndR1
 42 |    - HardClipEndR2
 43 |    - HardClipAfterR1
 44 |    - HardClipAfterR2
 45 |    - HardClipFrontR1
 46 |    - HardClipFrontR2
 47 |    - QualityTrim
 48 |    - TailNTrim
 49 |    - MaxNFilter
 50 |    - LengthFilter
 51 |    - ComplexityFilter
 52 |    - PCRDedup
 53 | 
 54 | 
 55 | ### Poly X Tail Trimming (PolyG / PolyT / PolyA / PolyC)
 56 | 
 57 | Remove poly-X tails. Suggest to enable `--polyG` for Illumina NextSeq/NovaSeq data.
 58 | 
 59 | - Enable: `--polyG`, `--polyT`, `--polyA`, and/or `--polyC` (default: disabled)
 60 | 
 61 | - Trim poly X tail if length > INT: `--poly-length 10`
 62 | 
 63 | ### Adapter Trimming (AdapterTrim)
 64 | 
 65 | Multiple adapter pairs are allowed from Atria v4.
 66 | 
 67 | - Read 1 adapter(s) (after DNA insert): `-a SEQ...` or `--adapter1 SEQ...` (default: AGATCGGAAGAGCACACGTCTGAACTCCAGTCA)
 68 | 
 69 | - Read 2 adapter(s) (after DNA insert): `-A SEQ...` or `--adapter2 SEQ...` (default: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT) (if paired-end)
 70 | 
 71 | - Disable: `--no-adapter-trim`
 72 | 
 73 | - `--detect-adapter` if you do not know adapter sequences.
 74 |    >Atria does not trim detected adapters automatically, please check results first.
 75 | 
 76 | #### Paired-end Consensus Calling
 77 | 
 78 | The overlapped regions of read pairs are checked and corrected. **It is available only when input files are paired-end and Adapter Trimming is on.**
 79 | 
 80 | - Disable: `--no-consensus`
 81 | 
 82 | ### Hard Clip End (HardClipEndR1 / HardClipEndR2)
 83 | 
 84 | Remove the last INT bases from 3' end (tail).
 85 | 
 86 | - Number of bases to keep in read 1: `-z INT` or `--clip3-r1 INT` (default: disabled)
 87 | 
 88 | - Number of bases to keep in read 2: `-Z INT` or `--clip3-r2 INT` (default: disabled)
 89 | 
 90 | ### Hard Clip After N Bases (HardClipAfterR1 / HardClipAfterR2)
 91 | 
 92 | Resize reads to a fixed length by discarding extra bases in 3' end (tail).
 93 | 
 94 | - Number of bases to keep in read 1: `-b INT` or `--clip-after-r1 INT` (default: disabled)
 95 | 
 96 | - Number of bases to keep in read 2: `-B INT` or `--clip-after-r2 INT` (default: disabled)
 97 | 
 98 | ### Hard Clip Front (HardClipFrontR1 / HardClipFrontR2)
 99 | 
100 | Remove the first INT bases from 5' end (front).
101 | 
102 | - Number of bases to remove in read 1: `-e INT` or `--clip5-r1 INT` (default: disabled)
103 | 
104 | - Number of bases to remove in read 2: `-E INT` or `--clip5-r2 INT` (default: disabled)
105 | 
106 | ### Quality Trimming (QualityTrim)
107 | 
108 | Trim low-quality tails. Trimming read tails when the average quality of bases in a sliding window is low.
109 | 
110 | - Average quality threshold: `-q 20` or `--quality-score 20` (default: 20)
111 | 
112 | - Sliding window length: `--quality-kmer 5` (default: 5)
113 | 
114 | - FastQ quality format: `--quality-format Illumina1.8`, or `--quality-format 33` (default: 33, ie. Illumina1.8)
115 | 
116 | - Disable: `--no-quality-trim`
117 | 
118 | ### Tail N Trimming (TailNTrim)
119 | 
120 | Trim N tails.
121 | 
122 | - Disable: `--no-tail-n-trim`
123 | 
124 | ### Max N Filtration (MaxNFilter)
125 | 
126 | Discard a read pair if the number of N in one read is greater than a certain amount. N tails are ignored if Tail N Trimming is on.
127 | 
128 | - Number of N allowed in each read: `-n 15` or `--max-n 15` (default: 15)
129 | 
130 | - Disable: `-n -1` or `--max-n -1`
131 | 
132 | ### Length Filtration (LengthFilter)
133 | 
134 | Filter read pair length in a range.
135 | 
136 | - Read length range: `--length-range 30:999999` (default: 30:999999)
137 | 
138 | - Disable: `--no-length-filtration`
139 | 
140 | ### Complexity Filtration (ComplexityFilter)
141 | 
142 | Discard reads with low complexity. Complexity is the percentage of base that is different from its next base.
143 | 
144 | - Enable: `--enable-complexity-filtration` (default: disabled)
145 | 
146 | - Complexity threshold: `--min-complexity 0.3` (default: 0.3)
147 | 
148 | ### Remove PCR duplicates
149 | 
150 | Only write unique sequences (dedup). Paired reads are only considered identical if both reads are duplicates to both reads in a previous pair. 
151 | 
152 | > Dedup uses LARGE memory to store all unique sequences. 
153 | 
154 | - Enable: `--pcr-dedup`.
155 | 
156 | - Also write a count table of PCR duplicates: `--pcr-dedup-count`.
157 | 
158 | ### Parallel computing
159 | 
160 | - Use INT threads: `-t 8` or `--threads 8` (default: 8)
161 | 
162 | - If memory is not sufficient, use `--log2-chunk-size INT` where INT is from 23 to 25. Memory usage reduces exponentially as it decreases.
163 | 
164 | Try `atria -h` or `atria --help` for more information.
165 | """
166 | 
# Print the markdown help page to `stderr` as plain text, surrounded by blank lines.
function atria_markdown_help()
    io = stderr
    println(io)
    show(io, MIME("text/plain"), atria_markdown_help_text)
    println(io)
end
172 | 
173 | 
174 | #= Future supports
175 | ==================
176 | 
177 | ### UMI (Unique Molecular Identifier)
178 | 
179 | Trim and extract UMI to the first part of read names, so they can be presented in BAM records after mapping.
180 | 
181 | - Enable and specify UMI location(s): `--umi LOC...`, and LOC can be:
182 |    + `INDEX1`: the R1 index is UMI.
183 |    + `INDEX2`: the R2 index is UMI.
184 |    + `READ1`: the head of read1 is UMI.
185 |    + `READ2`: the head of read2 is UMI.
186 |    (default: disabled)
187 | 
188 | - If UMI locations contain `READ1` and/or `READ2`:
189 |    + UMI length argument `--umi-len INT` is required. 
190 |    + Skip several bases after UMI: `--umi-skip INT` (default: 0) 
191 | 
192 | ### Primer Trimming
193 | 
194 | Trim primers from 5' and 3' ends (default: no primer trimming)
195 | 
196 | - Directly provide primer sequence(s):
197 |    + `-m SEQ...` or `--primer1 SEQ...`: primer(s) at 5' end of read 1, and their reverse complement appended to 3' end of read 2.
198 | 
199 |    + `-M SEQ...` or `--primer2 SEQ...`: primer(s) at 5' end of read 2, and their reverse complement appended to 3' end of read 1.
200 | 
201 | - Or provide a primer table: `-P FILE` or `--primers FILE`. Format of primer table:
202 |    + Each line is a primer set.
203 |    + Columns are primer1, primer2, primer name.
204 |    + Delimiter is TAB (`\t`).
205 |    + No header line; lines starting with `#` are ignored.
206 | 
207 | 
208 | =#


--------------------------------------------------------------------------------
/src/BioBits/get_seq.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | """
  3 |     N2gap(bit::T) where T <: Union{UInt8, UInt16, UInt32, UInt64, UInt128}
  4 | 
  5 | Convert N (1111) to gap (0000) in biological `bit`.
  6 | """
  7 | function N2gap end
  8 | 
  9 | for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
 10 |     @eval @inline function N2gap(bit::$T)
 11 |         nbase_1 = $(sizeof(T) * 2 - 1)
 12 | 
 13 |         N_bit = $(convert(T, 0b1111))
 14 |         if bit & N_bit == N_bit
 15 |             bit &= ~N_bit
 16 |         end
 17 |         for i in 1:nbase_1
 18 |             N_bit = $(convert(T, 0b1111)) << (4*i)
 19 |             if bit & N_bit == N_bit
 20 |                 bit &= ~N_bit
 21 |             end
 22 |         end
 23 |         bit
 24 |     end
 25 | end
 26 | 
 27 | 
 28 | struct SeqHead{T}
 29 |     a::T
 30 |     b::T
 31 |     function SeqHead{T}(a::T, b::T) where T <: Union{UInt8, UInt16, UInt32, UInt64}
 32 |         new(a, b)
 33 |     end
 34 | end
 35 | 
 36 | @inline function SeqHead(a::T, b::T) where T <: Union{UInt8, UInt16, UInt32, UInt64}
 37 |     SeqHead{T}(a,b)
 38 | end
 39 | 
 40 | """
 41 |     SeqHead(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32, UInt64}
 42 | 
 43 | # Fields
 44 | 
 45 |  - `a::T`: the bits of sequence from index 1.
 46 | 
 47 |  - `b::T`: the bits of sequence from index 2.
 48 | 
 49 | # Argument
 50 | 
 51 |  - `seq::LongDNA{4}`: the seq has to be `bitsafe!`.
 52 | """
 53 | function SeqHead(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32}
 54 |     bit = seq.data[1]
 55 |     a = unsafe_trunc(T, bit)
 56 |     b = unsafe_trunc(T, bit >> 4)
 57 |     SeqHead{T}(a, b)
 58 | end
 59 | function SeqHead(::UInt64, seq::LongDNA{4})
 60 |     p = pointer(seq.data)
 61 |     a = unsafe_load(p)
 62 |     if length(seq) > 0
 63 |         c = unsafe_load(p+1)
 64 |         b = (a >> 4) | (c << 4)
 65 |     else
 66 |         b = a >> 4
 67 |     end
 68 |     SeqHead{UInt64}(a, b)
 69 | end
 70 | for T in (UInt8, UInt16, UInt32, UInt64)
 71 |     @eval @inline SeqHead{$T}(seq::LongDNA{4}) = SeqHead($(typemin(T)), seq)
 72 | end
 73 | 
 74 | 
 75 | struct SeqHeadSet
 76 |     s64::SeqHead{UInt64}
 77 |     s32::SeqHead{UInt32}
 78 |     s16::SeqHead{UInt16}
 79 |     s8::SeqHead{UInt8}
 80 |     function SeqHeadSet(seq::LongDNA{4})
 81 |         bitsafe!(seq)
 82 |         s64 = SeqHead{UInt64}(seq)
 83 |         s32 = SeqHead{UInt32}(seq)
 84 |         s16 = SeqHead{UInt16}(seq)
 85 |         s8 = SeqHead{UInt8}(seq)
 86 |         new(s64, s32, s16, s8)
 87 |     end
 88 | end
 89 | function SeqHeadSet(seq::AbstractString)
 90 |     SeqHeadSet(LongDNA{4}(seq))
 91 | end
 92 | 
 93 | function BioSequences.LongDNA{4}(s::SeqHeadSet)
 94 |     LongDNA{4}([s.s64.a], 0x0000000000000010)  # 16 % UInt64
 95 | end
 96 | 
 97 | """
 98 |     TruncSeq(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32, UInt64}
 99 | 
100 | # Fields
101 | 
102 |  - `a::T`: the bits of sequence from index 1.
103 | 
104 |  - `b::T`: the bits of sequence from index 2.
105 | 
106 |  - `a1::T`: the bits of `seq[1]`.
107 | 
108 | # Argument
109 | 
110 |  - `seq::LongDNA{4}`: the seq has to be `bitsafe!`.
111 | """
112 | struct TruncSeq{T}
113 |     a::T
114 |     b::T
115 |     a1::T
116 |     function TruncSeq{T}(a::T, b::T, a1::T) where T <: Union{UInt8, UInt16, UInt32, UInt64}
117 |         new(a, b, a1)
118 |     end
119 | end
120 | 
121 | for T in (UInt8, UInt16, UInt32, UInt64)
122 |     @eval @inline TruncSeq(a::$T, b::$T, a1::$T) = TruncSeq{$T}(a,b,a1)
123 | end
124 | 
# `TruncSeq(::T, seq)` for the widths that fit inside the first 64-bit data word.
# The first argument only selects the method; its value is ignored.
for T in (UInt8, UInt16, UInt32)
    @eval @inline function TruncSeq(::$T, seq::LongDNA{4})
        bit = seq.data[1] #|> N2gap
        a = unsafe_trunc($T, bit)
        # Shift by one 4-bit base so that `b` starts at sequence index 2.
        b = unsafe_trunc($T, bit >> 4)
        # The lowest nibble of `a` is the first base.
        a1 = $(T(0b1111)) & a
        # `$T` must be interpolated: the loop variable `T` does not exist when the
        # generated method runs, so a bare `TruncSeq{T}` cannot be resolved there.
        TruncSeq{$T}(a, b, a1)
    end
end
# UInt64 variant: `a` is the first data word, `b` the same word shifted by one base
# (its top nibble filled from the following data when `seq` is non-empty), and `a1`
# the first base alone.
function TruncSeq(::UInt64, seq::LongDNA{4})
    p = pointer(seq.data)
    a = unsafe_load(p) #|> N2gap
    if length(seq) > 0
        # Pointer arithmetic is in bytes: this word starts one byte (two bases) after
        # `a`. It is loaded only inside the guard, so an empty `seq` is never read
        # past its first word (the same shape as `SeqHead(::UInt64, seq)`).
        c = unsafe_load(p+1) #|> N2gap
        b = (a >> 4) | (c << 4)
    else
        b = a >> 4
    end
    a1 = 0x000000000000000f & a
    TruncSeq{UInt64}(a, b, a1)
end
147 | 
148 | 
# `TruncSeq{T}(seq)` forwards to `TruncSeq(::T, seq)` using a zero of type `T` as the
# dispatch tag.
for T in (UInt8, UInt16, UInt32, UInt64)
    @eval @inline function TruncSeq{$T}(seq::LongDNA{4})
        TruncSeq($(zero(T)), seq)
    end
end
152 | 
"""
    get_pointer(::T, seq::LongDNA{4}) where T <: Union{UInt8, UInt16, UInt32, UInt64}

Return the pointer to `seq.data`. For `UInt8`, `UInt16` and `UInt32` the pointer is
bit-cast to `Ptr{T}`; for `UInt64` it is returned as given by `pointer(seq.data)`.
The first argument only selects the method; its value is ignored.
"""
@inline get_pointer(::UInt64, seq::LongDNA{4}) = pointer(seq.data)
for T in (UInt8, UInt16, UInt32)
    @eval @inline function get_pointer(::$T, seq::LongDNA{4})
        reinterpret($(Ptr{T}), pointer(seq.data))
    end
end
161 | 
162 | 
"""
    get_unsafe_index_of_last_bitseq(::T, seq::LongDNA{4})
    get_unsafe_index_of_last_bitseq(::T, seq.len::Int64)
    get_unsafe_index_of_last_bitseq(::T, seq.len::UInt64)

 - `::T` is one of UInt8, UInt16, UInt32, UInt64.

Get the index of the last full-long bitseq, computed as the sequence length minus
`sizeof(T) * 2 - 2`. It is unsafe because the returned index can be negative.
"""
function get_unsafe_index_of_last_bitseq end

for T in (UInt8, UInt16, UInt32, UInt64)
    # Number of bases that fit in one word of type `T`, minus two.
    offset = sizeof(T) * 2 - 2
    @eval begin
        @inline get_unsafe_index_of_last_bitseq(::$T, seq::LongDNA{4}) =
            (seq.len % Int64) - $offset
        @inline get_unsafe_index_of_last_bitseq(::$T, seq_len::Int64) =
            seq_len - $offset
        @inline get_unsafe_index_of_last_bitseq(::$T, seq_len::UInt64) =
            (seq_len % Int64) - $offset
    end
end
182 | 
"""
    unsafe_bitseq(seq_data_ptr::Ptr{T}, idx::Int) => bitseq
    unsafe_bitseq(seq_data_ptr::Ptr{T}, idx::Int, max_idx::Int) => bitseq, num_base_extracted

 - `seq_data_ptr::Ptr{T}`: the pointer to `(seq::LongDNA{4}).data`. `Ptr{T}` can be a `Ptr` of `UInt8`, `UInt16`, `UInt32`, or `UInt64`.

 - `idx`: nucleotide index of `(seq::LongDNA{4}).data`.

 - `max_idx`: should be equal to `(seq::LongDNA{4}).len`. The bits after it are not masked here (they are assumed to be zeroed by `bitsafe!`); `max_idx` only affects `num_base_extracted`.

# Caution

When `idx` is even, the highest 4 bits of the returned bitseq are always `0b0000`, because the word loaded at `idx - 1` is simply shifted right by 4 bits.
"""
function unsafe_bitseq end

for T in (UInt8, UInt16, UInt32, UInt64)
    @eval @inline function unsafe_bitseq(seq_data_ptr::Ptr{$T}, idx::Int)
        base_offset = idx - 1
        # Two bases share one byte, so the byte offset is half the base offset.
        word = unsafe_load(seq_data_ptr + base_offset ÷ 2)
        if base_offset % 2 == 1
            # The base sits in the upper nibble of its byte and cannot be addressed
            # directly: drop the lower nibble instead.
            word >>= 0x04
        end
        return word
    end
end
212 | 
213 | 
# Three-argument `unsafe_bitseq`: also reports how many bases of the returned word
# lie at or before `max_idx`.
for T in (UInt8, UInt16, UInt32, UInt64)
    @eval @inline function unsafe_bitseq(seq_data_ptr::Ptr{$T}, idx::Int, max_idx::Int)
        base_offset = idx - 1
        # Two bases share one byte, so the byte offset is half the base offset.
        word = unsafe_load(seq_data_ptr + base_offset ÷ 2)
        shifted = base_offset % 2 == 1
        if shifted
            # The base sits in the upper nibble of its byte: drop the lower nibble.
            word >>= 0x04
        end

        capacity = $(sizeof(T) * 2)  # bases held by one word of this width
        overflow = base_offset + capacity - max_idx
        # Bases past `max_idx` are not masked here; they are assumed to have been
        # zeroed by `bitsafe!`. Only the reported count is adjusted.
        extracted = if overflow > 0
            capacity - overflow
        elseif shifted
            capacity - 1
        else
            capacity
        end
        return word, extracted
    end
end
237 | 
238 | 
# Debug helper: the bit string of `x` with a space appended after every group of
# four characters.
function bin(x)
    bits = bitstring(x)
    return replace(bits, r"(....)" => s"\1 ")
end
242 | 


--------------------------------------------------------------------------------
/benchmark/time_stats_plot.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | if (is.na(packageDescription("plotly")[1])) install.packages("plotly")
  4 | library(plotly, quietly = T, warn.conflicts = F)
  5 | if (is.na(packageDescription("argparse")[1])) install.packages("argparse")
  6 | library(argparse, quietly = T)
  7 | 
  8 | parser <- ArgumentParser(description='Plot time stats (speed vs threads/CPU)')
  9 | parser$add_argument('-i', '--input', dest='input', metavar='FILE', type='character',
 10 |                     required=TRUE, nargs='+',
 11 |                     help='[REQUIRED] input time stats tables generated from time_stats.jl (1st=ungz, 2nd=gz')
 12 | parser$add_argument('-o', '--output', dest='out', metavar='PLOT', type='character',
 13 |                     default="time_stats_plot.html",
 14 |                     help='output html heatmap file (default: time_stats_plot.html)')
 15 | 
 16 | args <- parser$parse_args()
 17 | 
 18 | if (FALSE){
 19 |     setwd("~/analysis/atria-benchmark/simulate")
 20 |     args <- parser$parse_args(c("-i", "stats.time_benchmark3.df.txt", "stats.time_benchmark_gz3.df.txt", "-o", "time_stats_plot2.html"))
 21 | }
 22 | 
 23 | wrapper <- function(input_path, show_legend, is_gz){
 24 |     
 25 |     input <- read.delim(input_path, header=TRUE, as.is=TRUE)
 26 |     
 27 |     input$Trimmer <- ""
 28 |     input$Trimmer[grepl("atria", input$Command)] <- "Atria (consensus)"
 29 |     input$Trimmer[grepl("atria --no-consensus", input$Command)] <- "Atria"
 30 |     input$Trimmer[grepl("AdapterRemoval", input$Command)] <- "AdapterRemoval"
 31 |     input$Trimmer[grepl("skewer", input$Command)] <- "Skewer"
 32 |     input$Trimmer[grepl("trim_galore", input$Command)] <- "Trim Galore"
 33 |     input$Trimmer[grepl("trimmomatic", input$Command)] <- "Trimmomatic"
 34 |     input$Trimmer[grepl("ktrim", input$Command)] <- "Ktrim"
 35 |     input$Trimmer[grepl("atropos", input$Command)] <- "Atropos"
 36 |     input$Trimmer[grepl("fastp", input$Command)] <- "Fastp"
 37 |     input$Trimmer[grepl("SeqPurge", input$Command)] <- "SeqPurge"
 38 |     
 39 |     input_labels <- c("Atria (consensus)",
 40 |                       "Atria",
 41 |                       "AdapterRemoval",
 42 |                       "Skewer",
 43 |                       "Trim Galore",
 44 |                       "Trimmomatic",
 45 |                       "Ktrim",
 46 |                       "Atropos",
 47 |                       "Fastp",
 48 |                       "SeqPurge"
 49 |                       )
 50 |     
 51 |     input$Trimmer <- factor(input$Trimmer, input_labels)
 52 |     
 53 |     # input$Command <- NULL
 54 |     
 55 |     input_value = input
 56 |     for (j in 1:ncol(input)){
 57 |         for (i in 1:nrow(input)){
 58 |             input_value[i,j] <- sub(" ±.*", "", input[i,j])
 59 |         }
 60 |         if (!any(is.na(as.numeric(input_value[,j])))) {
 61 |             if (class(input_value[,j]) != "factor") {
 62 |                 input_value[,j] <- as.numeric(input_value[,j])    
 63 |             }
 64 |         }
 65 |     }
 66 |     
 67 |     input_sd = input
 68 |     for (j in 1:ncol(input)){
 69 |         for (i in 1:nrow(input)){
 70 |             input_sd[i,j] <- sub(".*± ", "", input[i,j])
 71 |         }
 72 |         if (!any(is.na(as.numeric(input_sd[,j])))) {
 73 |             if (class(input_value[,j]) != "factor") {
 74 |                 input_sd[,j] <- as.numeric(input_sd[,j])
 75 |                 if (all(input_sd[,j] == input_value[,j])) {
 76 |                     input_sd[,j] <- 0
 77 |                 }
 78 |             }
 79 |         }
 80 |     }
 81 |     
 82 |     input_high = input_value
 83 |     for (j in 1:ncol(input)){
 84 |         for (i in 1:nrow(input)){
 85 |             if (is.numeric(input_sd[i,j])){
 86 |                 input_high[i,j] = input_value[i,j] + input_sd[i,j]
 87 |             }
 88 |         }
 89 |     }
 90 |     input_low = input_value
 91 |     for (j in 1:ncol(input)){
 92 |         for (i in 1:nrow(input)){
 93 |             if (is.numeric(input_sd[i,j])){
 94 |                 input_low[i,j] = input_value[i,j] - input_sd[i,j]
 95 |             }
 96 |         }
 97 |     }
 98 |     
 99 |     
100 |     if (sum(input_sd$Speed..M.Bases.s.) == 0){
101 |         speed_error_y_array = NULL
102 |         efficiency_error_y_array = NULL
103 |     } else {
104 |         speed_error_y_array <- input_sd$Speed..M.Bases.s.
105 |         efficiency_error_y_array <- input_sd$Efficiency..M.Bases.s.CPU.
106 |     }
107 |     # writeLines(as.character(show_legend))
108 |     if (is_gz){
109 |         gz_title = " for Compressed Files"
110 |     } else {
111 |         gz_title = ""
112 |     }
113 |     x_tick_vals = unique(input_value$Threads)
114 |     
115 |     fig_speed <- plot_ly(x=input_value$Threads, y=input_value$Speed..M.Bases.s., color=input_value$Trimmer, legendgroup=input_value$Trimmer, error_y = list(array=speed_error_y_array), showlegend=show_legend) %>%
116 |         add_lines(line = list(shape = "spline" )) %>%
117 |         add_markers(showlegend = FALSE) %>%
118 |         layout(
119 |             xaxis = list(
120 |                 title = "Threads Assigned",
121 |                 tickvals = x_tick_vals
122 |             ), yaxis = list(
123 |                 title = paste0("Speed", gz_title, "\n(M Bases / Second)")
124 |             ))
125 |     fig_speed
126 |     
127 |     fig_efficiency <- plot_ly(x=input_value$Threads, y=input_value$Efficiency..M.Bases.s.CPU., color=input_value$Trimmer, legendgroup=input_value$Trimmer, error_y = list(array=efficiency_error_y_array), showlegend=FALSE) %>%
128 |         add_lines(line = list(shape = "spline")) %>%
129 |         add_markers(showlegend = FALSE) %>%
130 |         layout(
131 |             xaxis = list(
132 |                 title = "Threads Assigned",
133 |                 tickvals = x_tick_vals
134 |             ), yaxis = list(
135 |                 title = paste0("Efficiency", gz_title, "\n(M Bases / Second / CPU Usage)")
136 |             ))
137 |     fig_efficiency
138 |     
139 |     fig_speed_vs_realCPU <- plot_ly(x=input_value$CPU, y=input_value$Speed..M.Bases.s., color=input_value$Trimmer, legendgroup=input_value$Trimmer, error_y = list(array=efficiency_error_y_array), showlegend=FALSE) %>%
140 |         add_trace(line = list(shape = "spline")) %>%
141 |         add_markers(showlegend = FALSE) %>%
142 |         layout(
143 |             xaxis = list(
144 |                 title = "Real Average CPU Usage",
145 |                 tickvals = x_tick_vals
146 |             ), yaxis = list(
147 |                 title = paste0("Speed", gz_title, "\n(M Bases / Second)")
148 |             ))
149 |     fig_speed_vs_realCPU
150 |     
151 |     return(list(
152 |         input_value = input_value,
153 |         input_sd = input_sd,
154 |         input_high = input_high,
155 |         input_low = input_low,
156 |         fig_speed = fig_speed,
157 |         fig_efficiency = fig_efficiency,
158 |         fig_speed_vs_realCPU = fig_speed_vs_realCPU
159 |     ))
160 | }
161 | 
# Figures for the first input table (is_gz = FALSE) and the second (is_gz = TRUE).
stat_1 <- wrapper(args$input[1], TRUE, FALSE)
stat_2 <- wrapper(args$input[2], TRUE, TRUE)

# Horizontal legend placed above the plots, with a transparent background.
legend_layout <- list(orientation='h', y=1.3, bgcolor=rgb(0,0,0,0))

# 2 x 2 grid: speed vs threads and speed vs CPU usage, one row per input table.
p <- subplot(
    stat_1$fig_speed %>% layout(legend = legend_layout),
    stat_1$fig_speed_vs_realCPU,
    stat_2$fig_speed,
    stat_2$fig_speed_vs_realCPU,
    nrows=2, shareX = TRUE, shareY = TRUE
)

writeLines(sprintf("Output plot: %s", args$out))
htmlwidgets::saveWidget(as_widget(p), args$out)

# Same grid with logarithmic x axes.
log_x_axis <- list(type='log')
plogx <- subplot(
    stat_1$fig_speed %>% layout(xaxis = log_x_axis, legend = legend_layout),
    stat_1$fig_speed_vs_realCPU %>% layout(xaxis = log_x_axis),
    stat_2$fig_speed %>% layout(xaxis = log_x_axis),
    stat_2$fig_speed_vs_realCPU %>% layout(xaxis = log_x_axis),
    nrows=2, shareX = TRUE, shareY = TRUE
)
plogx

# The log-x plot is written next to the main output with a ".logx.html" suffix.
outlogx = sub(".html$", ".logx.html", args$out)
writeLines(sprintf("Output logx plot: %s", outlogx))
htmlwidgets::saveWidget(as_widget(plogx), outlogx)
190 | 


--------------------------------------------------------------------------------
/benchmark/trimming-functions.bash:
--------------------------------------------------------------------------------
  1 | #!bash
  2 | export JULIA_NUM_THREADS=16
  3 | 
  4 | time=/usr/bin/time
  5 | ls $time 2>/dev/null
  6 | if [[ $? > 0 ]]
  7 | then
  8 |     time=/export/home/CFIA-ACIA/chuanj/.local/bin/time
  9 | fi
 10 | 
 11 | run_atria(){
 12 |     local num_threads=1
 13 |     if [[ $1 ]]; then
 14 |         num_threads=$1
 15 |     fi
 16 |     $time -v atria --no-consensus \
 17 |         -r $r1 -R $r2 \
 18 |         -o Atria \
 19 |         --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \
 20 |         --adapter1 $a1 --adapter2 $a2 --threads $num_threads
 21 | }
 22 | 
 23 | run_atria_consensus(){
 24 |     local num_threads=1
 25 |     if [[ $1 ]]; then
 26 |         num_threads=$1
 27 |     fi
 28 |     $time -v atria \
 29 |         -r $r1 -R $r2 \
 30 |         -o Atria-consensus \
 31 |         --no-tail-n-trim --max-n=-1 --no-quality-trim --no-length-filtration \
 32 |         --adapter1 $a1 --adapter2 $a2 --threads $num_threads
 33 | }
 34 | 
 35 | run_adapterremoval() {
 36 |     local num_threads=1
 37 |     if [[ $1 ]]; then
 38 |         num_threads=$1
 39 |     fi
 40 |     local err=3
 41 |     local folder="AdapterRemoval-$err"
 42 | 	mkdir -p "$folder"
 43 | 	if [[ $r1 = *gz ]]; then
 44 | 		$time -v AdapterRemoval --file1 $r1 --file2 $r2 \
 45 | 	        --basename "$folder"/adapterremoval \
 46 | 	        --adapter1 $a1 --adapter2 $a2 \
 47 | 	        --mm $err --minlength 0 --threads $num_threads --gzip
 48 | 	else
 49 | 		$time -v AdapterRemoval --file1 $r1 --file2 $r2 \
 50 | 			--basename "$folder"/adapterremoval \
 51 | 			--adapter1 $a1 --adapter2 $a2 \
 52 | 			--mm $err --minlength 0 --threads $num_threads
 53 | 	fi
 54 | }
 55 | 
 56 | run_skewer(){
 57 |     local num_threads=1
 58 |     if [[ $1 ]]; then
 59 |         num_threads=$1
 60 |     fi
 61 |     local OUTDIR="Skewer"
 62 |     mkdir -p $OUTDIR
 63 | 	if [[ $r1 = *gz ]]; then
 64 | 	    $time -v skewer --quiet \
 65 | 	        -x $a1 -y $a2 -m pe  \
 66 | 	        -l 0 -o $OUTDIR/$OUTDIR $r1 $r2 --threads $num_threads --compress
 67 | 	else
 68 | 		$time -v skewer --quiet \
 69 | 	        -x $a1 -y $a2 -m pe  \
 70 | 	        -l 0 -o $OUTDIR/$OUTDIR $r1 $r2 --threads $num_threads
 71 | 	fi
 72 | }
 73 | 
 74 | run_trim_galore(){
 75 | 	local num_threads=1
 76 |     if [[ $1 ]]; then
 77 |         num_threads=$1
 78 |     fi
 79 |     local OUTDIR="TrimGalore"
 80 | 	mkdir -p $OUTDIR
 81 | 	$time -v trim_galore --cores $num_threads \
 82 | 	    --quality 0 \
 83 | 	    -o $OUTDIR \
 84 | 	    --adapter $a1 \
 85 | 	    --adapter2 $a2 \
 86 | 	    -e 0.1 --stringency 1 \
 87 | 	    --max_n 100 --length 0 \
 88 | 	    --paired $r1 $r2
 89 | }
 90 | 
 91 | run_trimmomatic(){
 92 | 	local num_threads=1
 93 | 	if [[ $1 ]]; then
 94 | 		num_threads=$1
 95 | 	fi
 96 | 	local OUTDIR="Trimmomatic"
 97 | 	mkdir -p $OUTDIR
 98 | 	rm -f adapters.fa
 99 | 	echo '>TruSeq3/1' >> adapters.fa
100 | 	echo $a1 >> adapters.fa
101 | 	echo '>TruSeq3/2' >> adapters.fa
102 | 	echo $a2 >> adapters.fa
103 | 	output=$OUTDIR/out
104 | 	if [[ $r1 = *gz ]]; then
105 | 		local isgz=.gz
106 | 	else
107 | 		local isgz=
108 | 	fi
109 | 
110 | 	$time -v java -jar /usr/software/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $num_threads -phred33 $r1 $r2 $output-pair1.paired.fq$isgz $output-pair1.unpaired.fq$isgz $output-pair2.paired.fq$isgz $output-pair2.unpaired.fq$isgz ILLUMINACLIP:adapters.fa:2:30:10:1:TRUE:keepBothReads MINLEN:1
111 | }
112 | 
# Time Ktrim on the pair $r1/$r2 with adapters $a1/$a2; output prefix Ktrim/ktrim.
# $1 (optional): number of threads, default 1.
run_ktrim(){
    local num_threads=${1:-1}
    local OUTDIR="Ktrim"
    mkdir -p $OUTDIR
    $time -v ktrim -1 $r1 -2 $r2 -t $num_threads -p 33 -q 1 -s 10 -a $a1 -b $a2 -o $OUTDIR/ktrim
}
122 | 
# Time fastp on the pair $r1/$r2 with adapters $a1/$a2.
# $1 (optional): number of threads, default 1.
# Side effect: sets the (non-local) variable `output`.
run_fastp(){
    local num_threads=${1:-1}
    local OUTDIR="fastp"
    output=$OUTDIR/out.fastp
    # Output names get a .gz suffix when the first input name ends in "gz".
    local isgz=
    if [[ $r1 = *gz ]]; then
        isgz=.gz
    fi
    mkdir -p $OUTDIR
    $time -v fastp --in1 $r1 --in2 $r2 --out1 $output.r1.fq$isgz --out2 $output.r2.fq$isgz \
        -z 6 --adapter_sequence $a1 --adapter_sequence_r2 $a2 --disable_trim_poly_g --disable_quality_filtering --disable_length_filtering --thread $num_threads
}
139 | 
# Time SeqPurge on the pair $r1/$r2 with adapters $a1/$a2.
# $1 (optional): number of threads, default 1.
run_seqpurge() {
    local num_threads=${1:-1}
    local folder=SeqPurge
    mkdir -p "$folder"
    # Outputs are always named *.fq.gz, whatever the input extension is.
    $time -v SeqPurge -in1 $r1 -in2 $r2 \
        -out1 "$folder"/$r1.seqpurge.fq.gz -out2 "$folder"/$r2.seqpurge.fq.gz \
        -a1 $a1 -a2 $a2 -mep 0.1 \
        -qcut 0 -min_len 0 -summary "$folder"/seqpurge.summary -threads $num_threads
}
152 | 
# Time cutadapt on the pair $r1/$r2 with adapters $a1/$a2.
# $1 (optional): number of jobs (-j), default 1.
# Side effect: sets the (non-local) variable `output`.
run_cutadapt() {
    local num_threads=${1:-1}
    local OUTDIR="Cutadapt"
    output=$OUTDIR/out.cutadapt
    # Output names get a .gz suffix when the first input name ends in "gz".
    local isgz=
    if [[ $r1 = *gz ]]; then
        isgz=.gz
    fi
    mkdir -p "$OUTDIR"
    $time -v cutadapt -j $num_threads -a $a1 -A $a2 \
        -o $output.R1.fq$isgz -p $output.R2.fq$isgz $r1 $r2
}
168 | 
# Time Atropos (insert aligner) on the pair $r1/$r2 with adapters $a1/$a2.
# $1 (optional): number of threads, default 1.
run_atropos() {
    # Atropos 1.1.29 with Python 3.8.5
    local num_threads=${1:-1}
    # Output names get a .gz suffix when the first input name ends in "gz".
    local isgz=
    if [[ $r1 = *gz ]]; then
        isgz=.gz
    fi
    local folder="Atropos"
    mkdir -p "$folder"
    # The threading options are passed only when more than one thread is requested.
    local thread_opts=
    if [[ $num_threads != 1 ]]; then
        thread_opts="--threads $num_threads --preserve-order"
    fi
    $time -v atropos trim -a $a1 -A $a2 \
        -o "$folder"/$r1.atropos.fq$isgz -p "$folder"/$r2.atropos.fq$isgz -pe1 $r1 -pe2 $r2 \
        --aligner insert -e 0.1 $thread_opts
}
193 | 
# Map the pair $1/$2 against $bwa_ref with bwa mem and write BAM to "$1.bam".
mapping() {
    local read1=$1 read2=$2
    bwa mem -v 1 -t 25 $bwa_ref $read1 $read2 \
        | samtools view -@ 10 -b -o ${read1}.bam
}
198 | 
199 | 
# Map the pair $1/$2 with bowtie2 (index "$bwa_ref-bowtie2"); stderr goes to
# "$1.bowtie2.stat" and the alignments to "$1.bowtie2.bam".
mapping_bowtie2(){
    local read1=$1 read2=$2
    bowtie2 --maxins 800 --threads 25 -x $bwa_ref-bowtie2 -1 $read1 -2 $read2 2> ${read1}.bowtie2.stat \
        | samtools view -@ 10 -b -o ${read1}.bowtie2.bam
}
# Map the pair $1/$2 with hisat2 (index "$bwa_ref-hisat2"); SAM goes to
# "$1.hisat2.sam" and stderr to "$1.hisat2.stat".
mapping_hisat2(){
    local read1=$1 read2=$2
    hisat2 --threads 25 -x $bwa_ref-hisat2 -1 $read1 -2 $read2 \
        -S ${read1}.hisat2.sam 2> ${read1}.hisat2.stat
}
# Quality-trim the pair $1/$2 with Atria (adapter trimming disabled) into
# "<dir of $1>/../trimmed-qualtrim", then rename the outputs from "atria.*" to
# "qual$QSCORE.*".
# $3 (optional): number of threads; default 20 when $1 ends in "gz", otherwise 8.
# Reads the variable QSCORE from the caller's environment.
qualtrim(){
    local DIR=$(dirname "$1")/../trimmed-qualtrim
    local num_threads=8
    local gzext=
    if [[ $1 = *gz ]]; then
        num_threads=20
        gzext=.gz
    fi
    if [[ $3 ]]; then
        num_threads=$3
    fi
    time atria -r "$1" -R "$2" -t $num_threads --check-identifier \
        -o "$DIR" \
        --no-tail-n-trim --max-n=-1 --no-adapter-trim --no-length-filtration \
        --quality-score $QSCORE
    rename --force "s/atria.fastq/qual$QSCORE.fastq/" "$DIR"/*fastq$gzext
    rename --force "s/atria.fq/qual$QSCORE.fq/" "$DIR"/*fq$gzext
    rename --force "s/atria.truncated/qual$QSCORE.truncated/" "$DIR"/*truncated$gzext
    rename --force "s/atria.log/qual$QSCORE.log/" "$DIR"/*log*
}
# Print two tables from the */*qualtrim/*qual<QSCORE>*bowtie2.stat files: first the
# "aligned concordantly exactly 1 time" lines, then (after a blank line) the
# "aligned 0 times concordantly or discordantly" lines.
# $1 (optional): quality score used in the file-name glob; empty when omitted.
bowtie2stat(){
    local QSCORE=$1
    local message
    local is_first=yes
    for message in \
        "aligned concordantly exactly 1 time" \
        "aligned 0 times concordantly or discordantly"
    do
        if [[ $is_first ]]; then
            is_first=
        else
            echo
        fi
        grep -v Warning */*qualtrim/*qual$QSCORE*bowtie2.stat | sed 's#/[^:]*#\t#' | grep "$message" | column -ts$'\t'
    done
}
241 | 
# Collect the "SN" lines of several samtools-stats files into one table,
# samtools-stats.collection.txt: the first column holds the metric names (taken
# from $1), followed by one column per input file.
pasteSamtoolsStats(){
    local collection=samtools-stats.collection.txt
    local scratch=samtools-stats.collection.tmp

    # Header column: metric names, with a trailing "# comment" turned into "[comment]".
    grep ^SN $1| cut -f 2,4 | sed 's/\t# \(.*\)/ [\1]/' | sed 's/://' | awk 'BEGIN{print "sample"};{print}' > $collection

    # Append one value column per file, headed by the file name cut at ".fastq".
    for i in "$@"
    do
        paste $collection <(grep ^SN $i| cut -f 3 | awk -v var=${i/.fastq*/} 'BEGIN{print var};{print}') > $scratch
        mv $scratch $collection
    done
    echo Output: samtools-stats.collection.txt
}
251 | 
# Print, side by side, selected fields of the `time -v` log $1: the timed command,
# user time, system time, CPU percentage, elapsed time and maximum resident set size.
pasteTimeOutput(){
    local log=$1
    paste \
        <(grep -E "Command being timed" $log | sed 's/.*Command being timed://') \
        <(grep -E "^\sUser time" $log) \
        <(grep -E "^\sSystem time" $log) \
        <(grep -E "^\sPercent of CPU this job got" $log) \
        <(grep -E "^\sElapsed" $log) \
        <(grep -E "^\sMaximum resident set size" $log)
}
261 | 
262 | 
# Convert each given SAM file to BAM (the last three characters of the name are
# replaced by "bam"). The SAM is deleted on success; on failure the partial BAM is
# deleted and a message is printed.
sam2bam(){
    local bam
    for i in "$@"
    do
        echo $(date) - $i
        bam=${i:0:-3}bam
        if samtools view -b $i > $bam
        then
            rm $i
        else
            rm $bam
            echo SamToBam failed: $i
        fi
    done
}
277 | 


--------------------------------------------------------------------------------
/src/Trimmer/wrapper_detect_adapter_pe.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | # f_procs(x::String) = x == "-p" || x == "--procs"
  3 | 
  4 | function julia_wrapper_detect_adapter_pe(ARGS::Vector{String}; exit_after_help = true)
  5 | 
  6 |     time_program_initializing = time()
  7 | 
  8 |     args = parsing_args(ARGS; exit_after_help = exit_after_help)
  9 | 
 10 |     if args === nothing  # ARGS is ["-h"]
 11 |         return 0
 12 |     end
 13 |     args_range_test(args)
 14 |     
 15 |     nthread = args["threads"]
 16 |     outdir = args["output-dir"]
 17 | 
 18 |     nfile = length(args["read1"])
 19 |     file_range = 1:nfile
 20 | 
 21 |     #================== Arguments ====================#
 22 | 
 23 |     max_chunk_size           =  2 ^ args["log2-chunk-size"]
 24 | 
 25 |     # NOTE: TruncSeq has some unknown accuracy problems.
 26 |     kmer_tolerance           = args["kmer-tolerance"          ]
 27 |     kmer_n_match             = args["kmer-n-match"            ]
 28 | 
 29 |     # quality
 30 |     quality_offset     = Trimmer.get_quality_offset(args["quality-format"])
 31 | 
 32 | 
 33 |     mkpath(outdir)
 34 | 
 35 | 
 36 |     #================== Main function and common variables ====================#
 37 | 
 38 |     in1bytes = Vector{UInt8}(undef, max_chunk_size)
 39 |     in2bytes = Vector{UInt8}(undef, max_chunk_size)
 40 | 
 41 |     # number of jobs to boxing FqRecord from UInt8 Vector
 42 |     njobs = nthread * 10
 43 |     vr1s = ntuple(_ -> Vector{FqRecord}(), njobs)
 44 |     vr2s = ntuple(_ -> Vector{FqRecord}(), njobs)
 45 | 
 46 |     r1s = Vector{FqRecord}()
 47 |     r2s = Vector{FqRecord}()
 48 | 
 49 |     time_program_initializing = time() - time_program_initializing
 50 | 
 51 |     adapter_detection_summary = init_adapter_detection_summary()
 52 |     #================== Iteration for paired files ====================#
 53 |     for filenum in file_range
 54 |         # filenum = 1
 55 |         time_file_initializing = time()
 56 | 
 57 | 
 58 |         #===== file names =====#
 59 | 
 60 |         file1 = args["read1"][filenum]
 61 |         file2 = args["read2"][filenum]
 62 | 
 63 |         # check whether this sample is processed before
 64 | 
 65 |         isingzip = occursin(r"\.gz$"i, file1)
 66 |         isinbzip2 = occursin(r"\.bz2$"i, file1)
 67 | 
 68 | 
 69 |         #===== file IO =====#
 70 |         halfthread = cld(nthread, 2)
 71 |         if isingzip
 72 |             io1 = open(`pigz -p$halfthread -cd $file1`, write=false)
 73 |             io2 = open(`pigz -p$halfthread -cd $file2`, write=false)
 74 |         elseif isinbzip2
 75 |             io1 = open(`pbzip2 -p$halfthread -cd $file1`, write=false)
 76 |             io2 = open(`pbzip2 -p$halfthread -cd $file2`, write=false)
 77 |         else
 78 |             io1 = open(file1, "r")
 79 |             io2 = open(file2, "r")
 80 |         end
 81 | 
 82 |         #================== Renew variables for read processing ====================#
 83 | 
 84 | 
 85 |         # setting chunk size for file 1 and file2
 86 |         chunk_size1, chunk_size2, uncompressed_size1, uncompressed_size2 = chunk_sizes(file1, file2, max_chunk_size)
 87 |         if (uncompressed_size1 == -1 || uncompressed_size2 == -1) && (isingzip || isinbzip2)
 88 |             # file is gzip but uncompressed size not known.
 89 |             # do not resize. just assume R1/2 is the original data, which means insert size is evenly-distributed.
 90 |             chunk_size1 = length(in1bytes)
 91 |             chunk_size2 = length(in2bytes)
 92 |         else
 93 |             resize!(in1bytes, chunk_size1)
 94 |             resize!(in2bytes, chunk_size2)
 95 |         end
 96 | 
 97 |         # clear common variables
 98 |         empty!(r1s)
 99 |         empty!(r2s)
100 | 
101 |         n_reads = 0
102 |         n_r1 = 0
103 |         n_r2 = 0
104 |         nbatch = 0
105 |         total_read_copied_in_loading = 0
106 |         total_n_bytes_read1 = 0
107 |         total_n_bytes_read2 = 0
108 |         in1bytes_nremain = 0
109 |         in2bytes_nremain = 0
110 | 
111 |         #================== File processing ====================#
112 |         task_r1s_unbox = Threads.@spawn 1
113 |         task_r2s_unbox = Threads.@spawn 1
114 | 
115 |         # the first cycle to generate compiled code?
116 |         function cycle_wrapper()
117 |             nbatch += 1
118 |             n_r1_before = length(r1s) - n_reads
119 |             n_r2_before = length(r2s) - n_reads
120 | 
121 |             if typeof(io1) <: IOStream  # not compressed
122 |                 length(in1bytes) == chunk_size1 || resize!(in1bytes, chunk_size1)
123 |                 length(in2bytes) == chunk_size2 || resize!(in2bytes, chunk_size2)
124 |                 (n_r1, n_r2, r1s, r2s, ncopied) = load_fqs_threads!(io1, io2, in1bytes, in2bytes, vr1s, vr2s, r1s, r2s, task_r1s_unbox, task_r2s_unbox; remove_first_n = n_reads, njobs = njobs, quality_offset = quality_offset)
125 |             else  # gziped
126 |                 total_n_bytes_read1 += length(in1bytes)  # will read INT in this batch
127 |                 total_n_bytes_read2 += length(in2bytes)  # will read INT in this batch
128 |                 will_eof1 = total_n_bytes_read1 >= uncompressed_size1
129 |                 will_eof2 = total_n_bytes_read2 >= uncompressed_size2
130 |                 (n_r1, n_r2, r1s, r2s, in1bytes_nremain, in2bytes_nremain, ncopied) = load_fqs_threads!(
131 |                     io1, io2,
132 |                     in1bytes, in2bytes, in1bytes_nremain, in2bytes_nremain,
133 |                     vr1s, vr2s, r1s, r2s, task_r1s_unbox, task_r2s_unbox;
134 |                     will_eof1 = will_eof1, will_eof2 = will_eof2,
135 |                     in1bytes_resize_before_read = chunk_size1,
136 |                     in2bytes_resize_before_read = chunk_size2,
137 |                     remove_first_n = n_reads, quality_offset = quality_offset,
138 |                     njobs = njobs
139 |                 )
140 |             end
141 | 
142 |             n_reads = min(n_r1, n_r2)
143 |             total_read_copied_in_loading += ncopied
144 | 
145 |             # it only get the sizes, did not change the sizes. Size changing is done in the "Read" part.
146 |             chunk_size1, chunk_size2 = get_ideal_inbyte_sizes(in1bytes, in2bytes, n_r1, n_r2, n_r1_before, n_r2_before, max_chunk_size, chunk_size1, chunk_size2)
147 | 
148 |             # check_fq_ids(r1s::Vector{FqRecord}, r2s::Vector{FqRecord}, n_reads::Int)::nothing
149 | 
150 |             # processing reads
151 |             r1_stats, r2_stats = check_pe_match(r1s, r2s; kmer_tolerance = kmer_tolerance + 1, kmer_n_match = kmer_n_match, occurance = 0.0004)
152 | 
153 |             show_paired_adapter_result(file1, r1_stats, n_reads)
154 |             show_paired_adapter_result(file2, r2_stats, n_reads)
155 |             push_adapter_detection_summary!(adapter_detection_summary, file1, r1_stats, file2, r2_stats)
156 |         end
157 | 
158 |         cycle_wrapper()
159 | 
160 |         #================== Close files ====================#
161 | 
162 |         close(io1)
163 |         close(io2)
164 |     end
165 | 
166 |     timestamp = replace(string(now()), r"[T:\.]" => "-")
167 |     adapter_detection_summary_file = joinpath(outdir, "atria_adapter_detect_summary.$timestamp.txt")
168 |     CSV.write(adapter_detection_summary_file, adapter_detection_summary, delim = '\t')
169 |     println("""
170 |     _________________________________
171 | 
172 |     Summary of detected adapters is saved to $adapter_detection_summary_file
173 |     
174 |     _________________________________
175 | 
176 |     Paired-end Adapter Detection Note: 
177 |     
178 |     Atria detects adapter sequences using paired-end information. Adapter sequences are truncated to 16-bp, which are accurate enough for trimming. From experiments of many popular trimmers, increasing adapter length from 16 to 33 does not increase accuracy (Figure 4C of https://doi.org/10.46471/gigabyte.31).
179 | 
180 |     Adapter detection is the last choice because its accuracy is highly based on your data. If your data has been trimmed, the remaining adapters may not be enough for accurate guessing. We suggest using adapter detection only when you cannot find the actual adapter sequence.
181 | 
182 |     Besides, Atria does not automatically trim auto-detected adapters. It is your responsibility to check whether the detected adapters are real.
183 |     
184 |     Those rules can be used to check the adapter results: 
185 |     
186 |     (1) An Illumina sequence file only has ONE adapter sequence. 
187 |     
188 |     (2) In the same batch of NGS experiments, all R1 samples should have the SAME adapter sequence, so do R2 samples. The most prevalent adapters of R1 and R2 might be true for all your data.
189 |     _________________________________
190 | 
191 |     Summary of detected adapters is saved to $adapter_detection_summary_file
192 |     _________________________________
193 | 
194 |     """)
195 | 
196 |     return 0
197 | end # func
198 | 


--------------------------------------------------------------------------------
/src/FqRecords/check_and_trim.jl:
--------------------------------------------------------------------------------
  1 | 
  2 | @inline function isinreadlength!(r::FqRecord, length_range::UnitRange{Int64})::Bool
  3 |     (length(r.seq)::Int64 in length_range)::Bool
  4 | end
  5 | @inline function isinreadlength!(r1::FqRecord, r2::FqRecord, length_range::UnitRange{Int64})::Bool
  6 |     res1 = (length(r1.seq)::Int64 in length_range)::Bool
  7 |     res2 = (length(r2.seq)::Int64 in length_range)::Bool
  8 |     (res1 && res2)
  9 | end
 10 | 
 11 | @inline function count_N(r::FqRecord)::Float64
 12 |     # A/T/G/C: 0001, 0010, 0100, 1000: count_ones == 1
 13 |     # N      : 1111                  : count_ones == 4
 14 |     n_1s = 0
 15 |     for b in r.seq.data
 16 |         n_1s += count_ones(b::UInt64)::Int64
 17 |     end
 18 |     @fastmath((n_1s - length(r.seq)::Int64)::Int64 / 3.0)::Float64
 19 | end
 20 | 
 21 | @inline function isnotmuchN!(r::FqRecord, max_N::Int64)::Bool
 22 |     c1 = count_N(r)::Float64
 23 |     c1 <= max_N
 24 | end
 25 | @inline function isnotmuchN!(r1::FqRecord, r2::FqRecord, max_N::Int64)::Bool
 26 |     c1 = count_N(r1)::Float64
 27 |     c2 = count_N(r2)::Float64
 28 |     res1 = (c1 <= max_N::Int64)
 29 |     res2 = (c2 <= max_N::Int64)
 30 |     res1 && res2
 31 | end
 32 | 
 33 | @inline function front_trim!(r::FqRecord, ntrim::Int64)::Nothing
 34 |     if ntrim <= 0
 35 |     elseif ntrim < length(r.seq)
 36 |         delete_range = 1:ntrim
 37 |         deleteat!(r.seq, delete_range)
 38 |         deleteat!(r.qual, delete_range)
 39 |         deleteat!(r.prob, delete_range)
 40 |     else  # ntrim >= length(r.seq)
 41 |         resize!(r.seq, 0)
 42 |         resize!(r.qual, 0)
 43 |         resize!(r.prob, 0)
 44 |     end
 45 |     return
 46 | end
 47 | 
 48 | # @inline function tail_trim!(r::FqRecord, m::AlignMatch)::Nothing
 49 | #     resize!(r.seq::LongDNA{4}, m.insert_size::Int64)
 50 | #     resize!(r.qual, m.insert_size)
 51 | #     resize!(r.prob, m.insert_size)
 52 | #     return
 53 | # end
 54 | 
 55 | @inline function tail_trim!(r::FqRecord, nremain::Int64)::Nothing
 56 |     if nremain < length(r.seq::LongDNA{4})
 57 |         resize!(r.seq::LongDNA{4}, nremain::Int64)
 58 |         resize!(r.qual, nremain)
 59 |         resize!(r.prob, nremain)
 60 |     end
 61 |     return
 62 | end
 63 | 
 64 | @inline function tail_N_trim!(r::FqRecord, stats::TrimStats)::Nothing
 65 |     nbase = length(r.seq::LongDNA{4})::Int64
 66 |     # trim end
 67 |     n = nbase::Int64
 68 |     @inbounds while n::Int64 >= 1
 69 |         (r.seq::LongDNA{4})[n]::DNA == DNA_N ? n -= 1 : break
 70 |     end
 71 |     if n::Int64 != nbase::Int64
 72 |         @atomic stats.tail_N_trim += 1
 73 |         resize!(r.seq::LongDNA{4}, n::Int64)
 74 |         resize!(r.qual, n)
 75 |         resize!(r.prob, n)
 76 |     end
 77 |     return
 78 | end
 79 | 
 80 | @inline function tail_low_qual_trim!(r::FqRecord, stats::TrimStats)::Nothing
 81 |     nbase = length(r.seq::LongDNA{4})::Int64
 82 |     # trim end
 83 |     n = nbase::Int64
 84 |     @inbounds while n::Int64 >= 1
 85 |         (r.prob)[n] < 0.3 ? n -= 1 : break  # 0.3: phred Q < 1.5
 86 |     end
 87 |     if n::Int64 != nbase::Int64
 88 |         @atomic stats.tail_low_qual_trim += 1
 89 |         resize!(r.seq::LongDNA{4}, n::Int64)
 90 |         resize!(r.qual, n)
 91 |         resize!(r.prob, n)
 92 |     end
 93 |     return
 94 | end
 95 | 
 96 | """
 97 |     qualitymatch(r::FqRecord, q0::UInt8, qn::UInt64, n::Int64)::Int64
 98 | 
 99 | # ARGUMENTS
100 | 1. `r::FqRecord` is FastQ record.
101 | 2. `q0::UInt8` is the adjusted quality score. (Eg: +33 if Illumina 1.9+ version).
102 | 3. `qn::UInt64` is the adjusted quality score * n. (Eg: +33 if Illumina 1.9+ version).
103 | 4. `n::Int64` is the length of sliding window to iterate the reads.
104 | 
105 | Return the length `n` of reads to keep. `-1` means no need for quality trimming.
106 | """
107 | @inline function qualitymatch(r::FqRecord, q0::UInt8, qn::UInt64, n::Int64)::Int64
108 |     quals = r.qual
109 |     nqual = length(quals)
110 |     N = n - 1
111 |     nbase = nqual - N
112 |     i = 1
113 | 
114 |     ### check any qual less than q0
115 |     while i <= nqual
116 |         if @inbounds(quals[i]) < q0
117 |             break  # start matching sliding window
118 |         end
119 |         i += 1
120 |     end
121 | 
122 |     (i > nqual) && return -1  # no qual less than q0: not trim
123 |     (i > nbase) && @goto tail_qual_match  # i in the last n bases, go to tail_qual_match
124 | 
125 |     ### check sliding window
126 |     qual_sum = UInt64(@inbounds quals[i])
127 |     start = i + 1
128 |     stop = i + N
129 |     for m in start:stop
130 |         qual_sum += @inbounds quals[m]
131 |     end
132 | 
133 |     (qual_sum < qn) && @goto tail_qual_match  # ith failed quality match
134 | 
135 |     i += 1
136 |     while i <= nbase
137 |         qual_sum += @inbounds quals[i+N]
138 |         qual_sum -= @inbounds quals[i-1]
139 |         (qual_sum < qn) && @goto tail_qual_match  # ith failed quality match
140 |         i += 1
141 |     end
142 | 
143 |     @label tail_qual_match
144 |     while i <= nqual
145 |         (@inbounds(quals[i]) < q0) && return i-1  # ith failed quality match
146 |         i += 1
147 |     end
148 | 
149 |     return -1  # no trim
150 | end
151 | 
"""
    seq_complexity(r::FqRecord)
    seq_complexity(seq::LongDNA{4})

The complexity is defined as the percentage of bases that are different from their next bases (base[i] != base[i+1]). However, here we use an approximation algorithm: for every packed 64-bit word `w` (16 bases), `count_ones(w & (w << 4))` counts the bits shared by neighbouring bases inside that word, and the total is scaled by the number of comparable positions.

The performance of the algorithm:
```
# Test Sequence            True  Computed Complexity
NNNNNNNNNNNNNNNNNNNNNNNN: (0.0  -2.8260869565217392)
------------------------: (0.0  1.0)
AAAAAAAAAAAAAAAAAAAAAAAA: (0.0  0.04347826086956519)
ATATATATATATATATATATATAT: (1.0  1.0)
ATTATTATTATTATTATTATTATT: (0.65 0.6521739130434783)
ATATATATGGGGGGGG        : (0.5  0.5333333333333333)
NANANANANANANANA        : (NaN  0.0)
```
"""
@inline function seq_complexity(seq::LongDNA{4})
    # use the stored length field instead of length(seq): the sequence may start
    # from the middle of the data, which is not compatible with the algorithm
    nbase = seq.len % Int64
    words = seq.data
    nwords = length(words) - 1  # the last word is excluded (bitsafe padding)

    shared_bits = 0
    for k in 1:nwords
        w = words[k]
        # Per 16-base word (1 - x/15):
        #   NNNNNNNNNNNNNNNN: 60 ones -> -3.0
        #   AAAAAAAAAAAAAAAA: 15 ones ->  0.0
        #   ATATATATATATATAT:  0 ones ->  1.0
        #   ATTATTATTATTATTA:  5 ones ->  0.6666666666666667
        #   ATATATATGGGGGGGG:  7 ones ->  0.5333333333333333
        shared_bits += count_ones(w & (w << 4))
    end

    # bases occupying the partially filled final word (0 when every word is full)
    tail_bases = nbase % 16
    denominator = tail_bases == 0 ? 15 * nwords : 15 * (nwords - 1) + tail_bases
    return @fastmath(1 - shared_bits / denominator)
end
194 | 
# Convenience method: complexity of the sequence stored in a FASTQ record.
@inline function seq_complexity(r::FqRecord)
    return seq_complexity(r.seq)
end
196 | 
197 | 
"""
    polyX_tail_scan(a::DNA, b::LongDNA{4}, allowed_mismatch_per_16mer::Int64; until::Int64 = 1)

Scan `b` from its 3' end towards index `until` for a tail made of base `a`.

Return `(start_index, tail_length)` where `tail_length == length(b) - start_index + 1`,
or `(0, 0)` when no tail is found.

The scan has three phases:
1. walk backwards accepting any base compatible with `a` (ambiguous bases match)
   until the mismatch budget is exceeded;
2. try to elongate further, this time requiring exact equality with `a`;
3. walk forwards from the candidate start and move the start past a leading
   stretch that has at least as many mismatches as matches.
"""
@inline function polyX_tail_scan(a::DNA, b::LongDNA{4}, allowed_mismatch_per_16mer::Int64; until::Int64 = 1)
    seqlen = length(b)
    tail_start = 0
    pos = seqlen
    mismatches = 0
    budget = allowed_mismatch_per_16mer

    ### phase 1: ambiguity-tolerant backward scan
    while pos >= until
        if (@inbounds(b[pos]) & a) == a  # ambiguous DNA is true
            tail_start = pos
        else
            mismatches += 1
            mismatches > budget && break
        end
        pos -= 1
        if (seqlen - pos) % 16 == 0
            # NOTE(review): this reassigns the initial value, so the budget never
            # grows with the scanned length - confirm this is intended.
            budget = allowed_mismatch_per_16mer
        end
    end

    ### phase 2: check if can elongate (exact matches only)
    elongated_start = 0
    pos -= 1
    while pos >= until
        if @inbounds(b[pos]) === a
            elongated_start = pos
        else
            mismatches += 1
            mismatches > budget && break
        end
        pos -= 1
        if (seqlen - pos) % 16 == 0
            budget = allowed_mismatch_per_16mer
        end
    end

    if elongated_start > 0  # found
        tail_start = elongated_start
    elseif tail_start == 0  # not found
        return 0, 0
    end

    ### phase 3: reverse check from the candidate start
    fwd_matches = 1
    fwd_mismatches = 0
    pos = tail_start + 1
    seen_mismatch = false
    while fwd_matches <= allowed_mismatch_per_16mer && pos <= seqlen
        if @inbounds(b[pos]) === a
            seen_mismatch && break  # first match after a mismatch run ends the check
            fwd_matches += 1
        else
            seen_mismatch = true
            fwd_mismatches += 1
        end
        pos += 1
    end
    if fwd_mismatches >= fwd_matches  # revert
        tail_start = pos
    end
    if tail_start > seqlen  # occurs when very poor match
        return 0, 0
    end

    return tail_start, seqlen - tail_start + 1
end
269 | 
# Convenience method: scan the sequence stored in a FASTQ record.
@inline function polyX_tail_scan(a::DNA, b::FqRecord, allowed_mismatch_per_16mer::Int64; until::Int64 = 1)
    return polyX_tail_scan(a, b.seq, allowed_mismatch_per_16mer; until = until)
end
271 | 


--------------------------------------------------------------------------------
/src/FqRecords/thread_output.jl:
--------------------------------------------------------------------------------
  1 | """
  2 |     bytes_tmp1 = Vector{UInt8}(undef, 67108864) # 2^26
  3 | 
  4 | Used for writebytes(io1out::CodecZlibIO, outr1s, range_filter, bytes_tmp1)
  5 | """
  6 | bytes_tmp1 = Vector{UInt8}(undef, 67108864) # 2^26
  7 | bytes_tmp2 = Vector{UInt8}(undef, 67108864) # 2^26
  8 | 
  9 | """
 10 |     write_fqs_threads!(io1out::IO, io2out::IO,
 11 |         outr1s::Vector{Vector{UInt8}}, outr2s::Vector{Vector{UInt8}},
 12 |         r1s::Vector{FqRecord}, r2s::Vector{FqRecord},
 13 |         n_reads::Int, range_filter, task_write1, task_write2)
 14 | 
 15 | The interface to write paired FASTQ reads. 
 16 | 
 17 | - `r1s` and `r2s`: reads to write.
 18 | """
 19 | function write_fqs_threads!(io1out::IOStream, io2out::IOStream,
 20 |     outr1s::Vector{Vector{UInt8}}, outr2s::Vector{Vector{UInt8}},
 21 |     r1s::Vector{FqRecord}, r2s::Vector{FqRecord},
 22 |     n_reads::Int, range_filter, task_write1, task_write2)
 23 | 
 24 |     task_r1s_unbox = Threads.@spawn begin
 25 |         wait(task_write1)  # last task
 26 |         # @info "write_fqs_threads! FqRecord2StringVec! - start - R1 : n_reads = $n_reads"
 27 |         FqRecord2StringVec!(outr1s::Vector{Vector{UInt8}}, r1s::Vector{FqRecord}, n_reads::Int)
 28 |         # @info "write_fqs_threads! FqRecord2StringVec! - done  - R1 : n_reads = $n_reads"
 29 |     end
 30 | 
 31 |     task_write1_new = Threads.@spawn begin
 32 |         wait(task_r1s_unbox)  # last task
 33 |         writebytes(io1out, outr1s, range_filter) # new task
 34 |     end
 35 | 
 36 |     task_r2s_unbox = Threads.@spawn begin
 37 |         wait(task_write2)
 38 |         # @info "write_fqs_threads! FqRecord2StringVec! - start - R2 : n_reads = $n_reads"
 39 |         FqRecord2StringVec!(outr2s::Vector{Vector{UInt8}}, r2s::Vector{FqRecord}, n_reads::Int)
 40 |         # @info "write_fqs_threads! FqRecord2StringVec! - done  - R2 : n_reads = $n_reads"
 41 |     end
 42 | 
 43 |     task_write2_new = Threads.@spawn begin
 44 |         wait(task_r2s_unbox)  # last task
 45 |         writebytes(io2out, outr2s, range_filter)
 46 |     end
 47 | 
 48 |     task_r1s_unbox, task_r2s_unbox, task_write1_new, task_write2_new
 49 | end
 50 | function write_fqs_threads!(io1out::IO, io2out::IO,
 51 |     outr1s::Vector{Vector{UInt8}}, outr2s::Vector{Vector{UInt8}},
 52 |     r1s::Vector{FqRecord}, r2s::Vector{FqRecord},
 53 |     n_reads::Int, range_filter, task_write1, task_write2)
 54 | 
 55 |     task_r1s_unbox = Threads.@spawn begin
 56 |         wait(task_write1)  # last task
 57 |         FqRecord2StringVec!(outr1s::Vector{Vector{UInt8}}, r1s::Vector{FqRecord}, n_reads::Int)
 58 |     end
 59 | 
 60 |     task_write1_new = Threads.@spawn begin
 61 |         wait(task_r1s_unbox)  # last task
 62 |         writebytes(io1out, outr1s, range_filter, bytes_tmp1) # new task
 63 |     end
 64 |     
 65 |     task_r2s_unbox = Threads.@spawn begin
 66 |         wait(task_write2)
 67 |         FqRecord2StringVec!(outr2s::Vector{Vector{UInt8}}, r2s::Vector{FqRecord}, n_reads::Int)
 68 |     end
 69 | 
 70 |     task_write2_new = Threads.@spawn begin
 71 |         wait(task_r2s_unbox)  # last task
 72 |         writebytes(io2out, outr2s, range_filter, bytes_tmp2)
 73 |     end
 74 | 
 75 |     task_r1s_unbox, task_r2s_unbox, task_write1_new, task_write2_new
 76 | end
 77 | 
 78 | function write_fqs_threads!(io1out::IOStream,
 79 |     outr1s::Vector{Vector{UInt8}},
 80 |     r1s::Vector{FqRecord},
 81 |     n_reads::Int, range_filter, task_write1)
 82 | 
 83 |     wait(task_write1)
 84 |     task_r1s_unbox = Threads.@spawn begin
 85 |         FqRecord2StringVec!(outr1s::Vector{Vector{UInt8}}, r1s::Vector{FqRecord}, n_reads::Int)
 86 |     end
 87 |     task_write1_new = Threads.@spawn begin
 88 |         wait(task_r1s_unbox)
 89 |         writebytes(io1out, outr1s, range_filter)
 90 |     end
 91 |     task_r1s_unbox, task_write1_new
 92 | end
 93 | function write_fqs_threads!(io1out::IO,
 94 |     outr1s::Vector{Vector{UInt8}},
 95 |     r1s::Vector{FqRecord},
 96 |     n_reads::Int, range_filter, task_write1)
 97 | 
 98 |     wait(task_write1)
 99 |     task_r1s_unbox = Threads.@spawn begin
100 |         FqRecord2StringVec!(outr1s::Vector{Vector{UInt8}}, r1s::Vector{FqRecord}, n_reads::Int)
101 |     end
102 |     task_write1_new = Threads.@spawn begin
103 |         wait(task_r1s_unbox)
104 |         writebytes(io1out, outr1s, range_filter, bytes_tmp1)
105 |     end
106 |     task_r1s_unbox, task_write1_new
107 | end
108 | 
109 | 
"""
    FqRecord2StringVec!(out::Vector{UInt8}, r::FqRecord)

Empty `out`, and then serialize `r` into it as four newline-terminated lines
(`id`, sequence, `des`, quality). If the sequence is empty, a single `N` is
written as sequence and a single `!` as quality.
"""
@inline function FqRecord2StringVec!(out::Vector{UInt8}, r::FqRecord)::Nothing
    empty!(out)
    append!(out, r.id::Vector{UInt8})

    if r.seq.len == 0x0000000000000000  # isempty(r.seq::LongDNA{4})
        push!(out, 0x0a, 0x4e, 0x0a)  # \nN\n
        append!(out, r.des::Vector{UInt8})
        push!(out, 0x0a, 0x21, 0x0a)  # \n!\n
    else
        push!(out, 0x0a)

        # reserve room for the bases, then fill them in place
        offset = length(out)
        seq = r.seq
        resize!(out, offset + length(seq))
        @inbounds for (k, base) in enumerate(seq)
            out[offset + k] = UInt8(convert(Char, base))
        end

        push!(out, 0x0a)
        append!(out, r.des::Vector{UInt8})
        push!(out, 0x0a)
        append!(out, r.qual::Vector{UInt8})
        push!(out, 0x0a)
    end
    return nothing
end
148 | 
149 | 
"""
    FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, stop::Int)
    FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, reads_range::UnitRange)

- `outrs`: the vector of string vectors to be modified in place. It is grown
  (never shrunk) when it holds fewer than `stop` buffers.

- `rs`: the vector of reads to be converted.

- `stop::Int`: only convert `rs` in the range of `1:stop`. The work is split
  into chunks of 3072 reads, each converted in its own spawned task; the call
  returns after all chunks are done.

- `reads_range::UnitRange`: only convert `rs` in the reads range.

Throws a `BoundsError` when `rs` holds fewer than `stop` reads, because the
chunk workers index `rs` without bounds checks.
"""
@inline function FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, stop::Int)::Nothing
    if length(rs) < stop
        # the workers below use @inbounds: refuse to read past the end of `rs`
        @error "length(rs) < stop" length(rs) stop
        throw(BoundsError(rs, stop))
    end

    n_outrs = length(outrs)
    if n_outrs < stop
        # make outrs larger
        append!(outrs,
            Vector{UInt8}[Base.StringVector(0) for i = 1:(stop-n_outrs)])
    end

    @sync for reads_start in 1:3072:stop
        reads_end = min(reads_start + 3071, stop)
        reads_range = reads_start:reads_end
        Threads.@spawn FqRecord2StringVec!(outrs, rs, reads_range)
    end
    return nothing
end
179 | 
# Range worker: serialize `rs[i]` into `outrs[i]` for every `i` in `reads_range`.
# No bounds checks are performed; the caller must ensure both vectors cover the range.
@inline function FqRecord2StringVec!(outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, reads_range::UnitRange)::Nothing
    idx = first(reads_range)
    last_idx = last(reads_range)
    @inbounds while idx <= last_idx
        FqRecord2StringVec!(outrs[idx], rs[idx])
        idx += 1
    end
    return nothing
end
186 | 
187 | 
# Write the buffers `outrs[1:stop]` to an `IOStream`, one `write_no_lock` call
# per buffer. No bounds checks are performed.
@inline function writebytes(io::IOStream, outrs::Vector{Vector{UInt8}}, stop::Int)::Nothing
    k = 1
    @inbounds while k <= stop
        write_no_lock(io, outrs[k])
        k += 1
    end
    return nothing
end
# Write to an `IOStream` only the buffers whose flag in `filters` is `true`;
# the k-th flag selects `outrs[k]`. No bounds checks are performed.
@inline function writebytes(io::IOStream, outrs::Vector{Vector{UInt8}}, filters::SubArray{Bool,1,Array{Bool,1},Tuple{UnitRange{Int64}},true})::Nothing
    @inbounds for (k, keep) in enumerate(filters)
        keep || continue
        write_no_lock(io, outrs[k])
    end
    return nothing
end
202 | 
# Write the buffers `outrs[1:stop]` to a generic `IO` with a single `write` call.
#
# For CodecZlib streams, calling write once increases speed (3.2X); it is even
# faster than calling pigz in shell.
#
# The buffers `outrs[2:stop]` are appended to `outrs[1]`, so `outrs[1]` is
# modified and holds the whole batch afterwards.
#
# Nothing is written when `stop < 1`. Without this guard `outrs[1]` would be
# read (without bounds check) and written even though no record was requested.
@inline function writebytes(io::IO, outrs::Vector{Vector{UInt8}}, stop::Int)::Nothing
    stop < 1 && return nothing

    v_all = @inbounds outrs[1]
    @inbounds for i in 2:stop
        append!(v_all, outrs[i])
    end
    write(io, v_all)
    return nothing
end
213 | 
214 | 
215 | 
# Write to a generic `IO` the buffers whose flag in `filters` is `true`, using a
# single write call (for CodecZlib streams, calling write once increases speed).
#
# The selected buffers are first packed back to back into `bytes_tmp`, which is
# grown in place (by at least 2097152 bytes) when too small and never shrunk.
#
# Timings noted by the original author for the alternatives:
#   - append everything into a fresh vector, then write: 112s
#   - one `write` per selected buffer:                   213s
#   - pack into a reusable buffer (this method):         99s, the same speed as pigz
#
# The packed prefix is written through `unsafe_write` under `GC.@preserve`, so
# the raw pointer into `bytes_tmp` is only used while the buffer is rooted.
@inline function writebytes(io::IO, outrs::Vector{Vector{UInt8}}, filters::SubArray{Bool,1,Array{Bool,1},Tuple{UnitRange{Int64}},true}, bytes_tmp::Vector{UInt8})::Nothing
    next = 1                       # next free position in bytes_tmp
    capacity = length(bytes_tmp)
    @inbounds for (i, keep) in enumerate(filters)
        keep || continue
        outr = outrs[i]::Vector{UInt8}
        ncopy = length(outr)::Int
        last_needed = (next + ncopy - 1)::Int
        if last_needed > capacity
            capacity = max(capacity + 2097152, last_needed)::Int
            resize!(bytes_tmp, capacity)
        end
        unsafe_copyto!(bytes_tmp, next, outr, 1, ncopy)
        next += ncopy
    end

    nbytes = next - 1
    GC.@preserve bytes_tmp unsafe_write(io, pointer(bytes_tmp), UInt(nbytes))
    return nothing
end
257 | 
# Serialize `rs[1:stop]` into `outrs` and write exactly those `stop` buffers to `io`.
#
# `stop` is forwarded to `writebytes`: `outrs` is only ever grown, so it may
# hold more (stale) buffers than were converted in this call, and the
# `writebytes` methods in this file all require the range/filter argument.
@inline function fqwriterecord!(io::IO, outrs::Vector{Vector{UInt8}}, rs::Vector{FqRecord}, stop::Int)
    FqRecord2StringVec!(outrs, rs, stop)
    return writebytes(io, outrs, stop)
end
262 | 


--------------------------------------------------------------------------------
/src/Benchmark/read_stats.jl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env julia
  2 | 
  3 | # using BioSymbols
  4 | # using BioSequences
  5 | # using Statistics
  6 | #
  7 | # include("apiBioFqRecords.jl")
  8 | 
  9 | function julia_wrapper_readstat(ARGS)
 10 | 
 11 |     help_page = """
 12 |     usage: atria readstat [-h] FASTQS...
 13 | 
 14 |     positional arguments:
 15 |       FASTQS      input trimmed fastqs. caution: raw fastq has to be
 16 |                   generated by `atria simulate`. If multiple, two by two are considered paired.
 17 | 
 18 |     optional arguments:
 19 |       -h, --help  show this help message and exit
 20 |     """
 21 | 
 22 |     if "-h" in ARGS || "--help" in ARGS || length(ARGS) == 0
 23 |         println(help_page)
 24 |         return 0
 25 |     end
 26 | 
 27 |     time0 = time()
 28 | 
 29 |     n = length(ARGS)
 30 | 
 31 |     if n == 1
 32 |         peReadSimulatorStats_main(ARGS[1])
 33 |     elseif n % 2 == 0
 34 |         for i in 1:2:n
 35 |             peReadSimulatorStats_main(ARGS[i], ARGS[i+1])
 36 |         end
 37 |     else
 38 |         @error "multiple odd FASTQs detected. If providing multiple FASTQs, they are considered paired two by two."
 39 |     end
 40 | 
 41 |     @info "read simulation stats: all done" elapsed=time() - time0
 42 |     return 0
 43 | end
 44 | 
 45 | @inline function fastq_parser(r::FqRecord)
 46 |     splitted = split(String(copy(r.id)), " ")
 47 | 
 48 |     # read validate
 49 |     if length(splitted) < 7 || !occursin("@PeReadSimulator", splitted[1])
 50 |         @error "read simulation stats: read format invalid: reads should be simulated by peReadSimulator and read headers should be intact." invalid_header=String(r.id) _module=nothing _group=nothing _id=nothing _file=nothing
 51 |         exit(3)
 52 |     end
 53 | 
 54 |     # @PeReadSimulator2:1:1 TRUE=80 INSERT_SIZE=80 ERROR_RATE=0.00102 SEQ_LENGTH=100 ERROR_INSERT=0 ERROR_ADAPTER=0 SUB=0.001 INS=1.0e-5 DEL=1.0e-5
 55 | 
 56 |     seq_id = splitted[1]
 57 |     true_length = parse(Int64, splitted[2][6:end])
 58 |     insert_size = parse(Int64, splitted[3][13:end])
 59 |     error_rate = parse(Float64, splitted[4][12:end])
 60 |     seq_length = parse(Int64, splitted[5][12:end])
 61 |     error_insert = parse(Int64, splitted[6][14:end])
 62 |     error_adapter = parse(Int64, splitted[7][15:end])
 63 | 
 64 |     if r.seq == dna"N"  # compatible with Atria
 65 |         trimmed_length = 0
 66 |     else
 67 |         trimmed_length = length(r.seq)
 68 |     end
 69 | 
 70 |     delta_length = true_length - trimmed_length
 71 |     is_trim_successful = trimmed_length == true_length
 72 |     return (seq_id, seq_length, insert_size, error_rate, error_insert, error_adapter, true_length, trimmed_length, delta_length, is_trim_successful)
 73 | end
 74 | 
 75 | function stats(n_repeat::Int64, overtrim_deviations::Vector{Int64}, undertrim_deviations::Vector{Int64})
 76 |     n_overtrim  = length(overtrim_deviations )
 77 |     n_undertrim = length(undertrim_deviations)
 78 | 
 79 |     rate_precision = (n_repeat - n_overtrim - n_undertrim) / n_repeat
 80 |     rate_overtrim  = n_overtrim  / n_repeat
 81 |     rate_undertrim = n_undertrim / n_repeat
 82 | 
 83 |     median_deviation = 0
 84 |     median_deviation_overtrim = 0
 85 |     median_deviation_undertrim = 0
 86 | 
 87 |     if n_overtrim > 0
 88 |         median_deviation_overtrim = median(overtrim_deviations)
 89 |     end
 90 |     if n_undertrim > 0
 91 |         median_deviation_undertrim = median(undertrim_deviations)
 92 |     end
 93 |     if n_overtrim + n_undertrim > 0
 94 |         median_deviation = median!([overtrim_deviations; -undertrim_deviations])
 95 |     end
 96 | 
 97 |     # deviation greater than 1 bp stats
 98 |     overtrim_deviations_gt1 = filter(x -> abs(x) > 1, overtrim_deviations)
 99 |     undertrim_deviations_gt1 = filter(x -> abs(x) > 1, undertrim_deviations)
100 |     n_overtrim_gt1  = length(overtrim_deviations_gt1 )
101 |     n_undertrim_gt1 = length(undertrim_deviations_gt1)
102 | 
103 |     rate_precision_in1 = (n_repeat - n_overtrim_gt1 - n_undertrim_gt1) / n_repeat
104 |     rate_overtrim_gt1  = n_overtrim_gt1  / n_repeat
105 |     rate_undertrim_gt1 = n_undertrim_gt1 / n_repeat
106 | 
107 |     return rate_precision, rate_overtrim, rate_undertrim, median_deviation, median_deviation_overtrim, median_deviation_undertrim, rate_precision_in1, rate_overtrim_gt1, rate_undertrim_gt1
108 | end
109 | 
# Paired-end entry: interleave the records of `r1` and `r2` (4 lines each,
# alternating while either file has data) into the temporary file `r1 * ".r12"`,
# run the single-file statistics on it, and delete the temporary file.
#
# Both inputs and the temporary file are opened with `open(...) do`, so they
# are closed even when reading fails, and the temporary file is removed even
# when the statistics step throws.
function peReadSimulatorStats_main(r1::String, r2::String)
    tmp_file = r1 * ".r12"

    open(r1, "r") do io1
        open(r2, "r") do io2
            open(tmp_file, "w+") do io_out
                while !eof(io1) || !eof(io2)
                    eof(io1) || _copy_fastq_record(io_out, io1)
                    eof(io2) || _copy_fastq_record(io_out, io2)
                end
            end
        end
    end

    try
        peReadSimulatorStats_main(tmp_file)
    finally
        rm(tmp_file; force = true)
    end
    return nothing
end

# Copy one FASTQ record (four lines) from `src` to `dest`.
function _copy_fastq_record(dest::IO, src::IO)
    for _ in 1:4
        println(dest, readline(src))
    end
    return nothing
end
147 | 
# Single-file entry: read every record of the FASTQ `input`, write one row per
# read to `input * ".stat-detail.tsv"` and one row per group of consecutive
# reads sharing (seq_length, insert_size, error_rate) to `input * ".stat.tsv"`.
#
# A missing or empty input is reported with a warning and skipped.
# Returns `nothing`. The input stream is closed on every path (including the
# empty-file early return) and the output streams are closed even on errors.
function peReadSimulatorStats_main(input::String)
    @info "read simulation stats: start" input

    if !isfile(input)
        @warn "read simulation stats: input FASTQ file not valid: skip" FILE=input _module=nothing _group=nothing _id=nothing _file=nothing
        return nothing
    end

    io = open(input, "r")
    try
        # check if the file is empty
        if eof(io)
            @warn "read simulation stats: input FASTQ file empty: skip" FILE=input _module=nothing _group=nothing _id=nothing _file=nothing
            return nothing
        end

        open(input * ".stat-detail.tsv", "w+") do stat_detail
            open(input * ".stat.tsv", "w+") do stat_summary
                _write_read_stats(io, stat_detail, stat_summary)
            end
        end
    finally
        close(io)
    end

    @info "read simulation stats: output" detail="$input.stat-detail.tsv" summary="$input.stat.tsv"
    return nothing
end

# Stream all records from `io` (which must hold at least one record), writing
# the per-read table to `stat_detail` and the per-group table to `stat_summary`.
function _write_read_stats(io::IO, stat_detail::IO, stat_summary::IO)
    stat_detail_header = "seq_id\tseq_length\tinsert_size\terror_rate\terror_insert\terror_adapter\ttrue_length\ttrimmed_length\tdelta_length\tis_trim_successful"
    println(stat_detail, stat_detail_header)

    stat_summary_header = "seq_length\tinsert_size\terror_rate\trepeat\tprecision\trate_overtrim\trate_undertrim\tdeviation\tdeviation_overtrim\tdeviation_undertrim\trate_precision_in1\trate_overtrim_gt1\trate_undertrim_gt1"
    println(stat_summary, stat_summary_header)

    r = FqRecord()

    ### first read: opens the first group
    fqreadrecord!(r::FqRecord, io::IO)
    read_stat = fastq_parser(r)
    println(stat_detail, join(read_stat, "\t"))
    (_, seq_length, insert_size, error_rate, _, _, _, _, delta_length, _) = read_stat

    # identifier of the current group
    current_seq_length = seq_length
    current_insert_size = insert_size
    current_error_rate = error_rate

    # stats of the current group
    n_repeat = 1
    overtrim_deviations  = Vector{Int64}()
    undertrim_deviations = Vector{Int64}()
    _push_deviation!(overtrim_deviations, undertrim_deviations, delta_length)

    ### other reads
    while !eof(io)
        fqreadrecord!(r::FqRecord, io::IO)
        read_stat = fastq_parser(r)
        println(stat_detail, join(read_stat, "\t"))
        (_, seq_length, insert_size, error_rate, _, _, _, _, delta_length, _) = read_stat

        same_group = current_seq_length == seq_length &&
            current_insert_size == insert_size &&
            current_error_rate == error_rate

        if same_group
            n_repeat += 1
        else
            # new identifier: flush the finished group, then start a new one
            _write_summary_row(stat_summary, current_seq_length, current_insert_size, current_error_rate, n_repeat, overtrim_deviations, undertrim_deviations)

            current_seq_length = seq_length
            current_insert_size = insert_size
            current_error_rate = error_rate

            n_repeat = 1
            overtrim_deviations  = Vector{Int64}()
            undertrim_deviations = Vector{Int64}()
        end
        _push_deviation!(overtrim_deviations, undertrim_deviations, delta_length)
    end

    ### compute stats for the last group
    _write_summary_row(stat_summary, current_seq_length, current_insert_size, current_error_rate, n_repeat, overtrim_deviations, undertrim_deviations)
    return nothing
end

# Store a non-zero length deviation: positive values go to `overtrim`,
# negative values to `undertrim`; zero (exact trim) is not stored.
function _push_deviation!(overtrim::Vector{Int64}, undertrim::Vector{Int64}, delta_length::Integer)
    if delta_length > 0
        push!(overtrim, delta_length)
    elseif delta_length < 0
        push!(undertrim, delta_length)
    end
    return nothing
end

# Compute the group statistics and append them as one tab-separated row.
function _write_summary_row(stat_summary::IO, seq_length, insert_size, error_rate, n_repeat::Int64, overtrim::Vector{Int64}, undertrim::Vector{Int64})
    stats_results = stats(n_repeat, overtrim, undertrim)
    stats_results_string = join(Any[seq_length, insert_size, error_rate, n_repeat, stats_results...], "\t")
    println(stat_summary, stats_results_string)
    return nothing
end
257 | 


--------------------------------------------------------------------------------
/src/Benchmark/read_simulation.jl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env julia
  2 | 
  3 | # using ArgParse
  4 | 
  5 | function parsing_args_simulate(args; exit_after_help = true)
  6 |     settings = ArgParseSettings(exit_after_help = exit_after_help)
  7 | 
  8 |     add_arg_group!(settings, "output")
  9 |     @add_arg_table! settings begin
 10 |         "--prefix", "-o"
 11 |             help = "prefix of output fastq files"
 12 |             metavar = "PREF"
 13 |             default = "read_simulation"
 14 |     end
 15 | 
 16 |     add_arg_group!(settings, "simulation")
 17 |     @add_arg_table! settings begin
 18 |         "--repeat", "-x"
 19 |             help = "repeat times for each case"
 20 |             default = 30000
 21 |             arg_type = Int64
 22 |         "--adapter1", "-a"
 23 |             help = "read 1 adapter"
 24 |             metavar = "SEQ"
 25 |             default = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
 26 |         "--adapter2", "-A"
 27 |             help = "read 2 adapter"
 28 |             metavar = "SEQ"
 29 |             default = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
 30 |         "--seq-length", "-s"
 31 |             help = "a given sequence length; simulated sequence length might be 1 base more than the value because of simulated phasing error"
 32 |             default = 100
 33 |             arg_type = Int64
 34 |         "--insert-size-range", "-i"
 35 |             help = "range of insert size"
 36 |             nargs = '+'
 37 |             arg_type = Int64
 38 |             default = [80:2:120;]
 39 |         "--subsitution-rate", "-S"
 40 |             help = "subsitution rate per base. it is random for each base. error type includs mismatch"
 41 |             nargs = '+'
 42 |             arg_type = Float64
 43 |             default = [0.001:0.001:0.005;]
 44 |         "--insertion-rate", "-I"
 45 |             help = "insertion rate; number of arg should be the same as --subsitution-rate"
 46 |             nargs = '+'
 47 |             arg_type = Float64
 48 |             default = [1.0e-5:1.0e-5:5.0e-5;]
 49 |         "--deletion-rate", "-D"
 50 |             help = "deletion rate; number of arg should be the same as --subsitution-rate"
 51 |             nargs = '+'
 52 |             arg_type = Float64
 53 |             default = [1.0e-5:1.0e-5:5.0e-5;]
 54 |     end
 55 |     return parse_args(args, settings)
 56 | end
 57 | 
 58 | 
 59 | @inline function simulate_insert(insert_size::Int64)
 60 |     bases = rand(['A', 'T', 'C', 'G'], insert_size)
 61 |     string(bases...)
 62 | end
 63 | 
 64 | """
 65 |     simulate_error(base::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
 66 | 
 67 | Return `(base::String, iserror::Int64)`
 68 | """
 69 | @inline function simulate_error(base::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
 70 |     bases = ["A", "T", "C", "G"]
 71 | 
 72 |     randfloat = rand()
 73 |     if randfloat <= sub_rate
 74 |         ## subsitution
 75 |         idx_base = findfirst(x -> x == base, bases)
 76 |         idx_sub = rand(1:3)
 77 |         if idx_base == idx_sub
 78 |             return "G", 1  # bases[4] == "G"
 79 |         else
 80 |             return bases[idx_sub], 1
 81 |         end
 82 |     else
 83 |         randfloat -= sub_rate
 84 |         if randfloat <= insert_rate
 85 |             ## insert
 86 |             res = base * rand(bases)
 87 |             return res, 1
 88 |         else
 89 |             randfloat -= insert_rate
 90 |             if randfloat <= del_rate
 91 |                 ## deletion
 92 |                 return "", 1
 93 |             end
 94 |             ## no error
 95 |             return base, 0
 96 |         end
 97 |     end
 98 | end
 99 | 
"""
    simulate_read(insert::String, adapter::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)

Simulate one read: the bases of `insert`, then the bases of `adapter`, each
passed through `simulate_error`, then random bases from `simulate_insert` to
fill the read up to `seq_length`. Construction stops as soon as the read
reaches `seq_length`.

A base inserted by an error right after the last kept insert base is not
counted in the true insert size. If such an insertion is what makes the read
reach `seq_length` while still inside the insert, that extra base is also
removed from the returned read.

Return `res, true_insert_size, nerror_insert, nerror_adapter`
"""
@inline function simulate_read(insert::String, adapter::String, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)
    res = ""
    nerror_insert = 0
    nerror_adapter = 0

    # Stage 1: the insert, one base at a time.
    the_base = ""
    for pos in 1:length(insert)
        the_base, iserror = simulate_error(insert[pos:pos], sub_rate, insert_rate, del_rate)
        res *= the_base
        nerror_insert += iserror

        emitted = length(res)
        emitted < seq_length && continue

        # The read is full before the insert ended: no adapter is present.
        piece_len = length(the_base)
        if piece_len <= 1
            return res, emitted, nerror_insert, nerror_adapter
        elseif piece_len == 2
            # The trailing inserted base is not part of the real DNA fragment.
            return res[1:end-1], emitted - 1, nerror_insert, nerror_adapter
        else
            @error "Bugs at simulate_read()" the_base iserror length(res) _module=nothing _group=nothing _id=nothing _file=nothing
        end
    end

    # An insertion on the last insert base does not belong to the real fragment.
    trailing_insertion = length(the_base) == 2 ? 1 : 0
    true_insert_size = length(res) - trailing_insertion

    # Stage 2: the adapter, until the read is full.
    for pos in 1:length(adapter)
        adapter_piece, iserror = simulate_error(adapter[pos:pos], sub_rate, insert_rate, del_rate)
        res *= adapter_piece
        nerror_adapter += iserror
        if length(res) >= seq_length
            return res, true_insert_size, nerror_insert, nerror_adapter
        end
    end

    # Stage 3: random bases fill whatever is left of the read.
    res *= simulate_insert(seq_length - length(res))

    return res, true_insert_size, nerror_insert, nerror_adapter
end
150 | 
"""
    complement_char(c::Char)

Return the complement of an upper-case nucleotide character (`A`↔`T`,
`C`↔`G`). Any other character, including lower-case letters, gives `'N'`.
"""
@inline function complement_char(c::Char)
    c == 'A' && return 'T'
    c == 'T' && return 'A'
    c == 'C' && return 'G'
    c == 'G' && return 'C'
    return 'N'
end
164 | 
"""
    reverse_complement(s::String)

Return the reverse complement of `s`: the characters of `s` in reverse order,
each replaced by `complement_char`. The empty string gives the empty string.

Characters are visited one at a time rather than by byte index, so a string
containing multi-byte characters is handled (each becomes whatever
`complement_char` returns for it) instead of raising an index error.
"""
@inline function reverse_complement(s::String)
    # `map` over a string returns a string; no intermediate vector or splat.
    return map(complement_char, reverse(s))
end
170 | 
"""
    writeseq(io::IO, header::String, seq::String; error_rate=0.0001)

Write one FASTQ record (header, sequence, `+`, quality) to `io`.

Every base gets the same quality character. It is `'J'` when
`error_rate < 0.0001`; otherwise it is the Phred+33 character for
`round(-10 * log10(error_rate))`. The Phred score is clamped at 0, so an
`error_rate` above 1 yields `'!'` rather than a character below the Phred+33
range.
"""
function writeseq(io::IO, header::String, seq::String; error_rate=0.0001)
    println(io, header)
    println(io, seq)
    println(io, "+")
    qual_char = if error_rate < 0.0001
        'J'
    else
        # Phred scores cannot be negative; without the clamp an error rate
        # above 1 would map to a code point below '!' (33).
        phred = max(round(Int, -10 * log10(error_rate)), 0)
        Char(phred + 33)
    end
    println(io, qual_char ^ length(seq))
end
182 | 
"""
    julia_wrapper_simulate(ARGS; exit_after_help = true)

Command-line entry point of the adapter read simulator.

With no arguments the help text is requested from `parsing_args_simulate` and
`0` is returned. Otherwise, for every non-negative insert size, every error-rate
combination and every repeat, one read pair is simulated with `simulate_read`
(read 2 uses the reverse complement of the insert) and written with `writeseq`
to `<prefix>.R1.fastq` and `<prefix>.R2.fastq`. Negative insert sizes are
skipped.

The rate arguments are validated before any output file is created, and both
files are closed even if simulation or writing fails.

Throws an `ErrorException` when the three rate lists differ in length or when
any summed error rate exceeds 1. Returns `0`.
"""
function julia_wrapper_simulate(ARGS; exit_after_help = true)
    time0 = time()

    if length(ARGS) == 0
        parsing_args_simulate(["-h"], exit_after_help = exit_after_help)
        return 0
    end
    args = parsing_args_simulate(ARGS, exit_after_help = exit_after_help)
    args === nothing && return 0

    adapter1 = args["adapter1"]
    adapter2 = args["adapter2"]
    repeat_times = args["repeat"]
    seq_length = args["seq-length"]
    insert_sizes = args["insert-size-range"]

    insert_rates = args["insertion-rate"]
    deletion_rates = args["deletion-rate"]
    subsitution_rates = args["subsitution-rate"]

    # Validate first, so that bad arguments do not leave empty output files.
    length(insert_rates) == length(deletion_rates) == length(subsitution_rates) ||
        error("ArgumentError: the numbers of args of --subsitution-rate, --insertion-rate, and --deletion-rate should be the same. Abort.")

    error_rates = insert_rates .+ deletion_rates .+ subsitution_rates

    any(rate -> rate > 1, error_rates) &&
        error("ArgumentError: any dot sums of --subsitution-rate, --insertion-rate, and --deletion-rate should be less than one. Abort.")

    r1 = args["prefix"] * ".R1.fastq"
    r2 = args["prefix"] * ".R2.fastq"

    @info "read simulation: output files" r1 r2

    # try/finally guarantees both streams are closed on any failure.
    r1_io = open(r1, "w+")
    try
        r2_io = open(r2, "w+")
        try
            read_id = 0
            for insert_size in insert_sizes
                insert_size >= 0 || continue
                for (i_rate, error_rate) in enumerate(error_rates)
                    insert_rate = insert_rates[i_rate]
                    del_rate = deletion_rates[i_rate]
                    sub_rate = subsitution_rates[i_rate]

                    for rep in 1:repeat_times
                        read_id += 1
                        insert = simulate_insert(insert_size::Int64)
                        r1_seq, r1_true_insert_size, r1_nerror_insert, r1_nerror_adapter = simulate_read(insert, adapter1, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)
                        r2_seq, r2_true_insert_size, r2_nerror_insert, r2_nerror_adapter = simulate_read(reverse_complement(insert), adapter2, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)

                        r1_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r1_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r1_nerror_insert ERROR_ADAPTER=$r1_nerror_adapter SUB=$sub_rate INS=$insert_rate DEL=$del_rate"
                        r2_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r2_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r2_nerror_insert ERROR_ADAPTER=$r2_nerror_adapter SUB=$sub_rate INS=$insert_rate DEL=$del_rate"

                        writeseq(r1_io, r1_header, r1_seq, error_rate=error_rate)
                        writeseq(r2_io, r2_header, r2_seq, error_rate=error_rate)
                    end
                end
            end
        finally
            close(r2_io)
        end
    finally
        close(r1_io)
    end

    @info "read simulation: all done" elapsed=time() - time0
    return 0
end
251 | 


--------------------------------------------------------------------------------
/src/Benchmark/read_simulation_primer.jl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env julia
  2 | 
  3 | # using ArgParse
  4 | 
  5 | using BioSequences
  6 | 
  7 | function parsing_args_simulate(args; exit_after_help = true)
  8 |     settings = ArgParseSettings(exit_after_help = exit_after_help)
  9 | 
 10 |     add_arg_group!(settings, "output")
 11 |     @add_arg_table! settings begin
 12 |         "--prefix", "-o"
 13 |             help = "prefix of output fastq files"
 14 |             metavar = "PREF"
 15 |             default = "read_simulation"
 16 |     end
 17 | 
 18 |     add_arg_group!(settings, "simulation")
 19 |     @add_arg_table! settings begin
 20 |         "--repeat", "-x"
 21 |             help = "repeat times for each case"
 22 |             default = 30000
 23 |             arg_type = Int64
 24 |         "--primer1", "-a"
 25 |             help = "read 1 primer"
 26 |             metavar = "SEQ"
 27 |             default = "AHCGATGAAGAACRYAG"
 28 |         "--primer2", "-A"
 29 |             help = "read 2 primer"
 30 |             metavar = "SEQ"
 31 |             default = "CTTATTGATATGCTTAAGTTCAG"
 32 |         "--seq-length", "-s"
 33 |             help = "a given sequence length; simulated sequence length might be 1 base more than the value because of simulated phasing error"
 34 |             default = 100
 35 |             arg_type = Int64
 36 |         "--insert-size-range", "-i"
 37 |             help = "range of insert size"
 38 |             nargs = '+'
 39 |             arg_type = Int64
 40 |             default = [80:2:120;]
 41 |         "--subsitution-rate", "-S"
 42 |             help = "subsitution rate per base. it is random for each base. error type includs mismatch"
 43 |             nargs = '+'
 44 |             arg_type = Float64
 45 |             default = [0.001:0.001:0.005;]
 46 |         "--insertion-rate", "-I"
 47 |             help = "insertion rate; number of arg should be the same as --subsitution-rate"
 48 |             nargs = '+'
 49 |             arg_type = Float64
 50 |             default = [1.0e-5:1.0e-5:5.0e-5;]
 51 |         "--deletion-rate", "-D"
 52 |             help = "deletion rate; number of arg should be the same as --subsitution-rate"
 53 |             nargs = '+'
 54 |             arg_type = Float64
 55 |             default = [1.0e-5:1.0e-5:5.0e-5;]
 56 |     end
 57 |     return parse_args(args, settings)
 58 | end
 59 | 
 60 | 
 61 | @inline function simulate_insert(insert_size::Int64)
 62 |     randdnaseq(insert_size)
 63 | end
 64 | 
 65 | """
 66 |     simulate_error(base::DNA, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
 67 | 
 68 | Return `(base::DNA, iserror::Int64)`
 69 | """
 70 | @inline function simulate_error(base::DNA, sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
 71 |     bases = [DNA_A, DNA_T, DNA_C, DNA_G]
 72 | 
 73 |     randfloat = rand()
 74 |     if randfloat <= sub_rate
 75 |         ## subsitution
 76 |         idx_base = findfirst(x -> x == base, bases)
 77 |         idx_sub = rand(1:3)
 78 |         if idx_base == idx_sub
 79 |             return DNA_G, 1  # bases[4] == "G"
 80 |         else
 81 |             return bases[idx_sub], 1
 82 |         end
 83 |     else
 84 |         randfloat -= sub_rate
 85 |         if randfloat <= insert_rate
 86 |             ## insert
 87 |             res = base * rand(bases)
 88 |             return res, 1
 89 |         else
 90 |             randfloat -= insert_rate
 91 |             if randfloat <= del_rate
 92 |                 ## deletion
 93 |                 return DNA_Gap, 1
 94 |             end
 95 |             ## no error
 96 |             return base, 0
 97 |         end
 98 |     end
 99 | end
100 | 
"""
    simulate_read(insert::String, primer_head::LongDNA{4}, primer_tail::LongDNA{4}, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)

Simulate one read from `insert` and two primers, applying `simulate_error` base
by base and stopping once the read reaches `seq_length`.

Return `res, true_insert_size, nerror_insert, nerror_primer`

NOTE(review): the body does not yet match this signature. It never reads
`primer_head` or `primer_tail`, and it uses the names `primer` and
`nerror_primer`, which are neither parameters nor locals assigned in this
function. See the inline notes; the function needs finishing before use.
"""
@inline function simulate_read(insert::String, primer_head::LongDNA{4}, primer_tail::LongDNA{4}, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)
    # NOTE(review): this binds `res` to the type `LongDNA{4}` itself (there is
    # no `()`), not to an empty sequence - presumably `LongDNA{4}()` was meant.
    res = LongDNA{4}
    ninsert = length(insert)
    nerror_insert = 0
    # NOTE(review): the two counters below are initialised but never read or
    # updated anywhere in this function.
    nerror_primer_head = 0
    nerror_primer_tail = 0





    # simulate insert
    the_base = ""
    for i in 1:ninsert
        # NOTE(review): `insert` is a `String`, so `insert[i:i]` is a one-character
        # `String`, while the `simulate_error` method defined in this file takes
        # `base::DNA` - confirm which method is expected to be called here.
        the_base, iserror = simulate_error(insert[i:i], sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
        n_the_base = length(the_base)
        res *= the_base
        nerror_insert += iserror
        current_insert_size = length(res)
        # The read is full before the insert ended: return without any primer.
        if current_insert_size >= seq_length
            if n_the_base <= 1
                # NOTE(review): `nerror_primer` is not defined in this function.
                return res, current_insert_size, nerror_insert, nerror_primer
            elseif n_the_base == 2  # error with insert. Inserted part is not of real DNA fragment!
                return res[1:end-1], current_insert_size - 1, nerror_insert, nerror_primer
            else
                @error "Bugs at simulate_read()" the_base iserror length(res) _module=nothing _group=nothing _id=nothing _file=nothing
            end
        end
    end
    # An insertion on the last insert base is excluded from the true insert size.
    if length(the_base) == 2  # error with insert. Inserted part is not of real DNA fragment!
        true_insert_size = length(res) - 1
    else
        true_insert_size = length(res)
    end

    # NOTE(review): `primer` is not defined; the parameters are `primer_head`
    # and `primer_tail`. Which of them belongs here cannot be told from this code.
    nprimer = length(primer)
    for i in 1:nprimer
        the_base, iserror = simulate_error(primer[i:i], sub_rate::Float64, insert_rate::Float64, del_rate::Float64)
        res *= the_base
        # NOTE(review): `nerror_primer` is updated without ever being initialised.
        nerror_primer += iserror
        if length(res) >= seq_length
            return res, true_insert_size, nerror_insert, nerror_primer
        end
    end

    # Fill the rest of the read with random bases from `simulate_insert`.
    nrandom = seq_length - length(res)
    res *= simulate_insert(nrandom)

    return res, true_insert_size, nerror_insert, nerror_primer
end
156 | 
"""
    complement_char(c::Char)

Return the complement of an upper-case nucleotide character (`A`↔`T`,
`C`↔`G`). Any other character, including lower-case letters, gives `'N'`.
"""
@inline function complement_char(c::Char)
    return c == 'A' ? 'T' :
           c == 'T' ? 'A' :
           c == 'C' ? 'G' :
           c == 'G' ? 'C' :
           'N'
end
170 | 
"""
    reverse_complement(s::String)

Return the reverse complement of `s`: the characters of `s` in reverse order,
each replaced by `complement_char`. The empty string gives the empty string.

Characters are visited one at a time rather than by byte index, so a string
containing multi-byte characters is handled (each becomes whatever
`complement_char` returns for it) instead of raising an index error.
"""
@inline function reverse_complement(s::String)
    # `map` over a string returns a string; no intermediate vector or splat.
    return map(complement_char, reverse(s))
end
176 | 
"""
    writeseq(io::IO, header::String, seq::String; error_rate=0.0001)

Write one FASTQ record (header, sequence, `+`, quality) to `io`.

Every base gets the same quality character. It is `'J'` when
`error_rate < 0.0001`; otherwise it is the Phred+33 character for
`round(-10 * log10(error_rate))`. The Phred score is clamped at 0, so an
`error_rate` above 1 yields `'!'` rather than a character below the Phred+33
range.
"""
function writeseq(io::IO, header::String, seq::String; error_rate=0.0001)
    println(io, header)
    println(io, seq)
    println(io, "+")
    qual_char = if error_rate < 0.0001
        'J'
    else
        # Phred scores cannot be negative; without the clamp an error rate
        # above 1 would map to a code point below '!' (33).
        phred = max(round(Int, -10 * log10(error_rate)), 0)
        Char(phred + 33)
    end
    println(io, qual_char ^ length(seq))
end
188 | 
"""
    julia_wrapper_simulate(ARGS; exit_after_help = true)

Command-line entry point of the primer read simulator.

With no arguments the help text is requested from `parsing_args_simulate` and
`0` is returned. Otherwise, for every non-negative insert size, every error-rate
combination and every repeat, one read pair is simulated with `simulate_read`
and written with `writeseq` to `<prefix>.R1.fastq` and `<prefix>.R2.fastq`.
Negative insert sizes are skipped.

Read 1 is simulated from the insert with `primer1` and the reverse complement
of `primer2`; read 2 from the reverse-complemented insert with `primer2` and
the reverse complement of `primer1`.

The rate arguments are validated before any output file is created, and both
files are closed even if simulation or writing fails.

Throws an `ErrorException` when the three rate lists differ in length or when
any summed error rate exceeds 1. Returns `0`.
"""
function julia_wrapper_simulate(ARGS; exit_after_help = true)
    time0 = time()

    if length(ARGS) == 0
        parsing_args_simulate(["-h"], exit_after_help = exit_after_help)
        return 0
    end
    args = parsing_args_simulate(ARGS, exit_after_help = exit_after_help)
    args === nothing && return 0

    # `simulate_read` takes its primers as `LongDNA{4}`, while the parsed
    # options are plain strings, so convert once up front.
    primer1 = LongDNA{4}(args["primer1"])
    primer2 = LongDNA{4}(args["primer2"])
    # The `_rc` primers were used below without ever being defined. They are
    # taken to be the reverse complements of the primers (as the names say).
    # The call is qualified because this file defines its own String-only
    # `reverse_complement`.
    primer1_rc = BioSequences.reverse_complement(primer1)
    primer2_rc = BioSequences.reverse_complement(primer2)

    repeat_times = args["repeat"]
    seq_length = args["seq-length"]
    insert_sizes = args["insert-size-range"]

    insert_rates = args["insertion-rate"]
    deletion_rates = args["deletion-rate"]
    subsitution_rates = args["subsitution-rate"]

    # Validate first, so that bad arguments do not leave empty output files.
    length(insert_rates) == length(deletion_rates) == length(subsitution_rates) ||
        error("ArgumentError: the numbers of args of --subsitution-rate, --insertion-rate, and --deletion-rate should be the same. Abort.")

    error_rates = insert_rates .+ deletion_rates .+ subsitution_rates

    any(rate -> rate > 1, error_rates) &&
        error("ArgumentError: any dot sums of --subsitution-rate, --insertion-rate, and --deletion-rate should be less than one. Abort.")

    r1 = args["prefix"] * ".R1.fastq"
    r2 = args["prefix"] * ".R2.fastq"

    @info "read simulation: output files" r1 r2

    # try/finally guarantees both streams are closed on any failure.
    r1_io = open(r1, "w+")
    try
        r2_io = open(r2, "w+")
        try
            read_id = 0
            for insert_size in insert_sizes
                insert_size >= 0 || continue
                for (i_rate, error_rate) in enumerate(error_rates)
                    insert_rate = insert_rates[i_rate]
                    del_rate = deletion_rates[i_rate]
                    sub_rate = subsitution_rates[i_rate]

                    for rep in 1:repeat_times
                        read_id += 1
                        # `simulate_read` and the local `reverse_complement` both
                        # take the insert as a `String`.
                        insert = string(simulate_insert(insert_size::Int64))
                        r1_seq, r1_true_insert_size, r1_nerror_insert, r1_nerror_primer = simulate_read(insert, primer1, primer2_rc, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)
                        r2_seq, r2_true_insert_size, r2_nerror_insert, r2_nerror_primer = simulate_read(reverse_complement(insert), primer2, primer1_rc, sub_rate::Float64, insert_rate::Float64, del_rate::Float64, seq_length::Int64)

                        r1_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r1_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r1_nerror_insert ERROR_primer=$r1_nerror_primer SUB=$sub_rate INS=$insert_rate DEL=$del_rate"
                        r2_header = "@PeReadSimulator2:$read_id:$rep TRUE=$r2_true_insert_size INSERT_SIZE=$insert_size ERROR_RATE=$error_rate SEQ_LENGTH=$seq_length ERROR_INSERT=$r2_nerror_insert ERROR_primer=$r2_nerror_primer SUB=$sub_rate INS=$insert_rate DEL=$del_rate"

                        # `writeseq` takes the sequence as a `String`.
                        writeseq(r1_io, r1_header, string(r1_seq), error_rate=error_rate)
                        writeseq(r2_io, r2_header, string(r2_seq), error_rate=error_rate)
                    end
                end
            end
        finally
            close(r2_io)
        end
    finally
        close(r1_io)
    end

    @info "read simulation: all done" elapsed=time() - time0
    return 0
end
257 | 


--------------------------------------------------------------------------------