├── misc
    ├── logo.png
    └── logo.svg
├── run_script_from_s3.sh
├── .gitignore
├── .github
    └── workflows
    │   └── docker-image.yml
├── Cargo.toml
├── Dockerfile
├── src
    └── bin
    │   ├── utils
    │       ├── mod.rs
    │       ├── build_idx.rs
    │       ├── resolve.rs
    │       ├── build_sdb.rs
    │       ├── ovlp_ec.rs
    │       ├── shmmrutils.rs
    │       ├── layout.rs
    │       └── graph_analysis.rs
    │   ├── pg_build_sdb.rs
    │   ├── pg_graph.rs
    │   ├── pg_dp_graph.rs
    │   ├── pg_layout.rs
    │   ├── pg_resolve.rs
    │   ├── pg_dedup.rs
    │   ├── pg_build_idx.rs
    │   ├── pg_ovlp.rs
    │   ├── pg_ovlp_ec.rs
    │   ├── pg_getreads.rs
    │   └── pg_asm.rs
├── README.md
└── LICENSE


/misc/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschin/peregrine-2021/HEAD/misc/logo.png


--------------------------------------------------------------------------------
/run_script_from_s3.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -ex
3 | echo $1
4 | aws s3 cp $1/run.sh run.sh
5 | bash run.sh
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | /target/
 4 | /target
 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 7 | Cargo.lock
 8 | 
 9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Image CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 | 
11 |   build:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Build the Docker image
18 |       run: docker build . --file Dockerfile --tag cschin/pgr-2021:$(date +%s)
19 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "peregrine-r"
 3 | version = "0.4.13"
 4 | authors = ["Jason Chin <cschin@omnibio.ai>"]
 5 | edition = "2018"
 6 | build = "build.rs"
 7 | 
 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 9 | [dependencies]
10 | glob = "0.3.0"
11 | serde = { version = "1", features = ["derive"] }
12 | petgraph = { git = "https://github.com/cschin/petgraph.git", branch = "fx_index_map" } # base on petgraph = "0.5.1"
13 | libc = "0.2"
14 | memmap = "0.7.0"
15 | structview = "1.1.0"
16 | byteorder = "1.3.4"
17 | rand = "0.7.3"
18 | rustc-hash = "1.1.0"
19 | # flate2 = { version ="1.0", features = ["zlib"] }
20 | flate2 = "1.0"
21 | clap = "2"
22 | threadpool = "1.8.1"
23 | num_cpus = "1.13.0"
24 | log = { version = "^0.4.5", features = ["std"] }
25 | simple_logger = "2.1.0"
26 | sysinfo = "0.23.5"
27 | mimalloc = { version = "0.1.17", default-features = false }
28 | rayon = "1.5.0"
29 | intervaltree = "0.2.6"
30 | lazy_static = "1.4.0"
31 | regex = "1"
32 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | #ref: https://alexbrand.dev/post/how-to-package-rust-applications-into-minimal-docker-containers/
 2 | FROM rust:1.58.1 AS build
 3 | WORKDIR /usr/src
 4 | 
 5 | # since we use the mimalloc crate, it does not work well with musl.
 6 | #RUN rustup target add x86_64-unknown-linux-musl
 7 | 
 8 | # Create a dummy project and build the app's dependencies.
 9 | # If the Cargo.toml or Cargo.lock files have not changed,
10 | # we can use the docker build cache and skip these (typically slow) steps.
11 | RUN USER=root cargo new peregrine-r 
12 | WORKDIR /usr/src/peregrine-r
13 | #COPY Cargo.toml Cargo.lock build.rs ./
14 | COPY Cargo.toml build.rs ./
15 | COPY .git ./.git
16 | RUN apt-get update && apt-get install -y build-essential cmake 
17 | RUN cargo build --release
18 | 
19 | # Copy the source and build the application.
20 | COPY src ./src
21 | #RUN cargo install --target x86_64-unknown-linux-musl --path .
22 | RUN cargo install --path .
23 | 
24 | # Copy the statically-linked binary into a scratch container.
25 | #FROM scratch
26 | #COPY --from=build /usr/local/cargo/bin/pg_* ./
27 | #USER 1000
28 | CMD ["./pg_asm"]
29 | 
30 | FROM ubuntu:20.04
31 | COPY --from=build /usr/local/cargo/bin/pg_* /usr/local/bin/
32 | RUN apt-get update
33 | RUN DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
34 | RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y parallel pigz awscli
35 | RUN apt-get install -y samtools exonerate jq wget
36 | COPY run_script_from_s3.sh /usr/local/bin
37 | RUN chmod u+x /usr/local/bin/run_script_from_s3.sh
38 | CMD ["pg_asm"]
39 | 


--------------------------------------------------------------------------------
/src/bin/utils/mod.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | pub mod build_idx;
11 | pub mod build_sdb;
12 | pub mod dp_graph;
13 | pub mod graph;
14 | pub mod graph_analysis;
15 | pub mod layout;
16 | pub mod ovlp;
17 | pub mod ovlp_ec;
18 | pub mod resolve;
19 | pub mod seqmap;
20 | pub mod shmmrutils;
21 | pub use core::mem::MaybeUninit;
22 | #[cfg(target_os = "linux")]
23 | pub use libc::RUSAGE_THREAD;
24 | pub use libc::{getrusage, rusage, RUSAGE_SELF};
25 | 
26 | #[derive(Copy, Clone)]
27 | pub struct Parameters {
28 |     pub nthreads: u32,
29 |     pub nchunks: u32,
30 |     pub k: u32,
31 |     pub w: u32,
32 |     pub r: u32,
33 |     pub tol: f64,
34 |     pub min_ec_cov: u16,
35 | }
36 | 
37 | #[allow(dead_code)]
38 | pub fn log_resource(msg: &str, data: &mut rusage) -> (u64, u64, u64) {
39 |     let _res = unsafe { getrusage(RUSAGE_SELF, data) };
40 |     log::info!(
41 |         "{} : (maxRSS, utime, stime): {} {} {}",
42 |         msg,
43 |         data.ru_maxrss,
44 |         data.ru_utime.tv_sec,
45 |         data.ru_stime.tv_sec
46 |     );
47 | 
48 |     (
49 |         data.ru_maxrss as u64,
50 |         data.ru_utime.tv_sec as u64,
51 |         data.ru_stime.tv_sec as u64,
52 |     )
53 | }
54 | 


--------------------------------------------------------------------------------
/src/bin/pg_build_sdb.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::build_sdb::build;
20 | fn main() -> () {
21 |     let matches = clap_app!(pg_build_sdb =>
22 |         (version: VERSION_STRING)
23 |         (author: "Jason Chin <jason@omnibio.ai>")
24 |         (about: "
25 | Peregrine-2021 genome assembler 
26 | pb_build_sdb: build the sequence database
27 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
28 |         (@arg input: --input +required +takes_value "Path to a file that contains the list of reads in .fa .fa.gz .fastq or fastq.gz formats")
29 |         (@arg out_prefix: --out_prefix +required +takes_value "The prefix for the sequence database and index files")
30 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
31 |     ).get_matches();
32 | 
33 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
34 |         "DEBUG" => log::LevelFilter::Debug,
35 |         _ => log::LevelFilter::Info,
36 |     };
37 | 
38 |     SimpleLogger::new()
39 |         .with_level(log_level)
40 |         .with_utc_timestamps()
41 |         .init()
42 |         .unwrap();
43 | 
44 |     let seq_list_file = matches.value_of("input").unwrap().to_string();
45 |     let out_prefix = matches.value_of("out_prefix").unwrap().to_string();
46 |     let _nbases = build(&seq_list_file, &out_prefix);
47 | }
48 | 


--------------------------------------------------------------------------------
/src/bin/pg_graph.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::graph::ovlp2layout_v1;
20 | fn main() -> () {
21 |     let matches = clap_app!(pg_graph =>
22 |         (version: VERSION_STRING)
23 |         (author: "Jason Chin <jason@omnibio.ai>")
24 |         (about: "
25 | Peregrine-2021 genome assembler, 
26 | pg_graph: (obsoleted) convert the overlap information between the reads into an assembly group
27 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
28 |         (@arg prefix: --prefix +required +takes_value "Path prefix for input files")
29 |         (@arg out_prefix: --out_prefix +required +takes_value "Path prefix for output files ")
30 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
31 |     )
32 |     .get_matches();
33 | 
34 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
35 |         "DEBUG" => log::LevelFilter::Debug,
36 |         _ => log::LevelFilter::Info,
37 |     };
38 | 
39 |     SimpleLogger::new()
40 |         .with_level(log_level)
41 |         .with_utc_timestamps()
42 |         .init()
43 |         .unwrap();
44 | 
45 |     let prefix = matches.value_of("prefix").unwrap().to_string();
46 |     let out_prefix = matches.value_of("out_prefix").unwrap().to_string();
47 | 
48 |     let _err = log::info!("graph:out_prefix: {}", out_prefix,);
49 |     ovlp2layout_v1(&prefix, &out_prefix, 6);
50 | }
51 | 


--------------------------------------------------------------------------------
/src/bin/pg_dp_graph.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::dp_graph::ovlp2layout_v2;
20 | fn main() -> Result<(), std::io::Error> {
21 |     let matches = clap_app!(pg_graph =>
22 |             (version: VERSION_STRING)
23 |             (author: "Jason Chin <jason@omnibio.ai>")
24 |             (about: "
25 | Peregrine-2021 genome assembler, 
26 | pg_dp_graph: take overlap data file as input to generate the layout file using a polyploid aware layout algorithm
27 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
28 |             (@arg prefix: --prefix +required +takes_value "Path prefix for input files")
29 |             (@arg out_prefix: --out_prefix +required +takes_value "ath prefix for output files ")
30 |             (@arg bestn: --bestn -b +takes_value "number of best overlaps for initial graph [default: 6]")
31 |             (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
32 |         ).get_matches();
33 | 
34 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
35 |         "DEBUG" => log::LevelFilter::Debug,
36 |         _ => log::LevelFilter::Info,
37 |     };
38 | 
39 |     SimpleLogger::new()
40 |         .with_level(log_level)
41 |         .with_utc_timestamps()
42 |         .init()
43 |         .unwrap();
44 | 
45 |     let prefix = matches.value_of("prefix").unwrap().to_string();
46 |     let out_prefix = matches.value_of("out_prefix").unwrap().to_string();
47 |     let bestn = matches
48 |         .value_of("bestn")
49 |         .unwrap_or("6")
50 |         .parse::<usize>()
51 |         .unwrap();
52 | 
53 |     let _err = log::info!("graph:out_prefix: {}", out_prefix,);
54 |     ovlp2layout_v2(&prefix, &out_prefix, bestn)?;
55 |     Ok(())
56 | }
57 | 


--------------------------------------------------------------------------------
/src/bin/pg_layout.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::layout::layout2ctg;
20 | 
21 | fn main() -> Result<(), std::io::Error> {
22 |     let matches = clap_app!(pg_layout =>
23 |         (version: VERSION_STRING)
24 |         (author: "Jason Chin <jason@omnibio.ai>")
25 |         (about: "
26 | Peregrine-2021 genome assembler, 
27 | pg_layout: convert the assembly graph to paths and generate the contig fasta file
28 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
29 |         (@arg SEQDB:+required "Path to the seqdb file ")
30 |         (@arg SEQIDX:+required "Path to the seqdb index file")
31 |         (@arg layout_file: --layout_file +required +takes_value "Path to the layout file")
32 |         (@arg output_file: --output +required +takes_value "Path to the output file")
33 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
34 |     )
35 |     .get_matches();
36 | 
37 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
38 |         "DEBUG" => log::LevelFilter::Debug,
39 |         _ => log::LevelFilter::Info,
40 |     };
41 | 
42 |     SimpleLogger::new()
43 |         .with_level(log_level)
44 |         .with_utc_timestamps()
45 |         .init()
46 |         .unwrap();
47 | 
48 |     let seqdb_file = matches.value_of("SEQDB").unwrap().to_string();
49 |     let index_file = matches.value_of("SEQIDX").unwrap().to_string();
50 |     let layout_file = matches.value_of("layout_file").unwrap().to_string();
51 |     let output_file = matches.value_of("output_file").unwrap().to_string();
52 | 
53 |     let _res = log::info!("layout:seq_db_file: {}", seqdb_file);
54 |     let _res = log::info!("layout:index_file: {}", index_file);
55 |     let _res = log::info!("layout:layout_file: {}", layout_file);
56 |     let _res = log::info!("layout:output: {}", output_file);
57 | 
58 |     layout2ctg(&seqdb_file, &index_file, &layout_file, &output_file)?;
59 |     Ok(())
60 | }
61 | 


--------------------------------------------------------------------------------
/src/bin/pg_resolve.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::resolve::resolve_ht;
20 | 
21 | fn main() -> Result<(), std::io::Error> {
22 |     let matches = clap_app!(pg_resolve =>
23 |         (version: VERSION_STRING)
24 |         (author: "Jason Chin <jason@omnibio.ai>")
25 |         (about: "
26 | Peregrine-2021 genome assembler, 
27 | pg_resolve: this tool aligns all contigs to themselve to identify haplotype-related contigs
28 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
29 |         (@arg fasta_file: -f --fasta_file +required +takes_value "Path to the layout file")
30 |         (@arg output_prefix: -o --out_prefix +required +takes_value "Path to the output prefix")
31 |         (@arg w: -w +takes_value "Window size [default: 80]")
32 |         (@arg k: -k +takes_value "Kmer size [default: 56]")
33 |         (@arg r: -r +takes_value "Reduction factor [default: 6]")
34 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
35 |     )
36 |     .get_matches();
37 | 
38 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
39 |         "DEBUG" => log::LevelFilter::Debug,
40 |         _ => log::LevelFilter::Info,
41 |     };
42 | 
43 |     SimpleLogger::new()
44 |         .with_level(log_level)
45 |         .with_utc_timestamps()
46 |         .init()
47 |         .unwrap();
48 | 
49 |     let fasta_file = matches.value_of("fasta_file").unwrap().to_string();
50 |     let output_prefix = matches.value_of("output_prefix").unwrap().to_string();
51 |     let wsize = matches
52 |         .value_of("w")
53 |         .unwrap_or("80")
54 |         .parse::<u32>()
55 |         .unwrap();
56 | 
57 |     let ksize = matches
58 |         .value_of("k")
59 |         .unwrap_or("56")
60 |         .parse::<u32>()
61 |         .unwrap();
62 | 
63 |     let rfactor = matches.value_of("r").unwrap_or("6").parse::<u32>().unwrap();
64 | 
65 |     resolve_ht(&fasta_file, &output_prefix, wsize, ksize, rfactor)?;
66 |     Ok(())
67 | }
68 | 


--------------------------------------------------------------------------------
/src/bin/pg_dedup.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | use simple_logger::SimpleLogger;
18 | 
19 | mod utils;
20 | use utils::seqmap;
21 | 
22 | fn main() -> Result<(), std::io::Error> {
23 |     let matches = clap_app!(pg_resolve =>
24 |         (version: VERSION_STRING)
25 |         (author: "Jason Chin <jason@omnibio.ai>")
26 |         (about: "
27 | Peregrine-2021 genome assembler, 
28 | pg_dedup: perform all contigs to all contigs alignment to remove duplicates 
29 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
30 |         (@arg ref_fasta: -f --ref_fasta +required +takes_value "Path to the reference file")
31 |         (@arg target_fasta: -t --target_fasta +required +takes_value "Path to the target file")
32 |         (@arg output: -o --output +required +takes_value "Path to the output filename")
33 |         (@arg w: -w +takes_value "Window size [default: 48]")
34 |         (@arg k: -k +takes_value "Kmer size [default: 56]")
35 |         (@arg r: -r +takes_value "Reduction factor [default: 4]")
36 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
37 |     )
38 |     .get_matches();
39 | 
40 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
41 |         "DEBUG" => log::LevelFilter::Debug,
42 |         _ => log::LevelFilter::Info,
43 |     };
44 | 
45 |     SimpleLogger::new()
46 |         .with_level(log_level)
47 |         .with_utc_timestamps()
48 |         .init()
49 |         .unwrap();
50 | 
51 |     let ref_fasta_file = matches.value_of("ref_fasta").unwrap().to_string();
52 |     let target_fasta_file = matches.value_of("target_fasta").unwrap().to_string();
53 |     let output_file = matches.value_of("output").unwrap().to_string();
54 |     let wsize = matches
55 |         .value_of("w")
56 |         .unwrap_or("48")
57 |         .parse::<u32>()
58 |         .unwrap();
59 | 
60 |     let ksize = matches
61 |         .value_of("k")
62 |         .unwrap_or("56")
63 |         .parse::<u32>()
64 |         .unwrap();
65 | 
66 |     let rfactor = matches.value_of("r").unwrap_or("4").parse::<u32>().unwrap();
67 |     seqmap::dedup_target_seqs(
68 |         &ref_fasta_file,
69 |         &target_fasta_file,
70 |         &output_file,
71 |         wsize,
72 |         ksize,
73 |         rfactor,
74 |     )?;
75 |     Ok(())
76 | }
77 | 


--------------------------------------------------------------------------------
/src/bin/pg_build_idx.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::build_idx::build;
20 | use utils::Parameters;
21 | fn main() -> () {
22 |     let matches = clap_app!(pg_build_idx =>
23 |         (version: VERSION_STRING)
24 |         (author: "Jason Chin <jason@omnibio.ai>")
25 |         (about: "
26 | Peregrine-2021 genome assembler
27 | build the SHIMMER index from the reads for overlapping
28 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")            
29 |         (@arg SEQDB:+required "Path to the seqdb file ")
30 |         (@arg SEQIDX:+required "Path to the seqdb index file")
31 |         (@arg SHMRINDEXPREFIX: +required "The prefix to the output shimmer index database")
32 |         (@arg NTHREADS: +required "Number of threads")
33 |         (@arg NCHUNKS: +required "Number of partition")
34 |         (@arg w: -w +takes_value "Window size [default: 80]")
35 |         (@arg k: -k +takes_value "Kmer size [default: 56]")
36 |         (@arg r: -r +takes_value "Reduction factor [default: 6]")
37 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
38 |     )
39 |     .get_matches();
40 | 
41 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
42 |         "DEBUG" => log::LevelFilter::Debug,
43 |         _ => log::LevelFilter::Info,
44 |     };
45 | 
46 |     SimpleLogger::new()
47 |         .with_level(log_level)
48 |         .with_utc_timestamps()
49 |         .init()
50 |         .unwrap();
51 | 
52 |     let seqdb_file = matches.value_of("SEQDB").unwrap().to_string();
53 |     let index_file = matches.value_of("SEQIDX").unwrap().to_string();
54 |     let out_prefix = matches.value_of("SHMRINDEXPREFIX").unwrap().to_string();
55 |     let nthreads = matches
56 |         .value_of("NTHREADS")
57 |         .unwrap()
58 |         .parse::<u32>()
59 |         .unwrap();
60 |     let nchunks = matches.value_of("NCHUNKS").unwrap().parse::<u32>().unwrap();
61 |     let wsize = matches.value_of("w").unwrap_or("80").parse::<u32>().unwrap();
62 |     let ksize = matches.value_of("k").unwrap_or("56").parse::<u32>().unwrap();
63 |     let rfactor = matches.value_of("r").unwrap_or("6").parse::<u32>().unwrap();
64 | 
65 |     let parameters = Parameters {
66 |         nchunks: nchunks,
67 |         nthreads: nthreads,
68 |         w: wsize,
69 |         k: ksize,
70 |         r: rfactor,
71 |         tol: 0.0, //not used
72 |         min_ec_cov: 1,
73 |     };
74 | 
75 |     build(&seqdb_file, &index_file, &out_prefix, &parameters);
76 | }
77 | 


--------------------------------------------------------------------------------
/src/bin/pg_ovlp.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::ovlp::ovlp;
20 | use utils::Parameters;
21 | fn main() -> Result<(), std::io::Error> {
22 |     let matches = clap_app!(pg_ovlp =>
23 |         (version: VERSION_STRING)
24 |         (author: "Jason Chin <jason@omnibio.ai>")
25 |         (@arg SEQDB:+required "Path to the seqdb file ")
26 |         (about: "
27 | Peregrine-2021 genome assembler, 
28 | pg_ovlp: generate haplotype specific overlaps between the reads
29 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
30 |         (@arg SEQIDX:+required "Path to the seqdb index file")
31 |         (@arg SHMRINDEXPREFIX: +required "The prefix to the output shimmer index database")
32 |         (@arg OUTPREFIX: +required "The prefix of the output ovelap files")
33 |         (@arg NTHREADS: +required "Number of threads ")
34 |         (@arg NCHUNKS: +required "Number of partition")
35 |         (@arg tol: -t --tol +takes_value "Alignment tolerance [default: 0.01]")
36 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
37 |     )
38 |     .get_matches();
39 | 
40 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
41 |         "DEBUG" => log::LevelFilter::Debug,
42 |         _ => log::LevelFilter::Info,
43 |     };
44 | 
45 |     SimpleLogger::new()
46 |         .with_level(log_level)
47 |         .with_utc_timestamps()
48 |         .init()
49 |         .unwrap();
50 | 
51 |     let seqdb_file = matches.value_of("SEQDB").unwrap().to_string();
52 |     let index_file = matches.value_of("SEQIDX").unwrap().to_string();
53 |     let shimmer_index_file_prefix = matches.value_of("SHMRINDEXPREFIX").unwrap().to_string();
54 |     let out_prefix = matches.value_of("OUTPREFIX").unwrap().to_string();
55 |     let nthreads = matches
56 |         .value_of("NTHREADS")
57 |         .unwrap()
58 |         .parse::<u32>()
59 |         .unwrap();
60 |     let nchunks = matches.value_of("NCHUNKS").unwrap().parse::<u32>().unwrap();
61 |     let tol = matches
62 |         .value_of("tol")
63 |         .unwrap_or("0.01")
64 |         .parse::<f64>()
65 |         .unwrap();
66 | 
67 |     let parameters = Parameters {
68 |         nchunks: nchunks,
69 |         nthreads: nthreads,
70 |         w: 0,
71 |         k: 0,
72 |         r: 0,
73 |         tol: tol,
74 |         min_ec_cov: 1,
75 |     };
76 | 
77 |     ovlp(
78 |         &seqdb_file,
79 |         &index_file,
80 |         &shimmer_index_file_prefix,
81 |         &out_prefix,
82 |         &parameters,
83 |     )?;
84 |     Ok(())
85 | }
86 | 


--------------------------------------------------------------------------------
/src/bin/pg_ovlp_ec.rs:
--------------------------------------------------------------------------------
 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
 3 | //
 4 | // This Source Code Form is subject to the terms of the
 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
 6 | //
 7 | // You should have received a copy of the license along with this
 8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
 9 | 
10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
11 | 
12 | #[global_allocator]
13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
15 | 
16 | use clap::clap_app;
17 | mod utils;
18 | use simple_logger::SimpleLogger;
19 | use utils::ovlp_ec::ovlp_ec;
20 | use utils::Parameters;
21 | 
22 | fn main() -> Result<(), std::io::Error> {
23 |     let matches = clap_app!(pg_ovlp_ec =>
24 |         (version: VERSION_STRING)
25 |         (author: "Jason Chin <jason@omnibio.ai>")
26 |         (about: "
27 | Peregrine-2021 genome assembler, 
28 | pg_ovlp_ec: perform error correction from the haplotype specific overlaps
29 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
30 |         (@arg SEQDB:+required "Path to the seqdb file ")
31 |         (@arg SEQIDX:+required "Path to the seqdb index file")
32 |         (@arg prefix: +required "The prefix to the output shimmer index database")
33 |         (@arg out_prefix: +required "The prefix of the output ovelap files")
34 |         (@arg NTHREADS: +required "Number of threads ")
35 |         (@arg NCHUNKS: +required "Number of partition")
36 |         (@arg tol: -t --tol +takes_value "Alignment tolerance [default: 0.01]")
37 |         (@arg min_ec_cov: -c --min_ec_cov +takes_value "Minimum error coverage [default: 1]")
38 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
39 |     )
40 |     .get_matches();
41 | 
42 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
43 |         "DEBUG" => log::LevelFilter::Debug,
44 |         _ => log::LevelFilter::Info,
45 |     };
46 | 
47 |     SimpleLogger::new()
48 |         .with_level(log_level)
49 |         .with_utc_timestamps()
50 |         .init()
51 |         .unwrap();
52 | 
53 |     let seqdb_file = matches.value_of("SEQDB").unwrap().to_string();
54 |     let index_file = matches.value_of("SEQIDX").unwrap().to_string();
55 |     let prefix = matches.value_of("prefix").unwrap().to_string();
56 |     let out_prefix = matches.value_of("out_prefix").unwrap().to_string();
57 |     let nthreads = matches
58 |         .value_of("NTHREADS")
59 |         .unwrap()
60 |         .parse::<u32>()
61 |         .unwrap();
62 |     let nchunks = matches.value_of("NCHUNKS").unwrap().parse::<u32>().unwrap();
63 |     let tol = matches
64 |         .value_of("tol")
65 |         .unwrap_or("0.01")
66 |         .parse::<f64>()
67 |         .unwrap();
68 |     let min_ec_cov = matches
69 |         .value_of("min_ec_cov")
70 |         .unwrap_or("1")
71 |         .parse::<u16>()
72 |         .unwrap();
73 | 
74 |     let parameters = Parameters {
75 |         nchunks: nchunks,
76 |         nthreads: nthreads,
77 |         w: 0,
78 |         k: 0,
79 |         r: 0,
80 |         tol: tol,
81 |         min_ec_cov: min_ec_cov,
82 |     };
83 | 
84 |     ovlp_ec(&seqdb_file, &index_file, &prefix, &out_prefix, &parameters)?;
85 |     Ok(())
86 | }
87 | 


--------------------------------------------------------------------------------
/src/bin/pg_getreads.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
 11 | 
 12 | #[global_allocator]
 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
 15 | 
 16 | use clap::clap_app;
 17 | use memmap::MmapOptions;
 18 | use simple_logger::SimpleLogger;
 19 | use std::fs::File;
 20 | use std::io::{self, BufRead, Write};
 21 | use std::path::Path;
 22 | mod utils;
 23 | use utils::shmmrutils::{get_seq_fragment, ReadLocation};
 24 | 
 25 | fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
 26 | where
 27 |     P: AsRef<Path>,
 28 | {
 29 |     let file = File::open(filename)?;
 30 |     Ok(io::BufReader::new(file).lines())
 31 | }
 32 | 
 33 | fn main() -> () {
 34 |     let matches = clap_app!(pg_getreads =>
 35 |         (version: VERSION_STRING)
 36 |         (author: "Jason Chin <jason@omnibio.ai>")
 37 |         (about: "
 38 | Peregrine-2021 genome assembler, 
 39 | pg_getreads: generate fasta file for a subset of reads from the sequence database
 40 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")    
 41 |         (@arg SEQDB: +required "Path to the seqdb file ")
 42 |         (@arg SEQIDX: +required "Path to the seqdb index file")
 43 |         (@arg READID: +required "Path to the read id file")
 44 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
 45 |     )
 46 |     .get_matches();
 47 | 
 48 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
 49 |         "DEBUG" => log::LevelFilter::Debug,
 50 |         _ => log::LevelFilter::Info,
 51 |     };
 52 | 
 53 |     SimpleLogger::new()
 54 |         .with_level(log_level)
 55 |         .with_utc_timestamps()
 56 |         .init()
 57 |         .unwrap();
 58 | 
 59 |     let seqdb_file = matches.value_of("SEQDB").unwrap().to_string();
 60 |     let index_file = matches.value_of("SEQIDX").unwrap().to_string();
 61 |     let read_id_file = matches.value_of("READID").unwrap().to_string();
 62 | 
 63 |     let _res = writeln!(io::stderr(), "seq_db_file: {}", seqdb_file);
 64 |     let _res = writeln!(io::stderr(), "index_file: {}", index_file);
 65 |     let _res = writeln!(io::stderr(), "read_id_file: {}", read_id_file);
 66 | 
 67 |     let mut read_index = Vec::<ReadLocation>::new();
 68 |     let mut read_name = Vec::<String>::new();
 69 | 
 70 |     if let Ok(lines) = read_lines(index_file) {
 71 |         for line in lines {
 72 |             if let Ok(rec) = line {
 73 |                 //let rec_trimmed = rec.trim_end();
 74 |                 // the record line looks like 000000023 m64062_190803_042216/144/ccs 20359 467415
 75 |                 let v: Vec<&str> = rec.split_whitespace().collect();
 76 |                 //let rid: u32 = v[0].parse().unwrap();
 77 |                 let start: usize = v[3].parse().unwrap();
 78 |                 let len: usize = v[2].parse().unwrap();
 79 |                 read_index.push(ReadLocation {
 80 |                     start: start,
 81 |                     len: len,
 82 |                 });
 83 |                 read_name.push(v[1].to_string());
 84 |                 //println!("{} {} {}", rid, start, len);
 85 |             }
 86 |         }
 87 |     }
 88 | 
 89 |     let file = File::open(seqdb_file).unwrap();
 90 |     let mmap = unsafe { MmapOptions::new().map(&file).unwrap() };
 91 |     let stdout = io::stdout();
 92 |     let mut handle = stdout.lock();
 93 |     if let Ok(lines) = read_lines(read_id_file) {
 94 |         for line in lines {
 95 |             if let Ok(rec) = line {
 96 |                 let rec_trimmed = rec.trim_end();
 97 |                 let rid0 = rec_trimmed.parse::<u32>().unwrap();
 98 |                 let rloc = read_index[rid0 as usize];
 99 |                 let len = rloc.len as u32;
100 |                 let seq_frag = get_seq_fragment(rid0, 0, 0, len, &mmap, &read_index);
101 |                 let _ = writeln!(handle, ">{} {:09}", read_name[rid0 as usize], rid0);
102 |                 let _ = writeln!(handle, "{}", String::from_utf8_lossy(&seq_frag));
103 |             }
104 |         }
105 |     }
106 | }
107 | 


--------------------------------------------------------------------------------
/src/bin/utils/build_idx.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | #![allow(dead_code)]
 11 | 
 12 | use super::shmmrutils::sequence_to_shmmrs;
 13 | use super::shmmrutils::{get_2bit_fragment, ReadLocation};
 14 | use super::Parameters;
 15 | use byteorder::{LittleEndian, WriteBytesExt};
 16 | use memmap::{Mmap, MmapOptions};
 17 | use std::fs::File;
 18 | use std::io::{self, BufRead, BufWriter, Write};
 19 | use std::mem::size_of;
 20 | use std::path::Path;
 21 | use threadpool::ThreadPool;
 22 | 
 23 | fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
 24 | where
 25 |     P: AsRef<Path>,
 26 | {
 27 |     let file = File::open(filename)?;
 28 |     Ok(io::BufReader::new(file).lines())
 29 | }
 30 | 
 31 | fn index_chunk(
 32 |     chunk: u32,
 33 |     total_chunk: u32,
 34 |     readsdb: &Mmap,
 35 |     read_index: &Vec<ReadLocation>,
 36 |     prefix: &String,
 37 |     wsize: u32,
 38 |     ksize: u32,
 39 |     rfactor: u32,
 40 | ) -> Result<(), io::Error> {
 41 |     // Create index for a chunk from the read database
 42 | 
 43 |     let filename = format!("{}-{:03}-of-{:03}.dat", prefix, chunk, total_chunk);
 44 |     let mut out_f = BufWriter::new(File::create(filename).unwrap());
 45 | 
 46 |     let mut wrt = Vec::<u8>::with_capacity(1 << 16);
 47 |     for seq_id in 0..read_index.len() {
 48 |         if seq_id % (total_chunk as usize) != (chunk % total_chunk) as usize {
 49 |             continue;
 50 |         }
 51 |         let len = read_index[seq_id].len as u32;
 52 |         log::debug!("build_idx: len: {} {}", seq_id, len);
 53 |         let seq = get_2bit_fragment(seq_id as u32, 0, 0, len, &readsdb, read_index);
 54 |         let shmmrs = sequence_to_shmmrs(seq_id as u32, &seq, wsize, ksize, rfactor);
 55 |         for m in shmmrs {
 56 |             wrt.write_u64::<LittleEndian>(m.x)?;
 57 |             wrt.write_u64::<LittleEndian>(m.y)?;
 58 |         }
 59 |     }
 60 |     let us = size_of::<usize>();
 61 |     assert!(us == 8 as usize); //make sure the usize is a 64bit int.
 62 |     out_f.write_u64::<LittleEndian>((wrt.len() >> 4) as u64)?;
 63 | 
 64 |     // Not sure if it a bug in BufferWriter, it can not write more than 2Gb at once
 65 |     // we chop up the data and write in small chunks
 66 |     let c = 24_usize;
 67 |     for i in 0..=wrt.len() >> c {
 68 |         if ((i + 1) << c) < wrt.len() {
 69 |             let s = i << c;
 70 |             let e = (i + 1) << c;
 71 |             out_f.write(&wrt[s..e])?;
 72 |         } else {
 73 |             let s = i << c;
 74 |             let e = wrt.len();
 75 |             out_f.write(&wrt[s..e])?;
 76 |         }
 77 |     }
 78 |     out_f.flush()?;
 79 |     Ok(())
 80 | }
 81 | 
 82 | pub fn build(
 83 |     seqdb_file: &String,
 84 |     index_file: &String,
 85 |     out_prefix: &String,
 86 |     parameters: &Parameters,
 87 | ) -> () {
 88 |     // Using thread pool to build the SHIMMER index in paralle
 89 | 
 90 |     let mut read_index = Vec::<ReadLocation>::new();
 91 | 
 92 |     if let Ok(lines) = read_lines(index_file) {
 93 |         for line in lines {
 94 |             if let Ok(rec) = line {
 95 |                 let v: Vec<&str> = rec.split_whitespace().collect();
 96 |                 let start: usize = v[3].parse().unwrap();
 97 |                 let len: usize = v[2].parse().unwrap();
 98 |                 read_index.push(ReadLocation {
 99 |                     start: start,
100 |                     len: len,
101 |                 });
102 |             }
103 |         }
104 |     }
105 |     let mmap_seqdb = File::open(seqdb_file).unwrap();
106 |     let mmap_seqdb = unsafe { MmapOptions::new().map(&mmap_seqdb).unwrap() };
107 | 
108 |     let read_index = std::sync::Arc::new(read_index);
109 |     let mmap_seqdb = std::sync::Arc::new(mmap_seqdb);
110 | 
111 |     let pool = ThreadPool::new(parameters.nthreads as usize);
112 | 
113 |     let _nchunks = parameters.nchunks;
114 |     for i in 0.._nchunks {
115 |         let mmap_seqdb = mmap_seqdb.clone();
116 |         let read_index = read_index.clone();
117 |         let out_prefix = out_prefix.clone();
118 |         let parameters = (*parameters).clone();
119 |         pool.execute(move || {
120 |             let r = index_chunk(
121 |                 i + 1,
122 |                 _nchunks,
123 |                 &mmap_seqdb,
124 |                 &read_index,
125 |                 &out_prefix,
126 |                 parameters.w,
127 |                 parameters.k,
128 |                 parameters.r,
129 |             );
130 |             match r {
131 |                 Err(error) => panic!("build index fail: {}", error),
132 |                 Ok(()) => (),
133 |             };
134 |         });
135 |     }
136 |     pool.join();
137 | }
138 | 


--------------------------------------------------------------------------------
/src/bin/utils/resolve.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | #![allow(dead_code)]
 11 | //
 12 | // for resolve contigs that are highly similar to each others which are most
 13 | // likily be homologuous pairs in a diploid genome
 14 | //
 15 | use super::build_sdb::FastxReader;
 16 | use super::layout::log_asm_summary;
 17 | use super::shmmrutils::{sequence_to_shmmrs, MM128};
 18 | use std::fs::File;
 19 | use std::io::{self, BufRead, BufReader, Write};
 20 | use std::path::Path;
 21 | 
 22 | use rustc_hash::FxHashMap;
 23 | 
 24 | fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
 25 | where
 26 |     P: AsRef<Path>,
 27 | {
 28 |     let file = File::open(filename)?;
 29 |     Ok(io::BufReader::new(file).lines())
 30 | }
 31 | 
 32 | fn base2twobit(s: &Vec<u8>) -> Vec<u8> {
 33 |     let fourbit_map_f: [u8; 256] = [
 34 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 35 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 36 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 1, 12,
 37 |         12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 3, 12, 12, 12, 12, 12, 12, 12,
 38 |         12, 12, 12, 12, 12, 0, 12, 1, 12, 12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 39 |         12, 3, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 40 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 41 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 42 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 43 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 44 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 45 |         12, 12, 12,
 46 |     ];
 47 |     let len = s.len();
 48 |     let mut out_s = Vec::<u8>::with_capacity(len);
 49 |     for p in 0..len {
 50 |         out_s.push(fourbit_map_f[s[p] as usize]);
 51 |     }
 52 |     out_s
 53 | }
 54 | 
 55 | pub fn resolve_ht(
 56 |     fasta_file: &String,
 57 |     output_prefix: &String,
 58 |     w: u32,
 59 |     k: u32,
 60 |     r: u32,
 61 | ) -> Result<(), io::Error> {
 62 |     log::info!("resolve:fasta_file: {}", fasta_file);
 63 |     log::info!("resolve:output_prefix: {}", output_prefix);
 64 |     log::info!("resolve:parameters: w:{}, k:{}, r:{}", w, k, r,);
 65 | 
 66 |     let input_file = File::open(&fasta_file)?;
 67 |     let reader = BufReader::new(input_file);
 68 |     let mut fastx_reader = FastxReader::new(reader, &fasta_file)?;
 69 | 
 70 |     //let mut seqdb = FxHashMap::<Vec<u8>, Vec<u8>>::default();
 71 |     let mut shmmr_db = FxHashMap::<String, (u32, Vec<MM128>)>::default();
 72 |     let mut shmmr_map = FxHashMap::<u64, Vec<u64>>::default();
 73 |     let mut id2name = FxHashMap::<u32, String>::default();
 74 |     let mut rid = 0;
 75 |     let mut seq_db = FxHashMap::<String, Vec<u8>>::default();
 76 |     while let Some(rec) = fastx_reader.next_rec() {
 77 |         let rec = rec.unwrap();
 78 |         //seqdb.insert(r.id, r.seq);
 79 |         //println!("N {}", String::from_utf8_lossy(&r.id));
 80 |         let rec_2bitseq = base2twobit(&rec.seq);
 81 |         let shmmers = sequence_to_shmmrs(rid, &rec_2bitseq, w, k, r);
 82 |         for mm in shmmers.iter() {
 83 |             let hash = mm.x >> 8;
 84 |             shmmr_map.entry(hash).or_insert_with(|| vec![]).push(mm.y);
 85 |         }
 86 |         let n = String::from_utf8_lossy(&rec.id).into_owned();
 87 |         id2name.insert(rid, n.clone());
 88 |         shmmr_db.insert(n.clone(), (rid, shmmers));
 89 |         seq_db.insert(n, rec.seq);
 90 |         rid += 1;
 91 |     }
 92 | 
 93 |     let mut matches = FxHashMap::<(u32, u32), Vec<(u32, u32)>>::default();
 94 | 
 95 |     for (_, (rid, shmmrs)) in shmmr_db.iter() {
 96 |         for mer0 in shmmrs {
 97 |             let y0 = mer0.y;
 98 |             let pos0 = ((y0 & 0xFFFFFFFF) >> 1) as u32;
 99 |             //let strand0 = y0 & 0x1;
100 |             let hash = mer0.x >> 8;
101 |             let other = shmmr_map.get(&hash).unwrap();
102 |             if other.len() > 10 || other.len() < 2 {
103 |                 continue;
104 |             }
105 |             for y1 in other {
106 |                 let rid1 = (*y1 >> 32) as u32;
107 |                 if rid1 == *rid {
108 |                     continue;
109 |                 }
110 |                 let pos1 = ((*y1 & 0xFFFFFFFF) >> 1) as u32;
111 |                 //let strand1 = *y1 & 0x1;
112 |                 matches
113 |                     .entry((*rid, rid1))
114 |                     .or_insert_with(|| vec![])
115 |                     .push((pos0, pos1));
116 |                 //log::info!("S {} {} {} {} {} {} {}", rid, pos0, strain0, rid1, pos1, strain1, (pos0 as i32) - (pos1 as i32));
117 |             }
118 |         }
119 |     }
120 | 
121 |     let rel_path = format!("{}_rel.dat", output_prefix);
122 |     let mut rel_file = File::create(rel_path).unwrap();
123 |     let mut a_to_p = FxHashMap::<u32, u32>::default();
124 | 
125 |     for ((rid0, rid1), v) in matches {
126 |         let n0 = id2name.get(&rid0).unwrap();
127 |         let n1 = id2name.get(&rid1).unwrap();
128 | 
129 |         let s0 = shmmr_db.get(n0).unwrap().1.len() as f32;
130 |         let s1 = shmmr_db.get(n1).unwrap().1.len() as f32;
131 |         let c = v.len();
132 |         let r0 = (c as f32) / s0;
133 |         let r1 = (c as f32) / s1;
134 |         writeln!(
135 |             rel_file,
136 |             "S {} {} {} {} {} {} {}",
137 |             rid0, rid1, c, s0, s1, r0, r1
138 |         )?;
139 |         if r0 > 0.50 && s1 > s0 {
140 |             a_to_p.insert(rid0, rid1);
141 |         }
142 |     }
143 | 
144 |     let p_ctg_path = format!("{}_p.fa", output_prefix);
145 |     let mut p_ctg_file = File::create(p_ctg_path).unwrap();
146 |     let a_ctg_path = format!("{}_a.fa", output_prefix);
147 |     let mut a_ctg_file = File::create(a_ctg_path).unwrap();
148 |     let mut ctg_ids = id2name.keys().map(|x| *x).collect::<Vec<u32>>();
149 |     ctg_ids.sort();
150 |     let mut p_ctg_lengths = Vec::<(String, usize)>::new();
151 |     let mut a_ctg_lengths = Vec::<(String, usize)>::new();
152 |     for ctg_id in ctg_ids {
153 |         if a_to_p.contains_key(&ctg_id) {
154 |             let n0 = id2name.get(&ctg_id).unwrap();
155 |             let n1 = id2name.get(a_to_p.get(&ctg_id).unwrap()).unwrap();
156 |             writeln!(rel_file, "A {} {}", n0, n1)?;
157 |             let seq = seq_db.get(n0).unwrap();
158 |             writeln!(a_ctg_file, ">{}", n0)?;
159 |             writeln!(a_ctg_file, "{}", String::from_utf8_lossy(seq))?;
160 |             a_ctg_lengths.push((n0.clone(), seq.len()))
161 |         } else {
162 |             let n0 = id2name.get(&ctg_id).unwrap();
163 |             writeln!(rel_file, "P {} {}", n0, n0)?;
164 |             let seq = seq_db.get(n0).unwrap();
165 |             writeln!(p_ctg_file, ">{}", n0)?;
166 |             writeln!(p_ctg_file, "{}", String::from_utf8_lossy(seq))?;
167 |             p_ctg_lengths.push((n0.clone(), seq.len()))
168 |         }
169 |     }
170 |     log::info!("primary ctg stats");
171 |     log_asm_summary(p_ctg_lengths);
172 |     log::info!("associated ctg stats");
173 |     log_asm_summary(a_ctg_lengths);
174 |     Ok(())
175 | }
176 | 


--------------------------------------------------------------------------------
/misc/logo.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 3 | <svg width="100%" height="100%" viewBox="0 0 247 246" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;">
 4 |     <g transform="matrix(1,0,0,1,11.0332,10.8976)">
 5 |         <g transform="matrix(1,0,0,1,-873.121,-487.926)">
 6 |             <g transform="matrix(1,0,0,1,485.08,304.02)">
 7 |                 <g>
 8 |                     <path d="M579.228,185.5C596.8,185.5 611,200.102 611,218.059L611,373.941C611,391.898 596.8,406.5 579.228,406.5L421.272,406.5C403.7,406.5 389.5,391.898 389.5,373.941L389.5,218.059C389.5,200.102 403.7,185.5 421.272,185.5L579.228,185.5Z" style="fill-opacity:0.643137;fill-rule:nonzero;"/>
 9 |                     <path d="M421.272,189.5L579.228,189.5C594.566,189.5 607,202.286 607,218.059L607,373.941C607,389.714 594.566,402.5 579.228,402.5L421.272,402.5C405.934,402.5 393.5,389.714 393.5,373.941L393.5,218.059C393.5,202.286 405.934,189.5 421.272,189.5Z" style="fill:url(#_Linear1);fill-rule:nonzero;"/>
10 |                     <path d="M421.272,189.5L579.228,189.5C594.566,189.5 607,202.286 607,218.059L607,373.941C607,389.714 594.566,402.5 579.228,402.5L421.272,402.5C405.934,402.5 393.5,389.714 393.5,373.941L393.5,218.059C393.5,202.286 405.934,189.5 421.272,189.5Z" style="fill-opacity:0;fill-rule:nonzero;stroke:rgb(41,82,139);stroke-width:8px;"/>
11 |                 </g>
12 |                 <g transform="matrix(-0.002,-1,1,-0.002,445.783,382.839)">
13 |                     <g>
14 |                         <text x="0px" y="0px" style="font-family:'Optima-BoldItalic', 'Optima';font-weight:700;font-style:italic;font-size:42px;fill:rgb(67,89,165);">Peregrine</text>
15 |                     </g>
16 |                 </g>
17 |                 <g>
18 |                     <path d="M498.431,224.439C498.928,232.061 500.585,231.398 498.762,234.878C496.94,238.357 496.774,240.843 491.637,252.773C486.501,264.703 486.335,265.862 480.536,275.141C474.737,284.42 469.931,289.888 476.062,284.752C482.193,279.615 479.707,282.598 485.838,276.301C491.969,270.005 489.318,273.319 496.277,265.531C503.236,257.743 500.585,260.395 507.378,253.767C514.172,247.139 516.657,248.133 518.646,243.825C520.634,239.517 521.297,241.505 519.64,237.86C517.983,234.215 518.314,231.564 512.349,228.913C506.384,226.261 498.431,224.439 498.431,224.439Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
19 |                     <path d="M499.094,255.369C498.276,257.496 496.967,264.695 496.803,269.276C496.64,273.857 495.331,275.493 495.658,280.401C495.985,285.309 495.331,283.509 495.658,290.872C495.985,298.234 493.531,295.616 497.131,302.978C500.73,310.341 500.566,312.14 503.348,311.65C506.129,311.159 505.643,310.116 507.481,307.108C510.906,301.501 510.219,307.723 508.91,300.197C507.601,292.671 508.316,294.079 507.274,286.728C506.364,280.31 506.231,283.658 506.364,276.867C506.507,269.579 506.253,267.321 506.829,262.258C507.318,257.967 508.281,257.537 507.378,253.767C506.661,250.767 499.094,255.369 499.094,255.369Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
20 |                     <path d="M530.179,218.722C526.742,221.947 526.991,218.727 527.68,224.439C528.037,227.4 528.927,229.819 529.581,232.764C530.236,235.709 530.992,236.009 530.79,238.17C530.599,240.205 531.108,240.078 528.706,242.772C524.677,247.292 516.763,253.079 524.943,248.334C533.124,243.59 533.402,244.638 535.087,241.626C536.362,239.348 536.293,238.984 538.828,238.514C542.964,237.747 543.343,237.924 546.191,235.736C548.746,233.773 549.992,233.223 549.873,229.276C549.81,227.19 550.281,225.75 547.928,223.753C543.833,220.28 544.654,220.632 539.037,219.343C533.985,218.183 530.179,218.722 530.179,218.722Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
21 |                     <path d="M508.91,253.687C513.979,255.996 512.208,258.715 517.498,256.994C521.942,255.549 521.434,254.514 526.274,252.897C529.648,251.769 527.398,252.768 535.414,250.142C543.431,247.516 544.713,247.025 549.634,246.698C554.556,246.371 553.619,246.392 557.055,246.883C560.491,247.373 565.759,248.456 564.941,244.529C564.123,240.603 564.621,239.683 560.489,239.315C553.317,238.674 554.758,238.997 549.634,240.014C542.093,241.512 547.769,240.703 539.121,242.685C531.215,244.497 536.802,244.15 528.691,245.801C521.091,247.349 518.958,248.981 514.769,250.142C510.828,251.234 508.91,253.687 508.91,253.687Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
22 |                     <path d="M521.114,248.189C522.825,250.142 522.942,250.142 524.453,252.897C527.007,257.553 526.945,254.904 527.451,261.485C527.958,268.065 527.424,263.942 527.804,268.877C528.183,273.812 528.579,272.681 528.563,277.735C528.534,286.759 525.81,284.193 526.791,288.238C527.413,290.798 527.474,293.256 530.133,293.335C535.067,293.482 535.795,291.152 538.92,290.293C542.03,289.437 541.564,289.554 542.356,286.728C543.902,281.215 543.495,283.429 543.622,277.355C543.748,271.281 543.242,275.078 543.116,269.003C542.989,262.929 543.495,266.128 543.116,259.512C542.736,252.897 545.037,255.036 542.938,251.312C539.827,245.794 543.347,246.259 538.081,244.629C532.85,243.01 533.472,243.847 531.231,243.965C525.432,244.268 521.114,248.189 521.114,248.189Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
23 |                     <path d="M512.664,267.17C514.072,269.636 514.356,269.652 516.413,271.098C518.906,272.851 519.724,272.197 521.434,271.914C523.136,271.632 521.223,272.04 526.285,269.51C531.347,266.979 528.213,268.24 534.16,266.089C540.108,263.938 537.996,264.398 543.833,263.481C549.209,262.637 546.515,262.812 549.634,263.1C552.452,263.359 556.434,263.147 556.419,260.492C556.405,257.886 556.311,257.134 555.847,256.804C552.581,254.48 550.723,254.617 547.388,255.532C543.933,256.479 546.086,256.482 540.138,258.203C534.191,259.923 539.405,258.595 532.825,260.619C526.244,262.644 531.267,260.947 525.066,262.845C518.865,264.743 512.664,267.17 512.664,267.17Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
24 |                     <path d="M514.769,282.67C516.035,284.821 515.782,284.687 517.553,285.707C519.325,286.728 517.847,286.728 521.434,286.728C525.02,286.728 522.236,286.973 526.285,285.328C530.335,283.683 529.702,283.556 533.372,282.544C537.041,281.531 535.97,281.406 540.205,280.392C542.998,279.724 542.51,279.959 544.469,279.886C546.381,279.815 549.637,280.019 551.721,280.139C553.92,280.266 554.264,280.14 556.029,279C558.538,277.381 559.281,276.669 559.054,275.861C558.673,274.499 558.364,274.069 558.301,273.939C557.036,271.31 558.662,272.351 556.029,270.685C555.081,270.084 553.977,270.372 549.634,270.648C541.881,271.141 547.038,270.031 540.332,272.554C533.625,275.078 535.902,276.469 529.828,277.608C523.754,278.747 523.858,279.633 520.579,280.266C517.3,280.899 514.769,282.67 514.769,282.67Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
25 |                     <path d="M508.281,298.723C509.91,301.983 508.844,299.983 511.016,302.856C512.82,305.242 510.932,306.125 518.524,303.088C526.117,300.051 525.189,300.717 528.361,300.062C533.716,298.956 534.026,298.543 539.012,298.141C548.221,297.399 544.88,297.279 549.634,297.349C555.891,297.443 554.772,298.849 559.093,299.072C562.468,299.247 563.194,299.754 564.86,296.464C566.526,293.173 566.232,294.535 564.86,291.956C563.489,289.377 565.382,289.496 560.705,288.112C556.029,286.728 555.684,286.974 551.41,287.781C545.24,288.945 548.101,288.532 540.002,290.225C531.903,291.919 531.002,291.725 527.371,293.31C522.834,295.291 525.987,294.031 519.165,295.929C512.343,297.828 508.281,298.723 508.281,298.723Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
26 |                     <path d="M460.899,328.125L469.755,334.29L479.956,339.111L494.193,332.609L503.049,328.349L514.769,324.426L521.114,322.632L532.083,319.942L543.833,317.363L555.513,316.915L566.002,317.475L576.139,320.39L583.202,322.296L588.807,319.942L592.369,314.673L589.703,308.619L583.426,305.027L574.121,305.027L563.808,305.929L549.88,308.731L536.343,312.52L519.08,314.561L504.731,317.139L492.175,319.493L479.844,322.071L460.899,328.125Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
27 |                     <path d="M519.08,301.669L520.131,306.938L520.131,312.296L520.931,321.511L520.131,331.712L520.931,342.362L522.219,350.321C522.219,350.321 523.003,355.029 523.115,356.374C523.227,357.72 526.366,366.688 526.366,366.688L529.617,370.163L533.653,367.248L536.904,361.195L538.809,354.357L540.379,346.285L541.164,336.42L541.836,327.677L541.052,319.942L540.491,309.292L537.688,304.803L528.945,300.884" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
28 |                 </g>
29 |             </g>
30 |         </g>
31 |     </g>
32 |     <defs>
33 |         <linearGradient id="_Linear1" x1="0" y1="0" x2="1" y2="0" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.30425e-14,213,-213,1.30425e-14,500.25,189.5)"><stop offset="0" style="stop-color:white;stop-opacity:1"/><stop offset="1" style="stop-color:rgb(212,212,212);stop-opacity:1"/></linearGradient>
34 |     </defs>
35 | </svg>
36 | 


--------------------------------------------------------------------------------
/src/bin/utils/build_sdb.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | #![allow(dead_code)]
 11 | 
 12 | use flate2::bufread::MultiGzDecoder;
 13 | use rayon::prelude::*;
 14 | use std::fs::File;
 15 | use std::io::prelude::*;
 16 | use std::io::{self, BufReader, SeekFrom};
 17 | 
 18 | pub struct SeqRec {
 19 |     pub id: Vec<u8>,
 20 |     pub seq: Vec<u8>,
 21 | }
 22 | 
 23 | enum Fastx {
 24 |     FastQ,
 25 |     FastA,
 26 | }
 27 | pub struct FastxReader<R> {
 28 |     // struct for reading different file types
 29 |     inner: R,
 30 |     t: Fastx,
 31 | }
 32 | 
 33 | impl<R: BufRead> FastxReader<R> {
 34 |     pub fn new(mut inner: R, filename: &String) -> Result<Self, io::Error> {
 35 |         let t: Fastx;
 36 |         {
 37 |             // peek the file to decide if it is fasta or fastq
 38 |             let r = inner.by_ref();
 39 |             let mut buf = Vec::<u8>::new();
 40 |             r.take(1).read_to_end(&mut buf)?;
 41 |             if buf.len() < 1 {
 42 |                 return Err(io::Error::new(
 43 |                     io::ErrorKind::Other,
 44 |                     format!("empty file: {}", filename),
 45 |                 ));
 46 |             }
 47 |             match buf[0] {
 48 |                 b'>' => t = Fastx::FastA,
 49 |                 b'@' => t = Fastx::FastQ,
 50 |                 _ => t = Fastx::FastA,
 51 |             }
 52 |         }
 53 |         Ok(Self { inner, t })
 54 |     }
 55 | 
 56 |     pub fn next_rec(&mut self) -> Option<io::Result<SeqRec>> {
 57 |         match self.t {
 58 |             Fastx::FastA => self.fasta_next_rec(),
 59 |             Fastx::FastQ => self.fastq_next_rec(),
 60 |         }
 61 |     }
 62 | 
 63 |     pub fn fasta_next_rec(&mut self) -> Option<io::Result<SeqRec>> {
 64 |         // naive partser for fasta format to get the next record
 65 | 
 66 |         let mut id_tmp = Vec::<u8>::with_capacity(512);
 67 |         let mut seq = Vec::<u8>::with_capacity(1 << 14);
 68 | 
 69 |         let res = self.inner.read_until(b'\n', &mut id_tmp);
 70 |         if res.is_err() {
 71 |             Some(res);
 72 |         } else if res.ok() == Some(0) {
 73 |             return None;
 74 |         }
 75 |         let mut r = BufReader::new(&id_tmp[..]);
 76 |         let mut id = Vec::<u8>::with_capacity(512);
 77 |         let res = r.read_until(b' ', &mut id);
 78 |         if res.is_err() {
 79 |             Some(res);
 80 |         }
 81 |         let id = id
 82 |             .into_iter()
 83 |             .filter(|c| *c != b'\n' && *c != b' ' && *c != b'\r')
 84 |             .collect();
 85 |         let _x = self.inner.read_until(b'>', &mut seq);
 86 |         let seq = seq
 87 |             .into_iter()
 88 |             .filter(|c| *c != b'\n' && *c != b'>' && *c != b'\r')
 89 |             .collect();
 90 |         let rec = SeqRec { id: id, seq: seq };
 91 | 
 92 |         Some(Ok(rec))
 93 |     }
 94 | 
 95 |     pub fn fastq_next_rec(&mut self) -> Option<io::Result<SeqRec>> {
 96 |         // naive partser for fastq format to get the next record
 97 |         // QV strings are ignored
 98 | 
 99 |         let mut buf = Vec::<u8>::with_capacity(512);
100 |         let mut id_tmp = Vec::<u8>::with_capacity(512);
101 |         let mut seq = Vec::<u8>::with_capacity(1 << 14);
102 | 
103 |         let _res = self.inner.read_until(b'\n', &mut id_tmp); //read id
104 |                                                               // fetch the first id up to the first space, strip '\n'
105 |         let mut r = BufReader::new(&id_tmp[..]);
106 |         let mut id = Vec::<u8>::with_capacity(512);
107 |         let _res = r.read_until(b' ', &mut id);
108 |         let id = id
109 |             .into_iter()
110 |             .filter(|c| *c != b'\n' && *c != b' ' && *c != b'\r')
111 |             .collect();
112 |         // get the seq
113 |         let _res = self.inner.read_until(b'\n', &mut seq);
114 |         let seq = seq
115 |             .into_iter()
116 |             .filter(|c| *c != b'\n' && *c != b'\r')
117 |             .collect();
118 |         let rec = SeqRec { id: id, seq: seq };
119 |         // ignore QV
120 |         let _res = self.inner.read_until(b'+', &mut buf);
121 |         let _res = self.inner.read_until(b'\n', &mut buf);
122 |         let _res = self.inner.read_until(b'\n', &mut buf);
123 |         let res = self.inner.read_until(b'@', &mut buf); //get to id line
124 |         if res.is_err() {
125 |             Some(res);
126 |         } else if res.ok() == Some(0) {
127 |             return None;
128 |         }
129 |         Some(Ok(rec))
130 |     }
131 | }
132 | 
133 | fn get_hpc_flag(seq0: &Vec<u8>) -> Vec<u8> {
134 |     // We use a hybrid approach to handle homopolymer sequence
135 |     // In somce case, it is use to know a base is part of long homopolymer and
136 |     // it is prone to have insertion or deletion errors. We don't compress them
137 |     // when we store the sequences but we mark those bases in case it is useful
138 |     // to ignore them.
139 | 
140 |     let mut flag = Vec::<u8>::with_capacity(seq0.len());
141 |     let mut i = 0_usize;
142 |     let seq0len = seq0.len();
143 |     while i < seq0len {
144 |         let mut j = i;
145 |         while j < seq0len - 2 && seq0[j] == seq0[j + 1] {
146 |             j += 1;
147 |         }
148 |         if j != i {
149 |             // mask HP > 5 bases
150 |             let mut count = 0_u32;
151 |             while i <= j {
152 |                 if count < 4 {
153 |                     flag.push(0b0000);
154 |                 } else {
155 |                     flag.push(0b0100);
156 |                 }
157 |                 count += 1;
158 |                 i += 1;
159 |             }
160 |         } else {
161 |             //dimer case
162 |             let mut j = i;
163 |             while j < seq0len - 4 && seq0[j] == seq0[j + 2] && seq0[j + 1] == seq0[j + 3] {
164 |                 j += 2;
165 |             }
166 |             if j != i {
167 |                 let mut count = 0_u32;
168 |                 while i <= j {
169 |                     if count < 4 {
170 |                         flag.push(0b0000);
171 |                         flag.push(0b0000);
172 |                     } else {
173 |                         flag.push(0b1000);
174 |                         flag.push(0b1000);
175 |                     }
176 |                     i += 2;
177 |                     count += 2;
178 |                 }
179 |             } else {
180 |                 flag.push(0x0000);
181 |                 i += 1;
182 |             }
183 |         }
184 |     }
185 |     flag
186 | }
187 | 
188 | pub fn encode_biseq(s: &Vec<u8>) -> Vec<u8> {
189 |     // we use 4 bits to store the each base, the reversed compliment are stored together
190 |     // the lower 4 bits are for the base from the original orientation
191 |     // the higher 4 bits are for the base from the reversed complementary orientation
192 |     // For each 4 bit field, lower two bits are for the base, the higher two bits are flags.
193 |     // 'A' = 0bxxx00, 'C' = 0bxx01, 'G' = 0bxx10, 'T' = 0bxx11
194 |     // flag: 0b10xx = hp tagged, 0b00xx = non hp tagged
195 |     // 0b1100 (12) : None base
196 | 
197 |     let fourbit_map_f: [u8; 256] = [
198 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
199 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
200 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 1, 12,
201 |         12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 3, 12, 12, 12, 12, 12, 12, 12,
202 |         12, 12, 12, 12, 12, 0, 12, 1, 12, 12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
203 |         12, 3, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
204 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
205 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
206 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
207 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
208 |         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
209 |         12, 12, 12,
210 |     ];
211 |     let len = s.len();
212 |     let mut out_s = Vec::<u8>::with_capacity(len);
213 |     let mut f_seq = Vec::<u8>::with_capacity(len);
214 |     let mut r_seq = Vec::<u8>::with_capacity(len);
215 |     for p in 0..len {
216 |         let rp = len - 1 - p;
217 |         //let code = ((fourbit_map_f[s[rp] as usize] ^ 0x03) << 4) | fourbit_map_f[s[p] as usize];
218 |         //out_s.push(code);
219 |         f_seq.push(fourbit_map_f[s[p] as usize]);
220 |         r_seq.push(fourbit_map_f[s[rp] as usize] ^ 0b0011);
221 |     }
222 | 
223 |     let f_flag = get_hpc_flag(&f_seq);
224 |     let r_flag = get_hpc_flag(&r_seq);
225 |     for p in 0..len {
226 |         out_s.push(((r_flag[p] | r_seq[p]) << 4) | (f_flag[p] | f_seq[p]));
227 |     }
228 |     out_s
229 | }
230 | 
231 | pub fn build(seq_list_file: &String, out_prefix: &String) -> Result<usize, io::Error> {
232 |     // given a list of file in `seq_list_file`, read the sequences to building the sequence
233 |     // database and index
234 | 
235 |     let seqdb_name = format!("{}.seqdb", out_prefix);
236 |     log::info!("create seq db: {}", seqdb_name);
237 |     let mut out_db_file = File::create(seqdb_name)?;
238 |     let seqidx_name = format!("{}.idx", out_prefix);
239 |     log::info!("create seq index: {}", seqidx_name);
240 |     let mut out_idx_file = File::create(seqidx_name)?;
241 |     let mut start = 0_usize;
242 |     let mut seq_id = 0_u32;
243 | 
244 |     log::info!("get input files from: {}", seq_list_file);
245 |     let f = File::open(seq_list_file)?;
246 |     let seq_list_buf = BufReader::new(f);
247 | 
248 |     for fastx_file in seq_list_buf.lines() {
249 |         let input_fn = fastx_file.unwrap();
250 |         log::info!("input file: {}", input_fn);
251 |         let metadata = std::fs::metadata(&input_fn)?;
252 |         if !metadata.is_file() || metadata.len() < (1 << 16) {
253 |             log::info!(
254 |                 "input file: {} may not be proper input file (filesize = {}), ignore",
255 |                 input_fn,
256 |                 metadata.len()
257 |             );
258 |             continue;
259 |         }
260 |         let input_file = File::open(&input_fn)?;
261 |         let mut reader = BufReader::new(input_file);
262 |         let mut is_gzfile = false;
263 |         {
264 |             let r = reader.by_ref();
265 |             let mut buf = Vec::<u8>::new();
266 |             let _ = r.take(2).read_to_end(&mut buf);
267 |             if buf == [0x1F_u8, 0x8B_u8] {
268 |                 log::info!("input file: {} detected as gz-compressed file", input_fn);
269 |                 is_gzfile = true;
270 |             }
271 |         }
272 | 
273 |         let _ = reader.seek(SeekFrom::Start(0));
274 |         let mut seqs = Vec::<(u32, Vec<u8>, Vec<u8>)>::new();
275 |         if is_gzfile {
276 |             let fastx_buf = BufReader::new(MultiGzDecoder::new(&mut reader));
277 |             let mut fastx_reader = FastxReader::new(fastx_buf, &input_fn)?;
278 |             while let Some(r) = fastx_reader.next_rec() {
279 |                 let r = r.unwrap();
280 |                 if r.seq.len() < 500 {
281 |                     //ignore very short reads
282 |                     continue;
283 |                 }
284 |                 seqs.push((seq_id, r.id, r.seq));
285 |                 seq_id += 1;
286 |             }
287 |         } else {
288 |             let mut fastx_reader = FastxReader::new(reader, &input_fn)?;
289 |             while let Some(r) = fastx_reader.next_rec() {
290 |                 let r = r.unwrap();
291 |                 if r.seq.len() < 500 {
292 |                     //ignore very short reads
293 |                     continue;
294 |                 }
295 |                 seqs.push((seq_id, r.id, r.seq));
296 |                 seq_id += 1;
297 |             }
298 |         }
299 |         let biseq = seqs
300 |             .par_iter()
301 |             .map(|(id, name, s)| (*id, name.clone(), encode_biseq(s)))
302 |             .collect::<Vec<(u32, Vec<u8>, Vec<u8>)>>();
303 | 
304 |         biseq.iter().for_each(|(id, name, s)| {
305 |             let _ = out_db_file.write(&s);
306 |             let _ = writeln!(
307 |                 out_idx_file,
308 |                 "{:09} {} {} {}",
309 |                 id,
310 |                 String::from_utf8_lossy(name),
311 |                 s.len(),
312 |                 start
313 |             );
314 |             start += s.len();
315 |         });
316 |     }
317 |     log::info!("total number of reads indexed: {}", seq_id);
318 |     log::info!("total number of bases: {}", start);
319 |     log::info!("average read length: {}", start as f32 / seq_id as f32);
320 |     Ok(start)
321 | }
322 | 


--------------------------------------------------------------------------------
/src/bin/utils/ovlp_ec.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | #![allow(dead_code)]
 11 | 
 12 | //
 13 | // generate error corrected reads from the overlaps
 14 | //
 15 | 
 16 | use super::shmmrutils::*;
 17 | use super::Parameters;
 18 | use glob::glob;
 19 | use memmap::{Mmap, MmapOptions};
 20 | use rustc_hash::FxHashMap;
 21 | use rustc_hash::FxHashSet;
 22 | use std::fs::File;
 23 | use std::io::{self, BufRead, Write};
 24 | use std::path::Path;
 25 | use std::str::from_utf8;
 26 | use std::thread;
 27 | 
 28 | use std::io::prelude::*;
 29 | use threadpool::ThreadPool;
 30 | 
 31 | #[derive(Clone, Copy)]
 32 | struct ReadLocation {
 33 |     start: usize,
 34 |     len: usize,
 35 | }
 36 | 
 37 | #[derive(Debug, Copy, Clone)]
 38 | struct Overlap {
 39 |     rid0: u32,
 40 |     rid1: u32,
 41 |     strand1: u8,
 42 |     len0: u32,
 43 |     len1: u32,
 44 |     d_left: i32,
 45 |     d_right: i32,
 46 |     bgn0: u32,
 47 |     end0: u32,
 48 |     bgn1: u32,
 49 |     end1: u32,
 50 |     dist: u32,
 51 |     idt: f32,
 52 |     dist_c: u32,
 53 |     max_dist_c: u32,
 54 |     idt_c: f32,
 55 |     flag: u8,
 56 | }
 57 | // flag bit field
 58 | // 0x01: the rid0 is chimer
 59 | // 0x02: the rir0 and rid1 are compatitable pair
 60 | // 0x04: the rid1 is the best right pair
 61 | // 0x08: the rid1 is the best left pair
 62 | // 0x10: the rid1 is a chimer
 63 | // 0x20: the rid1 is contained
 64 | // 0x40: the rid0 is contained
 65 | 
 66 | fn build_overlap(v: Vec<&str>) -> Overlap {
 67 |     Overlap {
 68 |         rid0: v[1].parse().unwrap(),
 69 |         rid1: v[2].parse().unwrap(),
 70 |         strand1: v[3].parse().unwrap(),
 71 |         len0: v[4].parse().unwrap(),
 72 |         len1: v[5].parse().unwrap(),
 73 |         d_left: v[6].parse().unwrap(),
 74 |         d_right: v[7].parse().unwrap(),
 75 |         bgn0: v[8].parse().unwrap(),
 76 |         end0: v[9].parse().unwrap(),
 77 |         bgn1: v[10].parse().unwrap(),
 78 |         end1: v[11].parse().unwrap(),
 79 |         dist: v[12].parse().unwrap(),
 80 |         idt: v[13].parse().unwrap(),
 81 |         dist_c: v[14].parse().unwrap(),
 82 |         max_dist_c: v[15].parse().unwrap(),
 83 |         idt_c: v[16].parse().unwrap(),
 84 |         flag: v[17].parse().unwrap(),
 85 |     }
 86 | }
 87 | 
 88 | fn _format_overlap(o: Overlap) -> String {
 89 |     format!(
 90 |         "{} {} {} {} {} {} {} {} {} {} {} {} {:.2} {} {} {:.2} {}",
 91 |         o.rid0,
 92 |         o.rid1,
 93 |         o.strand1,
 94 |         o.len0,
 95 |         o.len1,
 96 |         o.d_left,
 97 |         o.d_right,
 98 |         o.bgn0,
 99 |         o.end0,
100 |         o.bgn1,
101 |         o.end1,
102 |         o.dist,
103 |         o.idt,
104 |         o.dist_c,
105 |         o.max_dist_c,
106 |         o.idt_c,
107 |         o.flag
108 |     )
109 | }
110 | 
111 | type OverlapMap = FxHashMap<u32, Vec<Overlap>>;
112 | type _DeltaMap = FxHashMap<(u32, u32, u8, i32), Vec<u32>>;
113 | 
114 | fn build_read_ovlp_data<P>(filename: P) -> Result<OverlapMap, std::io::Error>
115 | where
116 |     P: AsRef<Path>,
117 | {
118 |     let mut rid2ovlp = OverlapMap::default();
119 |     let mut buffer = String::new();
120 | 
121 |     let mut file = File::open(filename)?;
122 |     file.read_to_string(&mut buffer)?;
123 |     for line in buffer.split("\n") {
124 |         let mut v: Vec<&str> = Vec::<&str>::with_capacity(24); // we need pre-allocate some space for performance
125 |         line.split(' ').for_each(|c| v.push(c));
126 |         match v[0] {
127 |             "O" => {
128 |                 let ovlp = build_overlap(v);
129 |                 //d_left = ovlp.d_left;
130 |                 rid2ovlp
131 |                     .entry(ovlp.rid0)
132 |                     .or_insert_with(|| vec![])
133 |                     .push(ovlp);
134 |             }
135 |             _ => (),
136 |         }
137 |     }
138 |     Ok(rid2ovlp)
139 | }
140 | 
141 | fn get_consensue_seq(
142 |     seq0: Vec<u8>,
143 |     support_seq: Vec<([usize; 4], Vec<u8>)>,
144 |     tol: f64,
145 |     min_ec_cov: Option<u16>,
146 | ) -> Option<Vec<u8>> {
147 |     let min_ec_cov = min_ec_cov.unwrap_or(1);
148 |     // 4bit encoded seq consensus
149 |     let mut cov = vec![0 as u16; seq0.len()];
150 |     let mut deltas = FxHashMap::<u32, u16>::default();
151 |     for (offsets, seq1) in support_seq {
152 |         let [b0, e0, b1, e1] = offsets;
153 |         let seq0_trim = seq0[b0..e0].to_vec();
154 |         let seq1_trim = seq1[b1..e1].to_vec();
155 |         if let Some(ovlpmatch) = match_reads(&seq0_trim, &seq1_trim, true, tol, 1200, 32) {
156 |             for p in ovlpmatch.bgn0..ovlpmatch.end0 {
157 |                 cov[(p as usize) + b0] += 1;
158 |             }
159 |             let mut dpts = ovlpmatch.deltas.unwrap();
160 |             dpts.reverse();
161 |             let mut d = 0_u8;
162 |             let mut px = 0_u32;
163 | 
164 |             for dpt in dpts {
165 |                 let c = if dpt.dk > 0 {
166 |                     0
167 |                 } else {
168 |                     seq1_trim[dpt.y as usize - 1]
169 |                 };
170 |                 let cx = dpt.x + (b0 as u32) - 1;
171 |                 if cx != px {
172 |                     d = 0;
173 |                 } else {
174 |                     d += 1;
175 |                 }
176 | 
177 |                 let key = cx << 12 | (d as u32) << 4 | (c as u32); //this limit the length of the read to be corrected to 2^20
178 |                 let counter = deltas.entry(key).or_insert(0);
179 |                 *counter += 1;
180 | 
181 |                 px = cx;
182 |             }
183 |         }
184 |     }
185 | 
186 |     let mut delta_best = FxHashMap::<u32, (u16, u8)>::default();
187 |     for k in deltas.keys() {
188 |         let v = deltas.get(k).unwrap();
189 |         let p = k >> 12;
190 |         if *v < cov[p as usize] >> 1 || *v < 3 {
191 |             continue;
192 |         }
193 |         let key = *k >> 4;
194 |         let counter = delta_best.entry(key).or_insert((*v, (*k & 0xF) as u8));
195 |         if *v > counter.0 {
196 |             delta_best.insert(key, (*v, (*k & 0xF) as u8));
197 |         }
198 |     }
199 | 
200 |     let mut keys = delta_best.keys().collect::<Vec<&u32>>();
201 |     keys.sort();
202 |     let mut max_delta = FxHashMap::<u32, u8>::default();
203 |     for k in keys.iter() {
204 |         //let v = delta_best.get(k).unwrap();
205 |         let x = *k >> 8;
206 |         max_delta.insert(x, (*k & 0xFF) as u8);
207 |     }
208 | 
209 |     let mut consensus_seq = Vec::<u8>::new();
210 |     let mut consensus_seq_cov = Vec::<u16>::new();
211 |     for p in 0..seq0.len() {
212 |         if !max_delta.contains_key(&(p as u32)) {
213 |             consensus_seq.push(seq0[p]);
214 |             consensus_seq_cov.push(cov[p]);
215 |             continue;
216 |         }
217 |         let max_d = *max_delta.get(&(p as u32)).unwrap();
218 |         for d in 0..max_d + 1 {
219 |             let k = ((p as u32) << 8) | (d as u32);
220 |             if let Some(v) = delta_best.get(&k) {
221 |                 if v.1 != 0 {
222 |                     if d == 0 {
223 |                         consensus_seq.push(seq0[p]);
224 |                         consensus_seq_cov.push(cov[p]);
225 |                     }
226 |                     consensus_seq.push(v.1);
227 |                     consensus_seq_cov.push(cov[p]);
228 |                     // println!("insert {} {}", p, b);
229 |                 }
230 |             }
231 |         }
232 |     }
233 |     assert!(consensus_seq_cov.len() == consensus_seq.len());
234 | 
235 |     let mut bgn = Option::<usize>::None;
236 |     for i in 0..consensus_seq_cov.len() {
237 |         if consensus_seq_cov[i] >= min_ec_cov {
238 |             // need two other supporting reads for EC
239 |             bgn = Some(i);
240 |             break;
241 |         }
242 |     }
243 | 
244 |     //consensus_seq_cov.iter().for_each(|c| {println!("{}", c)});
245 | 
246 |     let mut end = Option::<usize>::None;
247 |     let len = consensus_seq_cov.len() - 1;
248 |     for i in 0..consensus_seq_cov.len() {
249 |         if consensus_seq_cov[len - i] >= min_ec_cov {
250 |             end = Some(len - i);
251 |             break;
252 |         }
253 |     }
254 | 
255 |     if bgn.is_none() || end.is_none() {
256 |         None
257 |     } else {
258 |         let b = bgn.unwrap();
259 |         let e = end.unwrap();
260 |         if min_ec_cov >= 3 {
261 |             for i in b..e {
262 |                 if consensus_seq_cov[i] < min_ec_cov {
263 |                     return None;
264 |                 }
265 |             }
266 |         }
267 |         if b > e || e < b + 1000 {
268 |             None
269 |         } else {
270 |             Some(consensus_seq[b..e].to_vec())
271 |         }
272 |     }
273 | }
274 | 
275 | fn get_corrected_seq(
276 |     r: u32,
277 |     ovlp: &Vec<Overlap>,
278 |     read_index: &Vec<ReadLocation>,
279 |     readsdb: &Mmap,
280 |     tol: f64,
281 |     min_ec_cov: Option<u16>,
282 | ) -> Option<Vec<u8>> {
283 |     let rid0 = r;
284 |     let rloc0 = read_index[rid0 as usize];
285 |     let s0 = rloc0.start;
286 |     let len0 = rloc0.len;
287 |     let e0 = s0 + len0;
288 |     let mut seq0 = Vec::<u8>::with_capacity(20480);
289 | 
290 |     let basemap = [0b0001, 0b0010, 0b0100, 0b1000];
291 |     for c in &readsdb[s0..e0] {
292 |         seq0.push(basemap[(c & 0b0011) as usize]);
293 |     }
294 | 
295 |     // println!("seq {} {}", r, seq2string(&seq0));
296 |     let mut support_seqs = Vec::<([usize; 4], Vec<u8>)>::new();
297 |     for vv in ovlp.iter() {
298 |         if vv.flag & 0b0010 == 0 {
299 |             continue;
300 |         }
301 |         let rid1 = vv.rid1;
302 |         let rloc1 = read_index[rid1 as usize];
303 |         let strand: u8 = vv.strand1;
304 |         let s1 = rloc1.start;
305 |         let len1 = rloc1.len;
306 |         let e1 = s1 + len1;
307 |         let mut seq1 = Vec::<u8>::with_capacity(20480);
308 | 
309 |         // we need to map 2bit to 4bit so we can use 0x0000 for special cases
310 |         if strand == 0 {
311 |             for c in &readsdb[s1..e1] {
312 |                 // seq1.push(basemap[(c & 0x0F) as usize]);
313 |                 seq1.push(basemap[(c & 0b0011) as usize]);
314 |             }
315 |         } else {
316 |             for c in &readsdb[s1..e1] {
317 |                 //seq1.push(basemap[((c >> 4) & 0x0F) as usize]);
318 |                 seq1.push(basemap[((c >> 4) & 0b0011) as usize]);
319 |             }
320 |         }
321 |         let b0 = vv.bgn0 as usize;
322 |         let e0 = vv.end0 as usize;
323 |         let b1 = vv.bgn1 as usize;
324 |         let e1 = vv.end1 as usize;
325 |         if b1 > e1 {
326 |             continue;
327 |         }
328 |         support_seqs.push(([b0, e0, b1, e1], seq1));
329 |     }
330 |     if let Some(c_seq) = get_consensue_seq(seq0, support_seqs, tol, min_ec_cov) {
331 |         let mut out_seq = Vec::<u8>::new();
332 |         for c in c_seq {
333 |             match c {
334 |                 0b0001 => out_seq.push(b'A'),
335 |                 0b0010 => out_seq.push(b'C'),
336 |                 0b0100 => out_seq.push(b'G'),
337 |                 0b1000 => out_seq.push(b'T'),
338 |                 _ => out_seq.push(b'N'),
339 |             }
340 |         }
341 |         Some(out_seq)
342 |     } else {
343 |         None
344 |     }
345 | }
346 | 
347 | pub fn ovlp_ec(
348 |     seqdb_file: &String,
349 |     index_file: &String,
350 |     prefix: &String,
351 |     out_prefix: &String,
352 |     parameters: &Parameters,
353 | ) -> Result<(), io::Error> {
354 |     let mut read_index = Vec::<ReadLocation>::new();
355 |     let mut read_name = Vec::<String>::new();
356 | 
357 |     log::info!("read seq idx: {}", seqdb_file);
358 |     let lines = read_lines(index_file)?;
359 |     for line in lines {
360 |         if let Ok(rec) = line {
361 |             // the record line looks like 000000023 m64062_190803_042216/144/ccs 20359 467415
362 |             let v: Vec<&str> = rec.split_whitespace().collect();
363 |             // let rid: u32 = v[0].parse().unwrap();
364 |             let start: usize = v[3].parse().unwrap();
365 |             let len: usize = v[2].parse().unwrap();
366 |             read_index.push(ReadLocation {
367 |                 start: start,
368 |                 len: len,
369 |             });
370 |             read_name.push(v[1].to_string());
371 |             //println!("{} {} {}", rid, start, len);
372 |         }
373 |     }
374 | 
375 |     let read_index = read_index;
376 |     log::info!("read seq db: {}", seqdb_file);
377 |     let file = File::open(seqdb_file)?;
378 |     let readsdb = unsafe { MmapOptions::new().map(&file)? };
379 |     //println!("{:?}", args);
380 |     let _ = log::info!("prefix: {}", prefix);
381 |     let _ = log::info!("out_prefix: {}", out_prefix);
382 |     let infile_pattern = [prefix.clone(), "*".to_string()].concat();
383 | 
384 |     let mut children = Vec::new();
385 |     //let mut chunk: u8 = 0;
386 |     for entry in glob(&infile_pattern).expect("Failed to read glob pattern") {
387 |         match entry {
388 |             Ok(path) => {
389 |                 // println!("{:?}", path.display());
390 |                 let child = thread::spawn(move || {
391 |                     let rid2ovlp = build_read_ovlp_data(path);
392 |                     rid2ovlp
393 |                 });
394 |                 children.push(child);
395 |             }
396 |             Err(e) => log::error!("{:?}", e),
397 |         }
398 |         //chunk += 1;
399 |     }
400 | 
401 |     let mut rid2ovlp_all = OverlapMap::default();
402 |     for child in children {
403 |         let rid2ovlp_p = child.join().expect("oops! the child thread panicked")?;
404 |         rid2ovlp_all.extend(rid2ovlp_p);
405 |     }
406 | 
407 |     let contained = FxHashSet::<u32>::default();
408 | 
409 |     let readsdb = std::sync::Arc::new(readsdb);
410 |     let read_index = std::sync::Arc::new(read_index);
411 |     let contained = std::sync::Arc::new(contained);
412 |     let rid2ovlp_all = std::sync::Arc::new(rid2ovlp_all);
413 |     let read_name = std::sync::Arc::new(read_name);
414 |     let pool = ThreadPool::new(parameters.nthreads as usize);
415 |     for chunk in 0..parameters.nchunks {
416 |         let readsdb = readsdb.clone();
417 |         let read_index = read_index.clone();
418 |         let contained = contained.clone();
419 |         let rid2ovlp_all = rid2ovlp_all.clone();
420 |         let read_name = read_name.clone();
421 |         let out_prefix = out_prefix.clone();
422 |         let nchunks = parameters.nchunks;
423 |         let tol = parameters.tol;
424 |         let min_ec_cov = parameters.min_ec_cov;
425 |         pool.execute(move || {
426 |             let out_file_name = format!("{}_{:02}.fa", out_prefix, chunk + 1);
427 |             let mut f = File::create(out_file_name).unwrap();
428 |             for r in rid2ovlp_all.keys() {
429 |                 if (*r % nchunks) != chunk {
430 |                     continue;
431 |                 }
432 |                 if contained.contains(r) {
433 |                     continue;
434 |                 }
435 |                 let ovlp = rid2ovlp_all.get(r).unwrap();
436 |                 // let mut deltas = HashMap::<u32, u8>::new();
437 | 
438 |                 let rid0 = *r;
439 |                 if let Some(corrected_seq) =
440 |                     get_corrected_seq(rid0, &ovlp, &read_index, &readsdb, tol, Some(min_ec_cov))
441 |                 {
442 |                     let _ = writeln!(
443 |                         f,
444 |                         ">{} {}\n{}",
445 |                         read_name[*r as usize],
446 |                         r,
447 |                         from_utf8(&corrected_seq).unwrap()
448 |                     );
449 |                 }
450 |             }
451 |         });
452 |     }
453 |     pool.join();
454 |     Ok(())
455 | }
456 | 
457 | // The output is wrapped in a Result to allow matching on errors
458 | // Returns an Iterator to the Reader of the lines of the file.
459 | fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
460 | where
461 |     P: AsRef<Path>,
462 | {
463 |     let file = File::open(filename)?;
464 |     Ok(io::BufReader::with_capacity(10000000, file).lines())
465 | }
466 | 


--------------------------------------------------------------------------------
/src/bin/pg_asm.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING");
 11 | 
 12 | #[global_allocator]
 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc;
 15 | 
 16 | use clap::clap_app;
 17 | use glob::glob;
 18 | use num_cpus;
 19 | use std::fs::File;
 20 | use std::fs::{create_dir_all, remove_file};
 21 | use std::io::{BufRead, Write};
 22 | use std::io::{BufReader, BufWriter, Result};
 23 | use std::path::Path;
 24 | use sysinfo::SystemExt;
 25 | 
 26 | mod utils;
 27 | use simple_logger::SimpleLogger;
 28 | use std::time::SystemTime;
 29 | use utils::build_idx;
 30 | use utils::build_sdb;
 31 | use utils::dp_graph;
 32 | use utils::graph;
 33 | use utils::layout;
 34 | use utils::ovlp;
 35 | use utils::ovlp_ec;
 36 | use utils::resolve::resolve_ht;
 37 | use utils::seqmap::dedup_target_seqs;
 38 | use utils::Parameters;
 39 | use utils::{getrusage, log_resource, rusage, MaybeUninit, RUSAGE_SELF};
 40 | 
 41 | fn cat_path(wd: &String, filename: &String) -> String {
 42 |     Path::new(&wd).join(&filename).to_string_lossy().to_string()
 43 | }
 44 | 
 45 | fn get_ovlps(
 46 |     input_reads: &String,
 47 |     work_dir: &String,
 48 |     prefix: &String,
 49 |     delete_input: bool,
 50 |     parameters: &Parameters,
 51 |     rdata: &mut rusage,
 52 | ) -> Result<()> {
 53 |     log_resource(
 54 |         &format!("BGN: get_ovlp, input_reads: {}", input_reads),
 55 |         rdata,
 56 |     );
 57 |     let output_prefix = format!("{}/{}", &work_dir, &prefix);
 58 | 
 59 |     log_resource("BGN: bulding sequence database", rdata);
 60 |     let nbase = build_sdb::build(&input_reads, &output_prefix)?;
 61 |     log_resource("END: bulding sequence database", rdata);
 62 |     if delete_input {
 63 |         let seq_list_buf = BufReader::new(File::open(&input_reads).unwrap());
 64 |         for fastx_file in seq_list_buf.lines() {
 65 |             let fastx_file = fastx_file?;
 66 |             remove_file(&fastx_file)?;
 67 |         }
 68 |     }
 69 | 
 70 |     let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix));
 71 |     let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix));
 72 |     let shmmer_idx = cat_path(&work_dir, &format!("{}-shmr", &prefix));
 73 | 
 74 |     // step 2: build shimmer index
 75 |     log_resource("BGN: bulding shimmer", rdata);
 76 |     build_idx::build(&seqdb, &seqidx, &shmmer_idx, &parameters);
 77 |     log_resource("END: bulding shimmer", rdata);
 78 | 
 79 |     //step 3: overlap reads
 80 |     let ovlp_out = format!("{}/{}-ovlp", &work_dir, &prefix);
 81 |     log_resource("BGN: overlapping", rdata);
 82 |     let system = sysinfo::System::new_all();
 83 |     let free_mem = system.total_memory() - system.used_memory();
 84 |     if (free_mem as f64) < ((nbase >> 10) as f64 * 1.5) {
 85 |         log::warn!("free memory is less than 1.5 x (total number of bases)");
 86 |         log::warn!(
 87 |             "free memory = {}kb, total bases / 1024 = {}",
 88 |             free_mem,
 89 |             nbase >> 10
 90 |         );
 91 |     }
 92 |     ovlp::ovlp(&seqdb, &seqidx, &shmmer_idx, &ovlp_out, &parameters)?;
 93 |     log_resource("END: overlapping", rdata);
 94 | 
 95 |     log_resource(
 96 |         &format!("END: get_ovlp - input_reads: {}", input_reads),
 97 |         rdata,
 98 |     );
 99 |     Ok(())
100 | }
101 | 
102 | fn main() -> Result<()> {
103 |     let mut rdata = unsafe { MaybeUninit::uninit().assume_init() };
104 |     let _res = unsafe { getrusage(RUSAGE_SELF, &mut rdata) };
105 | 
106 |     let matches = clap_app!(pg_asm =>
107 |         (version: VERSION_STRING)
108 |         (author: "Jason Chin <jason@omnibio.ai>")
109 |         (about: "
110 | Peregrine-2021 genome assembler
111 | pg_asm: the main workflow entry for end-to-end assembly from the reads
112 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/")
113 |         (@arg input_reads: +required "Path to a file that contains the list of reads in .fa .fa.gz .fastq or fastq.gz formats")
114 |         (@arg work_dir: +required "The path to a work directory for intermediate files and the results")
115 |         (@arg NTHREADS: +takes_value "Number of threads")
116 |         (@arg NCHUNKS: +takes_value "Number of partition")
117 |         (@arg w: -w +takes_value "Window size [default: 80]")
118 |         (@arg k: -k +takes_value "Kmer size [default: 56]")
119 |         (@arg r: -r +takes_value "Reduction factor [default: 6]")
120 |         (@arg tol: -t --tol +takes_value "Alignment tolerance [default: 0.01]")
121 |         (@arg layout_method: -l +takes_value "layout version [default: 2]")
122 |         (@arg bestn: --bestn -b +takes_value "number of best overlaps for initial graph [default: 6]")
123 |         (@arg keep: --keep "keep intermediate files")
124 |         (@arg fast: --fast "run the assembler in the fast mode")
125 |         (@arg no_resolve: --no_resolve "disable resolving repeats / dups at the end")
126 |         (@arg min_ec_cov: -c --min_ec_cov +takes_value "Minimum error coverage [default: 1]")
127 |         (@arg log: --log +takes_value "log level: DBBUG or INFO (default)")
128 |     ).get_matches();
129 | 
130 |     let log_level = match matches.value_of("log").unwrap_or("INFO") {
131 |         "DEBUG" => log::LevelFilter::Debug,
132 |         _ => log::LevelFilter::Info,
133 |     };
134 | 
135 |     SimpleLogger::new()
136 |         .with_level(log_level)
137 |         .with_utc_timestamps()
138 |         .init()
139 |         .unwrap();
140 | 
141 |     let input_reads = matches.value_of("input_reads").unwrap().to_string();
142 |     let work_dir = matches.value_of("work_dir").unwrap().to_string();
143 | 
144 |     let prefix = "reads".to_string();
145 | 
146 |     let keep = matches.is_present("keep");
147 |     let fastmode = matches.is_present("fast");
148 |     let no_resolve = matches.is_present("no_resolve");
149 | 
150 |     let physical_cpus = num_cpus::get_physical();
151 |     let nthreads = matches
152 |         .value_of("NTHREADS")
153 |         .unwrap_or(&physical_cpus.to_string())
154 |         .parse::<u32>()
155 |         .unwrap();
156 | 
157 |     let nchunks: u32;
158 |     if matches.is_present("NCHUNKS") {
159 |         nchunks = matches
160 |             .value_of("NCHUNKS")
161 |             .unwrap()
162 |             .to_string()
163 |             .parse::<u32>()
164 |             .unwrap();
165 |     } else {
166 |         let physical_cpus = physical_cpus as u32;
167 |         match physical_cpus {
168 |             1..=5 => nchunks = 16 + physical_cpus * 4,
169 |             6..=12 => nchunks = 2 + physical_cpus * 3,
170 |             13..=19 => nchunks = physical_cpus * 2,
171 |             _ => nchunks = physical_cpus,
172 |         }
173 |     }
174 | 
175 |     let wsize = matches
176 |         .value_of("w")
177 |         .unwrap_or("80")
178 |         .parse::<u32>()
179 |         .unwrap();
180 | 
181 |     let ksize = matches
182 |         .value_of("k")
183 |         .unwrap_or("56")
184 |         .parse::<u32>()
185 |         .unwrap();
186 | 
187 |     let rfactor = matches.value_of("r").unwrap_or("6").parse::<u32>().unwrap();
188 | 
189 |     let tol = matches
190 |         .value_of("tol")
191 |         .unwrap_or("0.01")
192 |         .parse::<f64>()
193 |         .unwrap();
194 | 
195 |     let min_ec_cov = matches
196 |         .value_of("min_ec_cov")
197 |         .unwrap_or("1")
198 |         .parse::<u16>()
199 |         .unwrap();
200 | 
201 |     let layout_method = matches
202 |         .value_of("layout_method")
203 |         .unwrap_or("2")
204 |         .parse::<u8>()
205 |         .unwrap();
206 | 
207 |     let bestn = matches
208 |         .value_of("bestn")
209 |         .unwrap_or("6")
210 |         .parse::<usize>()
211 |         .unwrap();
212 | 
213 |     let parameters = Parameters {
214 |         nchunks: nchunks,
215 |         nthreads: nthreads,
216 |         w: wsize,
217 |         k: ksize,
218 |         r: rfactor,
219 |         tol: tol,
220 |         min_ec_cov: min_ec_cov,
221 |     };
222 | 
223 |     log::info!("pg_asm {}", VERSION_STRING);
224 |     log::info!(
225 |         "command: {}",
226 |         std::env::args().collect::<Vec<String>>().join(" ")
227 |     );
228 |     let cdir = std::env::current_dir()?;
229 |     log::info!("current dir: {}", cdir.as_os_str().to_string_lossy());
230 | 
231 |     let start_wall_clock_time = SystemTime::now();
232 |     log::info!("pg_asm run start");
233 | 
234 |     log_resource("BGN: pg_asm", &mut rdata);
235 |     log::info!(
236 |         "pg_asm run parameters: w:{}, k:{}, r:{}, tol:{} bestn:{}",
237 |         wsize,
238 |         ksize,
239 |         rfactor,
240 |         tol,
241 |         bestn
242 |     );
243 | 
244 |     log::info!("faster mode: {}", fastmode);
245 |     log::info!("use layout method: {}", layout_method);
246 |     log::info!("keep intermediate files: {}", keep);
247 |     log::info!("number of threads: {}", nthreads);
248 |     log::info!("number of chunks: {}", nchunks);
249 |     log::info!("input read file: {}", input_reads);
250 |     log::info!("working directory: {}", work_dir);
251 |     log::info!(
252 |         "sys: number of physical CPU cores detected: {}",
253 |         physical_cpus
254 |     );
255 |     let system = sysinfo::System::new_all();
256 |     log::info!("sys: total memory: {} KB", system.total_memory());
257 |     log::info!("sys: used memory: {} KB", system.used_memory());
258 |     log::info!("sys: total swap: {} KB", system.total_swap());
259 |     log::info!("sys: used swap: {} KB", system.used_swap());
260 | 
261 |     if !Path::new(&work_dir).exists() {
262 |         create_dir_all(&work_dir)?;
263 |     };
264 | 
265 |     get_ovlps(
266 |         &input_reads,
267 |         &work_dir,
268 |         &prefix,
269 |         false,
270 |         &parameters,
271 |         &mut rdata,
272 |     )?;
273 | 
274 |     if fastmode {
275 |         log::info!("Fast mode: ignore read level error correction");
276 |         // graph processing
277 | 
278 |         let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix));
279 |         let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix));
280 |         let ovlp_out = cat_path(&work_dir, &format!("{}-ovlp", &prefix));
281 |         let layout_prefix = cat_path(&work_dir, &"asm".to_string());
282 | 
283 |         let layout_file = format!("{}_layout.dat", &layout_prefix);
284 | 
285 |         log_resource("BGN: ovlp2layout", &mut rdata);
286 |         log::info!("use layout method: {}", layout_method);
287 |         match layout_method {
288 |             1 => graph::ovlp2layout_v1(&ovlp_out, &layout_prefix, bestn),
289 |             _ => dp_graph::ovlp2layout_v2(&ovlp_out, &layout_prefix, bestn)?,
290 |         }
291 |         log_resource("END: ovlp2layout", &mut rdata);
292 | 
293 |         // layout -> sequence
294 |         let output_file_prefix = format!("{}/asm_ctgs", &work_dir);
295 |         log_resource("BGN: layout2ctg", &mut rdata);
296 |         layout::layout2ctg(&seqdb, &seqidx, &layout_file, &output_file_prefix)?;
297 |         let _res = unsafe { getrusage(RUSAGE_SELF, &mut rdata) };
298 |         log_resource("END: layout2ctg", &mut rdata);
299 | 
300 |     } else {
301 |         // error correction
302 |         let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix));
303 |         let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix));
304 |         let shmmer_idx = cat_path(&work_dir, &format!("{}-shmr", &prefix));
305 |         let ovlp_out = cat_path(&work_dir, &format!("{}-ovlp", &prefix));
306 |         let ec_read_prefix = cat_path(&work_dir, &"ec_read".to_string());
307 | 
308 |         log_resource("BGN: ovlp_ec", &mut rdata);
309 |         ovlp_ec::ovlp_ec(&seqdb, &seqidx, &ovlp_out, &ec_read_prefix, &parameters)?;
310 |         log_resource("END: ovlp_ec", &mut rdata);
311 | 
312 |         if !keep {
313 |             let ovlp_out_ptn = format!("{}*", ovlp_out);
314 |             for out in glob::glob(&ovlp_out_ptn.as_str()).expect("error to delete file") {
315 |                 let out = out.unwrap().to_string_lossy().into_owned();
316 |                 log::info!("remove {}", out);
317 |                 remove_file(out)?;
318 |             }
319 |             let shmmer_idx_ptn = format!("{}*", &shmmer_idx);
320 |             for f in glob(&shmmer_idx_ptn.as_str()).expect("error to delete file") {
321 |                 let f = f.unwrap().to_string_lossy().into_owned();
322 |                 log::info!("remove {}", f);
323 |                 remove_file(f)?;
324 |             }
325 | 
326 |             remove_file(&seqdb)?;
327 |             log::info!("remove {}", seqdb.to_string());
328 |             remove_file(&seqidx)?;
329 |             log::info!("remove {}", seqidx.to_string());
330 |         }
331 | 
332 |         let ec_lst = cat_path(&work_dir, &"ec_reads.lst".to_string());
333 |         let mut ec_lst_file = BufWriter::new(File::create(&ec_lst).unwrap());
334 |         let ec_file_ptn = format!("{}*.fa", &ec_read_prefix);
335 |         for f in glob(&ec_file_ptn.as_str()).unwrap() {
336 |             writeln!(ec_lst_file, "{}", f.unwrap().to_string_lossy())?;
337 |         }
338 |         drop(ec_lst_file); //close the file
339 | 
340 |         log::info!("input reads: {}", ec_lst);
341 |         log::info!("working directory: {}", work_dir);
342 | 
343 |         let prefix = "ec_reads".to_string();
344 |         if keep {
345 |             get_ovlps(&ec_lst, &work_dir, &prefix, false, &parameters, &mut rdata)?;
346 |         } else {
347 |             get_ovlps(&ec_lst, &work_dir, &prefix, true, &parameters, &mut rdata)?;
348 |         }
349 | 
350 |         if !keep {
351 |             let ec_file_ptn = format!("{}*.fa", &ec_read_prefix);
352 |             for f in glob(&ec_file_ptn.as_str()).unwrap() {
353 |                 let f = f.unwrap().to_string_lossy().into_owned();
354 |                 log::info!("remove {}", f);
355 |                 remove_file(f)?;
356 |             }
357 |             let shmmer_idx = cat_path(&work_dir, &format!("{}-shmr", &prefix));
358 |             let shmmer_idx_ptn = format!("{}*", &shmmer_idx);
359 |             for f in glob(&shmmer_idx_ptn.as_str()).expect("error to delete file") {
360 |                 let f = f.unwrap().to_string_lossy().into_owned();
361 |                 log::info!("remove {}", f);
362 |                 remove_file(f)?;
363 |             }
364 |         }
365 | 
366 |         //step 4: graph processing
367 | 
368 |         let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix));
369 |         let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix));
370 |         let ovlp_out = cat_path(&work_dir, &format!("{}-ovlp", &prefix));
371 |         let layout_prefix = cat_path(&work_dir, &"asm".to_string());
372 |         let layout_file = format!("{}_layout.dat", &layout_prefix);
373 | 
374 |         log_resource("BGN: ovlp2layout", &mut rdata);
375 |         log::info!("use layout method: {}", layout_method);
376 |         match layout_method {
377 |             1 => graph::ovlp2layout_v1(&ovlp_out, &layout_prefix, bestn),
378 |             _ => dp_graph::ovlp2layout_v2(&ovlp_out, &layout_prefix, bestn)?,
379 |         }
380 |         log_resource("END: ovlp2layout", &mut rdata);
381 | 
382 |         //step 5: layout -> sequence
383 |         log_resource("BGN: layout2ctg", &mut rdata);
384 |         let output_file_prefix = format!("{}/asm_ctgs", &work_dir);
385 |         layout::layout2ctg(&seqdb, &seqidx, &layout_file, &output_file_prefix)?;
386 |         let _res = unsafe { getrusage(RUSAGE_SELF, &mut rdata) };
387 |         log_resource("END: layout2ctg", &mut rdata);
388 |     }
389 |     if no_resolve {
390 |         log::info!("ignore dup resolution");
391 |     } else {
392 |         let ref_file = format!("{}/asm_ctgs_m.fa", &work_dir);
393 |         let tgt_file = format!("{}/asm_ctgs_e0.fa", &work_dir);
394 |         let out_file = format!("{}/asm_ctgs_e.fa", &work_dir);
395 | 
396 |         log_resource("BEN: dedup_a_ctgs", &mut rdata);
397 |         dedup_target_seqs(&ref_file, &tgt_file, &out_file, wsize, ksize, rfactor)?;
398 |         log_resource("END: dedup_a_ctgs", &mut rdata);
399 | 
400 |         let resolve_prefix = format!("{}/asm_ctgs_m", &work_dir);
401 | 
402 |         log_resource("BEN: resolve_ht", &mut rdata);
403 |         resolve_ht(&ref_file, &resolve_prefix, wsize, ksize, rfactor)?;
404 |         log_resource("END: resolve_ht", &mut rdata);
405 |     }
406 |     let (_, ut, st) = log_resource("END: pg_asm", &mut rdata);
407 |     log::info!("pg_asm run end");
408 |     log::info!(
409 |         "total user cpu time: {} seconds = {} hours",
410 |         ut,
411 |         ut as f32 / 60.0 / 60.0
412 |     );
413 |     log::info!(
414 |         "total system cpu time: {} seconds = {} hours",
415 |         st,
416 |         st as f32 / 60.0 / 60.0
417 |     );
418 |     let elapsed_time = start_wall_clock_time.elapsed().unwrap().as_secs_f32();
419 |     log::info!(
420 |         "total elapse time: {} seconds = {} hours",
421 |         elapsed_time,
422 |         elapsed_time / 60.0 / 60.0
423 |     );
424 |     Ok(())
425 | }
426 | 


--------------------------------------------------------------------------------
/src/bin/utils/shmmrutils.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | #![allow(dead_code)]
 11 | 
 12 | //
 13 | // utility functions for SHIMMER process and the read matching / mapping
 14 | //
 15 | 
 16 | use memmap::Mmap;
 17 | use std::fmt;
 18 | 
 19 | use std::collections::HashMap;
 20 | 
 21 | pub struct HPCSeq {
 22 |     pub s: Vec<u8>,
 23 |     pub p: Vec<u32>,
 24 | }
 25 | 
 26 | pub fn get_hpc_seq(seq0: &Vec<u8>) -> HPCSeq {
 27 |     let mut seq = HPCSeq {
 28 |         s: Vec::<u8>::with_capacity(seq0.len()),
 29 |         p: Vec::<u32>::with_capacity(seq0.len()),
 30 |     };
 31 |     let mut i = 0_usize;
 32 |     let seq0len = seq0.len();
 33 | 
 34 |     while i < seq0len - 1 {
 35 |         let b = seq0[i]; // 2bit flag + 2bit base
 36 |         if b & 0b1100 != 0 {
 37 |             i += 1; // base marked as hp/dimer tial
 38 |             continue;
 39 |         }
 40 |         seq.s.push(b & 0b0011);
 41 |         seq.p.push(i as u32);
 42 |         i += 1;
 43 |     }
 44 |     seq
 45 | }
 46 | 
 47 | pub struct OvlpMatch {
 48 |     pub m_size: u32,
 49 |     pub dist: u32,
 50 |     pub bgn0: u32,
 51 |     pub end0: u32,
 52 |     pub bgn1: u32,
 53 |     pub end1: u32,
 54 |     pub m_end0: u32,
 55 |     pub m_end1: u32,
 56 |     pub deltas: Option<Vec<DeltaPoint>>,
 57 | }
 58 | 
 59 | #[derive(Copy, Clone)]
 60 | pub struct DeltaPoint {
 61 |     pub x: u32,
 62 |     pub y: u32,
 63 |     pub dk: i32,
 64 | }
 65 | 
 66 | pub fn track_delta_point(
 67 |     delta_pts: &HashMap<(u32, i32), DeltaPoint>,
 68 |     d_final: u32,
 69 |     k_final: i32,
 70 |     s: u32,
 71 |     e: u32,
 72 | ) -> Vec<DeltaPoint> {
 73 |     let mut dpts = Vec::<DeltaPoint>::with_capacity(d_final as usize);
 74 |     let mut d = d_final;
 75 |     let mut k = k_final;
 76 |     while d > 0 {
 77 |         let dpt = delta_pts.get(&(d, k)).unwrap();
 78 |         if dpt.x > s && dpt.x < e {
 79 |             dpts.push(*dpt);
 80 |         }
 81 |         d -= 1;
 82 |         k -= dpt.dk;
 83 |     }
 84 |     dpts
 85 | }
 86 | 
 87 | pub fn match_reads(
 88 |     seq0: &Vec<u8>,
 89 |     seq1: &Vec<u8>,
 90 |     get_delta: bool,
 91 |     tol: f64,
 92 |     min_match_len: u32,
 93 |     bandwidth: u32,
 94 | ) -> Option<OvlpMatch> {
 95 |     //
 96 |     // A variation of the O(nD) algorithm for read alignments
 97 |     //
 98 | 
 99 |     // let min_match_len = 1200;
100 |     let len0 = seq0.len();
101 |     let len1 = seq1.len();
102 |     //println!("S {} {}", len0, len1);
103 |     //let d_max = 64 + (0.01 * if len0 < len1 {len0 as f32} else {len1 as f32}) as u32;
104 |     let d_max = 32
105 |         + (tol
106 |             * if len0 < len1 {
107 |                 len0 as f64
108 |             } else {
109 |                 len1 as f64
110 |             }) as u32;
111 |     let max_band_width = bandwidth;
112 |     let band_tolerance = bandwidth;
113 |     let mut k_min = 0_i32;
114 |     let mut k_max = 0_i32;
115 |     let mut uv_map = HashMap::<i32, (u32, u32)>::new();
116 |     // uv_map: maping k to the u, v, which keep the d path end in k
117 |     let mut delta_pts = HashMap::<(u32, i32), DeltaPoint>::new();
118 |     let mut x: u32;
119 |     let mut y: u32;
120 |     let mut x1: u32;
121 |     let mut y1: u32;
122 |     let mut best_m = -1_i32;
123 |     let mut matched = false;
124 |     let mut d_final = 0_u32;
125 |     let mut k_final = 0_i32;
126 |     let mut pre_k: i32;
127 |     let mut start = false;
128 |     let mut longest_match = 0_u32;
129 |     let mut rtn = OvlpMatch {
130 |         m_size: 0,
131 |         dist: 0,
132 |         bgn0: 0,
133 |         end0: 0,
134 |         bgn1: 0,
135 |         end1: 0,
136 |         m_end0: 0,
137 |         m_end1: 0,
138 |         deltas: None,
139 |     };
140 | 
141 |     for d in -(d_max as i32)..=(d_max as i32) {
142 |         uv_map.insert(d, (0, 0));
143 |     }
144 |     for d in 0..d_max {
145 |         if k_max - k_min > max_band_width as i32 {
146 |             // println!("KK {} {} {} {}", k_max, k_min, k_max - k_min, max_band_width);
147 |             break;
148 |         }
149 |         for k in (k_min..=k_max).step_by(2) {
150 |             let (_, vn) = uv_map.get(&(k - 1)).unwrap();
151 |             let (_, vp) = uv_map.get(&(k + 1)).unwrap();
152 |             if k == k_min || ((k != k_max) && vn < vp) {
153 |                 x = *vp;
154 |                 pre_k = k + 1;
155 |             } else {
156 |                 x = *vn + 1;
157 |                 pre_k = k - 1;
158 |             }
159 |             y = ((x as i32) - k) as u32;
160 | 
161 |             if get_delta {
162 |                 let dpt = DeltaPoint {
163 |                     x: x,
164 |                     y: y,
165 |                     dk: k - pre_k,
166 |                 };
167 |                 delta_pts.entry((d, k)).or_insert(dpt);
168 |             };
169 | 
170 |             x1 = x;
171 |             y1 = y;
172 | 
173 |             while (x as usize) < len0 - 1
174 |                 && (y as usize) < len1 - 1
175 |                 && seq0[x as usize] == seq1[y as usize]
176 |             {
177 |                 x += 1;
178 |                 y += 1;
179 |             }
180 | 
181 |             if (x - x1) > 8 {
182 |                 if !start {
183 |                     rtn.bgn0 = x1;
184 |                     rtn.bgn1 = y1;
185 |                     start = true;
186 |                 }
187 |                 // we set the ends here to avoid bad sequences
188 |                 // this way, we are sure that, at least, 8 bases are aligned
189 |                 rtn.end0 = x;
190 |                 rtn.end1 = y;
191 |             }
192 | 
193 |             if (x - x1) > longest_match {
194 |                 longest_match = x - x1;
195 |                 rtn.m_end0 = x;
196 |                 rtn.m_end1 = y;
197 |             }
198 | 
199 |             // println!("IM {} {} {} {} {} {} {} {}", x, y, len0, len1, d, d_max, k, pre_k);
200 |             uv_map.insert(k, (x + y, x));
201 |             if (x + y) as i32 > best_m {
202 |                 best_m = (x + y) as i32;
203 |             }
204 |             if (x as usize) >= len0 - 1 || (y as usize) >= len1 - 1 {
205 |                 matched = true;
206 |                 d_final = d;
207 |                 k_final = k;
208 |                 break;
209 |             }
210 |         }
211 |         // For banding
212 |         let mut k_max_new = k_min;
213 |         let mut k_min_new = k_max;
214 |         for k2 in (k_min..=k_max).step_by(2) {
215 |             let (u, _) = uv_map.get(&k2).unwrap();
216 |             if *u as i32 >= (best_m - (band_tolerance as i32)) {
217 |                 if k2 < k_min_new {
218 |                     k_min_new = k2;
219 |                 }
220 |                 if k2 > k_max_new {
221 |                     k_max_new = k2;
222 |                 }
223 |             }
224 |         }
225 | 
226 |         k_max = k_max_new + 1;
227 |         k_min = k_min_new - 1;
228 |         if matched == true {
229 |             //println!("match: {} {}", d_final, k_final);
230 |             let mut d_inside = 0_u32;
231 |             if get_delta {
232 |                 let dpts = track_delta_point(&delta_pts, d_final, k_final, rtn.bgn0, rtn.end0);
233 |                 for dpt in &dpts {
234 |                     if dpt.x > rtn.bgn0 && dpt.x < rtn.end0 {
235 |                         d_inside += 1;
236 |                     }
237 |                 }
238 |                 rtn.deltas = Some(dpts);
239 |             }
240 |             rtn.dist = d_inside;
241 |             rtn.m_size = (rtn.end0 - rtn.bgn0 + rtn.end1 - rtn.bgn1 + 2 * d_inside) >> 1;
242 |             if rtn.m_size < min_match_len {
243 |                 matched = false;
244 |             }
245 |             break;
246 |         }
247 |     }
248 |     if !matched {
249 |         None
250 |     } else {
251 |         Some(rtn)
252 |     }
253 | }
254 | 
255 | #[derive(Clone, Copy)]
256 | pub struct ReadLocation {
257 |     pub start: usize,
258 |     pub len: usize,
259 | }
260 | 
261 | pub fn get_2bit_fragment(
262 |     rid: u32,
263 |     strand: u8,
264 |     bgn: u32,
265 |     end: u32,
266 |     readsdb: &Mmap,
267 |     read_index: &Vec<ReadLocation>,
268 | ) -> Vec<u8> {
269 |     let mut seq = Vec::<u8>::new();
270 |     let rloc = read_index[rid as usize];
271 |     let s = rloc.start + bgn as usize;
272 |     let e = rloc.start + end as usize;
273 | 
274 |     if strand == 0 {
275 |         for c in &readsdb[s..e] {
276 |             seq.push(c & 0b0011);
277 |         }
278 |     } else {
279 |         for c in &readsdb[s..e] {
280 |             seq.push((c >> 4) & 0b0011);
281 |         }
282 |     }
283 |     seq
284 | }
285 | 
286 | pub fn get_seq_fragment(
287 |     rid: u32,
288 |     strand: u8,
289 |     bgn: u32,
290 |     end: u32,
291 |     mmap: &Mmap,
292 |     read_index: &Vec<ReadLocation>,
293 | ) -> Vec<u8> {
294 |     let mut seq = Vec::<u8>::new();
295 |     let base_map = &[b'A', b'C', b'G', b'T'];
296 |     let rloc = read_index[rid as usize];
297 |     let s = rloc.start + bgn as usize;
298 |     let e = rloc.start + end as usize;
299 | 
300 |     if strand == 0 {
301 |         for c in &mmap[s..e] {
302 |             if c & 0b1100 != 0b1100 {
303 |                 seq.push(base_map[(c & 0b0011) as usize]);
304 |             } else {
305 |                 seq.push(b'N');
306 |             }
307 |         }
308 |     } else {
309 |         for c in &mmap[s..e] {
310 |             if ((c >> 4) & 0b1100) != 0b1100 {
311 |                 seq.push(base_map[((c >> 4) & 0b0011) as usize]);
312 |             } else {
313 |                 seq.push(b'N');
314 |             }
315 |         }
316 |     }
317 |     //println!("{} {}", rid, String::from_utf8_lossy(&seq));
318 |     seq
319 | }
320 | 
321 | #[derive(Clone, Copy)]
322 | pub struct MM128 {
323 |     pub x: u64,
324 |     pub y: u64,
325 | }
326 | 
327 | impl fmt::Display for MM128 {
328 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
329 |         let hash = self.x >> 8;
330 |         let span = (self.x & 0xFF) as u8;
331 |         let rid = (self.y >> 32) as u32;
332 |         let pos = ((self.y & 0xFFFFFFFF) >> 1) as u32;
333 |         let strand = (self.y & 0x1) as u8;
334 |         write!(f, "({}, {}, {}, {}, {})", hash, span, rid, pos, strand)
335 |     }
336 | }
337 | 
338 | pub fn u64hash(key: u64) -> u64 {
339 |     let key = (!key).wrapping_add(key << 21); // key = (key << 21) - key - 1;
340 |     let key = key ^ key >> 24;
341 |     let key = (key.wrapping_add(key << 3)).wrapping_add(key << 8); // key * 265
342 |     let key = key ^ key >> 14;
343 |     let key = (key.wrapping_add(key << 2)).wrapping_add(key << 4); // key * 21
344 |     let key = key ^ key >> 28;
345 |     let key = key.wrapping_add(key << 31);
346 |     key
347 | }
348 | 
349 | fn _u64hash(key: u64) -> u64 {
350 |     let key = !key + (key << 21); // key = (key << 21) - key - 1;
351 |     let key = key ^ key >> 24;
352 |     let key = (key + (key << 3)) + (key << 8); // key * 265
353 |     let key = key ^ key >> 14;
354 |     let key = (key + (key << 2)) + (key << 4); // key * 21
355 |     let key = key ^ key >> 28;
356 |     let key = key + (key << 31);
357 |     key
358 | }
359 | 
360 | pub struct RingBuffer {
361 |     pub v: Vec<MM128>,
362 |     pub size: usize,
363 |     pub start_pos: usize,
364 |     pub end_pos: usize,
365 |     pub len: usize,
366 | }
367 | 
368 | impl RingBuffer {
369 |     pub fn new(size: usize) -> Self {
370 |         let vv = vec![
371 |             MM128 {
372 |                 x: u64::MAX,
373 |                 y: u64::MAX
374 |             };
375 |             size
376 |         ];
377 |         RingBuffer {
378 |             v: vv,
379 |             size: size,
380 |             start_pos: 0,
381 |             end_pos: 0,
382 |             len: 0,
383 |         }
384 |     }
385 | 
386 |     pub fn push(&mut self, m: MM128) {
387 |         if self.len < self.size {
388 |             self.v[self.end_pos] = m;
389 |             self.end_pos += 1;
390 |             self.end_pos %= self.size;
391 |             self.len += 1;
392 |         } else {
393 |             self.v[self.end_pos] = m;
394 |             self.end_pos += 1;
395 |             self.end_pos %= self.size;
396 |             self.start_pos += 1;
397 |             self.start_pos %= self.size;
398 |         }
399 |     }
400 | 
401 |     pub fn _clear(&mut self) {
402 |         self.v.clear();
403 |         self.len = 0;
404 |         self.start_pos = 0;
405 |         self.end_pos = 0;
406 |     }
407 | 
408 |     pub fn get_min(&self) -> MM128 {
409 |         let mut min = MM128 {
410 |             x: u64::MAX,
411 |             y: u64::MAX,
412 |         };
413 |         for i in 0..self.len {
414 |             if self.v[i].x < min.x {
415 |                 min = self.v[i];
416 |             }
417 |         }
418 |         min
419 |     }
420 | 
421 |     pub fn get(&self, i: usize) -> MM128 {
422 |         self.v[(self.start_pos + i) % self.size]
423 |     }
424 | }
425 | 
426 | pub fn reduce_shmmr(mers: Vec<MM128>, r: u32) -> Vec<MM128> {
427 |     let mut shmmrs = Vec::<MM128>::new();
428 |     let mut rbuf = RingBuffer::new(r as usize);
429 |     let mut min_mer = MM128 {
430 |         x: u64::MAX,
431 |         y: u64::MAX,
432 |     };
433 |     let mut pos = 0;
434 |     let mut mdist = 0;
435 |     loop {
436 |         if pos >= mers.len() {
437 |             break;
438 |         }
439 |         let m = mers[pos];
440 |         rbuf.push(m);
441 |         if mdist == (r - 1) as usize {
442 |             min_mer = rbuf.get_min();
443 |             let mut last_i = 0_usize;
444 |             for i in 0..rbuf.size as usize {
445 |                 let mm = rbuf.get(i);
446 |                 if mm.x == min_mer.x {
447 |                     shmmrs.push(mm);
448 |                     min_mer = mm;
449 |                     last_i = i;
450 |                 }
451 |             }
452 |             mdist = r as usize - 1 - last_i;
453 |             pos += 1;
454 |             continue;
455 |         } else if m.x <= min_mer.x && pos >= r as usize {
456 |             shmmrs.push(m);
457 |             min_mer = m;
458 |             mdist = 0;
459 |             pos += 1;
460 |             continue;
461 |         }
462 |         mdist += 1;
463 |         pos += 1;
464 |     }
465 |     shmmrs
466 | }
467 | 
468 | pub fn sequence_to_shmmrs(rid: u32, seq: &Vec<u8>, w: u32, k: u32, r: u32) -> Vec<MM128> {
469 |     //let base2bits: [u64; 4] = [0, 1, 2, 3];
470 | 
471 |     let mut shmmrs = Vec::<MM128>::new();
472 | 
473 |     let mut pos = 0;
474 |     let mut mdist = 0;
475 |     let shift = k - 1;
476 |     assert!(k <= 56);
477 |     assert!(w <= 128);
478 |     assert!(r > 0 && r < 13);
479 |     let mut fmmer = (0_u64, 0_u64);
480 |     let mut rmmer = (0_u64, 0_u64);
481 |     let mask = u64::MAX >> (64 - k);
482 |     let mut rbuf = RingBuffer::new(w as usize);
483 |     let mut min_mer = MM128 {
484 |         x: u64::MAX,
485 |         y: u64::MAX,
486 |     };
487 |     loop {
488 |         if pos >= seq.len() {
489 |             break;
490 |         }
491 |         let c = (seq[pos] & 0b0011) as u64;
492 |         // println!("C {} {} {}", seq[pos], pos, c);
493 |         if seq[pos] & 0b1100 != 0b1100 {
494 |             // Not non-A,C,G,T base
495 |             fmmer.0 <<= 1;
496 |             fmmer.0 |= c & 0b01;
497 |             fmmer.0 &= mask;
498 |             fmmer.1 <<= 1;
499 |             fmmer.1 |= (c & 0b10) >> 1;
500 |             fmmer.1 &= mask;
501 | 
502 |             let rc = 0x03 ^ c;
503 |             rmmer.0 >>= 1;
504 |             rmmer.0 |= (rc & 0b01) << shift;
505 |             rmmer.0 &= mask;
506 |             rmmer.1 >>= 1;
507 |             rmmer.1 |= ((rc & 0b10) >> 1) << shift;
508 |             rmmer.1 &= mask;
509 |         }
510 |         if fmmer == rmmer {
511 |             pos += 1;
512 |             continue;
513 |         }
514 |         if pos < k as usize {
515 |             pos += 1;
516 |             continue;
517 |         }
518 |         let mut forward = true;
519 |         if rmmer.0 < fmmer.0 {
520 |             forward = false;
521 |         }
522 |         let mmer_hash = match forward {
523 |             true => u64hash(fmmer.0) ^ u64hash(fmmer.1) ^ 0x0,
524 |             false => u64hash(rmmer.0) ^ u64hash(rmmer.1) ^ 0x0,
525 |         };
526 |         let strand: u64 = if forward { 0 } else { 1 };
527 |         let m = MM128 {
528 |             x: mmer_hash << 8 | k as u64,
529 |             y: (rid as u64) << 32 | (pos as u64) << 1 | strand,
530 |         };
531 |         rbuf.push(m);
532 |         // println!("MM {} {} {} {}", rid, m, fmmer.0, fmmer.1);
533 |         if mdist == (w - 1) as usize {
534 |             min_mer = rbuf.get_min();
535 |             for i in 0..rbuf.size as usize {
536 |                 let mm = rbuf.get(i);
537 |                 if mm.x == min_mer.x {
538 |                     shmmrs.push(mm);
539 |                     min_mer = mm;
540 |                     // println!("MM0 {} {}", rid, mm);
541 |                 }
542 |             }
543 |             mdist = pos - ((min_mer.y & 0xFFFFFFFF) >> 1) as usize;
544 |             pos += 1;
545 |             continue;
546 |         } else if m.x <= min_mer.x && pos >= w as usize {
547 |             shmmrs.push(m);
548 |             // println!("MM1 {} {}", rid, m);
549 |             min_mer = m;
550 |             mdist = 0;
551 |             pos += 1;
552 |             continue;
553 |         }
554 |         mdist += 1;
555 |         pos += 1;
556 |     }
557 |     let shmmrs = reduce_shmmr(shmmrs, r);
558 |     let shmmrs = reduce_shmmr(shmmrs, r);
559 |     shmmrs
560 | }
561 | 


--------------------------------------------------------------------------------
/src/bin/utils/layout.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | #![allow(dead_code)]
 11 | 
 12 | //
 13 | // take the layout file and generate contig sequence from the layout file
 14 | // it handles the read stitches and consensuse
 15 | //
 16 | 
 17 | use super::shmmrutils::match_reads;
 18 | use super::shmmrutils::sequence_to_shmmrs;
 19 | use super::shmmrutils::{get_2bit_fragment, ReadLocation};
 20 | use memmap::MmapOptions;
 21 | use std::fs::File;
 22 | use std::io::{self, BufRead, Write};
 23 | use std::path::Path;
 24 | 
 25 | use rustc_hash::{FxHashMap, FxHashSet};
 26 | 
 27 | fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
 28 | where
 29 |     P: AsRef<Path>,
 30 | {
 31 |     let file = File::open(filename)?;
 32 |     Ok(io::BufReader::new(file).lines())
 33 | }
 34 | 
 35 | fn get_shmr_offset(s0: &Vec<u8>, s1: &Vec<u8>) -> (u32, u32) {
 36 |     // take two sequences and use the shmmrs to compute/etimate the offset between them
 37 |     // here we take the first hit, it might be useful to use the most common offset
 38 | 
 39 |     let mmer0 = sequence_to_shmmrs(0, &s0, 24, 24, 1);
 40 |     let mmer1 = sequence_to_shmmrs(0, &s1, 24, 24, 1);
 41 |     let mut counter0 = FxHashMap::<u64, u32>::default();
 42 |     for m in mmer0.iter() {
 43 |         let x = m.x >> 8;
 44 |         *counter0.entry(x).or_insert(0) += 1;
 45 |     }
 46 |     let mut hmap0 = FxHashMap::<u64, u32>::default();
 47 |     for m in mmer0.iter() {
 48 |         let x = m.x >> 8;
 49 |         let pos = ((m.y & 0xFFFFFFFF) >> 1) as u32;
 50 |         if *counter0.get(&x).unwrap() == 1 {
 51 |             hmap0.insert(x, pos);
 52 |         }
 53 |     }
 54 | 
 55 |     let (mut offset0, mut offset1) = (0_u32, 0_u32);
 56 |     for m in mmer1 {
 57 |         let x = m.x >> 8;
 58 |         let pos = ((m.y & 0xFFFFFFFF) >> 1) as u32;
 59 |         if hmap0.contains_key(&x) {
 60 |             offset0 = *hmap0.get(&x).unwrap();
 61 |             offset1 = pos;
 62 |             break;
 63 |         }
 64 |     }
 65 |     (offset0, offset1)
 66 | }
 67 | 
 68 | fn get_ctg_cns_with_tiling_reads(
 69 |     tiling_reads: &Vec<(u32, Vec<u8>)>,
 70 |     out_frag_bgn: &Vec<(usize, usize)>,
 71 |     template_seq: &Vec<u8>,
 72 | ) -> (Vec<u8>, f32) {
 73 |     //
 74 |     // generate consensus squence from tiling reads
 75 |     //
 76 |     let mut cov_aux = Vec::<(usize, i32)>::new();
 77 |     let mut s_bgns = FxHashSet::<usize>::default();
 78 |     let mut s_ends = FxHashSet::<usize>::default();
 79 |     let mut delta_count = FxHashMap::<(usize, u8, u8), u32>::default();
 80 |     for i in 0..tiling_reads.len() {
 81 |         let (_rid0, seq0) = tiling_reads.get(i).unwrap();
 82 |         let (bgn, bgn0) = *out_frag_bgn.get(i).unwrap();
 83 |         let len = seq0.len() - bgn0;
 84 |         let end = if bgn + len < template_seq.len() {
 85 |             bgn + len
 86 |         } else {
 87 |             template_seq.len()
 88 |         };
 89 |         let end0 = if bgn0 + len < seq0.len() {
 90 |             bgn + len
 91 |         } else {
 92 |             seq0.len()
 93 |         };
 94 | 
 95 |         let seq_s = template_seq[bgn..end].to_vec();
 96 |         let seq0_s = seq0[bgn0..end0].to_vec();
 97 |         let ovlp = match_reads(&seq_s, &seq0_s, true, 0.02, 1200, 256);
 98 | 
 99 |         if let Some(ovlp) = ovlp {
100 |             //println!("{} {} {} {}", ovlp.bgn0, ovlp.end0, ovlp.bgn1, ovlp.end1);
101 |             let mut dpts = ovlp.deltas.unwrap();
102 |             dpts.reverse();
103 |             let mut d = 0_u8;
104 |             let mut px = 0_usize;
105 |             cov_aux.push((bgn + ovlp.bgn0 as usize, 1));
106 |             cov_aux.push((bgn + ovlp.end0 as usize, -1));
107 |             s_bgns.insert(bgn + ovlp.bgn0 as usize);
108 |             s_ends.insert(bgn + ovlp.end0 as usize);
109 |             for dpt in dpts {
110 |                 let c = if dpt.dk > 0 {
111 |                     b'-'
112 |                 } else {
113 |                     seq0_s[dpt.y as usize - 1]
114 |                 };
115 |                 let cx = dpt.x as usize + bgn - 1;
116 |                 cov_aux.push((cx, 0));
117 |                 if cx != px {
118 |                     d = 0;
119 |                 } else {
120 |                     d += 1;
121 |                 }
122 |                 //println!("D {} {} {}", cx, d, c as char);
123 |                 *delta_count.entry((cx, d, c)).or_insert(0) += 1;
124 |                 px = cx;
125 |             }
126 |         }
127 |     }
128 |     cov_aux.sort();
129 |     let mut dpt_cov = FxHashMap::<usize, u32>::default();
130 |     let mut cov = 0_i32;
131 |     for (p, d) in cov_aux {
132 |         if d != 0 {
133 |             cov += d;
134 |         } else {
135 |             dpt_cov.entry(p).or_insert(cov as u32);
136 |         }
137 |     }
138 | 
139 |     let mut dpt_c = Vec::<(usize, u8, u8)>::new();
140 |     for k in delta_count.keys() {
141 |         let count = delta_count.get(k).unwrap();
142 |         let cov = dpt_cov.get(&k.0).unwrap();
143 |         if *count > (cov >> 1) {
144 |             /*
145 |             println!(
146 |                 "D {} {} {} {} {} {} {}",
147 |                 k.0,
148 |                 k.1,
149 |                 template_seq[k.0] as char,
150 |                 k.2 as char,
151 |                 count,
152 |                 cov,
153 |                 String::from_utf8_lossy(&template_seq[k.0..k.0 + 5].to_vec())
154 |             );
155 |             */
156 |             dpt_c.push(*k);
157 |         }
158 |     }
159 |     dpt_c.sort();
160 | 
161 |     let mut out_seq = Vec::<u8>::new();
162 |     let mut pos_map = Vec::<usize>::new();
163 |     let mut c_bgn = 0_usize;
164 |     for k in dpt_c {
165 |         if k.0 > c_bgn {
166 |             if c_bgn != 0 {
167 |                 c_bgn += 1;
168 |             }
169 |             let c_end = k.0;
170 |             out_seq.extend(template_seq[c_bgn..c_end].to_vec());
171 |             pos_map.extend((c_bgn..c_end).collect::<Vec<usize>>());
172 |             c_bgn = k.0;
173 |         }
174 |         if k.0 == c_bgn {
175 |             if k.1 == 0 && k.2 != b'-' {
176 |                 out_seq.push(template_seq[k.0]);
177 |                 pos_map.push(k.0);
178 |             }
179 |             if k.2 != b'-' {
180 |                 out_seq.push(k.2);
181 |                 pos_map.push(k.0);
182 |             }
183 |         }
184 |     }
185 |     let c_end = template_seq.len();
186 |     out_seq.extend(template_seq[c_bgn..c_end].to_vec());
187 |     pos_map.extend((c_bgn..c_end).collect::<Vec<usize>>());
188 |     let mut cov = 0_u32;
189 |     let mut sidx = 0;
190 |     let mut lq_count = 0_u32;
191 |     let out_seq2 = pos_map
192 |         .iter()
193 |         .map(|&p| {
194 |             if s_bgns.contains(&p) {
195 |                 cov += 1;
196 |                 s_bgns.remove(&p);
197 |             }
198 |             if s_ends.contains(&p) {
199 |                 cov -= 1;
200 |                 s_ends.remove(&p);
201 |             }
202 |             let base = out_seq[sidx];
203 |             sidx += 1;
204 | 
205 |             if cov < 3 {
206 |                 lq_count += 1;
207 |                 base + 4
208 |             } else {
209 |                 base
210 |             }
211 |         })
212 |         .collect::<Vec<u8>>();
213 |     /*
214 |         println!(
215 |         "{:?} {:?} {:?}",
216 |         lq_count,
217 |         out_seq2.len(),
218 |         (lq_count as f32 / out_seq2.len() as f32)
219 |     );
220 |     */
221 |     let r = (lq_count as f32) / (out_seq2.len() as f32);
222 |     (out_seq2, r)
223 | }
224 | 
225 | fn stitch_fragments(
226 |     tiling_reads: &Vec<(u32, Vec<u8>)>,
227 |     match_bng: &FxHashMap<(u32, u32), (u32, u32)>,
228 | ) -> (Vec<u8>, f32) {
229 |     let mut frag_bgn = vec![0_u32; tiling_reads.len()];
230 |     let mut frag_end = vec![0_u32; tiling_reads.len()];
231 |     for i in 0..tiling_reads.len() - 1 {
232 |         let (rid0, seq0) = tiling_reads.get(i).unwrap();
233 |         let (rid1, seq1) = tiling_reads.get(i + 1).unwrap();
234 |         //println!("DBG {} {}", rid0, rid1);
235 |         let (mut bgn0, mut bgn1) = match_bng.get(&(*rid0, *rid1)).unwrap();
236 |         if bgn0 < frag_bgn[i] as u32 {
237 |             let correction = frag_bgn[i] as u32 - bgn0;
238 |             bgn0 += correction;
239 |             bgn1 += correction;
240 |         }
241 |         let end0 = if bgn0 as usize + 100 < seq0.len() {
242 |             bgn0 as usize + 100
243 |         } else {
244 |             seq0.len()
245 |         };
246 |         let end1 = if bgn1 as usize + 100 < seq1.len() {
247 |             bgn1 as usize + 100
248 |         } else {
249 |             seq1.len()
250 |         };
251 |         let seq0_s = seq0[bgn0 as usize..end0].to_vec();
252 |         let seq1_s = seq1[bgn1 as usize..end1].to_vec();
253 |         let (offset0, offset1) = get_shmr_offset(&seq0_s, &seq1_s);
254 |         //println!("F {} {} {} {}", rid0, rid1, offset0, offset1);
255 |         frag_end[i] = bgn0 + offset0;
256 |         frag_bgn[i + 1] = bgn1 + offset1;
257 |     }
258 |     frag_end[tiling_reads.len() - 1] =
259 |         tiling_reads.get(tiling_reads.len() - 1).unwrap().1.len() as u32;
260 | 
261 |     let mut template_seq = Vec::<u8>::new();
262 |     let mut out_frag_bgn = vec![(0_usize, 0_usize); tiling_reads.len()];
263 |     for i in 0..tiling_reads.len() {
264 |         let (_rid0, seq0) = tiling_reads.get(i).unwrap();
265 |         let b = *frag_bgn.get(i).unwrap() as usize;
266 |         let e = *frag_end.get(i).unwrap() as usize;
267 |         out_frag_bgn[i] = (template_seq.len(), b);
268 |         template_seq.extend(seq0[b..e].to_vec());
269 |     }
270 | 
271 |     get_ctg_cns_with_tiling_reads(&tiling_reads, &out_frag_bgn, &template_seq)
272 | }
273 | 
274 | pub fn log_asm_summary(ctg_lengths: Vec<(String, usize)>) -> () {
275 |     let mut lengths = ctg_lengths.iter().map(|x| x.1).collect::<Vec<usize>>();
276 |     lengths.sort();
277 |     lengths.reverse();
278 |     let total_bases: usize = lengths.iter().sum();
279 |     let total_ctgs = lengths.len();
280 |     let mut n50 = 0_usize;
281 |     let mut n90 = 0_usize;
282 |     let mut count_gt_100kb = 0_u32;
283 |     log::info!("Total size: {}", total_bases);
284 |     log::info!("Longest size: {}", lengths.get(0).unwrap_or(&0));
285 |     let mut cumsum = 0_usize;
286 |     for l in lengths {
287 |         cumsum += l;
288 |         if cumsum as f32 > (total_bases as f32 * 0.5) {
289 |             if n50 == 0 {
290 |                 n50 = l;
291 |             }
292 |         }
293 |         if cumsum as f32 > (total_bases as f32 * 0.9) {
294 |             if n90 == 0 {
295 |                 n90 = l;
296 |             }
297 |         }
298 |         if l > 100000 {
299 |             count_gt_100kb += 1;
300 |         }
301 |     }
302 |     log::info!("N50: {}", n50);
303 |     log::info!("N90: {}", n90);
304 |     log::info!("Number of contigs: {}", total_ctgs);
305 |     log::info!("Number of Contigs > 100kb: {}", count_gt_100kb);
306 | }
307 | 
308 | pub fn layout2ctg(
309 |     seqdb_file: &String,
310 |     index_file: &String,
311 |     layout_file: &String,
312 |     output_file_prefix: &String,
313 | ) -> Result<(), io::Error> {
314 |     let mut read_index = Vec::<ReadLocation>::new();
315 | 
316 |     if let Ok(lines) = read_lines(index_file) {
317 |         for line in lines {
318 |             if let Ok(rec) = line {
319 |                 // let rec_trimmed = rec.trim_end();
320 |                 // the record line looks like 000000023 m64062_190803_042216/144/ccs 20359 467415
321 |                 let v: Vec<&str> = rec.split_whitespace().collect();
322 |                 // let rid: u32 = v[0].parse().unwrap();
323 |                 let start: usize = v[3].parse().unwrap();
324 |                 let len: usize = v[2].parse().unwrap();
325 |                 read_index.push(ReadLocation {
326 |                     start: start,
327 |                     len: len,
328 |                 });
329 |                 // println!("{} {} {}", rid, start, len);
330 |             }
331 |         }
332 |     }
333 | 
334 |     let file = File::open(seqdb_file).unwrap();
335 |     let readsdb = unsafe { MmapOptions::new().map(&file).unwrap() };
336 | 
337 |     let mut p_ctg_file = File::create(&format!("{}_m.fa", output_file_prefix)).unwrap();
338 |     let mut a_ctg_file = File::create(&format!("{}_e0.fa", output_file_prefix)).unwrap();
339 |     let mut tiling_reads = Vec::<(u32, Vec<u8>)>::new();
340 |     let mut match_bgn = FxHashMap::<(u32, u32), (u32, u32)>::default();
341 | 
342 |     let mut pre_ctg_id = 0_u32;
343 |     let mut pre_ctg_tag = 'P';
344 |     let mut ctg_lengths = Vec::<(String, usize)>::new();
345 |     let basemap = [b'A', b'C', b'G', b'T', b'a', b'c', b'g', b't'];
346 |     if let Ok(lines) = read_lines(layout_file) {
347 |         for line in lines {
348 |             if let Ok(rec) = line {
349 |                 //let rec_trimmed = rec.trim_end();
350 |                 let v: Vec<&str> = rec.split_whitespace().collect();
351 |                 if v[0] == "P" || v[0] == "D" || v[0] == "A" {
352 |                     if tiling_reads.len() != 0 {
353 |                         let ctg_id = format!("ctg{:06}_{}", pre_ctg_id, pre_ctg_tag);
354 |                         let (bseq, r) = stitch_fragments(&tiling_reads, &match_bgn);
355 |                         if r < 0.6 {
356 |                             let ctg = bseq
357 |                                 .iter()
358 |                                 .map(|&t| basemap[t as usize])
359 |                                 .collect::<Vec<u8>>();
360 |                             match pre_ctg_tag {
361 |                                 'P' => {
362 |                                     let _res = writeln!(p_ctg_file, ">{}", ctg_id);
363 |                                     let _res =
364 |                                         writeln!(p_ctg_file, "{}", String::from_utf8_lossy(&ctg));
365 | 
366 |                                     ctg_lengths.push((ctg_id, ctg.len()));
367 |                                 }
368 |                                 'A' => {
369 |                                     let _res = writeln!(a_ctg_file, ">{}", ctg_id);
370 |                                     let _res =
371 |                                         writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg));
372 |                                 }
373 |                                 'D' => {
374 |                                     let _res = writeln!(a_ctg_file, ">{}", ctg_id);
375 |                                     let _res =
376 |                                         writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg));
377 |                                 }
378 |                                 _ => (),
379 |                             }
380 |                         }
381 |                     }
382 |                     tiling_reads.clear();
383 |                     pre_ctg_tag = v[0].chars().nth(0).unwrap();
384 |                     continue;
385 |                 }
386 |                 if v[0] == "E" {
387 |                     let utg_id: u32 = v[1].parse().unwrap();
388 |                     let rid0: u32 = v[2].parse().unwrap();
389 |                     let strand0: u8 = v[3].parse().unwrap();
390 |                     let rid1: u32 = v[4].parse().unwrap();
391 |                     let strand1: u8 = v[5].parse().unwrap();
392 |                     let bgn0: u32 = v[6].parse().unwrap();
393 |                     let bgn1: u32 = v[7].parse().unwrap();
394 |                     if tiling_reads.len() == 0 {
395 |                         let rloc = read_index[rid0 as usize];
396 |                         let len = rloc.len as u32;
397 |                         let seq_full =
398 |                             get_2bit_fragment(rid0, strand0, 0, len, &readsdb, &read_index);
399 |                         tiling_reads.push((rid0, seq_full));
400 |                     }
401 |                     let rloc = read_index[rid1 as usize];
402 |                     let len = rloc.len as u32;
403 |                     //let seq_frag = get_seq_fragment(rid1, strand1, bgn, end, &mmap, &read_index);
404 |                     let seq_full = get_2bit_fragment(rid1, strand1, 0, len, &readsdb, &read_index);
405 |                     pre_ctg_id = utg_id;
406 |                     tiling_reads.push((rid1, seq_full));
407 |                     match_bgn.insert((rid0, rid1), (bgn0, bgn1));
408 |                 }
409 |             }
410 |         }
411 | 
412 |         if tiling_reads.len() != 0 {
413 |             let (bseq, r) = stitch_fragments(&tiling_reads, &match_bgn);
414 |             if r < 0.6 {
415 |                 let ctg = bseq
416 |                     .iter()
417 |                     .map(|&t| basemap[t as usize])
418 |                     .collect::<Vec<u8>>();
419 |                 let ctg_id = format!("ctg{:06}_{}", pre_ctg_id, pre_ctg_tag);
420 |                 match pre_ctg_tag {
421 |                     'P' => {
422 |                         let _res = writeln!(p_ctg_file, ">{}", ctg_id);
423 |                         let _res = writeln!(p_ctg_file, "{}", String::from_utf8_lossy(&ctg));
424 | 
425 |                         ctg_lengths.push((ctg_id, ctg.len()));
426 |                     }
427 |                     'A' => {
428 |                         let _res = writeln!(a_ctg_file, ">{}", ctg_id);
429 |                         let _res = writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg));
430 |                     }
431 |                     'D' => {
432 |                         let _res = writeln!(a_ctg_file, ">{}", ctg_id);
433 |                         let _res = writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg));
434 |                     }
435 |                     _ => (),
436 |                 }
437 |             }
438 |             tiling_reads.clear();
439 |         }
440 |     }
441 |     log_asm_summary(ctg_lengths);
442 |     Ok(())
443 | }
444 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Peregrine-2021: A faster and minimum genome assembler 
  2 | 
  3 | Peregrine-2021 is an genome assembler designed for long-reads that have good enough accuracy. It is written with the Rust language.
  4 | The main method used in the genome assembler is described in [Human Genome Assembly in 100 Minutes](https://www.biorxiv.org/content/10.1101/705616v1).
  5 | 
  6 | <img src="misc/logo.png" alt="PeregrineLogo" width="120"/>
  7 | 
  8 | ## System requirement: 
  9 | 
 10 | A modern Linux workstation or compute node with enough disk, CPUs, and RAM. It is better to have a good number of CPUs (my testing system has 20 cores) and a good amount of RAM (~ total 1.5x of the reads data set). For example, for 100G sequences, it is probably good to have at least 150G RAM. A smaller amount, e.g., 32G, works, but you will need some manual setup for effective computation.
 11 | ## Some Ballpark Performance Summary 
 12 | 
 13 | With a proper hardware (e.g. ~1Tb RAM), Peregrine-2021 had successful assembled a total 30G diploid genome (2n = 30G) with a contig N50 = 55.2Mb for a large diploid genome. (For who might want to know more details, I ran it as a unpaid service so I don't have much other infomation. [Link to the the graph of the assembly](https://twitter.com/infoecho/status/1330617986185457669?s=20&t=FlHjuWCHslvjxVdyZpU1gQ).)
 14 | 
 15 | For a typical human-size assembly, a much cheaper compute instance with from 128G to 512G RAM can work well. (see this blog [Accelerating genome assembly with AWS Graviton2](https://aws.amazon.com/blogs/publicsector/accelerating-genome-assembly-aws-graviton2/)), and it takes only 2 to 3 hours wall o'clock time to get an assembly. (We also provide a "fast" mode eliminating one error correct stage for perfect reads as input.)
 16 | 
 17 | On the accuracy side, here, we only have some rough numbers from the earlier version compared to other assemblers. Ironically, it could take more effort and resources to do a comprehensive benchmark for a publication than writing the assembler itself. Unfortunately, benchmarking work is a luxury that I currently do not have. However, suppose if anyone is interested in taking a shot running the code fine-tuning the parameters to evaluate the results correctly, in that case, I might help a bit from time to time. 
 18 | 
 19 | ## Usage:
 20 | 
 21 | 1.	Put the paths to the sequence read files (fasta / fasta.gz / fastq / fastq.gz, compressed with the standard gzip but not bgzip) in a file, e.g. `reads.lst`, so the Peregrin-2021 assembler can find the read data. For example, this shows the content of a `reads.lst` file:
 22 | ```
 23 | $ cat seq.lst 
 24 | /wd/CHM13_20k/m64062_190803_042216.fastq.gz
 25 | /wd/CHM13_20k/m64062_190804_172951.fastq.gz
 26 | /wd/CHM13_20k/m64062_190806_063919.fastq.gz
 27 | /wd/CHM13_20k/m64062_190807_194840.fastq.gz
 28 | ```
 29 | 
 30 | 2.	Make sure your have enough disk (preferably SSD storage or high performance network filesystem) for a working directory.  Let’s call the working directory `asm_out`.
 31 | 
 32 | 3.	Execute: `pg_asm reads.lst asm_out` from command line / shell, some potentially useful intermediate files and the assembled contigs will be in the directory `asm_out/`
 33 | 
 34 | ```
 35 | $ pg_asm seq.lst asm_out >& log &
 36 | ```
 37 | 
 38 | 4.	There are a number of options that you can try to tune for optimizing the assembly results. Here is the full usage information of `pg_asm`.
 39 | 
 40 | ```
 41 | ❯ pg_asm --help
 42 | pg_asm peregrine-2021 0.4.9 (arm_config:58e666e+, release build, linux [x86_64] [rustc 1.58.0 (02072b482 2022-01-11)])
 43 | Jason Chin <jason@omnibio.ai>
 44 | 
 45 | Peregrine-2021 genome assembler,
 46 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/
 47 | 
 48 | USAGE:
 49 |     pg_asm [FLAGS] [OPTIONS] <input_reads> <work_dir> [ARGS]
 50 | 
 51 | FLAGS:
 52 |         --fast          run the assembler in the fast mode
 53 |     -h, --help          Prints help information
 54 |         --keep          keep intermediate files
 55 |         --no_resolve    disable resolving repeats / dups at the end
 56 |     -V, --version       Prints version information
 57 | 
 58 | OPTIONS:
 59 |     -b, --bestn <bestn>              number of best overlaps for initial graph [default: 6]
 60 |     -k <k>                           Kmer size [default: 56]
 61 |     -l <layout_method>               layout version [default: 2]
 62 |         --log <log>                  log level: DBBUG or INFO (default)
 63 |     -c, --min_ec_cov <min_ec_cov>    Minimum error coverage [default: 1]
 64 |     -r <r>                           Reduction factor [default: 6]
 65 |     -t, --tol <tol>                  Alignment tolerance [default: 0.01]
 66 |     -w <w>                           Window size [default: 80]
 67 | 
 68 | ARGS:
 69 |     <input_reads>    Path to a file that contains the list of reads in .fa .fa.gz .fastq or fastq.gz formats
 70 |     <work_dir>       The path to a work directory for intermediate files and the results
 71 |     <NTHREADS>       Number of threads
 72 |     <NCHUNKS>        Number of partition
 73 | ```
 74 | 
 75 | You can reduce the `Reduction factor` and `Window Size` to increase the sensitivity to 
 76 | detect overlaps. I found `-r 4 -w 64` may be better for human assembly than the default
 77 | parameters. 
 78 | 
 79 | ## Example Results: (`v0.2.0, main:6d91294, release build, linux [x86_64]`)
 80 | 
 81 | ### Input data set: Human CHM13 dataset
 82 | 
 83 | - 5,567,153 reads, average length = 18,028 bp, about 30x human genome size
 84 | 
 85 | - Systme: 20 core Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz, 512G ram
 86 | 
 87 | 
 88 | ### Run Peregrine-2021 in Fast-mode (without read level error correction)
 89 |  
 90 | 
 91 | #### CPU time for the fast-mode:
 92 | ```
 93 | usr: 152025s = 42.5 cpu hours, 
 94 | 
 95 | sys: 900s = 0.25 cpu hours, 
 96 | 
 97 | wall clock time = 2:45:39
 98 | ``` 
 99 | (In contrast, it takes [HiCanu](https://www.biorxiv.org/content/10.1101/2020.03.14.992248v3) > 4000 CPU hour to get an assembly)
100 | 
101 | #### Assembly Summary Statistics with the fast-mode:
102 | 
103 | ```
104 | total size: 3,034,243,471
105 | max size: 201,005,110
106 | N50 size: 81,361,265
107 | N90 size: 17,301,175
108 | Number of Contigs: 237
109 | Number of Contigs > 100kb: 151
110 | ```
111 | 
112 | #### CHM13 BAC evaluation result with the fast-mode setting:
113 | 
114 | ```
115 | ******************* BAC SUMMARY ******************
116 |  TOTAL    : 341
117 |  BP       : 51532183
118 | ************** Statistics for: _asm_p_ctg-2.fa ****************
119 | BACs closed: 327 (95.8944); BACs attempted: 338 %good = 96.7456; BASES 49454907 (95.969)
120 | Median:          99.9844
121 | MedianQV:        38.06875
122 | Mean:            99.93617
123 | MeanQV:          31.94969
124 | ***** STATS IGNORING INDELS ********************
125 | Median:          100
126 | MedianQV:        Inf
127 | Mean:            99.99177
128 | MeanQV:          40.84457
129 | **********************************************
130 | ```
131 | (This [HiCanu paper preprint](https://www.biorxiv.org/content/10.1101/2020.03.14.992248v3) reported resolving only 326 out of the 341 BAC BACs.)
132 | 
133 | #### CHM13 BAC evaluation result with the fast-mode setting but without final contig level consensus:
134 | (This pretty much just reflects the quality of the input reads.)
135 | 
136 | ```
137 | ******************* BAC SUMMARY ******************
138 |  TOTAL    : 341
139 |  BP       : 51532183
140 | ************** Statistics for: asm-p_cns-nocns_fast.fa ****************
141 | BACs closed: 327 (95.8944); BACs attempted: 339 %good = 96.4602; BASES 49454907 (95.969)
142 | Median:          99.7163 
143 | MedianQV:        25.47141 
144 | Mean:            99.67459 
145 | MeanQV:          24.87566 
146 | ***** STATS IGNORING INDELS ********************
147 | Median:          99.9782 
148 | MedianQV:        36.61544 
149 | Mean:            99.97005 
150 | MeanQV:          35.23557 
151 | **********************************************
152 | ```
153 | 
154 | ### Run Peregrine-2021 in Standard-mode (one round of the read level error correction)
155 | 
156 | #### CPU time for the standard setting which involves one round of read level error correction: 
157 | 
158 | ```
159 | usr: 355257s = 98.7 cpu hours,  
160 | 
161 | sys: 2052s = 0.6 cpu hours, 
162 |   
163 | wall clock time = 5:47:23
164 | ```
165 | (In contrast, it takes [HiCanu](https://www.biorxiv.org/content/10.1101/2020.03.14.992248v3) > 4000 CPU hour to get an assembly)
166 | 
167 | #### Assembly Summary Statistics of the standard Setting:
168 | 
169 | ```
170 | total size: 3,039,592,838
171 | max size: 142,204,433
172 | N50 size: 83,143,579
173 | N90 size: 16,250,509
174 | Number of Contigs: 227
175 | Number of Contigs > 100kb: 157
176 | ```
177 | 
178 | #### CHM13 BAC evaluation result:
179 | 
180 | ```
181 | ******************* BAC SUMMARY ******************
182 |  TOTAL    : 341
183 |  BP       : 51532183
184 | ************** Statistics for: _asm_p_ctg.fa ****************
185 | BACs closed: 330 (96.7742); BACs attempted: 341 %good = 96.7742; BASES 49874385 (96.783)
186 | Median:          99.9911
187 | MedianQV:        40.5061
188 | Mean:            99.94317
189 | MeanQV:          32.45434
190 | ***** STATS IGNORING INDELS ********************
191 | Median:          100
192 | MedianQV:        Inf
193 | Mean:            99.99405
194 | MeanQV:          42.25218
195 | **********************************************
196 | ```
197 | #### CHM13 BAC evaluation result without final contig level consensus:
198 | 
199 | ```
200 | ******************* BAC SUMMARY ******************
201 |  TOTAL    : 341
202 |  BP       : 51532183
203 | ************** Statistics for: asm-p_cns-nocns.fa ****************
204 | BACs closed: 330 (96.7742); BACs attempted: 341 %good = 96.7742; BASES 49874385 (96.783)
205 | Median:          99.9832
206 | MedianQV:        37.74691
207 | Mean:            99.93411
208 | MeanQV:          31.81188
209 | ***** STATS IGNORING INDELS ********************
210 | Median:          99.9994
211 | MedianQV:        52.21849
212 | Mean:            99.9927
213 | MeanQV:          41.36713
214 | **********************************************
215 | ```
216 | 
217 |   
218 | ## Some FAQs: 
219 | 
220 | * Q: Why do you write a new assembler? There are already many others (FALCON, HifiAsm, Flye, Shasta, HiCanu, etc.).
221 | 
222 |     A: We demonstrated that it was possible to use Spare Hierarchical MiniMiER (SHIMMER) to assemble a human-size genome with 100 wall clock minutes with the Peregrine genome assembler. A standard approach filters out high duplicated sequences to get the significant part of a gnome assembled. And, most assemblers adapt some repeat filtering schemes to make the run time for genome assembly acceptable while keeping the most helpful information of a genome. 
223 | 
224 |     However, while it was reasonable to do a repeat-suppressing assembly, the narrative about what an assembler should do is changing. Genomic researchers may want to get more about the repeat even if it needs additional compute power/energy ( > 20x for a human genome compared to a Peregrine run) to get the repetitive sequences in a genome. I think it is worth showing the same technique that we used in the original Peregrine-2021 can also get the high repeat content assembled with only moderate increases of computation cost/energy.
225 | 
226 |     Another reason for this Peregrine-2021 assembler is that we were not happy with the C / Python hybrid approach used in the original Peregrine assembler. While C / Python combination is very efficient for rapid development, it has too many caveats. It could be interesting to learn something new as well. Following Richard Feynman's wisdom, "What I cannot create, I don't understand." To better understand how to handle repeats and understand the Rust programming language, I created Peregrine-2021 from late-2020 to mid-2021. I want to push Peregrine-2021 to the next stage to apply it to genomics research work; unfortunately, I realized it is too demanding to take this as a hobby or a side-gig. Given that I won't be able to push it too far by myself, it might still be helpful for others who are interested in using this. I finally decide to release the code for non-commercial use.
227 | 
228 | * Q: I can't run it, can you help?
229 | 
230 |     A: it depends. If it is something that is straight forward, I am happy to help. If it is more involved, then I simply can not do it as I only have limited resources. 
231 | 
232 | * Q: The results are bad, do I do something wrong?
233 |   
234 |     A: Possiblely. Like all other assemblers, Peregrine-2021 is design with some sepcifications in mind and test accordingly. Depending on the input data and the parameters used to run the assemblers, one might be able to improve the results. Unfortunately, there is no universal simple answer for how to investigate and improve now. It can be some trivial mistakes or very intensive investigation that related to inital sequence methods or even the genome biology itself. 
235 | 
236 | * Q: I use it and generate some good results for publication, how do I cite it?
237 | 
238 |     A: While Peregrine-2021 is mostly based on the original ideas described in the preprint [Human Genome Assembly in 100 Minutes](https://www.biorxiv.org/content/10.1101/705616v1), it is a different codebase. I currently have no plan to write up another manuscript specifically for peregrine-2021. Yes, please cite the preprint if you find it is useful.
239 | 
240 | * Q: Is any published work using Peregrine-2021?
241 | 
242 |     A: Check [this](https://scholar.google.com/scholar?cites=12093836825307648052). Most citations are for benchmarking purposes (of the older version), and some are for the related ideas. There are a couple of papers using it for generating results. As this is mostly a "hobby" project which lacks resources for promoting its usage, I am grateful that the Chief Scientific Officier of Medicinal Genomics, Keven McKernan, provides data sets for me to test it and push the results into papers.
243 | 
244 |     * [A draft reference assembly of the Psilocybe cubensis genome](https://www.ncbi.nlm.nih.gov/labs/pmc/articles/PMC8220353/) 
245 | 
246 |     * [Cannabis Genome: Jamaican Lion strain ](https://www.medicinalgenomics.com/jamaican-lion-data-release/), Note Peregrine-2021 assembled thie genome in an old Mac Pro with only 64G RAM.
247 | 
248 | * Q: I don't have a large memory machine, how do I run Peregrine-2021?
249 | 
250 |     A: For efficiency, it will be great to put all sequence data and the smaller index data in the RAM. Currently, my suggestion is that the RAM should be about 1.5x of the total sequence data. However, the data is accessed through the memory-mapped file (MMAP) mechanism, and the chunking machinery for parallelization can help if one does not have a big memory machine. The code can access the data from the disk through the MMAP file. In such a case, high efficient NVME SSD will help. I had successfully assembled a human genome using a 32G RAM machine. However, I won't recommend that is the right way to go, given that a medium-size RAM machine is relatively cheap to rent now.     
251 | 
252 | * Q: Can you write better Rust code?
253 | 
254 |     A: I guess I could, but I was literally learning Rust and developing new algorithms at the same time. Rust is a big language, there is still a lot to learn
255 | 
256 | * Q: Why do you choose CC BY-NC-SA 4.0 license to release the code?
257 | 
258 |     A: oh, that will be a fire side or beer hour conversation.
259 | ## How does Peregrine-2021 work?
260 | 	
261 | Peregrine-2021 is still just another OLC assembler. 
262 | 
263 | 1. It uses the SHIMMER index for finding overlap candidates as described in the Peregrine preprint but it is more aggressive for overlapping repetitive reads rather than filtering them out.
264 | 
265 | 2. The overlaper performs analysis to identify read overlaps within the same haplotype. We did not use a de-Bruijn graph approach for this. We think our method is likely more computational efficient than using a de-Bruijn graph to separate haplotypes.
266 | 
267 | 3. It adapts the techniques using partial homopolymer compression for separating the reads from different haplotype.
268 | 
269 | ## Build For X86_64
270 | 
271 | 1. Check [Rust Installation](https://www.rust-lang.org/tools/install)
272 | 
273 | 2. Run [`cargo install`](https://doc.rust-lang.org/cargo/commands/cargo-install.html) or [`cargo build`](https://doc.rust-lang.org/cargo/commands/cargo-build.html). Make sure you set up the environment variable `PATH` to the directory of the built binaries or you can run the excutable `pg_asm` with full path. If you use `cargo build`, make sure you compile it with the `--release` option for optimization.
274 | 
275 | 3. `pg_asm` will run the assembly pipeline end-to-end. If it fails, it does not re-use the existing data when one runs `pg_asm` again. The assemblers is much faster than other assemblers, so it is less important to re-use intermidate data. That has been said, the built will contains executables (e.g. `pg_build_idx`, `pg_ovlp`, etc.) for each assembly steps which one can chain them together with their favorite workflow engine for re-using and re-starting an assembly pipeline.
276 | 
277 | 4. A Dockerfile is provided for creating a Docker image. It also provide information to build the assembler from a clean environment.
278 | 
279 | 5. To compile in aarch64, it will need some configuration changes to get the best performance. The memory alloctor package needs to be patched for aarch64. See [https://github.com/cschin/mimalloc_rust/tree/aarch64_build](https://github.com/cschin/mimalloc_rust/tree/aarch64_build).
280 | 
281 | ## Other utility command line tools
282 | 
283 | ```
284 | pg_build_sdb  # convert fasta/fastq/fasta.gz/fastq.gz read data into a simple binary data base for the assembler to fetch the reads. 
285 | 
286 | pg_build_idx  # build the SHIMMER index from the reads for overlapping
287 | 
288 | pg_build_sdb  # build the sequence database
289 | 
290 | pg_dedup      # perform all contigs to all contigs alignment to remove duplicates 
291 | 
292 | pg_dp_graph   # take overlap data file as input to generate the layout file using a polyploid aware layout algorithm
293 | 
294 | pg_getreads   # generate fasta file for a subset of reads from the sequence database
295 | 
296 | pg_graph      # (obsoleted) convert the overlap information between the reads into an assembly group
297 | 
298 | pg_layout     # convert the assembly graph to paths and generate the contig fasta file
299 | 
300 | pg_ovlp_ec    # perform error correction from the haplotype specific overlaps
301 | 
302 | pg_ovlp       # generate haplotype specific overlaps between the reads
303 | 
304 | pg_resolve    # this tool aligns all contigs to themselve to identify haplotype-related contigs
305 | ```
306 | 
307 | --
308 | Jason Chin (twitter: @infoecho)
309 | 
310 | first version: Nov. 16, 2020
311 | 
312 | current version: Fab. 5, 2022
313 | 
314 | 
315 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Peregrine Assembler and SHIMMER Genome Assembly Toolkit 
  2 | 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | 
  4 | This Source Code Form is subject to the terms of the Creative Commons 
  5 | Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | 
  7 | You should have received a copy of the license along with this
  8 | work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | =======================================================================
 10 | 
 11 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
 12 | Public License
 13 | 
 14 | By exercising the Licensed Rights (defined below), You accept and agree
 15 | to be bound by the terms and conditions of this Creative Commons
 16 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
 17 | ("Public License"). To the extent this Public License may be
 18 | interpreted as a contract, You are granted the Licensed Rights in
 19 | consideration of Your acceptance of these terms and conditions, and the
 20 | Licensor grants You such rights in consideration of benefits the
 21 | Licensor receives from making the Licensed Material available under
 22 | these terms and conditions.
 23 | 
 24 | 
 25 | Section 1 -- Definitions.
 26 | 
 27 |   a. Adapted Material means material subject to Copyright and Similar
 28 |      Rights that is derived from or based upon the Licensed Material
 29 |      and in which the Licensed Material is translated, altered,
 30 |      arranged, transformed, or otherwise modified in a manner requiring
 31 |      permission under the Copyright and Similar Rights held by the
 32 |      Licensor. For purposes of this Public License, where the Licensed
 33 |      Material is a musical work, performance, or sound recording,
 34 |      Adapted Material is always produced where the Licensed Material is
 35 |      synched in timed relation with a moving image.
 36 | 
 37 |   b. Adapter's License means the license You apply to Your Copyright
 38 |      and Similar Rights in Your contributions to Adapted Material in
 39 |      accordance with the terms and conditions of this Public License.
 40 | 
 41 |   c. BY-NC-SA Compatible License means a license listed at
 42 |      creativecommons.org/compatiblelicenses, approved by Creative
 43 |      Commons as essentially the equivalent of this Public License.
 44 | 
 45 |   d. Copyright and Similar Rights means copyright and/or similar rights
 46 |      closely related to copyright including, without limitation,
 47 |      performance, broadcast, sound recording, and Sui Generis Database
 48 |      Rights, without regard to how the rights are labeled or
 49 |      categorized. For purposes of this Public License, the rights
 50 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
 51 |      Rights.
 52 | 
 53 |   e. Effective Technological Measures means those measures that, in the
 54 |      absence of proper authority, may not be circumvented under laws
 55 |      fulfilling obligations under Article 11 of the WIPO Copyright
 56 |      Treaty adopted on December 20, 1996, and/or similar international
 57 |      agreements.
 58 | 
 59 |   f. Exceptions and Limitations means fair use, fair dealing, and/or
 60 |      any other exception or limitation to Copyright and Similar Rights
 61 |      that applies to Your use of the Licensed Material.
 62 | 
 63 |   g. License Elements means the license attributes listed in the name
 64 |      of a Creative Commons Public License. The License Elements of this
 65 |      Public License are Attribution, NonCommercial, and ShareAlike.
 66 | 
 67 |   h. Licensed Material means the artistic or literary work, database,
 68 |      or other material to which the Licensor applied this Public
 69 |      License.
 70 | 
 71 |   i. Licensed Rights means the rights granted to You subject to the
 72 |      terms and conditions of this Public License, which are limited to
 73 |      all Copyright and Similar Rights that apply to Your use of the
 74 |      Licensed Material and that the Licensor has authority to license.
 75 | 
 76 |   j. Licensor means the individual(s) or entity(ies) granting rights
 77 |      under this Public License.
 78 | 
 79 |   k. NonCommercial means not primarily intended for or directed towards
 80 |      commercial advantage or monetary compensation. For purposes of
 81 |      this Public License, the exchange of the Licensed Material for
 82 |      other material subject to Copyright and Similar Rights by digital
 83 |      file-sharing or similar means is NonCommercial provided there is
 84 |      no payment of monetary compensation in connection with the
 85 |      exchange.
 86 | 
 87 |   l. Share means to provide material to the public by any means or
 88 |      process that requires permission under the Licensed Rights, such
 89 |      as reproduction, public display, public performance, distribution,
 90 |      dissemination, communication, or importation, and to make material
 91 |      available to the public including in ways that members of the
 92 |      public may access the material from a place and at a time
 93 |      individually chosen by them.
 94 | 
 95 |   m. Sui Generis Database Rights means rights other than copyright
 96 |      resulting from Directive 96/9/EC of the European Parliament and of
 97 |      the Council of 11 March 1996 on the legal protection of databases,
 98 |      as amended and/or succeeded, as well as other essentially
 99 |      equivalent rights anywhere in the world.
100 | 
101 |   n. You means the individual or entity exercising the Licensed Rights
102 |      under this Public License. Your has a corresponding meaning.
103 | 
104 | 
105 | Section 2 -- Scope.
106 | 
107 |   a. License grant.
108 | 
109 |        1. Subject to the terms and conditions of this Public License,
110 |           the Licensor hereby grants You a worldwide, royalty-free,
111 |           non-sublicensable, non-exclusive, irrevocable license to
112 |           exercise the Licensed Rights in the Licensed Material to:
113 | 
114 |             a. reproduce and Share the Licensed Material, in whole or
115 |                in part, for NonCommercial purposes only; and
116 | 
117 |             b. produce, reproduce, and Share Adapted Material for
118 |                NonCommercial purposes only.
119 | 
120 |        2. Exceptions and Limitations. For the avoidance of doubt, where
121 |           Exceptions and Limitations apply to Your use, this Public
122 |           License does not apply, and You do not need to comply with
123 |           its terms and conditions.
124 | 
125 |        3. Term. The term of this Public License is specified in Section
126 |           6(a).
127 | 
128 |        4. Media and formats; technical modifications allowed. The
129 |           Licensor authorizes You to exercise the Licensed Rights in
130 |           all media and formats whether now known or hereafter created,
131 |           and to make technical modifications necessary to do so. The
132 |           Licensor waives and/or agrees not to assert any right or
133 |           authority to forbid You from making technical modifications
134 |           necessary to exercise the Licensed Rights, including
135 |           technical modifications necessary to circumvent Effective
136 |           Technological Measures. For purposes of this Public License,
137 |           simply making modifications authorized by this Section 2(a)
138 |           (4) never produces Adapted Material.
139 | 
140 |        5. Downstream recipients.
141 | 
142 |             a. Offer from the Licensor -- Licensed Material. Every
143 |                recipient of the Licensed Material automatically
144 |                receives an offer from the Licensor to exercise the
145 |                Licensed Rights under the terms and conditions of this
146 |                Public License.
147 | 
148 |             b. Additional offer from the Licensor -- Adapted Material.
149 |                Every recipient of Adapted Material from You
150 |                automatically receives an offer from the Licensor to
151 |                exercise the Licensed Rights in the Adapted Material
152 |                under the conditions of the Adapter's License You apply.
153 | 
154 |             c. No downstream restrictions. You may not offer or impose
155 |                any additional or different terms or conditions on, or
156 |                apply any Effective Technological Measures to, the
157 |                Licensed Material if doing so restricts exercise of the
158 |                Licensed Rights by any recipient of the Licensed
159 |                Material.
160 | 
161 |        6. No endorsement. Nothing in this Public License constitutes or
162 |           may be construed as permission to assert or imply that You
163 |           are, or that Your use of the Licensed Material is, connected
164 |           with, or sponsored, endorsed, or granted official status by,
165 |           the Licensor or others designated to receive attribution as
166 |           provided in Section 3(a)(1)(A)(i).
167 | 
168 |   b. Other rights.
169 | 
170 |        1. Moral rights, such as the right of integrity, are not
171 |           licensed under this Public License, nor are publicity,
172 |           privacy, and/or other similar personality rights; however, to
173 |           the extent possible, the Licensor waives and/or agrees not to
174 |           assert any such rights held by the Licensor to the limited
175 |           extent necessary to allow You to exercise the Licensed
176 |           Rights, but not otherwise.
177 | 
178 |        2. Patent and trademark rights are not licensed under this
179 |           Public License.
180 | 
181 |        3. To the extent possible, the Licensor waives any right to
182 |           collect royalties from You for the exercise of the Licensed
183 |           Rights, whether directly or through a collecting society
184 |           under any voluntary or waivable statutory or compulsory
185 |           licensing scheme. In all other cases the Licensor expressly
186 |           reserves any right to collect such royalties, including when
187 |           the Licensed Material is used other than for NonCommercial
188 |           purposes.
189 | 
190 | 
191 | Section 3 -- License Conditions.
192 | 
193 | Your exercise of the Licensed Rights is expressly made subject to the
194 | following conditions.
195 | 
196 |   a. Attribution.
197 | 
198 |        1. If You Share the Licensed Material (including in modified
199 |           form), You must:
200 | 
201 |             a. retain the following if it is supplied by the Licensor
202 |                with the Licensed Material:
203 | 
204 |                  i. identification of the creator(s) of the Licensed
205 |                     Material and any others designated to receive
206 |                     attribution, in any reasonable manner requested by
207 |                     the Licensor (including by pseudonym if
208 |                     designated);
209 | 
210 |                 ii. a copyright notice;
211 | 
212 |                iii. a notice that refers to this Public License;
213 | 
214 |                 iv. a notice that refers to the disclaimer of
215 |                     warranties;
216 | 
217 |                  v. a URI or hyperlink to the Licensed Material to the
218 |                     extent reasonably practicable;
219 | 
220 |             b. indicate if You modified the Licensed Material and
221 |                retain an indication of any previous modifications; and
222 | 
223 |             c. indicate the Licensed Material is licensed under this
224 |                Public License, and include the text of, or the URI or
225 |                hyperlink to, this Public License.
226 | 
227 |        2. You may satisfy the conditions in Section 3(a)(1) in any
228 |           reasonable manner based on the medium, means, and context in
229 |           which You Share the Licensed Material. For example, it may be
230 |           reasonable to satisfy the conditions by providing a URI or
231 |           hyperlink to a resource that includes the required
232 |           information.
233 |        3. If requested by the Licensor, You must remove any of the
234 |           information required by Section 3(a)(1)(A) to the extent
235 |           reasonably practicable.
236 | 
237 |   b. ShareAlike.
238 | 
239 |      In addition to the conditions in Section 3(a), if You Share
240 |      Adapted Material You produce, the following conditions also apply.
241 | 
242 |        1. The Adapter's License You apply must be a Creative Commons
243 |           license with the same License Elements, this version or
244 |           later, or a BY-NC-SA Compatible License.
245 | 
246 |        2. You must include the text of, or the URI or hyperlink to, the
247 |           Adapter's License You apply. You may satisfy this condition
248 |           in any reasonable manner based on the medium, means, and
249 |           context in which You Share Adapted Material.
250 | 
251 |        3. You may not offer or impose any additional or different terms
252 |           or conditions on, or apply any Effective Technological
253 |           Measures to, Adapted Material that restrict exercise of the
254 |           rights granted under the Adapter's License You apply.
255 | 
256 | 
257 | Section 4 -- Sui Generis Database Rights.
258 | 
259 | Where the Licensed Rights include Sui Generis Database Rights that
260 | apply to Your use of the Licensed Material:
261 | 
262 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
263 |      to extract, reuse, reproduce, and Share all or a substantial
264 |      portion of the contents of the database for NonCommercial purposes
265 |      only;
266 | 
267 |   b. if You include all or a substantial portion of the database
268 |      contents in a database in which You have Sui Generis Database
269 |      Rights, then the database in which You have Sui Generis Database
270 |      Rights (but not its individual contents) is Adapted Material,
271 |      including for purposes of Section 3(b); and
272 | 
273 |   c. You must comply with the conditions in Section 3(a) if You Share
274 |      all or a substantial portion of the contents of the database.
275 | 
276 | For the avoidance of doubt, this Section 4 supplements and does not
277 | replace Your obligations under this Public License where the Licensed
278 | Rights include other Copyright and Similar Rights.
279 | 
280 | 
281 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
282 | 
283 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
284 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
285 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
286 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
287 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
288 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
289 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
290 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
291 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
292 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
293 | 
294 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
295 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
296 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
297 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
298 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
299 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
300 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
301 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
302 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
303 | 
304 |   c. The disclaimer of warranties and limitation of liability provided
305 |      above shall be interpreted in a manner that, to the extent
306 |      possible, most closely approximates an absolute disclaimer and
307 |      waiver of all liability.
308 | 
309 | 
310 | Section 6 -- Term and Termination.
311 | 
312 |   a. This Public License applies for the term of the Copyright and
313 |      Similar Rights licensed here. However, if You fail to comply with
314 |      this Public License, then Your rights under this Public License
315 |      terminate automatically.
316 | 
317 |   b. Where Your right to use the Licensed Material has terminated under
318 |      Section 6(a), it reinstates:
319 | 
320 |        1. automatically as of the date the violation is cured, provided
321 |           it is cured within 30 days of Your discovery of the
322 |           violation; or
323 | 
324 |        2. upon express reinstatement by the Licensor.
325 | 
326 |      For the avoidance of doubt, this Section 6(b) does not affect any
327 |      right the Licensor may have to seek remedies for Your violations
328 |      of this Public License.
329 | 
330 |   c. For the avoidance of doubt, the Licensor may also offer the
331 |      Licensed Material under separate terms or conditions or stop
332 |      distributing the Licensed Material at any time; however, doing so
333 |      will not terminate this Public License.
334 | 
335 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
336 |      License.
337 | 
338 | 
339 | Section 7 -- Other Terms and Conditions.
340 | 
341 |   a. The Licensor shall not be bound by any additional or different
342 |      terms or conditions communicated by You unless expressly agreed.
343 | 
344 |   b. Any arrangements, understandings, or agreements regarding the
345 |      Licensed Material not stated herein are separate from and
346 |      independent of the terms and conditions of this Public License.
347 | 
348 | 
349 | Section 8 -- Interpretation.
350 | 
351 |   a. For the avoidance of doubt, this Public License does not, and
352 |      shall not be interpreted to, reduce, limit, restrict, or impose
353 |      conditions on any use of the Licensed Material that could lawfully
354 |      be made without permission under this Public License.
355 | 
356 |   b. To the extent possible, if any provision of this Public License is
357 |      deemed unenforceable, it shall be automatically reformed to the
358 |      minimum extent necessary to make it enforceable. If the provision
359 |      cannot be reformed, it shall be severed from this Public License
360 |      without affecting the enforceability of the remaining terms and
361 |      conditions.
362 | 
363 |   c. No term or condition of this Public License will be waived and no
364 |      failure to comply consented to unless expressly agreed to by the
365 |      Licensor.
366 | 
367 |   d. Nothing in this Public License constitutes or may be interpreted
368 |      as a limitation upon, or waiver of, any privileges and immunities
369 |      that apply to the Licensor or You, including from the legal
370 |      processes of any jurisdiction or authority.
371 | 
372 | 


--------------------------------------------------------------------------------
/src/bin/utils/graph_analysis.rs:
--------------------------------------------------------------------------------
  1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit
  2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
  3 | //
  4 | // This Source Code Form is subject to the terms of the
  5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
  6 | //
  7 | // You should have received a copy of the license along with this
  8 | // work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
  9 | 
 10 | #![allow(dead_code)]
 11 | 
 12 | //
 13 | // define the overlap and graph data structure and lower level graph processing utility functions
 14 | //
 15 | 
 16 | use petgraph::graphmap::DiGraphMap;
 17 | use petgraph::visit::Bfs;
 18 | use petgraph::Direction::{Incoming, Outgoing};
 19 | use rustc_hash::FxHashMap;
 20 | use rustc_hash::FxHashSet;
 21 | 
 22 | use std::fs::File;
 23 | use std::io::{self, BufWriter, Write};
 24 | 
 25 | pub type U32AsmGraph = DiGraphMap<(u32, u8), u32>;
 26 | pub type OvlpGraph = U32AsmGraph;
 27 | pub type UtgGraph = U32AsmGraph;
 28 | pub type ReadNode = (u32, u8);
 29 | pub type OvlpEdge = (ReadNode, ReadNode);
 30 | 
 31 | #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)]
 32 | pub struct ReadPair {
 33 |     pub rid0: u32,
 34 |     pub strand0: u8,
 35 |     pub rid1: u32,
 36 |     pub strand1: u8,
 37 | }
 38 | 
 39 | impl ReadPair {
 40 |     pub fn new(v: (u32, u8), w: (u32, u8)) -> Self {
 41 |         ReadPair {
 42 |             rid0: v.0,
 43 |             strand0: v.1,
 44 |             rid1: w.0,
 45 |             strand1: w.1,
 46 |         }
 47 |     }
 48 |     pub fn _to_str(&self) -> String {
 49 |         format!(
 50 |             "{} {} {} {}",
 51 |             self.rid0, self.strand0, self.rid1, self.strand1
 52 |         )
 53 |     }
 54 | 
 55 |     pub fn reverse(&self) -> ReadPair {
 56 |         ReadPair {
 57 |             rid0: self.rid1,
 58 |             strand0: 1 - self.strand1,
 59 |             rid1: self.rid0,
 60 |             strand1: 1 - self.strand0,
 61 |         }
 62 |     }
 63 | }
 64 | 
 65 | #[derive(Debug, Copy, Clone)]
 66 | pub struct Overlap {
 67 |     // main struct for keeping read overlap
 68 |     pub rid0: u32,
 69 |     pub rid1: u32,
 70 |     pub strand1: u8,
 71 |     pub len0: u32,
 72 |     pub len1: u32,
 73 |     pub d_left: i32,
 74 |     pub d_right: i32,
 75 |     pub bgn0: u32,
 76 |     pub end0: u32,
 77 |     pub bgn1: u32,
 78 |     pub end1: u32,
 79 |     pub dist: u32,
 80 |     // dist: raw distice determine by the O(Nd) alignment algorithm
 81 |     pub idt: f32,
 82 |     pub dist_c: u32,
 83 |     // dist_c: "distance" after hp corrections
 84 |     pub max_dist_c: u32,
 85 |     pub idt_c: f32,
 86 |     pub flag: u8,
 87 |     // flag bit field, Not used now 2020/11/02
 88 |     // 0x01: the rid0 is chimer
 89 |     // 0x02: the rir0 and rid1 are compatitable pair
 90 |     // 0x04: the rid1 is the best right pair
 91 |     // 0x08: the rid1 is the best left pair
 92 |     // 0x10: the rid1 is a chimer
 93 |     // 0x20: the rid1 is contained
 94 |     // 0x40: the rid0 is contained
 95 | }
 96 | 
 97 | impl Overlap {
 98 |     pub fn _new() -> Self {
 99 |         Self {
100 |             rid0: 0,
101 |             rid1: 0,
102 |             strand1: 0,
103 |             len0: 0,
104 |             len1: 0,
105 |             d_left: 0,
106 |             d_right: 0,
107 |             bgn0: 0,
108 |             end0: 0,
109 |             bgn1: 0,
110 |             end1: 0,
111 |             dist: 0,
112 |             idt: 0.0,
113 |             dist_c: 0,
114 |             max_dist_c: 0,
115 |             idt_c: 0.0,
116 |             flag: 0,
117 |         }
118 |     }
119 | 
120 |     pub fn build_from(v: Vec<&str>) -> Self {
121 |         Overlap {
122 |             rid0: v[1].parse().unwrap(),
123 |             rid1: v[2].parse().unwrap(),
124 |             strand1: v[3].parse().unwrap(),
125 |             len0: v[4].parse().unwrap(),
126 |             len1: v[5].parse().unwrap(),
127 |             d_left: v[6].parse().unwrap(),
128 |             d_right: v[7].parse().unwrap(),
129 |             bgn0: v[8].parse().unwrap(),
130 |             end0: v[9].parse().unwrap(),
131 |             bgn1: v[10].parse().unwrap(),
132 |             end1: v[11].parse().unwrap(),
133 |             dist: v[12].parse().unwrap(),
134 |             idt: v[13].parse().unwrap(),
135 |             dist_c: v[14].parse().unwrap(),
136 |             max_dist_c: v[15].parse().unwrap(),
137 |             idt_c: v[16].parse().unwrap(),
138 |             flag: v[17].parse().unwrap(),
139 |         }
140 |     }
141 | 
142 |     pub fn _format(&self) -> String {
143 |         format!(
144 |             "{} {} {} {} {} {} {} {} {} {} {} {} {:.2} {} {} {:.2} {}",
145 |             self.rid0,
146 |             self.rid1,
147 |             self.strand1,
148 |             self.len0,
149 |             self.len1,
150 |             self.d_left,
151 |             self.d_right,
152 |             self.bgn0,
153 |             self.end0,
154 |             self.bgn1,
155 |             self.end1,
156 |             self.dist,
157 |             self.idt,
158 |             self.dist_c,
159 |             self.max_dist_c,
160 |             self.idt_c,
161 |             self.flag
162 |         )
163 |     }
164 | 
165 |     pub fn swap_rp(&self) -> Overlap {
166 |         // swap the overlapped pair
167 |         let d_left: i32;
168 |         let d_right: i32;
169 |         let strand1: u8;
170 |         let bgn0: u32;
171 |         let end0: u32;
172 |         let bgn1: u32;
173 |         let end1: u32;
174 |         if self.strand1 == 0 {
175 |             d_left = -self.d_left;
176 |             d_right = -self.d_right;
177 |             strand1 = 0;
178 |             bgn0 = self.bgn0;
179 |             end0 = self.end0;
180 |             bgn1 = self.bgn1;
181 |             end1 = self.end1;
182 |         } else {
183 |             d_left = self.d_right;
184 |             d_right = self.d_left;
185 |             strand1 = 1;
186 |             bgn0 = self.len0 - self.end0;
187 |             end0 = self.len0 - self.bgn0;
188 |             bgn1 = self.len1 - self.end1;
189 |             end1 = self.len1 - self.bgn1;
190 |         }
191 |         Overlap {
192 |             rid0: self.rid1,
193 |             rid1: self.rid0,
194 |             strand1: strand1,
195 |             len0: self.len1,
196 |             len1: self.len0,
197 |             d_left: d_left,
198 |             d_right: d_right,
199 |             bgn0: bgn1,
200 |             end0: end1,
201 |             bgn1: bgn0,
202 |             end1: end0,
203 |             dist: self.dist,
204 |             idt: self.idt,
205 |             dist_c: self.dist_c,
206 |             max_dist_c: self.max_dist_c,
207 |             idt_c: self.idt_c,
208 |             flag: self.flag,
209 |         }
210 |     }
211 | 
212 |     pub fn reverse_strand(&self) -> Overlap {
213 |         // reverse the overlapped strain of the read 1
214 |         Overlap {
215 |             rid0: self.rid0,
216 |             rid1: self.rid1,
217 |             strand1: 1 - self.strand1,
218 |             len0: self.len1,
219 |             len1: self.len0,
220 |             d_left: -self.d_right,
221 |             d_right: -self.d_left,
222 |             bgn0: self.len0 - self.end0,
223 |             end0: self.len0 - self.bgn0,
224 |             bgn1: self.len1 - self.end1,
225 |             end1: self.len1 - self.bgn1,
226 |             dist: self.dist,
227 |             idt: self.idt,
228 |             dist_c: self.dist_c,
229 |             max_dist_c: self.max_dist_c,
230 |             idt_c: self.idt_c,
231 |             flag: self.flag | 0x80,
232 |         }
233 |     }
234 | }
235 | 
236 | pub type OverlapMap = FxHashMap<u32, Vec<Overlap>>;
237 | 
238 | fn get_upath(g: &OvlpGraph, v: ReadNode, w: ReadNode) -> Vec<ReadNode> {
239 |     //
240 |     // find a simple path (no out branch) start at v->w edge in the graph g
241 |     //
242 | 
243 |     let mut path = Vec::<ReadNode>::new();
244 |     let mut visited_nodes = FxHashSet::<ReadNode>::default();
245 |     path.push(v);
246 |     path.push(w);
247 | 
248 |     visited_nodes.insert(v);
249 |     visited_nodes.insert(w);
250 | 
251 |     let mut n = w;
252 |     loop {
253 |         if g.neighbors_directed(n, Outgoing).count() == 1
254 |             && g.neighbors_directed(n, Incoming).count() == 1
255 |         {
256 |             n = *g
257 |                 .neighbors_directed(n, Outgoing)
258 |                 .into_iter()
259 |                 .collect::<Vec<ReadNode>>()
260 |                 .get(0)
261 |                 .unwrap();
262 |             if !visited_nodes.contains(&n) {
263 |                 path.push(n);
264 |                 visited_nodes.insert(n);
265 |             } else {
266 |                 path.push(n);
267 |                 break;
268 |             }
269 |         } else {
270 |             break;
271 |         }
272 |     }
273 |     path
274 | }
275 | 
276 | pub fn get_utg_paths(g: &OvlpGraph) -> Vec<(u32, Vec<ReadNode>)> {
277 |     //
278 |     // get all unitig paths
279 |     //
280 | 
281 |     let mut start_nodes = FxHashSet::<(u32, u8)>::default();
282 |     for v in g.nodes() {
283 |         if g.neighbors_directed(v, Incoming).count() != 1
284 |             || g.neighbors_directed(v, Outgoing).count() != 1
285 |         {
286 |             start_nodes.insert(v);
287 |         }
288 |     }
289 | 
290 |     let mut uid = 0_u32;
291 |     let mut paths = Vec::<(u32, Vec<(u32, u8)>)>::new();
292 | 
293 |     for v in start_nodes {
294 |         for w in g.neighbors_directed(v, Outgoing) {
295 |             let path = get_upath(&g, v, w);
296 |             //let e = path[path.len() - 1];
297 |             paths.push((uid, path));
298 |             uid += 1;
299 |         }
300 |     }
301 |     paths
302 | }
303 | 
304 | pub fn transitive_reduction(g: &mut U32AsmGraph) -> () {
305 |     let mut tr_edges = FxHashSet::<((u32, u8), (u32, u8))>::default();
306 |     for v in g.nodes().into_iter() {
307 |         let mut edges = Vec::<(u32, (u32, u8), (u32, u8))>::with_capacity(32);
308 |         for w in g.neighbors_directed(v, Outgoing) {
309 |             let ovlp_length = *g.edge_weight(v, w).unwrap();
310 |             edges.push((ovlp_length, v, w));
311 |         }
312 |         edges.sort();
313 |         for (_l, v, w) in edges.iter() {
314 |             for (_l, _vv, x) in edges.iter() {
315 |                 if x == w {
316 |                     continue;
317 |                 }
318 |                 // w in out_neighbor(x), v->x->w exist, v->w can be eliminate
319 |                 let mut found = false;
320 |                 for x_out in g.neighbors_directed(*x, Outgoing) {
321 |                     if x_out == *w {
322 |                         found = true;
323 |                         break;
324 |                     }
325 |                 }
326 |                 if found {
327 |                     tr_edges.insert((*v, *w));
328 |                 }
329 |             }
330 |         }
331 |     }
332 | 
333 |     for (v, w, _) in g.all_edges() {
334 |         if g.neighbors_directed(v, Incoming).count() == 0
335 |             && g.neighbors_directed(w, Incoming).count() >= 2
336 |             && g.neighbors_directed(w, Outgoing).count() >= 1
337 |         {
338 |             tr_edges.insert((v, w));
339 |         }
340 |         if g.neighbors_directed(w, Outgoing).count() == 0
341 |             && g.neighbors_directed(v, Outgoing).count() >= 2
342 |             && g.neighbors_directed(v, Incoming).count() >= 1
343 |         {
344 |             tr_edges.insert((v, w));
345 |         }
346 |     }
347 | 
348 |     for (v, w) in tr_edges.into_iter() {
349 |         //println!("D {:?} {:?}", v, w);
350 |         g.remove_edge(v, w);
351 |     }
352 | }
353 | 
354 | pub fn remove_simple_spur(g: &mut U32AsmGraph, max: u32) {
355 |     let mut edges = FxHashSet::<OvlpEdge>::default();
356 |     for (v, w, s) in g.all_edges() {
357 |         if *s > max {
358 |             continue;
359 |         }
360 |         if g.neighbors_directed(v, Incoming).count() == 0
361 |             && g.neighbors_directed(w, Incoming).count() >= 2
362 |             && g.neighbors_directed(w, Outgoing).count() >= 1
363 |         {
364 |             edges.insert((v, w));
365 |         }
366 |         if g.neighbors_directed(w, Outgoing).count() == 0
367 |             && g.neighbors_directed(v, Outgoing).count() >= 2
368 |             && g.neighbors_directed(v, Incoming).count() >= 1
369 |         {
370 |             edges.insert((v, w));
371 |         }
372 |     }
373 |     for (v, w) in edges.into_iter() {
374 |         g.remove_edge(v, w);
375 |     }
376 | }
377 | 
378 | pub fn remove_single_bridge(g: &mut U32AsmGraph, max: u32) -> () {
379 |     //
380 |     // implement a huristic rule removing spurious connections (repeat induced bridge)
381 |     //
382 | 
383 |     let mut to_remove = FxHashSet::<(u32, OvlpEdge)>::default();
384 | 
385 |     for (v, w, weight) in g.all_edges() {
386 |         if g.neighbors_directed(v, Outgoing).count() >= 2
387 |             && g.neighbors_directed(v, Incoming).count() >= 1
388 |             && g.neighbors_directed(w, Outgoing).count() >= 1
389 |             && g.neighbors_directed(w, Incoming).count() >= 2
390 |             && *weight < max
391 |         {
392 |             to_remove.insert((*weight, (v, w)));
393 |         }
394 |     }
395 | 
396 |     let mut to_remove_vec = Vec::<(u32, OvlpEdge)>::new();
397 |     for e in to_remove {
398 |         to_remove_vec.push(e);
399 |     }
400 | 
401 |     to_remove_vec.sort(); // remove edges with smaller weight (~ overlaps length) first
402 | 
403 |     for (_c, (v, w)) in to_remove_vec {
404 |         if g.neighbors_directed(v, Outgoing).count() > 1
405 |             && g.neighbors_directed(w, Incoming).count() > 1
406 |             && g.neighbors_directed((w.0, 1 - w.1), Outgoing).count() > 1
407 |             && g.neighbors_directed((v.0, 1 - v.1), Incoming).count() > 1
408 |         {
409 |             g.remove_edge(v, w);
410 |             g.remove_edge((w.0, 1 - w.1), (v.0, 1 - v.1));
411 |             //println!("RMSB0 {}:{} {}:{}", v.0, v.1, w.0, w.1);
412 |             //println!("RMSB1 {}:{} {}:{}", w.0, 1-w.1, v.0, 1-v.1);
413 |         }
414 |     }
415 | }
416 | 
417 | pub fn dump_utg_paths(
418 |     paths: &Vec<(u32, Vec<ReadNode>)>,
419 |     utg_g: &UtgGraph,
420 |     filename: &String,
421 | ) -> Result<(), io::Error> {
422 |     let mut utg_file = BufWriter::new(File::create(filename).unwrap());
423 | 
424 |     for (uid, p) in paths.iter() {
425 |         let v = p[0];
426 |         let w = p[p.len() - 1];
427 |         let mut tag = 0_u32;
428 |         if utg_g.contains_edge(v, w) {
429 |             tag = 1;
430 |         }
431 |         writeln!(
432 |             utg_file,
433 |             "UTG {} {}:{} {}:{} {} {}",
434 |             uid,
435 |             v.0,
436 |             v.1,
437 |             w.0,
438 |             w.1,
439 |             p.len(),
440 |             tag
441 |         )?;
442 |         for v in p {
443 |             writeln!(utg_file, "N {} {}:{}", uid, v.0, v.1)?;
444 |         }
445 |     }
446 |     Ok(())
447 | }
448 | 
449 | pub fn dump_utg_gfa(
450 |     paths: &Vec<(u32, Vec<ReadNode>)>,
451 |     utg_g: &UtgGraph,
452 |     rpair2overlap: &FxHashMap<ReadPair, Overlap>,
453 |     _read_to_ctg: &FxHashMap<ReadNode, Vec<(u32, u32)>>,
454 |     filename: &String,
455 | ) -> Result<(), io::Error> {
456 |     let mut gfa_file = BufWriter::new(File::create(filename).unwrap());
457 |     writeln!(gfa_file, "H\tVN:Z:1.0")?;
458 |     let mut end_nodes = FxHashMap::<(u32, u8), Vec<u32>>::default();
459 |     let mut bgn_nodes = FxHashMap::<(u32, u8), Vec<u32>>::default();
460 |     let mut read_len = FxHashMap::<u32, u32>::default();
461 |     for (uid, p) in paths.iter() {
462 |         if p.len() < 2 {
463 |             continue;
464 |         }
465 |         let nb = p[0];
466 |         let ne = p[p.len() - 1];
467 |         let mut tag = 0_u32;
468 |         if utg_g.contains_edge(nb, ne) {
469 |             tag = 1;
470 |         }
471 |         bgn_nodes.entry(nb).or_insert_with(|| vec![]).push(*uid);
472 |         end_nodes.entry(ne).or_insert_with(|| vec![]).push(*uid);
473 |         let mut node_start = Vec::<(ReadNode, u32)>::new();
474 |         let mut v = p[0];
475 |         let mut utg_len = 0_u32;
476 |         node_start.push((v, 0));
477 |         for w in p[1..p.len()].iter() {
478 |             let ovlp = rpair2overlap.get(&ReadPair::new(v, *w)).unwrap();
479 |             let len0 = ovlp.len0;
480 |             let len1 = ovlp.len1;
481 |             let ovlp_len = ovlp.end0 - ovlp.bgn0;
482 |             if utg_len == 0 {
483 |                 utg_len += len0;
484 |                 read_len.insert(v.0, len0);
485 |             }
486 |             node_start.push((*w, utg_len));
487 |             utg_len += len1 - ovlp_len;
488 |             read_len.insert(w.0, len1);
489 |             v = *w;
490 |         }
491 | 
492 |         writeln!(
493 |             gfa_file,
494 |             "S\t{}\t*\tLN:i:{}\tRB:i:{}\tSB:i:{}\tRE:i:{}\tSE:i:{}\tNC:i:{}\tTG:i:{}",
495 |             uid,
496 |             utg_len,
497 |             nb.0,
498 |             nb.1,
499 |             ne.0,
500 |             ne.1,
501 |             p.len(),
502 |             tag
503 |         )?;
504 | 
505 |         for (n, start) in node_start {
506 |             writeln!(gfa_file, "N\t{}\t{}\t{}\t{}", uid, n.0, n.1, start)?;
507 |         }
508 |     }
509 |     for (v, uids0) in end_nodes.iter() {
510 |         if let Some(uids1) = bgn_nodes.get(v) {
511 |             for u0 in uids0 {
512 |                 for u1 in uids1 {
513 |                     let rlen = read_len.get(&v.0).unwrap();
514 |                     writeln!(gfa_file, "L\t{}\t+\t{}\t+\t{}M", u0, u1, rlen)?;
515 |                 }
516 |             }
517 |         }
518 | 
519 |         if let Some(uids1) = end_nodes.get(&(v.0, 1 - v.1)) {
520 |             for u0 in uids0 {
521 |                 for u1 in uids1 {
522 |                     if *u0 == *u1 {
523 |                         continue;
524 |                     }
525 |                     let rlen = read_len.get(&v.0).unwrap();
526 |                     writeln!(gfa_file, "L\t{}\t+\t{}\t-\t{}M", u0, u1, rlen)?;
527 |                 }
528 |             }
529 |         }
530 |     }
531 |     Ok(())
532 | }
533 | 
534 | pub fn bfs_extend(v: (u32, u8), g: &U32AsmGraph, limit: u32) -> (u32, Vec<(u32, u8)>) {
535 |     //
536 |     // output bfs search from node v in graph g bounded by the `limit`
537 |     //
538 | 
539 |     let mut bfs = Bfs::new(g, v);
540 |     let mut count = 0_u32;
541 |     let mut nodes = Vec::<(u32, u8)>::with_capacity(32);
542 |     while let Some(n) = bfs.next(g) {
543 |         count += 1;
544 |         if count >= limit {
545 |             break;
546 |         }
547 |         nodes.push(n);
548 |     }
549 |     //assert!(nodes[0]==v);
550 |     (count, nodes)
551 | }
552 | 
553 | fn find_branching_nodes(
554 |     g: &mut UtgGraph,
555 |     max_path_length: u32,
556 |     min_path_length: u32,
557 |     max_edge_count: u32,
558 | ) -> Vec<(u32, u8)> {
559 |     // output nodes that has branches
560 | 
561 |     let mut branching_nodes = FxHashSet::<(u32, u8)>::default();
562 |     let candidates = g.nodes().collect::<Vec<ReadNode>>();
563 |     for v in candidates {
564 |         let out_count = g.neighbors_directed(v, Outgoing).count();
565 |         if out_count < 2 {
566 |             continue;
567 |         }
568 |         let mut ext_branch_count = 0_u32;
569 |         let mut node_count = FxHashMap::<(u32, u8), u32>::default();
570 |         for w in g.neighbors_directed(v, Outgoing) {
571 |             let (count, p) = bfs_extend(w, g, max_path_length);
572 |             let mut overlapped_path = false;
573 |             let mut ovlp_count = 0;
574 |             for ww in g.neighbors_directed(v, Outgoing) {
575 |                 ovlp_count += g.edge_weight(v, ww).unwrap();
576 |             }
577 |             for &vv in &p {
578 |                 for ww in g.neighbors_directed(vv, Outgoing) {
579 |                     ovlp_count += g.edge_weight(vv, ww).unwrap();
580 |                 }
581 |             }
582 |             if ovlp_count <= 3 {
583 |                 // don't count if a branch is very short
584 |                 break;
585 |             }
586 |             if count >= min_path_length {
587 |                 let mut ec = 0_u32;
588 |                 for vv in p {
589 |                     ec += 1;
590 |                     if ec > max_edge_count {
591 |                         break;
592 |                     }
593 |                     if vv.0 == v.0 {
594 |                         // loop
595 |                         break;
596 |                     }
597 |                     let c = node_count.entry(vv).or_insert(0);
598 |                     *c += 1;
599 |                     if *c > 1 {
600 |                         overlapped_path = true;
601 |                         break;
602 |                     }
603 |                 }
604 |             }
605 |             if !overlapped_path {
606 |                 ext_branch_count += 1;
607 |             }
608 |         }
609 |         if ext_branch_count >= 2 {
610 |             branching_nodes.insert(v);
611 |         }
612 |     }
613 |     branching_nodes.iter().map(|x| *x).collect()
614 | }
615 | 
616 | fn find_path(g: &UtgGraph, s: ReadNode, t: ReadNode) -> Option<(u32, Vec<OvlpEdge>)> {
617 |     // search a path from s to t
618 | 
619 |     assert!(s != t);
620 |     let mut pre = FxHashMap::<ReadNode, Option<ReadNode>>::default();
621 |     let mut min_edge_count = FxHashMap::<ReadNode, u32>::default();
622 |     let mut out = Vec::<OvlpEdge>::new();
623 | 
624 |     pre.insert(s, None);
625 |     min_edge_count.insert(s, 0);
626 |     let limit = 256_u32;
627 |     let mut count = 0_u32;
628 |     let mut found = false;
629 | 
630 |     let mut bfs = Bfs::new(&g, s);
631 |     let mut m: u32;
632 |     while let Some(n) = bfs.next(&g) {
633 |         m = u32::MAX;
634 |         let mut min_in = None;
635 |         for ww in g.neighbors_directed(n, Incoming) {
636 |             if let Some(mm) = min_edge_count.get(&ww) {
637 |                 if *mm < m {
638 |                     m = *mm;
639 |                     min_in = Some(ww);
640 |                 }
641 |             }
642 |         }
643 | 
644 |         if min_in != None && n != s {
645 |             pre.insert(n, min_in);
646 |             min_edge_count.insert(n, m + g.edge_weight(min_in.unwrap(), n).unwrap());
647 |         }
648 | 
649 |         if n == t {
650 |             found = true;
651 |             break;
652 |         }
653 |         count += 1;
654 |         if count > limit {
655 |             break;
656 |         }
657 |     }
658 |     if found {
659 |         let mut v = t;
660 |         loop {
661 |             let p = *pre.get(&v).unwrap();
662 |             if p == None {
663 |                 break;
664 |             }
665 |             out.push((p.unwrap(), v));
666 |             v = p.unwrap();
667 |         }
668 |         out.reverse();
669 |         Some((*min_edge_count.get(&t).unwrap(), out))
670 |     } else {
671 |         None
672 |     }
673 | }
674 | 
675 | pub fn utg_reduction(paths: &Vec<(u32, Vec<ReadNode>)>, g0: &OvlpGraph) -> (UtgGraph, OvlpGraph) {
676 |     // graph reduction in the unitig level
677 | 
678 |     let mut utg_g = UtgGraph::new();
679 |     let mut g = OvlpGraph::new();
680 |     for (_uid, p) in paths.iter() {
681 |         let b = p[0];
682 |         let e = p[p.len() - 1];
683 |         utg_g.add_edge(b, e, p.len() as u32);
684 |     }
685 | 
686 |     let mut branching_nodes = find_branching_nodes(&mut utg_g, 24, 1, 128);
687 |     branching_nodes.sort();
688 |     let mut bracnhing_node_sinks = FxHashSet::<ReadNode>::default();
689 |     for v in branching_nodes.iter() {
690 |         let w = (v.0, 1 - v.1);
691 |         if utg_g.neighbors_directed(w, Outgoing).count() > 0 {
692 |             // only add w if the w is not dead end
693 |             bracnhing_node_sinks.insert(w);
694 |             //println!("SINK {}:{}", v.0, 1 - v.1);
695 |         }
696 |     }
697 |     //println!("BCOUNT {}", branching_nodes.len());
698 |     let mut ripaths = Vec::<Vec<OvlpEdge>>::new(); //ripaths = repeat induced path
699 |     for v in branching_nodes.iter() {
700 |         let mut paths_to_remove_candidates = Vec::<(u32, Vec<OvlpEdge>)>::new();
701 |         for w in utg_g.neighbors_directed(*v, Outgoing) {
702 |             let vv = *v;
703 |             let ww = w;
704 |             if ww == vv {
705 |                 // ignore self edge caused by as loop
706 |                 continue;
707 |             }
708 |             log::debug!("BS {}:{} {}:{}", vv.0, vv.1, ww.0, ww.1);
709 |             let c_nodes = bfs_extend(ww, &utg_g, 16);
710 |             let mut sink_nodes = Vec::<ReadNode>::new();
711 |             for p in &c_nodes.1 {
712 |                 if bracnhing_node_sinks.contains(p) {
713 |                     sink_nodes.push(*p);
714 |                 }
715 |             }
716 |             if sink_nodes.len() > 0 {
717 |                 let mut s_ec = u32::MAX;
718 |                 let mut s_path = Vec::<OvlpEdge>::new();
719 |                 let ec0 = *utg_g.edge_weight(vv, ww).unwrap();
720 |                 s_path.push((vv, ww));
721 |                 for n in sink_nodes {
722 |                     log::debug!("V {}:{} W {}:{}  SINK {}:{}", v.0, v.1, w.0, w.1, n.0, n.1);
723 |                     if w == n {
724 |                         s_ec = ec0;
725 |                         break;
726 |                     } else if let Some((mut ec, path)) = find_path(&utg_g, ww, n) {
727 |                         ec += ec0;
728 |                         if ec < 5 {
729 |                             if ec < s_ec {
730 |                                 s_ec = ec;
731 |                                 s_path.extend(path);
732 |                             }
733 |                         }
734 |                     }
735 |                 }
736 |                 if s_ec < 5 {
737 |                     log::debug!("PL {} {}", s_ec, s_path.len());
738 |                     paths_to_remove_candidates.push((s_ec, s_path));
739 |                 }
740 |             }
741 |         }
742 |         //remove repeat candidates but ensure to keep one out
743 |         paths_to_remove_candidates.sort();
744 |         /*
745 |         println!(
746 |             "BN {}:{} {}",
747 |             v.0,
748 |             v.1,
749 |             utg_g.neighbors_directed(*v, Outgoing).count()
750 |         );
751 |         */
752 |         for (_ec, path) in paths_to_remove_candidates.iter() {
753 |             log::debug!("RMPC {} {:?}", _ec, path,);
754 |         }
755 |         if paths_to_remove_candidates.len() < utg_g.neighbors_directed(*v, Outgoing).count() {
756 |             for (_ec, path) in paths_to_remove_candidates.iter() {
757 |                 log::debug!("RMP {} {:?}", _ec, path);
758 |                 ripaths.push(path.clone());
759 |             }
760 |         } else {
761 |             let l = paths_to_remove_candidates.len();
762 |             if l >= 1 {
763 |                 for i in 0..l - 1 {
764 |                     let (_ec, path) = paths_to_remove_candidates.get(i).unwrap();
765 |                     log::debug!("RMP {} {:?}", _ec, path);
766 |                     ripaths.push(path.clone());
767 |                 }
768 |             }
769 |         }
770 |     }
771 |     for p in ripaths {
772 |         for (vv, ww) in p {
773 |             utg_g.remove_edge(vv, ww);
774 |             utg_g.remove_edge((ww.0, 1 - ww.1), (vv.0, 1 - vv.1));
775 |             //log::debug!("RME {}:{} {}:{}", vv.0, vv.1, ww.0, ww.1);
776 |         }
777 |     }
778 | 
779 |     //patch some useful edge backs
780 |     for (_uid, p) in paths.iter() {
781 |         let b = p[0];
782 |         let e = p[p.len() - 1];
783 |         if utg_g.neighbors_directed(b, Outgoing).count() == 0
784 |             && utg_g.neighbors_directed(b, Incoming).count() > 0
785 |             && utg_g.neighbors_directed(e, Outgoing).count() > 0
786 |             && utg_g.neighbors_directed(e, Incoming).count() == 0
787 |         {
788 |             utg_g.add_edge(b, e, p.len() as u32);
789 |             utg_g.add_edge((e.0, 1 - e.1), (b.0, 1 - b.1), p.len() as u32);
790 |             //log::debug!("ADD {}:{} {}:{}", b.0, b.1, e.0, e.1);
791 |         }
792 |     }
793 | 
794 |     //remove_simple_spur(&mut utg_g, 5);
795 |     //remove_single_bridge(&mut utg_g, 5);
796 | 
797 |     for (_uid, p) in paths.iter() {
798 |         let v = p[0];
799 |         let w = p[p.len() - 1];
800 |         if utg_g.contains_edge(v, w) {
801 |             let mut vv = v;
802 |             for i in 1..p.len() {
803 |                 let ww = p[i];
804 |                 let weight = *g0.edge_weight(vv, ww).unwrap();
805 |                 g.add_edge(vv, ww, weight);
806 |                 vv = ww;
807 |             }
808 |         }
809 |     }
810 |     (utg_g, g)
811 | }
812 | 


--------------------------------------------------------------------------------