├── misc ├── logo.png └── logo.svg ├── run_script_from_s3.sh ├── .gitignore ├── .github └── workflows │ └── docker-image.yml ├── Cargo.toml ├── Dockerfile ├── src └── bin │ ├── utils │ ├── mod.rs │ ├── build_idx.rs │ ├── resolve.rs │ ├── build_sdb.rs │ ├── ovlp_ec.rs │ ├── shmmrutils.rs │ ├── layout.rs │ └── graph_analysis.rs │ ├── pg_build_sdb.rs │ ├── pg_graph.rs │ ├── pg_dp_graph.rs │ ├── pg_layout.rs │ ├── pg_resolve.rs │ ├── pg_dedup.rs │ ├── pg_build_idx.rs │ ├── pg_ovlp.rs │ ├── pg_ovlp_ec.rs │ ├── pg_getreads.rs │ └── pg_asm.rs ├── README.md └── LICENSE /misc/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/peregrine-2021/HEAD/misc/logo.png -------------------------------------------------------------------------------- /run_script_from_s3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | echo $1 4 | aws s3 cp $1/run.sh run.sh 5 | bash run.sh 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | /target 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - 
name: Build the Docker image 18 | run: docker build . --file Dockerfile --tag cschin/pgr-2021:$(date +%s) 19 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "peregrine-r" 3 | version = "0.4.13" 4 | authors = ["Jason Chin "] 5 | edition = "2018" 6 | build = "build.rs" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | [dependencies] 10 | glob = "0.3.0" 11 | serde = { version = "1", features = ["derive"] } 12 | petgraph = { git = "https://github.com/cschin/petgraph.git", branch = "fx_index_map" } # base on petgraph = "0.5.1" 13 | libc = "0.2" 14 | memmap = "0.7.0" 15 | structview = "1.1.0" 16 | byteorder = "1.3.4" 17 | rand = "0.7.3" 18 | rustc-hash = "1.1.0" 19 | # flate2 = { version ="1.0", features = ["zlib"] } 20 | flate2 = "1.0" 21 | clap = "2" 22 | threadpool = "1.8.1" 23 | num_cpus = "1.13.0" 24 | log = { version = "^0.4.5", features = ["std"] } 25 | simple_logger = "2.1.0" 26 | sysinfo = "0.23.5" 27 | mimalloc = { version = "0.1.17", default-features = false } 28 | rayon = "1.5.0" 29 | intervaltree = "0.2.6" 30 | lazy_static = "1.4.0" 31 | regex = "1" 32 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #ref: https://alexbrand.dev/post/how-to-package-rust-applications-into-minimal-docker-containers/ 2 | FROM rust:1.58.1 AS build 3 | WORKDIR /usr/src 4 | 5 | # since we use the mimalloc crate, it does not work well with musl. 6 | #RUN rustup target add x86_64-unknown-linux-musl 7 | 8 | # Create a dummy project and build the app's dependencies. 9 | # If the Cargo.toml or Cargo.lock files have not changed, 10 | # we can use the docker build cache and skip these (typically slow) steps. 
11 | RUN USER=root cargo new peregrine-r 12 | WORKDIR /usr/src/peregrine-r 13 | #COPY Cargo.toml Cargo.lock build.rs ./ 14 | COPY Cargo.toml build.rs ./ 15 | COPY .git ./.git 16 | RUN apt-get update && apt-get install -y build-essential cmake 17 | RUN cargo build --release 18 | 19 | # Copy the source and build the application. 20 | COPY src ./src 21 | #RUN cargo install --target x86_64-unknown-linux-musl --path . 22 | RUN cargo install --path . 23 | 24 | # Copy the statically-linked binary into a scratch container. 25 | #FROM scratch 26 | #COPY --from=build /usr/local/cargo/bin/pg_* ./ 27 | #USER 1000 28 | CMD ["./pg_asm"] 29 | 30 | FROM ubuntu:20.04 31 | COPY --from=build /usr/local/cargo/bin/pg_* /usr/local/bin/ 32 | RUN apt-get update 33 | RUN DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata 34 | RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y parallel pigz awscli 35 | RUN apt-get install -y samtools exonerate jq wget 36 | COPY run_script_from_s3.sh /usr/local/bin 37 | RUN chmod u+x /usr/local/bin/run_script_from_s3.sh 38 | CMD ["pg_asm"] 39 | -------------------------------------------------------------------------------- /src/bin/utils/mod.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 
9 | 10 | pub mod build_idx; 11 | pub mod build_sdb; 12 | pub mod dp_graph; 13 | pub mod graph; 14 | pub mod graph_analysis; 15 | pub mod layout; 16 | pub mod ovlp; 17 | pub mod ovlp_ec; 18 | pub mod resolve; 19 | pub mod seqmap; 20 | pub mod shmmrutils; 21 | pub use core::mem::MaybeUninit; 22 | #[cfg(target_os = "linux")] 23 | pub use libc::RUSAGE_THREAD; 24 | pub use libc::{getrusage, rusage, RUSAGE_SELF}; 25 | 26 | #[derive(Copy, Clone)] 27 | pub struct Parameters { 28 | pub nthreads: u32, 29 | pub nchunks: u32, 30 | pub k: u32, 31 | pub w: u32, 32 | pub r: u32, 33 | pub tol: f64, 34 | pub min_ec_cov: u16, 35 | } 36 | 37 | #[allow(dead_code)] 38 | pub fn log_resource(msg: &str, data: &mut rusage) -> (u64, u64, u64) { 39 | let _res = unsafe { getrusage(RUSAGE_SELF, data) }; 40 | log::info!( 41 | "{} : (maxRSS, utime, stime): {} {} {}", 42 | msg, 43 | data.ru_maxrss, 44 | data.ru_utime.tv_sec, 45 | data.ru_stime.tv_sec 46 | ); 47 | 48 | ( 49 | data.ru_maxrss as u64, 50 | data.ru_utime.tv_sec as u64, 51 | data.ru_stime.tv_sec as u64, 52 | ) 53 | } 54 | -------------------------------------------------------------------------------- /src/bin/pg_build_sdb.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 
9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::build_sdb::build; 20 | fn main() -> () { 21 | let matches = clap_app!(pg_build_sdb => 22 | (version: VERSION_STRING) 23 | (author: "Jason Chin ") 24 | (about: " 25 | Peregrine-2021 genome assembler 26 | pb_build_sdb: build the sequence database 27 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 28 | (@arg input: --input +required +takes_value "Path to a file that contains the list of reads in .fa .fa.gz .fastq or fastq.gz formats") 29 | (@arg out_prefix: --out_prefix +required +takes_value "The prefix for the sequence database and index files") 30 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 31 | ).get_matches(); 32 | 33 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 34 | "DEBUG" => log::LevelFilter::Debug, 35 | _ => log::LevelFilter::Info, 36 | }; 37 | 38 | SimpleLogger::new() 39 | .with_level(log_level) 40 | .with_utc_timestamps() 41 | .init() 42 | .unwrap(); 43 | 44 | let seq_list_file = matches.value_of("input").unwrap().to_string(); 45 | let out_prefix = matches.value_of("out_prefix").unwrap().to_string(); 46 | let _nbases = build(&seq_list_file, &out_prefix); 47 | } 48 | -------------------------------------------------------------------------------- /src/bin/pg_graph.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::graph::ovlp2layout_v1; 20 | fn main() -> () { 21 | let matches = clap_app!(pg_graph => 22 | (version: VERSION_STRING) 23 | (author: "Jason Chin ") 24 | (about: " 25 | Peregrine-2021 genome assembler, 26 | pg_graph: (obsoleted) convert the overlap information between the reads into an assembly group 27 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 28 | (@arg prefix: --prefix +required +takes_value "Path prefix for input files") 29 | (@arg out_prefix: --out_prefix +required +takes_value "Path prefix for output files ") 30 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 31 | ) 32 | .get_matches(); 33 | 34 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 35 | "DEBUG" => log::LevelFilter::Debug, 36 | _ => log::LevelFilter::Info, 37 | }; 38 | 39 | SimpleLogger::new() 40 | .with_level(log_level) 41 | .with_utc_timestamps() 42 | .init() 43 | .unwrap(); 44 | 45 | let prefix = matches.value_of("prefix").unwrap().to_string(); 46 | let out_prefix = matches.value_of("out_prefix").unwrap().to_string(); 47 | 48 | let _err = log::info!("graph:out_prefix: {}", out_prefix,); 49 | ovlp2layout_v1(&prefix, &out_prefix, 6); 50 | } 51 | -------------------------------------------------------------------------------- /src/bin/pg_dp_graph.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 
| // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::dp_graph::ovlp2layout_v2; 20 | fn main() -> Result<(), std::io::Error> { 21 | let matches = clap_app!(pg_graph => 22 | (version: VERSION_STRING) 23 | (author: "Jason Chin ") 24 | (about: " 25 | Peregrine-2021 genome assembler, 26 | pg_dp_graph: take overlap data file as input to generate the layout file using a polyploid aware layout algorithm 27 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 28 | (@arg prefix: --prefix +required +takes_value "Path prefix for input files") 29 | (@arg out_prefix: --out_prefix +required +takes_value "ath prefix for output files ") 30 | (@arg bestn: --bestn -b +takes_value "number of best overlaps for initial graph [default: 6]") 31 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 32 | ).get_matches(); 33 | 34 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 35 | "DEBUG" => log::LevelFilter::Debug, 36 | _ => log::LevelFilter::Info, 37 | }; 38 | 39 | SimpleLogger::new() 40 | .with_level(log_level) 41 | .with_utc_timestamps() 42 | .init() 43 | .unwrap(); 44 | 45 | let prefix = matches.value_of("prefix").unwrap().to_string(); 46 | let out_prefix = matches.value_of("out_prefix").unwrap().to_string(); 47 | let bestn = matches 48 | .value_of("bestn") 49 | .unwrap_or("6") 50 | .parse::() 51 | .unwrap(); 52 | 53 | let _err = log::info!("graph:out_prefix: {}", out_prefix,); 54 | ovlp2layout_v2(&prefix, &out_prefix, bestn)?; 55 | Ok(()) 56 | } 57 | 
-------------------------------------------------------------------------------- /src/bin/pg_layout.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::layout::layout2ctg; 20 | 21 | fn main() -> Result<(), std::io::Error> { 22 | let matches = clap_app!(pg_layout => 23 | (version: VERSION_STRING) 24 | (author: "Jason Chin ") 25 | (about: " 26 | Peregrine-2021 genome assembler, 27 | pg_layout: convert the assembly graph to paths and generate the contig fasta file 28 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 29 | (@arg SEQDB:+required "Path to the seqdb file ") 30 | (@arg SEQIDX:+required "Path to the seqdb index file") 31 | (@arg layout_file: --layout_file +required +takes_value "Path to the layout file") 32 | (@arg output_file: --output +required +takes_value "Path to the output file") 33 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 34 | ) 35 | .get_matches(); 36 | 37 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 38 | "DEBUG" => log::LevelFilter::Debug, 39 | _ => log::LevelFilter::Info, 40 | }; 41 | 42 | SimpleLogger::new() 43 | .with_level(log_level) 44 | .with_utc_timestamps() 45 | .init() 46 | .unwrap(); 47 | 48 | let seqdb_file = 
matches.value_of("SEQDB").unwrap().to_string(); 49 | let index_file = matches.value_of("SEQIDX").unwrap().to_string(); 50 | let layout_file = matches.value_of("layout_file").unwrap().to_string(); 51 | let output_file = matches.value_of("output_file").unwrap().to_string(); 52 | 53 | let _res = log::info!("layout:seq_db_file: {}", seqdb_file); 54 | let _res = log::info!("layout:index_file: {}", index_file); 55 | let _res = log::info!("layout:layout_file: {}", layout_file); 56 | let _res = log::info!("layout:output: {}", output_file); 57 | 58 | layout2ctg(&seqdb_file, &index_file, &layout_file, &output_file)?; 59 | Ok(()) 60 | } 61 | -------------------------------------------------------------------------------- /src/bin/pg_resolve.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 
9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::resolve::resolve_ht; 20 | 21 | fn main() -> Result<(), std::io::Error> { 22 | let matches = clap_app!(pg_resolve => 23 | (version: VERSION_STRING) 24 | (author: "Jason Chin ") 25 | (about: " 26 | Peregrine-2021 genome assembler, 27 | pg_resolve: this tool aligns all contigs to themselve to identify haplotype-related contigs 28 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 29 | (@arg fasta_file: -f --fasta_file +required +takes_value "Path to the layout file") 30 | (@arg output_prefix: -o --out_prefix +required +takes_value "Path to the output prefix") 31 | (@arg w: -w +takes_value "Window size [default: 80]") 32 | (@arg k: -k +takes_value "Kmer size [default: 56]") 33 | (@arg r: -r +takes_value "Reduction factor [default: 6]") 34 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 35 | ) 36 | .get_matches(); 37 | 38 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 39 | "DEBUG" => log::LevelFilter::Debug, 40 | _ => log::LevelFilter::Info, 41 | }; 42 | 43 | SimpleLogger::new() 44 | .with_level(log_level) 45 | .with_utc_timestamps() 46 | .init() 47 | .unwrap(); 48 | 49 | let fasta_file = matches.value_of("fasta_file").unwrap().to_string(); 50 | let output_prefix = matches.value_of("output_prefix").unwrap().to_string(); 51 | let wsize = matches 52 | .value_of("w") 53 | .unwrap_or("80") 54 | .parse::() 55 | .unwrap(); 56 | 57 | let ksize = matches 58 | .value_of("k") 59 | .unwrap_or("56") 60 | .parse::() 61 | .unwrap(); 62 | 63 | let rfactor = matches.value_of("r").unwrap_or("6").parse::().unwrap(); 64 | 65 | resolve_ht(&fasta_file, &output_prefix, wsize, ksize, rfactor)?; 66 | Ok(()) 67 | } 
68 | -------------------------------------------------------------------------------- /src/bin/pg_dedup.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | use simple_logger::SimpleLogger; 18 | 19 | mod utils; 20 | use utils::seqmap; 21 | 22 | fn main() -> Result<(), std::io::Error> { 23 | let matches = clap_app!(pg_resolve => 24 | (version: VERSION_STRING) 25 | (author: "Jason Chin ") 26 | (about: " 27 | Peregrine-2021 genome assembler, 28 | pg_dedup: perform all contigs to all contigs alignment to remove duplicates 29 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 30 | (@arg ref_fasta: -f --ref_fasta +required +takes_value "Path to the reference file") 31 | (@arg target_fasta: -t --target_fasta +required +takes_value "Path to the target file") 32 | (@arg output: -o --output +required +takes_value "Path to the output filename") 33 | (@arg w: -w +takes_value "Window size [default: 48]") 34 | (@arg k: -k +takes_value "Kmer size [default: 56]") 35 | (@arg r: -r +takes_value "Reduction factor [default: 4]") 36 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 37 | ) 38 | .get_matches(); 39 | 40 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 41 | "DEBUG" => log::LevelFilter::Debug, 42 | _ => log::LevelFilter::Info, 43 | }; 44 | 45 | 
SimpleLogger::new() 46 | .with_level(log_level) 47 | .with_utc_timestamps() 48 | .init() 49 | .unwrap(); 50 | 51 | let ref_fasta_file = matches.value_of("ref_fasta").unwrap().to_string(); 52 | let target_fasta_file = matches.value_of("target_fasta").unwrap().to_string(); 53 | let output_file = matches.value_of("output").unwrap().to_string(); 54 | let wsize = matches 55 | .value_of("w") 56 | .unwrap_or("48") 57 | .parse::() 58 | .unwrap(); 59 | 60 | let ksize = matches 61 | .value_of("k") 62 | .unwrap_or("56") 63 | .parse::() 64 | .unwrap(); 65 | 66 | let rfactor = matches.value_of("r").unwrap_or("4").parse::().unwrap(); 67 | seqmap::dedup_target_seqs( 68 | &ref_fasta_file, 69 | &target_fasta_file, 70 | &output_file, 71 | wsize, 72 | ksize, 73 | rfactor, 74 | )?; 75 | Ok(()) 76 | } 77 | -------------------------------------------------------------------------------- /src/bin/pg_build_idx.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 
9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::build_idx::build; 20 | use utils::Parameters; 21 | fn main() -> () { 22 | let matches = clap_app!(pg_build_idx => 23 | (version: VERSION_STRING) 24 | (author: "Jason Chin ") 25 | (about: " 26 | Peregrine-2021 genome assembler 27 | build the SHIMMER index from the reads for overlapping 28 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 29 | (@arg SEQDB:+required "Path to the seqdb file ") 30 | (@arg SEQIDX:+required "Path to the seqdb index file") 31 | (@arg SHMRINDEXPREFIX: +required "The prefix to the output shimmer index database") 32 | (@arg NTHREADS: +required "Number of threads") 33 | (@arg NCHUNKS: +required "Number of partition") 34 | (@arg w: -w +takes_value "Window size [default: 80]") 35 | (@arg k: -k +takes_value "Kmer size [default: 56]") 36 | (@arg r: -r +takes_value "Reduction factor [default: 6]") 37 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 38 | ) 39 | .get_matches(); 40 | 41 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 42 | "DEBUG" => log::LevelFilter::Debug, 43 | _ => log::LevelFilter::Info, 44 | }; 45 | 46 | SimpleLogger::new() 47 | .with_level(log_level) 48 | .with_utc_timestamps() 49 | .init() 50 | .unwrap(); 51 | 52 | let seqdb_file = matches.value_of("SEQDB").unwrap().to_string(); 53 | let index_file = matches.value_of("SEQIDX").unwrap().to_string(); 54 | let out_prefix = matches.value_of("SHMRINDEXPREFIX").unwrap().to_string(); 55 | let nthreads = matches 56 | .value_of("NTHREADS") 57 | .unwrap() 58 | .parse::() 59 | .unwrap(); 60 | let nchunks = matches.value_of("NCHUNKS").unwrap().parse::().unwrap(); 61 | let wsize = 
matches.value_of("w").unwrap_or("80").parse::().unwrap(); 62 | let ksize = matches.value_of("k").unwrap_or("56").parse::().unwrap(); 63 | let rfactor = matches.value_of("r").unwrap_or("6").parse::().unwrap(); 64 | 65 | let parameters = Parameters { 66 | nchunks: nchunks, 67 | nthreads: nthreads, 68 | w: wsize, 69 | k: ksize, 70 | r: rfactor, 71 | tol: 0.0, //not used 72 | min_ec_cov: 1, 73 | }; 74 | 75 | build(&seqdb_file, &index_file, &out_prefix, ¶meters); 76 | } 77 | -------------------------------------------------------------------------------- /src/bin/pg_ovlp.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 
9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::ovlp::ovlp; 20 | use utils::Parameters; 21 | fn main() -> Result<(), std::io::Error> { 22 | let matches = clap_app!(pg_ovlp => 23 | (version: VERSION_STRING) 24 | (author: "Jason Chin ") 25 | (@arg SEQDB:+required "Path to the seqdb file ") 26 | (about: " 27 | Peregrine-2021 genome assembler, 28 | pg_ovlp: generate haplotype specific overlaps between the reads 29 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 30 | (@arg SEQIDX:+required "Path to the seqdb index file") 31 | (@arg SHMRINDEXPREFIX: +required "The prefix to the output shimmer index database") 32 | (@arg OUTPREFIX: +required "The prefix of the output ovelap files") 33 | (@arg NTHREADS: +required "Number of threads ") 34 | (@arg NCHUNKS: +required "Number of partition") 35 | (@arg tol: -t --tol +takes_value "Alignment tolerance [default: 0.01]") 36 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 37 | ) 38 | .get_matches(); 39 | 40 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 41 | "DEBUG" => log::LevelFilter::Debug, 42 | _ => log::LevelFilter::Info, 43 | }; 44 | 45 | SimpleLogger::new() 46 | .with_level(log_level) 47 | .with_utc_timestamps() 48 | .init() 49 | .unwrap(); 50 | 51 | let seqdb_file = matches.value_of("SEQDB").unwrap().to_string(); 52 | let index_file = matches.value_of("SEQIDX").unwrap().to_string(); 53 | let shimmer_index_file_prefix = matches.value_of("SHMRINDEXPREFIX").unwrap().to_string(); 54 | let out_prefix = matches.value_of("OUTPREFIX").unwrap().to_string(); 55 | let nthreads = matches 56 | .value_of("NTHREADS") 57 | .unwrap() 58 | .parse::() 59 | .unwrap(); 60 | let nchunks = 
matches.value_of("NCHUNKS").unwrap().parse::().unwrap(); 61 | let tol = matches 62 | .value_of("tol") 63 | .unwrap_or("0.01") 64 | .parse::() 65 | .unwrap(); 66 | 67 | let parameters = Parameters { 68 | nchunks: nchunks, 69 | nthreads: nthreads, 70 | w: 0, 71 | k: 0, 72 | r: 0, 73 | tol: tol, 74 | min_ec_cov: 1, 75 | }; 76 | 77 | ovlp( 78 | &seqdb_file, 79 | &index_file, 80 | &shimmer_index_file_prefix, 81 | &out_prefix, 82 | ¶meters, 83 | )?; 84 | Ok(()) 85 | } 86 | -------------------------------------------------------------------------------- /src/bin/pg_ovlp_ec.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 
9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | mod utils; 18 | use simple_logger::SimpleLogger; 19 | use utils::ovlp_ec::ovlp_ec; 20 | use utils::Parameters; 21 | 22 | fn main() -> Result<(), std::io::Error> { 23 | let matches = clap_app!(pg_ovlp_ec => 24 | (version: VERSION_STRING) 25 | (author: "Jason Chin ") 26 | (about: " 27 | Peregrine-2021 genome assembler, 28 | pg_ovlp_ec: perform error correction from the haplotype specific overlaps 29 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 30 | (@arg SEQDB:+required "Path to the seqdb file ") 31 | (@arg SEQIDX:+required "Path to the seqdb index file") 32 | (@arg prefix: +required "The prefix to the output shimmer index database") 33 | (@arg out_prefix: +required "The prefix of the output ovelap files") 34 | (@arg NTHREADS: +required "Number of threads ") 35 | (@arg NCHUNKS: +required "Number of partition") 36 | (@arg tol: -t --tol +takes_value "Alignment tolerance [default: 0.01]") 37 | (@arg min_ec_cov: -c --min_ec_cov +takes_value "Minimum error coverage [default: 1]") 38 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 39 | ) 40 | .get_matches(); 41 | 42 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 43 | "DEBUG" => log::LevelFilter::Debug, 44 | _ => log::LevelFilter::Info, 45 | }; 46 | 47 | SimpleLogger::new() 48 | .with_level(log_level) 49 | .with_utc_timestamps() 50 | .init() 51 | .unwrap(); 52 | 53 | let seqdb_file = matches.value_of("SEQDB").unwrap().to_string(); 54 | let index_file = matches.value_of("SEQIDX").unwrap().to_string(); 55 | let prefix = matches.value_of("prefix").unwrap().to_string(); 56 | let out_prefix = matches.value_of("out_prefix").unwrap().to_string(); 57 | let nthreads = matches 58 | .value_of("NTHREADS") 59 | 
.unwrap() 60 | .parse::() 61 | .unwrap(); 62 | let nchunks = matches.value_of("NCHUNKS").unwrap().parse::().unwrap(); 63 | let tol = matches 64 | .value_of("tol") 65 | .unwrap_or("0.01") 66 | .parse::() 67 | .unwrap(); 68 | let min_ec_cov = matches 69 | .value_of("min_ec_cov") 70 | .unwrap_or("1") 71 | .parse::() 72 | .unwrap(); 73 | 74 | let parameters = Parameters { 75 | nchunks: nchunks, 76 | nthreads: nthreads, 77 | w: 0, 78 | k: 0, 79 | r: 0, 80 | tol: tol, 81 | min_ec_cov: min_ec_cov, 82 | }; 83 | 84 | ovlp_ec(&seqdb_file, &index_file, &prefix, &out_prefix, ¶meters)?; 85 | Ok(()) 86 | } 87 | -------------------------------------------------------------------------------- /src/bin/pg_getreads.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 11 | 12 | #[global_allocator] 13 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 14 | //static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; 15 | 16 | use clap::clap_app; 17 | use memmap::MmapOptions; 18 | use simple_logger::SimpleLogger; 19 | use std::fs::File; 20 | use std::io::{self, BufRead, Write}; 21 | use std::path::Path; 22 | mod utils; 23 | use utils::shmmrutils::{get_seq_fragment, ReadLocation}; 24 | 25 | fn read_lines

(filename: P) -> io::Result>> 26 | where 27 | P: AsRef, 28 | { 29 | let file = File::open(filename)?; 30 | Ok(io::BufReader::new(file).lines()) 31 | } 32 | 33 | fn main() -> () { 34 | let matches = clap_app!(pg_getreads => 35 | (version: VERSION_STRING) 36 | (author: "Jason Chin ") 37 | (about: " 38 | Peregrine-2021 genome assembler, 39 | pg_getreads: generate fasta file for a subset of reads from the sequence database 40 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 41 | (@arg SEQDB: +required "Path to the seqdb file ") 42 | (@arg SEQIDX: +required "Path to the seqdb index file") 43 | (@arg READID: +required "Path to the read id file") 44 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 45 | ) 46 | .get_matches(); 47 | 48 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 49 | "DEBUG" => log::LevelFilter::Debug, 50 | _ => log::LevelFilter::Info, 51 | }; 52 | 53 | SimpleLogger::new() 54 | .with_level(log_level) 55 | .with_utc_timestamps() 56 | .init() 57 | .unwrap(); 58 | 59 | let seqdb_file = matches.value_of("SEQDB").unwrap().to_string(); 60 | let index_file = matches.value_of("SEQIDX").unwrap().to_string(); 61 | let read_id_file = matches.value_of("READID").unwrap().to_string(); 62 | 63 | let _res = writeln!(io::stderr(), "seq_db_file: {}", seqdb_file); 64 | let _res = writeln!(io::stderr(), "index_file: {}", index_file); 65 | let _res = writeln!(io::stderr(), "read_id_file: {}", read_id_file); 66 | 67 | let mut read_index = Vec::::new(); 68 | let mut read_name = Vec::::new(); 69 | 70 | if let Ok(lines) = read_lines(index_file) { 71 | for line in lines { 72 | if let Ok(rec) = line { 73 | //let rec_trimmed = rec.trim_end(); 74 | // the record line looks like 000000023 m64062_190803_042216/144/ccs 20359 467415 75 | let v: Vec<&str> = rec.split_whitespace().collect(); 76 | //let rid: u32 = v[0].parse().unwrap(); 77 | let start: usize = v[3].parse().unwrap(); 78 | let len: usize = v[2].parse().unwrap(); 79 | 
read_index.push(ReadLocation { 80 | start: start, 81 | len: len, 82 | }); 83 | read_name.push(v[1].to_string()); 84 | //println!("{} {} {}", rid, start, len); 85 | } 86 | } 87 | } 88 | 89 | let file = File::open(seqdb_file).unwrap(); 90 | let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; 91 | let stdout = io::stdout(); 92 | let mut handle = stdout.lock(); 93 | if let Ok(lines) = read_lines(read_id_file) { 94 | for line in lines { 95 | if let Ok(rec) = line { 96 | let rec_trimmed = rec.trim_end(); 97 | let rid0 = rec_trimmed.parse::().unwrap(); 98 | let rloc = read_index[rid0 as usize]; 99 | let len = rloc.len as u32; 100 | let seq_frag = get_seq_fragment(rid0, 0, 0, len, &mmap, &read_index); 101 | let _ = writeln!(handle, ">{} {:09}", read_name[rid0 as usize], rid0); 102 | let _ = writeln!(handle, "{}", String::from_utf8_lossy(&seq_frag)); 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/bin/utils/build_idx.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | #![allow(dead_code)] 11 | 12 | use super::shmmrutils::sequence_to_shmmrs; 13 | use super::shmmrutils::{get_2bit_fragment, ReadLocation}; 14 | use super::Parameters; 15 | use byteorder::{LittleEndian, WriteBytesExt}; 16 | use memmap::{Mmap, MmapOptions}; 17 | use std::fs::File; 18 | use std::io::{self, BufRead, BufWriter, Write}; 19 | use std::mem::size_of; 20 | use std::path::Path; 21 | use threadpool::ThreadPool; 22 | 23 | fn read_lines

(filename: P) -> io::Result>> 24 | where 25 | P: AsRef, 26 | { 27 | let file = File::open(filename)?; 28 | Ok(io::BufReader::new(file).lines()) 29 | } 30 | 31 | fn index_chunk( 32 | chunk: u32, 33 | total_chunk: u32, 34 | readsdb: &Mmap, 35 | read_index: &Vec, 36 | prefix: &String, 37 | wsize: u32, 38 | ksize: u32, 39 | rfactor: u32, 40 | ) -> Result<(), io::Error> { 41 | // Create index for a chunk from the read database 42 | 43 | let filename = format!("{}-{:03}-of-{:03}.dat", prefix, chunk, total_chunk); 44 | let mut out_f = BufWriter::new(File::create(filename).unwrap()); 45 | 46 | let mut wrt = Vec::::with_capacity(1 << 16); 47 | for seq_id in 0..read_index.len() { 48 | if seq_id % (total_chunk as usize) != (chunk % total_chunk) as usize { 49 | continue; 50 | } 51 | let len = read_index[seq_id].len as u32; 52 | log::debug!("build_idx: len: {} {}", seq_id, len); 53 | let seq = get_2bit_fragment(seq_id as u32, 0, 0, len, &readsdb, read_index); 54 | let shmmrs = sequence_to_shmmrs(seq_id as u32, &seq, wsize, ksize, rfactor); 55 | for m in shmmrs { 56 | wrt.write_u64::(m.x)?; 57 | wrt.write_u64::(m.y)?; 58 | } 59 | } 60 | let us = size_of::(); 61 | assert!(us == 8 as usize); //make sure the usize is a 64bit int. 
62 | out_f.write_u64::((wrt.len() >> 4) as u64)?; 63 | 64 | // Not sure if it a bug in BufferWriter, it can not write more than 2Gb at once 65 | // we chop up the data and write in small chunks 66 | let c = 24_usize; 67 | for i in 0..=wrt.len() >> c { 68 | if ((i + 1) << c) < wrt.len() { 69 | let s = i << c; 70 | let e = (i + 1) << c; 71 | out_f.write(&wrt[s..e])?; 72 | } else { 73 | let s = i << c; 74 | let e = wrt.len(); 75 | out_f.write(&wrt[s..e])?; 76 | } 77 | } 78 | out_f.flush()?; 79 | Ok(()) 80 | } 81 | 82 | pub fn build( 83 | seqdb_file: &String, 84 | index_file: &String, 85 | out_prefix: &String, 86 | parameters: &Parameters, 87 | ) -> () { 88 | // Using thread pool to build the SHIMMER index in paralle 89 | 90 | let mut read_index = Vec::::new(); 91 | 92 | if let Ok(lines) = read_lines(index_file) { 93 | for line in lines { 94 | if let Ok(rec) = line { 95 | let v: Vec<&str> = rec.split_whitespace().collect(); 96 | let start: usize = v[3].parse().unwrap(); 97 | let len: usize = v[2].parse().unwrap(); 98 | read_index.push(ReadLocation { 99 | start: start, 100 | len: len, 101 | }); 102 | } 103 | } 104 | } 105 | let mmap_seqdb = File::open(seqdb_file).unwrap(); 106 | let mmap_seqdb = unsafe { MmapOptions::new().map(&mmap_seqdb).unwrap() }; 107 | 108 | let read_index = std::sync::Arc::new(read_index); 109 | let mmap_seqdb = std::sync::Arc::new(mmap_seqdb); 110 | 111 | let pool = ThreadPool::new(parameters.nthreads as usize); 112 | 113 | let _nchunks = parameters.nchunks; 114 | for i in 0.._nchunks { 115 | let mmap_seqdb = mmap_seqdb.clone(); 116 | let read_index = read_index.clone(); 117 | let out_prefix = out_prefix.clone(); 118 | let parameters = (*parameters).clone(); 119 | pool.execute(move || { 120 | let r = index_chunk( 121 | i + 1, 122 | _nchunks, 123 | &mmap_seqdb, 124 | &read_index, 125 | &out_prefix, 126 | parameters.w, 127 | parameters.k, 128 | parameters.r, 129 | ); 130 | match r { 131 | Err(error) => panic!("build index fail: {}", error), 132 | 
Ok(()) => (), 133 | }; 134 | }); 135 | } 136 | pool.join(); 137 | } 138 | -------------------------------------------------------------------------------- /src/bin/utils/resolve.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | #![allow(dead_code)] 11 | // 12 | // for resolve contigs that are highly similar to each others which are most 13 | // likily be homologuous pairs in a diploid genome 14 | // 15 | use super::build_sdb::FastxReader; 16 | use super::layout::log_asm_summary; 17 | use super::shmmrutils::{sequence_to_shmmrs, MM128}; 18 | use std::fs::File; 19 | use std::io::{self, BufRead, BufReader, Write}; 20 | use std::path::Path; 21 | 22 | use rustc_hash::FxHashMap; 23 | 24 | fn read_lines

/// Map an ASCII base string to 2-bit codes: 'A'/'a' -> 0, 'C'/'c' -> 1,
/// 'G'/'g' -> 2, 'T'/'t' -> 3.  Any other byte maps to 12 (0b1100), the same
/// "not a base" sentinel used elsewhere in the sequence database encoding.
fn base2twobit(s: &Vec<u8>) -> Vec<u8> {
    s.iter()
        .map(|&b| match b {
            b'A' | b'a' => 0,
            b'C' | b'c' => 1,
            b'G' | b'g' => 2,
            b'T' | b't' => 3,
            _ => 12,
        })
        .collect()
}
//let mut seqdb = FxHashMap::, Vec>::default(); 71 | let mut shmmr_db = FxHashMap::)>::default(); 72 | let mut shmmr_map = FxHashMap::>::default(); 73 | let mut id2name = FxHashMap::::default(); 74 | let mut rid = 0; 75 | let mut seq_db = FxHashMap::>::default(); 76 | while let Some(rec) = fastx_reader.next_rec() { 77 | let rec = rec.unwrap(); 78 | //seqdb.insert(r.id, r.seq); 79 | //println!("N {}", String::from_utf8_lossy(&r.id)); 80 | let rec_2bitseq = base2twobit(&rec.seq); 81 | let shmmers = sequence_to_shmmrs(rid, &rec_2bitseq, w, k, r); 82 | for mm in shmmers.iter() { 83 | let hash = mm.x >> 8; 84 | shmmr_map.entry(hash).or_insert_with(|| vec![]).push(mm.y); 85 | } 86 | let n = String::from_utf8_lossy(&rec.id).into_owned(); 87 | id2name.insert(rid, n.clone()); 88 | shmmr_db.insert(n.clone(), (rid, shmmers)); 89 | seq_db.insert(n, rec.seq); 90 | rid += 1; 91 | } 92 | 93 | let mut matches = FxHashMap::<(u32, u32), Vec<(u32, u32)>>::default(); 94 | 95 | for (_, (rid, shmmrs)) in shmmr_db.iter() { 96 | for mer0 in shmmrs { 97 | let y0 = mer0.y; 98 | let pos0 = ((y0 & 0xFFFFFFFF) >> 1) as u32; 99 | //let strand0 = y0 & 0x1; 100 | let hash = mer0.x >> 8; 101 | let other = shmmr_map.get(&hash).unwrap(); 102 | if other.len() > 10 || other.len() < 2 { 103 | continue; 104 | } 105 | for y1 in other { 106 | let rid1 = (*y1 >> 32) as u32; 107 | if rid1 == *rid { 108 | continue; 109 | } 110 | let pos1 = ((*y1 & 0xFFFFFFFF) >> 1) as u32; 111 | //let strand1 = *y1 & 0x1; 112 | matches 113 | .entry((*rid, rid1)) 114 | .or_insert_with(|| vec![]) 115 | .push((pos0, pos1)); 116 | //log::info!("S {} {} {} {} {} {} {}", rid, pos0, strain0, rid1, pos1, strain1, (pos0 as i32) - (pos1 as i32)); 117 | } 118 | } 119 | } 120 | 121 | let rel_path = format!("{}_rel.dat", output_prefix); 122 | let mut rel_file = File::create(rel_path).unwrap(); 123 | let mut a_to_p = FxHashMap::::default(); 124 | 125 | for ((rid0, rid1), v) in matches { 126 | let n0 = id2name.get(&rid0).unwrap(); 127 | 
let n1 = id2name.get(&rid1).unwrap(); 128 | 129 | let s0 = shmmr_db.get(n0).unwrap().1.len() as f32; 130 | let s1 = shmmr_db.get(n1).unwrap().1.len() as f32; 131 | let c = v.len(); 132 | let r0 = (c as f32) / s0; 133 | let r1 = (c as f32) / s1; 134 | writeln!( 135 | rel_file, 136 | "S {} {} {} {} {} {} {}", 137 | rid0, rid1, c, s0, s1, r0, r1 138 | )?; 139 | if r0 > 0.50 && s1 > s0 { 140 | a_to_p.insert(rid0, rid1); 141 | } 142 | } 143 | 144 | let p_ctg_path = format!("{}_p.fa", output_prefix); 145 | let mut p_ctg_file = File::create(p_ctg_path).unwrap(); 146 | let a_ctg_path = format!("{}_a.fa", output_prefix); 147 | let mut a_ctg_file = File::create(a_ctg_path).unwrap(); 148 | let mut ctg_ids = id2name.keys().map(|x| *x).collect::>(); 149 | ctg_ids.sort(); 150 | let mut p_ctg_lengths = Vec::<(String, usize)>::new(); 151 | let mut a_ctg_lengths = Vec::<(String, usize)>::new(); 152 | for ctg_id in ctg_ids { 153 | if a_to_p.contains_key(&ctg_id) { 154 | let n0 = id2name.get(&ctg_id).unwrap(); 155 | let n1 = id2name.get(a_to_p.get(&ctg_id).unwrap()).unwrap(); 156 | writeln!(rel_file, "A {} {}", n0, n1)?; 157 | let seq = seq_db.get(n0).unwrap(); 158 | writeln!(a_ctg_file, ">{}", n0)?; 159 | writeln!(a_ctg_file, "{}", String::from_utf8_lossy(seq))?; 160 | a_ctg_lengths.push((n0.clone(), seq.len())) 161 | } else { 162 | let n0 = id2name.get(&ctg_id).unwrap(); 163 | writeln!(rel_file, "P {} {}", n0, n0)?; 164 | let seq = seq_db.get(n0).unwrap(); 165 | writeln!(p_ctg_file, ">{}", n0)?; 166 | writeln!(p_ctg_file, "{}", String::from_utf8_lossy(seq))?; 167 | p_ctg_lengths.push((n0.clone(), seq.len())) 168 | } 169 | } 170 | log::info!("primary ctg stats"); 171 | log_asm_summary(p_ctg_lengths); 172 | log::info!("associated ctg stats"); 173 | log_asm_summary(a_ctg_lengths); 174 | Ok(()) 175 | } 176 | -------------------------------------------------------------------------------- /misc/logo.svg: -------------------------------------------------------------------------------- 1 
// Peregrine Assembler and SHIMMER Genome Assembly Toolkit
// 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin
//
// This Source Code Form is subject to the terms of the
// Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
//
// You should have received a copy of the license along with this
// work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.

use std::fs::File;
use std::io::prelude::*;
use std::io::{self, BufReader, SeekFrom};

/// A single sequence record: raw id bytes (header up to the first space) and
/// raw sequence bytes.
pub struct SeqRec {
    pub id: Vec<u8>,
    pub seq: Vec<u8>,
}

/// Input flavor, detected by peeking at the first byte of the stream.
enum Fastx {
    FastQ,
    FastA,
}

/// Unified record reader over FASTA and FASTQ streams.
pub struct FastxReader<R> {
    inner: R,
    t: Fastx,
}

impl<R: BufRead> FastxReader<R> {
    /// Peek one byte to decide FASTA ('>') vs FASTQ ('@'); anything else is
    /// treated as FASTA.  The peeked byte is consumed, which is exactly what
    /// the per-record parsers expect (they resume just past the marker).
    ///
    /// Returns an error for an empty stream.
    pub fn new(mut inner: R, filename: &String) -> Result<Self, io::Error> {
        let t: Fastx;
        {
            let r = inner.by_ref();
            let mut buf = Vec::<u8>::new();
            r.take(1).read_to_end(&mut buf)?;
            if buf.len() < 1 {
                return Err(io::Error::new(
                    io::ErrorKind::Other,
                    format!("empty file: {}", filename),
                ));
            }
            match buf[0] {
                b'>' => t = Fastx::FastA,
                b'@' => t = Fastx::FastQ,
                _ => t = Fastx::FastA,
            }
        }
        Ok(Self { inner, t })
    }

    /// Fetch the next record, or None at end of input.
    pub fn next_rec(&mut self) -> Option<io::Result<SeqRec>> {
        match self.t {
            Fastx::FastA => self.fasta_next_rec(),
            Fastx::FastQ => self.fastq_next_rec(),
        }
    }

    /// Naive FASTA parser for the next record.  The id is the header up to the
    /// first space; sequence lines up to the next '>' are concatenated.
    ///
    /// Bug fix: the original had dead-code statements `Some(res);` that
    /// silently dropped I/O errors; read errors are now returned to the caller.
    pub fn fasta_next_rec(&mut self) -> Option<io::Result<SeqRec>> {
        let mut id_tmp = Vec::<u8>::with_capacity(512);
        let mut seq = Vec::<u8>::with_capacity(1 << 14);

        // header line; the leading '>' was already consumed by new()/the
        // previous record's sequence read
        match self.inner.read_until(b'\n', &mut id_tmp) {
            Err(e) => return Some(Err(e)),
            Ok(0) => return None, // EOF: no more records
            Ok(_) => (),
        }
        // id = header up to the first space, with '\n'/'\r'/' ' stripped
        let mut r = BufReader::new(&id_tmp[..]);
        let mut id = Vec::<u8>::with_capacity(512);
        if let Err(e) = r.read_until(b' ', &mut id) {
            return Some(Err(e));
        }
        let id = id
            .into_iter()
            .filter(|c| *c != b'\n' && *c != b' ' && *c != b'\r')
            .collect();
        // sequence: everything up to the next '>' (or EOF), newlines stripped
        if let Err(e) = self.inner.read_until(b'>', &mut seq) {
            return Some(Err(e));
        }
        let seq = seq
            .into_iter()
            .filter(|c| *c != b'\n' && *c != b'>' && *c != b'\r')
            .collect();
        Some(Ok(SeqRec { id, seq }))
    }

    /// Naive FASTQ parser for the next record.  QV strings are ignored.
    /// Assumes the sequence fits on one line (standard 4-line FASTQ).
    ///
    /// Bug fixes: read errors were silently dropped via dead-code `Some(res);`
    /// statements, and the final record of a file was discarded (hitting EOF
    /// while seeking the next '@' returned None before yielding the record
    /// that had just been parsed).  EOF detection now happens on the header
    /// read of the following call instead.
    pub fn fastq_next_rec(&mut self) -> Option<io::Result<SeqRec>> {
        let mut buf = Vec::<u8>::with_capacity(512);
        let mut id_tmp = Vec::<u8>::with_capacity(512);
        let mut seq = Vec::<u8>::with_capacity(1 << 14);

        // header line; we are positioned just past the leading '@'
        match self.inner.read_until(b'\n', &mut id_tmp) {
            Err(e) => return Some(Err(e)),
            Ok(0) => return None, // EOF: no more records
            Ok(_) => (),
        }
        // id = header up to the first space, with '\n'/'\r'/' ' stripped
        let mut r = BufReader::new(&id_tmp[..]);
        let mut id = Vec::<u8>::with_capacity(512);
        if let Err(e) = r.read_until(b' ', &mut id) {
            return Some(Err(e));
        }
        let id = id
            .into_iter()
            .filter(|c| *c != b'\n' && *c != b' ' && *c != b'\r')
            .collect();
        // sequence line
        if let Err(e) = self.inner.read_until(b'\n', &mut seq) {
            return Some(Err(e));
        }
        let seq = seq
            .into_iter()
            .filter(|c| *c != b'\n' && *c != b'\r')
            .collect();
        let rec = SeqRec { id, seq };
        // skip the '+' separator line and the QV line
        if let Err(e) = self.inner.read_until(b'+', &mut buf) {
            return Some(Err(e));
        }
        if let Err(e) = self.inner.read_until(b'\n', &mut buf) {
            return Some(Err(e));
        }
        if let Err(e) = self.inner.read_until(b'\n', &mut buf) {
            return Some(Err(e));
        }
        // position at the next record's id line; Ok(0) here just means this
        // was the last record, which is still returned
        match self.inner.read_until(b'@', &mut buf) {
            Err(e) => Some(Err(e)),
            Ok(_) => Some(Ok(rec)),
        }
    }
}

/// Compute per-base homopolymer/dimer flags for a 2-bit encoded sequence.
///
/// We use a hybrid approach to handle homopolymer sequence.  In some cases it
/// is useful to know a base is part of a long homopolymer (or dinucleotide
/// repeat), since such bases are prone to insertion/deletion errors.  The
/// sequences are not compressed; instead those bases are marked so they can be
/// ignored later if desired.
///
/// Flag values: 0b0100 = homopolymer-masked, 0b1000 = dimer-masked, 0b0000 =
/// unmasked.  The first four bases of a run stay unmasked.  The returned
/// vector has the same length as `seq0`.
///
/// Bug fix: `seq0len - 2` / `seq0len - 4` underflowed (panic in debug builds)
/// for inputs shorter than 4 bases; saturating_sub makes short inputs
/// well-defined (all-zero flags) without changing behavior for longer input.
fn get_hpc_flag(seq0: &Vec<u8>) -> Vec<u8> {
    let mut flag = Vec::<u8>::with_capacity(seq0.len());
    let mut i = 0_usize;
    let seq0len = seq0.len();
    while i < seq0len {
        // extend a homopolymer run: j ends at the last matching position
        // within the scan bound
        let mut j = i;
        while j < seq0len.saturating_sub(2) && seq0[j] == seq0[j + 1] {
            j += 1;
        }
        if j != i {
            // mask HP > 5 bases: positions 0..4 of the run stay unmasked
            let mut count = 0_u32;
            while i <= j {
                if count < 4 {
                    flag.push(0b0000);
                } else {
                    flag.push(0b0100);
                }
                count += 1;
                i += 1;
            }
        } else {
            // dimer (dinucleotide repeat) case, advancing two bases at a time
            let mut j = i;
            while j < seq0len.saturating_sub(4)
                && seq0[j] == seq0[j + 2]
                && seq0[j + 1] == seq0[j + 3]
            {
                j += 2;
            }
            if j != i {
                let mut count = 0_u32;
                while i <= j {
                    if count < 4 {
                        flag.push(0b0000);
                        flag.push(0b0000);
                    } else {
                        flag.push(0b1000);
                        flag.push(0b1000);
                    }
                    i += 2;
                    count += 2;
                }
            } else {
                flag.push(0x0000);
                i += 1;
            }
        }
    }
    flag
}
193 | // 'A' = 0bxxx00, 'C' = 0bxx01, 'G' = 0bxx10, 'T' = 0bxx11 194 | // flag: 0b10xx = hp tagged, 0b00xx = non hp tagged 195 | // 0b1100 (12) : None base 196 | 197 | let fourbit_map_f: [u8; 256] = [ 198 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 199 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 200 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 1, 12, 201 | 12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 3, 12, 12, 12, 12, 12, 12, 12, 202 | 12, 12, 12, 12, 12, 0, 12, 1, 12, 12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 203 | 12, 3, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 204 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 205 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 206 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 207 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 208 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 209 | 12, 12, 12, 210 | ]; 211 | let len = s.len(); 212 | let mut out_s = Vec::::with_capacity(len); 213 | let mut f_seq = Vec::::with_capacity(len); 214 | let mut r_seq = Vec::::with_capacity(len); 215 | for p in 0..len { 216 | let rp = len - 1 - p; 217 | //let code = ((fourbit_map_f[s[rp] as usize] ^ 0x03) << 4) | fourbit_map_f[s[p] as usize]; 218 | //out_s.push(code); 219 | f_seq.push(fourbit_map_f[s[p] as usize]); 220 | r_seq.push(fourbit_map_f[s[rp] as usize] ^ 0b0011); 221 | } 222 | 223 | let f_flag = get_hpc_flag(&f_seq); 224 | let r_flag = get_hpc_flag(&r_seq); 225 | for p in 0..len { 226 | out_s.push(((r_flag[p] | r_seq[p]) << 4) | (f_flag[p] | f_seq[p])); 227 | } 228 | out_s 229 | } 230 | 231 | pub fn 
build(seq_list_file: &String, out_prefix: &String) -> Result { 232 | // given a list of file in `seq_list_file`, read the sequences to building the sequence 233 | // database and index 234 | 235 | let seqdb_name = format!("{}.seqdb", out_prefix); 236 | log::info!("create seq db: {}", seqdb_name); 237 | let mut out_db_file = File::create(seqdb_name)?; 238 | let seqidx_name = format!("{}.idx", out_prefix); 239 | log::info!("create seq index: {}", seqidx_name); 240 | let mut out_idx_file = File::create(seqidx_name)?; 241 | let mut start = 0_usize; 242 | let mut seq_id = 0_u32; 243 | 244 | log::info!("get input files from: {}", seq_list_file); 245 | let f = File::open(seq_list_file)?; 246 | let seq_list_buf = BufReader::new(f); 247 | 248 | for fastx_file in seq_list_buf.lines() { 249 | let input_fn = fastx_file.unwrap(); 250 | log::info!("input file: {}", input_fn); 251 | let metadata = std::fs::metadata(&input_fn)?; 252 | if !metadata.is_file() || metadata.len() < (1 << 16) { 253 | log::info!( 254 | "input file: {} may not be proper input file (filesize = {}), ignore", 255 | input_fn, 256 | metadata.len() 257 | ); 258 | continue; 259 | } 260 | let input_file = File::open(&input_fn)?; 261 | let mut reader = BufReader::new(input_file); 262 | let mut is_gzfile = false; 263 | { 264 | let r = reader.by_ref(); 265 | let mut buf = Vec::::new(); 266 | let _ = r.take(2).read_to_end(&mut buf); 267 | if buf == [0x1F_u8, 0x8B_u8] { 268 | log::info!("input file: {} detected as gz-compressed file", input_fn); 269 | is_gzfile = true; 270 | } 271 | } 272 | 273 | let _ = reader.seek(SeekFrom::Start(0)); 274 | let mut seqs = Vec::<(u32, Vec, Vec)>::new(); 275 | if is_gzfile { 276 | let fastx_buf = BufReader::new(MultiGzDecoder::new(&mut reader)); 277 | let mut fastx_reader = FastxReader::new(fastx_buf, &input_fn)?; 278 | while let Some(r) = fastx_reader.next_rec() { 279 | let r = r.unwrap(); 280 | if r.seq.len() < 500 { 281 | //ignore very short reads 282 | continue; 283 | } 284 | 
seqs.push((seq_id, r.id, r.seq)); 285 | seq_id += 1; 286 | } 287 | } else { 288 | let mut fastx_reader = FastxReader::new(reader, &input_fn)?; 289 | while let Some(r) = fastx_reader.next_rec() { 290 | let r = r.unwrap(); 291 | if r.seq.len() < 500 { 292 | //ignore very short reads 293 | continue; 294 | } 295 | seqs.push((seq_id, r.id, r.seq)); 296 | seq_id += 1; 297 | } 298 | } 299 | let biseq = seqs 300 | .par_iter() 301 | .map(|(id, name, s)| (*id, name.clone(), encode_biseq(s))) 302 | .collect::, Vec)>>(); 303 | 304 | biseq.iter().for_each(|(id, name, s)| { 305 | let _ = out_db_file.write(&s); 306 | let _ = writeln!( 307 | out_idx_file, 308 | "{:09} {} {} {}", 309 | id, 310 | String::from_utf8_lossy(name), 311 | s.len(), 312 | start 313 | ); 314 | start += s.len(); 315 | }); 316 | } 317 | log::info!("total number of reads indexed: {}", seq_id); 318 | log::info!("total number of bases: {}", start); 319 | log::info!("average read length: {}", start as f32 / seq_id as f32); 320 | Ok(start) 321 | } 322 | -------------------------------------------------------------------------------- /src/bin/utils/ovlp_ec.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 
/// Byte range of one read inside the mmapped sequence database.
#[derive(Clone, Copy)]
struct ReadLocation {
    start: usize,
    len: usize,
}

/// One parsed overlap ("O") record between reads `rid0` and `rid1`.
#[derive(Debug, Copy, Clone)]
struct Overlap {
    rid0: u32,
    rid1: u32,
    strand1: u8,
    len0: u32,
    len1: u32,
    d_left: i32,
    d_right: i32,
    bgn0: u32,
    end0: u32,
    bgn1: u32,
    end1: u32,
    dist: u32,
    idt: f32,
    dist_c: u32,
    max_dist_c: u32,
    idt_c: f32,
    flag: u8,
}
// flag bit field
// 0x01: the rid0 is chimer
// 0x02: the rid0 and rid1 are compatible pair
// 0x04: the rid1 is the best right pair
// 0x08: the rid1 is the best left pair
// 0x10: the rid1 is a chimer
// 0x20: the rid1 is contained
// 0x40: the rid0 is contained

/// Build an `Overlap` from a whitespace-split "O" record; `v[0]` is the record
/// tag, fields start at `v[1]`.  Panics on malformed records, exactly like the
/// per-field `parse().unwrap()` it replaces.
fn build_overlap(v: Vec<&str>) -> Overlap {
    fn num<T: std::str::FromStr>(s: &str) -> T
    where
        <T as std::str::FromStr>::Err: std::fmt::Debug,
    {
        s.parse().unwrap()
    }
    Overlap {
        rid0: num(v[1]),
        rid1: num(v[2]),
        strand1: num(v[3]),
        len0: num(v[4]),
        len1: num(v[5]),
        d_left: num(v[6]),
        d_right: num(v[7]),
        bgn0: num(v[8]),
        end0: num(v[9]),
        bgn1: num(v[10]),
        end1: num(v[11]),
        dist: num(v[12]),
        idt: num(v[13]),
        dist_c: num(v[14]),
        max_dist_c: num(v[15]),
        idt_c: num(v[16]),
        flag: num(v[17]),
    }
}

/// Render an `Overlap` in the same space-separated field order expected by
/// `build_overlap` (identities printed with two decimals).
fn _format_overlap(o: Overlap) -> String {
    let head = format!(
        "{} {} {} {} {} {} {} {} {} {} {} {}",
        o.rid0,
        o.rid1,
        o.strand1,
        o.len0,
        o.len1,
        o.d_left,
        o.d_right,
        o.bgn0,
        o.end0,
        o.bgn1,
        o.end1,
        o.dist
    );
    format!(
        "{} {:.2} {} {} {:.2} {}",
        head, o.idt, o.dist_c, o.max_dist_c, o.idt_c, o.flag
    )
}

/// Read one overlap file and group its "O" records by the first read id
/// (`rid0`).  Lines whose first token is not "O" are ignored; an empty line
/// splits to [""] so the `v[0]` match is safe.
fn build_read_ovlp_data<P>(filename: P) -> Result<OverlapMap, io::Error>
where
    P: AsRef<Path>,
{
    let mut rid2ovlp = OverlapMap::default();
    let mut buffer = String::new();

    let mut file = File::open(filename)?;
    file.read_to_string(&mut buffer)?;
    for line in buffer.split("\n") {
        let mut v: Vec<&str> = Vec::<&str>::with_capacity(24); // we need pre-allocate some space for performance
        line.split(' ').for_each(|c| v.push(c));
        match v[0] {
            "O" => {
                let ovlp = build_overlap(v);
                rid2ovlp
                    .entry(ovlp.rid0)
                    .or_insert_with(|| vec![])
                    .push(ovlp);
            }
            _ => (),
        }
    }
    Ok(rid2ovlp)
}

/// Compute a consensus (error-corrected) version of `seq0` from supporting
/// sequences, all in the 4-bit one-hot base encoding.
///
/// Each entry of `support_seq` is `([b0, e0, b1, e1], seq1)`: the aligned
/// window on seq0 and on the support read.  Every support read is re-aligned
/// against its seq0 window with `match_reads`; matched columns bump the
/// per-position coverage, and each alignment difference ("delta point") is
/// tallied in `deltas`.
// NOTE(review): the exact semantics of match_reads / its delta points
// (dpt.x, dpt.y, dpt.dk) come from shmmrutils and are assumed here:
// dk > 0 appears to mean a deletion (emits 0), otherwise the inserted
// base is taken from seq1 — TODO confirm against shmmrutils.
fn get_consensue_seq(
    seq0: Vec<u8>,
    support_seq: Vec<([usize; 4], Vec<u8>)>,
    tol: f64,
    min_ec_cov: Option<u16>,
) -> Option<Vec<u8>> {
    let min_ec_cov = min_ec_cov.unwrap_or(1);
    // 4bit encoded seq consensus
    let mut cov = vec![0 as u16; seq0.len()];
    let mut deltas = FxHashMap::<u32, u16>::default();
    for (offsets, seq1) in support_seq {
        let [b0, e0, b1, e1] = offsets;
        let seq0_trim = seq0[b0..e0].to_vec();
        let seq1_trim = seq1[b1..e1].to_vec();
        if let Some(ovlpmatch) = match_reads(&seq0_trim, &seq1_trim, true, tol, 1200, 32) {
            // coverage over the matched span, shifted back to seq0 coordinates
            for p in ovlpmatch.bgn0..ovlpmatch.end0 {
                cov[(p as usize) + b0] += 1;
            }
            let mut dpts = ovlpmatch.deltas.unwrap();
            dpts.reverse();
            let mut d = 0_u8; // rank of consecutive deltas at the same seq0 position
            let mut px = 0_u32; // previous delta position, to detect runs
            for dpt in dpts {
                let c = if dpt.dk > 0 {
                    0
                } else {
                    seq1_trim[dpt.y as usize - 1]
                };
                let cx = dpt.x + (b0 as u32) - 1;
                if cx != px {
                    d = 0;
                } else {
                    d += 1;
                }

                // pack (position, rank, base) into one key;
                // this limits the length of the read to be corrected to 2^20
                let key = cx << 12 | (d as u32) << 4 | (c as u32);
                let counter = deltas.entry(key).or_insert(0);
                *counter += 1;

                px = cx;
            }
        }
    }

    // Keep, per (position, rank), the best-supported base.  A delta needs at
    // least 3 observations and at least half the local coverage to count.
    let mut delta_best = FxHashMap::<u32, (u16, u8)>::default();
    for k in deltas.keys() {
        let v = deltas.get(k).unwrap();
        let p = k >> 12;
        if *v < cov[p as usize] >> 1 || *v < 3 {
            continue;
        }
        let key = *k >> 4; // drop the base nibble: key = position << 8 | rank
        let counter = delta_best.entry(key).or_insert((*v, (*k & 0xF) as u8));
        if *v > counter.0 {
            delta_best.insert(key, (*v, (*k & 0xF) as u8));
        }
    }

    // Sorted ascending, so the last insert per position carries the highest
    // delta rank observed there.
    let mut keys = delta_best.keys().collect::<Vec<&u32>>();
    keys.sort();
    let mut max_delta = FxHashMap::<u32, u8>::default();
    for k in keys.iter() {
        let x = *k >> 8;
        max_delta.insert(x, (*k & 0xFF) as u8);
    }

    // Rebuild the sequence, applying the accepted insertions (a delta whose
    // base nibble is 0 encodes a deletion and emits nothing).
    let mut consensus_seq = Vec::<u8>::new();
    let mut consensus_seq_cov = Vec::<u16>::new();
    for p in 0..seq0.len() {
        if !max_delta.contains_key(&(p as u32)) {
            consensus_seq.push(seq0[p]);
            consensus_seq_cov.push(cov[p]);
            continue;
        }
        let max_d = *max_delta.get(&(p as u32)).unwrap();
        for d in 0..max_d + 1 {
            let k = ((p as u32) << 8) | (d as u32);
            if let Some(v) = delta_best.get(&k) {
                if v.1 != 0 {
                    if d == 0 {
                        // first accepted insertion: keep the original base too
                        consensus_seq.push(seq0[p]);
                        consensus_seq_cov.push(cov[p]);
                    }
                    consensus_seq.push(v.1);
                    consensus_seq_cov.push(cov[p]);
                }
            }
        }
    }
    assert!(consensus_seq_cov.len() == consensus_seq.len());

    // Trim both ends to the first/last position with sufficient coverage.
    let mut bgn = Option::<usize>::None;
    for i in 0..consensus_seq_cov.len() {
        if consensus_seq_cov[i] >= min_ec_cov {
            bgn = Some(i);
            break;
        }
    }

    let mut end = Option::<usize>::None;
    // NOTE(review): underflows if the consensus is empty — callers always pass
    // non-empty reads, but worth confirming.
    let len = consensus_seq_cov.len() - 1;
    for i in 0..consensus_seq_cov.len() {
        if consensus_seq_cov[len - i] >= min_ec_cov {
            end = Some(len - i);
            break;
        }
    }

    if bgn.is_none() || end.is_none() {
        None
    } else {
        let b = bgn.unwrap();
        let e = end.unwrap();
        if min_ec_cov >= 3 {
            // strict mode: every interior position must reach min_ec_cov
            for i in b..e {
                if consensus_seq_cov[i] < min_ec_cov {
                    return None;
                }
            }
        }
        // discard empty or sub-1kb corrected sequences
        if b > e || e < b + 1000 {
            None
        } else {
            Some(consensus_seq[b..e].to_vec())
        }
    }
}

/// Error-correct read `r`: decode its 2-bit db bytes to the 4-bit one-hot
/// encoding, gather supporting sequences from overlaps flagged as compatible
/// (flag bit 0x02), run the consensus, and map the result back to ASCII.
fn get_corrected_seq(
    r: u32,
    ovlp: &Vec<Overlap>,
    read_index: &Vec<ReadLocation>,
    readsdb: &Mmap,
    tol: f64,
    min_ec_cov: Option<u16>,
) -> Option<Vec<u8>> {
    let rid0 = r;
    let rloc0 = read_index[rid0 as usize];
    let s0 = rloc0.start;
    let len0 = rloc0.len;
    let e0 = s0 + len0;
    let mut seq0 = Vec::<u8>::with_capacity(20480);

    // we need to map 2bit to 4bit (one-hot) so we can use 0b0000 for special cases
    let basemap = [0b0001, 0b0010, 0b0100, 0b1000];
    for c in &readsdb[s0..e0] {
        seq0.push(basemap[(c & 0b0011) as usize]);
    }

    let mut support_seqs = Vec::<([usize; 4], Vec<u8>)>::new();
    for vv in ovlp.iter() {
        if vv.flag & 0b0010 == 0 {
            // only overlaps marked as a compatible pair contribute
            continue;
        }
        let rid1 = vv.rid1;
        let rloc1 = read_index[rid1 as usize];
        let strand: u8 = vv.strand1;
        let s1 = rloc1.start;
        let len1 = rloc1.len;
        let e1 = s1 + len1;
        let mut seq1 = Vec::<u8>::with_capacity(20480);

        // low nibble holds the forward strand, high nibble the reverse
        // complement (see encode_biseq); pick per overlap orientation
        if strand == 0 {
            for c in &readsdb[s1..e1] {
                seq1.push(basemap[(c & 0b0011) as usize]);
            }
        } else {
            for c in &readsdb[s1..e1] {
                seq1.push(basemap[((c >> 4) & 0b0011) as usize]);
            }
        }
        let b0 = vv.bgn0 as usize;
        let e0 = vv.end0 as usize;
        let b1 = vv.bgn1 as usize;
        let e1 = vv.end1 as usize;
        if b1 > e1 {
            // malformed/inverted coordinates: skip this support read
            continue;
        }
        support_seqs.push(([b0, e0, b1, e1], seq1));
    }
    if let Some(c_seq) = get_consensue_seq(seq0, support_seqs, tol, min_ec_cov) {
        // one-hot back to ASCII; anything unexpected becomes 'N'
        let mut out_seq = Vec::<u8>::new();
        for c in c_seq {
            match c {
                0b0001 => out_seq.push(b'A'),
                0b0010 => out_seq.push(b'C'),
                0b0100 => out_seq.push(b'G'),
                0b1000 => out_seq.push(b'T'),
                _ => out_seq.push(b'N'),
            }
        }
        Some(out_seq)
    } else {
        None
    }
}
}; 379 | //println!("{:?}", args); 380 | let _ = log::info!("prefix: {}", prefix); 381 | let _ = log::info!("out_prefix: {}", out_prefix); 382 | let infile_pattern = [prefix.clone(), "*".to_string()].concat(); 383 | 384 | let mut children = Vec::new(); 385 | //let mut chunk: u8 = 0; 386 | for entry in glob(&infile_pattern).expect("Failed to read glob pattern") { 387 | match entry { 388 | Ok(path) => { 389 | // println!("{:?}", path.display()); 390 | let child = thread::spawn(move || { 391 | let rid2ovlp = build_read_ovlp_data(path); 392 | rid2ovlp 393 | }); 394 | children.push(child); 395 | } 396 | Err(e) => log::error!("{:?}", e), 397 | } 398 | //chunk += 1; 399 | } 400 | 401 | let mut rid2ovlp_all = OverlapMap::default(); 402 | for child in children { 403 | let rid2ovlp_p = child.join().expect("oops! the child thread panicked")?; 404 | rid2ovlp_all.extend(rid2ovlp_p); 405 | } 406 | 407 | let contained = FxHashSet::::default(); 408 | 409 | let readsdb = std::sync::Arc::new(readsdb); 410 | let read_index = std::sync::Arc::new(read_index); 411 | let contained = std::sync::Arc::new(contained); 412 | let rid2ovlp_all = std::sync::Arc::new(rid2ovlp_all); 413 | let read_name = std::sync::Arc::new(read_name); 414 | let pool = ThreadPool::new(parameters.nthreads as usize); 415 | for chunk in 0..parameters.nchunks { 416 | let readsdb = readsdb.clone(); 417 | let read_index = read_index.clone(); 418 | let contained = contained.clone(); 419 | let rid2ovlp_all = rid2ovlp_all.clone(); 420 | let read_name = read_name.clone(); 421 | let out_prefix = out_prefix.clone(); 422 | let nchunks = parameters.nchunks; 423 | let tol = parameters.tol; 424 | let min_ec_cov = parameters.min_ec_cov; 425 | pool.execute(move || { 426 | let out_file_name = format!("{}_{:02}.fa", out_prefix, chunk + 1); 427 | let mut f = File::create(out_file_name).unwrap(); 428 | for r in rid2ovlp_all.keys() { 429 | if (*r % nchunks) != chunk { 430 | continue; 431 | } 432 | if contained.contains(r) { 433 | 
// Returns a buffered line iterator over `filename`.
// The output is wrapped in a Result to allow matching on errors.
// A 10 MB buffer is used because the index files read through this helper
// are large and line-dense.
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    let reader = io::BufReader::with_capacity(10000000, File::open(filename)?);
    Ok(reader.lines())
}
51 | rdata: &mut rusage, 52 | ) -> Result<()> { 53 | log_resource( 54 | &format!("BGN: get_ovlp, input_reads: {}", input_reads), 55 | rdata, 56 | ); 57 | let output_prefix = format!("{}/{}", &work_dir, &prefix); 58 | 59 | log_resource("BGN: bulding sequence database", rdata); 60 | let nbase = build_sdb::build(&input_reads, &output_prefix)?; 61 | log_resource("END: bulding sequence database", rdata); 62 | if delete_input { 63 | let seq_list_buf = BufReader::new(File::open(&input_reads).unwrap()); 64 | for fastx_file in seq_list_buf.lines() { 65 | let fastx_file = fastx_file?; 66 | remove_file(&fastx_file)?; 67 | } 68 | } 69 | 70 | let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix)); 71 | let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix)); 72 | let shmmer_idx = cat_path(&work_dir, &format!("{}-shmr", &prefix)); 73 | 74 | // step 2: build shimmer index 75 | log_resource("BGN: bulding shimmer", rdata); 76 | build_idx::build(&seqdb, &seqidx, &shmmer_idx, ¶meters); 77 | log_resource("END: bulding shimmer", rdata); 78 | 79 | //step 3: overlap reads 80 | let ovlp_out = format!("{}/{}-ovlp", &work_dir, &prefix); 81 | log_resource("BGN: overlapping", rdata); 82 | let system = sysinfo::System::new_all(); 83 | let free_mem = system.total_memory() - system.used_memory(); 84 | if (free_mem as f64) < ((nbase >> 10) as f64 * 1.5) { 85 | log::warn!("free memory is less than 1.5 x (total number of bases)"); 86 | log::warn!( 87 | "free memory = {}kb, total bases / 1024 = {}", 88 | free_mem, 89 | nbase >> 10 90 | ); 91 | } 92 | ovlp::ovlp(&seqdb, &seqidx, &shmmer_idx, &ovlp_out, ¶meters)?; 93 | log_resource("END: overlapping", rdata); 94 | 95 | log_resource( 96 | &format!("END: get_ovlp - input_reads: {}", input_reads), 97 | rdata, 98 | ); 99 | Ok(()) 100 | } 101 | 102 | fn main() -> Result<()> { 103 | let mut rdata = unsafe { MaybeUninit::uninit().assume_init() }; 104 | let _res = unsafe { getrusage(RUSAGE_SELF, &mut rdata) }; 105 | 106 | let matches = 
clap_app!(pg_asm => 107 | (version: VERSION_STRING) 108 | (author: "Jason Chin ") 109 | (about: " 110 | Peregrine-2021 genome assembler 111 | pg_asm: the main workflow entry for end-to-end assembly from the reads 112 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/") 113 | (@arg input_reads: +required "Path to a file that contains the list of reads in .fa .fa.gz .fastq or fastq.gz formats") 114 | (@arg work_dir: +required "The path to a work directory for intermediate files and the results") 115 | (@arg NTHREADS: +takes_value "Number of threads") 116 | (@arg NCHUNKS: +takes_value "Number of partition") 117 | (@arg w: -w +takes_value "Window size [default: 80]") 118 | (@arg k: -k +takes_value "Kmer size [default: 56]") 119 | (@arg r: -r +takes_value "Reduction factor [default: 6]") 120 | (@arg tol: -t --tol +takes_value "Alignment tolerance [default: 0.01]") 121 | (@arg layout_method: -l +takes_value "layout version [default: 2]") 122 | (@arg bestn: --bestn -b +takes_value "number of best overlaps for initial graph [default: 6]") 123 | (@arg keep: --keep "keep intermediate files") 124 | (@arg fast: --fast "run the assembler in the fast mode") 125 | (@arg no_resolve: --no_resolve "disable resolving repeats / dups at the end") 126 | (@arg min_ec_cov: -c --min_ec_cov +takes_value "Minimum error coverage [default: 1]") 127 | (@arg log: --log +takes_value "log level: DBBUG or INFO (default)") 128 | ).get_matches(); 129 | 130 | let log_level = match matches.value_of("log").unwrap_or("INFO") { 131 | "DEBUG" => log::LevelFilter::Debug, 132 | _ => log::LevelFilter::Info, 133 | }; 134 | 135 | SimpleLogger::new() 136 | .with_level(log_level) 137 | .with_utc_timestamps() 138 | .init() 139 | .unwrap(); 140 | 141 | let input_reads = matches.value_of("input_reads").unwrap().to_string(); 142 | let work_dir = matches.value_of("work_dir").unwrap().to_string(); 143 | 144 | let prefix = "reads".to_string(); 145 | 146 | let keep = matches.is_present("keep"); 147 | let 
fastmode = matches.is_present("fast"); 148 | let no_resolve = matches.is_present("no_resolve"); 149 | 150 | let physical_cpus = num_cpus::get_physical(); 151 | let nthreads = matches 152 | .value_of("NTHREADS") 153 | .unwrap_or(&physical_cpus.to_string()) 154 | .parse::() 155 | .unwrap(); 156 | 157 | let nchunks: u32; 158 | if matches.is_present("NCHUNKS") { 159 | nchunks = matches 160 | .value_of("NCHUNKS") 161 | .unwrap() 162 | .to_string() 163 | .parse::() 164 | .unwrap(); 165 | } else { 166 | let physical_cpus = physical_cpus as u32; 167 | match physical_cpus { 168 | 1..=5 => nchunks = 16 + physical_cpus * 4, 169 | 6..=12 => nchunks = 2 + physical_cpus * 3, 170 | 13..=19 => nchunks = physical_cpus * 2, 171 | _ => nchunks = physical_cpus, 172 | } 173 | } 174 | 175 | let wsize = matches 176 | .value_of("w") 177 | .unwrap_or("80") 178 | .parse::() 179 | .unwrap(); 180 | 181 | let ksize = matches 182 | .value_of("k") 183 | .unwrap_or("56") 184 | .parse::() 185 | .unwrap(); 186 | 187 | let rfactor = matches.value_of("r").unwrap_or("6").parse::().unwrap(); 188 | 189 | let tol = matches 190 | .value_of("tol") 191 | .unwrap_or("0.01") 192 | .parse::() 193 | .unwrap(); 194 | 195 | let min_ec_cov = matches 196 | .value_of("min_ec_cov") 197 | .unwrap_or("1") 198 | .parse::() 199 | .unwrap(); 200 | 201 | let layout_method = matches 202 | .value_of("layout_method") 203 | .unwrap_or("2") 204 | .parse::() 205 | .unwrap(); 206 | 207 | let bestn = matches 208 | .value_of("bestn") 209 | .unwrap_or("6") 210 | .parse::() 211 | .unwrap(); 212 | 213 | let parameters = Parameters { 214 | nchunks: nchunks, 215 | nthreads: nthreads, 216 | w: wsize, 217 | k: ksize, 218 | r: rfactor, 219 | tol: tol, 220 | min_ec_cov: min_ec_cov, 221 | }; 222 | 223 | log::info!("pg_asm {}", VERSION_STRING); 224 | log::info!( 225 | "command: {}", 226 | std::env::args().collect::>().join(" ") 227 | ); 228 | let cdir = std::env::current_dir()?; 229 | log::info!("current dir: {}", 
cdir.as_os_str().to_string_lossy()); 230 | 231 | let start_wall_clock_time = SystemTime::now(); 232 | log::info!("pg_asm run start"); 233 | 234 | log_resource("BGN: pg_asm", &mut rdata); 235 | log::info!( 236 | "pg_asm run parameters: w:{}, k:{}, r:{}, tol:{} bestn:{}", 237 | wsize, 238 | ksize, 239 | rfactor, 240 | tol, 241 | bestn 242 | ); 243 | 244 | log::info!("faster mode: {}", fastmode); 245 | log::info!("use layout method: {}", layout_method); 246 | log::info!("keep intermediate files: {}", keep); 247 | log::info!("number of threads: {}", nthreads); 248 | log::info!("number of chunks: {}", nchunks); 249 | log::info!("input read file: {}", input_reads); 250 | log::info!("working directory: {}", work_dir); 251 | log::info!( 252 | "sys: number of physical CPU cores detected: {}", 253 | physical_cpus 254 | ); 255 | let system = sysinfo::System::new_all(); 256 | log::info!("sys: total memory: {} KB", system.total_memory()); 257 | log::info!("sys: used memory: {} KB", system.used_memory()); 258 | log::info!("sys: total swap: {} KB", system.total_swap()); 259 | log::info!("sys: used swap: {} KB", system.used_swap()); 260 | 261 | if !Path::new(&work_dir).exists() { 262 | create_dir_all(&work_dir)?; 263 | }; 264 | 265 | get_ovlps( 266 | &input_reads, 267 | &work_dir, 268 | &prefix, 269 | false, 270 | ¶meters, 271 | &mut rdata, 272 | )?; 273 | 274 | if fastmode { 275 | log::info!("Fast mode: ignore read level error correction"); 276 | // graph processing 277 | 278 | let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix)); 279 | let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix)); 280 | let ovlp_out = cat_path(&work_dir, &format!("{}-ovlp", &prefix)); 281 | let layout_prefix = cat_path(&work_dir, &"asm".to_string()); 282 | 283 | let layout_file = format!("{}_layout.dat", &layout_prefix); 284 | 285 | log_resource("BGN: ovlp2layout", &mut rdata); 286 | log::info!("use layout method: {}", layout_method); 287 | match layout_method { 288 | 1 => 
graph::ovlp2layout_v1(&ovlp_out, &layout_prefix, bestn), 289 | _ => dp_graph::ovlp2layout_v2(&ovlp_out, &layout_prefix, bestn)?, 290 | } 291 | log_resource("END: ovlp2layout", &mut rdata); 292 | 293 | // layout -> sequence 294 | let output_file_prefix = format!("{}/asm_ctgs", &work_dir); 295 | log_resource("BGN: layout2ctg", &mut rdata); 296 | layout::layout2ctg(&seqdb, &seqidx, &layout_file, &output_file_prefix)?; 297 | let _res = unsafe { getrusage(RUSAGE_SELF, &mut rdata) }; 298 | log_resource("END: layout2ctg", &mut rdata); 299 | 300 | } else { 301 | // error correction 302 | let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix)); 303 | let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix)); 304 | let shmmer_idx = cat_path(&work_dir, &format!("{}-shmr", &prefix)); 305 | let ovlp_out = cat_path(&work_dir, &format!("{}-ovlp", &prefix)); 306 | let ec_read_prefix = cat_path(&work_dir, &"ec_read".to_string()); 307 | 308 | log_resource("BGN: ovlp_ec", &mut rdata); 309 | ovlp_ec::ovlp_ec(&seqdb, &seqidx, &ovlp_out, &ec_read_prefix, ¶meters)?; 310 | log_resource("END: ovlp_ec", &mut rdata); 311 | 312 | if !keep { 313 | let ovlp_out_ptn = format!("{}*", ovlp_out); 314 | for out in glob::glob(&ovlp_out_ptn.as_str()).expect("error to delete file") { 315 | let out = out.unwrap().to_string_lossy().into_owned(); 316 | log::info!("remove {}", out); 317 | remove_file(out)?; 318 | } 319 | let shmmer_idx_ptn = format!("{}*", &shmmer_idx); 320 | for f in glob(&shmmer_idx_ptn.as_str()).expect("error to delete file") { 321 | let f = f.unwrap().to_string_lossy().into_owned(); 322 | log::info!("remove {}", f); 323 | remove_file(f)?; 324 | } 325 | 326 | remove_file(&seqdb)?; 327 | log::info!("remove {}", seqdb.to_string()); 328 | remove_file(&seqidx)?; 329 | log::info!("remove {}", seqidx.to_string()); 330 | } 331 | 332 | let ec_lst = cat_path(&work_dir, &"ec_reads.lst".to_string()); 333 | let mut ec_lst_file = BufWriter::new(File::create(&ec_lst).unwrap()); 334 | let 
ec_file_ptn = format!("{}*.fa", &ec_read_prefix); 335 | for f in glob(&ec_file_ptn.as_str()).unwrap() { 336 | writeln!(ec_lst_file, "{}", f.unwrap().to_string_lossy())?; 337 | } 338 | drop(ec_lst_file); //close the file 339 | 340 | log::info!("input reads: {}", ec_lst); 341 | log::info!("working directory: {}", work_dir); 342 | 343 | let prefix = "ec_reads".to_string(); 344 | if keep { 345 | get_ovlps(&ec_lst, &work_dir, &prefix, false, ¶meters, &mut rdata)?; 346 | } else { 347 | get_ovlps(&ec_lst, &work_dir, &prefix, true, ¶meters, &mut rdata)?; 348 | } 349 | 350 | if !keep { 351 | let ec_file_ptn = format!("{}*.fa", &ec_read_prefix); 352 | for f in glob(&ec_file_ptn.as_str()).unwrap() { 353 | let f = f.unwrap().to_string_lossy().into_owned(); 354 | log::info!("remove {}", f); 355 | remove_file(f)?; 356 | } 357 | let shmmer_idx = cat_path(&work_dir, &format!("{}-shmr", &prefix)); 358 | let shmmer_idx_ptn = format!("{}*", &shmmer_idx); 359 | for f in glob(&shmmer_idx_ptn.as_str()).expect("error to delete file") { 360 | let f = f.unwrap().to_string_lossy().into_owned(); 361 | log::info!("remove {}", f); 362 | remove_file(f)?; 363 | } 364 | } 365 | 366 | //step 4: graph processing 367 | 368 | let seqdb = cat_path(&work_dir, &format!("{}.seqdb", &prefix)); 369 | let seqidx = cat_path(&work_dir, &format!("{}.idx", &prefix)); 370 | let ovlp_out = cat_path(&work_dir, &format!("{}-ovlp", &prefix)); 371 | let layout_prefix = cat_path(&work_dir, &"asm".to_string()); 372 | let layout_file = format!("{}_layout.dat", &layout_prefix); 373 | 374 | log_resource("BGN: ovlp2layout", &mut rdata); 375 | log::info!("use layout method: {}", layout_method); 376 | match layout_method { 377 | 1 => graph::ovlp2layout_v1(&ovlp_out, &layout_prefix, bestn), 378 | _ => dp_graph::ovlp2layout_v2(&ovlp_out, &layout_prefix, bestn)?, 379 | } 380 | log_resource("END: ovlp2layout", &mut rdata); 381 | 382 | //step 5: layout -> sequence 383 | log_resource("BGN: layout2ctg", &mut rdata); 384 | let 
output_file_prefix = format!("{}/asm_ctgs", &work_dir); 385 | layout::layout2ctg(&seqdb, &seqidx, &layout_file, &output_file_prefix)?; 386 | let _res = unsafe { getrusage(RUSAGE_SELF, &mut rdata) }; 387 | log_resource("END: layout2ctg", &mut rdata); 388 | } 389 | if no_resolve { 390 | log::info!("ignore dup resolution"); 391 | } else { 392 | let ref_file = format!("{}/asm_ctgs_m.fa", &work_dir); 393 | let tgt_file = format!("{}/asm_ctgs_e0.fa", &work_dir); 394 | let out_file = format!("{}/asm_ctgs_e.fa", &work_dir); 395 | 396 | log_resource("BEN: dedup_a_ctgs", &mut rdata); 397 | dedup_target_seqs(&ref_file, &tgt_file, &out_file, wsize, ksize, rfactor)?; 398 | log_resource("END: dedup_a_ctgs", &mut rdata); 399 | 400 | let resolve_prefix = format!("{}/asm_ctgs_m", &work_dir); 401 | 402 | log_resource("BEN: resolve_ht", &mut rdata); 403 | resolve_ht(&ref_file, &resolve_prefix, wsize, ksize, rfactor)?; 404 | log_resource("END: resolve_ht", &mut rdata); 405 | } 406 | let (_, ut, st) = log_resource("END: pg_asm", &mut rdata); 407 | log::info!("pg_asm run end"); 408 | log::info!( 409 | "total user cpu time: {} seconds = {} hours", 410 | ut, 411 | ut as f32 / 60.0 / 60.0 412 | ); 413 | log::info!( 414 | "total system cpu time: {} seconds = {} hours", 415 | st, 416 | st as f32 / 60.0 / 60.0 417 | ); 418 | let elapsed_time = start_wall_clock_time.elapsed().unwrap().as_secs_f32(); 419 | log::info!( 420 | "total elapse time: {} seconds = {} hours", 421 | elapsed_time, 422 | elapsed_time / 60.0 / 60.0 423 | ); 424 | Ok(()) 425 | } 426 | -------------------------------------------------------------------------------- /src/bin/utils/shmmrutils.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 
/// A homopolymer-compressed (HPC) view of a read.
pub struct HPCSeq {
    pub s: Vec<u8>,  // kept 2-bit base codes
    pub p: Vec<u32>, // original coordinate of each kept base
}

/// Builds the homopolymer-compressed sequence from a 4-bit encoded read.
///
/// Bases whose upper flag bits (0b1100) are set are marked as homopolymer /
/// dimer tails and are dropped; every kept base records its original
/// coordinate in `p`.  As in the original implementation, the final base of
/// the read is never examined.
pub fn get_hpc_seq(seq0: &Vec<u8>) -> HPCSeq {
    let mut seq = HPCSeq {
        s: Vec::<u8>::with_capacity(seq0.len()),
        p: Vec::<u32>::with_capacity(seq0.len()),
    };
    let mut i = 0_usize;
    let seq0len = seq0.len();

    // FIX: the original condition `i < seq0len - 1` underflows when the
    // input is empty; `i + 1 < seq0len` is equivalent for non-empty input
    // and simply yields an empty result for an empty read.
    while i + 1 < seq0len {
        let b = seq0[i]; // 2bit flag + 2bit base
        if b & 0b1100 != 0 {
            i += 1; // base marked as hp/dimer tail
            continue;
        }
        seq.s.push(b & 0b0011);
        seq.p.push(i as u32);
        i += 1;
    }
    seq
}

/// Result of aligning two reads with `match_reads`.
pub struct OvlpMatch {
    pub m_size: u32, // aligned span size: (end0-bgn0 + end1-bgn1 + 2*dist) / 2
    pub dist: u32,   // number of edits inside the aligned span
    pub bgn0: u32,
    pub end0: u32,
    pub bgn1: u32,
    pub end1: u32,
    pub m_end0: u32, // end of the longest exact run on seq0
    pub m_end1: u32, // end of the longest exact run on seq1
    pub deltas: Option<Vec<DeltaPoint>>,
}

/// One edit event on the alignment path: position (x, y) and the diagonal
/// change `dk` relative to the previous step.
#[derive(Copy, Clone)]
pub struct DeltaPoint {
    pub x: u32,
    pub y: u32,
    pub dk: i32,
}

/// Backtracks the (d, k) breadcrumbs recorded during `match_reads` and
/// returns the delta points whose x coordinate lies strictly inside (s, e).
pub fn track_delta_point(
    delta_pts: &HashMap<(u32, i32), DeltaPoint>,
    d_final: u32,
    k_final: i32,
    s: u32,
    e: u32,
) -> Vec<DeltaPoint> {
    let mut dpts = Vec::<DeltaPoint>::with_capacity(d_final as usize);
    let mut d = d_final;
    let mut k = k_final;
    while d > 0 {
        let dpt = delta_pts.get(&(d, k)).unwrap();
        if dpt.x > s && dpt.x < e {
            dpts.push(*dpt);
        }
        d -= 1;
        k -= dpt.dk;
    }
    dpts
}
min_match_len = 1200; 100 | let len0 = seq0.len(); 101 | let len1 = seq1.len(); 102 | //println!("S {} {}", len0, len1); 103 | //let d_max = 64 + (0.01 * if len0 < len1 {len0 as f32} else {len1 as f32}) as u32; 104 | let d_max = 32 105 | + (tol 106 | * if len0 < len1 { 107 | len0 as f64 108 | } else { 109 | len1 as f64 110 | }) as u32; 111 | let max_band_width = bandwidth; 112 | let band_tolerance = bandwidth; 113 | let mut k_min = 0_i32; 114 | let mut k_max = 0_i32; 115 | let mut uv_map = HashMap::::new(); 116 | // uv_map: maping k to the u, v, which keep the d path end in k 117 | let mut delta_pts = HashMap::<(u32, i32), DeltaPoint>::new(); 118 | let mut x: u32; 119 | let mut y: u32; 120 | let mut x1: u32; 121 | let mut y1: u32; 122 | let mut best_m = -1_i32; 123 | let mut matched = false; 124 | let mut d_final = 0_u32; 125 | let mut k_final = 0_i32; 126 | let mut pre_k: i32; 127 | let mut start = false; 128 | let mut longest_match = 0_u32; 129 | let mut rtn = OvlpMatch { 130 | m_size: 0, 131 | dist: 0, 132 | bgn0: 0, 133 | end0: 0, 134 | bgn1: 0, 135 | end1: 0, 136 | m_end0: 0, 137 | m_end1: 0, 138 | deltas: None, 139 | }; 140 | 141 | for d in -(d_max as i32)..=(d_max as i32) { 142 | uv_map.insert(d, (0, 0)); 143 | } 144 | for d in 0..d_max { 145 | if k_max - k_min > max_band_width as i32 { 146 | // println!("KK {} {} {} {}", k_max, k_min, k_max - k_min, max_band_width); 147 | break; 148 | } 149 | for k in (k_min..=k_max).step_by(2) { 150 | let (_, vn) = uv_map.get(&(k - 1)).unwrap(); 151 | let (_, vp) = uv_map.get(&(k + 1)).unwrap(); 152 | if k == k_min || ((k != k_max) && vn < vp) { 153 | x = *vp; 154 | pre_k = k + 1; 155 | } else { 156 | x = *vn + 1; 157 | pre_k = k - 1; 158 | } 159 | y = ((x as i32) - k) as u32; 160 | 161 | if get_delta { 162 | let dpt = DeltaPoint { 163 | x: x, 164 | y: y, 165 | dk: k - pre_k, 166 | }; 167 | delta_pts.entry((d, k)).or_insert(dpt); 168 | }; 169 | 170 | x1 = x; 171 | y1 = y; 172 | 173 | while (x as usize) < len0 - 1 174 | && 
(y as usize) < len1 - 1 175 | && seq0[x as usize] == seq1[y as usize] 176 | { 177 | x += 1; 178 | y += 1; 179 | } 180 | 181 | if (x - x1) > 8 { 182 | if !start { 183 | rtn.bgn0 = x1; 184 | rtn.bgn1 = y1; 185 | start = true; 186 | } 187 | // we set the ends here to avoid bad sequences 188 | // this way, we are sure that, at least, 8 bases are aligned 189 | rtn.end0 = x; 190 | rtn.end1 = y; 191 | } 192 | 193 | if (x - x1) > longest_match { 194 | longest_match = x - x1; 195 | rtn.m_end0 = x; 196 | rtn.m_end1 = y; 197 | } 198 | 199 | // println!("IM {} {} {} {} {} {} {} {}", x, y, len0, len1, d, d_max, k, pre_k); 200 | uv_map.insert(k, (x + y, x)); 201 | if (x + y) as i32 > best_m { 202 | best_m = (x + y) as i32; 203 | } 204 | if (x as usize) >= len0 - 1 || (y as usize) >= len1 - 1 { 205 | matched = true; 206 | d_final = d; 207 | k_final = k; 208 | break; 209 | } 210 | } 211 | // For banding 212 | let mut k_max_new = k_min; 213 | let mut k_min_new = k_max; 214 | for k2 in (k_min..=k_max).step_by(2) { 215 | let (u, _) = uv_map.get(&k2).unwrap(); 216 | if *u as i32 >= (best_m - (band_tolerance as i32)) { 217 | if k2 < k_min_new { 218 | k_min_new = k2; 219 | } 220 | if k2 > k_max_new { 221 | k_max_new = k2; 222 | } 223 | } 224 | } 225 | 226 | k_max = k_max_new + 1; 227 | k_min = k_min_new - 1; 228 | if matched == true { 229 | //println!("match: {} {}", d_final, k_final); 230 | let mut d_inside = 0_u32; 231 | if get_delta { 232 | let dpts = track_delta_point(&delta_pts, d_final, k_final, rtn.bgn0, rtn.end0); 233 | for dpt in &dpts { 234 | if dpt.x > rtn.bgn0 && dpt.x < rtn.end0 { 235 | d_inside += 1; 236 | } 237 | } 238 | rtn.deltas = Some(dpts); 239 | } 240 | rtn.dist = d_inside; 241 | rtn.m_size = (rtn.end0 - rtn.bgn0 + rtn.end1 - rtn.bgn1 + 2 * d_inside) >> 1; 242 | if rtn.m_size < min_match_len { 243 | matched = false; 244 | } 245 | break; 246 | } 247 | } 248 | if !matched { 249 | None 250 | } else { 251 | Some(rtn) 252 | } 253 | } 254 | 255 | #[derive(Clone, Copy)] 
256 | pub struct ReadLocation { 257 | pub start: usize, 258 | pub len: usize, 259 | } 260 | 261 | pub fn get_2bit_fragment( 262 | rid: u32, 263 | strand: u8, 264 | bgn: u32, 265 | end: u32, 266 | readsdb: &Mmap, 267 | read_index: &Vec, 268 | ) -> Vec { 269 | let mut seq = Vec::::new(); 270 | let rloc = read_index[rid as usize]; 271 | let s = rloc.start + bgn as usize; 272 | let e = rloc.start + end as usize; 273 | 274 | if strand == 0 { 275 | for c in &readsdb[s..e] { 276 | seq.push(c & 0b0011); 277 | } 278 | } else { 279 | for c in &readsdb[s..e] { 280 | seq.push((c >> 4) & 0b0011); 281 | } 282 | } 283 | seq 284 | } 285 | 286 | pub fn get_seq_fragment( 287 | rid: u32, 288 | strand: u8, 289 | bgn: u32, 290 | end: u32, 291 | mmap: &Mmap, 292 | read_index: &Vec, 293 | ) -> Vec { 294 | let mut seq = Vec::::new(); 295 | let base_map = &[b'A', b'C', b'G', b'T']; 296 | let rloc = read_index[rid as usize]; 297 | let s = rloc.start + bgn as usize; 298 | let e = rloc.start + end as usize; 299 | 300 | if strand == 0 { 301 | for c in &mmap[s..e] { 302 | if c & 0b1100 != 0b1100 { 303 | seq.push(base_map[(c & 0b0011) as usize]); 304 | } else { 305 | seq.push(b'N'); 306 | } 307 | } 308 | } else { 309 | for c in &mmap[s..e] { 310 | if ((c >> 4) & 0b1100) != 0b1100 { 311 | seq.push(base_map[((c >> 4) & 0b0011) as usize]); 312 | } else { 313 | seq.push(b'N'); 314 | } 315 | } 316 | } 317 | //println!("{} {}", rid, String::from_utf8_lossy(&seq)); 318 | seq 319 | } 320 | 321 | #[derive(Clone, Copy)] 322 | pub struct MM128 { 323 | pub x: u64, 324 | pub y: u64, 325 | } 326 | 327 | impl fmt::Display for MM128 { 328 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 329 | let hash = self.x >> 8; 330 | let span = (self.x & 0xFF) as u8; 331 | let rid = (self.y >> 32) as u32; 332 | let pos = ((self.y & 0xFFFFFFFF) >> 1) as u32; 333 | let strand = (self.y & 0x1) as u8; 334 | write!(f, "({}, {}, {}, {}, {})", hash, span, rid, pos, strand) 335 | } 336 | } 337 | 338 | pub fn 
/// Thomas Wang-style 64-bit integer mix used to hash k-mers; all
/// arithmetic wraps so the function is total over u64.
pub fn u64hash(key: u64) -> u64 {
    let mut k = key;
    k = (!k).wrapping_add(k << 21); // k = (k << 21) - k - 1
    k ^= k >> 24;
    k = k.wrapping_add(k << 3).wrapping_add(k << 8); // k * 265
    k ^= k >> 14;
    k = k.wrapping_add(k << 2).wrapping_add(k << 4); // k * 21
    k ^= k >> 28;
    k.wrapping_add(k << 31)
}
418 | min 419 | } 420 | 421 | pub fn get(&self, i: usize) -> MM128 { 422 | self.v[(self.start_pos + i) % self.size] 423 | } 424 | } 425 | 426 | pub fn reduce_shmmr(mers: Vec, r: u32) -> Vec { 427 | let mut shmmrs = Vec::::new(); 428 | let mut rbuf = RingBuffer::new(r as usize); 429 | let mut min_mer = MM128 { 430 | x: u64::MAX, 431 | y: u64::MAX, 432 | }; 433 | let mut pos = 0; 434 | let mut mdist = 0; 435 | loop { 436 | if pos >= mers.len() { 437 | break; 438 | } 439 | let m = mers[pos]; 440 | rbuf.push(m); 441 | if mdist == (r - 1) as usize { 442 | min_mer = rbuf.get_min(); 443 | let mut last_i = 0_usize; 444 | for i in 0..rbuf.size as usize { 445 | let mm = rbuf.get(i); 446 | if mm.x == min_mer.x { 447 | shmmrs.push(mm); 448 | min_mer = mm; 449 | last_i = i; 450 | } 451 | } 452 | mdist = r as usize - 1 - last_i; 453 | pos += 1; 454 | continue; 455 | } else if m.x <= min_mer.x && pos >= r as usize { 456 | shmmrs.push(m); 457 | min_mer = m; 458 | mdist = 0; 459 | pos += 1; 460 | continue; 461 | } 462 | mdist += 1; 463 | pos += 1; 464 | } 465 | shmmrs 466 | } 467 | 468 | pub fn sequence_to_shmmrs(rid: u32, seq: &Vec, w: u32, k: u32, r: u32) -> Vec { 469 | //let base2bits: [u64; 4] = [0, 1, 2, 3]; 470 | 471 | let mut shmmrs = Vec::::new(); 472 | 473 | let mut pos = 0; 474 | let mut mdist = 0; 475 | let shift = k - 1; 476 | assert!(k <= 56); 477 | assert!(w <= 128); 478 | assert!(r > 0 && r < 13); 479 | let mut fmmer = (0_u64, 0_u64); 480 | let mut rmmer = (0_u64, 0_u64); 481 | let mask = u64::MAX >> (64 - k); 482 | let mut rbuf = RingBuffer::new(w as usize); 483 | let mut min_mer = MM128 { 484 | x: u64::MAX, 485 | y: u64::MAX, 486 | }; 487 | loop { 488 | if pos >= seq.len() { 489 | break; 490 | } 491 | let c = (seq[pos] & 0b0011) as u64; 492 | // println!("C {} {} {}", seq[pos], pos, c); 493 | if seq[pos] & 0b1100 != 0b1100 { 494 | // Not non-A,C,G,T base 495 | fmmer.0 <<= 1; 496 | fmmer.0 |= c & 0b01; 497 | fmmer.0 &= mask; 498 | fmmer.1 <<= 1; 499 | fmmer.1 |= (c 
& 0b10) >> 1; 500 | fmmer.1 &= mask; 501 | 502 | let rc = 0x03 ^ c; 503 | rmmer.0 >>= 1; 504 | rmmer.0 |= (rc & 0b01) << shift; 505 | rmmer.0 &= mask; 506 | rmmer.1 >>= 1; 507 | rmmer.1 |= ((rc & 0b10) >> 1) << shift; 508 | rmmer.1 &= mask; 509 | } 510 | if fmmer == rmmer { 511 | pos += 1; 512 | continue; 513 | } 514 | if pos < k as usize { 515 | pos += 1; 516 | continue; 517 | } 518 | let mut forward = true; 519 | if rmmer.0 < fmmer.0 { 520 | forward = false; 521 | } 522 | let mmer_hash = match forward { 523 | true => u64hash(fmmer.0) ^ u64hash(fmmer.1) ^ 0x0, 524 | false => u64hash(rmmer.0) ^ u64hash(rmmer.1) ^ 0x0, 525 | }; 526 | let strand: u64 = if forward { 0 } else { 1 }; 527 | let m = MM128 { 528 | x: mmer_hash << 8 | k as u64, 529 | y: (rid as u64) << 32 | (pos as u64) << 1 | strand, 530 | }; 531 | rbuf.push(m); 532 | // println!("MM {} {} {} {}", rid, m, fmmer.0, fmmer.1); 533 | if mdist == (w - 1) as usize { 534 | min_mer = rbuf.get_min(); 535 | for i in 0..rbuf.size as usize { 536 | let mm = rbuf.get(i); 537 | if mm.x == min_mer.x { 538 | shmmrs.push(mm); 539 | min_mer = mm; 540 | // println!("MM0 {} {}", rid, mm); 541 | } 542 | } 543 | mdist = pos - ((min_mer.y & 0xFFFFFFFF) >> 1) as usize; 544 | pos += 1; 545 | continue; 546 | } else if m.x <= min_mer.x && pos >= w as usize { 547 | shmmrs.push(m); 548 | // println!("MM1 {} {}", rid, m); 549 | min_mer = m; 550 | mdist = 0; 551 | pos += 1; 552 | continue; 553 | } 554 | mdist += 1; 555 | pos += 1; 556 | } 557 | let shmmrs = reduce_shmmr(shmmrs, r); 558 | let shmmrs = reduce_shmmr(shmmrs, r); 559 | shmmrs 560 | } 561 | -------------------------------------------------------------------------------- /src/bin/utils/layout.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative 
Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | #![allow(dead_code)] 11 | 12 | // 13 | // take the layout file and generate contig sequence from the layout file 14 | // it handles the read stitches and consensuse 15 | // 16 | 17 | use super::shmmrutils::match_reads; 18 | use super::shmmrutils::sequence_to_shmmrs; 19 | use super::shmmrutils::{get_2bit_fragment, ReadLocation}; 20 | use memmap::MmapOptions; 21 | use std::fs::File; 22 | use std::io::{self, BufRead, Write}; 23 | use std::path::Path; 24 | 25 | use rustc_hash::{FxHashMap, FxHashSet}; 26 | 27 | fn read_lines

(filename: P) -> io::Result>> 28 | where 29 | P: AsRef, 30 | { 31 | let file = File::open(filename)?; 32 | Ok(io::BufReader::new(file).lines()) 33 | } 34 | 35 | fn get_shmr_offset(s0: &Vec, s1: &Vec) -> (u32, u32) { 36 | // take two sequences and use the shmmrs to compute/etimate the offset between them 37 | // here we take the first hit, it might be useful to use the most common offset 38 | 39 | let mmer0 = sequence_to_shmmrs(0, &s0, 24, 24, 1); 40 | let mmer1 = sequence_to_shmmrs(0, &s1, 24, 24, 1); 41 | let mut counter0 = FxHashMap::::default(); 42 | for m in mmer0.iter() { 43 | let x = m.x >> 8; 44 | *counter0.entry(x).or_insert(0) += 1; 45 | } 46 | let mut hmap0 = FxHashMap::::default(); 47 | for m in mmer0.iter() { 48 | let x = m.x >> 8; 49 | let pos = ((m.y & 0xFFFFFFFF) >> 1) as u32; 50 | if *counter0.get(&x).unwrap() == 1 { 51 | hmap0.insert(x, pos); 52 | } 53 | } 54 | 55 | let (mut offset0, mut offset1) = (0_u32, 0_u32); 56 | for m in mmer1 { 57 | let x = m.x >> 8; 58 | let pos = ((m.y & 0xFFFFFFFF) >> 1) as u32; 59 | if hmap0.contains_key(&x) { 60 | offset0 = *hmap0.get(&x).unwrap(); 61 | offset1 = pos; 62 | break; 63 | } 64 | } 65 | (offset0, offset1) 66 | } 67 | 68 | fn get_ctg_cns_with_tiling_reads( 69 | tiling_reads: &Vec<(u32, Vec)>, 70 | out_frag_bgn: &Vec<(usize, usize)>, 71 | template_seq: &Vec, 72 | ) -> (Vec, f32) { 73 | // 74 | // generate consensus squence from tiling reads 75 | // 76 | let mut cov_aux = Vec::<(usize, i32)>::new(); 77 | let mut s_bgns = FxHashSet::::default(); 78 | let mut s_ends = FxHashSet::::default(); 79 | let mut delta_count = FxHashMap::<(usize, u8, u8), u32>::default(); 80 | for i in 0..tiling_reads.len() { 81 | let (_rid0, seq0) = tiling_reads.get(i).unwrap(); 82 | let (bgn, bgn0) = *out_frag_bgn.get(i).unwrap(); 83 | let len = seq0.len() - bgn0; 84 | let end = if bgn + len < template_seq.len() { 85 | bgn + len 86 | } else { 87 | template_seq.len() 88 | }; 89 | let end0 = if bgn0 + len < seq0.len() { 90 | bgn + len 91 
| } else { 92 | seq0.len() 93 | }; 94 | 95 | let seq_s = template_seq[bgn..end].to_vec(); 96 | let seq0_s = seq0[bgn0..end0].to_vec(); 97 | let ovlp = match_reads(&seq_s, &seq0_s, true, 0.02, 1200, 256); 98 | 99 | if let Some(ovlp) = ovlp { 100 | //println!("{} {} {} {}", ovlp.bgn0, ovlp.end0, ovlp.bgn1, ovlp.end1); 101 | let mut dpts = ovlp.deltas.unwrap(); 102 | dpts.reverse(); 103 | let mut d = 0_u8; 104 | let mut px = 0_usize; 105 | cov_aux.push((bgn + ovlp.bgn0 as usize, 1)); 106 | cov_aux.push((bgn + ovlp.end0 as usize, -1)); 107 | s_bgns.insert(bgn + ovlp.bgn0 as usize); 108 | s_ends.insert(bgn + ovlp.end0 as usize); 109 | for dpt in dpts { 110 | let c = if dpt.dk > 0 { 111 | b'-' 112 | } else { 113 | seq0_s[dpt.y as usize - 1] 114 | }; 115 | let cx = dpt.x as usize + bgn - 1; 116 | cov_aux.push((cx, 0)); 117 | if cx != px { 118 | d = 0; 119 | } else { 120 | d += 1; 121 | } 122 | //println!("D {} {} {}", cx, d, c as char); 123 | *delta_count.entry((cx, d, c)).or_insert(0) += 1; 124 | px = cx; 125 | } 126 | } 127 | } 128 | cov_aux.sort(); 129 | let mut dpt_cov = FxHashMap::::default(); 130 | let mut cov = 0_i32; 131 | for (p, d) in cov_aux { 132 | if d != 0 { 133 | cov += d; 134 | } else { 135 | dpt_cov.entry(p).or_insert(cov as u32); 136 | } 137 | } 138 | 139 | let mut dpt_c = Vec::<(usize, u8, u8)>::new(); 140 | for k in delta_count.keys() { 141 | let count = delta_count.get(k).unwrap(); 142 | let cov = dpt_cov.get(&k.0).unwrap(); 143 | if *count > (cov >> 1) { 144 | /* 145 | println!( 146 | "D {} {} {} {} {} {} {}", 147 | k.0, 148 | k.1, 149 | template_seq[k.0] as char, 150 | k.2 as char, 151 | count, 152 | cov, 153 | String::from_utf8_lossy(&template_seq[k.0..k.0 + 5].to_vec()) 154 | ); 155 | */ 156 | dpt_c.push(*k); 157 | } 158 | } 159 | dpt_c.sort(); 160 | 161 | let mut out_seq = Vec::::new(); 162 | let mut pos_map = Vec::::new(); 163 | let mut c_bgn = 0_usize; 164 | for k in dpt_c { 165 | if k.0 > c_bgn { 166 | if c_bgn != 0 { 167 | c_bgn += 1; 168 | } 
169 | let c_end = k.0; 170 | out_seq.extend(template_seq[c_bgn..c_end].to_vec()); 171 | pos_map.extend((c_bgn..c_end).collect::>()); 172 | c_bgn = k.0; 173 | } 174 | if k.0 == c_bgn { 175 | if k.1 == 0 && k.2 != b'-' { 176 | out_seq.push(template_seq[k.0]); 177 | pos_map.push(k.0); 178 | } 179 | if k.2 != b'-' { 180 | out_seq.push(k.2); 181 | pos_map.push(k.0); 182 | } 183 | } 184 | } 185 | let c_end = template_seq.len(); 186 | out_seq.extend(template_seq[c_bgn..c_end].to_vec()); 187 | pos_map.extend((c_bgn..c_end).collect::>()); 188 | let mut cov = 0_u32; 189 | let mut sidx = 0; 190 | let mut lq_count = 0_u32; 191 | let out_seq2 = pos_map 192 | .iter() 193 | .map(|&p| { 194 | if s_bgns.contains(&p) { 195 | cov += 1; 196 | s_bgns.remove(&p); 197 | } 198 | if s_ends.contains(&p) { 199 | cov -= 1; 200 | s_ends.remove(&p); 201 | } 202 | let base = out_seq[sidx]; 203 | sidx += 1; 204 | 205 | if cov < 3 { 206 | lq_count += 1; 207 | base + 4 208 | } else { 209 | base 210 | } 211 | }) 212 | .collect::>(); 213 | /* 214 | println!( 215 | "{:?} {:?} {:?}", 216 | lq_count, 217 | out_seq2.len(), 218 | (lq_count as f32 / out_seq2.len() as f32) 219 | ); 220 | */ 221 | let r = (lq_count as f32) / (out_seq2.len() as f32); 222 | (out_seq2, r) 223 | } 224 | 225 | fn stitch_fragments( 226 | tiling_reads: &Vec<(u32, Vec)>, 227 | match_bng: &FxHashMap<(u32, u32), (u32, u32)>, 228 | ) -> (Vec, f32) { 229 | let mut frag_bgn = vec![0_u32; tiling_reads.len()]; 230 | let mut frag_end = vec![0_u32; tiling_reads.len()]; 231 | for i in 0..tiling_reads.len() - 1 { 232 | let (rid0, seq0) = tiling_reads.get(i).unwrap(); 233 | let (rid1, seq1) = tiling_reads.get(i + 1).unwrap(); 234 | //println!("DBG {} {}", rid0, rid1); 235 | let (mut bgn0, mut bgn1) = match_bng.get(&(*rid0, *rid1)).unwrap(); 236 | if bgn0 < frag_bgn[i] as u32 { 237 | let correction = frag_bgn[i] as u32 - bgn0; 238 | bgn0 += correction; 239 | bgn1 += correction; 240 | } 241 | let end0 = if bgn0 as usize + 100 < seq0.len() { 242 | 
bgn0 as usize + 100 243 | } else { 244 | seq0.len() 245 | }; 246 | let end1 = if bgn1 as usize + 100 < seq1.len() { 247 | bgn1 as usize + 100 248 | } else { 249 | seq1.len() 250 | }; 251 | let seq0_s = seq0[bgn0 as usize..end0].to_vec(); 252 | let seq1_s = seq1[bgn1 as usize..end1].to_vec(); 253 | let (offset0, offset1) = get_shmr_offset(&seq0_s, &seq1_s); 254 | //println!("F {} {} {} {}", rid0, rid1, offset0, offset1); 255 | frag_end[i] = bgn0 + offset0; 256 | frag_bgn[i + 1] = bgn1 + offset1; 257 | } 258 | frag_end[tiling_reads.len() - 1] = 259 | tiling_reads.get(tiling_reads.len() - 1).unwrap().1.len() as u32; 260 | 261 | let mut template_seq = Vec::::new(); 262 | let mut out_frag_bgn = vec![(0_usize, 0_usize); tiling_reads.len()]; 263 | for i in 0..tiling_reads.len() { 264 | let (_rid0, seq0) = tiling_reads.get(i).unwrap(); 265 | let b = *frag_bgn.get(i).unwrap() as usize; 266 | let e = *frag_end.get(i).unwrap() as usize; 267 | out_frag_bgn[i] = (template_seq.len(), b); 268 | template_seq.extend(seq0[b..e].to_vec()); 269 | } 270 | 271 | get_ctg_cns_with_tiling_reads(&tiling_reads, &out_frag_bgn, &template_seq) 272 | } 273 | 274 | pub fn log_asm_summary(ctg_lengths: Vec<(String, usize)>) -> () { 275 | let mut lengths = ctg_lengths.iter().map(|x| x.1).collect::>(); 276 | lengths.sort(); 277 | lengths.reverse(); 278 | let total_bases: usize = lengths.iter().sum(); 279 | let total_ctgs = lengths.len(); 280 | let mut n50 = 0_usize; 281 | let mut n90 = 0_usize; 282 | let mut count_gt_100kb = 0_u32; 283 | log::info!("Total size: {}", total_bases); 284 | log::info!("Longest size: {}", lengths.get(0).unwrap_or(&0)); 285 | let mut cumsum = 0_usize; 286 | for l in lengths { 287 | cumsum += l; 288 | if cumsum as f32 > (total_bases as f32 * 0.5) { 289 | if n50 == 0 { 290 | n50 = l; 291 | } 292 | } 293 | if cumsum as f32 > (total_bases as f32 * 0.9) { 294 | if n90 == 0 { 295 | n90 = l; 296 | } 297 | } 298 | if l > 100000 { 299 | count_gt_100kb += 1; 300 | } 301 | } 302 | 
log::info!("N50: {}", n50); 303 | log::info!("N90: {}", n90); 304 | log::info!("Number of contigs: {}", total_ctgs); 305 | log::info!("Number of Contigs > 100kb: {}", count_gt_100kb); 306 | } 307 | 308 | pub fn layout2ctg( 309 | seqdb_file: &String, 310 | index_file: &String, 311 | layout_file: &String, 312 | output_file_prefix: &String, 313 | ) -> Result<(), io::Error> { 314 | let mut read_index = Vec::::new(); 315 | 316 | if let Ok(lines) = read_lines(index_file) { 317 | for line in lines { 318 | if let Ok(rec) = line { 319 | // let rec_trimmed = rec.trim_end(); 320 | // the record line looks like 000000023 m64062_190803_042216/144/ccs 20359 467415 321 | let v: Vec<&str> = rec.split_whitespace().collect(); 322 | // let rid: u32 = v[0].parse().unwrap(); 323 | let start: usize = v[3].parse().unwrap(); 324 | let len: usize = v[2].parse().unwrap(); 325 | read_index.push(ReadLocation { 326 | start: start, 327 | len: len, 328 | }); 329 | // println!("{} {} {}", rid, start, len); 330 | } 331 | } 332 | } 333 | 334 | let file = File::open(seqdb_file).unwrap(); 335 | let readsdb = unsafe { MmapOptions::new().map(&file).unwrap() }; 336 | 337 | let mut p_ctg_file = File::create(&format!("{}_m.fa", output_file_prefix)).unwrap(); 338 | let mut a_ctg_file = File::create(&format!("{}_e0.fa", output_file_prefix)).unwrap(); 339 | let mut tiling_reads = Vec::<(u32, Vec)>::new(); 340 | let mut match_bgn = FxHashMap::<(u32, u32), (u32, u32)>::default(); 341 | 342 | let mut pre_ctg_id = 0_u32; 343 | let mut pre_ctg_tag = 'P'; 344 | let mut ctg_lengths = Vec::<(String, usize)>::new(); 345 | let basemap = [b'A', b'C', b'G', b'T', b'a', b'c', b'g', b't']; 346 | if let Ok(lines) = read_lines(layout_file) { 347 | for line in lines { 348 | if let Ok(rec) = line { 349 | //let rec_trimmed = rec.trim_end(); 350 | let v: Vec<&str> = rec.split_whitespace().collect(); 351 | if v[0] == "P" || v[0] == "D" || v[0] == "A" { 352 | if tiling_reads.len() != 0 { 353 | let ctg_id = format!("ctg{:06}_{}", 
pre_ctg_id, pre_ctg_tag); 354 | let (bseq, r) = stitch_fragments(&tiling_reads, &match_bgn); 355 | if r < 0.6 { 356 | let ctg = bseq 357 | .iter() 358 | .map(|&t| basemap[t as usize]) 359 | .collect::>(); 360 | match pre_ctg_tag { 361 | 'P' => { 362 | let _res = writeln!(p_ctg_file, ">{}", ctg_id); 363 | let _res = 364 | writeln!(p_ctg_file, "{}", String::from_utf8_lossy(&ctg)); 365 | 366 | ctg_lengths.push((ctg_id, ctg.len())); 367 | } 368 | 'A' => { 369 | let _res = writeln!(a_ctg_file, ">{}", ctg_id); 370 | let _res = 371 | writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg)); 372 | } 373 | 'D' => { 374 | let _res = writeln!(a_ctg_file, ">{}", ctg_id); 375 | let _res = 376 | writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg)); 377 | } 378 | _ => (), 379 | } 380 | } 381 | } 382 | tiling_reads.clear(); 383 | pre_ctg_tag = v[0].chars().nth(0).unwrap(); 384 | continue; 385 | } 386 | if v[0] == "E" { 387 | let utg_id: u32 = v[1].parse().unwrap(); 388 | let rid0: u32 = v[2].parse().unwrap(); 389 | let strand0: u8 = v[3].parse().unwrap(); 390 | let rid1: u32 = v[4].parse().unwrap(); 391 | let strand1: u8 = v[5].parse().unwrap(); 392 | let bgn0: u32 = v[6].parse().unwrap(); 393 | let bgn1: u32 = v[7].parse().unwrap(); 394 | if tiling_reads.len() == 0 { 395 | let rloc = read_index[rid0 as usize]; 396 | let len = rloc.len as u32; 397 | let seq_full = 398 | get_2bit_fragment(rid0, strand0, 0, len, &readsdb, &read_index); 399 | tiling_reads.push((rid0, seq_full)); 400 | } 401 | let rloc = read_index[rid1 as usize]; 402 | let len = rloc.len as u32; 403 | //let seq_frag = get_seq_fragment(rid1, strand1, bgn, end, &mmap, &read_index); 404 | let seq_full = get_2bit_fragment(rid1, strand1, 0, len, &readsdb, &read_index); 405 | pre_ctg_id = utg_id; 406 | tiling_reads.push((rid1, seq_full)); 407 | match_bgn.insert((rid0, rid1), (bgn0, bgn1)); 408 | } 409 | } 410 | } 411 | 412 | if tiling_reads.len() != 0 { 413 | let (bseq, r) = stitch_fragments(&tiling_reads, 
&match_bgn); 414 | if r < 0.6 { 415 | let ctg = bseq 416 | .iter() 417 | .map(|&t| basemap[t as usize]) 418 | .collect::>(); 419 | let ctg_id = format!("ctg{:06}_{}", pre_ctg_id, pre_ctg_tag); 420 | match pre_ctg_tag { 421 | 'P' => { 422 | let _res = writeln!(p_ctg_file, ">{}", ctg_id); 423 | let _res = writeln!(p_ctg_file, "{}", String::from_utf8_lossy(&ctg)); 424 | 425 | ctg_lengths.push((ctg_id, ctg.len())); 426 | } 427 | 'A' => { 428 | let _res = writeln!(a_ctg_file, ">{}", ctg_id); 429 | let _res = writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg)); 430 | } 431 | 'D' => { 432 | let _res = writeln!(a_ctg_file, ">{}", ctg_id); 433 | let _res = writeln!(a_ctg_file, "{}", String::from_utf8_lossy(&ctg)); 434 | } 435 | _ => (), 436 | } 437 | } 438 | tiling_reads.clear(); 439 | } 440 | } 441 | log_asm_summary(ctg_lengths); 442 | Ok(()) 443 | } 444 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Peregrine-2021: A faster and minimum genome assembler 2 | 3 | Peregrine-2021 is an genome assembler designed for long-reads that have good enough accuracy. It is written with the Rust language. 4 | The main method used in the genome assembler is described in [Human Genome Assembly in 100 Minutes](https://www.biorxiv.org/content/10.1101/705616v1). 5 | 6 | PeregrineLogo 7 | 8 | ## System requirement: 9 | 10 | A modern Linux workstation or compute node with enough disk, CPUs, and RAM. It is better to have a good number of CPUs (my testing system has 20 cores) and a good amount of RAM (~ total 1.5x of the reads data set). For example, for 100G sequences, it is probably good to have at least 150G RAM. A smaller amount, e.g., 32G, works, but you will need some manual setup for effective computation. 11 | ## Some Ballpark Performance Summary 12 | 13 | With a proper hardware (e.g. 
~1Tb RAM), Peregrine-2021 had successful assembled a total 30G diploid genome (2n = 30G) with a contig N50 = 55.2Mb for a large diploid genome. (For who might want to know more details, I ran it as a unpaid service so I don't have much other infomation. [Link to the the graph of the assembly](https://twitter.com/infoecho/status/1330617986185457669?s=20&t=FlHjuWCHslvjxVdyZpU1gQ).) 14 | 15 | For a typical human-size assembly, a much cheaper compute instance with from 128G to 512G RAM can work well. (see this blog [Accelerating genome assembly with AWS Graviton2](https://aws.amazon.com/blogs/publicsector/accelerating-genome-assembly-aws-graviton2/)), and it takes only 2 to 3 hours wall o'clock time to get an assembly. (We also provide a "fast" mode eliminating one error correct stage for perfect reads as input.) 16 | 17 | On the accuracy side, here, we only have some rough numbers from the earlier version compared to other assemblers. Ironically, it could take more effort and resources to do a comprehensive benchmark for a publication than writing the assembler itself. Unfortunately, benchmarking work is a luxury that I currently do not have. However, suppose if anyone is interested in taking a shot running the code fine-tuning the parameters to evaluate the results correctly, in that case, I might help a bit from time to time. 18 | 19 | ## Usage: 20 | 21 | 1. Put the paths to the sequence read files (fasta / fasta.gz / fastq / fastq.gz, compressed with the standard gzip but not bgzip) in a file, e.g. `reads.lst`, so the Peregrin-2021 assembler can find the read data. For example, this shows the content of a `reads.lst` file: 22 | ``` 23 | $ cat seq.lst 24 | /wd/CHM13_20k/m64062_190803_042216.fastq.gz 25 | /wd/CHM13_20k/m64062_190804_172951.fastq.gz 26 | /wd/CHM13_20k/m64062_190806_063919.fastq.gz 27 | /wd/CHM13_20k/m64062_190807_194840.fastq.gz 28 | ``` 29 | 30 | 2. 
Make sure your have enough disk (preferably SSD storage or high performance network filesystem) for a working directory. Let’s call the working directory `asm_out`. 31 | 32 | 3. Execute: `pg_asm reads.lst asm_out` from command line / shell, some potentially useful intermediate files and the assembled contigs will be in the directory `asm_out/` 33 | 34 | ``` 35 | $ pg_asm seq.lst asm_out >& log & 36 | ``` 37 | 38 | 4. There are a number of options that you can try to tune for optimizing the assembly results. Here is the full usage information of `pg_asm`. 39 | 40 | ``` 41 | ❯ pg_asm --help 42 | pg_asm peregrine-2021 0.4.9 (arm_config:58e666e+, release build, linux [x86_64] [rustc 1.58.0 (02072b482 2022-01-11)]) 43 | Jason Chin 44 | 45 | Peregrine-2021 genome assembler, 46 | LICENSE: http://creativecommons.org/licenses/by-nc-sa/4.0/ 47 | 48 | USAGE: 49 | pg_asm [FLAGS] [OPTIONS] [ARGS] 50 | 51 | FLAGS: 52 | --fast run the assembler in the fast mode 53 | -h, --help Prints help information 54 | --keep keep intermediate files 55 | --no_resolve disable resolving repeats / dups at the end 56 | -V, --version Prints version information 57 | 58 | OPTIONS: 59 | -b, --bestn number of best overlaps for initial graph [default: 6] 60 | -k Kmer size [default: 56] 61 | -l layout version [default: 2] 62 | --log log level: DBBUG or INFO (default) 63 | -c, --min_ec_cov Minimum error coverage [default: 1] 64 | -r Reduction factor [default: 6] 65 | -t, --tol Alignment tolerance [default: 0.01] 66 | -w Window size [default: 80] 67 | 68 | ARGS: 69 | Path to a file that contains the list of reads in .fa .fa.gz .fastq or fastq.gz formats 70 | The path to a work directory for intermediate files and the results 71 | Number of threads 72 | Number of partition 73 | ``` 74 | 75 | You can reduce the `Reduction factor` and `Window Size` to increase the sensitivity to 76 | detect overlaps. I found `-r 4 -w 64` may be better for human assembly than the default 77 | parameters. 
78 | 79 | ## Example Results: (`v0.2.0, main:6d91294, release build, linux [x86_64]`) 80 | 81 | ### Input data set: Human CHM13 dataset 82 | 83 | - 5,567,153 reads, average length = 18,028 bp, about 30x human genome size 84 | 85 | - Systme: 20 core Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz, 512G ram 86 | 87 | 88 | ### Run Peregrine-2021 in Fast-mode (without read level error correction) 89 | 90 | 91 | #### CPU time for the fast-mode: 92 | ``` 93 | usr: 152025s = 42.5 cpu hours, 94 | 95 | sys: 900s = 0.25 cpu hours, 96 | 97 | wall clock time = 2:45:39 98 | ``` 99 | (In contrast, it takes [HiCanu](https://www.biorxiv.org/content/10.1101/2020.03.14.992248v3) > 4000 CPU hour to get an assembly) 100 | 101 | #### Assembly Summary Statistics with the fast-mode: 102 | 103 | ``` 104 | total size: 3,034,243,471 105 | max size: 201,005,110 106 | N50 size: 81,361,265 107 | N90 size: 17,301,175 108 | Number of Contigs: 237 109 | Number of Contigs > 100kb: 151 110 | ``` 111 | 112 | #### CHM13 BAC evaluation result with the fast-mode setting: 113 | 114 | ``` 115 | ******************* BAC SUMMARY ****************** 116 | TOTAL : 341 117 | BP : 51532183 118 | ************** Statistics for: _asm_p_ctg-2.fa **************** 119 | BACs closed: 327 (95.8944); BACs attempted: 338 %good = 96.7456; BASES 49454907 (95.969) 120 | Median: 99.9844 121 | MedianQV: 38.06875 122 | Mean: 99.93617 123 | MeanQV: 31.94969 124 | ***** STATS IGNORING INDELS ******************** 125 | Median: 100 126 | MedianQV: Inf 127 | Mean: 99.99177 128 | MeanQV: 40.84457 129 | ********************************************** 130 | ``` 131 | (This [HiCanu paper preprint](https://www.biorxiv.org/content/10.1101/2020.03.14.992248v3) reported resolving only 326 out of the 341 BAC BACs.) 132 | 133 | #### CHM13 BAC evaluation result with the fast-mode setting but without final contig level consensus: 134 | (This pretty much just reflects the quality of the input reads.) 
135 | 136 | ``` 137 | ******************* BAC SUMMARY ****************** 138 | TOTAL : 341 139 | BP : 51532183 140 | ************** Statistics for: asm-p_cns-nocns_fast.fa **************** 141 | BACs closed: 327 (95.8944); BACs attempted: 339 %good = 96.4602; BASES 49454907 (95.969) 142 | Median: 99.7163 143 | MedianQV: 25.47141 144 | Mean: 99.67459 145 | MeanQV: 24.87566 146 | ***** STATS IGNORING INDELS ******************** 147 | Median: 99.9782 148 | MedianQV: 36.61544 149 | Mean: 99.97005 150 | MeanQV: 35.23557 151 | ********************************************** 152 | ``` 153 | 154 | ### Run Peregrine-2021 in Standard-mode (one round of the read level error correction) 155 | 156 | #### CPU time for the standard setting which involves one round of read level error correction: 157 | 158 | ``` 159 | usr: 355257s = 98.7 cpu hours, 160 | 161 | sys: 2052s = 0.6 cpu hours, 162 | 163 | wall clock time = 5:47:23 164 | ``` 165 | (In contrast, it takes [HiCanu](https://www.biorxiv.org/content/10.1101/2020.03.14.992248v3) > 4000 CPU hour to get an assembly) 166 | 167 | #### Assembly Summary Statistics of the standard Setting: 168 | 169 | ``` 170 | total size: 3,039,592,838 171 | max size: 142,204,433 172 | N50 size: 83,143,579 173 | N90 size: 16,250,509 174 | Number of Contigs: 227 175 | Number of Contigs > 100kb: 157 176 | ``` 177 | 178 | #### CHM13 BAC evaluation result: 179 | 180 | ``` 181 | ******************* BAC SUMMARY ****************** 182 | TOTAL : 341 183 | BP : 51532183 184 | ************** Statistics for: _asm_p_ctg.fa **************** 185 | BACs closed: 330 (96.7742); BACs attempted: 341 %good = 96.7742; BASES 49874385 (96.783) 186 | Median: 99.9911 187 | MedianQV: 40.5061 188 | Mean: 99.94317 189 | MeanQV: 32.45434 190 | ***** STATS IGNORING INDELS ******************** 191 | Median: 100 192 | MedianQV: Inf 193 | Mean: 99.99405 194 | MeanQV: 42.25218 195 | ********************************************** 196 | ``` 197 | #### CHM13 BAC evaluation result without 
final contig level consensus: 198 | 199 | ``` 200 | ******************* BAC SUMMARY ****************** 201 | TOTAL : 341 202 | BP : 51532183 203 | ************** Statistics for: asm-p_cns-nocns.fa **************** 204 | BACs closed: 330 (96.7742); BACs attempted: 341 %good = 96.7742; BASES 49874385 (96.783) 205 | Median: 99.9832 206 | MedianQV: 37.74691 207 | Mean: 99.93411 208 | MeanQV: 31.81188 209 | ***** STATS IGNORING INDELS ******************** 210 | Median: 99.9994 211 | MedianQV: 52.21849 212 | Mean: 99.9927 213 | MeanQV: 41.36713 214 | ********************************************** 215 | ``` 216 | 217 | 218 | ## Some FAQs: 219 | 220 | * Q: Why do you write a new assembler? There are already many others (FALCON, HifiAsm, Flye, Shasta, HiCanu, etc.). 221 | 222 | A: We demonstrated that it was possible to use Spare Hierarchical MiniMiER (SHIMMER) to assemble a human-size genome with 100 wall clock minutes with the Peregrine genome assembler. A standard approach filters out high duplicated sequences to get the significant part of a gnome assembled. And, most assemblers adapt some repeat filtering schemes to make the run time for genome assembly acceptable while keeping the most helpful information of a genome. 223 | 224 | However, while it was reasonable to do a repeat-suppressing assembly, the narrative about what an assembler should do is changing. Genomic researchers may want to get more about the repeat even if it needs additional compute power/energy ( > 20x for a human genome compared to a Peregrine run) to get the repetitive sequences in a genome. I think it is worth showing the same technique that we used in the original Peregrine-2021 can also get the high repeat content assembled with only moderate increases of computation cost/energy. 225 | 226 | Another reason for this Peregrine-2021 assembler is that we were not happy with the C / Python hybrid approach used in the original Peregrine assembler. 
While C / Python combination is very efficient for rapid development, it has too many caveats. It could be interesting to learn something new as well. Following Richard Feynman's wisdom, "What I cannot create, I don't understand." To better understand how to handle repeats and understand the Rust programming language, I created Peregrine-2021 from late-2020 to mid-2021. I want to push Peregrine-2021 to the next stage to apply it to genomics research work; unfortunately, I realized it is too demanding to take this on as a hobby or a side-gig. Given that I won't be able to push it too far by myself, it might still be helpful for others who are interested in using this. I finally decided to release the code for non-commercial use. 227 | 228 | * Q: I can't run it, can you help? 229 | 230 | A: It depends. If it is something straightforward, I am happy to help. If it is more involved, then I simply cannot do it, as I only have limited resources. 231 | 232 | * Q: The results are bad, am I doing something wrong? 233 | 234 | A: Possibly. Like all other assemblers, Peregrine-2021 is designed with some specifications in mind and tested accordingly. Depending on the input data and the parameters used to run the assemblers, one might be able to improve the results. Unfortunately, there is no universal simple answer for how to investigate and improve right now. It can be some trivial mistake or a very intensive investigation related to the initial sequencing methods or even the genome biology itself. 235 | 236 | * Q: I use it and generate some good results for publication, how do I cite it? 237 | 238 | A: While Peregrine-2021 is mostly based on the original ideas described in the preprint [Human Genome Assembly in 100 Minutes](https://www.biorxiv.org/content/10.1101/705616v1), it is a different codebase. I currently have no plan to write up another manuscript specifically for peregrine-2021. Yes, please cite the preprint if you find it is useful.
239 | 240 | * Q: Is any published work using Peregrine-2021? 241 | 242 | A: Check [this](https://scholar.google.com/scholar?cites=12093836825307648052). Most citations are for benchmarking purposes (of the older version), and some are for the related ideas. There are a couple of papers using it for generating results. As this is mostly a "hobby" project which lacks resources for promoting its usage, I am grateful that the Chief Scientific Officier of Medicinal Genomics, Keven McKernan, provides data sets for me to test it and push the results into papers. 243 | 244 | * [A draft reference assembly of the Psilocybe cubensis genome](https://www.ncbi.nlm.nih.gov/labs/pmc/articles/PMC8220353/) 245 | 246 | * [Cannabis Genome: Jamaican Lion strain ](https://www.medicinalgenomics.com/jamaican-lion-data-release/), Note Peregrine-2021 assembled thie genome in an old Mac Pro with only 64G RAM. 247 | 248 | * Q: I don't have a large memory machine, how do I run Peregrine-2021? 249 | 250 | A: For efficiency, it will be great to put all sequence data and the smaller index data in the RAM. Currently, my suggestion is that the RAM should be about 1.5x of the total sequence data. However, the data is accessed through the memory-mapped file (MMAP) mechanism, and the chunking machinery for parallelization can help if one does not have a big memory machine. The code can access the data from the disk through the MMAP file. In such a case, high efficient NVME SSD will help. I had successfully assembled a human genome using a 32G RAM machine. However, I won't recommend that is the right way to go, given that a medium-size RAM machine is relatively cheap to rent now. 251 | 252 | * Q: Can you write better Rust code? 253 | 254 | A: I guess I could, but I was literally learning Rust and developing new algorithms at the same time. Rust is a big language, there is still a lot to learn 255 | 256 | * Q: Why do you choose CC BY-NC-SA 4.0 license to release the code? 
257 | 258 | A: oh, that will be a fire side or beer hour conversation. 259 | ## How does Peregrine-2021 work? 260 | 261 | Peregrine-2021 is still just another OLC assembler. 262 | 263 | 1. It uses the SHIMMER index for finding overlap candidates as described in the Peregrine preprint but it is more aggressive for overlapping repetitive reads rather than filtering them out. 264 | 265 | 2. The overlaper performs analysis to identify read overlaps within the same haplotype. We did not use a de-Bruijn graph approach for this. We think our method is likely more computational efficient than using a de-Bruijn graph to separate haplotypes. 266 | 267 | 3. It adapts the techniques using partial homopolymer compression for separating the reads from different haplotype. 268 | 269 | ## Build For X86_64 270 | 271 | 1. Check [Rust Installation](https://www.rust-lang.org/tools/install) 272 | 273 | 2. Run [`cargo install`](https://doc.rust-lang.org/cargo/commands/cargo-install.html) or [`cargo build`](https://doc.rust-lang.org/cargo/commands/cargo-build.html). Make sure you set up the environment variable `PATH` to the directory of the built binaries or you can run the excutable `pg_asm` with full path. If you use `cargo build`, make sure you compile it with the `--release` option for optimization. 274 | 275 | 3. `pg_asm` will run the assembly pipeline end-to-end. If it fails, it does not re-use the existing data when one runs `pg_asm` again. The assemblers is much faster than other assemblers, so it is less important to re-use intermidate data. That has been said, the built will contains executables (e.g. `pg_build_idx`, `pg_ovlp`, etc.) for each assembly steps which one can chain them together with their favorite workflow engine for re-using and re-starting an assembly pipeline. 276 | 277 | 4. A Dockerfile is provided for creating a Docker image. It also provide information to build the assembler from a clean environment. 278 | 279 | 5. 
To compile on aarch64, it will need some configuration changes to get the best performance. The memory allocator package needs to be patched for aarch64. See [https://github.com/cschin/mimalloc_rust/tree/aarch64_build](https://github.com/cschin/mimalloc_rust/tree/aarch64_build). 280 | 281 | ## Other utility command line tools 282 | 283 | ``` 284 | pg_build_sdb # convert fasta/fastq/fasta.gz/fastq.gz read data into a simple binary database for the assembler to fetch the reads. 285 | 286 | pg_build_idx # build the SHIMMER index from the reads for overlapping 287 | 288 | pg_build_sdb # build the sequence database 289 | 290 | pg_dedup # perform all contigs to all contigs alignment to remove duplicates 291 | 292 | pg_dp_graph # take overlap data file as input to generate the layout file using a polyploid aware layout algorithm 293 | 294 | pg_getreads # generate fasta file for a subset of reads from the sequence database 295 | 296 | pg_graph # (obsoleted) convert the overlap information between the reads into an assembly group 297 | 298 | pg_layout # convert the assembly graph to paths and generate the contig fasta file 299 | 300 | pg_ovlp_ec # perform error correction from the haplotype specific overlaps 301 | 302 | pg_ovlp # generate haplotype specific overlaps between the reads 303 | 304 | pg_resolve # this tool aligns all contigs to themselves to identify haplotype-related contigs 305 | ``` 306 | 307 | -- 308 | Jason Chin (twitter: @infoecho) 309 | 310 | first version: Nov. 16, 2020 311 | 312 | current version: Feb. 5, 2022 313 | 314 | 315 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | 4 | This Source Code Form is subject to the terms of the Creative Commons 5 | Attribution-NonCommercial-ShareAlike 4.0 International License. 
6 | 7 | You should have received a copy of the license along with this 8 | work. If not, see . 9 | ======================================================================= 10 | 11 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 12 | Public License 13 | 14 | By exercising the Licensed Rights (defined below), You accept and agree 15 | to be bound by the terms and conditions of this Creative Commons 16 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 17 | ("Public License"). To the extent this Public License may be 18 | interpreted as a contract, You are granted the Licensed Rights in 19 | consideration of Your acceptance of these terms and conditions, and the 20 | Licensor grants You such rights in consideration of benefits the 21 | Licensor receives from making the Licensed Material available under 22 | these terms and conditions. 23 | 24 | 25 | Section 1 -- Definitions. 26 | 27 | a. Adapted Material means material subject to Copyright and Similar 28 | Rights that is derived from or based upon the Licensed Material 29 | and in which the Licensed Material is translated, altered, 30 | arranged, transformed, or otherwise modified in a manner requiring 31 | permission under the Copyright and Similar Rights held by the 32 | Licensor. For purposes of this Public License, where the Licensed 33 | Material is a musical work, performance, or sound recording, 34 | Adapted Material is always produced where the Licensed Material is 35 | synched in timed relation with a moving image. 36 | 37 | b. Adapter's License means the license You apply to Your Copyright 38 | and Similar Rights in Your contributions to Adapted Material in 39 | accordance with the terms and conditions of this Public License. 40 | 41 | c. BY-NC-SA Compatible License means a license listed at 42 | creativecommons.org/compatiblelicenses, approved by Creative 43 | Commons as essentially the equivalent of this Public License. 44 | 45 | d. 
Copyright and Similar Rights means copyright and/or similar rights 46 | closely related to copyright including, without limitation, 47 | performance, broadcast, sound recording, and Sui Generis Database 48 | Rights, without regard to how the rights are labeled or 49 | categorized. For purposes of this Public License, the rights 50 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 51 | Rights. 52 | 53 | e. Effective Technological Measures means those measures that, in the 54 | absence of proper authority, may not be circumvented under laws 55 | fulfilling obligations under Article 11 of the WIPO Copyright 56 | Treaty adopted on December 20, 1996, and/or similar international 57 | agreements. 58 | 59 | f. Exceptions and Limitations means fair use, fair dealing, and/or 60 | any other exception or limitation to Copyright and Similar Rights 61 | that applies to Your use of the Licensed Material. 62 | 63 | g. License Elements means the license attributes listed in the name 64 | of a Creative Commons Public License. The License Elements of this 65 | Public License are Attribution, NonCommercial, and ShareAlike. 66 | 67 | h. Licensed Material means the artistic or literary work, database, 68 | or other material to which the Licensor applied this Public 69 | License. 70 | 71 | i. Licensed Rights means the rights granted to You subject to the 72 | terms and conditions of this Public License, which are limited to 73 | all Copyright and Similar Rights that apply to Your use of the 74 | Licensed Material and that the Licensor has authority to license. 75 | 76 | j. Licensor means the individual(s) or entity(ies) granting rights 77 | under this Public License. 78 | 79 | k. NonCommercial means not primarily intended for or directed towards 80 | commercial advantage or monetary compensation. 
For purposes of 81 | this Public License, the exchange of the Licensed Material for 82 | other material subject to Copyright and Similar Rights by digital 83 | file-sharing or similar means is NonCommercial provided there is 84 | no payment of monetary compensation in connection with the 85 | exchange. 86 | 87 | l. Share means to provide material to the public by any means or 88 | process that requires permission under the Licensed Rights, such 89 | as reproduction, public display, public performance, distribution, 90 | dissemination, communication, or importation, and to make material 91 | available to the public including in ways that members of the 92 | public may access the material from a place and at a time 93 | individually chosen by them. 94 | 95 | m. Sui Generis Database Rights means rights other than copyright 96 | resulting from Directive 96/9/EC of the European Parliament and of 97 | the Council of 11 March 1996 on the legal protection of databases, 98 | as amended and/or succeeded, as well as other essentially 99 | equivalent rights anywhere in the world. 100 | 101 | n. You means the individual or entity exercising the Licensed Rights 102 | under this Public License. Your has a corresponding meaning. 103 | 104 | 105 | Section 2 -- Scope. 106 | 107 | a. License grant. 108 | 109 | 1. Subject to the terms and conditions of this Public License, 110 | the Licensor hereby grants You a worldwide, royalty-free, 111 | non-sublicensable, non-exclusive, irrevocable license to 112 | exercise the Licensed Rights in the Licensed Material to: 113 | 114 | a. reproduce and Share the Licensed Material, in whole or 115 | in part, for NonCommercial purposes only; and 116 | 117 | b. produce, reproduce, and Share Adapted Material for 118 | NonCommercial purposes only. 119 | 120 | 2. Exceptions and Limitations. 
For the avoidance of doubt, where 121 | Exceptions and Limitations apply to Your use, this Public 122 | License does not apply, and You do not need to comply with 123 | its terms and conditions. 124 | 125 | 3. Term. The term of this Public License is specified in Section 126 | 6(a). 127 | 128 | 4. Media and formats; technical modifications allowed. The 129 | Licensor authorizes You to exercise the Licensed Rights in 130 | all media and formats whether now known or hereafter created, 131 | and to make technical modifications necessary to do so. The 132 | Licensor waives and/or agrees not to assert any right or 133 | authority to forbid You from making technical modifications 134 | necessary to exercise the Licensed Rights, including 135 | technical modifications necessary to circumvent Effective 136 | Technological Measures. For purposes of this Public License, 137 | simply making modifications authorized by this Section 2(a) 138 | (4) never produces Adapted Material. 139 | 140 | 5. Downstream recipients. 141 | 142 | a. Offer from the Licensor -- Licensed Material. Every 143 | recipient of the Licensed Material automatically 144 | receives an offer from the Licensor to exercise the 145 | Licensed Rights under the terms and conditions of this 146 | Public License. 147 | 148 | b. Additional offer from the Licensor -- Adapted Material. 149 | Every recipient of Adapted Material from You 150 | automatically receives an offer from the Licensor to 151 | exercise the Licensed Rights in the Adapted Material 152 | under the conditions of the Adapter's License You apply. 153 | 154 | c. No downstream restrictions. You may not offer or impose 155 | any additional or different terms or conditions on, or 156 | apply any Effective Technological Measures to, the 157 | Licensed Material if doing so restricts exercise of the 158 | Licensed Rights by any recipient of the Licensed 159 | Material. 160 | 161 | 6. No endorsement. 
Nothing in this Public License constitutes or 162 | may be construed as permission to assert or imply that You 163 | are, or that Your use of the Licensed Material is, connected 164 | with, or sponsored, endorsed, or granted official status by, 165 | the Licensor or others designated to receive attribution as 166 | provided in Section 3(a)(1)(A)(i). 167 | 168 | b. Other rights. 169 | 170 | 1. Moral rights, such as the right of integrity, are not 171 | licensed under this Public License, nor are publicity, 172 | privacy, and/or other similar personality rights; however, to 173 | the extent possible, the Licensor waives and/or agrees not to 174 | assert any such rights held by the Licensor to the limited 175 | extent necessary to allow You to exercise the Licensed 176 | Rights, but not otherwise. 177 | 178 | 2. Patent and trademark rights are not licensed under this 179 | Public License. 180 | 181 | 3. To the extent possible, the Licensor waives any right to 182 | collect royalties from You for the exercise of the Licensed 183 | Rights, whether directly or through a collecting society 184 | under any voluntary or waivable statutory or compulsory 185 | licensing scheme. In all other cases the Licensor expressly 186 | reserves any right to collect such royalties, including when 187 | the Licensed Material is used other than for NonCommercial 188 | purposes. 189 | 190 | 191 | Section 3 -- License Conditions. 192 | 193 | Your exercise of the Licensed Rights is expressly made subject to the 194 | following conditions. 195 | 196 | a. Attribution. 197 | 198 | 1. If You Share the Licensed Material (including in modified 199 | form), You must: 200 | 201 | a. retain the following if it is supplied by the Licensor 202 | with the Licensed Material: 203 | 204 | i. 
identification of the creator(s) of the Licensed 205 | Material and any others designated to receive 206 | attribution, in any reasonable manner requested by 207 | the Licensor (including by pseudonym if 208 | designated); 209 | 210 | ii. a copyright notice; 211 | 212 | iii. a notice that refers to this Public License; 213 | 214 | iv. a notice that refers to the disclaimer of 215 | warranties; 216 | 217 | v. a URI or hyperlink to the Licensed Material to the 218 | extent reasonably practicable; 219 | 220 | b. indicate if You modified the Licensed Material and 221 | retain an indication of any previous modifications; and 222 | 223 | c. indicate the Licensed Material is licensed under this 224 | Public License, and include the text of, or the URI or 225 | hyperlink to, this Public License. 226 | 227 | 2. You may satisfy the conditions in Section 3(a)(1) in any 228 | reasonable manner based on the medium, means, and context in 229 | which You Share the Licensed Material. For example, it may be 230 | reasonable to satisfy the conditions by providing a URI or 231 | hyperlink to a resource that includes the required 232 | information. 233 | 3. If requested by the Licensor, You must remove any of the 234 | information required by Section 3(a)(1)(A) to the extent 235 | reasonably practicable. 236 | 237 | b. ShareAlike. 238 | 239 | In addition to the conditions in Section 3(a), if You Share 240 | Adapted Material You produce, the following conditions also apply. 241 | 242 | 1. The Adapter's License You apply must be a Creative Commons 243 | license with the same License Elements, this version or 244 | later, or a BY-NC-SA Compatible License. 245 | 246 | 2. You must include the text of, or the URI or hyperlink to, the 247 | Adapter's License You apply. You may satisfy this condition 248 | in any reasonable manner based on the medium, means, and 249 | context in which You Share Adapted Material. 250 | 251 | 3. 
You may not offer or impose any additional or different terms 252 | or conditions on, or apply any Effective Technological 253 | Measures to, Adapted Material that restrict exercise of the 254 | rights granted under the Adapter's License You apply. 255 | 256 | 257 | Section 4 -- Sui Generis Database Rights. 258 | 259 | Where the Licensed Rights include Sui Generis Database Rights that 260 | apply to Your use of the Licensed Material: 261 | 262 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 263 | to extract, reuse, reproduce, and Share all or a substantial 264 | portion of the contents of the database for NonCommercial purposes 265 | only; 266 | 267 | b. if You include all or a substantial portion of the database 268 | contents in a database in which You have Sui Generis Database 269 | Rights, then the database in which You have Sui Generis Database 270 | Rights (but not its individual contents) is Adapted Material, 271 | including for purposes of Section 3(b); and 272 | 273 | c. You must comply with the conditions in Section 3(a) if You Share 274 | all or a substantial portion of the contents of the database. 275 | 276 | For the avoidance of doubt, this Section 4 supplements and does not 277 | replace Your obligations under this Public License where the Licensed 278 | Rights include other Copyright and Similar Rights. 279 | 280 | 281 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 282 | 283 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 284 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 285 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 286 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 287 | IMPLIED, STATUTORY, OR OTHER. 
THIS INCLUDES, WITHOUT LIMITATION, 288 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 289 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 290 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 291 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 292 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 293 | 294 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 295 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 296 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 297 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 298 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 299 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 300 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 301 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 302 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 303 | 304 | c. The disclaimer of warranties and limitation of liability provided 305 | above shall be interpreted in a manner that, to the extent 306 | possible, most closely approximates an absolute disclaimer and 307 | waiver of all liability. 308 | 309 | 310 | Section 6 -- Term and Termination. 311 | 312 | a. This Public License applies for the term of the Copyright and 313 | Similar Rights licensed here. However, if You fail to comply with 314 | this Public License, then Your rights under this Public License 315 | terminate automatically. 316 | 317 | b. Where Your right to use the Licensed Material has terminated under 318 | Section 6(a), it reinstates: 319 | 320 | 1. automatically as of the date the violation is cured, provided 321 | it is cured within 30 days of Your discovery of the 322 | violation; or 323 | 324 | 2. upon express reinstatement by the Licensor. 
325 | 326 | For the avoidance of doubt, this Section 6(b) does not affect any 327 | right the Licensor may have to seek remedies for Your violations 328 | of this Public License. 329 | 330 | c. For the avoidance of doubt, the Licensor may also offer the 331 | Licensed Material under separate terms or conditions or stop 332 | distributing the Licensed Material at any time; however, doing so 333 | will not terminate this Public License. 334 | 335 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 336 | License. 337 | 338 | 339 | Section 7 -- Other Terms and Conditions. 340 | 341 | a. The Licensor shall not be bound by any additional or different 342 | terms or conditions communicated by You unless expressly agreed. 343 | 344 | b. Any arrangements, understandings, or agreements regarding the 345 | Licensed Material not stated herein are separate from and 346 | independent of the terms and conditions of this Public License. 347 | 348 | 349 | Section 8 -- Interpretation. 350 | 351 | a. For the avoidance of doubt, this Public License does not, and 352 | shall not be interpreted to, reduce, limit, restrict, or impose 353 | conditions on any use of the Licensed Material that could lawfully 354 | be made without permission under this Public License. 355 | 356 | b. To the extent possible, if any provision of this Public License is 357 | deemed unenforceable, it shall be automatically reformed to the 358 | minimum extent necessary to make it enforceable. If the provision 359 | cannot be reformed, it shall be severed from this Public License 360 | without affecting the enforceability of the remaining terms and 361 | conditions. 362 | 363 | c. No term or condition of this Public License will be waived and no 364 | failure to comply consented to unless expressly agreed to by the 365 | Licensor. 366 | 367 | d. 
Nothing in this Public License constitutes or may be interpreted 368 | as a limitation upon, or waiver of, any privileges and immunities 369 | that apply to the Licensor or You, including from the legal 370 | processes of any jurisdiction or authority. 371 | 372 | -------------------------------------------------------------------------------- /src/bin/utils/graph_analysis.rs: -------------------------------------------------------------------------------- 1 | // Peregrine Assembler and SHIMMER Genome Assembly Toolkit 2 | // 2019, 2020, 2021- (c) by Jason, Chen-Shan, Chin 3 | // 4 | // This Source Code Form is subject to the terms of the 5 | // Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 6 | // 7 | // You should have received a copy of the license along with this 8 | // work. If not, see . 9 | 10 | #![allow(dead_code)] 11 | 12 | // 13 | // define the overlap and graph data structure and lower level graph processing utility functions 14 | // 15 | 16 | use petgraph::graphmap::DiGraphMap; 17 | use petgraph::visit::Bfs; 18 | use petgraph::Direction::{Incoming, Outgoing}; 19 | use rustc_hash::FxHashMap; 20 | use rustc_hash::FxHashSet; 21 | 22 | use std::fs::File; 23 | use std::io::{self, BufWriter, Write}; 24 | 25 | pub type U32AsmGraph = DiGraphMap<(u32, u8), u32>; 26 | pub type OvlpGraph = U32AsmGraph; 27 | pub type UtgGraph = U32AsmGraph; 28 | pub type ReadNode = (u32, u8); 29 | pub type OvlpEdge = (ReadNode, ReadNode); 30 | 31 | #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)] 32 | pub struct ReadPair { 33 | pub rid0: u32, 34 | pub strand0: u8, 35 | pub rid1: u32, 36 | pub strand1: u8, 37 | } 38 | 39 | impl ReadPair { 40 | pub fn new(v: (u32, u8), w: (u32, u8)) -> Self { 41 | ReadPair { 42 | rid0: v.0, 43 | strand0: v.1, 44 | rid1: w.0, 45 | strand1: w.1, 46 | } 47 | } 48 | pub fn _to_str(&self) -> String { 49 | format!( 50 | "{} {} {} {}", 51 | self.rid0, self.strand0, self.rid1, self.strand1 52 | ) 53 | } 54 | 55 | pub fn 
reverse(&self) -> ReadPair { 56 | ReadPair { 57 | rid0: self.rid1, 58 | strand0: 1 - self.strand1, 59 | rid1: self.rid0, 60 | strand1: 1 - self.strand0, 61 | } 62 | } 63 | } 64 | 65 | #[derive(Debug, Copy, Clone)] 66 | pub struct Overlap { 67 | // main struct for keeping read overlap 68 | pub rid0: u32, 69 | pub rid1: u32, 70 | pub strand1: u8, 71 | pub len0: u32, 72 | pub len1: u32, 73 | pub d_left: i32, 74 | pub d_right: i32, 75 | pub bgn0: u32, 76 | pub end0: u32, 77 | pub bgn1: u32, 78 | pub end1: u32, 79 | pub dist: u32, 80 | // dist: raw distice determine by the O(Nd) alignment algorithm 81 | pub idt: f32, 82 | pub dist_c: u32, 83 | // dist_c: "distance" after hp corrections 84 | pub max_dist_c: u32, 85 | pub idt_c: f32, 86 | pub flag: u8, 87 | // flag bit field, Not used now 2020/11/02 88 | // 0x01: the rid0 is chimer 89 | // 0x02: the rir0 and rid1 are compatitable pair 90 | // 0x04: the rid1 is the best right pair 91 | // 0x08: the rid1 is the best left pair 92 | // 0x10: the rid1 is a chimer 93 | // 0x20: the rid1 is contained 94 | // 0x40: the rid0 is contained 95 | } 96 | 97 | impl Overlap { 98 | pub fn _new() -> Self { 99 | Self { 100 | rid0: 0, 101 | rid1: 0, 102 | strand1: 0, 103 | len0: 0, 104 | len1: 0, 105 | d_left: 0, 106 | d_right: 0, 107 | bgn0: 0, 108 | end0: 0, 109 | bgn1: 0, 110 | end1: 0, 111 | dist: 0, 112 | idt: 0.0, 113 | dist_c: 0, 114 | max_dist_c: 0, 115 | idt_c: 0.0, 116 | flag: 0, 117 | } 118 | } 119 | 120 | pub fn build_from(v: Vec<&str>) -> Self { 121 | Overlap { 122 | rid0: v[1].parse().unwrap(), 123 | rid1: v[2].parse().unwrap(), 124 | strand1: v[3].parse().unwrap(), 125 | len0: v[4].parse().unwrap(), 126 | len1: v[5].parse().unwrap(), 127 | d_left: v[6].parse().unwrap(), 128 | d_right: v[7].parse().unwrap(), 129 | bgn0: v[8].parse().unwrap(), 130 | end0: v[9].parse().unwrap(), 131 | bgn1: v[10].parse().unwrap(), 132 | end1: v[11].parse().unwrap(), 133 | dist: v[12].parse().unwrap(), 134 | idt: v[13].parse().unwrap(), 135 | 
dist_c: v[14].parse().unwrap(), 136 | max_dist_c: v[15].parse().unwrap(), 137 | idt_c: v[16].parse().unwrap(), 138 | flag: v[17].parse().unwrap(), 139 | } 140 | } 141 | 142 | pub fn _format(&self) -> String { 143 | format!( 144 | "{} {} {} {} {} {} {} {} {} {} {} {} {:.2} {} {} {:.2} {}", 145 | self.rid0, 146 | self.rid1, 147 | self.strand1, 148 | self.len0, 149 | self.len1, 150 | self.d_left, 151 | self.d_right, 152 | self.bgn0, 153 | self.end0, 154 | self.bgn1, 155 | self.end1, 156 | self.dist, 157 | self.idt, 158 | self.dist_c, 159 | self.max_dist_c, 160 | self.idt_c, 161 | self.flag 162 | ) 163 | } 164 | 165 | pub fn swap_rp(&self) -> Overlap { 166 | // swap the overlapped pair 167 | let d_left: i32; 168 | let d_right: i32; 169 | let strand1: u8; 170 | let bgn0: u32; 171 | let end0: u32; 172 | let bgn1: u32; 173 | let end1: u32; 174 | if self.strand1 == 0 { 175 | d_left = -self.d_left; 176 | d_right = -self.d_right; 177 | strand1 = 0; 178 | bgn0 = self.bgn0; 179 | end0 = self.end0; 180 | bgn1 = self.bgn1; 181 | end1 = self.end1; 182 | } else { 183 | d_left = self.d_right; 184 | d_right = self.d_left; 185 | strand1 = 1; 186 | bgn0 = self.len0 - self.end0; 187 | end0 = self.len0 - self.bgn0; 188 | bgn1 = self.len1 - self.end1; 189 | end1 = self.len1 - self.bgn1; 190 | } 191 | Overlap { 192 | rid0: self.rid1, 193 | rid1: self.rid0, 194 | strand1: strand1, 195 | len0: self.len1, 196 | len1: self.len0, 197 | d_left: d_left, 198 | d_right: d_right, 199 | bgn0: bgn1, 200 | end0: end1, 201 | bgn1: bgn0, 202 | end1: end0, 203 | dist: self.dist, 204 | idt: self.idt, 205 | dist_c: self.dist_c, 206 | max_dist_c: self.max_dist_c, 207 | idt_c: self.idt_c, 208 | flag: self.flag, 209 | } 210 | } 211 | 212 | pub fn reverse_strand(&self) -> Overlap { 213 | // reverse the overlapped strain of the read 1 214 | Overlap { 215 | rid0: self.rid0, 216 | rid1: self.rid1, 217 | strand1: 1 - self.strand1, 218 | len0: self.len1, 219 | len1: self.len0, 220 | d_left: -self.d_right, 221 | 
d_right: -self.d_left, 222 | bgn0: self.len0 - self.end0, 223 | end0: self.len0 - self.bgn0, 224 | bgn1: self.len1 - self.end1, 225 | end1: self.len1 - self.bgn1, 226 | dist: self.dist, 227 | idt: self.idt, 228 | dist_c: self.dist_c, 229 | max_dist_c: self.max_dist_c, 230 | idt_c: self.idt_c, 231 | flag: self.flag | 0x80, 232 | } 233 | } 234 | } 235 | 236 | pub type OverlapMap = FxHashMap>; 237 | 238 | fn get_upath(g: &OvlpGraph, v: ReadNode, w: ReadNode) -> Vec { 239 | // 240 | // find a simple path (no out branch) start at v->w edge in the graph g 241 | // 242 | 243 | let mut path = Vec::::new(); 244 | let mut visited_nodes = FxHashSet::::default(); 245 | path.push(v); 246 | path.push(w); 247 | 248 | visited_nodes.insert(v); 249 | visited_nodes.insert(w); 250 | 251 | let mut n = w; 252 | loop { 253 | if g.neighbors_directed(n, Outgoing).count() == 1 254 | && g.neighbors_directed(n, Incoming).count() == 1 255 | { 256 | n = *g 257 | .neighbors_directed(n, Outgoing) 258 | .into_iter() 259 | .collect::>() 260 | .get(0) 261 | .unwrap(); 262 | if !visited_nodes.contains(&n) { 263 | path.push(n); 264 | visited_nodes.insert(n); 265 | } else { 266 | path.push(n); 267 | break; 268 | } 269 | } else { 270 | break; 271 | } 272 | } 273 | path 274 | } 275 | 276 | pub fn get_utg_paths(g: &OvlpGraph) -> Vec<(u32, Vec)> { 277 | // 278 | // get all unitig paths 279 | // 280 | 281 | let mut start_nodes = FxHashSet::<(u32, u8)>::default(); 282 | for v in g.nodes() { 283 | if g.neighbors_directed(v, Incoming).count() != 1 284 | || g.neighbors_directed(v, Outgoing).count() != 1 285 | { 286 | start_nodes.insert(v); 287 | } 288 | } 289 | 290 | let mut uid = 0_u32; 291 | let mut paths = Vec::<(u32, Vec<(u32, u8)>)>::new(); 292 | 293 | for v in start_nodes { 294 | for w in g.neighbors_directed(v, Outgoing) { 295 | let path = get_upath(&g, v, w); 296 | //let e = path[path.len() - 1]; 297 | paths.push((uid, path)); 298 | uid += 1; 299 | } 300 | } 301 | paths 302 | } 303 | 304 | pub fn 
transitive_reduction(g: &mut U32AsmGraph) -> () { 305 | let mut tr_edges = FxHashSet::<((u32, u8), (u32, u8))>::default(); 306 | for v in g.nodes().into_iter() { 307 | let mut edges = Vec::<(u32, (u32, u8), (u32, u8))>::with_capacity(32); 308 | for w in g.neighbors_directed(v, Outgoing) { 309 | let ovlp_length = *g.edge_weight(v, w).unwrap(); 310 | edges.push((ovlp_length, v, w)); 311 | } 312 | edges.sort(); 313 | for (_l, v, w) in edges.iter() { 314 | for (_l, _vv, x) in edges.iter() { 315 | if x == w { 316 | continue; 317 | } 318 | // w in out_neighbor(x), v->x->w exist, v->w can be eliminate 319 | let mut found = false; 320 | for x_out in g.neighbors_directed(*x, Outgoing) { 321 | if x_out == *w { 322 | found = true; 323 | break; 324 | } 325 | } 326 | if found { 327 | tr_edges.insert((*v, *w)); 328 | } 329 | } 330 | } 331 | } 332 | 333 | for (v, w, _) in g.all_edges() { 334 | if g.neighbors_directed(v, Incoming).count() == 0 335 | && g.neighbors_directed(w, Incoming).count() >= 2 336 | && g.neighbors_directed(w, Outgoing).count() >= 1 337 | { 338 | tr_edges.insert((v, w)); 339 | } 340 | if g.neighbors_directed(w, Outgoing).count() == 0 341 | && g.neighbors_directed(v, Outgoing).count() >= 2 342 | && g.neighbors_directed(v, Incoming).count() >= 1 343 | { 344 | tr_edges.insert((v, w)); 345 | } 346 | } 347 | 348 | for (v, w) in tr_edges.into_iter() { 349 | //println!("D {:?} {:?}", v, w); 350 | g.remove_edge(v, w); 351 | } 352 | } 353 | 354 | pub fn remove_simple_spur(g: &mut U32AsmGraph, max: u32) { 355 | let mut edges = FxHashSet::::default(); 356 | for (v, w, s) in g.all_edges() { 357 | if *s > max { 358 | continue; 359 | } 360 | if g.neighbors_directed(v, Incoming).count() == 0 361 | && g.neighbors_directed(w, Incoming).count() >= 2 362 | && g.neighbors_directed(w, Outgoing).count() >= 1 363 | { 364 | edges.insert((v, w)); 365 | } 366 | if g.neighbors_directed(w, Outgoing).count() == 0 367 | && g.neighbors_directed(v, Outgoing).count() >= 2 368 | && 
g.neighbors_directed(v, Incoming).count() >= 1
        // NOTE(review): this is the tail of a function whose header (and the
        // beginning of this `if` condition) lies above this chunk; it collects
        // qualifying edges and then removes them from the graph.
        {
            edges.insert((v, w));
        }
    }
    // Remove every edge collected above.
    for (v, w) in edges.into_iter() {
        g.remove_edge(v, w);
    }
}

/// Remove spurious "bridge" edges (heuristically, repeat-induced connections).
///
/// An edge `(v, w)` is a candidate when both endpoints look like junctions
/// (`v` has >= 2 out-neighbors, `w` has >= 2 in-neighbors) and the edge weight
/// is below `max`.  Candidates are processed in increasing weight order; each
/// removal also deletes the dual edge between the orientation-flipped nodes
/// `(w.0, 1 - w.1) -> (v.0, 1 - v.1)` (nodes are `(read_id, orientation)`
/// pairs — presumably `1 - orient` is the opposite strand; confirm against the
/// graph construction code).  A candidate is only removed when, at removal
/// time, both endpoints and their flipped twins still keep at least one
/// alternative edge, so no node is left stranded.
pub fn remove_single_bridge(g: &mut U32AsmGraph, max: u32) -> () {
    //
    // implement a heuristic rule removing spurious connections (repeat induced bridge)
    //

    // Candidates keyed as (weight, edge) so they can later be ordered by weight.
    let mut to_remove = FxHashSet::<(u32, OvlpEdge)>::default();

    for (v, w, weight) in g.all_edges() {
        if g.neighbors_directed(v, Outgoing).count() >= 2
            && g.neighbors_directed(v, Incoming).count() >= 1
            && g.neighbors_directed(w, Outgoing).count() >= 1
            && g.neighbors_directed(w, Incoming).count() >= 2
            && *weight < max
        {
            to_remove.insert((*weight, (v, w)));
        }
    }

    let mut to_remove_vec = Vec::<(u32, OvlpEdge)>::new();
    for e in to_remove {
        to_remove_vec.push(e);
    }

    to_remove_vec.sort(); // remove edges with smaller weight (~ overlaps length) first

    for (_c, (v, w)) in to_remove_vec {
        // Re-check the degrees on every removal: earlier removals in this loop
        // may have reduced them, and each side (in both orientations) must keep
        // at least one remaining edge.
        if g.neighbors_directed(v, Outgoing).count() > 1
            && g.neighbors_directed(w, Incoming).count() > 1
            && g.neighbors_directed((w.0, 1 - w.1), Outgoing).count() > 1
            && g.neighbors_directed((v.0, 1 - v.1), Incoming).count() > 1
        {
            g.remove_edge(v, w);
            // Also remove the dual (orientation-flipped) edge.
            g.remove_edge((w.0, 1 - w.1), (v.0, 1 - v.1));
            //println!("RMSB0 {}:{} {}:{}", v.0, v.1, w.0, w.1);
            //println!("RMSB1 {}:{} {}:{}", w.0, 1-w.1, v.0, 1-v.1);
        }
    }
}

/// Dump the unitig paths to `filename` in a simple text format:
/// one `UTG <uid> <bgn>:<orient> <end>:<orient> <len> <tag>` header per path
/// (`tag` is 1 when the unitig graph has a direct begin->end edge, else 0),
/// followed by one `N <uid> <read>:<orient>` line per node on the path.
///
/// NOTE(review): the function returns `io::Result`, but `File::create` is
/// `unwrap()`ed — a creation failure panics instead of propagating.
pub fn dump_utg_paths(
    paths: &Vec<(u32, Vec)>,
    utg_g: &UtgGraph,
    filename: &String,
) -> Result<(), io::Error> {
    let mut utg_file = BufWriter::new(File::create(filename).unwrap());

    for (uid, p) in paths.iter() {
        // First and last node of the path define the unitig's ends.
        let v = p[0];
        let w = p[p.len() - 1];
        let mut tag = 0_u32;
        if utg_g.contains_edge(v, w) {
            tag = 1;
        }
        writeln!(
            utg_file,
            "UTG {} {}:{} {}:{} {} {}",
            uid,
            v.0,
            v.1,
            w.0,
            w.1,
            p.len(),
            tag
        )?;
        for v in p {
            writeln!(utg_file, "N {} {}:{}", uid, v.0, v.1)?;
        }
    }
    Ok(())
}

/// Dump the unitig paths as a GFA 1.0 file.
///
/// Each path of length >= 2 becomes an `S` record with no sequence (`*`); the
/// unitig length is accumulated from the pairwise overlap records in
/// `rpair2overlap` (read lengths minus the overlap span on read 0).  `N`
/// records list each member read with its start offset inside the unitig.
/// `L` records connect unitigs whose end node equals another unitig's begin
/// node (`+`/`+`), or whose end nodes meet on opposite orientations (`+`/`-`).
///
/// NOTE(review): `rpair2overlap.get(...).unwrap()` panics if an adjacent pair
/// on a path has no overlap record, and `File::create` is `unwrap()`ed despite
/// the `io::Result` return — confirm these invariants hold for all callers.
pub fn dump_utg_gfa(
    paths: &Vec<(u32, Vec)>,
    utg_g: &UtgGraph,
    rpair2overlap: &FxHashMap,
    _read_to_ctg: &FxHashMap>,
    filename: &String,
) -> Result<(), io::Error> {
    let mut gfa_file = BufWriter::new(File::create(filename).unwrap());
    writeln!(gfa_file, "H\tVN:Z:1.0")?;
    // Index of path end/begin nodes -> unitig ids, used to emit `L` records.
    let mut end_nodes = FxHashMap::<(u32, u8), Vec>::default();
    let mut bgn_nodes = FxHashMap::<(u32, u8), Vec>::default();
    // Read id -> read length, filled in while walking the paths below.
    let mut read_len = FxHashMap::::default();
    for (uid, p) in paths.iter() {
        // Singleton paths carry no overlap edge; skip them.
        if p.len() < 2 {
            continue;
        }
        let nb = p[0];
        let ne = p[p.len() - 1];
        let mut tag = 0_u32;
        if utg_g.contains_edge(nb, ne) {
            tag = 1;
        }
        bgn_nodes.entry(nb).or_insert_with(|| vec![]).push(*uid);
        end_nodes.entry(ne).or_insert_with(|| vec![]).push(*uid);
        // (node, start offset inside the unitig) for each member read.
        let mut node_start = Vec::<(ReadNode, u32)>::new();
        let mut v = p[0];
        let mut utg_len = 0_u32;
        node_start.push((v, 0));
        for w in p[1..p.len()].iter() {
            let ovlp = rpair2overlap.get(&ReadPair::new(v, *w)).unwrap();
            let len0 = ovlp.len0;
            let len1 = ovlp.len1;
            let ovlp_len = ovlp.end0 - ovlp.bgn0;
            // The first read contributes its full length once.
            if utg_len == 0 {
                utg_len += len0;
                read_len.insert(v.0, len0);
            }
            node_start.push((*w, utg_len));
            // Each subsequent read contributes its non-overlapping suffix.
            utg_len += len1 - ovlp_len;
            read_len.insert(w.0, len1);
            v = *w;
        }

        writeln!(
            gfa_file,
            "S\t{}\t*\tLN:i:{}\tRB:i:{}\tSB:i:{}\tRE:i:{}\tSE:i:{}\tNC:i:{}\tTG:i:{}",
            uid,
            utg_len,
            nb.0,
            nb.1,
            ne.0,
            ne.1,
            p.len(),
            tag
        )?;

        for (n, start) in node_start {
            writeln!(gfa_file, "N\t{}\t{}\t{}\t{}", uid, n.0, n.1, start)?;
        }
    }
    for (v, uids0) in end_nodes.iter() {
        // Same-orientation junction: this unitig's end is another's begin.
        if let Some(uids1) = bgn_nodes.get(v) {
            for u0 in uids0 {
                for u1 in uids1 {
                    let rlen = read_len.get(&v.0).unwrap();
                    writeln!(gfa_file, "L\t{}\t+\t{}\t+\t{}M", u0, u1, rlen)?;
                }
            }
        }

        // Opposite-orientation junction: two unitig ends meet on the flipped node.
        if let Some(uids1) = end_nodes.get(&(v.0, 1 - v.1)) {
            for u0 in uids0 {
                for u1 in uids1 {
                    if *u0 == *u1 {
                        continue;
                    }
                    let rlen = read_len.get(&v.0).unwrap();
                    writeln!(gfa_file, "L\t{}\t+\t{}\t-\t{}M", u0, u1, rlen)?;
                }
            }
        }
    }
    Ok(())
}

/// Breadth-first search from node `v` in graph `g`, bounded by `limit`.
///
/// Returns `(count, nodes)` where `count` is the number of nodes visited and
/// `nodes` are the visited nodes in BFS order (petgraph's `Bfs` yields the
/// start node `v` first).  Note the `count` is incremented before the push, so
/// when the limit is hit the last visited node is NOT included — at most
/// `limit - 1` nodes are returned.
pub fn bfs_extend(v: (u32, u8), g: &U32AsmGraph, limit: u32) -> (u32, Vec<(u32, u8)>) {
    //
    // output bfs search from node v in graph g bounded by the `limit`
    //

    let mut bfs = Bfs::new(g, v);
    let mut count = 0_u32;
    let mut nodes = Vec::<(u32, u8)>::with_capacity(32);
    while let Some(n) = bfs.next(g) {
        count += 1;
        if count >= limit {
            break;
        }
        nodes.push(n);
    }
    //assert!(nodes[0]==v);
    (count, nodes)
}

/// Find nodes with >= 2 outgoing branches whose BFS extensions look like
/// genuinely independent paths.
///
/// For each out-neighbor `w` of a candidate `v`, a BFS of up to
/// `max_path_length` nodes is run; the branch counts as "extended" when it
/// reaches at least `min_path_length` nodes and (within the first
/// `max_edge_count` nodes) does not revisit nodes seen from a sibling branch
/// and does not loop back onto `v`'s read.  Nodes with >= 2 such extended
/// branches are returned.
///
/// NOTE(review): `g` is taken as `&mut` but is never mutated here, and the
/// first `ovlp_count` summation (over `v`'s out-edges) is loop-invariant yet
/// recomputed for every neighbor `w`; also the `break` on `ovlp_count <= 3`
/// abandons the remaining neighbors of `v` entirely — confirm that is the
/// intended behavior rather than a `continue`.
fn find_branching_nodes(
    g: &mut UtgGraph,
    max_path_length: u32,
    min_path_length: u32,
    max_edge_count: u32,
) -> Vec<(u32, u8)> {
    // output nodes that have branches

    let mut branching_nodes = FxHashSet::<(u32, u8)>::default();
    let candidates = g.nodes().collect::>();
    for v in candidates {
        let out_count = g.neighbors_directed(v, Outgoing).count();
        // Only nodes with at least two out-edges can branch.
        if out_count < 2 {
            continue;
        }
        let mut ext_branch_count = 0_u32;
        // How many sibling branches touch each node; a repeat visit marks the
        // branches as overlapping (not independent).
        let mut node_count = FxHashMap::<(u32, u8), u32>::default();
        for w in g.neighbors_directed(v, Outgoing) {
            let (count, p) = bfs_extend(w, g, max_path_length);
            let mut overlapped_path = false;
            // Total edge weight supporting this neighborhood: v's out-edges
            // plus the out-edges of every node reached by the BFS.
            let mut ovlp_count = 0;
            for ww in g.neighbors_directed(v, Outgoing) {
                ovlp_count += g.edge_weight(v, ww).unwrap();
            }
            for &vv in &p {
                for ww in g.neighbors_directed(vv, Outgoing) {
                    ovlp_count += g.edge_weight(vv, ww).unwrap();
                }
            }
            if ovlp_count <= 3 {
                // don't count if a branch is very short
                break;
            }
            if count >= min_path_length {
                let mut ec = 0_u32;
                for vv in p {
                    ec += 1;
                    if ec > max_edge_count {
                        break;
                    }
                    if vv.0 == v.0 {
                        // loop back onto v's own read
                        break;
                    }
                    let c = node_count.entry(vv).or_insert(0);
                    *c += 1;
                    if *c > 1 {
                        // node already claimed by a sibling branch
                        overlapped_path = true;
                        break;
                    }
                }
            }
            if !overlapped_path {
                ext_branch_count += 1;
            }
        }
        if ext_branch_count >= 2 {
            branching_nodes.insert(v);
        }
    }
    branching_nodes.iter().map(|x| *x).collect()
}

/// Search a path from `s` to `t` by BFS, tracking for each visited node the
/// incoming neighbor with the smallest accumulated edge weight.
///
/// Gives up after visiting 256 nodes.  On success returns
/// `Some((total_weight_to_t, edges))` where `edges` is the `(from, to)` list
/// of the reconstructed path in order; otherwise `None`.
///
/// NOTE(review): this is a BFS relaxation, not Dijkstra — because each node is
/// finalized in BFS order, the reported weight is not guaranteed to be the
/// global minimum; it serves as a heuristic cost here.
fn find_path(g: &UtgGraph, s: ReadNode, t: ReadNode) -> Option<(u32, Vec)> {
    // search a path from s to t

    assert!(s != t);
    // Predecessor map for path reconstruction; `None` marks the start node.
    let mut pre = FxHashMap::>::default();
    // Best accumulated edge weight seen so far for each visited node.
    let mut min_edge_count = FxHashMap::::default();
    let mut out = Vec::::new();

    pre.insert(s, None);
    min_edge_count.insert(s, 0);
    let limit = 256_u32;
    let mut count = 0_u32;
    let mut found = false;

    let mut bfs = Bfs::new(&g, s);
    let mut m: u32;
    while let Some(n) = bfs.next(&g) {
        m = u32::MAX;
        // Pick the already-visited in-neighbor with the smallest accumulated weight.
        let mut min_in = None;
        for ww in g.neighbors_directed(n, Incoming) {
            if let Some(mm) = min_edge_count.get(&ww) {
                if *mm < m {
                    m = *mm;
                    min_in = Some(ww);
                }
            }
        }

        if min_in != None && n != s {
            pre.insert(n, min_in);
            min_edge_count.insert(n, m + g.edge_weight(min_in.unwrap(), n).unwrap());
        }

        if n == t {
            found = true;
            break;
        }
        count += 1;
        if count > limit {
            // search budget exhausted
            break;
        }
    }
    if found {
        // Walk predecessors back from t to s, then reverse into forward order.
        let mut v = t;
        loop {
            let p = *pre.get(&v).unwrap();
            if p == None {
                break;
            }
            out.push((p.unwrap(), v));
            v = p.unwrap();
        }
        out.reverse();
        Some((*min_edge_count.get(&t).unwrap(), out))
    } else {
        None
    }
}

/// Graph reduction at the unitig level.
///
/// Builds a unitig graph (`UtgGraph`) with one edge per path (begin -> end,
/// weighted by path length), then removes repeat-induced connections:
/// 1. find branching nodes; their orientation-flipped twins with outgoing
///    edges become "sink" targets;
/// 2. from each branching node, BFS each out-branch (bounded to 16 nodes) and
///    look for a low-cost path (accumulated weight < 5) into a sink — such
///    paths are repeat-induced candidates;
/// 3. remove candidate paths (and their orientation-flipped dual edges), but
///    always keep at least one outgoing branch per node;
/// 4. patch back begin->end edges for paths whose begin became a dead end
///    while the end still has continuation, so useful connectivity survives;
/// 5. rebuild an overlap graph (`OvlpGraph`) containing only the node-to-node
///    edges of paths that survived in the unitig graph, copying edge weights
///    from `g0`.
///
/// Returns `(utg_g, g)` — the reduced unitig graph and the rebuilt overlap graph.
pub fn utg_reduction(paths: &Vec<(u32, Vec)>, g0: &OvlpGraph) -> (UtgGraph, OvlpGraph) {
    // graph reduction in the unitig level

    let mut utg_g = UtgGraph::new();
    let mut g = OvlpGraph::new();
    // One unitig-level edge per path, weighted by the path's node count.
    for (_uid, p) in paths.iter() {
        let b = p[0];
        let e = p[p.len() - 1];
        utg_g.add_edge(b, e, p.len() as u32);
    }

    let mut branching_nodes = find_branching_nodes(&mut utg_g, 24, 1, 128);
    branching_nodes.sort();
    // Orientation-flipped twins of branching nodes act as sinks for the
    // repeat-induced-path search below.  (sic: variable name typo preserved.)
    let mut bracnhing_node_sinks = FxHashSet::::default();
    for v in branching_nodes.iter() {
        let w = (v.0, 1 - v.1);
        if utg_g.neighbors_directed(w, Outgoing).count() > 0 {
            // only add w if the w is not dead end
            bracnhing_node_sinks.insert(w);
            //println!("SINK {}:{}", v.0, 1 - v.1);
        }
    }
    //println!("BCOUNT {}", branching_nodes.len());
    let mut ripaths = Vec::>::new(); //ripaths = repeat induced path
    for v in branching_nodes.iter() {
        // (cost, edge list) of removable repeat-induced paths out of v.
        let mut paths_to_remove_candidates = Vec::<(u32, Vec)>::new();
        for w in utg_g.neighbors_directed(*v, Outgoing) {
            let vv = *v;
            let ww = w;
            if ww == vv {
                // ignore self edge caused by a loop
                continue;
            }
            log::debug!("BS {}:{} {}:{}", vv.0, vv.1, ww.0, ww.1);
            // Bounded BFS extension from the branch head; collect any sink
            // nodes it reaches.
            let c_nodes = bfs_extend(ww, &utg_g, 16);
            let mut sink_nodes = Vec::::new();
            for p in &c_nodes.1 {
                if bracnhing_node_sinks.contains(p) {
                    sink_nodes.push(*p);
                }
            }
            if sink_nodes.len() > 0 {
                // Pick the cheapest branch-to-sink path (total weight < 5).
                let mut s_ec = u32::MAX;
                let mut s_path = Vec::::new();
                let ec0 = *utg_g.edge_weight(vv, ww).unwrap();
                s_path.push((vv, ww));
                for n in sink_nodes {
                    log::debug!("V {}:{} W {}:{} SINK {}:{}", v.0, v.1, w.0, w.1, n.0, n.1);
                    if w == n {
                        // the branch head itself is a sink; cost is just the first edge
                        s_ec = ec0;
                        break;
                    } else if let Some((mut ec, path)) = find_path(&utg_g, ww, n) {
                        ec += ec0;
                        if ec < 5 {
                            if ec < s_ec {
                                s_ec = ec;
                                s_path.extend(path);
                            }
                        }
                    }
                }
                if s_ec < 5 {
                    log::debug!("PL {} {}", s_ec, s_path.len());
                    paths_to_remove_candidates.push((s_ec, s_path));
                }
            }
        }
        //remove repeat candidates but ensure to keep one out
        paths_to_remove_candidates.sort();
        /*
        println!(
            "BN {}:{} {}",
            v.0,
            v.1,
            utg_g.neighbors_directed(*v, Outgoing).count()
        );
        */
        for (_ec, path) in paths_to_remove_candidates.iter() {
            log::debug!("RMPC {} {:?}", _ec, path,);
        }
        if paths_to_remove_candidates.len() < utg_g.neighbors_directed(*v, Outgoing).count() {
            // Not every out-branch is a removal candidate, so removing all
            // candidates still leaves v at least one outgoing branch.
            for (_ec, path) in paths_to_remove_candidates.iter() {
                log::debug!("RMP {} {:?}", _ec, path);
                ripaths.push(path.clone());
            }
        } else {
            // All out-branches are candidates: keep the most expensive one
            // (the last after sorting) so v is not stranded.
            let l = paths_to_remove_candidates.len();
            if l >= 1 {
                for i in 0..l - 1 {
                    let (_ec, path) = paths_to_remove_candidates.get(i).unwrap();
                    log::debug!("RMP {} {:?}", _ec, path);
                    ripaths.push(path.clone());
                }
            }
        }
    }
    // Remove the selected repeat-induced paths and their orientation-flipped duals.
    for p in ripaths {
        for (vv, ww) in p {
            utg_g.remove_edge(vv, ww);
            utg_g.remove_edge((ww.0, 1 - ww.1), (vv.0, 1 - vv.1));
            //log::debug!("RME {}:{} {}:{}", vv.0, vv.1, ww.0, ww.1);
        }
    }

    //patch some useful edges back
    for (_uid, p) in paths.iter() {
        let b = p[0];
        let e = p[p.len() - 1];
        // If the removals turned b into a pure sink while e still continues,
        // restore this unitig's edge (and its dual) to preserve connectivity.
        if utg_g.neighbors_directed(b, Outgoing).count() == 0
            && utg_g.neighbors_directed(b, Incoming).count() > 0
            && utg_g.neighbors_directed(e, Outgoing).count() > 0
            && utg_g.neighbors_directed(e, Incoming).count() == 0
        {
            utg_g.add_edge(b, e, p.len() as u32);
            utg_g.add_edge((e.0, 1 - e.1), (b.0, 1 - b.1), p.len() as u32);
            //log::debug!("ADD {}:{} {}:{}", b.0, b.1, e.0, e.1);
        }
    }

    //remove_simple_spur(&mut utg_g, 5);
    //remove_single_bridge(&mut utg_g, 5);

    // Rebuild the read-level overlap graph, keeping only edges of paths that
    // survived in the reduced unitig graph; weights are copied from g0.
    for (_uid, p) in paths.iter() {
        let v = p[0];
        let w = p[p.len() - 1];
        if utg_g.contains_edge(v, w) {
            let mut vv = v;
            for i in 1..p.len() {
                let ww = p[i];
                let weight = *g0.edge_weight(vv, ww).unwrap();
                g.add_edge(vv, ww, weight);
                vv = ww;
            }
        }
    }
    (utg_g, g)
}
--------------------------------------------------------------------------------