├── .gitignore ├── tests ├── test_graphs │ ├── test_gap.ann.csv │ ├── scc_tangle.ann.csv │ ├── sparse_markers.ann.csv │ ├── test_gap.gfa │ ├── test_assign.ann.csv │ ├── test1.no_homozygous.csv │ ├── path_closing.ann.csv │ ├── test1.ann.csv │ ├── scc_tangle.gfa │ ├── test2.ann.csv │ ├── sparse_markers.gfa │ ├── test3.ann.csv │ ├── test1.gfa │ ├── path_closing.gfa │ ├── test_assign.gfa │ ├── test2.gfa │ └── test3.gfa ├── scc_test.rs ├── trio_test.rs ├── graph_test.rs ├── superbubble_test.rs └── trio_walk_test.rs ├── src ├── graph_algos.rs ├── main.rs ├── graph_algos │ ├── scc.rs │ ├── dfs.rs │ └── superbubble.rs ├── pseudo_hap.rs ├── trio.rs ├── lib.rs └── graph.rs ├── Cargo.toml ├── .vscode ├── tasks.json └── launch.json ├── .github └── workflows │ ├── CI.yml │ ├── fmt.yml │ └── clippy.yml ├── README.licenses └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /tests/test_graphs/test_gap.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-947 MATERNAL #FF8888 3 | utig4-948 PATERNAL #8888FF 4 | utig4-1318 MATERNAL #FF8888 5 | utig4-1319 PATERNAL #8888FF 6 | utig4-1320 MATERNAL #FF8888 7 | utig4-1321 PATERNAL #8888FF 8 | utig4-1322 MATERNAL #FF8888 9 | -------------------------------------------------------------------------------- /src/graph_algos.rs: -------------------------------------------------------------------------------- 1 | pub mod dfs; 2 | pub mod scc; 3 | pub mod superbubble; 4 | 5 | pub fn only_or_none<T>(mut iter: impl Iterator<Item = T>) -> Option<T> { 6 | let e = iter.next()?; 7 | match iter.next() { 8 | None => Some(e), 9 | _ => None, 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1
| [package] 2 | name = "rukki" 3 | version = "0.4.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | log = "0.4" 10 | env_logger = "0.10" 11 | clap = { version = "4.1", features = ["derive"] } 12 | itertools = "0.10" 13 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "shell", 6 | "command": "cargo", 7 | "args": [ 8 | "build", 9 | "--tests" 10 | ], 11 | "problemMatcher": [ 12 | "$rustc" 13 | ], 14 | "group": { 15 | "kind": "build", 16 | "isDefault": true 17 | }, 18 | "label": "Rust: cargo build --tests - rukki" 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /tests/test_graphs/scc_tangle.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-648 PATERNAL #8888FF 3 | utig4-2545 PATERNAL #8888FF 4 | utig4-2602 PATERNAL #8888FF 5 | utig4-2603 PATERNAL #8888FF 6 | utig4-2604 PATERNAL #8888FF 7 | utig4-2605 PATERNAL #8888FF 8 | utig4-2606 PATERNAL #8888FF 9 | utig4-2607 PATERNAL #8888FF 10 | utig4-2608 PATERNAL #8888FF 11 | utig4-2609 PATERNAL #8888FF 12 | utig4-2610 PATERNAL #8888FF 13 | utig4-4076 PATERNAL #8888FF 14 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: Test and Build 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build_and_test: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run 
tests 22 | run: cargo test --verbose 23 | -------------------------------------------------------------------------------- /.github/workflows/fmt.yml: -------------------------------------------------------------------------------- 1 | name: Formatting 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | rustfmt: 14 | name: Verify code formatting 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - uses: hecrj/setup-rust-action@v1 19 | with: 20 | components: rustfmt 21 | - run: cargo fmt --all -- --check 22 | -------------------------------------------------------------------------------- /tests/test_graphs/sparse_markers.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-791 HOMOZYGOUS #7900D6 3 | utig4-792 MATERNAL #FF8888 4 | utig4-794 PATERNAL #8888FF 5 | utig4-795 MATERNAL #FF8888 6 | utig4-1418 HOMOZYGOUS #7900D6 7 | utig4-1419 PATERNAL #8888FF 8 | utig4-1420 MATERNAL #FF8888 9 | utig4-1422 MATERNAL #FF8888 10 | utig4-1423 PATERNAL #8888FF 11 | utig4-1424 MATERNAL #FF8888 12 | utig4-1435 HOMOZYGOUS #7900D6 13 | utig4-1437 MATERNAL #FF8888 14 | utig4-1438 PATERNAL #8888FF 15 | utig4-1439 MATERNAL #FF8888 16 | -------------------------------------------------------------------------------- /.github/workflows/clippy.yml: -------------------------------------------------------------------------------- 1 | name: Clippy 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | clippy: 14 | name: Lint with clippy 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - uses: actions-rs/toolchain@v1 19 | with: 20 | toolchain: nightly 21 | components: clippy 22 | - run: cargo clippy --workspace --all-targets --verbose --all-features 23 | 
-------------------------------------------------------------------------------- /tests/test_graphs/test_gap.gfa: -------------------------------------------------------------------------------- 1 | S utig4-947 * LN:i:2459260 RC:i:65899807 ll:f:26.797 2 | S utig4-948 * LN:i:2458044 RC:i:65419120 ll:f:26.614 3 | S utig4-1318 * LN:i:92031 RC:i:5418914 ll:f:58.881 4 | S utig4-1319 * LN:i:4287968 RC:i:99208143 ll:f:23.136 5 | S utig4-1320 * LN:i:4251545 RC:i:99597543 ll:f:23.426 6 | S utig4-1321 * LN:i:6585985 RC:i:158659672 ll:f:24.091 7 | S utig4-1322 * LN:i:6858336 RC:i:170299341 ll:f:24.831 8 | L utig4-947 + utig4-1318 - 1591M 9 | L utig4-948 + utig4-1318 - 1612M 10 | L utig4-1318 - utig4-1320 + 1644M 11 | L utig4-1318 - utig4-1319 + 1582M 12 | L utig4-1319 + utig4-1322 + 1695M 13 | L utig4-1319 + utig4-1321 + 1588M 14 | -------------------------------------------------------------------------------- /tests/test_graphs/test_assign.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-1038 MATERNAL #FF8888 3 | utig4-1040 PATERNAL #8888FF 4 | utig4-1041 MATERNAL #FF8888 5 | utig4-1042 PATERNAL #8888FF 6 | utig4-1043 MATERNAL #FF8888 7 | utig4-1222 PATERNAL #8888FF 8 | utig4-1223 MATERNAL #FF8888 9 | utig4-1231 HOMOZYGOUS #7900D6 10 | utig4-1232 PATERNAL #8888FF 11 | utig4-1233 MATERNAL #FF8888 12 | utig4-1274 HOMOZYGOUS #7900D6 13 | utig4-1275 PATERNAL #8888FF 14 | utig4-1277 MATERNAL #FF8888 15 | utig4-1366 HOMOZYGOUS #7900D6 16 | utig4-1367 PATERNAL #8888FF 17 | utig4-1607 HOMOZYGOUS #7900D6 18 | utig4-1608 MATERNAL #FF8888 19 | utig4-1616 HOMOZYGOUS #7900D6 20 | utig4-1621 HOMOZYGOUS #7900D6 21 | utig4-1891 HOMOZYGOUS #7900D6 22 | utig4-4120 PATERNAL #8888FF 23 | utig4-4154 MATERNAL #FF8888 24 | utig4-4287 MATERNAL #FF8888 25 | -------------------------------------------------------------------------------- /tests/test_graphs/test1.no_homozygous.csv: 
-------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-1238 PATERNAL #8888FF 3 | utig4-1239 MATERNAL #FF8888 4 | utig4-1240 MATERNAL #FF8888 5 | utig4-1241 PATERNAL #8888FF 6 | utig4-1553 PATERNAL #8888FF 7 | utig4-1554 MATERNAL #FF8888 8 | utig4-1827 PATERNAL #8888FF 9 | utig4-1828 MATERNAL #FF8888 10 | utig4-1829 MATERNAL #FF8888 11 | utig4-1830 PATERNAL #8888FF 12 | utig4-1831 PATERNAL #8888FF 13 | utig4-1832 MATERNAL #FF8888 14 | utig4-2590 MATERNAL #FF8888 15 | utig4-2591 PATERNAL #8888FF 16 | utig4-2592 PATERNAL #8888FF 17 | utig4-2593 MATERNAL #FF8888 18 | utig4-4096 PATERNAL #8888FF 19 | utig4-4097 PATERNAL #8888FF 20 | utig4-4098 PATERNAL #8888FF 21 | utig4-4099 PATERNAL #8888FF 22 | utig4-4100 PATERNAL #8888FF 23 | utig4-4105 MATERNAL #FF8888 24 | utig4-4112 PATERNAL #8888FF 25 | utig4-4113 PATERNAL #8888FF 26 | -------------------------------------------------------------------------------- /tests/test_graphs/path_closing.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-768 HOMOZYGOUS #7900D6 3 | utig4-769 PATERNAL #8888FF 4 | utig4-770 MATERNAL #FF8888 5 | utig4-771 MATERNAL #FF8888 6 | utig4-772 PATERNAL #8888FF 7 | utig4-773 MATERNAL #FF8888 8 | utig4-775 PATERNAL #8888FF 9 | utig4-814 HOMOZYGOUS #7900D6 10 | utig4-818 PATERNAL #8888FF 11 | utig4-1343 HOMOZYGOUS #7900D6 12 | utig4-1346 PATERNAL #8888FF 13 | utig4-1395 HOMOZYGOUS #7900D6 14 | utig4-1396 PATERNAL #8888FF 15 | utig4-1568 HOMOZYGOUS #7900D6 16 | utig4-1574 HOMOZYGOUS #7900D6 17 | utig4-1575 MATERNAL #FF8888 18 | utig4-1576 PATERNAL #8888FF 19 | utig4-1796 HOMOZYGOUS #7900D6 20 | utig4-1797 PATERNAL #8888FF 21 | utig4-1798 MATERNAL #FF8888 22 | utig4-1799 MATERNAL #FF8888 23 | utig4-3412 PATERNAL #8888FF 24 | utig4-3444 MATERNAL #FF8888 25 | utig4-4080 MATERNAL #FF8888 26 | utig4-4212 PATERNAL #8888FF 27 | utig4-4213 PATERNAL #8888FF 28 | 
-------------------------------------------------------------------------------- /tests/scc_test.rs: -------------------------------------------------------------------------------- 1 | //use rukki::*; 2 | //use rukki::graph_algos::scc; 3 | //use std::fs; 4 | //use std::fs::File; 5 | //use std::io::Write; 6 | //use itertools::Itertools; 7 | ////FIXME populate with small corner cases. 8 | 9 | //#[test] 10 | //fn manual_tmp_test() { 11 | // let in_file = ""; 12 | // let out_file = ""; 13 | // let scc_out_file = ""; 14 | // let g = Graph::read(&fs::read_to_string(in_file).unwrap()); 15 | // let sccs = scc::strongly_connected(&g); 16 | // let (cond, _v_map) = scc::condensation(&g, &sccs, false); 17 | // let mut output = File::create(out_file).unwrap(); 18 | // write!(output, "{}", cond.as_gfa()).unwrap(); 19 | // let mut output = File::create(scc_out_file).unwrap(); 20 | // for (scc_id, scc) in sccs.iter().enumerate() { 21 | // write!(output, "scc_{}: {}\n", scc_id, scc.iter().map(|&w| g.v_str(w)).join(",")).unwrap(); 22 | // } 23 | //} 24 | -------------------------------------------------------------------------------- /tests/test_graphs/test1.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-1238 PATERNAL #8888FF 3 | utig4-1239 MATERNAL #FF8888 4 | utig4-1240 MATERNAL #FF8888 5 | utig4-1241 PATERNAL #8888FF 6 | utig4-1553 PATERNAL #8888FF 7 | utig4-1554 MATERNAL #FF8888 8 | utig4-1827 PATERNAL #8888FF 9 | utig4-1828 MATERNAL #FF8888 10 | utig4-1829 MATERNAL #FF8888 11 | utig4-1830 PATERNAL #8888FF 12 | utig4-1831 PATERNAL #8888FF 13 | utig4-1832 MATERNAL #FF8888 14 | utig4-2590 MATERNAL #FF8888 15 | utig4-2591 PATERNAL #8888FF 16 | utig4-2592 PATERNAL #8888FF 17 | utig4-2593 MATERNAL #FF8888 18 | utig4-4096 PATERNAL #8888FF 19 | utig4-4097 PATERNAL #8888FF 20 | utig4-4098 PATERNAL #8888FF 21 | utig4-4099 PATERNAL #8888FF 22 | utig4-4100 PATERNAL #8888FF 23 | utig4-4105 MATERNAL #FF8888 24 | 
utig4-4112 PATERNAL #8888FF 25 | utig4-4113 PATERNAL #8888FF 26 | utig4-1237 HOMOZYGOUS #7900D6 27 | utig4-1552 HOMOZYGOUS #7900D6 28 | utig4-1826 HOMOZYGOUS #7900D6 29 | utig4-2589 HOMOZYGOUS #7900D6 30 | -------------------------------------------------------------------------------- /README.licenses: -------------------------------------------------------------------------------- 1 | PUBLIC DOMAIN NOTICE 2 | 3 | This software is "United States Government Work" under the terms of the United 4 | States Copyright Act. It was written as part of the authors' official duties 5 | for the United States Government and thus cannot be copyrighted. This software 6 | is freely available to the public for use without a copyright 7 | notice. Restrictions cannot be placed on its present or future use. 8 | 9 | Although all reasonable efforts have been taken to ensure the accuracy and 10 | reliability of the software and associated data, the National Human Genome 11 | Research Institute (NHGRI), National Institutes of Health (NIH) and the 12 | U.S. Government do not and cannot warrant the performance or results that may 13 | be obtained by using this software or data. NHGRI, NIH and the U.S. Government 14 | disclaim all warranties as to performance, merchantability or fitness for any 15 | particular purpose. 16 | 17 | Please cite the authors in any work or product based on this material. 
18 | -------------------------------------------------------------------------------- /tests/trio_test.rs: -------------------------------------------------------------------------------- 1 | extern crate log; 2 | use itertools::Itertools; 3 | 4 | use rukki::trio::*; 5 | use rukki::*; 6 | use std::fs; 7 | 8 | fn init() { 9 | let _ = env_logger::builder().is_test(true).try_init(); 10 | } 11 | 12 | #[test] 13 | fn homozygous_assignment() { 14 | init(); 15 | 16 | let graph_fn = "tests/test_graphs/test1.gfa"; 17 | let assignments_fn = "tests/test_graphs/test1.no_homozygous.csv"; 18 | let g = graph::Graph::read(&fs::read_to_string(graph_fn).unwrap()); 19 | let assignments = trio::parse_node_assignments(&g, assignments_fn).unwrap(); 20 | let assigner = 21 | trio::HomozygousAssigner::new(&g, assignments, 200_000, None, 500_000, 1.5, usize::MAX); 22 | 23 | let assignments = assigner.run(); 24 | 25 | let mut homozygous_names = (0..g.node_cnt()) 26 | .filter(|&node_id| assignments.group(node_id) == Some(TrioGroup::HOMOZYGOUS)) 27 | .map(|node_id| g.name(node_id)) 28 | .collect_vec(); 29 | homozygous_names.sort(); 30 | assert_eq!( 31 | &homozygous_names, 32 | &["utig4-1237", "utig4-1552", "utig4-1826", "utig4-2589"] 33 | ); 34 | } 35 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | //use std::io; 2 | #[macro_use] 3 | extern crate log; 4 | use clap::{Parser, Subcommand}; 5 | use env_logger::{Builder, Env, Target}; 6 | 7 | #[derive(Parser, Debug)] 8 | #[command(name = "rukki", author = "Sergey Nurk", about = "extraction of paths from assembly graphs", long_about=None)] 9 | struct Args { 10 | #[clap(subcommand)] 11 | subcmd: Commands, 12 | } 13 | 14 | #[derive(Subcommand, Debug)] 15 | enum Commands { 16 | /// Trio-marker based analysis 17 | Trio(rukki::TrioSettings), 18 | } 19 | 20 | fn main() { 21 | //env_logger::init(); 22 | let mut builder = 
Builder::from_env(Env::default().default_filter_or("info")); 23 | builder.target(Target::Stdout); 24 | builder.init(); 25 | //info!("Starting up"); 26 | 27 | //info!("Cmd arguments: {:?}", env::args()); 28 | 29 | let args = Args::parse(); 30 | 31 | match &args.subcmd { 32 | Commands::Trio(settings) => { 33 | info!("Running trio marker analysis"); 34 | settings.validate(); 35 | 36 | match rukki::run_trio_analysis(settings) { 37 | Ok(()) => info!("Success"), 38 | Err(e) => info!("Some error happened {:?}", e), 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/test_graphs/scc_tangle.gfa: -------------------------------------------------------------------------------- 1 | S utig4-648 * LN:i:8662028 RC:i:225322736 ll:f:26.013 2 | S utig4-2545 * LN:i:2347206 RC:i:53739986 ll:f:22.895 3 | S utig4-2602 * LN:i:148123 RC:i:11704620 ll:f:79.02 4 | S utig4-2603 * LN:i:127043 RC:i:6472002 ll:f:50.943 5 | S utig4-2604 * LN:i:199000 RC:i:5454968 ll:f:27.412 6 | S utig4-2605 * LN:i:92273 RC:i:4988804 ll:f:54.066 7 | S utig4-2606 * LN:i:231023 RC:i:6246793 ll:f:27.04 8 | S utig4-2607 * LN:i:113533 RC:i:3263699 ll:f:28.747 9 | S utig4-2608 * LN:i:5199 RC:i:80584 ll:f:15.5 10 | S utig4-2609 * LN:i:5314 RC:i:98309 ll:f:18.5 11 | S utig4-2610 * LN:i:31694 RC:i:1064912 ll:f:33.6 12 | S utig4-4076 * LN:i:148046 RC:i:7775983 ll:f:52.524 13 | L utig4-648 + utig4-2602 + 1641M 14 | L utig4-2545 + utig4-2606 - 14656M 15 | L utig4-2602 + utig4-2604 + 1435M 16 | L utig4-2602 + utig4-2605 + 1435M 17 | L utig4-2602 - utig4-2603 + 1673M 18 | L utig4-2603 + utig4-2608 + 2004M 19 | L utig4-2603 + utig4-2609 + 2003M 20 | L utig4-2604 + utig4-4076 - 30546M 21 | L utig4-2605 + utig4-2607 + 6805M 22 | L utig4-2605 + utig4-2606 + 6805M 23 | L utig4-2606 + utig4-2610 + 14656M 24 | L utig4-2607 + utig4-4076 - 30546M 25 | L utig4-2608 + utig4-2610 - 3130M 26 | L utig4-2609 + utig4-2610 - 3247M 27 | L utig4-2610 - utig4-4076 + 14656M 28 | 
-------------------------------------------------------------------------------- /tests/test_graphs/test2.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-307 HOMOZYGOUS #7900D6 3 | utig4-309 PATERNAL #8888FF 4 | utig4-310 PATERNAL #8888FF 5 | utig4-311 MATERNAL #FF8888 6 | utig4-312 PATERNAL #8888FF 7 | utig4-412 HOMOZYGOUS #7900D6 8 | utig4-413 PATERNAL #8888FF 9 | utig4-416 MATERNAL #FF8888 10 | utig4-417 MATERNAL #FF8888 11 | utig4-418 PATERNAL #8888FF 12 | utig4-419 MATERNAL #FF8888 13 | utig4-420 ISSUE #FFDE24 14 | utig4-421 PATERNAL #8888FF 15 | utig4-422 PATERNAL #8888FF 16 | utig4-768 HOMOZYGOUS #7900D6 17 | utig4-769 PATERNAL #8888FF 18 | utig4-770 MATERNAL #FF8888 19 | utig4-771 MATERNAL #FF8888 20 | utig4-772 PATERNAL #8888FF 21 | utig4-773 MATERNAL #FF8888 22 | utig4-775 PATERNAL #8888FF 23 | utig4-1384 HOMOZYGOUS #7900D6 24 | utig4-1385 MATERNAL #FF8888 25 | utig4-1386 PATERNAL #8888FF 26 | utig4-1459 HOMOZYGOUS #7900D6 27 | utig4-1460 MATERNAL #FF8888 28 | utig4-1461 PATERNAL #8888FF 29 | utig4-1462 PATERNAL #8888FF 30 | utig4-1463 MATERNAL #FF8888 31 | utig4-1897 HOMOZYGOUS #7900D6 32 | utig4-1898 MATERNAL #FF8888 33 | utig4-1899 PATERNAL #8888FF 34 | utig4-3412 PATERNAL #8888FF 35 | utig4-3429 PATERNAL #8888FF 36 | utig4-3430 MATERNAL #FF8888 37 | utig4-3431 MATERNAL #FF8888 38 | utig4-3444 MATERNAL #FF8888 39 | utig4-4073 MATERNAL #FF8888 40 | utig4-4080 MATERNAL #FF8888 41 | utig4-4212 PATERNAL #8888FF 42 | utig4-4213 PATERNAL #8888FF 43 | utig4-4227 HOMOZYGOUS #7900D6 44 | -------------------------------------------------------------------------------- /tests/test_graphs/sparse_markers.gfa: -------------------------------------------------------------------------------- 1 | S utig4-791 * LN:i:85487 RC:i:5159867 ll:f:60.359 2 | S utig4-792 * LN:i:498444 RC:i:13699434 ll:f:27.484 3 | S utig4-793 * LN:i:498270 RC:i:13291701 ll:f:26.676 4 | S utig4-794 * LN:i:58381 
RC:i:1467009 ll:f:25.128 5 | S utig4-795 * LN:i:58381 RC:i:1847087 ll:f:31.639 6 | S utig4-1418 * LN:i:112656 RC:i:5942210 ll:f:52.747 7 | S utig4-1419 * LN:i:135842 RC:i:3825922 ll:f:28.165 8 | S utig4-1420 * LN:i:135724 RC:i:4003247 ll:f:29.496 9 | S utig4-1421 * LN:i:1788713 RC:i:47601230 ll:f:26.612 10 | S utig4-1422 * LN:i:1788398 RC:i:50148111 ll:f:28.041 11 | S utig4-1423 * LN:i:18425897 RC:i:490209934 ll:f:26.604 12 | S utig4-1424 * LN:i:18521096 RC:i:490592347 ll:f:26.488 13 | S utig4-1435 * LN:i:112341 RC:i:6766624 ll:f:60.233 14 | S utig4-1436 * LN:i:74169 RC:i:2180747 ll:f:29.402 15 | S utig4-1437 * LN:i:74173 RC:i:2114902 ll:f:28.513 16 | S utig4-1438 * LN:i:26824337 RC:i:714329412 ll:f:26.63 17 | S utig4-1439 * LN:i:26774252 RC:i:727014652 ll:f:27.154 18 | L utig4-791 + utig4-794 + 1576M 19 | L utig4-791 + utig4-795 + 1576M 20 | L utig4-791 - utig4-792 + 1678M 21 | L utig4-791 - utig4-793 + 1671M 22 | L utig4-792 + utig4-1418 - 1509M 23 | L utig4-793 + utig4-1418 - 1470M 24 | L utig4-794 + utig4-1435 + 1497M 25 | L utig4-795 + utig4-1435 + 1497M 26 | L utig4-1418 - utig4-1419 + 1679M 27 | L utig4-1418 - utig4-1420 + 1548M 28 | L utig4-1419 + utig4-1421 + 14954M 29 | L utig4-1420 + utig4-1422 + 14954M 30 | L utig4-1420 + utig4-1421 + 14954M 31 | L utig4-1421 + utig4-1424 + 15519M 32 | L utig4-1421 + utig4-1423 + 15519M 33 | L utig4-1422 + utig4-1424 + 15519M 34 | L utig4-1435 + utig4-1436 + 1598M 35 | L utig4-1435 + utig4-1437 + 1597M 36 | L utig4-1436 + utig4-1439 + 14230M 37 | L utig4-1436 + utig4-1438 + 14230M 38 | L utig4-1437 + utig4-1439 + 14230M 39 | -------------------------------------------------------------------------------- /tests/test_graphs/test3.ann.csv: -------------------------------------------------------------------------------- 1 | node assignment color 2 | utig4-64 HOMOZYGOUS #7900D6 3 | utig4-65 MATERNAL #FF8888 4 | utig4-67 MATERNAL #FF8888 5 | utig4-68 PATERNAL #8888FF 6 | utig4-923 HOMOZYGOUS #7900D6 7 | utig4-924 PATERNAL 
#8888FF 8 | utig4-925 MATERNAL #FF8888 9 | utig4-926 MATERNAL #FF8888 10 | utig4-927 PATERNAL #8888FF 11 | utig4-1019 HOMOZYGOUS #7900D6 12 | utig4-1020 MATERNAL #FF8888 13 | utig4-1021 PATERNAL #8888FF 14 | utig4-1022 MATERNAL #FF8888 15 | utig4-1023 PATERNAL #8888FF 16 | utig4-1024 PATERNAL #8888FF 17 | utig4-1025 MATERNAL #FF8888 18 | utig4-1026 PATERNAL #8888FF 19 | utig4-1027 MATERNAL #FF8888 20 | utig4-1249 HOMOZYGOUS #7900D6 21 | utig4-1252 MATERNAL #FF8888 22 | utig4-1253 PATERNAL #8888FF 23 | utig4-1254 MATERNAL #FF8888 24 | utig4-1255 PATERNAL #8888FF 25 | utig4-1256 PATERNAL #8888FF 26 | utig4-1387 HOMOZYGOUS #7900D6 27 | utig4-1392 HOMOZYGOUS #7900D6 28 | utig4-1393 MATERNAL #FF8888 29 | utig4-1402 HOMOZYGOUS #7900D6 30 | utig4-1405 PATERNAL #8888FF 31 | utig4-1407 MATERNAL #FF8888 32 | utig4-1408 PATERNAL #8888FF 33 | utig4-1409 MATERNAL #FF8888 34 | utig4-1410 PATERNAL #8888FF 35 | utig4-1450 HOMOZYGOUS #7900D6 36 | utig4-1451 MATERNAL #FF8888 37 | utig4-1452 PATERNAL #8888FF 38 | utig4-1476 HOMOZYGOUS #7900D6 39 | utig4-1477 MATERNAL #FF8888 40 | utig4-1478 PATERNAL #8888FF 41 | utig4-1529 HOMOZYGOUS #7900D6 42 | utig4-1530 PATERNAL #8888FF 43 | utig4-1531 MATERNAL #FF8888 44 | utig4-1532 PATERNAL #8888FF 45 | utig4-1533 MATERNAL #FF8888 46 | utig4-1534 PATERNAL #8888FF 47 | utig4-1535 MATERNAL #FF8888 48 | utig4-1595 HOMOZYGOUS #7900D6 49 | utig4-1596 PATERNAL #8888FF 50 | utig4-1597 MATERNAL #FF8888 51 | utig4-1617 HOMOZYGOUS #7900D6 52 | utig4-1618 PATERNAL #8888FF 53 | utig4-1795 HOMOZYGOUS #7900D6 54 | utig4-1892 HOMOZYGOUS #7900D6 55 | utig4-1896 HOMOZYGOUS #7900D6 56 | utig4-3384 PATERNAL #8888FF 57 | utig4-3445 PATERNAL #8888FF 58 | utig4-3446 MATERNAL #FF8888 59 | utig4-3447 PATERNAL #8888FF 60 | utig4-3448 MATERNAL #FF8888 61 | utig4-3455 PATERNAL #8888FF 62 | utig4-3456 MATERNAL #FF8888 63 | utig4-3587 MATERNAL #FF8888 64 | utig4-3588 MATERNAL #FF8888 65 | utig4-3589 PATERNAL #8888FF 66 | utig4-3590 MATERNAL #FF8888 67 | utig4-3591 
PATERNAL #8888FF 68 | utig4-3592 MATERNAL #FF8888 69 | utig4-3593 PATERNAL #8888FF 70 | utig4-3626 HOMOZYGOUS #7900D6 71 | utig4-3627 PATERNAL #8888FF 72 | utig4-3628 MATERNAL #FF8888 73 | utig4-3631 MATERNAL #FF8888 74 | utig4-3650 PATERNAL #8888FF 75 | utig4-4041 MATERNAL #FF8888 76 | utig4-4093 MATERNAL #FF8888 77 | utig4-4211 PATERNAL #8888FF 78 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "lldb", 9 | "request": "launch", 10 | "name": "Debug unit tests in library 'rukki'", 11 | "cargo": { 12 | "args": [ 13 | "test", 14 | "--no-run", 15 | "--lib", 16 | "--package=rukki" 17 | ], 18 | "filter": { 19 | "name": "rukki", 20 | "kind": "lib" 21 | } 22 | }, 23 | "args": [], 24 | "cwd": "${workspaceFolder}" 25 | }, 26 | { 27 | "type": "lldb", 28 | "request": "launch", 29 | "name": "Debug executable 'rukki'", 30 | "cargo": { 31 | "args": [ 32 | "build", 33 | "--bin=rukki", 34 | "--package=rukki" 35 | ], 36 | "filter": { 37 | "name": "rukki", 38 | "kind": "bin" 39 | } 40 | }, 41 | "args": [], 42 | "cwd": "${workspaceFolder}" 43 | }, 44 | { 45 | "type": "lldb", 46 | "request": "launch", 47 | "name": "Debug unit tests in executable 'rukki'", 48 | "cargo": { 49 | "args": [ 50 | "test", 51 | "--no-run", 52 | "--bin=rukki", 53 | "--package=rukki" 54 | ], 55 | "filter": { 56 | "name": "rukki", 57 | "kind": "bin" 58 | } 59 | }, 60 | "args": [], 61 | "cwd": "${workspaceFolder}" 62 | }, 63 | { 64 | "type": "lldb", 65 | "request": "launch", 66 | "name": "Debug integration test 'graph_test'", 67 | "cargo": { 68 | "args": [ 69 | "test", 70 | "--no-run", 71 | "--test=graph_test", 
72 | "--package=rukki" 73 | ], 74 | "filter": { 75 | "name": "graph_test", 76 | "kind": "test" 77 | } 78 | }, 79 | "args": [], 80 | "cwd": "${workspaceFolder}" 81 | } 82 | ] 83 | } -------------------------------------------------------------------------------- /tests/test_graphs/test1.gfa: -------------------------------------------------------------------------------- 1 | S utig4-1237 * LN:i:93030 RC:i:5081364 ll:f:54.621 2 | S utig4-1238 * LN:i:798769 RC:i:21041173 ll:f:26.342 3 | S utig4-1239 * LN:i:798897 RC:i:21466842 ll:f:26.871 4 | S utig4-1240 * LN:i:4763960 RC:i:128115271 ll:f:26.893 5 | S utig4-1241 * LN:i:4773193 RC:i:128533973 ll:f:26.928 6 | S utig4-1242 * LN:i:24853 RC:i:0 ll:f:0 7 | S utig4-1243 * LN:i:24853 RC:i:0 ll:f:0 8 | S utig4-1244 * LN:i:24852 RC:i:0 ll:f:0 9 | S utig4-1245 * LN:i:24852 RC:i:0 ll:f:0 10 | S utig4-1552 * LN:i:477967 RC:i:25164963 ll:f:52.65 11 | S utig4-1553 * LN:i:9288406 RC:i:243219698 ll:f:26.185 12 | S utig4-1554 * LN:i:9295001 RC:i:238275492 ll:f:25.635 13 | S utig4-1826 * LN:i:101634 RC:i:5416310 ll:f:53.292 14 | S utig4-1827 * LN:i:541057 RC:i:14687750 ll:f:27.146 15 | S utig4-1828 * LN:i:540745 RC:i:13891901 ll:f:25.69 16 | S utig4-1829 * LN:i:14840217 RC:i:373327919 ll:f:25.157 17 | S utig4-1830 * LN:i:18287362 RC:i:450882225 ll:f:24.655 18 | S utig4-1831 * LN:i:3319603 RC:i:91104181 ll:f:27.444 19 | S utig4-1832 * LN:i:3320560 RC:i:91185234 ll:f:27.461 20 | S utig4-2589 * LN:i:70873 RC:i:3716403 ll:f:52.438 21 | S utig4-2590 * LN:i:29869255 RC:i:799907610 ll:f:26.78 22 | S utig4-2591 * LN:i:28541269 RC:i:765639520 ll:f:26.826 23 | S utig4-2592 * LN:i:12635807 RC:i:303287167 ll:f:24.002 24 | S utig4-2593 * LN:i:19361012 RC:i:455502657 ll:f:23.527 25 | S utig4-4096 * LN:i:12629594 RC:i:336812328 ll:f:26.669 26 | S utig4-4097 * LN:i:26818 RC:i:549769 ll:f:20.5 27 | S utig4-4098 * LN:i:100817 RC:i:0 ll:f:0 28 | S utig4-4099 * LN:i:172817 RC:i:0 ll:f:0 29 | S utig4-4100 * LN:i:100816 RC:i:0 ll:f:0 30 | S utig4-4105 * 
LN:i:12644984 RC:i:327076421 ll:f:25.866 31 | S utig4-4112 * LN:i:100817 RC:i:0 ll:f:0 32 | S utig4-4113 * LN:i:100816 RC:i:0 ll:f:0 33 | L utig4-1237 + utig4-1240 + 1640M 34 | L utig4-1237 + utig4-1241 + 1631M 35 | L utig4-1237 - utig4-1239 + 1601M 36 | L utig4-1237 - utig4-1238 + 1593M 37 | L utig4-1238 + utig4-1552 + 1580M 38 | L utig4-1239 + utig4-1552 + 1626M 39 | L utig4-1240 + utig4-1245 + 23852M 40 | L utig4-1240 + utig4-1244 + 23852M 41 | L utig4-1241 + utig4-1242 + 23853M 42 | L utig4-1241 + utig4-1243 + 23853M 43 | L utig4-1242 + utig4-1832 - 15322M 44 | L utig4-1243 + utig4-1831 - 15322M 45 | L utig4-1244 + utig4-1831 - 15322M 46 | L utig4-1245 + utig4-1832 - 15322M 47 | L utig4-1552 + utig4-1554 + 1530M 48 | L utig4-1552 + utig4-1553 + 1530M 49 | L utig4-1553 + utig4-4096 - 17522M 50 | L utig4-1554 + utig4-4105 + 17522M 51 | L utig4-1554 + utig4-4096 - 17522M 52 | L utig4-1826 + utig4-1829 + 1781M 53 | L utig4-1826 + utig4-1830 + 1781M 54 | L utig4-1826 - utig4-1827 + 1483M 55 | L utig4-1826 - utig4-1828 + 1483M 56 | L utig4-1827 + utig4-1831 + 7497M 57 | L utig4-1827 + utig4-1832 + 7497M 58 | L utig4-1828 + utig4-1832 + 7497M 59 | L utig4-1828 + utig4-1831 + 7497M 60 | L utig4-2589 + utig4-2592 + 1563M 61 | L utig4-2589 + utig4-2593 + 1550M 62 | L utig4-2589 - utig4-2590 + 2211M 63 | L utig4-2589 - utig4-2591 + 2211M 64 | L utig4-2592 + utig4-4112 + 87719M 65 | L utig4-2592 + utig4-4097 - 13719M 66 | L utig4-2593 + utig4-4100 - 27816M 67 | L utig4-2593 + utig4-4105 - 26817M 68 | L utig4-4096 - utig4-4098 + 87086M 69 | L utig4-4096 - utig4-4097 + 13086M 70 | L utig4-4098 + utig4-4099 + 99817M 71 | L utig4-4098 + utig4-4100 + 99817M 72 | L utig4-4099 + utig4-4112 - 99817M 73 | L utig4-4105 + utig4-4113 - 27816M 74 | L utig4-4112 + utig4-4113 + 99817M 75 | -------------------------------------------------------------------------------- /tests/test_graphs/path_closing.gfa: -------------------------------------------------------------------------------- 1 
| S utig4-768 * LN:i:80358 RC:i:4660748 ll:f:58 2 | S utig4-769 * LN:i:3637298 RC:i:92987160 ll:f:25.565 3 | S utig4-770 * LN:i:3629728 RC:i:95914836 ll:f:26.425 4 | S utig4-771 * LN:i:105349 RC:i:757833 ll:f:7.194 5 | S utig4-772 * LN:i:14381500 RC:i:367627094 ll:f:25.563 6 | S utig4-773 * LN:i:26514 RC:i:172341 ll:f:6.5 7 | S utig4-774 * LN:i:30068 RC:i:871972 ll:f:29 8 | S utig4-775 * LN:i:100066 RC:i:0 ll:f:0 9 | S utig4-814 * LN:i:35724 RC:i:2158973 ll:f:60.435 10 | S utig4-815 * LN:i:3323 RC:i:186088 ll:f:56 11 | S utig4-816 * LN:i:3243 RC:i:21079 ll:f:6.5 12 | S utig4-817 * LN:i:60587 RC:i:0 ll:f:0 13 | S utig4-818 * LN:i:248099 RC:i:7389157 ll:f:29.783 14 | S utig4-819 * LN:i:25361 RC:i:253610 ll:f:10 15 | S utig4-1343 * LN:i:144209 RC:i:7221309 ll:f:50.075 16 | S utig4-1344 * LN:i:3341 RC:i:138651 ll:f:41.5 17 | S utig4-1345 * LN:i:3259 RC:i:81475 ll:f:25 18 | S utig4-1346 * LN:i:3218 RC:i:82059 ll:f:25.5 19 | S utig4-1347 * LN:i:3218 RC:i:85277 ll:f:26.5 20 | S utig4-1395 * LN:i:121871 RC:i:7418093 ll:f:60.868 21 | S utig4-1396 * LN:i:3077 RC:i:93848 ll:f:30.5 22 | S utig4-1397 * LN:i:3081 RC:i:80106 ll:f:26 23 | S utig4-1568 * LN:i:38930 RC:i:2567940 ll:f:65.963 24 | S utig4-1574 * LN:i:291615 RC:i:16248729 ll:f:55.72 25 | S utig4-1575 * LN:i:4802606 RC:i:131275393 ll:f:27.334 26 | S utig4-1576 * LN:i:4787839 RC:i:131390751 ll:f:27.443 27 | S utig4-1796 * LN:i:77698 RC:i:4038742 ll:f:51.98 28 | S utig4-1797 * LN:i:28908048 RC:i:775374554 ll:f:26.822 29 | S utig4-1798 * LN:i:28860130 RC:i:771670814 ll:f:26.738 30 | S utig4-1799 * LN:i:212264 RC:i:6472672 ll:f:30.494 31 | S utig4-3412 * LN:i:27615605 RC:i:715423671 ll:f:25.907 32 | S utig4-3444 * LN:i:27659866 RC:i:722804852 ll:f:26.132 33 | S utig4-4080 * LN:i:13822186 RC:i:362124687 ll:f:26.199 34 | S utig4-4212 * LN:i:100066 RC:i:0 ll:f:0 35 | S utig4-4213 * LN:i:100066 RC:i:0 ll:f:0 36 | L utig4-768 + utig4-772 + 80357M 37 | L utig4-768 + utig4-771 + 80357M 38 | L utig4-768 + utig4-773 + 1522M 39 | L 
utig4-768 - utig4-770 + 1584M 40 | L utig4-768 - utig4-769 + 1584M 41 | L utig4-771 + utig4-4080 + 24819M 42 | L utig4-772 + utig4-775 + 16755M 43 | L utig4-772 + utig4-774 + 15755M 44 | L utig4-773 + utig4-4080 + 24819M 45 | L utig4-774 + utig4-3412 - 14198M 46 | L utig4-775 + utig4-4212 - 99066M 47 | L utig4-814 + utig4-818 + 35723M 48 | L utig4-814 + utig4-817 + 35723M 49 | L utig4-814 + utig4-819 + 1497M 50 | L utig4-814 - utig4-815 + 1612M 51 | L utig4-814 - utig4-816 + 1489M 52 | L utig4-815 + utig4-1568 + 1692M 53 | L utig4-816 + utig4-1568 + 1736M 54 | L utig4-817 + utig4-1799 - 24740M 55 | L utig4-818 + utig4-1796 - 1416M 56 | L utig4-819 + utig4-1799 - 23740M 57 | L utig4-1343 + utig4-1347 + 1505M 58 | L utig4-1343 + utig4-1346 + 1505M 59 | L utig4-1343 - utig4-1344 + 1631M 60 | L utig4-1343 - utig4-1345 + 1550M 61 | L utig4-1344 + utig4-1568 - 1520M 62 | L utig4-1345 + utig4-1568 - 1520M 63 | L utig4-1346 + utig4-1395 + 1558M 64 | L utig4-1347 + utig4-1395 + 1558M 65 | L utig4-1395 + utig4-1396 + 1504M 66 | L utig4-1395 + utig4-1397 + 1504M 67 | L utig4-1396 + utig4-1574 - 1419M 68 | L utig4-1397 + utig4-1574 - 1419M 69 | L utig4-1574 - utig4-1575 + 1599M 70 | L utig4-1574 - utig4-1576 + 1569M 71 | L utig4-1796 + utig4-1799 + 1416M 72 | L utig4-1796 - utig4-1797 + 1541M 73 | L utig4-1796 - utig4-1798 + 1541M 74 | L utig4-3412 + utig4-4212 + 84198M 75 | L utig4-3444 + utig4-4080 - 15755M 76 | L utig4-4080 + utig4-4213 - 16755M 77 | L utig4-4212 + utig4-4213 + 99066M 78 | -------------------------------------------------------------------------------- /tests/test_graphs/test_assign.gfa: -------------------------------------------------------------------------------- 1 | S utig4-1036 * LN:i:51066 RC:i:3137367 ll:f:61.438 2 | S utig4-1037 * LN:i:73693 RC:i:0 ll:f:0 3 | S utig4-1038 * LN:i:560336 RC:i:14925334 ll:f:26.636 4 | S utig4-1039 * LN:i:23334 RC:i:665019 ll:f:28.5 5 | S utig4-1040 * LN:i:899705 RC:i:23234252 ll:f:25.824 6 | S utig4-1041 * LN:i:899242 
RC:i:22335553 ll:f:24.838 7 | S utig4-1042 * LN:i:5958735 RC:i:158474345 ll:f:26.595 8 | S utig4-1043 * LN:i:5954863 RC:i:155385600 ll:f:26.094 9 | S utig4-1222 * LN:i:11085676 RC:i:299855342 ll:f:27.049 10 | S utig4-1223 * LN:i:11140744 RC:i:302041167 ll:f:27.111 11 | S utig4-1231 * LN:i:78785 RC:i:3836349 ll:f:48.694 12 | S utig4-1232 * LN:i:509207 RC:i:13446884 ll:f:26.408 13 | S utig4-1233 * LN:i:3181 RC:i:77934 ll:f:24.5 14 | S utig4-1234 * LN:i:3162 RC:i:82212 ll:f:26 15 | S utig4-1274 * LN:i:250910 RC:i:14419497 ll:f:57.469 16 | S utig4-1275 * LN:i:6689 RC:i:185619 ll:f:27.75 17 | S utig4-1276 * LN:i:6671 RC:i:163439 ll:f:24.5 18 | S utig4-1277 * LN:i:21012 RC:i:524970 ll:f:24.984 19 | S utig4-1278 * LN:i:21012 RC:i:646146 ll:f:30.751 20 | S utig4-1366 * LN:i:651125 RC:i:31734009 ll:f:48.737 21 | S utig4-1367 * LN:i:3350 RC:i:120600 ll:f:36 22 | S utig4-1368 * LN:i:3302 RC:i:100711 ll:f:30.5 23 | S utig4-1369 * LN:i:3060 RC:i:81090 ll:f:26.5 24 | S utig4-1370 * LN:i:3061 RC:i:96421 ll:f:31.5 25 | S utig4-1607 * LN:i:433050 RC:i:22246775 ll:f:51.372 26 | S utig4-1608 * LN:i:3293 RC:i:85618 ll:f:26 27 | S utig4-1609 * LN:i:3277 RC:i:96671 ll:f:29.5 28 | S utig4-1616 * LN:i:262886 RC:i:14740728 ll:f:56.073 29 | S utig4-1621 * LN:i:213875 RC:i:11513854 ll:f:53.835 30 | S utig4-1891 * LN:i:198936 RC:i:10620477 ll:f:53.386 31 | S utig4-4120 * LN:i:17388432 RC:i:438025035 ll:f:25.191 32 | S utig4-4154 * LN:i:17362249 RC:i:442032442 ll:f:25.459 33 | S utig4-4286 * LN:i:43693 RC:i:1201558 ll:f:27.5 34 | S utig4-4287 * LN:i:43697 RC:i:1121558 ll:f:25.667 35 | L utig4-1036 + utig4-1040 + 1667M 36 | L utig4-1036 + utig4-1041 + 1655M 37 | L utig4-1036 - utig4-1037 + 51065M 38 | L utig4-1036 - utig4-1038 + 51065M 39 | L utig4-1036 - utig4-1039 + 1708M 40 | L utig4-1037 + utig4-1232 - 22496M 41 | L utig4-1038 + utig4-1231 + 1646M 42 | L utig4-1039 + utig4-1232 - 21496M 43 | L utig4-1040 + utig4-1043 + 15166M 44 | L utig4-1040 + utig4-1042 + 15166M 45 | L utig4-1041 + 
utig4-1043 + 15166M 46 | L utig4-1042 + utig4-4286 - 1516M 47 | L utig4-1042 + utig4-4287 - 1516M 48 | L utig4-1043 + utig4-4287 - 1516M 49 | L utig4-1222 + utig4-1616 + 1727M 50 | L utig4-1223 + utig4-1616 + 1648M 51 | L utig4-1231 + utig4-1233 + 1502M 52 | L utig4-1231 + utig4-1234 + 1502M 53 | L utig4-1231 - utig4-1232 + 1646M 54 | L utig4-1233 + utig4-1621 - 1538M 55 | L utig4-1234 + utig4-1621 - 1520M 56 | L utig4-1274 + utig4-1278 + 1667M 57 | L utig4-1274 + utig4-1277 + 1667M 58 | L utig4-1274 - utig4-1276 + 1468M 59 | L utig4-1274 - utig4-1275 + 1454M 60 | L utig4-1275 + utig4-1891 - 1519M 61 | L utig4-1276 + utig4-1891 - 1486M 62 | L utig4-1277 + utig4-1616 - 1558M 63 | L utig4-1278 + utig4-1616 - 1558M 64 | L utig4-1366 + utig4-1370 + 1524M 65 | L utig4-1366 + utig4-1369 + 1524M 66 | L utig4-1366 - utig4-1368 + 1458M 67 | L utig4-1366 - utig4-1367 + 1458M 68 | L utig4-1367 + utig4-1607 + 1499M 69 | L utig4-1368 + utig4-1607 + 1499M 70 | L utig4-1369 + utig4-1621 + 1439M 71 | L utig4-1370 + utig4-1621 + 1439M 72 | L utig4-1607 + utig4-1609 + 1532M 73 | L utig4-1607 + utig4-1608 + 1532M 74 | L utig4-1608 + utig4-1891 + 1595M 75 | L utig4-1609 + utig4-1891 + 1577M 76 | L utig4-4120 + utig4-4286 + 18630M 77 | L utig4-4120 + utig4-4287 + 18630M 78 | L utig4-4154 + utig4-4287 + 18630M 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rukki 2 | 3 | [![Actions Status](https://github.com/marbl/rukki/workflows/Test%20and%20Build/badge.svg)](https://github.com/marbl/rukki/actions) 4 | [![Actions Status](https://github.com/marbl/rukki/workflows/Formatting/badge.svg)](https://github.com/marbl/rukki/actions) 5 | [![Actions Status](https://github.com/marbl/rukki/workflows/Clippy/badge.svg)](https://github.com/marbl/rukki/actions) 6 | 7 | Rukki ([“spinning wheel”](https://en.wikipedia.org/wiki/Spinning_wheel) in Finnish) is a companion 
tool of Verkko assembler for extracting longer ‘scaffold’ paths from assembly graphs. 8 | 9 | Its primary purpose is to utilize parental (trio) information attributed to the Verkko assembly graph nodes for extraction of longer haplotype reconstructions in diploid organisms. 10 | 11 | Rukki first assigns parental (maternal/paternal) classes to the nodes with prevalence of corresponding parental-specific markers, tries to identify homozygous nodes (belonging to both haplotypes), and then performs a heuristic search of haplotype-paths starting from long nodes of the graph. 12 | 13 | Plans are to turn it into a tool for comprehensive analysis of assembly graphs; in particular, support for extraction of ‘primary’ and ‘alt’ scaffolds is under development. 14 | 15 | ## Some useful features 16 | 17 | * Can exclude suspicious nodes (having high prevalence of both types of markers) from the traversals. 18 | * Prevents the re-use of long nodes (unless assigned as homozygous), which can happen if the graph has missing connections. 19 | * Can scaffold across gaps in one haplotype if the other haplotype is intact. 20 | * Can scaffold across ambiguous regions (e.g. tandem repeat expansions). 21 | * Can deal with ambiguous bubble structures (either scaffold across or force-pick a path). 22 | 23 | ## Requirements 24 | 25 | Rust 2021 edition compiler. 26 | Try building with `cargo build --release`. 27 | 28 | ## Usage 29 | 30 | Basic usage example 31 | ``` 32 | ./target/release/rukki trio -g <graph.gfa> -m <marker_cnts.tsv> -p <out_paths.tsv> [--final-assign <node_assign.tsv>] [--try-fill-bubbles] 33 | ``` 34 | 35 | * `graph.gfa` -- graph in GFA format. Sequences are ignored and optional. 36 | Node coverage values will be used for various purposes if provided (as `RC:i:`, `FC:i:`, and/or `ll:f:` tags for `S` records). 37 | * `marker_cnts.tsv` -- TSV file, where first three columns of every line are interpreted as 38 | `node_name\tmaternal\tpaternal`, where 'maternal'/'paternal' are parental-specific marker counts.
39 | All columns after the third in TSV are ignored. 40 | * `out_paths.tsv` -- TSV output containing haplo-paths (one per line). 41 | Lines have format `path_name\tpath\tassignment`. 42 | By default paths are formatted as (`<node>[+-](,<node>[+-])*`). 43 | Also supports GAF path format, i.e. `([<>]<node>)+`, via the `--gaf-format` option. 44 | The path can also include gaps in the `[NXXXN]` format, where `XXX` is the integer giving an estimated gap size. 45 | Estimators are currently work in progress and not available for all cases. 46 | Default gap size (for cases where estimator is not yet available) is 5kb. 47 | Minimal reported value is currently fixed at 1kb (if an estimated value is lower than 1kb, 1kb will be reported instead). 48 | Gaps represent either an absence of the appropriate connections or a localized ambiguity within the graph. 49 | Assignment categories are `MATERNAL`, `PATERNAL` or `NA` (for _unassigned_). 50 | `NA` can only be associated with paths consisting of a single node. 51 | Every node of the graph is guaranteed to be covered by one or more output paths. 52 | * `--try-fill-bubbles` -- enables more aggressive filling of ambiguous regions with one of available alternatives (recommended). 53 | * `node_assign.tsv` -- assignments of individual nodes, reflecting their usage by haplo-paths (`MATERNAL`, `PATERNAL` or `HOMOZYGOUS`). Nodes forming _unassigned_ paths are excluded.
54 | 55 | To see all options use: 56 | ``` 57 | ./target/release/rukki trio --help 58 | ``` 59 | -------------------------------------------------------------------------------- /tests/test_graphs/test2.gfa: -------------------------------------------------------------------------------- 1 | S utig4-307 * LN:i:17862 RC:i:964548 ll:f:54 2 | S utig4-308 * LN:i:19467 RC:i:622944 ll:f:32 3 | S utig4-309 * LN:i:3947518 RC:i:109069133 ll:f:27.63 4 | S utig4-310 * LN:i:36864 RC:i:0 ll:f:0 5 | S utig4-311 * LN:i:105518 RC:i:2663190 ll:f:25.239 6 | S utig4-312 * LN:i:104105 RC:i:2718973 ll:f:26.118 7 | S utig4-412 * LN:i:61877 RC:i:2951056 ll:f:47.692 8 | S utig4-413 * LN:i:322115 RC:i:7346638 ll:f:22.808 9 | S utig4-414 * LN:i:322115 RC:i:9194064 ll:f:28.543 10 | S utig4-415 * LN:i:18799 RC:i:404179 ll:f:21.5 11 | S utig4-416 * LN:i:1620359 RC:i:43961474 ll:f:27.131 12 | S utig4-417 * LN:i:80149 RC:i:0 ll:f:0 13 | S utig4-418 * LN:i:3953570 RC:i:108559102 ll:f:27.459 14 | S utig4-419 * LN:i:3963846 RC:i:110022095 ll:f:27.756 15 | S utig4-420 * LN:i:40212 RC:i:0 ll:f:0 16 | S utig4-421 * LN:i:12920760 RC:i:352154022 ll:f:27.255 17 | S utig4-422 * LN:i:1566227 RC:i:42861211 ll:f:27.366 18 | S utig4-768 * LN:i:80358 RC:i:4660748 ll:f:58 19 | S utig4-769 * LN:i:3637298 RC:i:92987160 ll:f:25.565 20 | S utig4-770 * LN:i:3629728 RC:i:95914836 ll:f:26.425 21 | S utig4-771 * LN:i:105349 RC:i:757833 ll:f:7.194 22 | S utig4-772 * LN:i:14381500 RC:i:367627094 ll:f:25.563 23 | S utig4-773 * LN:i:26514 RC:i:172341 ll:f:6.5 24 | S utig4-774 * LN:i:30068 RC:i:871972 ll:f:29 25 | S utig4-775 * LN:i:100066 RC:i:0 ll:f:0 26 | S utig4-1384 * LN:i:85998 RC:i:4326482 ll:f:50.309 27 | S utig4-1385 * LN:i:17849954 RC:i:482311112 ll:f:27.02 28 | S utig4-1386 * LN:i:17992456 RC:i:492933919 ll:f:27.397 29 | S utig4-1459 * LN:i:26383 RC:i:1585911 ll:f:60.111 30 | S utig4-1460 * LN:i:11977901 RC:i:317081391 ll:f:26.472 31 | S utig4-1461 * LN:i:11965984 RC:i:317130884 ll:f:26.503 32 | S utig4-1462 * 
LN:i:18250086 RC:i:499408128 ll:f:27.365 33 | S utig4-1463 * LN:i:18260753 RC:i:498880120 ll:f:27.32 34 | S utig4-1897 * LN:i:154693 RC:i:7952489 ll:f:51.408 35 | S utig4-1898 * LN:i:5011993 RC:i:133716966 ll:f:26.679 36 | S utig4-1899 * LN:i:5010456 RC:i:132505517 ll:f:26.446 37 | S utig4-3412 * LN:i:27615605 RC:i:715423671 ll:f:25.907 38 | S utig4-3429 * LN:i:12437089 RC:i:319441656 ll:f:25.685 39 | S utig4-3430 * LN:i:12478428 RC:i:324394206 ll:f:25.996 40 | S utig4-3431 * LN:i:3931831 RC:i:106809369 ll:f:27.165 41 | S utig4-3444 * LN:i:27659866 RC:i:722804852 ll:f:26.132 42 | S utig4-4073 * LN:i:12913891 RC:i:352015881 ll:f:27.259 43 | S utig4-4080 * LN:i:13822186 RC:i:362124687 ll:f:26.199 44 | S utig4-4212 * LN:i:100066 RC:i:0 ll:f:0 45 | S utig4-4213 * LN:i:100066 RC:i:0 ll:f:0 46 | S utig4-4227 * LN:i:48300 RC:i:527277 ll:f:10.917 47 | L utig4-307 + utig4-311 + 1654M 48 | L utig4-307 + utig4-312 + 1573M 49 | L utig4-307 - utig4-309 + 17861M 50 | L utig4-307 - utig4-310 + 17861M 51 | L utig4-307 - utig4-308 + 1462M 52 | L utig4-308 + utig4-3431 - 17834M 53 | L utig4-309 + utig4-3429 + 1685M 54 | L utig4-309 + utig4-3430 + 1685M 55 | L utig4-310 + utig4-3431 - 18834M 56 | L utig4-311 + utig4-4227 - 2030M 57 | L utig4-312 + utig4-4227 - 2030M 58 | L utig4-412 + utig4-417 + 61876M 59 | L utig4-412 + utig4-416 + 61876M 60 | L utig4-412 + utig4-415 + 1525M 61 | L utig4-412 - utig4-413 + 1654M 62 | L utig4-412 - utig4-414 + 1654M 63 | L utig4-413 + utig4-1897 + 1482M 64 | L utig4-414 + utig4-1897 + 1482M 65 | L utig4-415 + utig4-422 - 17116M 66 | L utig4-416 + utig4-419 + 13323M 67 | L utig4-416 + utig4-418 + 13323M 68 | L utig4-417 + utig4-422 - 18116M 69 | L utig4-418 + utig4-421 + 27583M 70 | L utig4-418 + utig4-420 + 27583M 71 | L utig4-418 - utig4-422 + 13323M 72 | L utig4-419 + utig4-4073 - 30593M 73 | L utig4-420 + utig4-4073 - 30593M 74 | L utig4-421 + utig4-1461 - 11970M 75 | L utig4-768 + utig4-772 + 80357M 76 | L utig4-768 + utig4-771 + 80357M 77 | L 
utig4-768 + utig4-773 + 1522M 78 | L utig4-768 - utig4-770 + 1584M 79 | L utig4-768 - utig4-769 + 1584M 80 | L utig4-769 + utig4-1384 - 1680M 81 | L utig4-770 + utig4-1384 - 1724M 82 | L utig4-771 + utig4-4080 + 24819M 83 | L utig4-772 + utig4-775 + 16755M 84 | L utig4-772 + utig4-774 + 15755M 85 | L utig4-773 + utig4-4080 + 24819M 86 | L utig4-774 + utig4-3412 - 14198M 87 | L utig4-775 + utig4-4212 - 99066M 88 | L utig4-1384 - utig4-1385 + 1613M 89 | L utig4-1384 - utig4-1386 + 1500M 90 | L utig4-1385 + utig4-1899 - 13059M 91 | L utig4-1385 + utig4-1898 - 13059M 92 | L utig4-1386 + utig4-1899 - 13059M 93 | L utig4-1386 + utig4-1898 - 13059M 94 | L utig4-1459 + utig4-1463 + 1675M 95 | L utig4-1459 + utig4-1462 + 1675M 96 | L utig4-1459 - utig4-1460 + 1682M 97 | L utig4-1459 - utig4-1461 + 1645M 98 | L utig4-1460 + utig4-4073 + 11970M 99 | L utig4-1461 + utig4-4073 + 11970M 100 | L utig4-1462 + utig4-4227 + 26413M 101 | L utig4-1463 + utig4-4227 + 26413M 102 | L utig4-1897 + utig4-1898 + 1478M 103 | L utig4-1897 + utig4-1899 + 1478M 104 | L utig4-3412 + utig4-4212 + 84198M 105 | L utig4-3430 - utig4-3431 + 1685M 106 | L utig4-3444 + utig4-4080 - 15755M 107 | L utig4-4080 + utig4-4213 - 16755M 108 | L utig4-4212 + utig4-4213 + 99066M 109 | -------------------------------------------------------------------------------- /tests/graph_test.rs: -------------------------------------------------------------------------------- 1 | use rukki::*; 2 | 3 | #[test] 4 | fn one_node() { 5 | let s = "S a * LN:i:100"; 6 | let g = Graph::read(&s.replace(' ', "\t")); 7 | assert_eq!(1, g.node_cnt()); 8 | assert_eq!(0, g.link_cnt()); 9 | let n = g.all_nodes().next().unwrap(); 10 | assert_eq!("a", n.name); 11 | assert_eq!(100, n.length); 12 | assert_eq!(None, g.all_links().next()); 13 | assert_eq!(g.name2id("a"), 0); 14 | let v = Vertex::forward(0); 15 | assert_eq!(v.rc(), Vertex::reverse(0)); 16 | } 17 | 18 | #[test] 19 | fn loop1() { 20 | let s = " 21 | S a * LN:i:100 22 | L a + a + 
10M 23 | "; 24 | let g = Graph::read(&s.replace(' ', "\t")); 25 | assert_eq!(1, g.node_cnt()); 26 | assert_eq!(1, g.link_cnt()); 27 | let l = g.all_links().next().unwrap(); 28 | assert_eq!(10, l.overlap); 29 | assert_eq!(Direction::FORWARD, l.start.direction); 30 | assert_eq!(Direction::FORWARD, l.end.direction); 31 | let v = Vertex::forward(0); 32 | assert_eq!(g.outgoing_edges(v), vec![l]); 33 | assert_eq!(g.incoming_edges(v), vec![l]); 34 | } 35 | 36 | #[test] 37 | #[should_panic] 38 | fn nontrivial_cigar() { 39 | let s = " 40 | S a * LN:i:100 41 | L a + a + 1D10M1I 42 | "; 43 | Graph::read(&s.replace(' ', "\t")); 44 | } 45 | 46 | #[test] 47 | fn loop2() { 48 | let s = " 49 | S a * LN:i:100 50 | L a - a - 10M 51 | "; 52 | let g = Graph::read(&s.replace(' ', "\t")); 53 | assert_eq!(1, g.node_cnt()); 54 | assert_eq!(1, g.link_cnt()); 55 | let l = g.all_links().next().unwrap(); 56 | assert_eq!("a+->a+", g.l_str(l)); 57 | assert_eq!("a", g.node(l.start.node_id).name); 58 | assert_eq!("a", g.node(l.end.node_id).name); 59 | assert_eq!(Direction::FORWARD, l.start.direction); 60 | assert_eq!(Direction::FORWARD, l.end.direction); 61 | let v = Vertex::forward(0); 62 | assert_eq!(g.outgoing_edges(v), vec![l]); 63 | assert_eq!(g.incoming_edges(v), vec![l]); 64 | } 65 | 66 | #[test] 67 | fn self_conj1() { 68 | let s = " 69 | S a * LN:i:100 70 | L a + a - 10M 71 | "; 72 | let g = Graph::read(&s.replace(' ', "\t")); 73 | assert_eq!(1, g.node_cnt()); 74 | assert_eq!(1, g.link_cnt()); 75 | let l = g.all_links().next().unwrap(); 76 | assert_eq!("a+->a-", g.l_str(l)); 77 | assert_eq!(Direction::FORWARD, l.start.direction); 78 | assert_eq!(Direction::REVERSE, l.end.direction); 79 | let v = Vertex::forward(0); 80 | assert_eq!(g.outgoing_edges(v), vec![l]); 81 | assert_eq!(g.incoming_edges(v.rc()), vec![l]); 82 | } 83 | 84 | #[test] 85 | fn self_conj2() { 86 | let s = " 87 | S a * LN:i:100 88 | L a - a + 10M 89 | "; 90 | let g = Graph::read(&s.replace(' ', "\t")); 91 | assert_eq!(1, 
g.node_cnt()); 92 | assert_eq!(1, g.link_cnt()); 93 | let l = g.all_links().next().unwrap(); 94 | assert_eq!("a-->a+", g.l_str(l)); 95 | assert_eq!(Direction::REVERSE, l.start.direction); 96 | assert_eq!(Direction::FORWARD, l.end.direction); 97 | let v = Vertex::forward(0); 98 | assert_eq!(g.incoming_edges(v), vec![l]); 99 | assert_eq!(g.outgoing_edges(v.rc()), vec![l]); 100 | } 101 | 102 | #[test] 103 | fn two_nodes() { 104 | let s = " 105 | S a * LN:i:100 106 | S b * LN:i:200 107 | "; 108 | let g = Graph::read(&s.replace(' ', "\t")); 109 | assert_eq!(2, g.node_cnt()); 110 | assert_eq!(0, g.link_cnt()); 111 | assert_eq!(g.all_vertices().count(), 4); 112 | } 113 | 114 | #[test] 115 | fn one_link() { 116 | let s = " 117 | S a * LN:i:100 118 | S b * LN:i:200 119 | L a + b + 10M 120 | "; 121 | let g = Graph::read(&s.replace(' ', "\t")); 122 | assert_eq!(2, g.node_cnt()); 123 | assert_eq!(1, g.link_cnt()); 124 | let v = Vertex::forward(g.name2id("a")); 125 | let w = Vertex::forward(g.name2id("b")); 126 | let l = Link { 127 | start: v, 128 | end: w, 129 | overlap: 10, 130 | }; 131 | assert_eq!(g.outgoing_edges(v), vec![l]); 132 | assert_eq!(g.incoming_edges(v), vec![]); 133 | assert_eq!(g.outgoing_edges(v.rc()), vec![]); 134 | assert_eq!(g.incoming_edges(v.rc()), vec![l.rc()]); 135 | assert_eq!(g.outgoing_edges(w), vec![]); 136 | assert_eq!(g.incoming_edges(w), vec![l]); 137 | assert_eq!(g.outgoing_edges(w.rc()), vec![l.rc()]); 138 | assert_eq!(g.incoming_edges(w.rc()), vec![]); 139 | } 140 | 141 | #[test] 142 | #[should_panic] 143 | fn invalid_link() { 144 | let s = " 145 | S a * LN:i:100 146 | S b * LN:i:200 147 | L a + b + 100M 148 | "; 149 | Graph::read(&s.replace(' ', "\t")); 150 | } 151 | 152 | #[test] 153 | fn basic_sanitize() { 154 | let s = " 155 | S a * LN:i:100 156 | S b * LN:i:200 157 | L a + b + 100M 158 | L b - a - 50M 159 | "; 160 | let g = Graph::read_sanitize(&s.replace(' ', "\t")); 161 | assert_eq!(2, g.node_cnt()); 162 | assert_eq!(1, g.link_cnt()); 
163 | assert_eq!(99, g.all_links().next().unwrap().overlap); 164 | } 165 | -------------------------------------------------------------------------------- /tests/superbubble_test.rs: -------------------------------------------------------------------------------- 1 | use itertools::Itertools; 2 | 3 | use rukki::graph_algos::superbubble; 4 | use rukki::*; 5 | 6 | #[test] 7 | fn multi_link_bubble() { 8 | let s = " 9 | S a * LN:i:100 10 | S b * LN:i:100 11 | L a + b + 50M 12 | L a + b + 75M 13 | "; 14 | let g = Graph::read(&s.replace(' ', "\t")); 15 | let bubble = superbubble::find_superbubble( 16 | &g, 17 | Vertex::forward(0), 18 | &superbubble::SbSearchParams::unrestricted(), 19 | ) 20 | .unwrap(); 21 | assert_eq!(bubble.length_range(&g), (125, 150)); 22 | assert!(g.name(bubble.end_vertex().node_id) == "b"); 23 | } 24 | 25 | #[test] 26 | #[should_panic] 27 | fn extra_link_start() { 28 | let s = " 29 | S a * LN:i:100 30 | S c * LN:i:100 31 | S b * LN:i:100 32 | L a + b + 50M 33 | L a + b + 75M 34 | L a + c + 50M 35 | "; 36 | let g = Graph::read(&s.replace(' ', "\t")); 37 | let _bubble = superbubble::find_superbubble( 38 | &g, 39 | Vertex::forward(0), 40 | &superbubble::SbSearchParams::unrestricted(), 41 | ) 42 | .unwrap(); 43 | } 44 | 45 | #[test] 46 | #[should_panic] 47 | fn extra_link_end() { 48 | let s = " 49 | S a * LN:i:100 50 | S b * LN:i:100 51 | S c * LN:i:100 52 | L a + b + 50M 53 | L a + b + 50M 54 | L c + b + 50M 55 | "; 56 | let g = Graph::read(&s.replace(' ', "\t")); 57 | let _bubble = superbubble::find_superbubble( 58 | &g, 59 | Vertex::forward(0), 60 | &superbubble::SbSearchParams::unrestricted(), 61 | ) 62 | .unwrap(); 63 | } 64 | 65 | #[test] 66 | fn simple_bubble() { 67 | let s = " 68 | S a * LN:i:100 69 | S b * LN:i:100 70 | S c * LN:i:100 71 | S d * LN:i:100 72 | L a + b + 50M 73 | L a + c + 50M 74 | L b + d + 50M 75 | L c + d + 50M 76 | "; 77 | let g = Graph::read(&s.replace(' ', "\t")); 78 | let bubble = superbubble::find_superbubble( 79 | &g, 
80 | Vertex::forward(0), 81 | &superbubble::SbSearchParams::unrestricted(), 82 | ) 83 | .unwrap(); 84 | assert!(g.name(bubble.end_vertex().node_id) == "d"); 85 | assert_eq!(bubble.vertices().count(), 4); 86 | let mut bubble_vertices = bubble.vertices().map(|&v| g.v_str(v)).collect_vec(); 87 | bubble_vertices.sort(); 88 | assert_eq!(bubble_vertices, vec!["a+", "b+", "c+", "d+"]); 89 | assert_eq!(bubble.length_range(&g), (200, 200)); 90 | } 91 | 92 | //TODO support this case 93 | #[test] 94 | #[should_panic] 95 | fn simple_bubble_loop() { 96 | let s = " 97 | S a * LN:i:100 98 | S b * LN:i:100 99 | S c * LN:i:100 100 | L a + b + 50M 101 | L a + c + 50M 102 | L b + a + 50M 103 | L c + a + 50M 104 | "; 105 | let g = Graph::read(&s.replace(' ', "\t")); 106 | let bubble = superbubble::find_superbubble( 107 | &g, 108 | Vertex::forward(0), 109 | &superbubble::SbSearchParams::unrestricted(), 110 | ) 111 | .unwrap(); 112 | assert!(g.name(bubble.end_vertex().node_id) == "a"); 113 | assert_eq!(bubble.vertices().count(), 4); 114 | let mut bubble_vertices = bubble.vertices().map(|&v| g.v_str(v)).collect_vec(); 115 | bubble_vertices.sort(); 116 | assert_eq!(bubble_vertices, vec!["a+", "b+", "c+"]); 117 | assert_eq!(bubble.length_range(&g), (100, 100)); 118 | } 119 | 120 | #[test] 121 | fn triple_bubble() { 122 | let s = " 123 | S a * LN:i:100 124 | S b * LN:i:100 125 | S c * LN:i:100 126 | S d * LN:i:100 127 | L a + b + 50M 128 | L a + c + 50M 129 | L b + d + 50M 130 | L c + d + 50M 131 | L a + d + 50M 132 | "; 133 | let g = Graph::read(&s.replace(' ', "\t")); 134 | let bubble = superbubble::find_superbubble( 135 | &g, 136 | Vertex::forward(0), 137 | &superbubble::SbSearchParams::unrestricted(), 138 | ) 139 | .unwrap(); 140 | assert!(g.name(bubble.end_vertex().node_id) == "d"); 141 | assert_eq!(bubble.vertices().count(), 4); 142 | let mut bubble_vertices = bubble.vertices().map(|&v| g.v_str(v)).collect_vec(); 143 | bubble_vertices.sort(); 144 | assert_eq!(bubble_vertices, 
vec!["a+", "b+", "c+", "d+"]); 145 | assert_eq!(bubble.length_range(&g), (150, 200)); 146 | } 147 | 148 | #[test] 149 | fn super_bubble_1() { 150 | let s = " 151 | S a * LN:i:100 152 | S b * LN:i:100 153 | S c * LN:i:100 154 | S d * LN:i:100 155 | L a + b + 50M 156 | L a + c + 50M 157 | L b + c + 50M 158 | L b + d + 50M 159 | L c + d + 50M 160 | "; 161 | let g = Graph::read(&s.replace(' ', "\t")); 162 | let bubble = superbubble::find_superbubble( 163 | &g, 164 | Vertex::forward(0), 165 | &superbubble::SbSearchParams::unrestricted(), 166 | ) 167 | .unwrap(); 168 | assert!(g.name(bubble.end_vertex().node_id) == "d"); 169 | assert_eq!(bubble.vertices().count(), 4); 170 | let mut bubble_vertices = bubble.vertices().map(|&v| g.v_str(v)).collect_vec(); 171 | bubble_vertices.sort(); 172 | assert_eq!(bubble_vertices, vec!["a+", "b+", "c+", "d+"]); 173 | assert_eq!(bubble.length_range(&g), (200, 250)); 174 | } 175 | 176 | #[test] 177 | fn super_bubble_1_reverse() { 178 | let s = " 179 | S a * LN:i:100 180 | S b * LN:i:100 181 | S c * LN:i:100 182 | S d * LN:i:100 183 | L a + b + 50M 184 | L a + c + 50M 185 | L b + c + 50M 186 | L b + d + 50M 187 | L c + d + 50M 188 | "; 189 | let g = Graph::read(&s.replace(' ', "\t")); 190 | let bubble = superbubble::find_superbubble( 191 | &g, 192 | Vertex::reverse(3), 193 | &superbubble::SbSearchParams::unrestricted(), 194 | ) 195 | .unwrap(); 196 | assert!(g.name(bubble.end_vertex().node_id) == "a"); 197 | assert_eq!(bubble.vertices().count(), 4); 198 | let mut bubble_vertices = bubble.vertices().map(|&v| g.v_str(v)).collect_vec(); 199 | bubble_vertices.sort(); 200 | assert_eq!(bubble_vertices, vec!["a-", "b-", "c-", "d-"]); 201 | assert_eq!(bubble.length_range(&g), (200, 250)); 202 | } 203 | 204 | #[test] 205 | fn super_bubble_2() { 206 | let s = " 207 | S a * LN:i:100 208 | S b * LN:i:100 209 | S c * LN:i:100 210 | S d * LN:i:100 211 | S e * LN:i:100 212 | S f * LN:i:100 213 | L a + b + 75M 214 | L a + c + 50M 215 | L b + d + 75M 216 | 
L b + e + 50M 217 | L c + d + 50M 218 | L c + e + 50M 219 | L d + f + 50M 220 | L e + f + 50M 221 | "; 222 | let g = Graph::read(&s.replace(' ', "\t")); 223 | let bubble = superbubble::find_superbubble( 224 | &g, 225 | Vertex::forward(0), 226 | &superbubble::SbSearchParams::unrestricted(), 227 | ) 228 | .unwrap(); 229 | assert!(g.name(bubble.end_vertex().node_id) == "f"); 230 | assert_eq!(bubble.vertices().count(), 6); 231 | let mut bubble_vertices = bubble.vertices().map(|&v| g.v_str(v)).collect_vec(); 232 | bubble_vertices.sort(); 233 | assert_eq!(bubble_vertices, vec!["a+", "b+", "c+", "d+", "e+", "f+"]); 234 | assert_eq!(bubble.length_range(&g), (200, 250)); 235 | } 236 | 237 | #[test] 238 | fn simple_max_chain() { 239 | let s = " 240 | S a * LN:i:100 241 | S b * LN:i:100 242 | S c * LN:i:100 243 | S d * LN:i:100 244 | S e * LN:i:100 245 | S f * LN:i:100 246 | S g * LN:i:100 247 | L a + b + 50M 248 | L a + c + 50M 249 | L b + d + 50M 250 | L c + d + 50M 251 | L d + e + 50M 252 | L d + f + 50M 253 | L e + g + 50M 254 | L f + g + 50M 255 | "; 256 | let g = Graph::read(&s.replace(' ', "\t")); 257 | let chain = superbubble::find_maximal_chain( 258 | &g, 259 | Vertex::forward(g.name2id("d")), 260 | &superbubble::SbSearchParams::unrestricted(), 261 | ); 262 | assert_eq!(chain.len(), 2); 263 | assert_eq!(chain[0].start_vertex(), Vertex::forward(g.name2id("a"))); 264 | assert_eq!(chain[0].end_vertex(), Vertex::forward(g.name2id("d"))); 265 | assert_eq!(chain[1].start_vertex(), Vertex::forward(g.name2id("d"))); 266 | assert_eq!(chain[1].end_vertex(), Vertex::forward(g.name2id("g"))); 267 | assert_eq!(superbubble::length_range(&chain, &g), (300, 300)); 268 | } 269 | 270 | #[test] 271 | fn simple_chain_loop() { 272 | let s = " 273 | S a * LN:i:100 274 | S b * LN:i:100 275 | S c * LN:i:100 276 | S d * LN:i:100 277 | S e * LN:i:100 278 | S f * LN:i:100 279 | L a + b + 50M 280 | L a + c + 50M 281 | L b + d + 50M 282 | L c + d + 50M 283 | L d + e + 50M 284 | L d + f + 50M 285 
| L e + a + 50M 286 | L f + a + 50M 287 | "; 288 | let g = Graph::read(&s.replace(' ', "\t")); 289 | //testing search ahead 290 | let chain = superbubble::find_chain_ahead( 291 | &g, 292 | Vertex::forward(0), 293 | &superbubble::SbSearchParams::unrestricted(), 294 | ); 295 | assert_eq!(chain.len(), 2); 296 | assert!(g.name(chain[0].end_vertex().node_id) == "d"); 297 | assert!(g.name(chain[1].end_vertex().node_id) == "a"); 298 | assert_eq!(superbubble::length_range(&chain, &g), (200, 200)); 299 | 300 | //testing maximal chain search 301 | let chain = superbubble::find_maximal_chain( 302 | &g, 303 | Vertex::forward(g.name2id("d")), 304 | &superbubble::SbSearchParams::unrestricted(), 305 | ); 306 | assert_eq!(chain.len(), 2); 307 | assert_eq!(chain[0].start_vertex(), Vertex::forward(g.name2id("d"))); 308 | assert_eq!(chain[0].end_vertex(), Vertex::forward(g.name2id("a"))); 309 | assert_eq!(chain[1].start_vertex(), Vertex::forward(g.name2id("a"))); 310 | assert_eq!(chain[1].end_vertex(), Vertex::forward(g.name2id("d"))); 311 | assert_eq!(superbubble::length_range(&chain, &g), (200, 200)); 312 | } 313 | -------------------------------------------------------------------------------- /src/graph_algos/scc.rs: -------------------------------------------------------------------------------- 1 | use super::dfs; 2 | use super::only_or_none; 3 | use crate::graph::*; 4 | use itertools::Itertools; 5 | use log::debug; 6 | use std::collections::{HashMap, HashSet}; 7 | 8 | //Implementing Kosaraju-Sharir algorithm 9 | //'trivial' SCCs of individual vertices are not reported 10 | //NB. 
Loop of single vertex is considered 'NON-trivial' 11 | pub fn strongly_connected(graph: &Graph) -> Vec> { 12 | let mut non_trivial_sccs: Vec> = Vec::new(); 13 | let is_loop = |v: Vertex| graph.outgoing_edges(v).iter().any(|l| l.end == v); 14 | 15 | // run DFS on direct edges 16 | let mut dfs = dfs::DFS::new_forward(graph); 17 | dfs.run(); 18 | let mut used: HashSet = HashSet::new(); 19 | // consider vertices in decreasing order of exit times (latest exit times first) 20 | for &v in dfs.exit_order().iter().rev() { 21 | if !used.contains(&v) { 22 | // run DFS on reverse edges 23 | let mut reverse_dfs = dfs::DFS::new_reverse(graph); 24 | reverse_dfs.set_blocked(used); 25 | reverse_dfs.run_from(v); 26 | let visited = reverse_dfs.exit_order(); 27 | assert!(!visited.is_empty()); 28 | if visited.len() > 1 || is_loop(visited[0]) { 29 | debug!( 30 | "Identified non-trivial component of size {}: {}", 31 | visited.len(), 32 | visited.iter().map(|&v| graph.v_str(v)).join(",") 33 | ); 34 | 35 | non_trivial_sccs.push(visited.clone()); 36 | } 37 | used = reverse_dfs.take_blocked(); 38 | } 39 | } 40 | assert!(check_consistency(graph, &non_trivial_sccs)); 41 | non_trivial_sccs 42 | } 43 | 44 | pub fn nodes_in_sccs(_g: &Graph, sccs: &[Vec]) -> HashSet { 45 | HashSet::from_iter(sccs.iter().flat_map(|comp| comp.iter().map(|v| v.node_id))) 46 | } 47 | 48 | fn check_consistency(graph: &Graph, non_trivial_sccs: &[Vec]) -> bool { 49 | let mut vertices_to_scc = HashMap::new(); 50 | for (scc_id, vertices) in non_trivial_sccs.iter().enumerate() { 51 | for v in vertices { 52 | vertices_to_scc.insert(v, scc_id); 53 | } 54 | } 55 | 56 | let mut considered_node_ids: HashSet = HashSet::new(); 57 | for v in graph.all_vertices() { 58 | if considered_node_ids.contains(&v.node_id) { 59 | continue; 60 | } 61 | let sorted = |mut vertices: Vec| { 62 | vertices.sort(); 63 | vertices 64 | }; 65 | if let Some(&scc_id) = vertices_to_scc.get(&v) { 66 | for scc_v in &non_trivial_sccs[scc_id] { 67 | 
considered_node_ids.insert(scc_v.node_id); 68 | } 69 | match vertices_to_scc.get(&v.rc()) { 70 | None => return false, 71 | Some(&rc_scc_id) => { 72 | assert_eq!( 73 | sorted(non_trivial_sccs[scc_id].clone()), 74 | sorted(non_trivial_sccs[rc_scc_id].iter().map(|w| w.rc()).collect()) 75 | ); 76 | } 77 | } 78 | } 79 | } 80 | true 81 | } 82 | 83 | //Building condensation Graph 84 | pub fn condensation( 85 | graph: &Graph, 86 | non_trivial_sccs: &[Vec], 87 | ignore_loops: bool, 88 | ) -> (Graph, HashMap) { 89 | assert!(check_consistency(graph, non_trivial_sccs)); 90 | let mut condensation = Graph::new(); 91 | let mut vertices_to_scc = HashMap::new(); 92 | for (scc_id, vertices) in non_trivial_sccs.iter().enumerate() { 93 | //filtering 'trivial' loops 94 | if vertices.len() == 1 { 95 | continue; 96 | } 97 | for v in vertices { 98 | vertices_to_scc.insert(v, scc_id); 99 | } 100 | } 101 | 102 | let mut old_2_new: HashMap = HashMap::new(); 103 | 104 | let mut update_old_2_new = |old_vertices: &[Vertex], new_node_id: usize| { 105 | //two passes for more consistent processing of self-conjugate scc 106 | for v in old_vertices { 107 | old_2_new.insert(v.rc(), Vertex::reverse(new_node_id)); 108 | } 109 | for v in old_vertices { 110 | old_2_new.insert(*v, Vertex::forward(new_node_id)); 111 | } 112 | }; 113 | 114 | let mut considered_node_ids: HashSet = HashSet::new(); 115 | for (node_id, node) in graph.node_iter().enumerate() { 116 | let v = Vertex::forward(node_id); 117 | if considered_node_ids.contains(&node_id) { 118 | continue; 119 | } 120 | if let Some(&scc_id) = vertices_to_scc.get(&v) { 121 | let scc_vertices = &non_trivial_sccs[scc_id]; 122 | for scc_v in scc_vertices { 123 | considered_node_ids.insert(scc_v.node_id); 124 | } 125 | if scc_vertices.contains(&v.rc()) { 126 | debug!( 127 | "Dealing with self-conjugate SCC {}: {}", 128 | scc_id, 129 | scc_vertices.iter().map(|&w| graph.v_str(w)).join("") 130 | ) 131 | } 132 | let length = scc_vertices 133 | .iter() 134 | 
.map(|w| graph.node(w.node_id).length) 135 | .max() 136 | .unwrap(); 137 | let name = format!( 138 | "scc_{}_vcnt_{}_init_{}", 139 | scc_id, 140 | scc_vertices.len(), 141 | node.name 142 | ); 143 | //let cnd_node; 144 | let cnd_id = condensation.add_node(Node { 145 | name, 146 | length, 147 | coverage: 0., 148 | }); 149 | update_old_2_new(scc_vertices, cnd_id); 150 | } else { 151 | considered_node_ids.insert(v.node_id); 152 | let cnd_id = condensation.add_node(node.clone()); 153 | update_old_2_new(std::slice::from_ref(&v), cnd_id); 154 | } 155 | } 156 | 157 | for l in graph.all_links() { 158 | let &v = old_2_new.get(&l.start).unwrap(); 159 | let &w = old_2_new.get(&l.end).unwrap(); 160 | //checking that no link between nodes exists 161 | if ignore_loops && v == w { 162 | debug!("Loop ignored for vertex {}", condensation.v_str(v)); 163 | continue; 164 | } 165 | if !condensation.outgoing_edges(v).iter().any(|l| l.end == w) { 166 | condensation.add_link(Link { 167 | start: v, 168 | end: w, 169 | overlap: l.overlap, 170 | }); 171 | } 172 | } 173 | 174 | (condensation, old_2_new) 175 | } 176 | 177 | pub struct LocalizedTangle { 178 | pub entrance: Link, 179 | pub exit: Link, 180 | pub vertices: Vec, 181 | } 182 | 183 | //very crude (under-)estimate without multiplicity guessing! 
184 | //subtracts minimal incoming overlap from every vertex and takes sum 185 | pub fn estimate_size_no_mult(tangle: &LocalizedTangle, g: &Graph) -> usize { 186 | let shortest_incoming_overlap = |v: Vertex| { 187 | return g 188 | .incoming_edges(v) 189 | .iter() 190 | .map(|l| l.overlap) 191 | .min() 192 | .unwrap_or(0); 193 | }; 194 | 195 | tangle 196 | .vertices 197 | .iter() 198 | .map(|&v| g.vertex_length(v) - shortest_incoming_overlap(v)) 199 | .sum() 200 | } 201 | 202 | fn find_localized(g: &Graph, non_trivial_scc: &[Vertex]) -> Option { 203 | let component_vertices: HashSet = HashSet::from_iter(non_trivial_scc.iter().copied()); 204 | 205 | //TODO learn if there is a better way or implement my own helper function 206 | let entrance = only_or_none( 207 | component_vertices 208 | .iter() 209 | .flat_map(|&v| g.incoming_edges(v)) 210 | .filter(|l| !component_vertices.contains(&l.start)), 211 | )?; 212 | 213 | let exit = only_or_none( 214 | component_vertices 215 | .iter() 216 | .flat_map(|&v| g.outgoing_edges(v)) 217 | .filter(|l| !component_vertices.contains(&l.end)), 218 | )?; 219 | 220 | //TODO think where this check should be performed 221 | //Also checking that entrance and exit 222 | //are the only ways to go from corresponding vertices 223 | let entrance = only_or_none(g.outgoing_edges(entrance.start).into_iter())?; 224 | let exit = only_or_none(g.incoming_edges(exit.end).into_iter())?; 225 | 226 | //guard against potential tricky strand-switching case 227 | if entrance.start.node_id == exit.end.node_id { 228 | None 229 | } else { 230 | Some(LocalizedTangle { 231 | entrance, 232 | exit, 233 | vertices: non_trivial_scc.to_owned(), 234 | }) 235 | } 236 | } 237 | 238 | pub fn find_small_localized( 239 | g: &Graph, 240 | non_trivial_sccs: &[Vec], 241 | size_limit: usize, 242 | ) -> Vec { 243 | non_trivial_sccs 244 | .iter() 245 | .filter_map(|vs| find_localized(g, vs)) 246 | .filter(|t| estimate_size_no_mult(t, g) <= size_limit) 247 | .collect() 248 | } 249 
| -------------------------------------------------------------------------------- /tests/test_graphs/test3.gfa: -------------------------------------------------------------------------------- 1 | S utig4-64 * LN:i:107247 RC:i:6202973 ll:f:57.838 2 | S utig4-65 * LN:i:112721 RC:i:2976251 ll:f:26.404 3 | S utig4-66 * LN:i:112744 RC:i:2990411 ll:f:26.524 4 | S utig4-67 * LN:i:7607784 RC:i:209427078 ll:f:27.528 5 | S utig4-68 * LN:i:7607382 RC:i:207977456 ll:f:27.339 6 | S utig4-923 * LN:i:102941 RC:i:5587761 ll:f:54.281 7 | S utig4-924 * LN:i:11113 RC:i:283382 ll:f:25.5 8 | S utig4-925 * LN:i:11249 RC:i:279349 ll:f:24.833 9 | S utig4-926 * LN:i:10166448 RC:i:280305238 ll:f:27.572 10 | S utig4-927 * LN:i:10208881 RC:i:276227819 ll:f:27.058 11 | S utig4-1019 * LN:i:66651 RC:i:3746739 ll:f:56.214 12 | S utig4-1020 * LN:i:1026861 RC:i:27395727 ll:f:26.679 13 | S utig4-1021 * LN:i:1026888 RC:i:28932980 ll:f:28.175 14 | S utig4-1022 * LN:i:15082854 RC:i:404553818 ll:f:26.822 15 | S utig4-1023 * LN:i:15026495 RC:i:404042916 ll:f:26.889 16 | S utig4-1024 * LN:i:3335212 RC:i:91299761 ll:f:27.375 17 | S utig4-1025 * LN:i:3337032 RC:i:90081177 ll:f:26.994 18 | S utig4-1026 * LN:i:15584294 RC:i:423306827 ll:f:27.162 19 | S utig4-1027 * LN:i:15562674 RC:i:420526795 ll:f:27.022 20 | S utig4-1249 * LN:i:61139 RC:i:3715723 ll:f:60.775 21 | S utig4-1250 * LN:i:3258 RC:i:30951 ll:f:9.5 22 | S utig4-1251 * LN:i:3259 RC:i:185763 ll:f:57 23 | S utig4-1252 * LN:i:1769940 RC:i:49046099 ll:f:27.711 24 | S utig4-1253 * LN:i:1774754 RC:i:46572740 ll:f:26.242 25 | S utig4-1254 * LN:i:29618744 RC:i:853526308 ll:f:28.817 26 | S utig4-1255 * LN:i:100761 RC:i:0 ll:f:0 27 | S utig4-1256 * LN:i:100761 RC:i:0 ll:f:0 28 | S utig4-1257 * LN:i:30761 RC:i:553698 ll:f:18 29 | S utig4-1387 * LN:i:570020 RC:i:31776677 ll:f:55.747 30 | S utig4-1388 * LN:i:2975 RC:i:69912 ll:f:23.5 31 | S utig4-1389 * LN:i:3044 RC:i:68490 ll:f:22.5 32 | S utig4-1392 * LN:i:629947 RC:i:36131051 ll:f:57.356 33 | S utig4-1393 * 
LN:i:3255 RC:i:130200 ll:f:40 34 | S utig4-1394 * LN:i:3408 RC:i:64752 ll:f:19 35 | S utig4-1402 * LN:i:366389 RC:i:19442103 ll:f:53.064 36 | S utig4-1403 * LN:i:25200 RC:i:957600 ll:f:38 37 | S utig4-1404 * LN:i:100155 RC:i:0 ll:f:0 38 | S utig4-1405 * LN:i:3136 RC:i:72128 ll:f:23 39 | S utig4-1406 * LN:i:3210 RC:i:78645 ll:f:24.5 40 | S utig4-1407 * LN:i:100155 RC:i:0 ll:f:0 41 | S utig4-1408 * LN:i:404567 RC:i:10792432 ll:f:26.677 42 | S utig4-1409 * LN:i:6284850 RC:i:167503194 ll:f:26.652 43 | S utig4-1410 * LN:i:6282660 RC:i:167799168 ll:f:26.708 44 | S utig4-1450 * LN:i:621926 RC:i:35499412 ll:f:57.08 45 | S utig4-1451 * LN:i:5172 RC:i:159039 ll:f:30.75 46 | S utig4-1452 * LN:i:5156 RC:i:150813 ll:f:29.25 47 | S utig4-1476 * LN:i:83409 RC:i:4964329 ll:f:59.518 48 | S utig4-1477 * LN:i:4977133 RC:i:133604665 ll:f:26.844 49 | S utig4-1478 * LN:i:4986768 RC:i:135138421 ll:f:27.099 50 | S utig4-1529 * LN:i:82467 RC:i:4263387 ll:f:51.698 51 | S utig4-1530 * LN:i:13269279 RC:i:360521003 ll:f:27.17 52 | S utig4-1531 * LN:i:13275259 RC:i:356776568 ll:f:26.875 53 | S utig4-1532 * LN:i:3113425 RC:i:80028410 ll:f:25.704 54 | S utig4-1533 * LN:i:3112840 RC:i:81839365 ll:f:26.291 55 | S utig4-1534 * LN:i:24467698 RC:i:600244014 ll:f:24.532 56 | S utig4-1535 * LN:i:24417646 RC:i:604847067 ll:f:24.771 57 | S utig4-1595 * LN:i:103428 RC:i:5433104 ll:f:52.53 58 | S utig4-1596 * LN:i:6641258 RC:i:184276978 ll:f:27.747 59 | S utig4-1597 * LN:i:6642510 RC:i:180887504 ll:f:27.232 60 | S utig4-1617 * LN:i:90552 RC:i:5317585 ll:f:58.724 61 | S utig4-1618 * LN:i:3081 RC:i:132483 ll:f:43 62 | S utig4-1619 * LN:i:3211 RC:i:102752 ll:f:32 63 | S utig4-1795 * LN:i:114837 RC:i:6293366 ll:f:54.803 64 | S utig4-1892 * LN:i:69694 RC:i:3834717 ll:f:55.022 65 | S utig4-1896 * LN:i:92720 RC:i:4866252 ll:f:52.483 66 | S utig4-3384 * LN:i:2733755 RC:i:59520407 ll:f:21.772 67 | S utig4-3445 * LN:i:3119960 RC:i:77504174 ll:f:24.841 68 | S utig4-3446 * LN:i:13146089 RC:i:341165987 ll:f:25.952 69 | 
S utig4-3447 * LN:i:13087612 RC:i:340459830 ll:f:26.014 70 | S utig4-3448 * LN:i:328947 RC:i:9405582 ll:f:28.593 71 | S utig4-3455 * LN:i:1110218 RC:i:29715540 ll:f:26.766 72 | S utig4-3456 * LN:i:3105862 RC:i:77899367 ll:f:25.081 73 | S utig4-3587 * LN:i:2768349 RC:i:59384962 ll:f:21.451 74 | S utig4-3588 * LN:i:3329641 RC:i:77281634 ll:f:23.21 75 | S utig4-3589 * LN:i:3322089 RC:i:76954198 ll:f:23.164 76 | S utig4-3590 * LN:i:15898 RC:i:0 ll:f:0 77 | S utig4-3591 * LN:i:127086 RC:i:2456598 ll:f:19.33 78 | S utig4-3592 * LN:i:43610 RC:i:1090250 ll:f:25 79 | S utig4-3593 * LN:i:43610 RC:i:1061175 ll:f:24.333 80 | S utig4-3626 * LN:i:46567 RC:i:1105114 ll:f:23.732 81 | S utig4-3627 * LN:i:35478281 RC:i:1062485820 ll:f:29.948 82 | S utig4-3628 * LN:i:37817 RC:i:40782 ll:f:1.078 83 | S utig4-3629 * LN:i:37817 RC:i:51164 ll:f:1.353 84 | S utig4-3630 * LN:i:37815 RC:i:750367 ll:f:19.843 85 | S utig4-3631 * LN:i:37815 RC:i:726642 ll:f:19.216 86 | S utig4-3650 * LN:i:51428 RC:i:2419513 ll:f:47.047 87 | S utig4-4041 * LN:i:126907 RC:i:2430751 ll:f:19.154 88 | S utig4-4093 * LN:i:1245196 RC:i:24630350 ll:f:19.78 89 | S utig4-4211 * LN:i:100761 RC:i:0 ll:f:0 90 | L utig4-64 + utig4-67 + 1637M 91 | L utig4-64 + utig4-68 + 1508M 92 | L utig4-64 - utig4-65 + 1523M 93 | L utig4-64 - utig4-66 + 1523M 94 | L utig4-65 + utig4-1617 - 1590M 95 | L utig4-66 + utig4-1617 - 1614M 96 | L utig4-67 + utig4-3650 + 1646M 97 | L utig4-68 + utig4-3650 + 1646M 98 | L utig4-923 + utig4-926 + 1462M 99 | L utig4-923 + utig4-927 + 1462M 100 | L utig4-923 - utig4-925 + 1675M 101 | L utig4-923 - utig4-924 + 1619M 102 | L utig4-924 + utig4-1892 + 1531M 103 | L utig4-925 + utig4-1892 + 1612M 104 | L utig4-926 + utig4-1595 + 1623M 105 | L utig4-927 + utig4-1595 + 1587M 106 | L utig4-1019 + utig4-1022 + 1490M 107 | L utig4-1019 + utig4-1023 + 1490M 108 | L utig4-1019 - utig4-1020 + 1493M 109 | L utig4-1019 - utig4-1021 + 1493M 110 | L utig4-1020 + utig4-1387 + 1548M 111 | L utig4-1021 + utig4-1387 + 
1548M 112 | L utig4-1022 + utig4-1024 + 15328M 113 | L utig4-1022 + utig4-1025 + 15328M 114 | L utig4-1023 + utig4-1025 + 15328M 115 | L utig4-1023 + utig4-1024 + 15328M 116 | L utig4-1024 + utig4-1026 + 14686M 117 | L utig4-1024 + utig4-1027 + 14686M 118 | L utig4-1025 + utig4-1027 + 14686M 119 | L utig4-1026 + utig4-3629 - 19417M 120 | L utig4-1026 + utig4-3630 - 19417M 121 | L utig4-1027 + utig4-3631 - 19417M 122 | L utig4-1027 + utig4-3628 - 19417M 123 | L utig4-1249 + utig4-1252 + 1517M 124 | L utig4-1249 + utig4-1253 + 1517M 125 | L utig4-1249 - utig4-1250 + 1537M 126 | L utig4-1249 - utig4-1251 + 1537M 127 | L utig4-1250 + utig4-1476 - 1504M 128 | L utig4-1251 + utig4-1476 - 1504M 129 | L utig4-1252 + utig4-1255 + 17702M 130 | L utig4-1252 + utig4-1254 + 16702M 131 | L utig4-1253 + utig4-1256 + 17702M 132 | L utig4-1253 + utig4-1257 + 16702M 133 | L utig4-1254 + utig4-3626 + 10334M 134 | L utig4-1255 + utig4-4211 - 99761M 135 | L utig4-1256 + utig4-4211 - 99761M 136 | L utig4-1257 + utig4-3627 - 14026M 137 | L utig4-1387 + utig4-1389 + 1484M 138 | L utig4-1387 + utig4-1388 + 1414M 139 | L utig4-1388 + utig4-1392 + 1434M 140 | L utig4-1389 + utig4-1392 + 1434M 141 | L utig4-1392 + utig4-1394 + 1576M 142 | L utig4-1392 + utig4-1393 + 1576M 143 | L utig4-1393 + utig4-1450 + 1507M 144 | L utig4-1394 + utig4-1450 + 1659M 145 | L utig4-1402 + utig4-1406 + 1592M 146 | L utig4-1402 + utig4-1405 + 1490M 147 | L utig4-1402 - utig4-1404 + 76456M 148 | L utig4-1402 - utig4-1403 + 1501M 149 | L utig4-1403 + utig4-3448 - 23580M 150 | L utig4-1404 + utig4-1407 + 99155M 151 | L utig4-1404 + utig4-1408 + 99155M 152 | L utig4-1405 + utig4-1795 - 1597M 153 | L utig4-1406 + utig4-1795 - 1569M 154 | L utig4-1407 + utig4-3448 - 24580M 155 | L utig4-1408 + utig4-1410 + 14422M 156 | L utig4-1408 + utig4-1409 + 14422M 157 | L utig4-1409 + utig4-3446 - 12741M 158 | L utig4-1409 - utig4-3448 + 14422M 159 | L utig4-1410 + utig4-3447 - 12741M 160 | L utig4-1410 + utig4-3446 - 12741M 161 
| L utig4-1450 + utig4-1452 + 1467M 162 | L utig4-1450 + utig4-1451 + 1467M 163 | L utig4-1451 + utig4-1795 + 1704M 164 | L utig4-1452 + utig4-1795 + 1688M 165 | L utig4-1476 - utig4-1478 + 1719M 166 | L utig4-1476 - utig4-1477 + 1580M 167 | L utig4-1477 + utig4-3650 - 1783M 168 | L utig4-1478 + utig4-3650 - 1783M 169 | L utig4-1529 + utig4-1533 + 1648M 170 | L utig4-1529 + utig4-1532 + 1551M 171 | L utig4-1529 - utig4-1530 + 1579M 172 | L utig4-1529 - utig4-1531 + 1478M 173 | L utig4-1530 + utig4-1892 - 1453M 174 | L utig4-1531 + utig4-1892 - 1453M 175 | L utig4-1532 + utig4-1534 + 17399M 176 | L utig4-1533 + utig4-1534 + 17399M 177 | L utig4-1533 + utig4-1535 + 17399M 178 | L utig4-1534 + utig4-3593 - 11519M 179 | L utig4-1534 + utig4-3592 - 11519M 180 | L utig4-1535 + utig4-3592 - 11519M 181 | L utig4-1595 + utig4-1597 + 1607M 182 | L utig4-1595 + utig4-1596 + 1550M 183 | L utig4-1596 + utig4-1896 + 1468M 184 | L utig4-1597 + utig4-1896 + 1467M 185 | L utig4-1617 - utig4-1619 + 1667M 186 | L utig4-1617 - utig4-1618 + 1535M 187 | L utig4-1618 + utig4-1896 - 1501M 188 | L utig4-1619 + utig4-1896 - 1501M 189 | L utig4-3384 + utig4-4093 + 1520M 190 | L utig4-3445 + utig4-3455 + 1548M 191 | L utig4-3445 - utig4-3447 + 1591M 192 | L utig4-3445 - utig4-3446 + 1591M 193 | L utig4-3455 - utig4-3456 + 1658M 194 | L utig4-3587 + utig4-4093 + 1520M 195 | L utig4-3587 - utig4-3588 + 1585M 196 | L utig4-3587 - utig4-3589 + 1585M 197 | L utig4-3588 + utig4-4041 - 14898M 198 | L utig4-3589 + utig4-3590 + 8934M 199 | L utig4-3589 + utig4-3591 + 7934M 200 | L utig4-3590 + utig4-4041 - 14898M 201 | L utig4-3591 + utig4-3593 + 18019M 202 | L utig4-3591 + utig4-3592 + 18019M 203 | L utig4-3592 - utig4-4041 + 18019M 204 | L utig4-3626 + utig4-3630 + 15084M 205 | L utig4-3626 + utig4-3628 + 15084M 206 | L utig4-3626 + utig4-3631 + 15084M 207 | L utig4-3626 + utig4-3629 + 15084M 208 | L utig4-3626 - utig4-3627 + 11334M 209 | L utig4-3627 + utig4-4211 + 84026M 210 | 
-------------------------------------------------------------------------------- /src/graph_algos/dfs.rs: -------------------------------------------------------------------------------- 1 | use crate::graph::*; 2 | use itertools::Itertools; 3 | use std::collections::HashSet; 4 | 5 | #[derive(Copy, Clone, Debug)] 6 | pub enum TraversalDirection { 7 | FORWARD, 8 | REVERSE, 9 | //TODO add 10 | //UNDIRECTED, 11 | } 12 | 13 | //TODO replace sets with arrays 14 | 15 | //TODO pass functions giving neighbour iterators to allow more flexibility (directions, subgraphs, length boundaries, etc) 16 | //TODO use within trio_walk 17 | pub struct DFS<'a> { 18 | g: &'a Graph, 19 | direction: TraversalDirection, 20 | visit_f: Option<&'a dyn Fn(Vertex) -> bool>, 21 | blocked: HashSet, 22 | boundary: HashSet, 23 | tout: Vec, 24 | node_len_thr: usize, 25 | } 26 | 27 | impl<'a> DFS<'a> { 28 | pub fn new( 29 | g: &'a Graph, 30 | direction: TraversalDirection, 31 | visit_f: Option<&'a dyn Fn(Vertex) -> bool>, 32 | ) -> DFS<'a> { 33 | DFS { 34 | g, 35 | direction, 36 | visit_f, 37 | blocked: HashSet::new(), 38 | boundary: HashSet::new(), 39 | tout: Vec::new(), 40 | node_len_thr: usize::MAX, 41 | } 42 | } 43 | 44 | pub fn new_forward(g: &'a Graph) -> DFS<'a> { 45 | Self::new(g, TraversalDirection::FORWARD, None) 46 | } 47 | 48 | pub fn new_reverse(g: &'a Graph) -> DFS<'a> { 49 | Self::new(g, TraversalDirection::REVERSE, None) 50 | } 51 | 52 | //TODO make consume self and return new DFS 53 | pub fn set_blocked(&mut self, blocked: HashSet) { 54 | self.blocked = blocked; 55 | } 56 | 57 | //TODO make consume self and return new DFS 58 | pub fn set_max_node_len(&mut self, max_node_len: usize) { 59 | self.node_len_thr = max_node_len; 60 | } 61 | 62 | //TODO make consume self and return new DFS 63 | pub fn extend_blocked(&mut self, iter: impl IntoIterator) { 64 | self.blocked.extend(iter); 65 | } 66 | 67 | //TODO use iterators 68 | fn neighbors(&self, v: Vertex) -> Vec { 69 | match self.direction 
{ 70 | TraversalDirection::FORWARD => self.g.outgoing_edges(v).iter().map(|l| l.end).collect(), 71 | TraversalDirection::REVERSE => { 72 | self.g.incoming_edges(v).iter().map(|l| l.start).collect() 73 | } 74 | } 75 | } 76 | 77 | pub fn run_from(&mut self, v: Vertex) { 78 | assert!(!self.blocked.contains(&v)); 79 | self.blocked.insert(v); 80 | 81 | for w in self.neighbors(v) { 82 | if !self.blocked.contains(&w) 83 | && (self.visit_f.is_none() || self.visit_f.unwrap()(w)) 84 | && self.g.vertex_length(w) < self.node_len_thr 85 | { 86 | self.run_from(w); 87 | } else { 88 | self.boundary.insert(w); 89 | } 90 | } 91 | 92 | self.tout.push(v); 93 | } 94 | 95 | //TODO maybe rename into topsort? 96 | //will run from long nodes, but not from blocked 97 | pub fn run(&mut self) { 98 | assert!(self.node_len_thr == usize::MAX); 99 | for v in self.g.all_vertices() { 100 | if !self.blocked.contains(&v) { 101 | self.run_from(v); 102 | } 103 | } 104 | } 105 | 106 | //includes visited and initially blocked 107 | pub fn take_blocked(self) -> HashSet { 108 | self.blocked 109 | } 110 | 111 | pub fn visited(&self) -> HashSet { 112 | self.tout.iter().copied().collect() 113 | } 114 | 115 | //pub fn blocked(&self) -> &HashSet { 116 | // &self.blocked 117 | //} 118 | 119 | pub fn exit_order(&self) -> &Vec { 120 | &self.tout 121 | } 122 | 123 | //todo return iterator 124 | //nodes that were reached, but not visited 125 | pub fn boundary(&self) -> &HashSet { 126 | &self.boundary 127 | //let mut boundary = HashSet::new(); 128 | //let visited = self.visited(); 129 | 130 | //for &v in &visited { 131 | // for w in self.neighbors(v) { 132 | // if !visited.contains(&w) { 133 | // boundary.insert(w); 134 | // } 135 | // } 136 | //} 137 | //boundary 138 | } 139 | 140 | //TODO return iterator? 
141 | //return nodes that didn't have any neighbors 142 | pub fn dead_ends(&self) -> Vec { 143 | self.tout 144 | .iter() 145 | .filter(|&v| self.neighbors(*v).is_empty()) 146 | .copied() 147 | .collect() 148 | } 149 | } 150 | 151 | pub struct ShortNodeComponent { 152 | pub sources: HashSet, 153 | pub sinks: HashSet, 154 | pub has_deadends: bool, 155 | pub inner: HashSet, 156 | } 157 | 158 | impl ShortNodeComponent { 159 | fn consider(&mut self, g: &Graph, v: Vertex, l: Link, length_threshold: usize) { 160 | let mut is_source = false; 161 | let mut is_sink = false; 162 | 163 | if g.vertex_length(v) < length_threshold { 164 | if !self.inner.insert(v) { 165 | //inner already considered 166 | return; 167 | } 168 | } else { 169 | //v is long 170 | if v == l.start { 171 | //if v is long and we came from the 'right' 172 | is_source = true; 173 | if !self.sources.insert(v) { 174 | //source already considered 175 | return; 176 | } 177 | } else { 178 | assert!(v == l.end); 179 | //if v is long and we came from the 'left' 180 | is_sink = true; 181 | if !self.sinks.insert(v) { 182 | //sink already considered 183 | return; 184 | } 185 | } 186 | } 187 | 188 | //if not a source consider it's incoming edges 189 | if !is_source { 190 | if g.incoming_edge_cnt(v) == 0 { 191 | assert!(g.vertex_length(v) < length_threshold); 192 | self.has_deadends = true; 193 | } 194 | for i_l in g.incoming_edges(v) { 195 | if i_l != l { 196 | self.consider(g, i_l.start, i_l, length_threshold); 197 | } 198 | } 199 | } 200 | 201 | //if not a sink consider outgoing edges 202 | if !is_sink { 203 | if g.outgoing_edge_cnt(v) == 0 { 204 | assert!(g.vertex_length(v) < length_threshold); 205 | self.has_deadends = true; 206 | } 207 | for o_l in g.outgoing_edges(v) { 208 | if o_l != l { 209 | self.consider(g, o_l.end, o_l, length_threshold); 210 | } 211 | } 212 | } 213 | } 214 | 215 | //returns true if all nodes are distinct within sources/sinks union 216 | pub fn simple_boundary(&self) -> bool { 217 | let mut 
used = HashSet::new(); 218 | for v in self.sinks.iter().chain(self.sources.iter()) { 219 | if used.contains(&v.node_id) { 220 | return false; 221 | } 222 | used.insert(v.node_id); 223 | } 224 | true 225 | } 226 | 227 | pub fn ahead_from_long(g: &Graph, v: Vertex, length_threshold: usize) -> ShortNodeComponent { 228 | assert!(g.vertex_length(v) >= length_threshold); 229 | let mut component = ShortNodeComponent { 230 | sources: std::iter::once(v).collect(), 231 | sinks: HashSet::new(), 232 | has_deadends: false, 233 | inner: HashSet::new(), 234 | }; 235 | 236 | for o_l in g.outgoing_edges(v) { 237 | component.consider(g, o_l.end, o_l, length_threshold); 238 | } 239 | component 240 | } 241 | 242 | pub fn back_from_long(g: &Graph, v: Vertex, length_threshold: usize) -> ShortNodeComponent { 243 | assert!(g.vertex_length(v) >= length_threshold); 244 | let mut component = ShortNodeComponent { 245 | sources: HashSet::new(), 246 | sinks: std::iter::once(v).collect(), 247 | has_deadends: false, 248 | inner: HashSet::new(), 249 | }; 250 | 251 | for i_l in g.incoming_edges(v) { 252 | component.consider(g, i_l.start, i_l, length_threshold); 253 | } 254 | component 255 | } 256 | 257 | //todo refactor and simplify logic! 
258 | //if v is long searching ahead from it, otherwise search in both directions 259 | pub fn search_from(g: &Graph, v: Vertex, length_threshold: usize) -> ShortNodeComponent { 260 | if g.vertex_length(v) >= length_threshold { 261 | Self::ahead_from_long(g, v, length_threshold) 262 | } else { 263 | let mut component = ShortNodeComponent { 264 | sources: HashSet::new(), 265 | sinks: HashSet::new(), 266 | has_deadends: (g.outgoing_edge_cnt(v) == 0 || g.incoming_edge_cnt(v) == 0), 267 | inner: std::iter::once(v).collect(), 268 | }; 269 | for i_l in g.incoming_edges(v) { 270 | component.consider(g, i_l.start, i_l, length_threshold); 271 | } 272 | for o_l in g.outgoing_edges(v) { 273 | component.consider(g, o_l.end, o_l, length_threshold); 274 | } 275 | component 276 | } 277 | } 278 | 279 | pub fn all_nodes(&self) -> impl Iterator { 280 | self.inner 281 | .iter() 282 | .chain(self.sources.iter()) 283 | .chain(self.sinks.iter()) 284 | } 285 | 286 | pub fn print(&self, g: &Graph) -> String { 287 | format!( 288 | "Sources: {}; sinks: {}", 289 | self.sources.iter().map(|&v| g.v_str(v)).join(", "), 290 | self.sinks.iter().map(|&v| g.v_str(v)).join(", ") 291 | ) 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /tests/trio_walk_test.rs: -------------------------------------------------------------------------------- 1 | extern crate log; 2 | use itertools::Itertools; 3 | 4 | use rukki::trio::*; 5 | use rukki::trio_walk::{HaploSearchSettings, HaploSearcher}; 6 | use rukki::*; 7 | use std::fs; 8 | 9 | //fn from_assignment_iterator<'a>(g: &'a Graph, node_assign_it: impl Iterator) 10 | //-> AssignmentStorage<'a> { 11 | // let mut storage = AssignmentStorage::new(g); 12 | // for (node_id, group) in node_assign_it { 13 | // storage.update_group(node_id, group); 14 | // } 15 | // storage 16 | //} 17 | 18 | //fn from_parental_groups<'a>(g: &'a Graph, maternal: &[usize], paternal: &[usize]) 19 | //-> AssignmentStorage<'a> { 20 | // 
from_assignment_iterator(g, maternal.iter() 21 | // .map(|n| (*n, TrioGroup::MATERNAL)) 22 | // .chain(paternal.iter() 23 | // .map(|n| (*n, TrioGroup::PATERNAL)))) 24 | //} 25 | 26 | fn build_searcher<'a>( 27 | settings: HaploSearchSettings, 28 | g: &'a Graph, 29 | assignments: &'a AssignmentStorage, 30 | ) -> HaploSearcher<'a> { 31 | HaploSearcher::new(g, assignments, settings, None) 32 | } 33 | 34 | fn init() { 35 | let _ = env_logger::builder().is_test(true).try_init(); 36 | } 37 | 38 | #[test] 39 | fn haplo_paths() { 40 | init(); 41 | 42 | let graph_fn = "tests/test_graphs/test1.gfa"; 43 | let assignments_fn = "tests/test_graphs/test1.ann.csv"; 44 | let g = graph::Graph::read(&fs::read_to_string(graph_fn).unwrap()); 45 | let assignments = trio::parse_node_assignments(&g, assignments_fn).unwrap(); 46 | 47 | let settings = trio_walk::HaploSearchSettings::default(); 48 | let augment_assign = augment_by_path_search(&g, assignments, settings); 49 | 50 | let mut haplo_searcher = build_searcher(settings, &g, &augment_assign); 51 | let mut answer = haplo_searcher 52 | .find_all() 53 | .into_iter() 54 | .map(|(p, _, group)| (group, p.print(&g))) 55 | .collect_vec(); 56 | answer.sort(); 57 | assert_eq!(&answer, &[ 58 | (TrioGroup::MATERNAL, 59 | String::from("utig4-1829-,utig4-1826-,utig4-1828+,utig4-1832+,utig4-1245-,utig4-1240-,utig4-1237-,utig4-1239+,utig4-1552+,utig4-1554+,utig4-4105+,utig4-2593-,utig4-2589-,utig4-2590+")), 60 | (TrioGroup::PATERNAL, 61 | String::from("utig4-1830-,utig4-1826-,utig4-1827+,utig4-1831+,utig4-1243-,utig4-1241-,utig4-1237-,utig4-1238+,utig4-1552+,utig4-1553+,utig4-4096-,utig4-4097+,utig4-2592-,utig4-2589-,utig4-2591+"))]) 62 | } 63 | 64 | #[test] 65 | fn augment_by_search() { 66 | init(); 67 | 68 | let graph_fn = "tests/test_graphs/sparse_markers.gfa"; 69 | let assignments_fn = "tests/test_graphs/sparse_markers.ann.csv"; 70 | let g = graph::Graph::read(&fs::read_to_string(graph_fn).unwrap()); 71 | let assignments = 
trio::parse_node_assignments(&g, assignments_fn).unwrap(); 72 | 73 | let settings = trio_walk::HaploSearchSettings::default(); 74 | assert_eq!(assignments.assigned().count(), 14); 75 | 76 | let augment_assign = augment_by_path_search(&g, assignments, settings); 77 | 78 | assert_eq!(augment_assign.assigned().count(), 17); 79 | assert_eq!( 80 | augment_assign.group(g.name2id("utig4-1421")), 81 | Some(TrioGroup::PATERNAL) 82 | ); 83 | assert_eq!( 84 | augment_assign.group(g.name2id("utig4-793")), 85 | Some(TrioGroup::PATERNAL) 86 | ); 87 | assert_eq!( 88 | augment_assign.group(g.name2id("utig4-1436")), 89 | Some(TrioGroup::PATERNAL) 90 | ); 91 | 92 | let mut haplo_searcher = build_searcher(settings, &g, &augment_assign); 93 | 94 | let mut answer = haplo_searcher 95 | .find_all() 96 | .into_iter() 97 | .map(|(p, _, group)| (group, p.print(&g))) 98 | .collect_vec(); 99 | answer.sort(); 100 | assert_eq!(&answer, &[ 101 | (TrioGroup::MATERNAL, 102 | String::from("utig4-1424-,utig4-1422-,utig4-1420-,utig4-1418+,utig4-792-,utig4-791+,utig4-795+,utig4-1435+,utig4-1437+,utig4-1439+")), 103 | (TrioGroup::PATERNAL, 104 | String::from("utig4-1423-,utig4-1421-,utig4-1419-,utig4-1418+,utig4-793-,utig4-791+,utig4-794+,utig4-1435+,utig4-1436+,utig4-1438+"))]) 105 | } 106 | 107 | #[test] 108 | fn bubble_filling() { 109 | init(); 110 | 111 | let graph_fn = "tests/test_graphs/path_closing.gfa"; 112 | let assignments_fn = "tests/test_graphs/path_closing.ann.csv"; 113 | let g = graph::Graph::read(&fs::read_to_string(graph_fn).unwrap()); 114 | let assignments = trio::parse_node_assignments(&g, assignments_fn).unwrap(); 115 | 116 | let settings = trio_walk::HaploSearchSettings { 117 | fill_bubbles: true, 118 | ..trio_walk::HaploSearchSettings::default() 119 | }; 120 | assert_eq!(assignments.assigned().count(), 26); 121 | 122 | let augment_assign = augment_by_path_search(&g, assignments, settings); 123 | 124 | assert_eq!(augment_assign.assigned().count(), 28); 125 | assert_eq!( 126 | 
augment_assign.group(g.name2id("utig4-1397")), 127 | Some(TrioGroup::MATERNAL) 128 | ); 129 | assert_eq!( 130 | augment_assign.group(g.name2id("utig4-1347")), 131 | Some(TrioGroup::MATERNAL) 132 | ); 133 | 134 | let mut haplo_searcher = build_searcher(settings, &g, &augment_assign); 135 | 136 | let mut answer = haplo_searcher 137 | .find_all() 138 | .into_iter() 139 | .map(|(p, _, group)| (group, p.print(&g))) 140 | .collect_vec(); 141 | answer.sort(); 142 | assert_eq!(&answer, &[ 143 | (TrioGroup::MATERNAL, 144 | String::from("utig4-1575-,utig4-1574+,utig4-1397-,utig4-1395-,utig4-1347-,utig4-1343-,utig4-1345+,utig4-1568-,utig4-815-,utig4-814+,utig4-819+,utig4-1799-,utig4-1796-,utig4-1798+")), 145 | (TrioGroup::MATERNAL, 146 | String::from("utig4-3444+,utig4-4080-,utig4-771-,utig4-768-,utig4-770+")), 147 | (TrioGroup::PATERNAL, 148 | String::from("utig4-1576-,utig4-1574+,utig4-1396-,utig4-1395-,utig4-1346-,utig4-1343-,utig4-1344+,utig4-1568-,utig4-815-,utig4-814+,utig4-818+,utig4-1796-,utig4-1797+")), 149 | (TrioGroup::PATERNAL, 150 | String::from("utig4-3412+,utig4-774-,utig4-772-,utig4-768-,utig4-769+"))]); 151 | } 152 | 153 | #[test] 154 | fn haplo_paths_2() { 155 | init(); 156 | 157 | let graph_fn = "tests/test_graphs/test2.gfa"; 158 | let assignments_fn = "tests/test_graphs/test2.ann.csv"; 159 | let g = graph::Graph::read(&fs::read_to_string(graph_fn).unwrap()); 160 | let assignments = trio::parse_node_assignments(&g, assignments_fn).unwrap(); 161 | 162 | let settings = trio_walk::HaploSearchSettings::default(); 163 | assert_eq!(assignments.assigned().count(), 42); 164 | 165 | let augment_assign = augment_by_path_search(&g, assignments, settings); 166 | 167 | assert_eq!( 168 | augment_assign.group(g.name2id("utig4-414")), 169 | Some(TrioGroup::MATERNAL) 170 | ); 171 | assert_eq!( 172 | augment_assign.group(g.name2id("utig4-308")), 173 | Some(TrioGroup::MATERNAL) 174 | ); 175 | assert_eq!( 176 | augment_assign.group(g.name2id("utig4-415")), 177 | 
Some(TrioGroup::PATERNAL) 178 | ); 179 | assert_eq!(augment_assign.assigned().count(), 45); 180 | 181 | let mut haplo_searcher = build_searcher(settings, &g, &augment_assign); 182 | 183 | let mut answer = haplo_searcher 184 | .find_all() 185 | .into_iter() 186 | .map(|(p, _, group)| (group, p.print(&g))) 187 | .collect_vec(); 188 | answer.sort(); 189 | assert_eq!(&answer, &[ 190 | (TrioGroup::MATERNAL, 191 | String::from("utig4-3444+,utig4-4080-,utig4-771-,utig4-768-,utig4-770+,utig4-1384-,utig4-1385+,utig4-1898-,utig4-1897-,utig4-414-,utig4-412+,utig4-416+,utig4-419+,utig4-4073-,utig4-1460-,utig4-1459+,utig4-1463+,utig4-4227+,utig4-311-,utig4-307-,utig4-308+,utig4-3431-,utig4-3430+")), 192 | (TrioGroup::PATERNAL, 193 | String::from("utig4-3412+,utig4-774-,utig4-772-,utig4-768-,utig4-769+,utig4-1384-,utig4-1386+,utig4-1899-,utig4-1897-,utig4-413-,utig4-412+,utig4-415+,utig4-422-,utig4-418+,utig4-421+,utig4-1461-,utig4-1459+,utig4-1462+,utig4-4227+,utig4-312-,utig4-307-,utig4-309+,utig4-3429+"))]); 194 | } 195 | 196 | #[test] 197 | fn haplo_paths_3() { 198 | init(); 199 | 200 | let graph_fn = "tests/test_graphs/test3.gfa"; 201 | let assignments_fn = "tests/test_graphs/test3.ann.csv"; 202 | let g = graph::Graph::read(&fs::read_to_string(graph_fn).unwrap()); 203 | let assignments = trio::parse_node_assignments(&g, assignments_fn).unwrap(); 204 | 205 | let settings = trio_walk::HaploSearchSettings::default(); 206 | assert_eq!(assignments.assigned().count(), 76); 207 | 208 | let augment_assign = augment_by_path_search(&g, assignments, settings); 209 | 210 | assert_eq!( 211 | augment_assign.group(g.name2id("utig4-1404")), 212 | Some(TrioGroup::PATERNAL) 213 | ); 214 | assert_eq!( 215 | augment_assign.group(g.name2id("utig4-1403")), 216 | Some(TrioGroup::MATERNAL) 217 | ); 218 | 219 | assert_eq!(augment_assign.assigned().count(), 82); 220 | 221 | let mut haplo_searcher = build_searcher(settings, &g, &augment_assign); 222 | 223 | let mut answer = haplo_searcher 224 | 
.find_all() 225 | .into_iter() 226 | .map(|(p, _, group)| (group, p.print(&g))) 227 | .collect_vec(); 228 | answer.sort(); 229 | assert_eq!(&answer, &[ 230 | (TrioGroup::MATERNAL, 231 | String::from("utig4-4093-,utig4-3587-,utig4-3588+,utig4-4041-,utig4-3592+,utig4-1535-,utig4-1533-,utig4-1529-,utig4-1531+,utig4-1892-,utig4-925-,utig4-923+,utig4-926+,utig4-1595+,utig4-1597+,utig4-1896+,utig4-1619-,utig4-1617+,utig4-65-,utig4-64+,utig4-67+,[N5000N:ambig_path],utig4-1477-,utig4-1476+,utig4-1251-,utig4-1249+,utig4-1252+,utig4-1254+,utig4-3626+,utig4-3631+,utig4-1027-,utig4-1025-,utig4-1022-,utig4-1019-,utig4-1020+,utig4-1387+,utig4-1389+,utig4-1392+,utig4-1393+,utig4-1450+,utig4-1451+,utig4-1795+,utig4-1406-,utig4-1402-,utig4-1403+,utig4-3448-,utig4-1409+,utig4-3446-,[N14098N:alt-utig4-3445],utig4-3456-")), 232 | (TrioGroup::PATERNAL, 233 | String::from("utig4-3455-,utig4-3445-,utig4-3447+,utig4-1410-,utig4-1408-,utig4-1404-,utig4-1402+,utig4-1405+,utig4-1795-,utig4-1452-,utig4-1450-,utig4-1394-,utig4-1392-,utig4-1388-,utig4-1387-,utig4-1021-,utig4-1019+,utig4-1023+,utig4-1024+,utig4-1026+,utig4-3630-,utig4-3626-,utig4-3627+,utig4-1257-,utig4-1253-,utig4-1249-,utig4-1251+,utig4-1476-,utig4-1478+,utig4-3650-,utig4-68-,utig4-64-,utig4-66+,utig4-1617-,utig4-1618+,utig4-1896-,utig4-1596-,utig4-1595-,utig4-927-,utig4-923-,utig4-924+,utig4-1892+,utig4-1530-,utig4-1529+,utig4-1532+,utig4-1534+,utig4-3593-,utig4-3591-,utig4-3589-,[N34594N:alt-utig4-3587],utig4-3384+"))]); 234 | } 235 | -------------------------------------------------------------------------------- /src/graph_algos/superbubble.rs: -------------------------------------------------------------------------------- 1 | use crate::graph::*; 2 | use log::debug; 3 | use std::cmp; 4 | use std::collections::HashMap; 5 | use std::collections::HashSet; 6 | 7 | type DistRange = (usize, usize); 8 | 9 | fn shift_range((min, max): DistRange, s: usize) -> DistRange { 10 | (min + s, max + s) 11 | } 12 | 13 | fn 
merge_range((min1, max1): DistRange, (min2, max2): DistRange) -> DistRange { 14 | (cmp::min(min1, min2), cmp::max(max1, max2)) 15 | } 16 | 17 | pub struct Superbubble { 18 | start_vertex: Vertex, 19 | end_vertex: Option, 20 | //vertex to path length range 21 | reached_vertices: HashMap, 22 | } 23 | 24 | impl Superbubble { 25 | fn link_dist_range(&self, l: Link, g: &Graph) -> Option { 26 | let &r = self.reached_vertices.get(&l.start)?; 27 | let enode_len = g.vertex_length(l.end); 28 | assert!(enode_len >= l.overlap); 29 | Some(shift_range(r, enode_len - l.overlap)) 30 | } 31 | 32 | pub fn longest_path(&self, g: &Graph) -> Path { 33 | let mut v = self.end_vertex.unwrap(); 34 | let mut longest_dist = self.reached_vertices.get(&v).unwrap().1; 35 | let mut rc_p = Path::new(v.rc()); 36 | 'outer: while v != self.start_vertex { 37 | //let l = self.heaviest_backtrace.get(v).unwrap(); 38 | for l in g.incoming_edges(v) { 39 | if let Some((_, l_d)) = self.link_dist_range(l, g) { 40 | if l_d == longest_dist { 41 | assert!(l.end == v); 42 | rc_p.append(l.rc()); 43 | v = l.start; 44 | longest_dist = self.reached_vertices.get(&l.start).unwrap().1; 45 | continue 'outer; 46 | } 47 | } 48 | } 49 | panic!("Couldn't recover bubble path"); 50 | } 51 | rc_p.reverse_complement() 52 | } 53 | 54 | pub fn shortest_path(&self, g: &Graph) -> Path { 55 | let mut v = self.end_vertex.unwrap(); 56 | let mut shortest_dist = self.reached_vertices.get(&v).unwrap().0; 57 | let mut rc_p = Path::new(v.rc()); 58 | 'outer: while v != self.start_vertex { 59 | //let l = self.heaviest_backtrace.get(v).unwrap(); 60 | for l in g.incoming_edges(v) { 61 | if let Some((l_d, _)) = self.link_dist_range(l, g) { 62 | if l_d == shortest_dist { 63 | assert!(l.end == v); 64 | rc_p.append(l.rc()); 65 | v = l.start; 66 | shortest_dist = self.reached_vertices.get(&l.start).unwrap().0; 67 | continue 'outer; 68 | } 69 | } 70 | } 71 | panic!("Couldn't recover bubble path"); 72 | } 73 | rc_p.reverse_complement() 74 | } 75 | 76 
| pub fn vertices(&self) -> impl Iterator + '_ { 77 | self.reached_vertices.keys() 78 | } 79 | 80 | pub fn inner_vertices(&self) -> impl Iterator + '_ { 81 | self.reached_vertices 82 | .keys() 83 | .filter(|&v| *v != self.start_vertex() && *v != self.end_vertex()) 84 | } 85 | 86 | pub fn start_vertex(&self) -> Vertex { 87 | self.start_vertex 88 | } 89 | 90 | pub fn end_vertex(&self) -> Vertex { 91 | self.end_vertex.unwrap() 92 | } 93 | 94 | pub fn length_range(&self, g: &Graph) -> (usize, usize) { 95 | let r = *self.reached_vertices.get(&self.end_vertex()).unwrap(); 96 | //currently start vertex and end vertex can't be the same 97 | assert!(self.start_vertex() != self.end_vertex()); 98 | shift_range(r, g.vertex_length(self.start_vertex())) 99 | //if self.start_vertex() != self.end_vertex() { 100 | // shift_range(r, g.node(self.start_vertex().node_id).length) 101 | //} else { 102 | // r 103 | //} 104 | } 105 | } 106 | 107 | //TODO can be heavily optimized (e.g. no maps, sets, etc) 108 | //TODO support other weights -- currently using max length 109 | //Maybe update to pseudo-code from miniasm paper? 
110 | pub struct SbSearchParams { 111 | pub max_length: usize, 112 | pub max_diff: usize, 113 | pub max_count: usize, 114 | } 115 | 116 | impl SbSearchParams { 117 | //all usize values should probably default to max values 118 | //FIXME provide builder 119 | pub fn unrestricted() -> SbSearchParams { 120 | SbSearchParams { 121 | max_length: usize::MAX, 122 | max_diff: usize::MAX, 123 | max_count: usize::MAX, 124 | } 125 | } 126 | } 127 | 128 | pub fn find_superbubble(g: &Graph, v: Vertex, params: &SbSearchParams) -> Option { 129 | find_superbubble_subgraph(g, v, params, None) 130 | } 131 | 132 | //TODO handle case when first/last vertex have other outgoing/incoming edges 133 | //last vertex case is almost handled 134 | pub fn find_superbubble_subgraph( 135 | g: &Graph, 136 | s: Vertex, 137 | params: &SbSearchParams, 138 | consider_vertex_f: Option<&dyn Fn(Vertex) -> bool>, 139 | ) -> Option { 140 | if let Some(f) = consider_vertex_f { 141 | if !f(s) { 142 | return None; 143 | } 144 | }; 145 | 146 | let mut bubble = Superbubble { 147 | start_vertex: s, 148 | reached_vertices: HashMap::new(), 149 | end_vertex: None, 150 | }; 151 | 152 | let outgoing_edge_cnt = |v| match consider_vertex_f { 153 | None => g.outgoing_edge_cnt(v), 154 | Some(avail) => g.outgoing_edges(v).iter().filter(|l| avail(l.end)).count(), 155 | }; 156 | 157 | let incoming_edge_cnt = |v| match consider_vertex_f { 158 | None => g.incoming_edge_cnt(v), 159 | Some(avail) => g 160 | .incoming_edges(v) 161 | .iter() 162 | .filter(|l| avail(l.start)) 163 | .count(), 164 | }; 165 | 166 | let outgoing_edges = |v| match consider_vertex_f { 167 | None => g.outgoing_edges(v), 168 | Some(avail) => g 169 | .outgoing_edges(v) 170 | .iter() 171 | .copied() 172 | .filter(|l| avail(l.end)) 173 | .collect(), 174 | }; 175 | 176 | let _incoming_edges = |v| match consider_vertex_f { 177 | None => g.incoming_edges(v), 178 | Some(avail) => g 179 | .incoming_edges(v) 180 | .iter() 181 | .copied() 182 | .filter(|l| 
avail(l.start)) 183 | .collect(), 184 | }; 185 | 186 | if outgoing_edge_cnt(bubble.start_vertex) < 2 187 | //same check, but excluding loops 188 | || outgoing_edges(bubble.start_vertex).iter().filter(|l| l.start != l.end).count() < 2 189 | { 190 | return None; 191 | } 192 | 193 | debug!( 194 | "Adding starting vertex {} to stack", 195 | g.v_str(bubble.start_vertex) 196 | ); 197 | //vertices with all incoming edges considered (can be processed) 198 | let mut can_be_processed: Vec = vec![bubble.start_vertex]; 199 | bubble.reached_vertices.insert(bubble.start_vertex, (0, 0)); 200 | 201 | //reached vertices that can't be processed yet 202 | let mut not_ready_cnt = 0; 203 | let mut remaining_incoming: HashMap = HashMap::new(); 204 | 205 | while !can_be_processed.is_empty() { 206 | if bubble.reached_vertices.len() > params.max_count { 207 | return None; 208 | } 209 | 210 | let v = can_be_processed.pop().unwrap(); 211 | debug!("Adding vertex {} to the bubble", g.v_str(v)); 212 | 213 | if outgoing_edge_cnt(v) == 0 { 214 | debug!("Hit dead-end"); 215 | return None; 216 | } 217 | 218 | debug!("Looking at neighbors"); 219 | for l in outgoing_edges(v) { 220 | let w = l.end; 221 | if w == bubble.start_vertex { 222 | return None; 223 | //FIXME re-enable after dealing with usage wrt start/end symmetry absense 224 | //if v != self.start_vertex { 225 | // //no loops involiving the start vertex 226 | // return false; 227 | //} else { 228 | // //unless self-loop 229 | // continue; 230 | //} 231 | } 232 | 233 | if !bubble.reached_vertices.contains_key(&w) { 234 | if bubble.reached_vertices.contains_key(&w.rc()) { 235 | debug!( 236 | "Reverse-complement vertex {} was already reached", 237 | g.v_str(w.rc()) 238 | ); 239 | return None; 240 | } 241 | not_ready_cnt += 1; 242 | remaining_incoming.insert(w, incoming_edge_cnt(w)); 243 | bubble 244 | .reached_vertices 245 | .insert(w, bubble.link_dist_range(l, g).unwrap()); 246 | } 247 | let rem_inc = remaining_incoming.get_mut(&w).unwrap(); 
248 | *rem_inc -= 1; 249 | //self.reached_vertices.get(w) = 250 | bubble.reached_vertices.insert( 251 | w, 252 | merge_range( 253 | *bubble.reached_vertices.get(&w).unwrap(), 254 | bubble.link_dist_range(l, g).unwrap(), 255 | ), 256 | ); 257 | 258 | if *remaining_incoming.get(&w).unwrap() == 0 { 259 | can_be_processed.push(w); 260 | not_ready_cnt -= 1; 261 | } 262 | } 263 | 264 | if can_be_processed.len() == 1 && not_ready_cnt == 0 { 265 | //FIXME second case is not a classic one, check that it works! 266 | //Also needs more work to get final vertex!!! 267 | //|| (can_be_processed.len() == 0 && not_ready_cnt == 1) 268 | //process last vertex? 269 | let t = can_be_processed.pop().unwrap(); 270 | debug!("End node found! Vertex {}", g.v_str(t)); 271 | 272 | let &(min_len, max_len) = bubble.reached_vertices.get(&t).unwrap(); 273 | 274 | let v_len = g.vertex_length(t); 275 | 276 | //FIXME it seems like only start_pos is ever checked 277 | if min_len > v_len && (min_len - v_len) > params.max_length { 278 | debug!( 279 | "Length of minimal additional sequence {} exceeded limit {}", 280 | min_len - v_len, 281 | params.max_length 282 | ); 283 | return None; 284 | } 285 | if max_len - min_len > params.max_diff { 286 | debug!( 287 | "Minimal and maximal lengths differed by {} exceeded limit {}", 288 | max_len - min_len, 289 | params.max_diff 290 | ); 291 | return None; 292 | } 293 | bubble.end_vertex = Some(t); 294 | return Some(bubble); 295 | } 296 | } 297 | 298 | debug!("No more nodes could be added"); 299 | debug!( 300 | "Finished search for starting vertex {}", 301 | g.v_str(bubble.start_vertex) 302 | ); 303 | None 304 | } 305 | 306 | pub fn find_all_outer(g: &Graph, params: &SbSearchParams) -> Vec { 307 | let mut used_starts = HashSet::new(); 308 | let mut start_2_bubble = HashMap::new(); 309 | for v in g.all_vertices() { 310 | if used_starts.contains(&v) { 311 | continue; 312 | } 313 | if let Some(bubble) = find_superbubble(g, v, params) { 314 | 
//used_starts.insert(bubble.start_vertex()); 315 | used_starts.insert(bubble.end_vertex().rc()); 316 | assert!(!start_2_bubble.contains_key(&bubble.end_vertex().rc())); 317 | for &w in bubble.inner_vertices() { 318 | used_starts.insert(w); 319 | used_starts.insert(w.rc()); 320 | start_2_bubble.remove(&w); 321 | start_2_bubble.remove(&w.rc()); 322 | } 323 | start_2_bubble.insert(v, bubble); 324 | } 325 | } 326 | start_2_bubble.into_values().collect() 327 | } 328 | 329 | pub type BubbleChain = Vec; 330 | 331 | //TODO maybe switch to Option? 332 | pub fn find_chain_ahead(g: &Graph, init_v: Vertex, params: &SbSearchParams) -> BubbleChain { 333 | let mut chain = Vec::new(); 334 | //FIXME no need to check here, since we are marking everything, but useful for general code 335 | let mut v = init_v; 336 | 337 | loop { 338 | match find_superbubble(g, v, params) { 339 | None => break, 340 | Some(bubble) => { 341 | v = bubble.end_vertex(); 342 | chain.push(bubble); 343 | if v == init_v { 344 | break; 345 | } 346 | } 347 | } 348 | } 349 | chain 350 | } 351 | 352 | //TODO test 353 | pub fn find_maximal_chain(g: &Graph, mut init_v: Vertex, params: &SbSearchParams) -> BubbleChain { 354 | let chain_back = find_chain_ahead(g, init_v.rc(), params); 355 | if !chain_back.is_empty() { 356 | init_v = chain_back.last().unwrap().end_vertex().rc(); 357 | } 358 | find_chain_ahead(g, init_v, params) 359 | } 360 | 361 | pub fn find_maximal_chains(g: &Graph, params: &SbSearchParams) -> Vec { 362 | let mut considered_start_nodes = HashSet::new(); 363 | let mut maximal_chains = Vec::new(); 364 | for outer_bubble in find_all_outer(g, params) { 365 | let v = outer_bubble.start_vertex(); 366 | if considered_start_nodes.contains(&v.node_id) { 367 | continue; 368 | } 369 | let chain = find_maximal_chain(g, v, params); 370 | assert!(!chain.is_empty()); 371 | for bubble in &chain { 372 | considered_start_nodes.insert(bubble.start_vertex().node_id); 373 | 
considered_start_nodes.insert(bubble.end_vertex().node_id); 374 | } 375 | maximal_chains.push(chain); 376 | } 377 | maximal_chains 378 | } 379 | 380 | //will need adjustment if ever 'start' can be same as 'end' in superbubble 381 | pub fn length_range(chain: &[Superbubble], g: &Graph) -> DistRange { 382 | let mut tot_min = 0; 383 | let mut tot_max = 0; 384 | for bubble in chain { 385 | //TODO implement via negative shift and tuple addition 386 | let (min, max) = bubble.length_range(g); 387 | let s_l = g.vertex_length(bubble.start_vertex()); 388 | tot_min += min - s_l; 389 | tot_max += max - s_l; 390 | } 391 | if !chain.is_empty() && chain[0].start_vertex() != chain.last().unwrap().end_vertex() { 392 | let s_l = g.vertex_length(chain[0].start_vertex()); 393 | (tot_min + s_l, tot_max + s_l) 394 | } else { 395 | (tot_min, tot_max) 396 | } 397 | } 398 | 399 | //TODO make chain its own structure not to allow empty chains 400 | pub fn longest_path(chain: &[Superbubble], g: &Graph) -> Option { 401 | if chain.is_empty() { 402 | return None; 403 | } 404 | let start_vertex = chain[0].start_vertex(); 405 | let mut total = chain[0].longest_path(g); 406 | for (i, bubble) in chain.iter().enumerate() { 407 | if i == 0 { 408 | continue; 409 | } 410 | let mut p = bubble.longest_path(g); 411 | if i == (chain.len() - 1) && bubble.end_vertex() == start_vertex { 412 | p.trim(1); 413 | } 414 | total.extend(p); 415 | } 416 | Some(total) 417 | } 418 | 419 | pub fn linear_frac(chain: &[Superbubble], g: &Graph) -> f32 { 420 | assert!(!chain.is_empty()); 421 | let start_vertex = chain[0].start_vertex(); 422 | let mut total_linear = g.vertex_length(start_vertex); 423 | for (i, bubble) in chain.iter().enumerate() { 424 | if bubble.end_vertex() != start_vertex { 425 | total_linear += g.vertex_length(bubble.end_vertex()); 426 | } else { 427 | assert!(i == chain.len() - 1); 428 | } 429 | } 430 | let longest_path_len = length_range(chain, g).1; 431 | if total_linear > longest_path_len { 432 | 1. 
433 | } else { 434 | total_linear as f32 / longest_path_len as f32 435 | } 436 | } 437 | 438 | //FIXME implement flattened vertex iterator even if it has duplicates 439 | pub fn check_chain(chain: &[Superbubble], mut f: F) -> bool 440 | where 441 | F: FnMut(&Vertex) -> bool, 442 | { 443 | chain.iter().flat_map(|b| b.vertices()).all(&mut f) 444 | } 445 | -------------------------------------------------------------------------------- /src/pseudo_hap.rs: -------------------------------------------------------------------------------- 1 | use crate::graph::*; 2 | use crate::graph_algos::*; 3 | use std::collections::HashSet; 4 | 5 | pub struct LinearBlock { 6 | instance_path: Path, 7 | known_alt_nodes: HashSet, 8 | } 9 | 10 | impl LinearBlock { 11 | //pub fn print(&self, g: &Graph) -> String { 12 | // format!("", self.instance_path().print(g), 13 | // self.known_alt_nodes.iter().map(|&node_id| g.name(node_id)).collect::>().join(",")) 14 | //} 15 | 16 | pub fn instance_path(&self) -> &Path { 17 | &self.instance_path 18 | } 19 | 20 | pub fn known_alt_nodes(&self) -> &HashSet { 21 | &self.known_alt_nodes 22 | } 23 | 24 | pub fn all_nodes(&self) -> impl Iterator + '_ { 25 | self.instance_path 26 | .vertices() 27 | .iter() 28 | .map(|v| v.node_id) 29 | .chain(self.known_alt_nodes.iter().copied()) 30 | } 31 | 32 | fn can_merge_in(&self, other: &LinearBlock) -> bool { 33 | self.instance_path.can_merge_in(&other.instance_path) 34 | && other 35 | .all_nodes() 36 | .all(|n| !self.known_alt_nodes.contains(&n)) 37 | } 38 | 39 | fn merge_in(&mut self, other: LinearBlock) { 40 | debug_assert!(self.can_merge_in(&other)); 41 | self.instance_path.merge_in(other.instance_path); 42 | self.known_alt_nodes 43 | .extend(other.known_alt_nodes.into_iter()); 44 | } 45 | 46 | fn try_merge_in(mut self, other: LinearBlock) -> Option { 47 | if self.can_merge_in(&other) { 48 | self.merge_in(other); 49 | Some(self) 50 | } else { 51 | None 52 | } 53 | } 54 | 55 | fn from_path(path: Path, iter: impl 
Iterator) -> LinearBlock { 56 | LinearBlock { 57 | instance_path: path, 58 | known_alt_nodes: iter.map(|v| v.node_id).collect(), 59 | } 60 | } 61 | 62 | fn from_bubble(g: &Graph, bubble: superbubble::Superbubble) -> LinearBlock { 63 | let p = bubble.longest_path(g); 64 | let mut nodes: HashSet = bubble.vertices().map(|v| v.node_id).collect(); 65 | for v in p.vertices() { 66 | nodes.remove(&v.node_id); 67 | } 68 | LinearBlock { 69 | instance_path: p, 70 | known_alt_nodes: nodes.into_iter().collect(), 71 | } 72 | } 73 | 74 | fn from_bubble_chain(g: &Graph, bubble_chain: superbubble::BubbleChain) -> LinearBlock { 75 | assert!(!bubble_chain.is_empty()); 76 | let mut block = Self::vertex_block(bubble_chain[0].start_vertex()); 77 | for b in bubble_chain.into_iter() { 78 | let b_lb = Self::from_bubble(g, b); 79 | assert!(block.can_merge_in(&b_lb)); 80 | block.merge_in(b_lb); 81 | } 82 | block 83 | } 84 | 85 | fn vertex_block(v: Vertex) -> LinearBlock { 86 | LinearBlock { 87 | instance_path: Path::new(v), 88 | known_alt_nodes: HashSet::new(), 89 | } 90 | } 91 | 92 | fn search_ahead(g: &Graph, v: Vertex, params: &superbubble::SbSearchParams) -> LinearBlock { 93 | let chain = superbubble::find_chain_ahead(g, v, params); 94 | if !chain.is_empty() { 95 | Self::from_bubble_chain(g, chain) 96 | } else { 97 | Self::vertex_block(v) 98 | } 99 | } 100 | 101 | //fn is_bridge(&self, g: &Graph) -> bool { 102 | // g.incoming_edge_cnt(self.instance_path.start()) == 1 103 | // && g.outgoing_edge_cnt(self.instance_path.end()) == 1 104 | //} 105 | 106 | fn reverse_complement(self) -> LinearBlock { 107 | LinearBlock { 108 | instance_path: self.instance_path.reverse_complement(), 109 | known_alt_nodes: self.known_alt_nodes, 110 | //..self 111 | } 112 | } 113 | } 114 | 115 | //todo maybe support blocks here? 
(use block search and is_block method) 116 | #[allow(clippy::many_single_char_names)] 117 | fn bridged_by_vertex(g: &Graph, v: Vertex) -> Option { 118 | if g.incoming_edge_cnt(v) == 1 && g.outgoing_edge_cnt(v) == 1 { 119 | let u = g.incoming_edges(v)[0].start; 120 | let w = g.outgoing_edges(v)[0].end; 121 | if u.node_id == v.node_id || w.node_id == v.node_id || w.node_id == u.node_id { 122 | return None; 123 | } 124 | let mut p = Path::from_link(g.incoming_edges(v)[0]); 125 | p.append(g.outgoing_edges(v)[0]); 126 | Some(p) 127 | } else { 128 | None 129 | } 130 | } 131 | 132 | fn other_outgoing(g: &Graph, v: Vertex, l: Link) -> Option { 133 | if g.outgoing_edge_cnt(v) == 2 { 134 | let alt = g 135 | .outgoing_edges(v) 136 | .iter() 137 | .copied() 138 | .find(|&x| x != l) 139 | .unwrap(); 140 | assert!(alt.end != l.end); 141 | return Some(alt); 142 | } 143 | None 144 | } 145 | 146 | fn other_incoming(g: &Graph, v: Vertex, l: Link) -> Option { 147 | if g.incoming_edge_cnt(v) == 2 { 148 | let alt = g 149 | .incoming_edges(v) 150 | .iter() 151 | .copied() 152 | .find(|&x| x != l) 153 | .unwrap(); 154 | assert!(alt.start != l.start); 155 | return Some(alt); 156 | } 157 | None 158 | } 159 | 160 | //Bridge is a path of length 3 with middle vertex having single incoming and single outgoing link 161 | //Returning both middle vertex and entire path 162 | fn bridge_ahead(g: &Graph, v: Vertex) -> Option { 163 | let bridges: Vec = g 164 | .outgoing_edges(v) 165 | .iter() 166 | .filter_map(|l| bridged_by_vertex(g, l.end)) 167 | .collect(); 168 | if bridges.len() == 1 { 169 | Some(bridges.into_iter().next().unwrap()) 170 | } else { 171 | None 172 | } 173 | } 174 | 175 | //TODO move into PrimaryDecomposer and parameterize with superbubble search params 176 | fn unique_block_ahead(g: &Graph, v: Vertex, unique_block_len: usize) -> Option { 177 | let block = LinearBlock::search_ahead(g, v, &superbubble::SbSearchParams::unrestricted()); 178 | if block.instance_path.total_length(g) >= 
unique_block_len { 179 | Some(block) 180 | } else { 181 | None 182 | } 183 | } 184 | 185 | fn unambiguous_outgoing(g: &Graph, v: Vertex) -> Option { 186 | match g.outgoing_edge_cnt(v) { 187 | 1 => Some(g.outgoing_edges(v)[0]), 188 | _ => None, 189 | } 190 | } 191 | 192 | fn forward_extension(g: &Graph, v: Vertex, unique_block_len: usize) -> Option { 193 | //TODO refactor 194 | extension_via_bridge(g, v, unique_block_len) 195 | .or_else(|| extension_in_deadend(g, v, unique_block_len)) 196 | .or_else(|| extension_out_deadend(g, v, unique_block_len)) 197 | } 198 | 199 | // x a (for 'alt') 200 | // \ 201 | //- v - w - 202 | #[allow(clippy::many_single_char_names)] 203 | fn extension_in_deadend(g: &Graph, v: Vertex, unique_block_len: usize) -> Option { 204 | let l = unambiguous_outgoing(g, v)?; 205 | let w = l.end; 206 | let a = other_incoming(g, w, l)?.start; 207 | 208 | if is_deadend(g, a) { 209 | let ext_block = LinearBlock::from_path(Path::from_link(l), std::iter::once(a)); 210 | let ext_block = ext_block.try_merge_in(unique_block_ahead(g, w, unique_block_len)?)?; 211 | Some(ext_block) 212 | } else { 213 | None 214 | } 215 | } 216 | 217 | // a x a x 218 | // / or / 219 | //- v - w - - v - o x 220 | //l -- 'horizontal' link 221 | fn extension_out_deadend(g: &Graph, v: Vertex, unique_block_len: usize) -> Option { 222 | if g.outgoing_edge_cnt(v) == 2 { 223 | //TODO generalize? 
224 | let mut deadend_links: Vec = g 225 | .outgoing_edges(v) 226 | .into_iter() 227 | .filter(|&l| is_deadend(g, l.end)) 228 | .collect(); 229 | deadend_links.sort_by_key(|&l| g.vertex_length(l.end)); 230 | match deadend_links.len() { 231 | 2 => { 232 | assert!(deadend_links[0].end != deadend_links[1].end); 233 | let a = deadend_links[0].end; 234 | let l = deadend_links[1]; 235 | let ext = LinearBlock::from_path(Path::from_link(l), std::iter::once(a)); 236 | return Some(ext); 237 | } 238 | 1 => { 239 | let a = deadend_links[0].end; 240 | let l = other_outgoing(g, v, deadend_links[0]).unwrap(); 241 | let mut ext = LinearBlock::from_path(Path::from_link(l), std::iter::once(a)); 242 | ext.merge_in(unique_block_ahead(g, l.end, unique_block_len)?); 243 | return Some(ext); 244 | } 245 | x => assert!(x == 0), 246 | } 247 | } 248 | None 249 | } 250 | 251 | // s t 252 | // / \ 253 | //- u - v - w - 254 | #[allow(clippy::many_single_char_names)] 255 | fn extension_via_bridge(g: &Graph, u: Vertex, unique_block_len: usize) -> Option { 256 | if let Some(bridge_p) = bridge_ahead(g, u) { 257 | assert!(bridge_p.len() == 3); 258 | //let v = bridge_p.vertices()[1]; 259 | let w = bridge_p.end(); 260 | let s = other_outgoing(g, u, bridge_p.link_at(0))?.end; 261 | let t = other_incoming(g, w, bridge_p.link_at(1))?.start; 262 | 263 | let ext_block = LinearBlock::from_path( 264 | bridge_p, 265 | admissible_alt_class(g, s, t, unique_block_len)?.into_iter(), 266 | ); 267 | let ext_block = ext_block.try_merge_in(unique_block_ahead(g, w, unique_block_len)?)?; 268 | Some(ext_block) 269 | } else { 270 | None 271 | } 272 | } 273 | 274 | //checks if s & t belong to one of considered alt cases and returns alt vertices 275 | fn admissible_alt_class( 276 | g: &Graph, 277 | s: Vertex, 278 | t: Vertex, 279 | unique_block_len: usize, 280 | ) -> Option> { 281 | if s == t { 282 | //FIXME in this case there can be loop on top of s which won't be added to alt 283 | return Some(vec![s]); 284 | } 285 | if 
is_deadend(g, s) && is_deadend(g, t) { 286 | return Some(vec![s, t]); 287 | } 288 | joining_vertices(g, s, t, unique_block_len) 289 | } 290 | 291 | //returns all vertices lying on 292 | //fn has_alt_path(g: &Graph, s: Vertex, t: Vertex, node_len_thr: usize) 293 | //-> Option> { 294 | // let (visited, _) = graph_algos::bounded_dfs(g, w, node_len_thr); 295 | // assert!(visited.contains(&w)); 296 | // if visited.contains(&p.end()) { 297 | // return true; 298 | // } 299 | //} 300 | 301 | //TODO Generalize maybe support simple blocks and/or extra dead-ends (need to then return subgraph info) 302 | fn is_deadend(g: &Graph, v: Vertex) -> bool { 303 | g.outgoing_edge_cnt(v) == 0 || g.incoming_edge_cnt(v) == 0 304 | } 305 | 306 | fn visited_if_reachable( 307 | g: &Graph, 308 | v: Vertex, 309 | w: Vertex, 310 | direction: dfs::TraversalDirection, 311 | max_node_len: usize, 312 | ) -> Option> { 313 | let mut dfs = dfs::DFS::new(g, direction, None); 314 | dfs.set_max_node_len(max_node_len); 315 | dfs.extend_blocked(std::iter::once(w)); 316 | dfs.run_from(v); 317 | if dfs.boundary().contains(&w) { 318 | Some(dfs.exit_order().iter().copied().collect()) 319 | } else { 320 | None 321 | } 322 | } 323 | 324 | fn joining_vertices(g: &Graph, s: Vertex, t: Vertex, max_node_len: usize) -> Option> { 325 | let visited_fwd = 326 | visited_if_reachable(g, s, t, dfs::TraversalDirection::FORWARD, max_node_len)?; 327 | let visited_rev = 328 | visited_if_reachable(g, t, s, dfs::TraversalDirection::REVERSE, max_node_len).unwrap(); 329 | let mut reachable: Vec = visited_fwd.intersection(&visited_rev).copied().collect(); 330 | reachable.push(s); 331 | reachable.push(t); 332 | Some(reachable) 333 | } 334 | 335 | struct PrimaryDecomposer<'a> { 336 | g: &'a Graph, 337 | unique_block_len: usize, 338 | used_nodes: HashSet, 339 | } 340 | 341 | //TODO extend to situations when no single end vertex 342 | //(i.e. 
blocks ending with simple bubbles) 343 | fn end_vertex(b: &LinearBlock) -> Vertex { 344 | b.instance_path.end() 345 | } 346 | 347 | impl<'a> PrimaryDecomposer<'a> { 348 | fn new(g: &Graph, unique_block_len: usize) -> PrimaryDecomposer { 349 | PrimaryDecomposer { 350 | g, 351 | unique_block_len, 352 | used_nodes: HashSet::new(), 353 | } 354 | } 355 | 356 | fn extend_forward(&self, block: &mut LinearBlock) -> bool { 357 | let v = end_vertex(block); 358 | if let Some(ext) = forward_extension(self.g, v, self.unique_block_len) { 359 | if ext.all_nodes().all(|n| !self.used_nodes.contains(&n)) && block.can_merge_in(&ext) { 360 | block.merge_in(ext); 361 | return true; 362 | } 363 | } 364 | false 365 | } 366 | 367 | fn max_extend_forward(&self, block: &mut LinearBlock) -> bool { 368 | let mut extended = false; 369 | while self.extend_forward(block) { 370 | extended = true; 371 | } 372 | extended 373 | } 374 | 375 | //return none if failed to extend 376 | //FIXME make logic less surprising 377 | fn extended_block(&self, mut block: LinearBlock) -> Option { 378 | let mut extended = self.max_extend_forward(&mut block); 379 | let mut rc_block = block.reverse_complement(); 380 | extended |= self.max_extend_forward(&mut rc_block); 381 | if extended { 382 | Some(rc_block.reverse_complement()) 383 | } else { 384 | None 385 | } 386 | } 387 | 388 | fn run(&mut self) -> Vec { 389 | let mut resulting_blocks = Vec::new(); 390 | for simple_block in simple_unique_blocks(self.g, self.unique_block_len) { 391 | if simple_block 392 | .all_nodes() 393 | .all(|n| !self.used_nodes.contains(&n)) 394 | { 395 | if let Some(block) = self.extended_block(simple_block) { 396 | assert!(block.all_nodes().all(|n| !self.used_nodes.contains(&n))); 397 | self.used_nodes.extend(block.all_nodes()); 398 | resulting_blocks.push(block); 399 | } 400 | } 401 | } 402 | 403 | for simple_block in simple_unique_blocks(self.g, self.unique_block_len) { 404 | if simple_block 405 | .all_nodes() 406 | .any(|n| 
self.used_nodes.contains(&n)) 407 | { 408 | assert!(simple_block 409 | .all_nodes() 410 | .all(|n| self.used_nodes.contains(&n))); 411 | } else { 412 | resulting_blocks.push(simple_block); 413 | } 414 | } 415 | 416 | resulting_blocks 417 | } 418 | } 419 | 420 | //prioritization step is cheap 421 | fn simple_unique_blocks(g: &Graph, unique_block_len: usize) -> Vec { 422 | use superbubble::*; 423 | let nodes_in_sccs = scc::nodes_in_sccs(g, &scc::strongly_connected(g)); 424 | let mut used_nodes = HashSet::new(); 425 | //block and it's 'linear fraction' -- 426 | // for single node always 1, 427 | // for bubble chains -- total fraction of 'joins' in instance paths 428 | let mut unique_blocks = Vec::new(); 429 | 430 | //pub fn linear_frac(chain: &BubbleChain, g: &Graph) -> f32 { 431 | for chain in find_maximal_chains(g, &SbSearchParams::unrestricted()) 432 | .into_iter() 433 | .filter(|c| check_chain(c, |v| !nodes_in_sccs.contains(&v.node_id)) 434 | //FIXME think of supporting looped bubble chains 435 | && c.first().unwrap().start_vertex() != c.last().unwrap().end_vertex() 436 | //FIXME think if makes more sense to check shortest one 437 | //but longest path used elsewhere 438 | && length_range(c, g).1 >= unique_block_len) { 439 | assert!(!chain.is_empty()); 440 | assert!(check_chain(&chain, |v| !used_nodes.contains(&v.node_id))); 441 | for bubble in &chain { 442 | used_nodes.extend(bubble.vertices().map(|&v| v.node_id)); 443 | } 444 | let linear_frac = linear_frac(&chain, g); 445 | unique_blocks.push((LinearBlock::from_bubble_chain(g, chain), linear_frac)); 446 | } 447 | 448 | for (node_id, node) in g.all_nodes().enumerate() { 449 | if node.length >= unique_block_len && !used_nodes.contains(&node_id) { 450 | unique_blocks.push((LinearBlock::vertex_block(Vertex::forward(node_id)), 1.)); 451 | } 452 | } 453 | 454 | unique_blocks.sort_by_cached_key(|(block, lin_frac)| { 455 | let length = block.instance_path.total_length(g); 456 | //less linear fraction is better, pulled 
in one of 11 buckets [0..10] depending on percentage 457 | let lin_frac_grade = (lin_frac * 10.).round() as u32; 458 | //within the same 'linear fraction' bucket the longer block the better 459 | (lin_frac_grade, usize::MAX - length) 460 | }); 461 | 462 | unique_blocks.into_iter().map(|(block, _)| block).collect() 463 | } 464 | 465 | pub fn pseudo_hap_decompose(g: &Graph, unique_block_len: usize) -> Vec { 466 | let mut decomposer = PrimaryDecomposer::new(g, unique_block_len); 467 | decomposer.run() 468 | } 469 | 470 | // s t 471 | // / \ 472 | //- u - v - w - 473 | #[allow(clippy::many_single_char_names)] 474 | pub fn detect_gap(g: &Graph, u: Vertex) -> Option { 475 | if let Some(bridge_p) = bridge_ahead(g, u) { 476 | assert!(bridge_p.len() == 3); 477 | //let v = bridge_p.vertices()[1]; 478 | let w = bridge_p.end(); 479 | let s_l = other_outgoing(g, u, bridge_p.link_at(0))?; 480 | let t_l = other_incoming(g, w, bridge_p.link_at(1))?; 481 | let s = s_l.end; 482 | let t = t_l.start; 483 | 484 | if is_deadend(g, s) && is_deadend(g, t) { 485 | return Some(GapInfo { 486 | start: s, 487 | end: t, 488 | gap_size: (bridge_p.total_length(g) as i64 489 | - Path::from_link(s_l).total_length(g) as i64 490 | - Path::from_link(t_l).total_length(g) as i64), 491 | info: String::from(""), 492 | }); 493 | } 494 | } 495 | None 496 | } 497 | -------------------------------------------------------------------------------- /src/trio.rs: -------------------------------------------------------------------------------- 1 | use crate::graph::*; 2 | use crate::graph_algos::dfs; 3 | use crate::graph_algos::superbubble; 4 | use log::debug; 5 | use log::info; 6 | use std::cmp::{max, min}; 7 | use std::collections::{HashMap, HashSet}; 8 | use std::fs::File; 9 | use std::io::Result as IOResult; 10 | use std::io::{BufRead, BufReader}; 11 | use std::path::PathBuf; 12 | 13 | //TODO add UNASSIGNED to display useful info for all nodes 14 | #[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord)] 
15 | pub enum TrioGroup { 16 | MATERNAL, 17 | PATERNAL, 18 | HOMOZYGOUS, 19 | ISSUE, 20 | } 21 | 22 | impl TrioGroup { 23 | pub fn incompatible(g1: TrioGroup, g2: TrioGroup) -> bool { 24 | g1 == TrioGroup::ISSUE 25 | || g2 == TrioGroup::ISSUE 26 | || (g1 == TrioGroup::MATERNAL && g2 == TrioGroup::PATERNAL) 27 | || (g1 == TrioGroup::PATERNAL && g2 == TrioGroup::MATERNAL) 28 | } 29 | 30 | pub fn compatible(g1: TrioGroup, g2: TrioGroup) -> bool { 31 | !Self::incompatible(g1, g2) 32 | } 33 | 34 | pub fn is_definite(&self) -> bool { 35 | matches!(*self, TrioGroup::MATERNAL | TrioGroup::PATERNAL) 36 | } 37 | 38 | pub fn blend(g1: TrioGroup, g2: TrioGroup) -> TrioGroup { 39 | assert!(g1 != TrioGroup::ISSUE && g2 != TrioGroup::ISSUE); 40 | if g1 == g2 { 41 | g1 42 | } else { 43 | TrioGroup::HOMOZYGOUS 44 | } 45 | } 46 | 47 | pub fn optional_blend(og1: Option, og2: Option) -> Option { 48 | match og1 { 49 | None => og2, 50 | Some(g1) => match og2 { 51 | None => og1, 52 | Some(g2) => Some(Self::blend(g1, g2)), 53 | }, 54 | } 55 | } 56 | } 57 | 58 | #[derive(Clone, Debug)] 59 | pub struct Assignment { 60 | pub group: TrioGroup, 61 | pub info: String, 62 | } 63 | 64 | #[derive(Clone, Debug)] 65 | pub struct TrioInfo { 66 | pub node_name: String, 67 | pub mat: usize, 68 | pub pat: usize, 69 | } 70 | 71 | impl TrioInfo { 72 | fn _total(&self) -> usize { 73 | self.mat + self.pat 74 | } 75 | 76 | fn counts_str(&self) -> String { 77 | format!("m{}:p{}", self.mat, self.pat) 78 | } 79 | } 80 | 81 | pub fn read_trio(path: &PathBuf) -> IOResult> { 82 | let mut infos = Vec::new(); 83 | let file = File::open(path)?; 84 | for line in BufReader::new(file).lines() { 85 | let l = line?; 86 | let split: Vec<&str> = l.trim().split('\t').collect(); 87 | if &split[0].to_lowercase() != "node" && &split[0].to_lowercase() != "contig" { 88 | let node_name = String::from(split[0]); 89 | let mat: usize = split[1].parse().expect("Invalid maternal count"); 90 | let pat: usize = 
split[2].parse().expect("Invalid paternal count"); 91 | infos.push(TrioInfo { 92 | node_name, 93 | mat, 94 | pat, 95 | }) 96 | } 97 | } 98 | Ok(infos) 99 | } 100 | 101 | //TODO add template parameter 102 | #[derive(Clone)] 103 | pub struct AssignmentStorage { 104 | storage: HashMap, 105 | } 106 | 107 | impl Default for AssignmentStorage { 108 | fn default() -> Self { 109 | Self::new() 110 | } 111 | } 112 | 113 | //TODO remove by_name methods 114 | impl AssignmentStorage { 115 | pub fn new() -> AssignmentStorage { 116 | AssignmentStorage { 117 | storage: HashMap::new(), 118 | } 119 | } 120 | 121 | pub fn assigned(&self) -> impl Iterator + '_ { 122 | self.storage.keys().copied() 123 | } 124 | 125 | pub fn is_definite(&self, node_id: usize) -> bool { 126 | if let Some(assign) = self.storage.get(&node_id) { 127 | if TrioGroup::is_definite(&assign.group) { 128 | return true; 129 | } 130 | } 131 | false 132 | } 133 | 134 | pub fn assign>( 135 | &mut self, 136 | node_id: usize, 137 | group: TrioGroup, 138 | info: S, 139 | ) -> Option { 140 | self.storage.insert( 141 | node_id, 142 | Assignment { 143 | group, 144 | info: info.into(), 145 | }, 146 | ) 147 | } 148 | 149 | pub fn update_group(&mut self, node_id: usize, group: TrioGroup) { 150 | match self.group(node_id) { 151 | //FIXME how to simultaneously check key and get mutable reference to stored value? 
152 | Some(exist_group) => { 153 | self.storage.get_mut(&node_id).unwrap().group = TrioGroup::blend(exist_group, group) 154 | } 155 | None => { 156 | self.assign(node_id, group, ""); 157 | } 158 | }; 159 | } 160 | 161 | pub fn update_all(&mut self, iter: impl Iterator, group: TrioGroup) { 162 | for node_id in iter { 163 | self.update_group(node_id, group); 164 | } 165 | } 166 | 167 | pub fn get(&self, node_id: usize) -> Option<&Assignment> { 168 | self.storage.get(&node_id) 169 | } 170 | 171 | pub fn get_mut(&mut self, node_id: usize) -> Option<&mut Assignment> { 172 | self.storage.get_mut(&node_id) 173 | } 174 | 175 | pub fn contains(&self, node_id: usize) -> bool { 176 | self.storage.contains_key(&node_id) 177 | } 178 | 179 | pub fn group(&self, node_id: usize) -> Option { 180 | self.storage.get(&node_id).map(|assign| assign.group) 181 | } 182 | } 183 | 184 | pub struct GroupAssignmentSettings { 185 | /// Minimal number of parent-specific markers required for assigning parental group to a node 186 | pub assign_cnt: usize, 187 | /// Require at least (node_length / ) markers within the node for parental group assignment 188 | pub assign_sparsity: usize, 189 | /// Sets minimal marker excess for assigning a parental group to :1 190 | pub assign_ratio: f64, 191 | /// Sets minimal marker excess for assigning a parental group of solid nodes to :1 192 | pub solid_ratio: f64, 193 | /// Minimal node length for assigning ISSUE label 194 | pub issue_len: usize, 195 | /// Minimal number of markers for assigning ISSUE label, will typically be set to a value >= assign_cnt 196 | pub issue_cnt: usize, 197 | /// Require at least (node_length / ) markers for assigning ISSUE label, typically set to a value >= assign_sparsity 198 | pub issue_sparsity: usize, 199 | /// Require primary marker excess BELOW :1 for assigning ISSUE label. 
Must be <= marker_ratio 200 | pub issue_ratio: f64, 201 | } 202 | 203 | impl Default for GroupAssignmentSettings { 204 | fn default() -> Self { 205 | Self { 206 | assign_cnt: 10, 207 | assign_sparsity: 10_000, 208 | assign_ratio: 5., 209 | solid_ratio: 5., 210 | issue_len: 50_000, 211 | issue_cnt: 10, 212 | issue_sparsity: 10_000, 213 | issue_ratio: 5., 214 | } 215 | } 216 | } 217 | 218 | pub fn assign_parental_groups( 219 | g: &Graph, 220 | trio_infos: &[TrioInfo], 221 | settings: &GroupAssignmentSettings, 222 | solid_len: usize, 223 | solid_cov: f64, 224 | ) -> AssignmentStorage { 225 | let mut assignments = AssignmentStorage::new(); 226 | 227 | info!("Running parental group assignment."); 228 | debug!("Parental group assignment settings: Minimal marker count -- {}; Minimal sparsity -- 1 in {}; Minimal ratio -- {} to 1", 229 | settings.assign_cnt, settings.assign_sparsity, settings.assign_ratio); 230 | debug!("ISSUE labeling settings: Minimal marker count -- {}; Minimal sparsity -- 1 in {}; Maximal ratio -- {} to 1", 231 | settings.issue_cnt, settings.issue_sparsity, settings.issue_ratio); 232 | assert!(settings.issue_ratio <= settings.assign_ratio); 233 | 234 | let assign_node_f = |x: usize, y: usize, node_len: usize, node_cov: f64| { 235 | assert!(x >= y); 236 | let tot = x + y; 237 | tot >= settings.assign_cnt 238 | && node_len <= tot * settings.assign_sparsity 239 | && ((x as f64) > settings.assign_ratio * (y as f64) - 1e-6 240 | || (node_len > solid_len 241 | && (x as f64) > settings.solid_ratio * (y as f64) - 1e-6 242 | && node_cov < solid_cov + 1e-6)) 243 | }; 244 | 245 | let issue_node_f = |x: usize, y: usize, node_len: usize| { 246 | assert!(x >= y); 247 | let tot = x + y; 248 | node_len >= settings.issue_len 249 | && tot >= settings.issue_cnt 250 | && node_len <= tot * settings.issue_sparsity 251 | && (x as f64) < settings.issue_ratio * (y as f64) - 1e-6 252 | }; 253 | 254 | for trio_info in trio_infos { 255 | let node_id = 
g.name2id(&trio_info.node_name); 256 | let node_len = g.node_length(node_id); 257 | let node_cov = g.node(node_id).coverage; 258 | debug!( 259 | "Looking at node {} (len={}), mat:pat={}", 260 | trio_info.node_name, 261 | node_len, 262 | trio_info.counts_str() 263 | ); 264 | 265 | if issue_node_f( 266 | max(trio_info.mat, trio_info.pat), 267 | min(trio_info.mat, trio_info.pat), 268 | node_len, 269 | ) { 270 | debug!("Assigning ISSUE label"); 271 | assignments.assign(node_id, TrioGroup::ISSUE, trio_info.counts_str()); 272 | } else if assign_node_f( 273 | max(trio_info.mat, trio_info.pat), 274 | min(trio_info.mat, trio_info.pat), 275 | node_len, 276 | node_cov, 277 | ) { 278 | if trio_info.mat >= trio_info.pat { 279 | debug!("Looks MATERNAL"); 280 | assignments.assign(node_id, TrioGroup::MATERNAL, trio_info.counts_str()); 281 | } else { 282 | debug!("Looks PATERNAL"); 283 | assignments.assign(node_id, TrioGroup::PATERNAL, trio_info.counts_str()); 284 | } 285 | } else { 286 | debug!("Failed to assign label based on marker counts"); 287 | } 288 | } 289 | assignments 290 | } 291 | 292 | fn parse_group(group_str: &str) -> TrioGroup { 293 | match group_str { 294 | "MATERNAL" => TrioGroup::MATERNAL, 295 | "PATERNAL" => TrioGroup::PATERNAL, 296 | "HOMOZYGOUS" => TrioGroup::HOMOZYGOUS, 297 | "ISSUE" => TrioGroup::ISSUE, 298 | _ => panic!("Invalid group string {group_str}"), 299 | } 300 | } 301 | 302 | pub fn parse_node_assignments( 303 | g: &Graph, 304 | assignments_fn: &str, 305 | ) -> std::io::Result { 306 | let mut assignments = AssignmentStorage::new(); 307 | for line in std::fs::read_to_string(assignments_fn)?.lines() { 308 | let split: Vec<&str> = line.trim().split('\t').collect(); 309 | if &split[0].to_lowercase() != "node" && &split[0].to_lowercase() != "contig" { 310 | let node_name = split[0]; 311 | let group = parse_group(split[1]); 312 | assignments.update_group(g.name2id(node_name), group); 313 | } 314 | } 315 | Ok(assignments) 316 | } 317 | 318 | const 
MAX_COMPONENT_SIZE: usize = 100; 319 | 320 | pub struct HomozygousAssigner<'a> { 321 | g: &'a Graph, 322 | assignments: AssignmentStorage, 323 | trusted_len: usize, 324 | min_suspect_cov: Option, 325 | solid_len: usize, 326 | min_solid_cov: f64, 327 | max_assign_len: usize, 328 | complex_component_size: usize, 329 | considered: HashSet, 330 | } 331 | 332 | impl<'a> HomozygousAssigner<'a> { 333 | pub fn new( 334 | g: &'a Graph, 335 | assignments: AssignmentStorage, 336 | trusted_len: usize, 337 | min_suspect_cov: Option, 338 | solid_len: usize, 339 | min_solid_cov: f64, 340 | max_assign_len: usize, 341 | ) -> HomozygousAssigner<'a> { 342 | HomozygousAssigner { 343 | g, 344 | assignments, 345 | trusted_len, 346 | min_suspect_cov, 347 | solid_len, 348 | min_solid_cov, 349 | max_assign_len, 350 | complex_component_size: MAX_COMPONENT_SIZE, 351 | considered: HashSet::new(), 352 | } 353 | } 354 | 355 | fn can_assign(&self, node_id: usize) -> bool { 356 | let n = self.g.node(node_id); 357 | if n.length > self.max_assign_len { 358 | return false; 359 | } 360 | match self.assignments.group(node_id) { 361 | None => n.length < self.solid_len || n.coverage > self.min_solid_cov - 1e-5, 362 | //TODO think if we should be able to also reclassify ISSUE nodes 363 | Some(TrioGroup::ISSUE) => false, 364 | //TODO can probably be removed / asserted if only single round allowed 365 | Some(TrioGroup::HOMOZYGOUS) => true, 366 | _ => { 367 | n.length < self.trusted_len 368 | && self.min_suspect_cov.is_some() 369 | && n.coverage > self.min_suspect_cov.unwrap() - 1e-5 370 | } 371 | } 372 | } 373 | 374 | fn exclude_complicated(&mut self) { 375 | let mut accounted_long_starts = HashSet::new(); 376 | for v in self.g.all_vertices() { 377 | if self.g.vertex_length(v) < self.trusted_len || accounted_long_starts.contains(&v) { 378 | continue; 379 | } 380 | 381 | let short_node_component = 382 | dfs::ShortNodeComponent::ahead_from_long(self.g, v, self.trusted_len); 383 | if 
short_node_component.inner.len() >= self.complex_component_size { 384 | for w in short_node_component.inner { 385 | self.considered.insert(w.node_id); 386 | } 387 | } 388 | for s in &short_node_component.sources { 389 | accounted_long_starts.insert(*s); 390 | } 391 | for t in &short_node_component.sinks { 392 | accounted_long_starts.insert(t.rc()); 393 | } 394 | } 395 | } 396 | 397 | pub fn run(mut self) -> AssignmentStorage { 398 | assert!(self.considered.is_empty()); 399 | 400 | self.exclude_complicated(); 401 | //FIXME call only on the outer bubble chains 402 | let mut marked = 0; 403 | //TODO think how it should work with generalized super-bubbles 404 | //(probably should give a chance to extend even the node is already marked) 405 | for v in self.g.all_vertices() { 406 | debug!("Considering vertex {}", self.g.v_str(v)); 407 | if !self.considered.contains(&v.node_id) 408 | && self.can_assign(v.node_id) 409 | && self.check_homozygous_neighborhood(v) 410 | { 411 | marked += self.mark_vertex_and_chains(v); 412 | } 413 | } 414 | info!("Marked {}", marked); 415 | self.assignments 416 | } 417 | 418 | fn mark_vertex_and_chains(&mut self, v: Vertex) -> usize { 419 | debug!("Marking vertex {}", self.g.v_str(v)); 420 | //hit node with existing assignment 421 | let mut marked = self.make_homozygous(v); 422 | marked += self.mark_chain_ahead(v); 423 | marked += self.mark_chain_ahead(v.rc()); 424 | debug!("Done marking"); 425 | marked 426 | } 427 | 428 | fn make_homozygous(&mut self, v: Vertex) -> usize { 429 | self.considered.insert(v.node_id); 430 | if self.can_assign(v.node_id) 431 | && self.assignments.group(v.node_id) != Some(TrioGroup::HOMOZYGOUS) 432 | { 433 | self.assignments 434 | .assign(v.node_id, TrioGroup::HOMOZYGOUS, "HomozygousAssigner"); 435 | 1 436 | } else { 437 | 0 438 | } 439 | } 440 | 441 | fn mark_chain_ahead(&mut self, v: Vertex) -> usize { 442 | //FIXME proper parameterization 443 | let params = superbubble::SbSearchParams::unrestricted(); 444 | let 
mut marked = 0; 445 | for bubble in superbubble::find_chain_ahead(self.g, v, &params) { 446 | marked += self.make_homozygous(bubble.end_vertex()); 447 | } 448 | marked 449 | } 450 | 451 | //TODO checking only one is probably enough, since iterating over all vertices 452 | fn check_homozygous_neighborhood(&self, v: Vertex) -> bool { 453 | self.check_homozygous_fork_ahead(v) || self.check_homozygous_fork_ahead(v.rc()) 454 | } 455 | 456 | //includes boundary (unvisited) vertices 457 | //returns pair of sinks and all ('inner') visited vertices 458 | //visited vertices will overlap sinks by short dead-ends 459 | fn sinks_ahead(&self, v: Vertex, node_len_thr: usize) -> (HashSet<Vertex>, HashSet<Vertex>) { 460 | let mut dfs = dfs::DFS::new_forward(self.g); 461 | dfs.set_max_node_len(node_len_thr); 462 | //inner_dfs(g, v, node_len_thr, &mut visited, &mut border); 463 | dfs.run_from(v); 464 | let mut sinks = dfs.boundary().clone(); 465 | //extend to dead-ends 466 | sinks.extend(dfs.dead_ends()); 467 | (sinks, dfs.visited()) 468 | } 469 | 470 | fn check_homozygous_fork_ahead(&self, v: Vertex) -> bool { 471 | //trick is that v no longer has to itself be long 472 | let (long_ahead, mut visited_vertices) = self.sinks_ahead(v, self.trusted_len); 473 | visited_vertices.extend(&long_ahead); 474 | let mut blended_group = None; 475 | 476 | //todo maybe check long_ahead size 477 | for v_ahead in &long_ahead { 478 | match self.assignments.group(v_ahead.node_id) { 479 | Some(TrioGroup::ISSUE) => return false, 480 | og => blended_group = TrioGroup::optional_blend(blended_group, og), 481 | }; 482 | } 483 | 484 | if blended_group != Some(TrioGroup::HOMOZYGOUS) { 485 | return false; 486 | } 487 | 488 | //check that all incoming edges go from visited vertices 489 | visited_vertices.iter().all(|&x| { 490 | x == v 491 | || self 492 | .g 493 | .incoming_edges(x) 494 | .iter() 495 | .all(|&l| visited_vertices.contains(&l.start)) 496 | }) 497 | } 498 | } 499 | 500 | pub struct TangleAssignmentSettings { 501 | pub
allow_deadend: bool, 502 | pub check_inner: bool, 503 | pub allow_reassign: bool, 504 | } 505 | 506 | impl Default for TangleAssignmentSettings { 507 | fn default() -> Self { 508 | Self { 509 | allow_deadend: false, 510 | check_inner: false, 511 | allow_reassign: true, 512 | } 513 | } 514 | } 515 | 516 | pub fn assign_short_node_tangles( 517 | g: &Graph, 518 | mut assignments: AssignmentStorage, 519 | solid_len: usize, 520 | settings: TangleAssignmentSettings, 521 | ) -> AssignmentStorage { 522 | let mut considered_boundary = HashSet::::new(); 523 | for v in g.all_vertices() { 524 | if !considered_boundary.contains(&v) && g.vertex_length(v) >= solid_len { 525 | let comp = dfs::ShortNodeComponent::ahead_from_long(g, v, solid_len); 526 | 527 | for s in comp.sources.iter() { 528 | considered_boundary.insert(*s); 529 | } 530 | for s in comp.sinks.iter() { 531 | considered_boundary.insert(s.rc()); 532 | } 533 | 534 | if !settings.allow_deadend && comp.has_deadends { 535 | continue; 536 | } 537 | 538 | if !assignments.is_definite(v.node_id) { 539 | continue; 540 | } 541 | let group = assignments.group(v.node_id).unwrap(); 542 | 543 | if comp 544 | .sources 545 | .iter() 546 | .chain(comp.sinks.iter()) 547 | .any(|&x| Some(group) != assignments.group(x.node_id)) 548 | { 549 | continue; 550 | } 551 | 552 | if settings.check_inner 553 | && comp 554 | .inner 555 | .iter() 556 | .any(|&x| group != assignments.group(x.node_id).unwrap_or(group)) 557 | { 558 | continue; 559 | } 560 | 561 | for w in comp.inner.iter() { 562 | match assignments.group(w.node_id) { 563 | None => { 564 | assignments.assign(w.node_id, group, "TangleAssignment"); 565 | } 566 | Some(g) if g == group => {} 567 | _ => { 568 | if settings.allow_reassign { 569 | assignments.assign(w.node_id, group, "TangleReAssignment"); 570 | } 571 | } 572 | } 573 | } 574 | } 575 | } 576 | assignments 577 | } 578 | 579 | #[cfg(test)] 580 | mod tests { 581 | use crate::graph::*; 582 | use crate::trio; 583 | use std::fs; 584 
| 585 | fn init() { 586 | let _ = env_logger::builder().is_test(true).try_init(); 587 | } 588 | 589 | #[test] 590 | fn homozygous_fork_test() { 591 | init(); 592 | 593 | let graph_fn = "tests/test_graphs/test1.gfa"; 594 | let assignments_fn = "tests/test_graphs/test1.ann.csv"; 595 | let g = Graph::read(&fs::read_to_string(graph_fn).unwrap()); 596 | let assignments = trio::parse_node_assignments(&g, assignments_fn).unwrap(); 597 | 598 | let assigner = 599 | trio::HomozygousAssigner::new(&g, assignments, 100_000, None, 500_000, 1.5, usize::MAX); 600 | assert!(assigner.check_homozygous_fork_ahead(Vertex::forward(g.name2id("utig4-1237")))); 601 | assert!(assigner.check_homozygous_fork_ahead(Vertex::reverse(g.name2id("utig4-1237")))); 602 | assert!(!assigner.check_homozygous_fork_ahead(Vertex::forward(g.name2id("utig4-1554")))); 603 | assert!(!assigner.check_homozygous_fork_ahead(Vertex::reverse(g.name2id("utig4-1554")))); 604 | } 605 | } 606 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use log::{debug, info, warn}; 2 | use std::collections::HashMap; 3 | use std::error::Error; 4 | use std::fs; 5 | use std::fs::File; 6 | use std::io::{BufWriter, Write}; 7 | use std::{collections::HashSet, path::PathBuf}; 8 | use trio_walk::HaploSearchSettings; 9 | 10 | //tests don't compile without the pub 11 | //FIXME what to do? 
12 | pub mod graph; 13 | pub mod graph_algos; 14 | pub mod pseudo_hap; 15 | pub mod trio; 16 | pub mod trio_walk; 17 | 18 | pub use graph::*; 19 | 20 | use crate::trio::{ 21 | assign_short_node_tangles, GroupAssignmentSettings, TangleAssignmentSettings, TrioGroup, 22 | }; 23 | use crate::trio_walk::HaploSearcher; 24 | 25 | //TODO use PathBuf 26 | #[derive(clap::Args, Debug)] 27 | pub struct TrioSettings { 28 | /// GFA file 29 | #[clap(short, long)] 30 | graph: PathBuf, 31 | 32 | /// Parental markers file 33 | #[clap(short, long)] 34 | markers: PathBuf, 35 | 36 | /// Marker-based annotation output file 37 | #[clap(long)] 38 | init_assign: Option, 39 | 40 | /// Refined annotation output file 41 | #[clap(long)] 42 | refined_assign: Option, 43 | 44 | /// Final annotation output file 45 | #[clap(long)] 46 | final_assign: Option, 47 | 48 | /// Comma separated haplotype names to be used in outputs (default: "mat,pat") 49 | #[clap(long, default_value_t = String::from("mat,pat"))] 50 | hap_names: String, 51 | 52 | /// Marker-assisted extracted haplo-paths 53 | #[clap(long, short)] 54 | paths: Option, 55 | 56 | /// Use GAF ([<>])+ format for paths 57 | #[clap(long)] 58 | gaf_format: bool, 59 | 60 | /// Minimal number of parent-specific markers required for assigning parental group to a node 61 | #[clap(long, default_value_t = 10)] 62 | marker_cnt: usize, 63 | 64 | /// Require at least (node_length / ) markers within the node for parental group assignment 65 | #[clap(long, default_value_t = 10_000)] 66 | marker_sparsity: usize, 67 | 68 | /// Sets minimal marker excess for assigning a parental group to :1 69 | #[clap(long, default_value_t = 5.0)] 70 | marker_ratio: f64, 71 | 72 | /// Longer nodes are unlikely to be spurious and likely to be reliably assigned based on markers (used in HOMOZYGOUS node labeling) 73 | #[clap(long, default_value_t = 200_000)] 74 | trusted_len: usize, 75 | 76 | /// Nodes with coverage below * can not be 'reclassified' as homozygous. 
77 | /// Negative turns off reclassification, 0. disables coverage check 78 | #[clap(long, default_value_t = 1.5)] 79 | suspect_homozygous_cov_coeff: f64, 80 | 81 | /// Longer nodes can not be classified as homozygous 82 | #[clap(long, default_value_t = 2_000_000)] 83 | max_homozygous_len: usize, 84 | 85 | //TODO maybe check that it is > trusted_len 86 | /// Longer nodes are unlikely to represent repeats, polymorphic variants, etc (used to seed and guide the path search) 87 | #[clap(long, default_value_t = 500_000)] 88 | solid_len: usize, 89 | 90 | /// Sets minimal marker excess for assigning a parental group of solid nodes to :1. 91 | /// Must be <= marker_ratio (by default == marker_ratio) 92 | #[clap(long)] 93 | solid_ratio: Option, 94 | 95 | /// Solid nodes with coverage below * can not be classified as homozygous. 96 | /// 0. disables check 97 | #[clap(long, default_value_t = 1.5)] 98 | solid_homozygous_cov_coeff: f64, 99 | 100 | /// Minimal node length for assigning ISSUE label 101 | #[clap(long, default_value_t = 50_000)] 102 | issue_len: usize, 103 | 104 | /// Minimal number of markers for assigning ISSUE label (by default == marker_cnt, will typically be set to a value >= marker_cnt) 105 | #[clap(long)] 106 | issue_cnt: Option, 107 | 108 | /// Require at least (node_length / ) markers for assigning ISSUE label (by default == marker_sparsity, will typically be set to a value >= marker_sparsity) 109 | #[clap(long)] 110 | issue_sparsity: Option, 111 | 112 | /// Require primary marker excess BELOW :1 for assigning ISSUE label. Must be <= marker_ratio (by default == marker_ratio) 113 | #[clap(long)] 114 | issue_ratio: Option, 115 | 116 | /// Try to fill in small ambiguous bubbles 117 | #[clap(long)] 118 | try_fill_bubbles: bool, 119 | 120 | /// Do not fill bubble if source or sink is non-solid, non-homozygous and has coverage above * . 121 | /// Negative disables check, 0. 
makes it fail 122 | #[clap(long, default_value_t = 1.5)] 123 | max_unique_cov_coeff: f64, 124 | 125 | /// Bubbles including a longer alternative sequence will not be filled 126 | #[clap(long, default_value_t = 50_000)] 127 | fillable_bubble_len: usize, 128 | 129 | /// Bubbles with bigger difference between alternatives' lengths will not be filled 130 | #[clap(long, default_value_t = 200)] 131 | fillable_bubble_diff: usize, 132 | 133 | /// Heterozygous bubbles including a longer alternative sequence will not be filled (by default equal to fillable_bubble_len) 134 | #[clap(long)] 135 | het_fill_bubble_len: Option, 136 | 137 | /// Heterozygous bubbles with bigger difference between alternatives' lengths will not be filled (by default equal to fillable_bubble_diff) 138 | #[clap(long)] 139 | het_fill_bubble_diff: Option, 140 | 141 | /// During bubble filling ignore simple sides of bubbles with coverage less than source/sink average divided by this value 142 | /// 0. disables check 143 | #[clap(long, default_value_t = 5.0)] 144 | good_side_cov_gap: f64, 145 | 146 | /// Minimal introducible gap size (number of Ns reported). If the gap size estimate is smaller it will be artificially increased to this value. 147 | #[clap(long, default_value_t = 1000)] 148 | min_gap_size: usize, 149 | 150 | /// Default gap size, which will be output in cases where reasonable estimate is not possible or (more likely) hasn't been implemented yet. 
151 | #[clap(long, default_value_t = 5000)] 152 | default_gap_size: usize, 153 | 154 | /// Assign tangles flanked by solid nodes from the same class 155 | #[clap(long)] 156 | assign_tangles: bool, 157 | 158 | /// Allow dead-end nodes in the tangles 159 | #[clap(long)] 160 | tangle_allow_deadend: bool, 161 | 162 | /// Check that inner tangle nodes are either unassigned or assigned to correct class 163 | #[clap(long)] 164 | tangle_check_inner: bool, 165 | 166 | /// Prevent reassignment of nodes 167 | #[clap(long)] 168 | tangle_prevent_reassign: bool, 169 | } 170 | 171 | impl TrioSettings { 172 | pub fn validate(&self) { 173 | if let Some(issue_ratio) = self.issue_ratio { 174 | assert!( 175 | issue_ratio <= self.marker_ratio, 176 | "--issue-ratio can't be set to a value higher than --marker-ratio" 177 | ); 178 | } 179 | 180 | if let Some(solid_ratio) = self.solid_ratio { 181 | assert!( 182 | solid_ratio <= self.marker_ratio, 183 | "--solid-ratio can't be set to a value higher than --marker-ratio" 184 | ); 185 | 186 | if solid_ratio < self.issue_ratio.unwrap_or(self.marker_ratio) { 187 | warn!( 188 | "Specified --solid-ratio value is smaller than --issue-ratio. \ 189 | Please double-check the logic and consider specifying smaller --issue-ratio." 
190 | ); 191 | } 192 | } 193 | 194 | assert!(self.good_side_cov_gap >= 0.); 195 | assert!(self.solid_homozygous_cov_coeff >= 0.); 196 | } 197 | } 198 | 199 | fn read_graph(graph_fn: &PathBuf) -> Result> { 200 | info!("Reading graph from {}", graph_fn.to_str().unwrap()); 201 | let g = Graph::read_sanitize(&fs::read_to_string(graph_fn)?); 202 | 203 | info!("Graph read successfully"); 204 | info!("Node count: {}", g.node_cnt()); 205 | info!("Link count: {}", g.link_cnt()); 206 | Ok(g) 207 | } 208 | 209 | fn output_coloring( 210 | g: &Graph, 211 | assignments: &trio::AssignmentStorage, 212 | file_name: &PathBuf, 213 | hap_names: &(&str, &str), 214 | ) -> Result<(), std::io::Error> { 215 | let mut output = BufWriter::new(File::create(file_name)?); 216 | writeln!(output, "node\tassignment\tlength\tinfo\tcolor")?; 217 | for (node_id, n) in g.all_nodes().enumerate() { 218 | assert!(g.name2id(&n.name) == node_id); 219 | if let Some(assign) = assignments.get(node_id) { 220 | let color = match assign.group { 221 | trio::TrioGroup::PATERNAL => "#8888FF", 222 | trio::TrioGroup::MATERNAL => "#FF8888", 223 | trio::TrioGroup::ISSUE => "#FFDE24", 224 | trio::TrioGroup::HOMOZYGOUS => "#7900D6", 225 | }; 226 | writeln!( 227 | output, 228 | "{}\t{}\t{}\t{}\t{}", 229 | n.name, 230 | group_str(Some(assign.group), hap_names).to_uppercase(), 231 | n.length, 232 | assign.info, 233 | color 234 | )?; 235 | } 236 | } 237 | Ok(()) 238 | } 239 | 240 | pub fn augment_by_path_search( 241 | g: &Graph, 242 | assignments: trio::AssignmentStorage, 243 | settings: HaploSearchSettings, 244 | ) -> trio::AssignmentStorage { 245 | info!("Augmenting node annotation by path search. Round 1."); 246 | let assignments = augment_by_path_search_round(g, assignments, settings); 247 | info!("Augmenting node annotation by path search. 
Round 2."); 248 | augment_by_path_search_round(g, assignments, settings) 249 | } 250 | 251 | fn augment_by_path_search_round( 252 | g: &Graph, 253 | assignments: trio::AssignmentStorage, 254 | settings: HaploSearchSettings, 255 | ) -> trio::AssignmentStorage { 256 | let mut path_searcher = 257 | HaploSearcher::new(g, &assignments, settings.assigning_stage_adjusted(), None); 258 | 259 | path_searcher.find_all(); 260 | let node_usage = path_searcher.take_used(); 261 | augment_assignments(g, assignments, &node_usage, true) 262 | } 263 | 264 | fn augment_assignments( 265 | g: &Graph, 266 | mut assignments: trio::AssignmentStorage, 267 | extra_assignments: &trio::AssignmentStorage, 268 | exclude_homozygous: bool, 269 | ) -> trio::AssignmentStorage { 270 | for node_id in extra_assignments.assigned() { 271 | let tentative_group = extra_assignments.group(node_id).unwrap(); 272 | assert!(tentative_group != TrioGroup::ISSUE); 273 | //any mixed assignment has chance to be erroneous due to graph issues 274 | if exclude_homozygous && !tentative_group.is_definite() { 275 | continue; 276 | } 277 | match assignments.group(node_id) { 278 | None => { 279 | debug!( 280 | "Assigning tentative group {:?} to node {}", 281 | tentative_group, 282 | g.name(node_id) 283 | ); 284 | assignments.assign(node_id, tentative_group, "PathSearch"); 285 | } 286 | Some(init_group) => { 287 | assert!(init_group == tentative_group || init_group == trio::TrioGroup::HOMOZYGOUS) 288 | } 289 | } 290 | } 291 | assignments 292 | } 293 | 294 | fn weighted_mean_solid_cov(g: &Graph, solid_len_thr: usize) -> f64 { 295 | let mut total_len = 0; 296 | let mut total_cov = 0.; 297 | for n in g.all_nodes() { 298 | if n.length >= solid_len_thr { 299 | total_len += n.length; 300 | total_cov += n.coverage * (n.length as f64); 301 | } 302 | } 303 | total_cov / total_len as f64 304 | } 305 | 306 | fn parse_hap_names(hap_names_s: &str) -> Option<(&str, &str)> { 307 | let mut split = hap_names_s.split(','); 308 | 
Some((split.next()?, split.next()?)) 309 | } 310 | 311 | fn group_str<'a>(o_g: Option, hap_names: &'a (&'a str, &'a str)) -> &'a str { 312 | match o_g { 313 | Some(TrioGroup::MATERNAL) => hap_names.0, 314 | Some(TrioGroup::PATERNAL) => hap_names.1, 315 | Some(TrioGroup::HOMOZYGOUS) => "hom", 316 | Some(TrioGroup::ISSUE) => "issue", 317 | _ => "na", 318 | } 319 | } 320 | 321 | pub fn write_paths( 322 | g: &Graph, 323 | haplo_paths: Vec, 324 | assignments: &trio::AssignmentStorage, 325 | node_usage: &trio::AssignmentStorage, 326 | output: &PathBuf, 327 | gaf_format: bool, 328 | hap_names: &(&str, &str), 329 | ) -> Result<(), std::io::Error> { 330 | //FIXME buffer 331 | let mut output = File::create(output)?; 332 | writeln!(output, "name\tpath\tassignment")?; 333 | for (path, node_id, group) in haplo_paths { 334 | assert!(path.vertices().contains(&Vertex::forward(node_id))); 335 | //info!("Identified {:?} path: {}", group, path.print(&g)); 336 | writeln!( 337 | output, 338 | "{}_from_{}\t{}\t{}", 339 | group_str(Some(group), hap_names), 340 | g.node(node_id).name, 341 | path.print_format(g, gaf_format), 342 | group_str(Some(group), hap_names).to_uppercase() 343 | )?; 344 | } 345 | 346 | let mut write_node = |n: &Node, group: Option| { 347 | writeln!( 348 | output, 349 | "{}_unused_{}\t{}\t{}", 350 | group_str(group, hap_names), 351 | n.name, 352 | Direction::format_node(&n.name, Direction::FORWARD, gaf_format), 353 | group_str(group, hap_names).to_uppercase() 354 | ) 355 | }; 356 | 357 | for (node_id, n) in g.all_nodes().enumerate() { 358 | let haplopath_assign = node_usage.group(node_id); 359 | match assignments.group(node_id) { 360 | None | Some(TrioGroup::ISSUE) => { 361 | assert!(!node_usage.contains(node_id)); 362 | debug!( 363 | "Node: {} length: {} not assigned to any haplotype (adding trivial NA path)", 364 | n.name, n.length 365 | ); 366 | write_node(g.node(node_id), None)?; 367 | } 368 | Some(assign) => { 369 | if TrioGroup::compatible(assign, 
TrioGroup::MATERNAL) 370 | //not present in haplopaths paths or incompatible 371 | && haplopath_assign.map_or(true, 372 | |x| TrioGroup::incompatible(x, TrioGroup::MATERNAL)) 373 | { 374 | debug!("Node: {} length: {} not present in MATERNAL haplo-paths (adding trivial MATERNAL path)", 375 | n.name, n.length); 376 | write_node(g.node(node_id), Some(TrioGroup::MATERNAL))?; 377 | } 378 | if TrioGroup::compatible(assign, TrioGroup::PATERNAL) 379 | //not present in haplopaths paths or incompatible 380 | && haplopath_assign.map_or(true, 381 | |x| TrioGroup::incompatible(x, TrioGroup::PATERNAL)) 382 | { 383 | debug!("Node: {} length: {} not present in PATERNAL haplo-paths (adding trivial PATERNAL path)", 384 | n.name, n.length); 385 | write_node(g.node(node_id), Some(TrioGroup::PATERNAL))?; 386 | } 387 | } 388 | } 389 | } 390 | Ok(()) 391 | } 392 | 393 | pub fn run_trio_analysis(settings: &TrioSettings) -> Result<(), Box> { 394 | let g = read_graph(&settings.graph)?; 395 | 396 | //for n in g.all_nodes() { 397 | // println!("Node: {} length: {} cov: {}", n.name, n.length, n.coverage); 398 | //} 399 | //for l in g.all_links() { 400 | // println!("Link: {}", g.l_str(l)); 401 | //} 402 | //write!(output, "{}", g.as_gfa())?; 403 | 404 | let hap_names = 405 | parse_hap_names(&settings.hap_names).expect("Problem while parsing haplotype names"); 406 | 407 | info!( 408 | "Reading trio marker information from {}", 409 | &settings.markers.to_str().unwrap() 410 | ); 411 | let trio_infos = trio::read_trio(&settings.markers)?; 412 | 413 | let solid_cov_est = weighted_mean_solid_cov(&g, settings.solid_len); 414 | if settings.suspect_homozygous_cov_coeff > 0. || settings.solid_homozygous_cov_coeff > 0. { 415 | info!("Coverage estimate based on long nodes was {solid_cov_est}"); 416 | if solid_cov_est == 0. { 417 | warn!("Looks like the graph didn't have coverage information, which we were hoping to use. 
\ 418 | Consider providing it or changing --suspect-homozygous-cov-coeff and --solid-homozygous-cov-coeff"); 419 | } 420 | } 421 | 422 | let suspect_homozygous_cov = if settings.suspect_homozygous_cov_coeff < 0. { 423 | None 424 | } else { 425 | Some(settings.suspect_homozygous_cov_coeff * solid_cov_est) 426 | }; 427 | 428 | let solid_homozygous_cov = settings.solid_homozygous_cov_coeff * solid_cov_est; 429 | 430 | info!("Assigning initial parental groups to the nodes"); 431 | let assignments = trio::assign_parental_groups( 432 | &g, 433 | &trio_infos, 434 | &GroupAssignmentSettings { 435 | assign_cnt: settings.marker_cnt, 436 | assign_sparsity: settings.marker_sparsity, 437 | assign_ratio: settings.marker_ratio, 438 | solid_ratio: settings.solid_ratio.unwrap_or(settings.marker_ratio), 439 | issue_len: settings.issue_len, 440 | issue_cnt: settings.issue_cnt.unwrap_or(settings.marker_cnt), 441 | issue_sparsity: settings.issue_sparsity.unwrap_or(settings.marker_sparsity), 442 | issue_ratio: settings.issue_ratio.unwrap_or(settings.marker_ratio), 443 | }, 444 | settings.solid_len, 445 | solid_homozygous_cov, 446 | ); 447 | 448 | let raw_cnts = trio_infos 449 | .into_iter() 450 | .map(|ti| (g.name2id(&ti.node_name), ti)) 451 | .collect::>(); 452 | 453 | if let Some(output) = &settings.init_assign { 454 | info!( 455 | "Writing initial node annotation to {}", 456 | output.to_str().unwrap() 457 | ); 458 | output_coloring(&g, &assignments, output, &hap_names)?; 459 | } 460 | 461 | info!("Marking homozygous nodes"); 462 | let assigner = trio::HomozygousAssigner::new( 463 | &g, 464 | assignments, 465 | settings.trusted_len, 466 | suspect_homozygous_cov, 467 | settings.solid_len, 468 | solid_homozygous_cov, 469 | settings.max_homozygous_len, 470 | ); 471 | 472 | let assignments = assigner.run(); 473 | 474 | let mut search_settings = HaploSearchSettings { 475 | solid_len: settings.solid_len, 476 | trusted_len: settings.trusted_len, 477 | fill_bubbles: settings.try_fill_bubbles, 
478 | fillable_bubble_len: settings.fillable_bubble_len, 479 | fillable_bubble_diff: settings.fillable_bubble_diff, 480 | het_fill_bubble_len: settings 481 | .het_fill_bubble_len 482 | .unwrap_or(settings.fillable_bubble_len), 483 | het_fill_bubble_diff: settings 484 | .het_fill_bubble_diff 485 | .unwrap_or(settings.fillable_bubble_diff), 486 | good_side_cov_gap: settings.good_side_cov_gap, 487 | min_gap_size: settings.min_gap_size as i64, 488 | default_gap_size: settings.default_gap_size as i64, 489 | ..HaploSearchSettings::default() 490 | }; 491 | 492 | if search_settings.fill_bubbles { 493 | info!("Will try filling small bubbles"); 494 | //assert!(settings.max_unique_cov_coeff >= 0.); 495 | if settings.max_unique_cov_coeff < 0. { 496 | //leaving default 497 | search_settings.max_unique_cov = f64::MAX; 498 | info!("Negative '--max-unique-cov-coeff' provided. All nodes will be considered unique for purposes of bubble filling"); 499 | } 500 | if settings.max_unique_cov_coeff > 0. && solid_cov_est == 0. { 501 | warn!("Looks like the graph didn't have coverage information, which we were hoping to use. Consider providing it or changing --max-unique-cov-coeff"); 502 | } 503 | search_settings.max_unique_cov = settings.max_unique_cov_coeff * solid_cov_est; 504 | info!( 505 | "Maximal 'unique' coverage for bubble filling set to {}", 506 | search_settings.max_unique_cov 507 | ); 508 | if search_settings.max_unique_cov == 0. 
{
            info!("Will only fill bubbles between solid or homozygous nodes");
        }
    }

    let assignments = augment_by_path_search(&g, assignments, search_settings);

    let assignments = if settings.assign_tangles {
        assign_short_node_tangles(
            &g,
            assignments,
            settings.solid_len,
            TangleAssignmentSettings {
                allow_deadend: settings.tangle_allow_deadend,
                check_inner: settings.tangle_check_inner,
                allow_reassign: !settings.tangle_prevent_reassign,
            },
        )
    } else {
        assignments
    };

    if let Some(output) = &settings.refined_assign {
        info!(
            "Writing refined node annotation to {}",
            output.to_str().unwrap()
        );
        output_coloring(&g, &assignments, output, &hap_names)?;
    }
    let mut path_searcher = HaploSearcher::new(&g, &assignments, search_settings, Some(&raw_cnts));

    let haplo_paths = path_searcher.find_all();
    let node_usage = path_searcher.take_used();

    let assignments = augment_assignments(&g, assignments, &node_usage, false);

    if let Some(output) = &settings.final_assign {
        info!(
            "Writing final node annotation to {}",
            output.to_str().unwrap()
        );
        output_coloring(&g, &assignments, output, &hap_names)?;
    }

    if let Some(output) = &settings.paths {
        info!("Outputting haplo-paths to {}", output.to_str().unwrap());
        write_paths(
            &g,
            haplo_paths,
            &assignments,
            &node_usage,
            output,
            settings.gaf_format,
            &hap_names,
        )?;
    }

    info!("All done");
    Ok(())
}

/// Runs the primary/alt analysis: decomposes the graph into linear blocks via
/// `pseudo_hap::pseudo_hap_decompose` and optionally writes two tab-separated reports.
///
/// * `graph_fn` - path of the graph file, passed to `read_graph`.
/// * `colors_fn` - if set, a per-node table (`node`, `length`, `assignment`, `color`) is
///   written there. Nodes are labelled `PRIMARY_BOUNDARY` (first/last node of a block's
///   instance path), `PRIMARY` (other nodes of an instance path), `ALT` (known alt nodes
///   of a block) or `NA`.
/// * `paths_fn` - if set, a per-path table (`name`, `len`, `path`, `assignment`) is written
///   there: one `primary_<i>` row per block, one `alt_<i>_<j>` row per known alt node, and
///   one `unused_<name>` row for every node not reported by any block's `all_nodes()`.
/// * `gaf_paths` - selects the GAF-style path notation in the paths table.
///
/// # Errors
/// Propagates errors from reading the graph and from creating or writing the output files.
///
/// # Panics
/// Panics if a boundary or primary node is also listed as an alt node, or if the
/// name-to-id mapping of the graph disagrees with node enumeration order.
pub fn run_primary_alt_analysis(
    graph_fn: &PathBuf,
    colors_fn: &Option<String>,
    paths_fn: &Option<String>,
    gaf_paths: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let g = read_graph(graph_fn)?;
    let unique_block_len = 500_000;
    let linear_blocks = pseudo_hap::pseudo_hap_decompose(&g, unique_block_len);

    if let Some(output) = colors_fn {
        info!("Writing node colors to {}", output);
        // Buffered: one row is written per graph node.
        let mut output = std::io::BufWriter::new(File::create(output)?);

        let mut primary_nodes = HashSet::new();
        let mut alt_nodes = HashSet::new();
        let mut boundary_nodes = HashSet::new();

        for block in &linear_blocks {
            let p = block.instance_path();
            primary_nodes.extend(p.vertices().iter().map(|&v| v.node_id));
            alt_nodes.extend(block.known_alt_nodes().iter().copied());
            boundary_nodes.extend([p.start().node_id, p.end().node_id]);
        }

        writeln!(output, "node\tlength\tassignment\tcolor")?;
        for (node_id, n) in g.all_nodes().enumerate() {
            assert!(g.name2id(&n.name) == node_id);
            // Boundary takes precedence over primary, primary over alt.
            let (assign, color) = if boundary_nodes.contains(&node_id) {
                assert!(!alt_nodes.contains(&node_id));
                ("PRIMARY_BOUNDARY", "#fbb117")
            } else if primary_nodes.contains(&node_id) {
                assert!(!alt_nodes.contains(&node_id));
                ("PRIMARY", "#8888FF")
            } else if alt_nodes.contains(&node_id) {
                ("ALT", "#FF8888")
            } else {
                ("NA", "#808080")
            };
            writeln!(output, "{}\t{}\t{}\t{}", n.name, n.length, assign, color)?;
        }
        // Flush explicitly: errors during an implicit flush on drop would be lost.
        output.flush()?;
    }

    if let Some(output) = paths_fn {
        info!("Outputting paths in {}", output);
        let mut output = std::io::BufWriter::new(File::create(output)?);

        // Must be collected before `linear_blocks` is consumed by the loop below.
        let used: HashSet<usize> = linear_blocks.iter().flat_map(|b| b.all_nodes()).collect();

        writeln!(output, "name\tlen\tpath\tassignment")?;

        for (block_id, block) in linear_blocks.into_iter().enumerate() {
            writeln!(
                output,
                "primary_{}\t{}\t{}\tPRIMARY",
                block_id,
                block.instance_path().total_length(&g),
                block.instance_path().print_format(&g, gaf_paths)
            )?;
            for (alt_id, &known_alt) in block.known_alt_nodes().iter().enumerate() {
                writeln!(
                    output,
                    "alt_{}_{}\t{}\t{}\tALT",
                    block_id,
                    alt_id,
                    g.node(known_alt).length,
                    Path::new(Vertex::forward(known_alt)).print_format(&g, gaf_paths)
                )?;
            }
        }

        for (node_id, n) in g.all_nodes().enumerate() {
            if !used.contains(&node_id) {
                writeln!(
                    output,
                    "unused_{}\t{}\t{}\tNA",
                    n.name,
                    n.length,
                    Path::new(Vertex::forward(node_id)).print_format(&g, gaf_paths)
                )?;
            }
        }
        output.flush()?;
    }

    info!("All done");
    Ok(())
}

// --------------------------------------------------------------------------------
// /src/graph.rs:
// --------------------------------------------------------------------------------
use log::warn;
use std::collections::HashMap;
use std::str;

/// Orientation of a node within the graph (`+` / `-` in GFA notation).
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
pub enum Direction {
    FORWARD,
    REVERSE,
}

impl Direction {
    /// Returns the opposite orientation.
    pub fn flip(d: Direction) -> Direction {
        match d {
            Self::FORWARD => Self::REVERSE,
            Self::REVERSE => Self::FORWARD,
        }
    }

    /// Parses `'+'` / `'-'`; panics on any other character.
    fn parse_char(c: char) -> Direction {
        match c {
            '+' => Self::FORWARD,
            '-' => Self::REVERSE,
            _ => panic!("Unknown direction {c}"),
        }
    }

    /// Parses a one-character orientation string (`"+"` / `"-"`); panics otherwise.
    fn parse(s: &str) -> Direction {
        assert!(s.len() == 1, "Unknown direction {s}");
        Self::parse_char(s.chars().next().unwrap())
    }

    /// Orientation sign used in the default (GFA-like) notation.
    pub fn str(d: Direction) -> &'static str {
        match d {
            Self::FORWARD => "+",
            Self::REVERSE => "-",
        }
    }

    /// Orientation sign used in the GAF path notation.
    pub fn gaf_str(d: Direction) -> &'static str {
        match d {
            Self::FORWARD => ">",
            Self::REVERSE => "<",
        }
    }

    /// Formats an oriented node: `>name` / `<name` when `gaf` is set, `name+` / `name-`
    /// otherwise.
    pub fn format_node(name: &str, d: Direction, gaf: bool) -> String {
        if gaf {
            format!("{}{}", Direction::gaf_str(d), name)
        } else {
            format!("{}{}", name, Direction::str(d))
        }
    }
}

/// A graph node (segment): its name, length and coverage.
#[derive(Clone)]
pub struct Node {
    //node size
    pub name: String,
    pub length: usize,
    pub coverage: f64,
}

/// An oriented node: node id plus direction.
//TODO which ones are redundant?
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
pub struct Vertex {
    //node id
    pub node_id: usize,
    //direction
    pub direction: Direction,
}

impl Vertex {
    /// The forward-oriented vertex of node `node_id`.
    pub fn forward(node_id: usize) -> Vertex {
        Vertex {
            node_id,
            direction: Direction::FORWARD,
        }
    }

    /// The reverse-oriented vertex of node `node_id`.
    pub fn reverse(node_id: usize) -> Vertex {
        Vertex {
            node_id,
            direction: Direction::REVERSE,
        }
    }

    /// Reverse complement: same node, flipped direction.
    pub fn rc(&self) -> Vertex {
        Vertex {
            node_id: self.node_id,
            direction: Direction::flip(self.direction),
        }
    }
}

/// A directed link between two vertices with the size of their overlap.
//TODO support link coverage!
//TODO separate 'links' and 'edges'
//links will have overlap size, CIGAR, etc
//edges will represent a Vertex pair
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd)]
pub struct Link {
    pub start: Vertex,
    pub end: Vertex,
    pub overlap: usize,
}

impl Link {
    /// Reverse complement: endpoints are swapped and flipped, the overlap is kept.
    pub fn rc(&self) -> Link {
        Link {
            start: self.end.rc(),
            end: self.start.rc(),
            overlap: self.overlap,
        }
    }

    //fn is_canonical(&self) -> bool {
    //    self <= &self.rc()
    //}

    //fn join_same(l1: &Link, l2: &Link) -> bool {
    //    l1.start == l2.start && l1.end == l2.end
    //}

    //fn parallel(l1: &Link, l2: &Link) -> bool {
    //    Self::join_same(l1, l2) || Self::join_same(l1, &l2.rc())
    //}
}

/// Bidirected graph: nodes plus, for every node, the links entering and leaving its
/// forward-oriented vertex. The three vectors are indexed by node id.
pub struct Graph {
    nodes: Vec<Node>,
    //TODO storage is excessive, should only store neighbor
    //incoming & outgoing links for every node
    incoming_links: Vec<Vec<Link>>,
    outgoing_links: Vec<Vec<Link>>,
    //TODO switch to &str and figure out how to work with lifetimes
    name2ids: HashMap<String, usize>,
}

//TODO think about useful iterators and reimplement this one via composition
//FIXME improve when learn
how to store iterator as a field :) 140 | struct AllLinkIter<'a> { 141 | g: &'a Graph, 142 | curr_node: usize, 143 | incoming_flag: bool, 144 | pos: usize, 145 | } 146 | 147 | impl<'a> AllLinkIter<'a> { 148 | fn new(g: &'a Graph) -> AllLinkIter<'a> { 149 | AllLinkIter { 150 | g, 151 | curr_node: 0, 152 | incoming_flag: true, 153 | pos: 0, 154 | } 155 | } 156 | } 157 | 158 | impl<'a> Iterator for AllLinkIter<'a> { 159 | type Item = Link; 160 | 161 | fn next(&mut self) -> Option { 162 | while self.curr_node < self.g.node_cnt() { 163 | if self.incoming_flag { 164 | let links = &self.g.incoming_links[self.curr_node]; 165 | assert!(self.pos <= links.len()); 166 | if self.pos < links.len() { 167 | let link = links[self.pos]; 168 | assert!(link.end.node_id == self.curr_node); 169 | self.pos += 1; 170 | if link.end < link.start { 171 | return Some(link); 172 | } 173 | } else { 174 | self.incoming_flag = false; 175 | self.pos = 0; 176 | } 177 | } else { 178 | let links = &self.g.outgoing_links[self.curr_node]; 179 | assert!(self.pos <= links.len()); 180 | if self.pos < links.len() { 181 | let link = links[self.pos]; 182 | assert!(link.start.node_id == self.curr_node); 183 | self.pos += 1; 184 | if link.start <= link.end { 185 | return Some(link); 186 | } 187 | } else { 188 | self.incoming_flag = true; 189 | self.pos = 0; 190 | self.curr_node += 1; 191 | } 192 | } 193 | } 194 | None 195 | } 196 | } 197 | 198 | struct VertexIter<'a> { 199 | g: &'a Graph, 200 | curr_node: usize, 201 | forward_flag: bool, 202 | } 203 | 204 | impl<'a> VertexIter<'a> { 205 | fn new(g: &'a Graph) -> VertexIter<'a> { 206 | VertexIter { 207 | g, 208 | curr_node: 0, 209 | forward_flag: true, 210 | } 211 | } 212 | } 213 | 214 | impl<'a> Iterator for VertexIter<'a> { 215 | type Item = Vertex; 216 | 217 | fn next(&mut self) -> Option { 218 | if self.curr_node < self.g.node_cnt() { 219 | if self.forward_flag { 220 | self.forward_flag = false; 221 | return Some(Vertex::forward(self.curr_node)); 222 | } 
else { 223 | let node_id = self.curr_node; 224 | self.forward_flag = true; 225 | self.curr_node += 1; 226 | return Some(Vertex::reverse(node_id)); 227 | } 228 | } 229 | None 230 | } 231 | } 232 | 233 | impl Default for Graph { 234 | fn default() -> Self { 235 | Self::new() 236 | } 237 | } 238 | 239 | impl Graph { 240 | pub fn new() -> Graph { 241 | Graph { 242 | nodes: Vec::new(), 243 | incoming_links: Vec::new(), 244 | outgoing_links: Vec::new(), 245 | name2ids: HashMap::new(), 246 | } 247 | } 248 | 249 | pub fn node_cnt(&self) -> usize { 250 | self.nodes.len() 251 | } 252 | 253 | pub fn node_iter(&self) -> std::slice::Iter { 254 | self.nodes.iter() 255 | } 256 | 257 | pub fn add_node(&mut self, node: Node) -> usize { 258 | //TODO rewrite without cloning with lifetimes 259 | let node_id = self.nodes.len(); 260 | self.name2ids.insert(node.name.clone(), node_id); 261 | self.nodes.push(node); 262 | self.incoming_links.push(Vec::new()); 263 | self.outgoing_links.push(Vec::new()); 264 | node_id 265 | } 266 | 267 | pub fn add_link(&mut self, link: Link) { 268 | //FIXME Currently doesn't check that every link is represented only once 269 | //TODO Think of some nice 'views' for vectors that will reverse complement everything put 270 | //there 271 | match link.start.direction { 272 | Direction::FORWARD => self.outgoing_links[link.start.node_id].push(link), 273 | Direction::REVERSE => self.incoming_links[link.start.node_id].push(link.rc()), 274 | }; 275 | 276 | if link == link.rc() { 277 | return; 278 | }; 279 | 280 | match link.end.direction { 281 | Direction::FORWARD => self.incoming_links[link.end.node_id].push(link), 282 | Direction::REVERSE => self.outgoing_links[link.end.node_id].push(link.rc()), 283 | }; 284 | } 285 | 286 | //FIXME add this check within add_link function 287 | fn check_links(&self) { 288 | assert!(self.nodes.len() == self.incoming_links.len()); 289 | assert!(self.nodes.len() == self.outgoing_links.len()); 290 | for (node_id, _) in 
self.all_nodes().enumerate() { 291 | let v = Vertex::forward(node_id); 292 | assert!( 293 | self.incoming_links[node_id] 294 | .iter() 295 | .filter(|l| l.end != v) 296 | .count() 297 | == 0, 298 | "Problem with incoming links for node {}", 299 | self.nodes[node_id].name 300 | ); 301 | assert!( 302 | self.outgoing_links[node_id] 303 | .iter() 304 | .filter(|l| l.start != v) 305 | .count() 306 | == 0, 307 | "Problem with incoming links for node {}", 308 | self.nodes[node_id].name 309 | ); 310 | } 311 | } 312 | 313 | //TODO switch to iterator? 314 | fn parse_tag(fields: &[&str], prefix: &str) -> Option { 315 | fields 316 | .iter() 317 | .filter(|s| s.starts_with(prefix)) 318 | .map(|s| match s[prefix.len()..].parse::() { 319 | Ok(t) => t, 320 | Err(_) => panic!("Couldn't parse tag {s}"), 321 | }) 322 | .next() 323 | } 324 | 325 | fn parse_overlap(cigar: &str) -> usize { 326 | assert!(cigar.ends_with('M'), "Invalid overlap {cigar}"); 327 | let ovl = &cigar[..(cigar.len() - 1)]; 328 | ovl.trim().parse().expect("Invalid overlap") 329 | } 330 | 331 | //TODO switch to something iterable 332 | pub fn custom_read( 333 | graph_str: &str, 334 | collapse_multi_edges: bool, 335 | normalize_overlaps: bool, 336 | ) -> Graph { 337 | let mut g = Self::new(); 338 | 339 | for line in graph_str.lines() { 340 | if line.starts_with("S\t") { 341 | let split: Vec<&str> = line.split('\t').collect(); 342 | //println!("Node line {:?}", split); 343 | let name = String::from(split[1]); 344 | let tags = &split[3..split.len()]; 345 | let length = if split[2] != "*" { 346 | split[2].trim().len() 347 | } else { 348 | Self::parse_tag(tags, "LN:i:").expect("Neither sequence nor LN tag provided") 349 | }; 350 | assert!(length > 0); 351 | let coverage = match Self::parse_tag::(tags, "RC:i:") 352 | .or_else(|| Self::parse_tag::(tags, "FC:i:")) 353 | { 354 | None => Self::parse_tag(tags, "ll:f:").unwrap_or(0.), 355 | Some(raw_cnt) => raw_cnt as f64 / length as f64, 356 | }; 357 | g.add_node(Node { 358 | 
name, 359 | length, 360 | coverage, 361 | }); 362 | } 363 | } 364 | 365 | for line in graph_str.lines() { 366 | if line.starts_with("L\t") { 367 | let split: Vec<&str> = line.trim().split('\t').collect(); 368 | //println!("Link line {:?}", split); 369 | let start = Vertex { 370 | node_id: g.name2id(split[1]), 371 | direction: Direction::parse(split[2]), 372 | }; 373 | let end = Vertex { 374 | node_id: g.name2id(split[3]), 375 | direction: Direction::parse(split[4]), 376 | }; 377 | let mut overlap = Self::parse_overlap(split[5]); 378 | if collapse_multi_edges { 379 | if let Some(connect) = g.connector(start, end) { 380 | if connect.overlap != overlap { 381 | warn!("Multiple links connecting {} and {} with different overlap sizes ({} and {})" 382 | , g.v_str(start), g.v_str(end), overlap, connect.overlap) 383 | } 384 | continue; 385 | } 386 | } 387 | let max_ovl = std::cmp::min(g.vertex_length(start), g.vertex_length(end)) - 1; 388 | if overlap > max_ovl { 389 | assert!( 390 | normalize_overlaps, 391 | "Invalid (too long) overlap of size {} between {} and {}", 392 | overlap, 393 | g.v_str(start), 394 | g.v_str(end) 395 | ); 396 | warn!( 397 | "Normalizing overlap between {} and {} ({} -> {})", 398 | g.v_str(start), 399 | g.v_str(end), 400 | overlap, 401 | max_ovl 402 | ); 403 | overlap = max_ovl; 404 | } 405 | g.add_link(Link { 406 | start, 407 | end, 408 | overlap, 409 | }); 410 | } 411 | } 412 | g.check_links(); 413 | g 414 | } 415 | 416 | pub fn as_gfa(&self) -> String { 417 | let mut gfa = String::new(); 418 | 419 | for n in self.all_nodes() { 420 | gfa += &format!( 421 | "S\t{}\t*\tLN:i:{}\tRC:i:{}\tll:f:{:.1}\n", 422 | n.name, 423 | n.length, 424 | (n.coverage * n.length as f64).round() as u64, 425 | n.coverage 426 | ); 427 | } 428 | 429 | for l in self.all_links() { 430 | gfa += &format!( 431 | "L\t{}\t{}\t{}\t{}\t{}M\n", 432 | self.node(l.start.node_id).name, 433 | Direction::str(l.start.direction), 434 | self.node(l.end.node_id).name, 435 | 
Direction::str(l.end.direction), 436 | l.overlap 437 | ); 438 | } 439 | 440 | gfa 441 | } 442 | 443 | pub fn read(graph_str: &str) -> Self { 444 | Self::custom_read(graph_str, false, false) 445 | } 446 | 447 | pub fn read_sanitize(graph_str: &str) -> Self { 448 | Self::custom_read(graph_str, true, true) 449 | } 450 | 451 | //fn get_vertex(&self, name: &str, direction: Direction) -> Vertex { 452 | // let node_id = self.name2id(name); 453 | // Vertex {node_id, direction} 454 | //} 455 | 456 | fn rc(links: &[Link]) -> Vec { 457 | links.iter().map(|x| x.rc()).collect() 458 | } 459 | 460 | pub fn node(&self, node_id: usize) -> &Node { 461 | &self.nodes[node_id] 462 | } 463 | 464 | pub fn node_length(&self, node_id: usize) -> usize { 465 | self.node(node_id).length 466 | } 467 | 468 | pub fn vertex_length(&self, v: Vertex) -> usize { 469 | self.node_length(v.node_id) 470 | } 471 | 472 | pub fn node_by_name(&self, name: &str) -> &Node { 473 | &self.nodes[self.name2id(name)] 474 | } 475 | 476 | pub fn name(&self, node_id: usize) -> &str { 477 | &self.node(node_id).name 478 | } 479 | 480 | //TODO switch to iterators when learn enough Rust :) 481 | pub fn outgoing_edge_cnt(&self, v: Vertex) -> usize { 482 | match v.direction { 483 | Direction::FORWARD => self.outgoing_links[v.node_id].len(), 484 | Direction::REVERSE => self.incoming_links[v.node_id].len(), 485 | } 486 | } 487 | 488 | //TODO switch to iterators when learn enough Rust :) 489 | pub fn outgoing_edges(&self, v: Vertex) -> Vec { 490 | match v.direction { 491 | Direction::FORWARD => self.outgoing_links[v.node_id].clone(), 492 | Direction::REVERSE => Self::rc(&self.incoming_links[v.node_id]), 493 | } 494 | } 495 | 496 | pub fn incoming_edge_cnt(&self, v: Vertex) -> usize { 497 | self.outgoing_edge_cnt(v.rc()) 498 | } 499 | 500 | //TODO switch to iterators when learn enough Rust :) 501 | pub fn incoming_edges(&self, v: Vertex) -> Vec { 502 | match v.direction { 503 | Direction::FORWARD => 
self.incoming_links[v.node_id].clone(), 504 | Direction::REVERSE => Self::rc(&self.outgoing_links[v.node_id]), 505 | } 506 | } 507 | 508 | pub fn name2id(&self, name: &str) -> usize { 509 | match self.name2ids.get(name) { 510 | Some(&id) => id, 511 | None => panic!("Node {name} is not in the graph"), 512 | } 513 | } 514 | 515 | //TODO iterate over references 516 | pub fn all_links(&self) -> impl Iterator + '_ { 517 | AllLinkIter::new(self) 518 | } 519 | 520 | pub fn all_nodes(&self) -> impl Iterator + '_ { 521 | self.nodes.iter() 522 | } 523 | 524 | //TODO iterate over references 525 | pub fn all_vertices(&self) -> impl Iterator + '_ { 526 | VertexIter::new(self) 527 | } 528 | 529 | //TODO iterate over references 530 | pub fn canonic_vertices(&self) -> impl Iterator + '_ { 531 | (1..self.node_cnt()).map(Vertex::forward) 532 | } 533 | 534 | pub fn link_cnt(&self) -> usize { 535 | self.all_links().count() 536 | } 537 | 538 | //note that the graph supports multi-edges, 539 | // if they are present returns only the first one 540 | pub fn connector(&self, v: Vertex, w: Vertex) -> Option { 541 | self.outgoing_edges(v).into_iter().find(|&l| l.end == w) 542 | } 543 | 544 | pub fn v_str_format(&self, v: Vertex, gaf: bool) -> String { 545 | Direction::format_node(&self.node(v.node_id).name, v.direction, gaf) 546 | } 547 | 548 | pub fn v_str(&self, v: Vertex) -> String { 549 | self.v_str_format(v, false) 550 | } 551 | 552 | //pub fn gaf_str(&self, v: Vertex) -> String { 553 | // format!("{}{}", Direction::gaf_str(v.direction), self.node(v.node_id).name) 554 | //} 555 | 556 | pub fn l_str(&self, l: Link) -> String { 557 | format!("{}->{}", self.v_str(l.start), self.v_str(l.end)) 558 | } 559 | } 560 | 561 | #[derive(Clone, Debug, PartialEq, PartialOrd)] 562 | pub struct GapInfo { 563 | pub start: Vertex, 564 | pub end: Vertex, 565 | pub gap_size: i64, 566 | pub info: String, 567 | } 568 | 569 | impl GapInfo { 570 | pub fn rc(&self) -> GapInfo { 571 | GapInfo { 572 | start: 
self.end.rc(), 573 | end: self.start.rc(), 574 | gap_size: self.gap_size, 575 | info: self.info.clone(), 576 | } 577 | } 578 | } 579 | 580 | #[derive(Clone, Debug, PartialEq, PartialOrd)] 581 | pub enum GeneralizedLink { 582 | LINK(Link), 583 | GAP(GapInfo), 584 | } 585 | 586 | //TODO think of refactoring 587 | impl GeneralizedLink { 588 | pub fn start(&self) -> Vertex { 589 | match &self { 590 | Self::LINK(l) => l.start, 591 | Self::GAP(g) => g.start, 592 | } 593 | } 594 | 595 | pub fn end(&self) -> Vertex { 596 | match &self { 597 | Self::LINK(l) => l.end, 598 | Self::GAP(g) => g.end, 599 | } 600 | } 601 | 602 | //pub fn is_gap(&self) -> bool { 603 | // match &self { 604 | // Self::LINK(l) => false, 605 | // Self::GAP(g) => true, 606 | // Self::AMBIG(a) => true, 607 | // } 608 | //} 609 | 610 | pub fn overlap(&self) -> i64 { 611 | match &self { 612 | Self::LINK(l) => l.overlap as i64, 613 | Self::GAP(g) => -g.gap_size, 614 | } 615 | } 616 | 617 | pub fn rc(&self) -> GeneralizedLink { 618 | match &self { 619 | Self::LINK(l) => Self::LINK(l.rc()), 620 | Self::GAP(g) => Self::GAP(g.rc()), 621 | } 622 | } 623 | } 624 | 625 | #[derive(Clone)] 626 | pub struct Path { 627 | v_storage: Vec, 628 | l_storage: Vec, 629 | } 630 | 631 | //Never empty! 
Use None instead 632 | impl Path { 633 | pub fn new(init_v: Vertex) -> Path { 634 | Path { 635 | v_storage: vec![init_v], 636 | l_storage: Vec::new(), 637 | } 638 | } 639 | 640 | pub fn from_link(l: Link) -> Path { 641 | Self::from_general_link(GeneralizedLink::LINK(l)) 642 | } 643 | 644 | pub fn from_general_link(l: GeneralizedLink) -> Path { 645 | //assert!(l.start().node_id != l.end().node_id); 646 | Path { 647 | v_storage: vec![l.start(), l.end()], 648 | l_storage: vec![l], 649 | } 650 | } 651 | 652 | pub fn vertices(&self) -> &Vec { 653 | &self.v_storage 654 | } 655 | 656 | pub fn start(&self) -> Vertex { 657 | self.v_storage[0] 658 | } 659 | 660 | pub fn end(&self) -> Vertex { 661 | self.v_storage[self.v_storage.len() - 1] 662 | } 663 | 664 | pub fn len(&self) -> usize { 665 | self.v_storage.len() 666 | } 667 | 668 | pub fn is_empty(&self) -> bool { 669 | self.v_storage.is_empty() 670 | } 671 | 672 | pub fn general_link_at(&self, idx: usize) -> &GeneralizedLink { 673 | &self.l_storage[idx] 674 | } 675 | 676 | pub fn link_at(&self, idx: usize) -> Link { 677 | match self.general_link_at(idx) { 678 | GeneralizedLink::LINK(l) => *l, 679 | _ => panic!("Not an actual graph link at index {idx}"), 680 | } 681 | } 682 | 683 | pub fn links(&self) -> &Vec { 684 | &self.l_storage 685 | } 686 | 687 | //TODO rename to rc?:write! 
688 | pub fn reverse_complement(self) -> Path { 689 | //TODO optimize since consuming self 690 | Path { 691 | v_storage: self.v_storage.iter().rev().map(|v| v.rc()).collect(), 692 | l_storage: self.l_storage.iter().rev().map(|l| l.rc()).collect(), 693 | } 694 | } 695 | 696 | pub fn trim(&mut self, step: usize) { 697 | assert!(step < self.len()); 698 | //TODO optimize 699 | for _ in 0..step { 700 | self.v_storage.pop(); 701 | //it's ok to pop even if it is empty 702 | self.l_storage.pop(); 703 | } 704 | } 705 | 706 | pub fn trim_to(&mut self, v: &Vertex) -> bool { 707 | //TODO optimize 708 | if self.v_storage.contains(v) { 709 | while self.v_storage.last().unwrap() != v { 710 | self.v_storage.pop(); 711 | self.l_storage.pop(); 712 | } 713 | return true; 714 | } 715 | false 716 | } 717 | 718 | //TODO rename 719 | pub fn append_general(&mut self, l: GeneralizedLink) { 720 | assert!(self.v_storage.last().unwrap() == &l.start()); 721 | //TODO disable expensive assert? 722 | debug_assert!(!self.in_path(l.end().node_id)); 723 | self.v_storage.push(l.end()); 724 | self.l_storage.push(l); 725 | } 726 | 727 | //TODO rename 728 | pub fn append(&mut self, l: Link) { 729 | self.append_general(GeneralizedLink::LINK(l)); 730 | } 731 | 732 | //TODO rename? 733 | //TODO optimize (can work directly with vectors) 734 | //NB does not support intersecting paths (e.g. 
forming loop) 735 | pub fn extend(&mut self, other: Path) { 736 | assert!(self.v_storage.last().unwrap() == other.v_storage.first().unwrap()); 737 | for l in other.l_storage { 738 | self.append_general(l); 739 | } 740 | } 741 | 742 | pub fn in_path(&self, node_id: usize) -> bool { 743 | self.v_storage.iter().any(|v| v.node_id == node_id) 744 | } 745 | 746 | pub fn can_merge_in(&self, path: &Path) -> bool { 747 | assert!(self.v_storage.last() == path.v_storage.first()); 748 | !path.l_storage.iter().any(|l| self.in_path(l.end().node_id)) 749 | } 750 | 751 | pub fn merge_in(&mut self, path: Path) { 752 | assert!(self.can_merge_in(&path)); 753 | for l in path.l_storage { 754 | self.append_general(l); 755 | } 756 | } 757 | 758 | pub fn print(&self, g: &Graph) -> String { 759 | //self.v_storage.iter().map(|&v| g.v_str(v)).join(",") 760 | self.print_format(g, false) 761 | } 762 | 763 | //fn print_ungapped(g: &Graph, vertices: &[Vertex], gaf: bool) -> String { 764 | // let delim = if gaf { "" } else { "," }; 765 | // vertices.iter().map(|&v| g.v_str_format(v, gaf)).join(delim) 766 | //} 767 | 768 | pub fn print_format(&self, g: &Graph, gaf: bool) -> String { 769 | let delim = if gaf { "" } else { "," }; 770 | let mut ans = String::new(); 771 | for (i, &v) in self.v_storage.iter().enumerate() { 772 | if i > 0 { 773 | match &self.l_storage[i - 1] { 774 | GeneralizedLink::GAP(gap_info) => { 775 | ans += delim; 776 | ans += &format!("[N{}N:{}]", gap_info.gap_size, gap_info.info); 777 | } 778 | GeneralizedLink::LINK(_) => {} 779 | }; 780 | } 781 | if i > 0 { 782 | ans += delim; 783 | } 784 | ans += &g.v_str_format(v, gaf); 785 | } 786 | ans 787 | } 788 | 789 | pub fn total_length(&self, g: &Graph) -> usize { 790 | let mut tot_length = g.vertex_length(self.v_storage[0]) as i64; 791 | for l in &self.l_storage { 792 | tot_length += g.vertex_length(l.end()) as i64 - l.overlap(); 793 | } 794 | tot_length as usize 795 | } 796 | 797 | pub fn check_subpath(&self, other: &Path, 
start_pos: usize) -> bool { 798 | if self.len() < start_pos + other.len() { 799 | return false; 800 | } 801 | match other.len() { 802 | 1 => self.v_storage[start_pos] == other.start(), 803 | _ => self.l_storage[start_pos..(start_pos + other.len() - 1)] == other.l_storage, 804 | } 805 | } 806 | 807 | pub fn check_subpath_rc(&self, other: &Path, start_pos: usize) -> bool { 808 | if start_pos < (other.len() - 1) { 809 | return false; 810 | } 811 | self.check_subpath( 812 | &other.clone().reverse_complement(), 813 | start_pos - (other.len() - 1), 814 | ) 815 | } 816 | } 817 | --------------------------------------------------------------------------------