├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── crates ├── api │ ├── Cargo.lock │ ├── Cargo.toml │ ├── example │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ └── src │ │ ├── lib.rs │ │ └── utils.rs ├── assembler │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ ├── pipeline.rs │ │ ├── pipeline │ │ ├── build_unitigs.rs │ │ ├── compute_matchtigs.rs │ │ ├── eulertigs.rs │ │ ├── hashes_sorting.rs │ │ ├── links_compaction.rs │ │ ├── maximal_unitig_links.rs │ │ ├── maximal_unitig_links │ │ │ ├── mappings_loader.rs │ │ │ ├── maximal_hash_entry.rs │ │ │ └── maximal_unitig_index.rs │ │ └── reorganize_reads.rs │ │ ├── structs.rs │ │ └── structs │ │ └── link_mapping.rs ├── assembler_kmers_merge │ ├── Cargo.toml │ └── src │ │ ├── final_executor.rs │ │ ├── lib.rs │ │ ├── map_processor.rs │ │ └── structs.rs ├── assembler_minimizer_bucketing │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ └── rewrite_bucket.rs ├── capi │ ├── Cargo.toml │ ├── build.rs │ ├── ggcat-cpp-api │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── example │ │ │ ├── .gitignore │ │ │ ├── Makefile │ │ │ └── main.cc │ │ ├── include │ │ │ ├── ggcat-cpp-bindings.hh │ │ │ └── ggcat.hh │ │ └── src │ │ │ └── ggcat.cc │ └── src │ │ └── lib.rs ├── cmdline │ ├── Cargo.lock │ ├── Cargo.toml │ └── src │ │ ├── benchmarks.rs │ │ ├── cmd_utils.rs │ │ ├── cmd_utils │ │ └── cmd_rewrite.rs │ │ └── main.rs ├── colors │ ├── Cargo.toml │ └── src │ │ ├── async_slice_queue.rs │ │ ├── bundles.rs │ │ ├── bundles │ │ ├── graph_querying.rs │ │ └── multifile_building.rs │ │ ├── colors_manager.rs │ │ ├── colors_memmap_writer.rs │ │ ├── lib.rs │ │ ├── managers.rs │ │ ├── managers │ │ ├── multiple.rs │ │ └── single.rs │ │ ├── non_colored.rs │ │ ├── parsers.rs │ │ ├── parsers │ │ ├── graph.rs │ │ └── separate.rs │ │ ├── storage.rs │ │ └── storage │ │ ├── deserializer.rs │ │ ├── roaring.rs │ │ ├── run_length.rs │ │ └── serializer.rs ├── config │ ├── 
Cargo.toml │ └── src │ │ └── lib.rs ├── dumper │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ ├── pipeline.rs │ │ └── pipeline │ │ ├── dumper_colormap_querying.rs │ │ ├── dumper_colormap_reading.rs │ │ └── dumper_minimizer_bucketing.rs ├── hashes │ ├── Cargo.toml │ ├── benches │ │ └── hashes-bench.rs │ └── src │ │ ├── base │ │ ├── cn_rkhash_base.rs │ │ ├── cn_seqhash_base.rs │ │ ├── fw_rkhash_base.rs │ │ └── fw_seqhash_base.rs │ │ ├── cn_nthash.rs │ │ ├── cn_rkhash.rs │ │ ├── cn_seqhash.rs │ │ ├── dummy_hasher.rs │ │ ├── fw_nthash.rs │ │ ├── fw_rkhash.rs │ │ ├── fw_seqhash.rs │ │ ├── lib.rs │ │ ├── nthash_base.rs │ │ ├── rolling.rs │ │ └── rolling │ │ ├── kseq_iterator.rs │ │ └── minqueue.rs ├── io │ ├── Cargo.toml │ └── src │ │ ├── chunks_writer.rs │ │ ├── compressed_read.rs │ │ ├── concurrent.rs │ │ ├── concurrent │ │ ├── structured_sequences.rs │ │ ├── structured_sequences │ │ │ ├── binary.rs │ │ │ ├── concurrent.rs │ │ │ ├── fasta.rs │ │ │ ├── gfa.rs │ │ │ └── stream_finish.rs │ │ ├── temp_reads.rs │ │ └── temp_reads │ │ │ ├── creads_utils.rs │ │ │ └── extra_data.rs │ │ ├── lib.rs │ │ ├── lines_reader.rs │ │ ├── reads_writer.rs │ │ ├── sequences_reader.rs │ │ ├── sequences_stream.rs │ │ ├── sequences_stream │ │ ├── fasta.rs │ │ └── general.rs │ │ ├── structs.rs │ │ ├── structs │ │ ├── hash_entry.rs │ │ └── unitig_link.rs │ │ └── varint.rs ├── kmers_transform │ ├── Cargo.toml │ └── src │ │ ├── debug_bucket_stats.rs │ │ ├── lib.rs │ │ ├── processor.rs │ │ ├── reader.rs │ │ ├── reads_buffer.rs │ │ ├── resplitter.rs │ │ └── writer.rs ├── logging │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ └── stats.rs ├── minimizer_bucketing │ ├── Cargo.toml │ └── src │ │ ├── compactor.rs │ │ ├── compactor │ │ └── extra_data.rs │ │ ├── counters_analyzer.rs │ │ ├── lib.rs │ │ ├── queue_data.rs │ │ ├── reader.rs │ │ ├── resplit_bucket.rs │ │ └── sequences_splitter.rs ├── querier │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ ├── pipeline.rs │ │ ├── pipeline │ │ ├── 
colored_query_output.rs │ │ ├── colormap_reading.rs │ │ ├── counters_sorting.rs │ │ ├── parallel_kmers_query.rs │ │ └── querier_minimizer_bucketing.rs │ │ ├── structs.rs │ │ └── structs │ │ └── query_colored_counters.rs ├── structs │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ ├── map_entry.rs │ │ └── unitigs_counters.rs └── utils │ ├── Cargo.toml │ └── src │ ├── debug_functions.rs │ ├── fast_rand_bool.rs │ ├── lib.rs │ ├── owned_drop.rs │ ├── resource_counter.rs │ └── vec_slice.rs ├── example-inputs ├── query.fa ├── sal1.fa ├── sal2.fa └── sal3.fa ├── rust-toolchain └── tests └── build_checking.sh /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - '*' 9 | pull_request: 10 | workflow_dispatch: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | linux: 17 | strategy: 18 | matrix: 19 | platform: 20 | - runner: ubuntu-latest 21 | target: x86_64-unknown-linux-gnu 22 | - runner: ubuntu-latest 23 | target: aarch64-unknown-linux-gnu 24 | runs-on: ${{ matrix.platform.runner }} 25 | steps: 26 | - uses: actions/checkout@v4 27 | with: 28 | submodules: true 29 | - name: Rust setup 30 | uses: actions-rust-lang/setup-rust-toolchain@v1 31 | with: 32 | toolchain: stable 33 | target: ${{ matrix.platform.target }} 34 | - name: Cross install 35 | shell: bash 36 | run: cargo install cross 37 | - name: Build cli 38 | shell: bash 39 | run: | 40 | cross build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline 41 | 42 | linux-musl: 43 | strategy: 44 | matrix: 45 | platform: 46 | - runner: ubuntu-latest 47 | target: x86_64-unknown-linux-musl 48 | image: messense/rust-musl-cross:x86_64-musl 49 | - runner: ubuntu-latest 50 | target: aarch64-unknown-linux-musl 51 | image: messense/rust-musl-cross:aarch64-musl 52 | runs-on: ${{ matrix.platform.runner }} 53 | container: 54 | image: ${{ matrix.platform.image }} 55 | volumes: 56 | 
- .:/home/rust/src 57 | steps: 58 | - uses: actions/checkout@v4 59 | with: 60 | submodules: true 61 | - name: Build cli 62 | run: cargo build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline 63 | 64 | windows: 65 | strategy: 66 | matrix: 67 | platform: 68 | - runner: windows-latest 69 | target: x86_64-pc-windows-msvc 70 | runs-on: ${{ matrix.platform.runner }} 71 | steps: 72 | - uses: actions/checkout@v4 73 | with: 74 | submodules: true 75 | - name: Rust setup 76 | uses: actions-rust-lang/setup-rust-toolchain@v1 77 | with: 78 | toolchain: stable 79 | target: ${{ matrix.platform.target }} 80 | - name: Build cli 81 | run: cargo build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline 82 | 83 | macos: 84 | strategy: 85 | matrix: 86 | platform: 87 | - runner: macos-13 88 | target: x86_64-apple-darwin 89 | - runner: macos-14 90 | target: aarch64-apple-darwin 91 | runs-on: ${{ matrix.platform.runner }} 92 | steps: 93 | - uses: actions/checkout@v4 94 | with: 95 | submodules: true 96 | - name: Rust setup 97 | uses: actions-rust-lang/setup-rust-toolchain@v1 98 | with: 99 | toolchain: stable 100 | target: ${{ matrix.platform.target }} 101 | - name: Build cli 102 | run: cargo build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | linux-release: 12 | strategy: 13 | matrix: 14 | platform: 15 | - runner: ubuntu-latest 16 | target: x86_64-unknown-linux-gnu 17 | - runner: ubuntu-latest 18 | target: aarch64-unknown-linux-gnu 19 | runs-on: ${{ matrix.platform.runner }} 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | submodules: true 24 | - name: Rust setup 25 | uses: 
actions-rust-lang/setup-rust-toolchain@v1 26 | with: 27 | toolchain: stable 28 | target: ${{ matrix.platform.target }} 29 | - name: Cross install 30 | shell: bash 31 | run: cargo install cross 32 | - name: Build cli 33 | shell: bash 34 | run: | 35 | cross build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline 36 | - name: Pack release 37 | shell: bash 38 | run: | 39 | tar zcvf "ggcat-${{ matrix.platform.target }}.tar.gz" -C target/${{ matrix.platform.target }}/release/ ggcat 40 | - name: Publish release 41 | uses: AButler/upload-release-assets@v3.0 42 | with: 43 | files: "ggcat-${{ matrix.platform.target }}.tar.gz" 44 | repo-token: ${{ secrets.GITHUB_TOKEN }} 45 | 46 | linux-musl-release: 47 | strategy: 48 | matrix: 49 | platform: 50 | - runner: ubuntu-latest 51 | target: x86_64-unknown-linux-musl 52 | image: messense/rust-musl-cross:x86_64-musl 53 | - runner: ubuntu-latest 54 | target: aarch64-unknown-linux-musl 55 | image: messense/rust-musl-cross:aarch64-musl 56 | runs-on: ${{ matrix.platform.runner }} 57 | container: 58 | image: ${{ matrix.platform.image }} 59 | volumes: 60 | - .:/home/rust/src 61 | steps: 62 | - uses: actions/checkout@v4 63 | with: 64 | submodules: true 65 | - name: Build cli 66 | run: cargo build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline 67 | - name: Pack release 68 | shell: bash 69 | run: | 70 | tar zcvf "ggcat-${{ matrix.platform.target }}.tar.gz" -C target/${{ matrix.platform.target }}/release/ ggcat 71 | - name: Publish release 72 | uses: AButler/upload-release-assets@v3.0 73 | with: 74 | files: "ggcat-${{ matrix.platform.target }}.tar.gz" 75 | repo-token: ${{ secrets.GITHUB_TOKEN }} 76 | 77 | windows-release: 78 | strategy: 79 | matrix: 80 | platform: 81 | - runner: windows-latest 82 | target: x86_64-pc-windows-msvc 83 | runs-on: ${{ matrix.platform.runner }} 84 | steps: 85 | - uses: actions/checkout@v4 86 | with: 87 | submodules: true 88 | - name: Rust setup 89 | uses: 
actions-rust-lang/setup-rust-toolchain@v1 90 | with: 91 | toolchain: stable 92 | target: ${{ matrix.platform.target }} 93 | - name: Build cli 94 | run: cargo build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline 95 | - name: Pack release 96 | shell: bash 97 | run: | 98 | tar zcvf "ggcat-${{ matrix.platform.target }}.tar.gz" -C target/${{ matrix.platform.target }}/release/ ggcat 99 | - name: Publish release 100 | uses: AButler/upload-release-assets@v3.0 101 | with: 102 | files: "ggcat-${{ matrix.platform.target }}.tar.gz" 103 | repo-token: ${{ secrets.GITHUB_TOKEN }} 104 | 105 | macos-release: 106 | strategy: 107 | matrix: 108 | platform: 109 | - runner: macos-13 110 | target: x86_64-apple-darwin 111 | - runner: macos-14 112 | target: aarch64-apple-darwin 113 | runs-on: ${{ matrix.platform.runner }} 114 | steps: 115 | - uses: actions/checkout@v4 116 | with: 117 | submodules: true 118 | - name: Rust setup 119 | uses: actions-rust-lang/setup-rust-toolchain@v1 120 | with: 121 | toolchain: stable 122 | target: ${{ matrix.platform.target }} 123 | - name: Build cli 124 | run: cargo build --release --target ${{ matrix.platform.target }} --locked -p ggcat_cmdline 125 | - name: Pack release 126 | shell: bash 127 | run: | 128 | tar zcvf "ggcat-${{ matrix.platform.target }}.tar.gz" -C target/${{ matrix.platform.target }}/release/ ggcat 129 | - name: Publish release 130 | uses: AButler/upload-release-assets@v3.0 131 | with: 132 | files: "ggcat-${{ matrix.platform.target }}.tar.gz" 133 | repo-token: ${{ secrets.GITHUB_TOKEN }} 134 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .idea 3 | .vscode 4 | working-dir/ 5 | 6 | # Files for testing 7 | *.fa 8 | *.log -------------------------------------------------------------------------------- /.gitmodules:
-------------------------------------------------------------------------------- 1 | [submodule "libs-crates/nightly-quirks-rs"] 2 | path = libs-crates/nightly-quirks-rs 3 | url = ../../Guilucand/nightly-quirks-rs.git 4 | [submodule "libs-crates/mt-debug-counters-rs"] 5 | path = libs-crates/mt-debug-counters-rs 6 | url = ../../Guilucand/mt-debug-counters-rs.git 7 | [submodule "libs-crates/streaming-libdeflate-rs"] 8 | path = libs-crates/streaming-libdeflate-rs 9 | url = ../../Guilucand/streaming-libdeflate-rs.git 10 | [submodule "libs-crates/papi-bindings-rs"] 11 | path = libs-crates/papi-bindings-rs 12 | url = ../../Guilucand/papi-bindings-rs.git 13 | [submodule "libs-crates/instrumenter-rs"] 14 | path = libs-crates/instrumenter-rs 15 | url = ../../Guilucand/instrumenter-rs.git 16 | [submodule "libs-crates/dynamic-dispatch-rs"] 17 | path = libs-crates/dynamic-dispatch-rs 18 | url = ../../Guilucand/dynamic-dispatch-rs.git 19 | [submodule "libs-crates/parallel-processor-rs"] 20 | path = libs-crates/parallel-processor-rs 21 | url = ../../Guilucand/parallel-processor-rs.git 22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # [patch.crates-io] 2 | # dynamic-dispatch-proc-macro = { path = "libs-crates/dynamic-dispatch-rs/dynamic-dispatch-proc-macro" } 3 | # dynamic-dispatch = { path = "libs-crates/dynamic-dispatch-rs" } 4 | # mt-debug-counters = { path = "libs-crates/mt-debug-counters-rs" } 5 | # streaming-libdeflate-rs = { path = "libs-crates/streaming-libdeflate-rs" } 6 | # papi-bindings = { path = "libs-crates/papi-bindings-rs" } 7 | # nightly-quirks = { path = "libs-crates/nightly-quirks-rs" } 8 | # instrumenter-proc-macro = { path = "libs-crates/instrumenter-rs/instrumenter-proc-macro-rs" } 9 | # instrumenter = { path = "libs-crates/instrumenter-rs" } 10 | # parallel-processor = { path = "libs-crates/parallel-processor-rs" } 11 | 12 
| [workspace] 13 | 14 | resolver = "2" 15 | 16 | members = [ 17 | # Library crates added as submodules 18 | # "libs-crates/dynamic-dispatch-rs/dynamic-dispatch-proc-macro", 19 | # "libs-crates/dynamic-dispatch-rs", 20 | # "libs-crates/mt-debug-counters-rs", 21 | # "libs-crates/streaming-libdeflate-rs", 22 | # "libs-crates/papi-bindings-rs", 23 | # "libs-crates/nightly-quirks-rs", 24 | # "libs-crates/instrumenter-rs/instrumenter-proc-macro-rs", 25 | # "libs-crates/instrumenter-rs", 26 | # "libs-crates/parallel-processor-rs", 27 | 28 | # Main crate for the command line tools 29 | "crates/cmdline", 30 | 31 | # Main library for api usage 32 | "crates/api", 33 | 34 | # Main c/c++ library for api usage 35 | "crates/capi", 36 | 37 | # Global config constants 38 | "crates/config", 39 | 40 | # Global utilities 41 | "crates/utils", 42 | 43 | # Common pipeline crates 44 | "crates/logging", 45 | "crates/hashes", 46 | "crates/io", 47 | "crates/minimizer_bucketing", 48 | "crates/kmers_transform", 49 | "crates/colors", 50 | "crates/structs", 51 | 52 | # Assembler pipeline 53 | "crates/assembler_minimizer_bucketing", 54 | "crates/assembler_kmers_merge", 55 | "crates/assembler", 56 | 57 | # Querier pipeline 58 | "crates/querier", 59 | 60 | # Dumper pipeline 61 | "crates/dumper", 62 | 63 | 64 | # Examples 65 | "crates/api/example", 66 | ] 67 | 68 | default-members = ["crates/cmdline", "crates/api"] 69 | 70 | [profile.dev] 71 | debug = true 72 | opt-level = 2 73 | lto = "off" 74 | 75 | [profile.release] 76 | debug = false 77 | strip = true 78 | lto = "thin" 79 | 80 | 81 | [profile.with_debug] 82 | inherits = "release" 83 | debug = true 84 | strip = false 85 | lto = "thin" 86 | 87 | # split-debuginfo = "packed" 88 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Andrea Cracco, Alexandru I. 
Tomescu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /crates/api/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Andrea Cracco "] 3 | edition = "2021" 4 | name = "ggcat-api" 5 | version = "2.0.0" 6 | 7 | [lib] 8 | crate-type = ["rlib", "staticlib"] 9 | 10 | [dependencies] 11 | 12 | # Global config 13 | config = { package = "ggcat_config", path = "../config" } 14 | 15 | # GGCAT main modules 16 | assembler = { package = "ggcat_assembler", path = "../assembler" } 17 | querier = { package = "ggcat_querier", path = "../querier" } 18 | dumper = { package = "ggcat_dumper", path = "../dumper" } 19 | 20 | # GGCAT extra modules 21 | io = { package = "ggcat_io", path = "../io" } 22 | hashes = { package = "ggcat_hashes", path = "../hashes" } 23 | colors = { package = "ggcat_colors", path = "../colors" } 24 | utils = { package = "ggcat_utils", path = "../utils" } 25 | 26 | fdlimit = "0.3.0" 27 | parallel-processor = "0.1.24" 28 | rayon = "1.10.0" 29 | dynamic-dispatch = "0.5.4" 30 | parking_lot = "0.12.3" 31 | uuid = { version = "1.9.1", features = ["v4"] } 32 | ggcat-logging = { version = "2.0.0", path = "../logging" } 33 | anyhow = "1.0.89" 34 | -------------------------------------------------------------------------------- /crates/api/example/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "example" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | ggcat-api = { version = "2.0.0", path = ".." 
}
itertools = "0.13.0"
--------------------------------------------------------------------------------
/crates/api/example/src/main.rs:
--------------------------------------------------------------------------------
//! End-to-end example of the ggcat-api: builds a colored de Bruijn graph from
//! three FASTA inputs, queries it, dumps its unitigs with their colors, and
//! finally queries the colormap directly.
use ggcat_api::{
    ColoredQueryOutputFormat, ExtraElaboration, GGCATConfig, GGCATInstance,
    GeneralSequenceBlockData,
};
use itertools::Itertools;
use std::{path::PathBuf, sync::Mutex};

fn main() {
    // Create the GGCAT instance used by the rest of the example; every log
    // message is routed through the callback below, and an unrecoverable
    // error aborts the example via panic!.
    let instance = GGCATInstance::create(GGCATConfig {
        temp_dir: Some(PathBuf::from("/tmp")),
        // NOTE(review): unit of this budget is not visible here — presumably
        // gigabytes; confirm against the GGCATConfig documentation.
        memory: 2.0,
        prefer_memory: true,
        total_threads_count: 16,
        intermediate_compression_level: None,
        stats_file: None,
        messages_callback: Some(|lvl, msg| match lvl {
            ggcat_api::MessageLevel::Info => {
                println!("Info: {}", msg);
            }
            ggcat_api::MessageLevel::Warning => {
                println!("Warning: {}", msg);
            }
            ggcat_api::MessageLevel::Error => {
                println!("Error: {}", msg);
            }
            ggcat_api::MessageLevel::UnrecoverableError => {
                panic!("Unrecoverable error: {}", msg);
            }
        }),
        disk_optimization_level: 5,
    })
    .unwrap();

    let graph_file = PathBuf::from("/tmp/sal-dbg.fa");
    let k = 31;
    let threads_count = 16;

    // Example building of a colored graph from three FASTA files
    // building also bcalm2-style links across maximal unitigs
    // The color names "sal1".."sal3" are associated positionally with the
    // three input blocks; `build_graph` returns the path of the built graph.
    let graph_file = instance
        .build_graph(
            vec![
                GeneralSequenceBlockData::FASTA((
                    PathBuf::from("../../../example-inputs/sal1.fa"),
                    None,
                )),
                GeneralSequenceBlockData::FASTA((
                    PathBuf::from("../../../example-inputs/sal2.fa"),
                    None,
                )),
                GeneralSequenceBlockData::FASTA((
                    PathBuf::from("../../../example-inputs/sal3.fa"),
                    None,
                )),
            ],
            graph_file.clone(),
            Some(&["sal1".to_string(), "sal2".to_string(), "sal3".to_string()]),
            k,
            threads_count,
            false,
            None,
            true,
            1,
            ExtraElaboration::UnitigLinks,
            None,
            5,
        )
        .unwrap();

    let input_query = PathBuf::from("../../../example-inputs/query.fa");

    // Query the freshly built graph with query.fa; results are written under
    // /tmp/query-results as JSON lines keyed by color names.
    let output_query = instance
        .query_graph(
            graph_file.clone(),
            input_query,
            PathBuf::from("/tmp/query-results"),
            k,
            threads_count,
            false,
            None,
            true,
            ColoredQueryOutputFormat::JsonLinesWithNames,
        )
        .unwrap();

    // NOTE(review): `.display()` is normally paired with `{}`; `{:?}` works
    // but prints the debug form.
    println!("Output query file: {:?}", output_query.display());

    // Serializes the println! output of the multi-threaded dump callback
    // below so unitig lines are not interleaved.
    let print_kmer_lock = Mutex::new(());

    // Load the color names stored in the graph's colormap (one per input).
    let color_names: Vec<_> =
        GGCATInstance::dump_colors(GGCATInstance::get_colormap_file(&graph_file))
            .unwrap()
            .collect();

    instance
        .dump_unitigs(
            &graph_file,
            k,
            None,
            true,
            threads_count,
            false,
            // WARNING: this function is called asynchronously from multiple threads, so it must be thread-safe.
            // Also the same_colors boolean is referred to the previous call of this function from the current thread
            |read, colors, same_colors| {
                let _lock = print_kmer_lock.lock().unwrap();
                // Print at most the first 100 bases of each unitig.
                if read.len() < 100 {
                    println!("Dump unitig '{}'", std::str::from_utf8(read).unwrap());
                } else {
                    println!(
                        "Dump unitig '{}...'",
                        std::str::from_utf8(&read[..100]).unwrap()
                    );
                }
                println!(
                    "\t colors: {:?} same_colors: {}",
                    colors.iter().map(|c| &color_names[*c as usize]).format(" "),
                    same_colors
                );
            },
        )
        .unwrap();

    // Directly resolve color subsets 0..=4 of the colormap into their
    // constituent colors, printing each color index with its name.
    let colormap = GGCATInstance::get_colormap_file(&graph_file);
    instance
        .query_colormap(colormap, vec![0, 1, 2, 3, 4], true, |subset, colors| {
            print!("Subset: {} has colors:", subset);
            for color in colors {
                println!(" {}[{}]", color, color_names[*color as usize]);
            }
        })
        .unwrap()
}
--------------------------------------------------------------------------------
/crates/api/src/utils.rs:
--------------------------------------------------------------------------------
use dynamic_dispatch::DynamicDispatch;

/// Selects which hash implementation family the pipeline should use.
#[derive(Copy, Clone)]
pub enum HashType {
    Auto = 0,
    SeqHash = 1,
    RabinKarp32 = 2,
    RabinKarp64 = 3,
    RabinKarp128 = 4,
}

/// Maps a `HashType` request (plus the k-mer size and strand mode) to the
/// `dynamic_dispatch` id of the concrete hash factory implementing it.
///
/// * `Auto` resolves to `SeqHash` while the k-mer fits the widest sequence
///   hash (k <= 64), otherwise to `RabinKarp128`.
/// * `forward_only` selects the forward-strand factory instead of the
///   canonical (strand-independent) one.
///
/// # Panics
/// If `SeqHash` is explicitly requested with k > 64, for which no sequence
/// hash implementation exists.
pub(crate) fn get_hash_static_id(
    hash_type: HashType,
    k: usize,
    forward_only: bool,
) -> DynamicDispatch<()> {
    use hashes::*;

    // Resolve the automatic selection up front so the dispatch below only has
    // to deal with concrete hash families.
    let family = match hash_type {
        HashType::Auto if k <= 64 => HashType::SeqHash,
        HashType::Auto => HashType::RabinKarp128,
        explicit => explicit,
    };

    match family {
        // Sequence hashes: pick the narrowest integer width able to hold a
        // 2-bit-packed k-mer of length k.
        HashType::SeqHash => match k {
            0..=8 => {
                if forward_only {
                    fw_seqhash::u16::ForwardSeqHashFactory::dynamic_dispatch_id()
                } else {
                    cn_seqhash::u16::CanonicalSeqHashFactory::dynamic_dispatch_id()
                }
            }
            9..=16 => {
                if forward_only {
                    fw_seqhash::u32::ForwardSeqHashFactory::dynamic_dispatch_id()
                } else {
                    cn_seqhash::u32::CanonicalSeqHashFactory::dynamic_dispatch_id()
                }
            }
            17..=32 => {
                if forward_only {
                    fw_seqhash::u64::ForwardSeqHashFactory::dynamic_dispatch_id()
                } else {
                    cn_seqhash::u64::CanonicalSeqHashFactory::dynamic_dispatch_id()
                }
            }
            33..=64 => {
                if forward_only {
                    fw_seqhash::u128::ForwardSeqHashFactory::dynamic_dispatch_id()
                } else {
                    cn_seqhash::u128::CanonicalSeqHashFactory::dynamic_dispatch_id()
                }
            }
            _ => panic!("Cannot use sequence hash for k > 64!"),
        },
        HashType::RabinKarp32 => {
            if forward_only {
                fw_rkhash::u32::ForwardRabinKarpHashFactory::dynamic_dispatch_id()
            } else {
                cn_rkhash::u32::CanonicalRabinKarpHashFactory::dynamic_dispatch_id()
            }
        }
        HashType::RabinKarp64 => {
            if forward_only {
                fw_rkhash::u64::ForwardRabinKarpHashFactory::dynamic_dispatch_id()
            } else {
                cn_rkhash::u64::CanonicalRabinKarpHashFactory::dynamic_dispatch_id()
            }
        }
        HashType::RabinKarp128 => {
            if forward_only {
                fw_rkhash::u128::ForwardRabinKarpHashFactory::dynamic_dispatch_id()
            } else {
                cn_rkhash::u128::CanonicalRabinKarpHashFactory::dynamic_dispatch_id()
            }
        }
        // `Auto` was replaced by a concrete family above.
        HashType::Auto => unreachable!(),
    }
}
--------------------------------------------------------------------------------
/crates/assembler/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "ggcat_assembler"
version = "2.0.0"
edition = "2021"
[dependencies]

# Config
config = { package = "ggcat_config", path = "../config" }

# Utils
utils = { package = "ggcat_utils", path = "../utils" }


# Static dispatch
dynamic-dispatch = "0.5.4"

# Common libraries
parallel-processor = "0.1.24"
streaming-libdeflate-rs = "0.1.5"
nightly-quirks = "0.1.4"

# Pipeline crates
assembler_minimizer_bucketing = { package = "ggcat_assembler_minibuck", path = "../assembler_minimizer_bucketing" }
assembler_kmers_merge = { package = "ggcat_assembler_kmerge", path = "../assembler_kmers_merge" }
kmers_transform = { package = "ggcat_kmers_transform", path = "../kmers_transform" }


# Common pipeline libraries
hashes = { package = "ggcat_hashes", path = "../hashes" }
io = { package = "ggcat_io", path = "../io" }
colors = { package = "ggcat_colors", path = "../colors" }
structs = { package = "ggcat_structs", path = "../structs" }


# Matchtigs support
matchtigs = "2.1.8"
genome-graph = { version = "9.0.0", features = ["traitgraph-algo"] }
traitgraph-algo = { version = "8.1.0", features = [
    "hashbrown_dijkstra_node_weight_array",
] }


# Other libraries
typenum = "1.17.0"
parking_lot = "0.12.3"
fs_extra = "1.3.0" 47 | rayon = "1.10.0" 48 | hashbrown = "0.14.5" 49 | itertools = "0.13.0" 50 | byteorder = "1.5.0" 51 | serde = "1.0.203" 52 | bincode = "1.3.3" 53 | crossbeam = "0.8.4" 54 | dashmap = "6.0.1" 55 | ggcat-logging = { version = "2.0.0", path = "../logging" } 56 | anyhow = "1.0.89" 57 | 58 | [features] 59 | devel-build = ["assembler_minimizer_bucketing/devel-build"] 60 | support_kmer_counters = [ 61 | "io/support_kmer_counters", 62 | "colors/support_kmer_counters", 63 | "structs/support_kmer_counters", 64 | "assembler_kmers_merge/support_kmer_counters", 65 | ] 66 | -------------------------------------------------------------------------------- /crates/assembler/src/pipeline.rs: -------------------------------------------------------------------------------- 1 | pub mod build_unitigs; 2 | pub mod compute_matchtigs; 3 | pub mod eulertigs; 4 | pub mod hashes_sorting; 5 | pub mod links_compaction; 6 | pub mod maximal_unitig_links; 7 | pub mod reorganize_reads; 8 | -------------------------------------------------------------------------------- /crates/assembler/src/pipeline/hashes_sorting.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | use std::sync::atomic::Ordering; 3 | use std::sync::Arc; 4 | 5 | use config::{ 6 | get_memory_mode, SwapPriority, DEFAULT_PER_CPU_BUFFER_SIZE, DEFAULT_PREFETCH_AMOUNT, KEEP_FILES, 7 | }; 8 | use hashes::HashFunctionFactory; 9 | use io::structs::hash_entry::{Direction, HashCompare, HashEntrySerializer}; 10 | use io::structs::unitig_link::{UnitigFlags, UnitigIndex, UnitigLink, UnitigLinkSerializer}; 11 | use nightly_quirks::slice_group_by::SliceGroupBy; 12 | use parallel_processor::buckets::concurrent::{BucketsThreadBuffer, BucketsThreadDispatcher}; 13 | use parallel_processor::buckets::readers::lock_free_binary_reader::LockFreeBinaryReader; 14 | use parallel_processor::buckets::readers::BucketReader; 15 | use 
parallel_processor::buckets::writers::lock_free_binary_writer::LockFreeBinaryWriter;
use parallel_processor::buckets::{MultiThreadBuckets, SingleBucket};
use parallel_processor::fast_smart_bucket_sort::fast_smart_radix_sort;
use parallel_processor::memory_fs::RemoveFileMode;
use parallel_processor::phase_times_monitor::PHASES_TIMES_MONITOR;
use parallel_processor::utils::scoped_thread_local::ScopedThreadLocal;
use rayon::iter::IntoParallelRefIterator;
use rayon::iter::ParallelIterator;
use utils::fast_rand_bool::FastRandBool;
use utils::vec_slice::VecSlice;

/// Assembler pipeline phase: sorts the hash entries produced by the previous
/// step and turns each pair of identical hashes into a pair of unitig links.
///
/// For every input bucket, processed in parallel via rayon:
///   1. all hash entries are read into memory (the bucket file is deleted
///      unless `KEEP_FILES` is set),
///   2. the entries are radix-sorted by hash (`HashCompare` ordering),
///   3. runs of equal hashes are grouped; each group of exactly two entries
///      joins two unitig endpoints and emits one forward and one backward
///      `UnitigLink` record into the per-bucket `links` output. A random bit
///      decides which of the two links carries the partner's `UnitigIndex`
///      payload, the other gets an empty slice.
/// Groups of size 1 or > 2 are logged as errors (a bug or a hash collision in
/// the KmersMerge phase) and produce no links.
///
/// Returns the finalized list of `links` buckets.
///
/// NOTE(review): the generic parameter lists of this function and several
/// turbofish annotations below (e.g. `hashes_sorting>(`, `Vec,`,
/// `MultiThreadBuckets::::new`, `decode_all_bucket_items::, _>`) were
/// stripped by the text export this file went through — anything that looked
/// like an HTML tag was eaten. Restore them from the original repository
/// before compiling; the code tokens are otherwise untouched here.
pub fn hashes_sorting>(
    file_hashes_inputs: Vec,
    output_dir: P,
    buckets_count: usize,
) -> Vec {
    PHASES_TIMES_MONITOR
        .write()
        .start_phase("phase: hashes sorting".to_string());

    // One output bucket per input bucket, holding the generated unitig links
    // under `<output_dir>/links`.
    let links_buckets = Arc::new(MultiThreadBuckets::::new(
        buckets_count,
        output_dir.as_ref().join("links"),
        None,
        &(
            get_memory_mode(SwapPriority::LinksBuckets),
            LockFreeBinaryWriter::CHECKPOINT_SIZE_UNLIMITED,
        ),
        &(),
    ));

    // Per-thread scratch buffers, reused across the parallel iterations below.
    let buckets_thread_buffers = ScopedThreadLocal::new(move || {
        BucketsThreadBuffer::new(DEFAULT_PER_CPU_BUFFER_SIZE, buckets_count)
    });

    file_hashes_inputs
        .par_iter()
        .for_each(|input| {

            let mut buffers = buckets_thread_buffers.get();
            let mut links_tmp = BucketsThreadDispatcher::<_, UnitigLinkSerializer>::new(
                &links_buckets,
                buffers.take()
            );

            // One random bit per hash pair, used below to balance which side
            // of a link stores the partner index.
            let mut rand_bool = FastRandBool::<1>::new();

            let mut hashes_vec = Vec::new();

            // Drain the whole input bucket into memory; the backing file is
            // removed immediately unless KEEP_FILES was requested.
            LockFreeBinaryReader::new(&input.path, RemoveFileMode::Remove {
                remove_fs: !KEEP_FILES.load(Ordering::Relaxed)
            }, DEFAULT_PREFETCH_AMOUNT).decode_all_bucket_items::, _>((), &mut (), |h, _| {
                hashes_vec.push(h);
            });

            fast_smart_radix_sort::<_, HashCompare, false>(&mut hashes_vec[..]);

            // Backing storage for the VecSlice payloads referenced by the
            // links emitted in this bucket.
            let mut unitigs_vec = Vec::new();

            // Walk maximal runs of equal hashes.
            for x in hashes_vec.nq_group_by(|a, b| a.hash == b.hash) {
                match x.len() {
                    2 => {
                        let mut reverse_complemented = [false, false];

                        // Can happen with canonical kmers, we should reverse-complement one of the strands
                        // the direction reverse is implicit as x[1] is treated as if it had the opposite of the x[0] direction
                        if x[0].direction() == x[1].direction() {
                            reverse_complemented[1] = true;
                        }

                        // Indices of the forward- and backward-directed entry
                        // within the pair.
                        let (fw, bw) = match x[0].direction() {
                            Direction::Forward => (0, 1),
                            Direction::Backward => (1, 0),
                        };

                        // Randomly give the partner's UnitigIndex payload to
                        // either the forward or the backward link; the other
                        // side gets an empty slice.
                        let (slice_fw, slice_bw) = if rand_bool.get_randbool() {
                            unitigs_vec.push(UnitigIndex::new(x[bw].bucket(), x[bw].entry() as usize, reverse_complemented[bw]));
                            (VecSlice::new(unitigs_vec.len() - 1, 1), VecSlice::EMPTY)
                        } else {
                            unitigs_vec.push(UnitigIndex::new(x[fw].bucket(), x[fw].entry() as usize, reverse_complemented[fw]));
                            (VecSlice::EMPTY, VecSlice::new(unitigs_vec.len() - 1, 1))
                        };

                        // Forward-direction link, routed to the bucket of the
                        // forward entry.
                        links_tmp.add_element(
                            x[fw].bucket(),
                            &unitigs_vec,
                            &UnitigLink::new(
                                x[fw].entry(),
                                UnitigFlags::new_direction(true, reverse_complemented[fw]),
                                slice_fw,
                            ),
                        );

                        // Backward-direction link, routed to the bucket of the
                        // backward entry.
                        links_tmp.add_element(
                            x[bw].bucket(),
                            &unitigs_vec,
                            &UnitigLink::new(
                                x[bw].entry(),
                                UnitigFlags::new_direction(false, reverse_complemented[bw]),
                                slice_bw,
                            ),
                        );
                    },
                    1 => {
                        ggcat_logging::error!("Warning spurious hash detected ({:?}) with index {}, this is a bug or a collision in the KmersMerge phase!", x[0].hash, x[0].entry());
                    }
                    _ => {
                        ggcat_logging::error!("More than 2 equal hashes found in hashes sorting phase, this indicates an hash ({}) collision!", x[0].hash);
                    }
                }
            }
            // Return the per-thread buffer to the pool after flushing the
            // dispatcher.
            buffers.put_back(links_tmp.finalize().0);
        });
    links_buckets.finalize_single()
}
--------------------------------------------------------------------------------
/crates/assembler/src/pipeline/maximal_unitig_links/mappings_loader.rs: -------------------------------------------------------------------------------- 1 | use crate::pipeline::maximal_unitig_links::maximal_unitig_index::{ 2 | DoubleMaximalUnitigLinks, MaximalUnitigIndex, MaximalUnitigLink, 3 | }; 4 | use config::{DEFAULT_PREFETCH_AMOUNT, KEEP_FILES}; 5 | use parallel_processor::buckets::bucket_writer::BucketItemSerializer; 6 | use parallel_processor::buckets::readers::compressed_binary_reader::CompressedBinaryReader; 7 | use parallel_processor::buckets::SingleBucket; 8 | use parallel_processor::memory_fs::RemoveFileMode; 9 | use parking_lot::{Mutex, RwLock}; 10 | use std::cmp::min; 11 | use std::path::Path; 12 | use std::sync::atomic::{AtomicUsize, Ordering}; 13 | use std::sync::Arc; 14 | use utils::vec_slice::VecSlice; 15 | 16 | use super::maximal_unitig_index::MaximalUnitigLinkSerializer; 17 | 18 | pub struct MaximalUnitigLinksMapping { 19 | start_index: u64, 20 | mappings: Vec, 21 | mappings_data: Vec, 22 | } 23 | 24 | impl MaximalUnitigLinksMapping { 25 | pub fn empty() -> Self { 26 | Self { 27 | start_index: 0, 28 | mappings: Vec::new(), 29 | mappings_data: Vec::new(), 30 | } 31 | } 32 | 33 | fn load_from_bucket(bucket: &Path, start_index: u64, unitigs_per_bucket: usize) -> Self { 34 | let mut self_ = Self { 35 | start_index, 36 | mappings: vec![ 37 | DoubleMaximalUnitigLinks { 38 | links: [ 39 | MaximalUnitigLink::new(0, VecSlice::EMPTY), 40 | MaximalUnitigLink::new(0, VecSlice::EMPTY) 41 | ], 42 | is_self_complemental: false 43 | }; 44 | unitigs_per_bucket 45 | ], 46 | mappings_data: vec![], 47 | }; 48 | 49 | let mut reader = CompressedBinaryReader::new( 50 | bucket, 51 | RemoveFileMode::Remove { 52 | remove_fs: !KEEP_FILES.load(Ordering::Relaxed), 53 | }, 54 | DEFAULT_PREFETCH_AMOUNT, 55 | ); 56 | 57 | let mut stream = reader.get_single_stream(); 58 | 59 | let mut deserializer = MaximalUnitigLinkSerializer::new(); 60 | 61 | while let Some(item) = 62 | 
deserializer.read_from(&mut stream, &mut self_.mappings_data, &mut ()) 63 | { 64 | let index = item.index() - self_.start_index; 65 | 66 | let current_slice = item.entries.get_slice(&self_.mappings_data); 67 | 68 | let forward_index = if current_slice[0].flags.flip_current() { 69 | 1 70 | } else { 71 | 0 72 | }; 73 | 74 | self_.mappings[index as usize].links[forward_index] = item; 75 | } 76 | 77 | self_ 78 | } 79 | 80 | pub(crate) fn has_mapping(&self, index: u64) -> bool { 81 | (index - self.start_index) < self.mappings.len() as u64 82 | } 83 | 84 | pub(crate) fn get_mapping( 85 | &self, 86 | index: u64, 87 | ) -> (DoubleMaximalUnitigLinks, &Vec) { 88 | ( 89 | self.mappings[(index - self.start_index) as usize].clone(), 90 | &self.mappings_data, 91 | ) 92 | } 93 | } 94 | 95 | pub struct MaximalUnitigLinksMappingsLoader { 96 | buckets: Vec, 97 | unitigs_per_bucket: usize, 98 | 99 | minimum_buckets: Vec, 100 | 101 | next_disposed_bucket_index: RwLock, 102 | loaded_buckets: Vec>>>, 103 | } 104 | 105 | impl MaximalUnitigLinksMappingsLoader { 106 | pub fn new( 107 | buckets: Vec, 108 | unitigs_per_bucket: usize, 109 | threads_count: usize, 110 | ) -> Self { 111 | let buckets_count = buckets.len(); 112 | 113 | Self { 114 | buckets, 115 | unitigs_per_bucket, 116 | minimum_buckets: (0..threads_count).map(|_| AtomicUsize::new(0)).collect(), 117 | next_disposed_bucket_index: RwLock::new(0), 118 | loaded_buckets: (0..buckets_count).map(|_| Mutex::new(None)).collect(), 119 | } 120 | } 121 | 122 | fn dispose_buckets(&self) { 123 | let minimum_bucket = self 124 | .minimum_buckets 125 | .iter() 126 | .map(|b| b.load(Ordering::Relaxed)) 127 | .min() 128 | .unwrap(); 129 | 130 | if *self.next_disposed_bucket_index.read() < minimum_bucket { 131 | let mut next_disposed_bucket_index = self.next_disposed_bucket_index.write(); 132 | while *next_disposed_bucket_index < min(minimum_bucket, self.loaded_buckets.len()) { 133 | self.loaded_buckets[*next_disposed_bucket_index] 134 | .lock() 
135 | .take(); 136 | // ggcat_logging::info!("Disposing bucket {}", *next_disposed_bucket_index); 137 | *next_disposed_bucket_index += 1; 138 | } 139 | } 140 | } 141 | 142 | pub fn get_mapping_for( 143 | &self, 144 | index: u64, 145 | thread_index: usize, 146 | ) -> Arc { 147 | let bucket_index = (index / self.unitigs_per_bucket as u64) as usize; 148 | 149 | self.minimum_buckets[thread_index].store(bucket_index, Ordering::Relaxed); 150 | 151 | self.dispose_buckets(); 152 | 153 | let mut bucket_guard = self.loaded_buckets[bucket_index].lock(); 154 | 155 | if let Some(bucket) = bucket_guard.as_ref() { 156 | bucket.clone() 157 | } else { 158 | let bucket = Arc::new(MaximalUnitigLinksMapping::load_from_bucket( 159 | &self.buckets[bucket_index].path, 160 | bucket_index as u64 * self.unitigs_per_bucket as u64, 161 | self.unitigs_per_bucket, 162 | )); 163 | *bucket_guard = Some(bucket.clone()); 164 | bucket 165 | } 166 | } 167 | 168 | pub fn notify_thread_ending(&self, thread_index: usize) { 169 | self.minimum_buckets[thread_index].store(usize::MAX, Ordering::Relaxed); 170 | self.dispose_buckets(); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /crates/assembler/src/pipeline/maximal_unitig_links/maximal_hash_entry.rs: -------------------------------------------------------------------------------- 1 | use bincode::{deserialize_from, serialize_into}; 2 | use hashes::HashFunctionFactory; 3 | use parallel_processor::buckets::bucket_writer::BucketItemSerializer; 4 | use parallel_processor::fast_smart_bucket_sort::SortKey; 5 | use serde::de::DeserializeOwned; 6 | use serde::{Deserialize, Serialize}; 7 | use std::io::Read; 8 | use std::marker::PhantomData; 9 | use std::mem::size_of; 10 | 11 | #[derive(Copy, Clone, Eq, PartialEq, Serialize, Deserialize, Debug)] 12 | #[repr(u8)] 13 | pub enum MaximalUnitigPosition { 14 | Beginning, 15 | Ending, 16 | } 17 | 18 | #[derive(Copy, Clone, Serialize, Deserialize, Debug)] 19 | pub 
struct MaximalHashEntry { 20 | pub hash: H, 21 | encoded: u64, 22 | overlap_start: u64, 23 | } 24 | /* `encoded` packs three values into one u64: bits 2.. hold the unitig entry index, bit 1 holds the forward-direction flag, bit 0 holds the position (0 = Beginning, 1 = Ending); see the *_OFFSET constants below. */ 25 | impl MaximalHashEntry { 26 | const ENTRY_OFFSET: usize = 2; 27 | const DIRECTION_OFFSET: usize = 1; 28 | const POSITION_OFFSET: usize = 0; 29 | /* Builds an entry, packing `entry`, `position` and `direction_forward` into `encoded` as described above. */ 30 | pub fn new( 31 | hash: H, 32 | entry: u64, 33 | position: MaximalUnitigPosition, 34 | direction_forward: bool, 35 | overlap_start: u64, 36 | ) -> Self { 37 | Self { 38 | hash, 39 | encoded: (entry << Self::ENTRY_OFFSET) 40 | | ((match position { 41 | MaximalUnitigPosition::Ending => 1, 42 | MaximalUnitigPosition::Beginning => 0, 43 | }) << Self::POSITION_OFFSET) 44 | | ((if direction_forward { 1 } else { 0 }) << Self::DIRECTION_OFFSET), 45 | overlap_start, 46 | } 47 | } 48 | /* Unitig entry index (upper bits of `encoded`). */ 49 | pub fn entry(&self) -> u64 { 50 | self.encoded >> Self::ENTRY_OFFSET 51 | } 52 | 53 | pub fn overlap_start(&self) -> u64 { 54 | self.overlap_start 55 | } 56 | /* Decodes bit 0 of `encoded` back to the position enum. */ 57 | pub fn position(&self) -> MaximalUnitigPosition { 58 | if (self.encoded >> Self::POSITION_OFFSET) & 0x1 == 0 { 59 | MaximalUnitigPosition::Beginning 60 | } else { 61 | MaximalUnitigPosition::Ending 62 | } 63 | } 64 | /* true when the entry was recorded with direction_forward == true in `new`. */ 65 | pub fn direction(&self) -> bool { 66 | ((self.encoded >> Self::DIRECTION_OFFSET) & 0x1) == 1 67 | } 68 | } 69 | 70 | pub struct MaximalHashEntrySerializer(PhantomData); 71 | impl BucketItemSerializer 72 | for MaximalHashEntrySerializer 73 | { 74 | type InputElementType<'a> = MaximalHashEntry; 75 | type ExtraData = (); 76 | type ReadBuffer = (); 77 | type ExtraDataBuffer = (); 78 | type ReadType<'a> = MaximalHashEntry; 79 | 80 | type CheckpointData = (); 81 | 82 | #[inline(always)] 83 | fn new() -> Self { 84 | Self(PhantomData) 85 | } 86 | 87 | #[inline(always)] 88 | fn reset(&mut self) {} 89 | 90 | #[inline(always)] 91 | fn write_to( 92 | &mut self, 93 | element: &Self::InputElementType<'_>, 94 | bucket: &mut Vec, 95 | _extra_data: &Self::ExtraData, 96 | _: &(), 97 | ) { 98 | serialize_into(bucket, element).unwrap(); 99 | } 100 | 101 | fn read_from<'a, S: Read>(
102 | &mut self, 103 | stream: S, 104 | _read_buffer: &'a mut Self::ReadBuffer, 105 | _: &mut (), 106 | ) -> Option> { 107 | deserialize_from(stream).ok() 108 | } 109 | 110 | #[inline(always)] 111 | fn get_size(&self, _element: &Self::InputElementType<'_>, _: &()) -> usize { 112 | size_of::() + 8 + 1 113 | } 114 | } 115 | 116 | pub struct MaximalHashCompare { 117 | _phantom: PhantomData, 118 | } 119 | 120 | impl SortKey> 121 | for MaximalHashCompare 122 | { 123 | type KeyType = H::HashTypeUnextendable; 124 | const KEY_BITS: usize = size_of::() * 8; 125 | 126 | #[inline(always)] 127 | fn compare( 128 | left: &MaximalHashEntry<::HashTypeUnextendable>, 129 | right: &MaximalHashEntry<::HashTypeUnextendable>, 130 | ) -> std::cmp::Ordering { 131 | left.hash.cmp(&right.hash) 132 | } 133 | 134 | #[inline(always)] 135 | fn get_shifted(value: &MaximalHashEntry, rhs: u8) -> u8 { 136 | H::get_shifted(value.hash, rhs) as u8 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /crates/assembler/src/structs.rs: -------------------------------------------------------------------------------- 1 | pub mod link_mapping; 2 | -------------------------------------------------------------------------------- /crates/assembler/src/structs/link_mapping.rs: -------------------------------------------------------------------------------- 1 | use byteorder::ReadBytesExt; 2 | use config::BucketIndexType; 3 | use io::varint::{decode_varint, encode_varint, VARINT_MAX_SIZE}; 4 | use parallel_processor::buckets::bucket_writer::BucketItemSerializer; 5 | use std::io::{Read, Write}; 6 | 7 | #[derive(Clone, Debug)] 8 | pub struct LinkMapping { 9 | pub bucket: BucketIndexType, 10 | pub entry: u64, 11 | } 12 | 13 | pub struct LinkMappingSerializer; 14 | 15 | impl BucketItemSerializer for LinkMappingSerializer { 16 | type InputElementType<'a> = LinkMapping; 17 | type ExtraData = (); 18 | type ReadBuffer = (); 19 | type ExtraDataBuffer = (); 20 | type 
ReadType<'a> = LinkMapping; 21 | 22 | type CheckpointData = (); 23 | 24 | #[inline(always)] 25 | fn new() -> Self { 26 | Self 27 | } 28 | 29 | #[inline(always)] 30 | fn reset(&mut self) {} 31 | 32 | #[inline(always)] 33 | fn write_to( 34 | &mut self, 35 | element: &Self::InputElementType<'_>, 36 | bucket: &mut Vec, 37 | _extra_data: &Self::ExtraData, 38 | _: &Self::ExtraDataBuffer, 39 | ) { 40 | encode_varint(|b| bucket.write_all(b), element.bucket as u64).unwrap(); 41 | encode_varint(|b| bucket.write_all(b), element.entry).unwrap(); 42 | } 43 | 44 | fn read_from<'a, S: Read>( 45 | &mut self, 46 | mut stream: S, 47 | _read_buffer: &'a mut Self::ReadBuffer, 48 | _: &mut Self::ExtraDataBuffer, 49 | ) -> Option> { 50 | let bucket = decode_varint(|| stream.read_u8().ok())?; 51 | let entry = decode_varint(|| stream.read_u8().ok())?; 52 | Some(LinkMapping { 53 | bucket: bucket as BucketIndexType, 54 | entry, 55 | }) 56 | } 57 | 58 | #[inline(always)] 59 | fn get_size(&self, _element: &Self::InputElementType<'_>, _: &()) -> usize { 60 | VARINT_MAX_SIZE * 2 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /crates/assembler_kmers_merge/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_assembler_kmerge" 3 | version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | 7 | # Config 8 | config = { package = "ggcat_config", path = "../config" } 9 | utils = { package = "ggcat_utils", path = "../utils" } 10 | 11 | # Common libraries 12 | parallel-processor = "0.1.24" 13 | streaming-libdeflate-rs = "0.1.5" 14 | nightly-quirks = "0.1.4" 15 | 16 | # Common pipeline libraries 17 | hashes = { package = "ggcat_hashes", path = "../hashes" } 18 | io = { package = "ggcat_io", path = "../io" } 19 | colors = { package = "ggcat_colors", path = "../colors" } 20 | structs = { package = "ggcat_structs", path = "../structs" } 21 | kmers_transform = { package = 
"ggcat_kmers_transform", path = "../kmers_transform" } 22 | minimizer_bucketing = { package = "ggcat_minimizer_bucketing", path = "../minimizer_bucketing" } 23 | 24 | # Dependencies 25 | assembler_minimizer_bucketing = { package = "ggcat_assembler_minibuck", path = "../assembler_minimizer_bucketing" } 26 | 27 | 28 | # Other crates 29 | hashbrown = "0.14.5" 30 | typenum = "1.17.0" 31 | crossbeam = "0.8.4" 32 | 33 | # Profiler 34 | instrumenter = "0.1.3" 35 | parking_lot = "0.12.3" 36 | ggcat-logging = { version = "2.0.0", path = "../logging" } 37 | rustc-hash = "2.1.0" 38 | 39 | [dev-dependencies] 40 | fdlimit = "0.3.0" 41 | rayon = "1.10.0" 42 | 43 | 44 | [features] 45 | support_kmer_counters = [ 46 | "colors/support_kmer_counters", 47 | "structs/support_kmer_counters", 48 | ] 49 | -------------------------------------------------------------------------------- /crates/assembler_kmers_merge/src/structs.rs: -------------------------------------------------------------------------------- 1 | use config::BucketIndexType; 2 | use io::concurrent::temp_reads::creads_utils::{ 3 | CompressedReadsBucketData, CompressedReadsBucketDataSerializer, NoMultiplicity, NoSecondBucket, 4 | }; 5 | use io::concurrent::temp_reads::extra_data::{ 6 | SequenceExtraDataConsecutiveCompression, SequenceExtraDataTempBufferManagement, 7 | }; 8 | use parallel_processor::buckets::bucket_writer::BucketItemSerializer; 9 | use parallel_processor::buckets::writers::compressed_binary_writer::CompressedBinaryWriter; 10 | use parallel_processor::buckets::{LockFreeBucket, SingleBucket}; 11 | use std::marker::PhantomData; 12 | #[cfg(feature = "support_kmer_counters")] 13 | use structs::unitigs_counters::UnitigsCounters; 14 | use utils::owned_drop::OwnedDrop; 15 | 16 | pub struct ResultsBucket { 17 | pub read_index: u64, 18 | pub reads_writer: OwnedDrop, 19 | pub temp_buffer: Vec, 20 | pub bucket_index: BucketIndexType, 21 | pub serializer: CompressedReadsBucketDataSerializer< 22 | PartialUnitigExtraData, 
23 | typenum::U0, 24 | NoSecondBucket, 25 | NoMultiplicity, 26 | >, 27 | pub _phantom: PhantomData, 28 | } 29 | 30 | #[derive(Clone, Debug)] 31 | pub struct PartialUnitigExtraData { 32 | #[cfg(feature = "support_kmer_counters")] 33 | pub counters: UnitigsCounters, 34 | pub colors: X, 35 | } 36 | 37 | impl SequenceExtraDataTempBufferManagement 38 | for PartialUnitigExtraData 39 | { 40 | type TempBuffer = X::TempBuffer; 41 | 42 | fn new_temp_buffer() -> Self::TempBuffer { 43 | X::new_temp_buffer() 44 | } 45 | 46 | fn clear_temp_buffer(buffer: &mut Self::TempBuffer) { 47 | X::clear_temp_buffer(buffer) 48 | } 49 | 50 | fn copy_temp_buffer(dest: &mut Self::TempBuffer, src: &Self::TempBuffer) { 51 | X::copy_temp_buffer(dest, src) 52 | } 53 | 54 | fn copy_extra_from(extra: Self, src: &Self::TempBuffer, dst: &mut Self::TempBuffer) -> Self { 55 | Self { 56 | colors: X::copy_extra_from(extra.colors, src, dst), 57 | #[cfg(feature = "support_kmer_counters")] 58 | counters: extra.counters, 59 | } 60 | } 61 | } 62 | 63 | impl SequenceExtraDataConsecutiveCompression 64 | for PartialUnitigExtraData 65 | { 66 | type LastData = X::LastData; 67 | 68 | fn decode_extended( 69 | buffer: &mut Self::TempBuffer, 70 | reader: &mut impl std::io::Read, 71 | last_data: Self::LastData, 72 | ) -> Option { 73 | let color = X::decode_extended(buffer, reader, last_data)?; 74 | #[cfg(feature = "support_kmer_counters")] 75 | let counter = UnitigsCounters::decode_extended(&mut (), reader, ())?; 76 | Some(Self { 77 | colors: color, 78 | #[cfg(feature = "support_kmer_counters")] 79 | counters: counter, 80 | }) 81 | } 82 | 83 | fn encode_extended( 84 | &self, 85 | buffer: &Self::TempBuffer, 86 | writer: &mut impl std::io::Write, 87 | last_data: Self::LastData, 88 | ) { 89 | self.colors.encode_extended(buffer, writer, last_data); 90 | #[cfg(feature = "support_kmer_counters")] 91 | self.counters.encode_extended(&(), writer, ()); 92 | } 93 | 94 | fn obtain_last_data(&self, last_data: Self::LastData) -> 
Self::LastData { 95 | self.colors.obtain_last_data(last_data) 96 | } 97 | 98 | fn max_size(&self) -> usize { 99 | self.colors.max_size() 100 | + match () { 101 | #[cfg(feature = "support_kmer_counters")] 102 | () => self.counters.max_size(), 103 | #[cfg(not(feature = "support_kmer_counters"))] 104 | () => 0, 105 | } 106 | } 107 | } 108 | 109 | impl ResultsBucket { 110 | pub fn add_read( 111 | &mut self, 112 | el: PartialUnitigExtraData, 113 | read: &[u8], 114 | extra_buffer: &X::TempBuffer, 115 | ) -> u64 { 116 | self.temp_buffer.clear(); 117 | self.serializer.write_to( 118 | &CompressedReadsBucketData::new(read, 0, 0), 119 | &mut self.temp_buffer, 120 | &el, 121 | extra_buffer, 122 | ); 123 | self.reads_writer.write_data(self.temp_buffer.as_slice()); 124 | 125 | let read_index = self.read_index; 126 | self.read_index += 1; 127 | read_index 128 | } 129 | 130 | pub fn get_bucket_index(&self) -> BucketIndexType { 131 | self.bucket_index 132 | } 133 | } 134 | 135 | impl Drop for ResultsBucket { 136 | fn drop(&mut self) { 137 | unsafe { self.reads_writer.take().finalize() } 138 | } 139 | } 140 | 141 | pub struct RetType { 142 | pub sequences: Vec, 143 | pub hashes: Vec, 144 | } 145 | -------------------------------------------------------------------------------- /crates/assembler_minimizer_bucketing/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_assembler_minibuck" 3 | version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | 7 | # Config 8 | config = { package = "ggcat_config", path = "../config" } 9 | 10 | # Common libraries 11 | parallel-processor = "0.1.24" 12 | streaming-libdeflate-rs = "0.1.5" 13 | 14 | # Static dispatch 15 | dynamic-dispatch = "0.5.4" 16 | 17 | # Common pipeline libraries 18 | hashes = { package = "ggcat_hashes", path = "../hashes" } 19 | io = { package = "ggcat_io", path = "../io" } 20 | colors = { package = "ggcat_colors", path = "../colors" } 21 | structs = { 
package = "ggcat_structs", path = "../structs" } 22 | minimizer_bucketing = { package = "ggcat_minimizer_bucketing", path = "../minimizer_bucketing" } 23 | 24 | # Other libraries 25 | typenum = "1.17.0" 26 | 27 | [features] 28 | devel-build = [] 29 | -------------------------------------------------------------------------------- /crates/assembler_minimizer_bucketing/src/rewrite_bucket.rs: -------------------------------------------------------------------------------- 1 | use config::BucketIndexType; 2 | use config::MultiplicityCounterType; 3 | use config::READ_FLAG_INCL_END; 4 | use hashes::default::MNHFactory; 5 | use hashes::ExtendableHashTraitType; 6 | use hashes::HashFunction; 7 | use hashes::{HashFunctionFactory, HashableSequence, MinimizerHashFunctionFactory}; 8 | use io::compressed_read::CompressedRead; 9 | use minimizer_bucketing::resplit_bucket::RewriteBucketCompute; 10 | 11 | pub struct RewriteBucketComputeAssembler; 12 | 13 | impl RewriteBucketCompute for RewriteBucketComputeAssembler { 14 | fn get_rewrite_bucket( 15 | k: usize, 16 | m: usize, 17 | seq_data: &(u8, u8, C, CompressedRead, MultiplicityCounterType), 18 | used_hash_bits: usize, 19 | bucket_bits_count: usize, 20 | ) -> BucketIndexType { 21 | let read = &seq_data.3; 22 | let flags = seq_data.0; 23 | let decr_val = ((read.bases_count() == k) && (flags & READ_FLAG_INCL_END) == 0) as usize; 24 | 25 | let hashes = MNHFactory::new(read.sub_slice((1 - decr_val)..(k - decr_val)), m); 26 | 27 | let minimizer = hashes 28 | .iter() 29 | .min_by_key(|k| MNHFactory::get_full_minimizer(k.to_unextendable())) 30 | .unwrap(); 31 | 32 | MNHFactory::get_bucket( 33 | used_hash_bits, 34 | bucket_bits_count, 35 | minimizer.to_unextendable(), 36 | ) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /crates/capi/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat-cpp-bindings" 3 | version = 
"2.0.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | crate-type = ["staticlib"] 8 | 9 | [dependencies] 10 | cxx = "1.0.124" 11 | ggcat-api = { version = "2.0.0", path = "../api" } 12 | static_assertions = "1.1.0" 13 | 14 | [build-dependencies] 15 | cxx-build = "1.0.124" 16 | -------------------------------------------------------------------------------- /crates/capi/build.rs: -------------------------------------------------------------------------------- 1 | use std::fs::create_dir_all; 2 | 3 | fn main() { 4 | cxx_build::bridge("src/lib.rs") 5 | .flag_if_supported("-std=c++11") 6 | .compile("ggcat_cxx_interop"); 7 | 8 | std::fs::copy( 9 | format!( 10 | "{}{}", 11 | std::env::var("OUT_DIR").unwrap(), 12 | "/cxxbridge/include/ggcat-cpp-bindings/src/lib.rs.h" 13 | ), 14 | "ggcat-cpp-api/include/ggcat-cpp-bindings.hh", 15 | ) 16 | .unwrap(); 17 | 18 | let _ = create_dir_all("ggcat-cpp-api/lib"); 19 | 20 | std::fs::copy( 21 | format!( 22 | "{}{}", 23 | std::env::var("OUT_DIR").unwrap(), 24 | "/libggcat_cxx_interop.a" 25 | ), 26 | "ggcat-cpp-api/lib/libggcat_cxx_interop.a", 27 | ) 28 | .expect(&format!( 29 | "Cannot copy file: '{}'", 30 | format!( 31 | "{}{}", 32 | std::env::var("OUT_DIR").unwrap(), 33 | "/libggcat_cxx_interop.a" 34 | ) 35 | )); 36 | 37 | println!("cargo:rerun-if-changed=src/lib.rs"); 38 | } 39 | -------------------------------------------------------------------------------- /crates/capi/ggcat-cpp-api/.gitignore: -------------------------------------------------------------------------------- 1 | lib/ 2 | build/ 3 | -------------------------------------------------------------------------------- /crates/capi/ggcat-cpp-api/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all ggcat-capi ggcat-source 2 | 3 | all: lib/libggcat_api.a 4 | 5 | clean: 6 | cargo clean 7 | rm -r build/ lib/ 8 | 9 | lib/libggcat_api.a: ./lib/libggcat_cpp_bindings.a 10 | mkdir -p build/ 11 | $(CXX) -std=c++11 -fPIE -O3 
-Wno-unused-command-line-argument -I./include -I./src -c ./src/ggcat.cc -lggcat_cpp_bindings -lggcat_cxx_interop -o build/ggcat.o -Wall -Wextra -Werror 12 | ar cr lib/libggcat_api.a build/ggcat.o 13 | 14 | ./lib/libggcat_cpp_bindings.a: ggcat-source 15 | cargo build --release --package ggcat-cpp-bindings 16 | cp ../../../target/release/libggcat_cpp_bindings.a ./lib/ 17 | -------------------------------------------------------------------------------- /crates/capi/ggcat-cpp-api/example/.gitignore: -------------------------------------------------------------------------------- 1 | ggcat-example 2 | -------------------------------------------------------------------------------- /crates/capi/ggcat-cpp-api/example/Makefile: -------------------------------------------------------------------------------- 1 | 2 | .PHONY: all ggcat-capi ggcat-source 3 | 4 | all: ggcat-example 5 | 6 | ggcat-example: ../lib/libggcat_cpp_bindings.a ../lib/libggcat_cxx_interop.a ../lib/libggcat_api.a main.cc 7 | g++ -flto -std=c++11 -O3 -I../include -L../lib main.cc -lggcat_api -lggcat_cpp_bindings -lggcat_cxx_interop -o ggcat-example -lpthread -ldl 8 | -------------------------------------------------------------------------------- /crates/capi/ggcat-cpp-api/example/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace ggcat; 7 | 8 | class MemorySequencesReader : public ggcat::StreamReader 9 | { 10 | public: 11 | MemorySequencesReader() {} 12 | 13 | static uint64_t estimated_base_count(void *block) 14 | { 15 | return 10; 16 | } 17 | 18 | // This is a toy example where the block corresponds to a single sequence. 19 | // In practice a block should be a chunk of sequences, e.g. a pointer to a file descriptor 20 | // that can be read to retrieve the sequences. 
21 | void read_block( 22 | void *block, 23 | bool copy_ident_data, 24 | size_t partial_read_copyback, 25 | void (*callback)(DnaSequence sequence, SequenceInfo info)) override 26 | { 27 | const char *seq = (const char *)block; 28 | static uint32_t color_idx = 0; 29 | 30 | callback(DnaSequence{ 31 | // To build a graph the ident_data can be empty 32 | .ident_data = Slice(nullptr, 0), 33 | .seq = Slice((char *)seq, strlen(seq)), 34 | }, 35 | SequenceInfo{ 36 | // This is the index of the color of the current sequence 37 | .color = (color_idx++) % 3, 38 | }); 39 | } 40 | }; 41 | 42 | void log_message(MessageLevel level, const char *message) 43 | { 44 | switch (level) 45 | { 46 | case MessageLevel_Info: 47 | { 48 | printf("Info: %s\n", message); 49 | } 50 | break; 51 | case MessageLevel_Warning: 52 | { 53 | printf("Warning: %s\n", message); 54 | } 55 | break; 56 | case MessageLevel_Error: 57 | { 58 | printf("Error: %s\n", message); 59 | } 60 | case MessageLevel_UnrecoverableError: 61 | { 62 | printf("Unrecoverable error: %s\n", message); 63 | abort(); 64 | } 65 | } 66 | } 67 | 68 | int main(int argc, char const *argv[]) 69 | { 70 | GGCATConfig config; 71 | 72 | config.use_temp_dir = true; 73 | config.temp_dir = "/tmp", 74 | config.memory = 2.0, 75 | config.prefer_memory = true, 76 | config.total_threads_count = 16, 77 | config.intermediate_compression_level = -1, 78 | 79 | config.use_stats_file = false; 80 | config.stats_file = ""; 81 | 82 | config.messages_callback = log_message; 83 | 84 | GGCATInstance *instance = GGCATInstance::create(config); 85 | 86 | std::vector input_files = { 87 | "../../../../example-inputs/sal1.fa", 88 | "../../../../example-inputs/sal2.fa", 89 | "../../../../example-inputs/sal3.fa", 90 | }; 91 | std::string graph_file = "/tmp/sal-dbg.fa"; 92 | 93 | std::vector color_names = { 94 | "sal1.fa", 95 | "sal2.fa", 96 | "sal3.fa", 97 | }; 98 | 99 | size_t k = 31; 100 | size_t threads_count = 16; 101 | 102 | std::string output_file = 
instance->build_graph_from_files( 103 | Slice(input_files.data(), input_files.size()), 104 | graph_file, 105 | k, 106 | threads_count, 107 | false, 108 | 1, 109 | ExtraElaborationStep_UnitigLinks, 110 | true, 111 | Slice(color_names.data(), color_names.size()), 112 | -1); 113 | 114 | std::string input_query = "../../../../example-inputs/query.fa"; 115 | 116 | std::string output_query = instance->query_graph( 117 | graph_file, 118 | input_query, 119 | "/tmp/query-results", 120 | k, 121 | threads_count, 122 | false, 123 | true, 124 | ColoredQueryOutputFormat_JsonLinesWithNames); 125 | 126 | std::cout << "Output query file: " << output_query << std::endl; 127 | 128 | std::mutex print_kmer_lock; 129 | 130 | auto file_color_names = 131 | GGCATInstance::dump_colors(GGCATInstance::get_colormap_file(graph_file)); 132 | 133 | instance->dump_unitigs( 134 | graph_file, 135 | k, 136 | threads_count, 137 | false, 138 | // WARNING: this function is called asynchronously from multiple threads, so it must be thread-safe. 
139 | // Also the same_colors boolean is referred to the previous call of this function from the current thread 140 | [&](Slice read, Slice colors, bool same_colors) 141 | { 142 | std::lock_guard _lock(print_kmer_lock); 143 | if (read.size < 100) 144 | { 145 | std::cout << "Dump unitig '"; 146 | std::cout.write(read.data, read.size); 147 | std::cout << "'" << std::endl; 148 | } 149 | else 150 | { 151 | std::cout << "Dump unitig '"; 152 | std::cout.write(read.data, 100); 153 | std::cout << "...'" << std::endl; 154 | } 155 | 156 | std::cout << "\t colors: ["; 157 | 158 | for (size_t i = 0; i < colors.size; i++) 159 | { 160 | if (i > 0) 161 | { 162 | std::cout << ", "; 163 | } 164 | std::cout << '"' << file_color_names[colors.data[i]] << '"'; 165 | } 166 | 167 | std::cout << "] same_colors: " << same_colors << std::endl; 168 | }, 169 | true); 170 | 171 | auto colormap = GGCATInstance::get_colormap_file(graph_file); 172 | 173 | uint32_t query_subsets[] = {0, 1, 2, 3, 4}; 174 | instance->query_colormap(colormap, query_subsets, 5, true, [&](uint32_t subset, Slice colors) 175 | { 176 | std::cout << "Subset: " << subset << " has colors:"; 177 | for (auto color : colors) { 178 | std::cout << " "; 179 | std::cout << color << "[" << file_color_names[color] << "]"; 180 | } 181 | std::cout << std::endl; }); 182 | 183 | const char *sequences[] = {"AAAAACACACATATACAGTGTGTGAGTAGTATGATGT", "AAAATTTTTTTTTTTGGGGGGGGGGACACACATATACA", "AAAAACACACATATACACCCCCGGGAAAAAC", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"}; 184 | 185 | // Advanced building using in-memory data 186 | instance->build_graph_from_streams( 187 | Slice((void **)sequences, 4), 188 | "/tmp/advanced-building.fa", 189 | k, 190 | threads_count, 191 | false, 192 | 1, 193 | ExtraElaborationStep_None, 194 | true, 195 | Slice(color_names.data(), color_names.size())); 196 | 197 | return 0; 198 | } 199 | -------------------------------------------------------------------------------- /crates/cmdline/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Andrea Cracco "] 3 | edition = "2021" 4 | name = "ggcat_cmdline" 5 | version = "2.0.0" 6 | 7 | [[bin]] 8 | name = "ggcat" 9 | path = "src/main.rs" 10 | 11 | [dependencies] 12 | bincode = "1.3.3" 13 | bstr = "1.9.1" 14 | byteorder = "1.5.0" 15 | itertools = "0.13.0" 16 | lazy_static = "1.5.0" 17 | lz4 = "1.25.0" 18 | rayon = "1.10.0" 19 | serde = "1.0.203" 20 | structopt = "0.3.26" 21 | hashbrown = "0.14.5" 22 | rand = "0.8.5" 23 | libc = "0.2.155" 24 | filebuffer = "1.0.0" 25 | crossbeam = "0.8.4" 26 | fdlimit = "0.3.0" 27 | parking_lot = "0.12.3" 28 | dashmap = "6.0.1" 29 | serde_json = "1.0.118" 30 | ahash = "0.8.11" 31 | siphasher = "1.0.1" 32 | desse = "0.2.1" 33 | replace_with = "0.1.7" 34 | roaring = "0.10.5" 35 | backtrace = "0.3.73" 36 | csv = "1.3.0" 37 | flate2 = { default-features = false, features = [ 38 | "cloudflare_zlib", 39 | ], version = "1.0.30" } 40 | typenum = "1.17.0" 41 | fs_extra = "1.3.0" 42 | atoi = "2.0.0" 43 | 44 | # Config 45 | config = { package = "ggcat_config", path = "../config" } 46 | utils = { package = "ggcat_utils", path = "../utils" } 47 | 48 | # Static dispatch 49 | dynamic-dispatch = "0.5.4" 50 | 51 | 52 | # Common libraries 53 | parallel-processor = "0.1.24" 54 | io = { package = "ggcat_io", path = "../io" } 55 | colors = { package = "ggcat_colors", path = "../colors" } 56 | 57 | assembler = { package = "ggcat_assembler", path = "../assembler" } 58 | instrumenter = "0.1.3" 59 | 60 | querier = { package = "ggcat_querier", path = "../querier" } 61 | ggcat-api = { version = "2.0.0", path = "../api" } 62 | ggcat-logging = { version = "2.0.0", path = "../logging" } 63 | 64 | 65 | [features] 66 | mem-analysis = ["parallel-processor/track-usage"] 67 | no-stats = ["parallel-processor/no-stats"] 68 | process-stats = ["parallel-processor/process-stats", "ggcat-logging/stats"] 69 | tracing = ["instrumenter/enabled"] 70 | 
devel-build = ["assembler/devel-build", "querier/devel-build"] 71 | kmer-counters = ["assembler/support_kmer_counters"] 72 | 73 | [build-dependencies] 74 | make-cmd = "0.1.0" 75 | 76 | [dev-dependencies] 77 | hashes = { package = "ggcat_hashes", path = "../hashes" } 78 | -------------------------------------------------------------------------------- /crates/cmdline/src/benchmarks.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use byteorder::ReadBytesExt; 4 | use hashes::fw_nthash::ForwardNtHashIterator; 5 | use hashes::HashFunction; 6 | use io::compressed_read::CompressedRead; 7 | use std::io::Cursor; 8 | use test::Bencher; 9 | use utils::Utils; 10 | 11 | const TEST_SIZE: usize = 10000000; 12 | 13 | type VecType = u8; 14 | 15 | #[bench] 16 | fn bench_loop_vec(b: &mut Bencher) { 17 | let mut vec = Vec::with_capacity(TEST_SIZE); 18 | for i in 0..TEST_SIZE { 19 | vec.push(i as VecType); 20 | } 21 | let mut sum = 0; 22 | 23 | b.iter(|| { 24 | sum = 0; 25 | for i in 0..TEST_SIZE { 26 | sum += vec[i] as usize; 27 | } 28 | }); 29 | 30 | assert_ne!(sum, 49999995000000); 31 | } 32 | 33 | #[bench] 34 | fn bench_loop_optimized(b: &mut Bencher) { 35 | let mut vec = Vec::with_capacity(TEST_SIZE); 36 | for i in 0..TEST_SIZE { 37 | vec.push(i as VecType); 38 | } 39 | let mut sum = 0; 40 | 41 | b.iter(|| { 42 | sum = 0; 43 | let ptr = vec.as_ptr(); 44 | unsafe { 45 | for i in 0..TEST_SIZE { 46 | sum += (*ptr.add(i)) as usize; 47 | } 48 | } 49 | }); 50 | 51 | assert_ne!(sum, 49999995000000); 52 | } 53 | 54 | #[bench] 55 | fn bench_iter_vec(b: &mut Bencher) { 56 | let mut vec = Vec::with_capacity(TEST_SIZE); 57 | for i in 0..TEST_SIZE { 58 | vec.push(i as VecType); 59 | } 60 | let mut sum = 0; 61 | 62 | b.iter(|| { 63 | sum = 0; 64 | for x in vec.iter() { 65 | sum += *x as usize; 66 | } 67 | }); 68 | 69 | assert_ne!(sum, 49999995000000); 70 | } 71 | 72 | #[bench] 73 | fn bench_cursor_vec(b: &mut Bencher) { 
74 | let mut vec = Vec::with_capacity(TEST_SIZE); 75 | for i in 0..TEST_SIZE { 76 | vec.push(i as u8); 77 | } 78 | let mut sum = 0; 79 | 80 | b.iter(|| { 81 | sum = 0; 82 | let mut cursor = Cursor::new(&vec); 83 | while let Ok(byte) = cursor.read_u8() { 84 | sum += byte as usize; 85 | } 86 | }); 87 | 88 | assert_ne!(sum, 49999995000000); 89 | } 90 | 91 | #[test] 92 | fn test_nthash() { 93 | // TGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGGG ==> TATGTATATATATATATATATATATATATATATATATATATATATATATATATATATATGTGT 94 | // let str0 = b"GNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNG"; 95 | // let str1 = b"TNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNT"; 96 | 97 | // let str0 = b"GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATTT"; 98 | // let str1 = b"TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATTG"; 99 | 100 | // let str0 = b"NGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNN"; 101 | // let str1 = b"NTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNN"; 102 | 103 | let str0 = b"TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; 104 | let str1 = b"TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT"; 105 | 106 | let hash = ForwardNtHashIterator::new(&str0[..], 15).unwrap(); 107 | println!("{:?}", hash.iter().collect::>()); 108 | let hash1 = ForwardNtHashIterator::new(&str1[..], 15).unwrap(); 109 | println!("{:?}", hash1.iter().collect::>()) 110 | } 111 | 112 | #[test] 113 | fn test_seqhash() { 114 | // TGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGGG ==> TATGTATATATATATATATATATATATATATATATATATATATATATATATATATATATGTGT 115 | // let str0 = b"GNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNG"; 116 | // let str1 = b"TNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNT"; 117 | 118 | // let str0 = b"GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATTT"; 119 | // let str1 = 
b"TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATTG"; 120 | 121 | // let str0 = b"NGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNNGNNN"; 122 | // let str1 = b"NTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNNTNNN"; 123 | 124 | // let str0 = b"TTTCTTTTTTTTTTTTTTTAATTTTGAGACAA"; 125 | // let str1 = b"TTTCTTTTTTTTTTTTTTTAATTTTGCCCCAATTTCTTTTTTTTTTTTTTTAATTTTGAGACAA"; 126 | // 127 | // let hash = SeqHashIterator::new(&str0[..], 32).unwrap(); 128 | // println!("{:?}", hash.iter().collect::>()); 129 | // let hash1 = SeqHashIterator::new(&str1[..], 32).unwrap(); 130 | // println!("{:?}", hash1.iter().collect::>()); 131 | 132 | let a: Vec<_> = (&b"GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC"[..]) 133 | .iter() 134 | .map(|x| Utils::compress_base(*x)) 135 | .collect(); 136 | let _b: Vec<_> = (&b"CACCCACCCATTCACCTATCCATCCATCCAACCGTCCATCTGTTCATTC"[..]) 137 | .iter() 138 | .map(|x| Utils::compress_base(*x)) 139 | .collect(); 140 | 141 | let ha: Vec<_> = ForwardNtHashIterator::new(&a[..], 15) 142 | .unwrap() 143 | .iter() 144 | .collect(); 145 | // let hb: Vec<_> = SeqHashIterator::new(&b[..], 31).unwrap().iter().collect(); 146 | 147 | // let hc = SeqHashFactory::manual_roll_forward(ha, 32, a[0], *b.last().unwrap()); 148 | 149 | println!("X {:?}", ha); 150 | // println!("Y {:?}", hb); 151 | // println!("{:b}", hc); 152 | } 153 | 154 | #[test] 155 | fn test_comprread() { 156 | let vec = vec![0x23, 0x47, 0xFA, 0x7D, 0x59, 0xFF, 0xA1, 0x84]; 157 | 158 | let read1 = CompressedRead::from_compressed_reads(&vec[..], 0, 32).sub_slice(1..32); 159 | let read12 = CompressedRead::from_compressed_reads(&vec[..], 1, 31).sub_slice(0..31); 160 | println!("{}", read1.to_string()); 161 | println!("{}", read12.to_string()); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /crates/cmdline/src/cmd_utils.rs: -------------------------------------------------------------------------------- 1 | mod cmd_rewrite; 2 | 3 | use 
crate::cmd_utils::cmd_rewrite::{cmd_rewrite, CmdRewriteArgs}; 4 | use structopt::StructOpt; 5 | 6 | #[derive(StructOpt, Debug)] 7 | pub enum CmdUtilsArgs { 8 | Rewrite(CmdRewriteArgs), 9 | } 10 | 11 | pub fn process_cmdutils(args: CmdUtilsArgs) { 12 | match args { 13 | CmdUtilsArgs::Rewrite(args) => { 14 | cmd_rewrite(args); 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /crates/cmdline/src/cmd_utils/cmd_rewrite.rs: -------------------------------------------------------------------------------- 1 | use io::reads_writer::ReadsWriter; 2 | use io::sequences_reader::SequencesReader; 3 | use rayon::prelude::*; 4 | use std::fs::File; 5 | use std::io::{BufRead, BufReader}; 6 | use std::path::PathBuf; 7 | use structopt::StructOpt; 8 | 9 | #[derive(StructOpt, Debug)] 10 | pub struct CmdRewriteArgs { 11 | /// The input files 12 | pub input: Vec, 13 | 14 | /// The lists of input files 15 | #[structopt(short = "l", long = "input-lists")] 16 | pub input_lists: Vec, 17 | 18 | #[structopt(short = "o", long = "output-path")] 19 | pub output_path: PathBuf, 20 | } 21 | 22 | pub fn cmd_rewrite(args: CmdRewriteArgs) { 23 | let mut files_list = Vec::new(); 24 | 25 | files_list.extend(args.input.into_iter()); 26 | 27 | for list in args.input_lists { 28 | let file_list = BufReader::new(File::open(list).unwrap()); 29 | 30 | files_list.extend(file_list.lines().map(|l| PathBuf::from(l.unwrap()))); 31 | } 32 | 33 | files_list.par_iter().for_each(|x| { 34 | let new_file_path = args.output_path.join(x.file_name().unwrap()); 35 | 36 | let mut writer = ReadsWriter::new_compressed_gzip(new_file_path, 9); 37 | 38 | SequencesReader::process_file_extended( 39 | x, 40 | |f| { 41 | writer.add_read(f); 42 | }, 43 | None, 44 | true, 45 | false, 46 | ) 47 | }); 48 | } 49 | -------------------------------------------------------------------------------- /crates/colors/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_colors" 3 | version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | 7 | # Config 8 | config = { package = "ggcat_config", path = "../config" } 9 | 10 | # Static dispatch 11 | dynamic-dispatch = "0.5.4" 12 | 13 | # Common libraries 14 | parallel-processor = "0.1.24" 15 | streaming-libdeflate-rs = "0.1.5" 16 | nightly-quirks = "0.1.4" 17 | 18 | # Common pipeline libraries 19 | hashes = { package = "ggcat_hashes", path = "../hashes" } 20 | io = { package = "ggcat_io", path = "../io" } 21 | structs = { package = "ggcat_structs", path = "../structs" } 22 | 23 | # Other libraries 24 | typenum = "1.17.0" 25 | hashbrown = "0.14.5" 26 | dashmap = "6.0.1" 27 | byteorder = "1.5.0" 28 | siphasher = "1.0.1" 29 | desse = "0.2.1" 30 | replace_with = "0.1.7" 31 | bstr = "1.9.1" 32 | atoi = "2.0.0" 33 | crossbeam = "0.8.4" 34 | rayon = "1.10.0" 35 | rand = "0.8.5" 36 | roaring = "0.10.5" 37 | parking_lot = "0.12.3" 38 | serde = "1.0.203" 39 | lz4 = "1.25.0" 40 | bincode = "1.3.3" 41 | itertools = "0.13.0" 42 | ggcat-logging = { version = "2.0.0", path = "../logging" } 43 | anyhow = "1.0.89" 44 | rustc-hash = "2.1.0" 45 | 46 | [features] 47 | support_kmer_counters = [] 48 | -------------------------------------------------------------------------------- /crates/colors/src/bundles.rs: -------------------------------------------------------------------------------- 1 | pub mod graph_querying; 2 | pub mod multifile_building; 3 | -------------------------------------------------------------------------------- /crates/colors/src/bundles/graph_querying.rs: -------------------------------------------------------------------------------- 1 | use crate::colors_manager::ColorsManager; 2 | use crate::managers::single::SingleColorManager; 3 | use crate::parsers::graph::GraphColorsParser; 4 | use config::{BucketIndexType, ColorIndexType, COLORS_SINGLE_BATCH_SIZE}; 5 | use 
/// Color-handling configuration used when querying an already-built colored
/// graph: each k-mer carries a single color-subset index (`ColorIndexType`).
#[derive(Copy, Clone, Debug)]
pub struct ColorBundleGraphQuerying;

#[dynamic_dispatch]
impl ColorsManager for ColorBundleGraphQuerying {
    const COLORS_ENABLED: bool = true;
    // One colormap-subset index per k-mer.
    type SingleKmerColorDataType = ColorIndexType;

    /// Maps a color index to its owning bucket by delegating to the shared
    /// u64-based helper with `COLORS_SINGLE_BATCH_SIZE` as the batch size
    /// (presumably consecutive colors are grouped per bucket — confirm in
    /// `get_bucket_from_u64_color`).
    #[inline(always)]
    fn get_bucket_from_color(
        color: &Self::SingleKmerColorDataType,
        colors_count: u64,
        buckets_count_log: u32,
    ) -> BucketIndexType {
        Self::get_bucket_from_u64_color(
            *color as u64,
            colors_count,
            buckets_count_log,
            COLORS_SINGLE_BATCH_SIZE,
        )
    }

    type ColorsParserType = GraphColorsParser;
    type ColorsMergeManagerType = SingleColorManager;
}
-------------------------------------------------------------------------------- /crates/colors/src/colors_memmap_writer.rs: -------------------------------------------------------------------------------- 1 | // use crate::storage::roaring::ColorsStorage; 2 | use crate::storage::serializer::ColorsSerializer; 3 | use crate::storage::ColorsSerializerTrait; 4 | use config::ColorIndexType; 5 | use dashmap::DashMap; 6 | use hashes::dummy_hasher::DummyHasherBuilder; 7 | use rand::{thread_rng, RngCore}; 8 | use siphasher::sip128::{Hasher128, SipHasher13}; 9 | use std::hash::Hash; 10 | use std::path::Path; 11 | 12 | pub struct ColorsMemMapWriter { 13 | colors: DashMap, 14 | colors_storage: ColorsSerializer, 15 | hash_keys: (u64, u64), 16 | } 17 | 18 | impl ColorsMemMapWriter { 19 | pub fn new(file: impl AsRef, color_names: &[String]) -> anyhow::Result { 20 | let mut rng = thread_rng(); 21 | Ok(Self { 22 | colors: DashMap::with_hasher_and_shard_amount( 23 | DummyHasherBuilder, 24 | // Increase the number of shards to decrease stall while inserting new colors 25 | rayon::current_num_threads() * 8, 26 | ), 27 | colors_storage: ColorsSerializer::new(file, color_names)?, 28 | hash_keys: (rng.next_u64(), rng.next_u64()), 29 | }) 30 | } 31 | 32 | fn hash_colors(&self, colors: &[ColorIndexType]) -> u128 { 33 | let mut hasher = SipHasher13::new_with_keys(self.hash_keys.0, self.hash_keys.1); 34 | colors.hash(&mut hasher); 35 | hasher.finish128().as_u128() 36 | } 37 | 38 | #[inline(always)] 39 | pub fn get_id(&self, colors: &[ColorIndexType]) -> ColorIndexType { 40 | let hash = self.hash_colors(colors); 41 | 42 | match self.colors.entry(hash) { 43 | dashmap::Entry::Occupied(occupied_entry) => *occupied_entry.get(), 44 | dashmap::Entry::Vacant(vacant_entry) => { 45 | let color = self.colors_storage.serialize_colors(colors); 46 | vacant_entry.insert(color); 47 | color 48 | } 49 | } 50 | } 51 | 52 | pub fn print_stats(&self) { 53 | self.colors_storage.print_stats(); 54 | } 55 | } 56 | 
-------------------------------------------------------------------------------- /crates/colors/src/lib.rs: -------------------------------------------------------------------------------- 1 | use crate::storage::run_length::RunLengthColorsSerializer; 2 | 3 | pub mod bundles; 4 | pub mod colors_manager; 5 | pub mod colors_memmap_writer; 6 | pub mod managers; 7 | pub mod non_colored; 8 | pub mod parsers; 9 | pub mod storage; 10 | 11 | pub(crate) mod async_slice_queue; 12 | 13 | pub type DefaultColorsSerializer = RunLengthColorsSerializer; 14 | -------------------------------------------------------------------------------- /crates/colors/src/managers.rs: -------------------------------------------------------------------------------- 1 | pub mod multiple; 2 | pub mod single; 3 | -------------------------------------------------------------------------------- /crates/colors/src/parsers.rs: -------------------------------------------------------------------------------- 1 | use config::ColorIndexType; 2 | 3 | pub mod graph; 4 | pub mod separate; 5 | 6 | pub enum SequenceIdent<'a> { 7 | FASTA(&'a [u8]), 8 | GFA { colors: &'a [u8] }, 9 | } 10 | 11 | pub struct SingleSequenceInfo<'a> { 12 | pub static_color: ColorIndexType, 13 | pub sequence_ident: SequenceIdent<'a>, 14 | } 15 | -------------------------------------------------------------------------------- /crates/colors/src/parsers/separate.rs: -------------------------------------------------------------------------------- 1 | use crate::colors_manager::{ColorsParser, MinimizerBucketingSeqColorData}; 2 | use crate::parsers::SingleSequenceInfo; 3 | use byteorder::ReadBytesExt; 4 | use config::ColorIndexType; 5 | use io::concurrent::temp_reads::extra_data::{ 6 | HasEmptyExtraBuffer, SequenceExtraDataConsecutiveCompression, 7 | }; 8 | use io::varint::{decode_varint, encode_varint, VARINT_MAX_SIZE}; 9 | use std::io::{Read, Write}; 10 | use std::ops::Range; 11 | 12 | #[derive(Copy, Clone, Debug, Eq, PartialEq)] 13 | pub 
struct MinBkSingleColor(ColorIndexType); 14 | 15 | impl Default for MinBkSingleColor { 16 | fn default() -> Self { 17 | Self(ColorIndexType::MAX) 18 | } 19 | } 20 | 21 | #[inline(always)] 22 | fn decode_minbk_single_color( 23 | get_byte_fn: impl FnMut() -> Option, 24 | last_data: MinBkSingleColor, 25 | ) -> Option { 26 | let color_value = decode_varint(get_byte_fn)? as ColorIndexType; 27 | 28 | Some(if color_value == 0 { 29 | last_data 30 | } else { 31 | MinBkSingleColor(color_value - 1) 32 | }) 33 | } 34 | 35 | impl HasEmptyExtraBuffer for MinBkSingleColor {} 36 | impl SequenceExtraDataConsecutiveCompression for MinBkSingleColor { 37 | type LastData = Self; 38 | 39 | fn decode_from_slice_extended( 40 | _: &mut (), 41 | slice: &[u8], 42 | last_data: Self::LastData, 43 | ) -> Option { 44 | let mut index = 0; 45 | decode_minbk_single_color( 46 | || { 47 | let data = slice[index]; 48 | index += 1; 49 | Some(data) 50 | }, 51 | last_data, 52 | ) 53 | } 54 | 55 | unsafe fn decode_from_pointer_extended( 56 | _: &mut (), 57 | mut ptr: *const u8, 58 | last_data: Self::LastData, 59 | ) -> Option { 60 | decode_minbk_single_color( 61 | || { 62 | let data = *ptr; 63 | ptr = ptr.add(1); 64 | Some(data) 65 | }, 66 | last_data, 67 | ) 68 | } 69 | 70 | fn decode_extended( 71 | _: &mut (), 72 | reader: &mut impl Read, 73 | last_data: Self::LastData, 74 | ) -> Option { 75 | decode_minbk_single_color(|| reader.read_u8().ok(), last_data) 76 | } 77 | 78 | fn encode_extended(&self, _: &(), writer: &mut impl Write, last_data: Self::LastData) { 79 | encode_varint( 80 | |b| writer.write_all(b), 81 | if last_data == *self { 82 | 0 83 | } else { 84 | self.0 as u64 + 1 85 | }, 86 | ) 87 | .unwrap(); 88 | } 89 | 90 | #[inline(always)] 91 | fn max_size(&self) -> usize { 92 | VARINT_MAX_SIZE 93 | } 94 | 95 | #[inline(always)] 96 | fn obtain_last_data(&self, _last_data: Self::LastData) -> Self::LastData { 97 | *self 98 | } 99 | } 100 | 101 | impl MinimizerBucketingSeqColorData for MinBkSingleColor 
{ 102 | type KmerColor = ColorIndexType; 103 | type KmerColorIterator<'a> = std::iter::Repeat; 104 | 105 | fn create(sequence_info: SingleSequenceInfo, _: &mut ()) -> Self { 106 | Self(sequence_info.static_color as ColorIndexType) 107 | } 108 | 109 | fn get_iterator<'a>(&'a self, _: &'a ()) -> Self::KmerColorIterator<'a> { 110 | std::iter::repeat(self.0) 111 | } 112 | 113 | fn get_subslice(&self, _range: Range) -> Self { 114 | *self 115 | } 116 | } 117 | 118 | pub struct SeparateColorsParser; 119 | 120 | impl ColorsParser for SeparateColorsParser { 121 | type SingleKmerColorDataType = ColorIndexType; 122 | type MinimizerBucketingSeqColorDataType = MinBkSingleColor; 123 | } 124 | -------------------------------------------------------------------------------- /crates/colors/src/storage.rs: -------------------------------------------------------------------------------- 1 | use crate::storage::serializer::ColorsFlushProcessing; 2 | use config::ColorIndexType; 3 | use std::io::Read; 4 | 5 | pub mod deserializer; 6 | pub mod roaring; 7 | pub mod run_length; 8 | pub mod serializer; 9 | 10 | pub trait ColorsSerializerTrait: 'static { 11 | const MAGIC: [u8; 16]; 12 | 13 | fn decode_color(reader: impl Read, out_vec: Option<&mut Vec>); 14 | // fn decode_colors(reader: impl Read) -> ; 15 | 16 | fn new(writer: ColorsFlushProcessing, checkpoint_distance: usize, colors_count: u64) -> Self; 17 | fn serialize_colors(&self, colors: &[ColorIndexType]) -> ColorIndexType; 18 | fn get_subsets_count(&self) -> u64; 19 | fn print_stats(&self); 20 | fn finalize(self) -> ColorsFlushProcessing; 21 | } 22 | -------------------------------------------------------------------------------- /crates/colors/src/storage/roaring.rs: -------------------------------------------------------------------------------- 1 | #![allow(warnings)] 2 | use crate::storage::serializer::ColorsFlushProcessing; 3 | use crate::storage::ColorsSerializerTrait; 4 | use config::ColorIndexType; 5 | use 
io::chunks_writer::ChunksWriter; 6 | use parking_lot::Mutex; 7 | use roaring::RoaringBitmap; 8 | use std::io::Read; 9 | use std::sync::atomic::{AtomicU32, Ordering}; 10 | 11 | struct RoaringBitmapInstance { 12 | bitmap: RoaringBitmap, 13 | offset: ColorIndexType, 14 | colors_count: u64, 15 | checkpoint_distance: u64, 16 | stride: ColorIndexType, 17 | last_color: ColorIndexType, 18 | } 19 | 20 | impl RoaringBitmapInstance { 21 | fn new( 22 | colors_count: u64, 23 | checkpoint_distance: u64, 24 | offset: ColorIndexType, 25 | stride: ColorIndexType, 26 | ) -> Self { 27 | todo!("Fix meaning of 'stride'!"); 28 | Self { 29 | bitmap: RoaringBitmap::new(), 30 | offset, 31 | colors_count, 32 | checkpoint_distance, 33 | stride, 34 | last_color: 0, 35 | } 36 | } 37 | 38 | fn try_append( 39 | &mut self, 40 | color_index: ColorIndexType, 41 | colors: impl Iterator, 42 | writer: &ColorsFlushProcessing, 43 | ) -> bool { 44 | let base_color = color_index - self.offset; 45 | 46 | // Another append is in queue and the current is not the first one 47 | if base_color > self.last_color + self.stride { 48 | return false; 49 | } 50 | 51 | self.last_color = base_color; 52 | 53 | assert_eq!(base_color % self.stride, 0); 54 | let strided_color = base_color / self.stride; 55 | 56 | let local_position = strided_color * (self.colors_count as u32); 57 | 58 | self.bitmap 59 | .append(colors.map(|c| local_position + c)) 60 | .unwrap(); 61 | 62 | // Flush the partial bitmap 63 | if strided_color >= self.checkpoint_distance as u32 { 64 | ggcat_logging::info!("Flushing with offset: {}", self.offset); 65 | self.flush(writer); 66 | } 67 | 68 | true 69 | } 70 | 71 | fn flush(&mut self, writer: &ColorsFlushProcessing) { 72 | let mut pdata = writer.start_processing(); 73 | self.bitmap 74 | .serialize_into(writer.get_stream(&mut pdata)) 75 | .unwrap(); 76 | writer.end_processing(pdata, self.offset); 77 | self.offset += self.last_color; 78 | self.last_color = 0; 79 | self.bitmap.clear(); 80 | } 81 | } 82 | 
83 | pub struct RoaringColorsSerializer { 84 | colors_count: u64, 85 | roaring_bitmaps: Vec>, 86 | writer: ColorsFlushProcessing, 87 | colors_index: AtomicU32, 88 | } 89 | 90 | impl ColorsSerializerTrait for RoaringColorsSerializer { 91 | const MAGIC: [u8; 16] = *b"GGCAT_CMAP_ROARG"; 92 | 93 | // FIXME: Implement! 94 | fn decode_color(_reader: impl Read, _out_vec: Option<&mut Vec>) { 95 | todo!() 96 | } 97 | 98 | fn new(writer: ColorsFlushProcessing, checkpoint_distance: usize, colors_count: u64) -> Self { 99 | todo!("Fix meaning of 'stride'!"); 100 | let stride = rayon::current_num_threads() as ColorIndexType; 101 | 102 | Self { 103 | roaring_bitmaps: (0..stride) 104 | .map(|off| { 105 | Mutex::new(RoaringBitmapInstance::new( 106 | colors_count, 107 | checkpoint_distance as u64, 108 | off, 109 | stride, 110 | )) 111 | }) 112 | .collect(), 113 | writer, 114 | colors_index: AtomicU32::new(0), 115 | colors_count, 116 | } 117 | } 118 | 119 | fn serialize_colors(&self, colors: &[ColorIndexType]) -> ColorIndexType { 120 | let color_index = self.colors_index.fetch_add(1, Ordering::Relaxed); 121 | 122 | let target_bitmap = color_index % self.roaring_bitmaps.len() as ColorIndexType; 123 | 124 | loop { 125 | let mut bitmap_lock = self.roaring_bitmaps[target_bitmap as usize].lock(); 126 | if bitmap_lock.try_append(color_index, colors.iter().copied(), &self.writer) { 127 | break; 128 | } 129 | drop(bitmap_lock); 130 | std::thread::yield_now(); 131 | } 132 | 133 | color_index 134 | } 135 | 136 | fn get_subsets_count(&self) -> u64 { 137 | self.colors_index.load(Ordering::Relaxed) as u64 138 | } 139 | 140 | fn print_stats(&self) { 141 | ggcat_logging::info!( 142 | "Subsets count: {} witn {} colors", 143 | self.get_subsets_count(), 144 | self.colors_count 145 | ); 146 | } 147 | 148 | fn finalize(mut self) -> ColorsFlushProcessing { 149 | for bitmap in self.roaring_bitmaps { 150 | bitmap.lock().flush(&mut self.writer); 151 | } 152 | 153 | self.writer 154 | } 155 | } 156 | 
-------------------------------------------------------------------------------- /crates/config/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_config" 3 | version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | 7 | 8 | # Common libraries 9 | parallel-processor = "0.1.24" 10 | parking_lot = "0.12.3" 11 | -------------------------------------------------------------------------------- /crates/config/src/lib.rs: -------------------------------------------------------------------------------- 1 | // use crate::RunLengthColorsSerializer; 2 | use parallel_processor::buckets::writers::compressed_binary_writer::{ 3 | CompressedCheckpointSize, CompressionLevelInfo, 4 | }; 5 | use parallel_processor::memory_data_size::MemoryDataSize; 6 | use parallel_processor::memory_fs::file::internal::MemoryFileMode; 7 | use std::sync::atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}; 8 | use std::time::Duration; 9 | 10 | pub type BucketIndexType = u16; 11 | pub type MinimizerType = u32; 12 | pub type ColorIndexType = u32; 13 | pub type ColorCounterType = usize; 14 | pub type MultiplicityCounterType = u32; 15 | 16 | pub const PACKETS_PRIORITY_DEFAULT: usize = 0; 17 | pub const PACKETS_PRIORITY_REWRITTEN: usize = 0; 18 | pub const PACKETS_PRIORITY_COMPACT: usize = 1; 19 | pub const PACKETS_PRIORITY_DONE_RESPLIT: usize = 0; 20 | pub const PACKETS_PRIORITY_FILES: usize = 1; 21 | // pub const PACKETS_PRIORITY_PENDING_RESPLIT: usize = 3; 22 | 23 | pub const WORKERS_PRIORITY_LOW: usize = 0; 24 | pub const WORKERS_PRIORITY_BASE: usize = 0; 25 | pub const WORKERS_PRIORITY_HIGH: usize = 0; 26 | 27 | // pub type DefaultColorsSerializer = RunLengthColorsSerializer; 28 | 29 | pub const READ_INTERMEDIATE_CHUNKS_SIZE: usize = 1024 * 512 * 1; 30 | pub static READ_INTERMEDIATE_QUEUE_MULTIPLIER: AtomicUsize = AtomicUsize::new(2); 31 | 32 | pub const KMERS_TRANSFORM_READS_CHUNKS_SIZE: usize = 1024 * 24; 33 | 34 | /// 
2MB read file prefetch 35 | pub const DEFAULT_PREFETCH_AMOUNT: Option = Some(1024 * 1024 * 2); 36 | 37 | pub const FLUSH_QUEUE_FACTOR: usize = 16; 38 | 39 | pub const PARTIAL_VECS_CHECKPOINT_SIZE: CompressedCheckpointSize = 40 | CompressedCheckpointSize::new_from_size(MemoryDataSize::from_mebioctets(2)); 41 | 42 | pub const MINIMIZER_BUCKETS_CHECKPOINT_SIZE: CompressedCheckpointSize = 43 | CompressedCheckpointSize::new_from_size(MemoryDataSize::from_mebioctets(8)); 44 | 45 | pub const DEFAULT_OUTPUT_BUFFER_SIZE: usize = 1024 * 1024 * 4; 46 | pub const DEFAULT_PER_CPU_BUFFER_SIZE: MemoryDataSize = MemoryDataSize::from_kibioctets(4); 47 | 48 | pub const MINIMUM_LOG_DELTA_TIME: Duration = Duration::from_secs(10); 49 | 50 | // 192MB of reads for each bucket 51 | pub const MAX_BUCKET_SIZE: u64 = 192 * 1024 * 1024; 52 | pub const MIN_BUCKETS_COUNT_LOG: usize = 10; 53 | pub const MAX_BUCKETS_COUNT_LOG: usize = 13; 54 | pub const MAX_RESPLIT_BUCKETS_COUNT_LOG: usize = 9; 55 | 56 | pub const MIN_BUCKET_CHUNKS_FOR_READING_THREAD: usize = 2; 57 | 58 | pub const USE_SECOND_BUCKET: bool = false; 59 | 60 | pub const RESPLITTING_MAX_K_M_DIFFERENCE: usize = 10; 61 | 62 | pub const MINIMUM_SUBBUCKET_KMERS_COUNT: usize = 1024 * 32; 63 | pub const MAXIMUM_SECOND_BUCKETS_LOG: usize = 8; 64 | pub const MAXIMUM_SECOND_BUCKETS_COUNT: usize = 1 << MAXIMUM_SECOND_BUCKETS_LOG; 65 | pub const MAXIMUM_JIT_PROCESSED_BUCKETS: usize = 16; 66 | 67 | pub const MAX_INTERMEDIATE_MAP_SIZE: u64 = 1024 * 1024 * 32; 68 | 69 | // Assembler include flags 70 | pub const READ_FLAG_INCL_BEGIN: u8 = 1 << 0; 71 | pub const READ_FLAG_INCL_END: u8 = 1 << 1; 72 | 73 | pub const COLORS_SINGLE_BATCH_SIZE: u64 = 20000; 74 | pub const QUERIES_COUNT_MIN_BATCH: u64 = 1000; 75 | 76 | pub const DEFAULT_COMPACTION_MAP_SUBBUCKET_ELEMENTS: usize = 512; 77 | pub const MAX_COMPACTION_MAP_SUBBUCKET_ELEMENTS: usize = 1024 * 4; 78 | 79 | pub const PRIORITY_SCHEDULING_HIGH: usize = 0; 80 | pub const PRIORITY_SCHEDULING_BASE: 
/// Relative memory-fs swap priorities for each bucket/file category: a lower
/// value is swapped to disk first when memory pressure rises (presumably —
/// confirm ordering semantics in parallel-processor's memory_fs).
pub struct SwapPriority {}
#[allow(non_upper_case_globals)]
impl SwapPriority {
    pub const MinimizerBuckets: usize = 0;
    pub const FinalMaps: usize = 1;
    pub const ResultBuckets: usize = 1;
    pub const HashBuckets: usize = 2;
    pub const QueryCounters: usize = 2;
    pub const ReorganizeReads: usize = 3;
    pub const LinksBuckets: usize = 3;
    pub const LinkPairs: usize = 4;
    pub const KmersMergeTempColors: usize = 4;
    pub const ColoredQueryBuckets: usize = 5;
    pub const KmersMergeBuckets: usize = 6;
}

// Functions depending on global config parameters set at runtime
// Keep intermediate files instead of deleting them (debugging aid).
pub static KEEP_FILES: AtomicBool = AtomicBool::new(false);
// Compression levels for intermediate buckets; defaults: 3 (slow path), 0 (fast path).
pub static INTERMEDIATE_COMPRESSION_LEVEL_SLOW: AtomicU32 = AtomicU32::new(3);
pub static INTERMEDIATE_COMPRESSION_LEVEL_FAST: AtomicU32 = AtomicU32::new(0);
// When true, buckets prefer in-memory storage and only swap to disk on pressure.
pub static PREFER_MEMORY: AtomicBool = AtomicBool::new(false);

/// Returns the memory-fs file mode for a bucket with the given swap priority:
/// memory-backed (with that priority) if `PREFER_MEMORY` is set, disk-only
/// otherwise.
pub fn get_memory_mode(swap_priority: usize) -> MemoryFileMode {
    if PREFER_MEMORY.load(Ordering::Relaxed) {
        MemoryFileMode::PreferMemory { swap_priority }
    } else {
        MemoryFileMode::DiskOnly
    }
}

/// Snapshots the current fast/slow intermediate compression levels into a
/// `CompressionLevelInfo` for the bucket writers.
pub fn get_compression_level_info() -> CompressionLevelInfo {
    CompressionLevelInfo {
        fast_disk: INTERMEDIATE_COMPRESSION_LEVEL_FAST.load(Ordering::Relaxed),
        slow_disk: INTERMEDIATE_COMPRESSION_LEVEL_SLOW.load(Ordering::Relaxed),
    }
}
"ggcat_config", path = "../config" } 9 | 10 | # Utils 11 | utils = { package = "ggcat_utils", path = "../utils" } 12 | 13 | # Common libraries 14 | parallel-processor = "0.1.24" 15 | streaming-libdeflate-rs = "0.1.5" 16 | nightly-quirks = "0.1.4" 17 | 18 | # Common pipeline libraries 19 | hashes = { package = "ggcat_hashes", path = "../hashes" } 20 | io = { package = "ggcat_io", path = "../io" } 21 | colors = { package = "ggcat_colors", path = "../colors" } 22 | structs = { package = "ggcat_structs", path = "../structs" } 23 | minimizer_bucketing = { package = "ggcat_minimizer_bucketing", path = "../minimizer_bucketing" } 24 | kmers_transform = { package = "ggcat_kmers_transform", path = "../kmers_transform" } 25 | typenum = "1.17.0" 26 | rayon = "1.10.0" 27 | byteorder = "1.5.0" 28 | hashbrown = "0.14.5" 29 | csv = "1.3.0" 30 | parking_lot = "0.12.3" 31 | lz4 = "1.25.0" 32 | flate2 = "1.0.30" 33 | ggcat-logging = { version = "2.0.0", path = "../logging" } 34 | anyhow = "1.0.89" 35 | 36 | [features] 37 | devel-build = [] 38 | -------------------------------------------------------------------------------- /crates/dumper/src/lib.rs: -------------------------------------------------------------------------------- 1 | use crate::pipeline::dumper_minimizer_bucketing::minimizer_bucketing; 2 | use colors::bundles::graph_querying::ColorBundleGraphQuerying; 3 | use colors::colors_manager::{ColorMapReader, ColorsManager, ColorsMergeManager}; 4 | use colors::DefaultColorsSerializer; 5 | use config::{ 6 | ColorIndexType, INTERMEDIATE_COMPRESSION_LEVEL_FAST, INTERMEDIATE_COMPRESSION_LEVEL_SLOW, 7 | }; 8 | use io::compute_stats_from_input_blocks; 9 | use io::sequences_stream::general::GeneralSequenceBlockData; 10 | use parallel_processor::memory_fs::MemoryFs; 11 | use parallel_processor::phase_times_monitor::PHASES_TIMES_MONITOR; 12 | use pipeline::dumper_colormap_querying::colormap_query; 13 | use pipeline::dumper_colormap_reading::colormap_reading; 14 | use 
std::fs::remove_file; 15 | use std::path::{Path, PathBuf}; 16 | use std::sync::atomic::Ordering; 17 | 18 | mod pipeline; 19 | 20 | #[derive(Copy, Clone, Debug, PartialOrd, PartialEq)] 21 | pub enum QuerierStartingStep { 22 | MinimizerBucketing = 0, 23 | KmersCounting = 1, 24 | CountersSorting = 2, 25 | ColorMapReading = 3, 26 | } 27 | 28 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 29 | pub enum ColoredQueryOutputFormat { 30 | JsonLinesWithNumbers, 31 | JsonLinesWithNames, 32 | } 33 | 34 | pub fn dump_unitigs( 35 | k: usize, 36 | m: usize, 37 | graph_input: impl AsRef, 38 | temp_dir: Option, 39 | buckets_count_log: Option, 40 | threads_count: usize, 41 | single_thread_output_function: bool, 42 | default_compression_level: Option, 43 | output_function: impl Fn(&[u8], &[ColorIndexType], bool) + Send + Sync, 44 | ) -> anyhow::Result<()> { 45 | let temp_dir = temp_dir.unwrap_or(PathBuf::new()); 46 | 47 | PHASES_TIMES_MONITOR.write().init(); 48 | 49 | let color_map = 50 | ::ColorsMergeManagerType::open_colors_table( 51 | graph_input.as_ref().with_extension("colors.dat"), 52 | )?; 53 | 54 | // TODO: Support GFA input 55 | let file_stats = compute_stats_from_input_blocks(&[GeneralSequenceBlockData::FASTA(( 56 | graph_input.as_ref().to_path_buf(), 57 | None, 58 | ))])?; 59 | 60 | let buckets_count_log = buckets_count_log.unwrap_or_else(|| file_stats.best_buckets_count_log); 61 | 62 | if let Some(default_compression_level) = default_compression_level { 63 | INTERMEDIATE_COMPRESSION_LEVEL_SLOW.store(default_compression_level, Ordering::Relaxed); 64 | INTERMEDIATE_COMPRESSION_LEVEL_FAST.store(default_compression_level, Ordering::Relaxed); 65 | } 66 | 67 | let buckets_count = 1 << buckets_count_log; 68 | 69 | let (reorganized_unitigs, buckets_stats) = minimizer_bucketing::( 70 | graph_input.as_ref().to_path_buf(), 71 | buckets_count, 72 | threads_count, 73 | temp_dir.as_path(), 74 | k, 75 | m, 76 | color_map.colors_subsets_count(), 77 | ); 78 | let _ = 
remove_file(buckets_stats); 79 | 80 | MemoryFs::flush_all_to_disk(); 81 | MemoryFs::free_memory(); 82 | 83 | let colormap_file = graph_input.as_ref().with_extension("colors.dat"); 84 | colormap_reading::( 85 | colormap_file, 86 | reorganized_unitigs, 87 | single_thread_output_function, 88 | output_function, 89 | ) 90 | } 91 | 92 | pub fn dump_colormap_query( 93 | colormap_file: PathBuf, 94 | color_subsets: Vec, 95 | single_thread_output_function: bool, 96 | output_function: impl Fn(ColorIndexType, &[ColorIndexType]) + Send + Sync, 97 | ) -> anyhow::Result<()> { 98 | PHASES_TIMES_MONITOR.write().init(); 99 | 100 | colormap_query::( 101 | colormap_file, 102 | color_subsets, 103 | single_thread_output_function, 104 | output_function, 105 | ) 106 | } 107 | -------------------------------------------------------------------------------- /crates/dumper/src/pipeline.rs: -------------------------------------------------------------------------------- 1 | pub mod dumper_colormap_reading; 2 | // pub mod counters_sorting; 3 | pub mod dumper_colormap_querying; 4 | pub mod dumper_minimizer_bucketing; 5 | -------------------------------------------------------------------------------- /crates/dumper/src/pipeline/dumper_colormap_querying.rs: -------------------------------------------------------------------------------- 1 | use colors::colors_manager::ColorsManager; 2 | use colors::storage::deserializer::ColorsDeserializer; 3 | use colors::storage::ColorsSerializerTrait; 4 | use config::ColorIndexType; 5 | use parallel_processor::phase_times_monitor::PHASES_TIMES_MONITOR; 6 | use parallel_processor::utils::scoped_thread_local::ScopedThreadLocal; 7 | use parking_lot::Mutex; 8 | use rayon::prelude::*; 9 | use std::path::PathBuf; 10 | 11 | pub fn colormap_query< 12 | CX: ColorsManager, 13 | CD: ColorsSerializerTrait, 14 | >( 15 | colormap_file: PathBuf, 16 | mut color_subsets: Vec, 17 | single_thread_output_function: bool, 18 | output_function: impl Fn(ColorIndexType, 
&[ColorIndexType]) + Send + Sync, 19 | ) -> anyhow::Result<()> { 20 | PHASES_TIMES_MONITOR 21 | .write() 22 | .start_phase("phase: colormap query".to_string()); 23 | 24 | // Try to build a color deserializer to check colormap correctness 25 | let _ = ColorsDeserializer::::new(&colormap_file, false)?; 26 | 27 | let tlocal_colormap_decoder = ScopedThreadLocal::new(move || { 28 | ColorsDeserializer::::new(&colormap_file, false).unwrap() 29 | }); 30 | 31 | let single_thread_lock = Mutex::new(()); 32 | 33 | color_subsets.sort_unstable(); 34 | color_subsets.dedup(); 35 | 36 | let threads_count = rayon::current_num_threads(); 37 | 38 | let chunks_size = std::cmp::max(10000, color_subsets.len() / (threads_count * 2 + 1)); 39 | 40 | color_subsets 41 | .chunks(chunks_size) 42 | .par_bridge() 43 | .for_each(|subsets| { 44 | let mut colormap_decoder = tlocal_colormap_decoder.get(); 45 | let mut temp_colors_buffer = Vec::new(); 46 | 47 | for &color in subsets { 48 | temp_colors_buffer.clear(); 49 | colormap_decoder.get_color_mappings(color, &mut temp_colors_buffer); 50 | 51 | let _lock = if single_thread_output_function { 52 | Some(single_thread_lock.lock()) 53 | } else { 54 | None 55 | }; 56 | 57 | output_function(color, &temp_colors_buffer[..]); 58 | } 59 | }); 60 | Ok(()) 61 | } 62 | -------------------------------------------------------------------------------- /crates/dumper/src/pipeline/dumper_colormap_reading.rs: -------------------------------------------------------------------------------- 1 | use crate::pipeline::dumper_minimizer_bucketing::DumperKmersReferenceData; 2 | use colors::colors_manager::color_types::SingleKmerColorDataType; 3 | use colors::colors_manager::ColorsManager; 4 | use colors::storage::deserializer::ColorsDeserializer; 5 | use colors::storage::ColorsSerializerTrait; 6 | use config::{ColorIndexType, DEFAULT_PREFETCH_AMOUNT, KEEP_FILES}; 7 | use io::compressed_read::CompressedReadIndipendent; 8 | use io::concurrent::temp_reads::creads_utils::{ 9 | 
/// Reads back the minimizer-bucketed unitigs, sorts them so unitigs with the
/// same color subset are adjacent, decodes each subset once and streams every
/// unitig to `output_function(bases, colors, same_colors)` — `same_colors`
/// is false only for the first unitig of each color group.
///
/// NOTE(review): generic parameter lists (`<...>`) were stripped in
/// extraction throughout this function; confirm against the repository.
pub fn colormap_reading<
    CX: ColorsManager,
    CD: ColorsSerializerTrait,
>(
    colormap_file: PathBuf,
    colored_unitigs_buckets: Vec,
    single_thread_output_function: bool,
    output_function: impl Fn(&[u8], &[ColorIndexType], bool) + Send + Sync,
) -> anyhow::Result<()> {
    PHASES_TIMES_MONITOR
        .write()
        .start_phase("phase: colormap reading".to_string());

    // Try to build a color deserializer to check colormap correctness
    let _ = ColorsDeserializer::::new(&colormap_file, false)?;

    // One colormap decoder per worker thread.
    let tlocal_colormap_decoder = ScopedThreadLocal::new(move || {
        ColorsDeserializer::::new(&colormap_file, false).unwrap()
    });

    let single_thread_lock = Mutex::new(());

    // One bucket per task; the buffers below are reused across its sequences.
    colored_unitigs_buckets.par_iter().for_each(|input| {
        let mut colormap_decoder = tlocal_colormap_decoder.get();
        let mut temp_colors_buffer = Vec::new();
        let mut temp_decompressed_sequence = Vec::new();

        let mut temp_bases = Vec::new();
        let mut temp_sequences = Vec::new();

        // Load all (read, color) pairs of this bucket; the bucket file is
        // deleted afterwards unless KEEP_FILES is set.
        CompressedBinaryReader::new(
            &input.path,
            RemoveFileMode::Remove {
                remove_fs: !KEEP_FILES.load(Ordering::Relaxed),
            },
            DEFAULT_PREFETCH_AMOUNT,
        )
        .decode_all_bucket_items::>,
            typenum::consts::U0,
            NoSecondBucket,
            NoMultiplicity,
        >, _>(vec![], &mut (), |(_, _, color_extra, read, _), _| {
            let new_read = CompressedReadIndipendent::from_read(&read, &mut temp_bases);
            temp_sequences.push((new_read, color_extra));
        });

        // Radix-sort key: the color subset, so equal-color unitigs become
        // adjacent and each subset is decoded exactly once below.
        struct ColoredUnitigsCompare(PhantomData<&'static CX>);
        impl
            SortKey<(
                CompressedReadIndipendent,
                DumperKmersReferenceData>,
            )> for ColoredUnitigsCompare
        {
            type KeyType = SingleKmerColorDataType;
            const KEY_BITS: usize = std::mem::size_of::>() * 8;

            fn compare(
                left: &(
                    CompressedReadIndipendent,
                    DumperKmersReferenceData>,
                ),
                right: &(
                    CompressedReadIndipendent,
                    DumperKmersReferenceData>,
                ),
            ) -> std::cmp::Ordering {
                left.1.cmp(&right.1)
            }

            fn get_shifted(
                value: &(
                    CompressedReadIndipendent,
                    DumperKmersReferenceData>,
                ),
                rhs: u8,
            ) -> u8 {
                value.1.color.get_shifted(rhs)
            }
        }

        fast_smart_radix_sort::<_, ColoredUnitigsCompare, false>(&mut temp_sequences[..]);

        for unitigs_by_color in temp_sequences.nq_group_by_mut(|a, b| a.1 == b.1) {
            let color = unitigs_by_color[0].1.color;
            temp_colors_buffer.clear();
            colormap_decoder.get_color_mappings(color, &mut temp_colors_buffer);

            let mut same_color = false;

            // Optional cross-thread serialization of the callback.
            let _lock = if single_thread_output_function {
                Some(single_thread_lock.lock())
            } else {
                None
            };

            for unitig in unitigs_by_color {
                // Unpack the compressed read into plain bases before output.
                let read = unitig.0.as_reference(&temp_bases);
                temp_decompressed_sequence.clear();
                temp_decompressed_sequence.extend(read.as_bases_iter());
                output_function(&temp_decompressed_sequence, &temp_colors_buffer, same_color);
                same_color = true;
            }
        }
    });
    Ok(())
}
12 | pcg_rand::Pcg32::seed_from_u64(seed) 13 | } 14 | 15 | fn generate_bases(len: usize, seed: u64) -> Vec { 16 | let mut rng = rng(seed); 17 | 18 | let result = (0..len) 19 | .map(|_| Utils::decompress_base((rng.next_u32() % 4) as u8)) 20 | .collect::>(); 21 | 22 | result 23 | } 24 | 25 | pub fn criterion_benchmark(c: &mut Criterion) { 26 | let bases = generate_bases(63, 0); 27 | 28 | let read = CompressedRead::new_from_compressed(&bases, 63); 29 | 30 | for k in [15, 35, 47, 63] { 31 | c.bench_function(&format!("single-canonical-u128-k{}", k), |b| { 32 | b.iter(|| { 33 | let hashes = cn_seqhash::u128::CanonicalSeqHashFactory::new(read, k); 34 | let hash = hashes.iter().next().unwrap(); 35 | black_box(hash); 36 | }) 37 | }); 38 | } 39 | } 40 | 41 | criterion_group!(benches, criterion_benchmark); 42 | 43 | criterion_main!(benches); 44 | -------------------------------------------------------------------------------- /crates/hashes/src/cn_rkhash.rs: -------------------------------------------------------------------------------- 1 | pub mod u32 { 2 | use crate::dummy_hasher::DummyHasherBuilder; 3 | 4 | type HashIntegerType = u32; 5 | const MULTIPLIER: HashIntegerType = 0xdc7d07b1; 6 | const MULT_INV: HashIntegerType = 0xfd0ee151; 7 | 8 | pub const MULT_A: HashIntegerType = 0x58107bed; 9 | pub const MULT_C: HashIntegerType = 0x6da984cf; 10 | pub const MULT_G: HashIntegerType = 0x7d6c2d5d; 11 | pub const MULT_T: HashIntegerType = 0x3ea1c319; 12 | 13 | include!("base/cn_rkhash_base.rs"); 14 | } 15 | 16 | pub mod u64 { 17 | use crate::dummy_hasher::DummyHasherBuilder; 18 | 19 | type HashIntegerType = u64; 20 | const MULTIPLIER: HashIntegerType = 0x660b123642ca9149; 21 | const MULT_INV: HashIntegerType = 0x397f178c6ae330f9; 22 | 23 | pub const MULT_A: HashIntegerType = 0x34889973de695e1b; 24 | pub const MULT_C: HashIntegerType = 0x72dacb3a60672825; 25 | pub const MULT_G: HashIntegerType = 0x61bf33e452d231a5; 26 | pub const MULT_T: HashIntegerType = 0x759db32ccd931bb5; 27 
| 28 | include!("base/cn_rkhash_base.rs"); 29 | 30 | #[cfg(test)] 31 | mod tests_reverse { 32 | use crate::{cn_seqhash::u64::CanonicalSeqHashFactory, HashFunctionFactory}; 33 | 34 | #[test] 35 | fn cn_seqhash_reverse() { 36 | let hash = 1531907577009573; // 11581873256642304; 37 | let inverted = CanonicalSeqHashFactory::invert(hash); 38 | const C_INV_LETTERS: [u8; 4] = [b'A', b'C', b'T', b'G']; 39 | 40 | println!( 41 | "{:?}", 42 | String::from_utf8( 43 | inverted 44 | .iter() 45 | .map(|b| { 46 | let b = *b as usize; 47 | [ 48 | C_INV_LETTERS[b & 0b11], 49 | C_INV_LETTERS[(b >> 2) & 0b11], 50 | C_INV_LETTERS[(b >> 4) & 0b11], 51 | C_INV_LETTERS[(b >> 6) & 0b11], 52 | ] 53 | .into_iter() 54 | }) 55 | .flatten() 56 | .take(27) 57 | .collect::>() 58 | ) 59 | .unwrap() 60 | ); 61 | } 62 | } 63 | } 64 | 65 | pub mod u128 { 66 | use crate::dummy_hasher::DummyHasherBuilder; 67 | 68 | type HashIntegerType = u128; 69 | const MULTIPLIER: HashIntegerType = 0x3eb9402f3e733993add64d3ca00e1b6b; 70 | const MULT_INV: HashIntegerType = 0x9cb6ff6f1b1a6d733e0952e899c3943; 71 | 72 | pub const MULT_A: HashIntegerType = 0x4751137d01d863c5b8c36de2b7d399df; 73 | pub const MULT_C: HashIntegerType = 0x37ea3a13226503fb783f5cb69f4552bd; 74 | pub const MULT_G: HashIntegerType = 0x50796b285343f09a0c53113ae736572b; 75 | pub const MULT_T: HashIntegerType = 0x1e62d96a5e1f5ade2d4e68d8f88110b7; 76 | 77 | include!("base/cn_rkhash_base.rs"); 78 | } 79 | -------------------------------------------------------------------------------- /crates/hashes/src/cn_seqhash.rs: -------------------------------------------------------------------------------- 1 | pub mod u16 { 2 | type HashIntegerType = u16; 3 | include!("base/cn_seqhash_base.rs"); 4 | } 5 | 6 | pub mod u32 { 7 | type HashIntegerType = u32; 8 | include!("base/cn_seqhash_base.rs"); 9 | } 10 | 11 | pub mod u64 { 12 | type HashIntegerType = u64; 13 | include!("base/cn_seqhash_base.rs"); 14 | } 15 | 16 | pub mod u128 { 17 | type HashIntegerType = 
u128; 18 | include!("base/cn_seqhash_base.rs"); 19 | } 20 | -------------------------------------------------------------------------------- /crates/hashes/src/dummy_hasher.rs: -------------------------------------------------------------------------------- 1 | use std::hash::{BuildHasher, Hasher}; 2 | 3 | #[derive(Copy, Clone)] 4 | pub struct DummyHasherBuilder; 5 | 6 | impl BuildHasher for DummyHasherBuilder { 7 | type Hasher = DummyHasher; 8 | 9 | #[inline(always)] 10 | fn build_hasher(&self) -> Self::Hasher { 11 | DummyHasher { 0: 0 } 12 | } 13 | } 14 | 15 | pub struct DummyHasher(u64); 16 | 17 | impl Hasher for DummyHasher { 18 | #[inline(always)] 19 | fn finish(&self) -> u64 { 20 | self.0 21 | } 22 | 23 | fn write(&mut self, _bytes: &[u8]) { 24 | panic!("Not supported!"); 25 | } 26 | 27 | #[inline(always)] 28 | fn write_u32(&mut self, i: u32) { 29 | self.0 = i as u64 | ((i as u64) << 32) 30 | } 31 | 32 | #[inline(always)] 33 | fn write_u64(&mut self, i: u64) { 34 | self.0 = i; 35 | } 36 | 37 | #[inline(always)] 38 | fn write_u128(&mut self, i: u128) { 39 | self.0 = i as u64; 40 | } 41 | 42 | #[inline(always)] 43 | fn write_usize(&mut self, i: usize) { 44 | self.0 = i as u64; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /crates/hashes/src/fw_rkhash.rs: -------------------------------------------------------------------------------- 1 | pub mod u32 { 2 | use crate::dummy_hasher::DummyHasherBuilder; 3 | 4 | type HashIntegerType = u32; 5 | const MULTIPLIER: HashIntegerType = 0xdc7d07b1; 6 | const MULT_INV: HashIntegerType = 0xfd0ee151; 7 | 8 | pub const MULT_A: HashIntegerType = 0x58107bed; 9 | pub const MULT_C: HashIntegerType = 0x6da984cf; 10 | pub const MULT_G: HashIntegerType = 0x7d6c2d5d; 11 | pub const MULT_T: HashIntegerType = 0x3ea1c319; 12 | 13 | include!("base/fw_rkhash_base.rs"); 14 | } 15 | 16 | pub mod u64 { 17 | use crate::dummy_hasher::DummyHasherBuilder; 18 | 19 | type HashIntegerType = u64; 
pub const HASH_A: u64 = 0x3c8b_fbb3_95c6_0474;
pub const HASH_C: u64 = 0x3193_c185_62a0_2b4c;
pub const HASH_G: u64 = 0x2032_3ed0_8257_2324;
pub const HASH_T: u64 = 0x2955_49f5_4be2_4456;

/// Forward-strand ntHash value for a single base.
/// Accepts both ASCII bases (`b'A'`...) and 2-bit compressed codes
/// (0..=3, with 4 meaning `N`).
#[inline(always)]
pub fn h(c: u8) -> u64 {
    // A u8 index can never exceed 255, so this access is always in bounds
    // and the bounds check is optimized away.
    H_LOOKUP[c as usize]
}

/// Reverse-complement ntHash value for a single base (same input domain
/// as `h`).
#[inline(always)]
pub fn rc(c: u8) -> u64 {
    RC_LOOKUP[c as usize]
}

const H_LOOKUP: [u64; 256] = {
    // Unknown bytes map to 1; `N` maps to 0.
    let mut table = [1u64; 256];

    // 2-bit compressed codes (note the 0=A, 1=C, 2=T, 3=G ordering).
    table[0] = HASH_A;
    table[1] = HASH_C;
    table[2] = HASH_T;
    table[3] = HASH_G;
    table[4] = 0; // compressed N

    // ASCII bases.
    table[b'A' as usize] = HASH_A;
    table[b'C' as usize] = HASH_C;
    table[b'G' as usize] = HASH_G;
    table[b'T' as usize] = HASH_T;
    table[b'N' as usize] = 0;
    table
};

const RC_LOOKUP: [u64; 256] = {
    // Complement mapping: A<->T, C<->G; unknown bytes 1, `N` 0.
    let mut table = [1u64; 256];

    // 2-bit compressed codes.
    table[0] = HASH_T;
    table[1] = HASH_G;
    table[2] = HASH_A;
    table[3] = HASH_C;
    table[4] = 0;

    // ASCII bases.
    table[b'A' as usize] = HASH_T;
    table[b'C' as usize] = HASH_G;
    table[b'G' as usize] = HASH_C;
    table[b'T' as usize] = HASH_A;
    table[b'N' as usize] = 0;
    table
};
RollingKseqIterator {} 9 | 10 | impl RollingKseqIterator { 11 | pub fn iter_seq<'a, T: Copy, U: Copy>( 12 | seq: &'a [T], 13 | k: usize, 14 | iter_impl: &'a mut (impl RollingKseqImpl + 'a), 15 | ) -> impl Iterator + 'a { 16 | let k_minus1 = k - 1; 17 | 18 | let maxv = if seq.len() > k_minus1 { 19 | iter_impl.clear(k); 20 | for (i, v) in seq[0..k_minus1].iter().enumerate() { 21 | iter_impl.init(i, *v); 22 | } 23 | seq.len() 24 | } else { 25 | 0 26 | }; 27 | 28 | (k_minus1..maxv).map(move |idx| { 29 | iter_impl.iter(idx, unsafe { *seq.get_unchecked(idx - k_minus1) }, unsafe { 30 | *seq.get_unchecked(idx) 31 | }) 32 | }) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /crates/hashes/src/rolling/minqueue.rs: -------------------------------------------------------------------------------- 1 | use crate::default::{MNHFactory, MNHUnextendable}; 2 | use crate::MinimizerHashFunctionFactory; 3 | use std::cmp::min_by_key; 4 | 5 | pub struct RollingMinQueue { 6 | queue: Vec<(MNHUnextendable, MNHUnextendable)>, 7 | index: usize, 8 | capacity_mask: usize, 9 | size: usize, 10 | minimum: (MNHUnextendable, usize), 11 | } 12 | 13 | impl RollingMinQueue { 14 | pub fn new(size: usize) -> RollingMinQueue { 15 | let capacity = size.next_power_of_two(); 16 | let mut queue = Vec::with_capacity(capacity); 17 | unsafe { 18 | queue.set_len(capacity); 19 | } 20 | 21 | RollingMinQueue { 22 | queue, 23 | index: 0, 24 | capacity_mask: capacity - 1, 25 | size, 26 | minimum: (MNHUnextendable::default(), 0), 27 | } 28 | } 29 | 30 | fn rebuild_minimums(&mut self, size: usize) { 31 | let mut i = self.index.wrapping_sub(2) & self.capacity_mask; 32 | 33 | self.minimum = ( 34 | self.queue[(i + 1) & self.capacity_mask].0, 35 | (self.index + size) & self.capacity_mask, 36 | ); 37 | 38 | let li = (self.index.wrapping_sub(size + 1)) & self.capacity_mask; 39 | while i != li { 40 | unsafe { 41 | self.queue.get_unchecked_mut(i).1 = min_by_key( 42 | 
self.queue.get_unchecked_mut(i).1, 43 | self.queue.get_unchecked_mut((i + 1) & self.capacity_mask).1, 44 | |x| MNHFactory::get_full_minimizer(*x), 45 | ); 46 | } 47 | i = i.wrapping_sub(1) & self.capacity_mask; 48 | } 49 | } 50 | 51 | pub fn make_iter<'a, I: Iterator + 'a>( 52 | &'a mut self, 53 | mut iter: I, 54 | ) -> impl Iterator + 'a { 55 | for i in 0..(self.size - 1) { 56 | unsafe { 57 | let value = iter.next().unwrap_unchecked(); 58 | *self.queue.get_unchecked_mut(i) = (value, value); 59 | } 60 | } 61 | 62 | self.index = self.size - 1; 63 | self.rebuild_minimums(self.size - 1); 64 | 65 | iter.map(move |x| unsafe { 66 | *self.queue.get_unchecked_mut(self.index) = (x, x); 67 | 68 | self.minimum = min_by_key( 69 | self.minimum, 70 | (x, (self.index + self.size) & self.capacity_mask), 71 | |x| MNHFactory::get_full_minimizer(x.0), 72 | ); 73 | self.index = (self.index + 1) & self.capacity_mask; 74 | 75 | if self.index == self.minimum.1 { 76 | self.rebuild_minimums(self.size); 77 | } 78 | 79 | min_by_key( 80 | self.minimum.0, 81 | self.queue 82 | .get_unchecked_mut((self.index.wrapping_sub(self.size)) & self.capacity_mask) 83 | .1, 84 | |x| MNHFactory::get_full_minimizer(*x), 85 | ) 86 | }) 87 | } 88 | } 89 | 90 | #[cfg(test)] 91 | mod tests { 92 | use crate::fw_nthash::ForwardNtHashIteratorFactory; 93 | use crate::rolling::minqueue::RollingMinQueue; 94 | use crate::MinimizerHashFunctionFactory; 95 | use rand::{RngCore, SeedableRng}; 96 | 97 | #[test] 98 | fn minqueue_test() { 99 | const SIZE: usize = 10000000; 100 | const MINWINDOW: usize = 32; 101 | 102 | let mut queue = RollingMinQueue::new(MINWINDOW); 103 | 104 | let mut items = Vec::new(); 105 | items.reserve(SIZE); 106 | 107 | let mut random = pcg_rand::Pcg64::seed_from_u64(2); 108 | 109 | for i in 0..SIZE { 110 | let value = random.next_u64(); 111 | if i > 52 + 37 { 112 | items.push(value); 113 | } 114 | } 115 | 116 | for (index, item) in queue.make_iter(items.clone().into_iter()).enumerate() { 117 | 
/// Sink for writing chunks of color data in explicit sessions.
pub trait ChunksWriter {
    // Mutable state carried through one processing session.
    type ProcessingData;
    // Element type accepted by `flush_data`.
    type TargetData;
    type StreamType<'a>
    where
        Self: 'a;

    /// Opens a processing session and returns its state.
    fn start_processing(&self) -> Self::ProcessingData;
    /// Writes a batch of elements into the session.
    fn flush_data(&self, tmp_data: &mut Self::ProcessingData, data: &[Self::TargetData]);
    /// Direct stream access to the session, for writers that bypass
    /// `flush_data`.
    fn get_stream<'a>(&'a self, tmp_data: &'a mut Self::ProcessingData) -> Self::StreamType<'a>;

    /// Closes the session. NOTE(review): `start_index` is presumably the
    /// first color index covered by the written chunk — confirm with the
    /// implementors.
    fn end_processing(&self, tmp_data: Self::ProcessingData, start_index: ColorIndexType);
}
temp_buffer: Backend::alloc_temp_buffer(), 41 | current_index: None, 42 | auto_flush, 43 | } 44 | } 45 | 46 | pub fn flush(&mut self) -> u64 { 47 | if self.sequences.len() == 0 { 48 | return 0; 49 | } 50 | 51 | let first_read_index = self.target.write_sequences( 52 | &mut self.temp_buffer, 53 | self.current_index.map(|c| c - self.sequences.len() as u64), 54 | self.sequences 55 | .drain(..) 56 | .map(|(slice, col, link, abundance)| { 57 | (slice.get_slice(&self.seq_buf), col, link, abundance) 58 | }), 59 | &self.extra_buffers, 60 | ); 61 | 62 | ColorInfo::clear_temp_buffer(&mut self.extra_buffers.0); 63 | LinksInfo::clear_temp_buffer(&mut self.extra_buffers.1); 64 | self.seq_buf.clear(); 65 | 66 | first_read_index 67 | } 68 | 69 | #[inline(always)] 70 | fn will_overflow(vec: &Vec, len: usize) -> bool { 71 | vec.len() > 0 && (vec.len() + len > vec.capacity()) 72 | } 73 | 74 | pub fn add_read( 75 | &mut self, 76 | sequence: &[u8], 77 | sequence_index: Option, 78 | color: ColorInfo, 79 | color_extra_buffer: &ColorInfo::TempBuffer, 80 | links: LinksInfo, 81 | links_extra_buffer: &LinksInfo::TempBuffer, 82 | #[cfg(feature = "support_kmer_counters")] abundance: SequenceAbundanceType, 83 | ) -> Option { 84 | let mut result = None; 85 | let mut different_index = false; 86 | 87 | if let Some(sequence_index) = sequence_index { 88 | if Some(sequence_index) != self.current_index { 89 | result = Some(self.flush()); 90 | self.current_index = Some(sequence_index); 91 | different_index = true; 92 | } 93 | } 94 | 95 | if !different_index && self.auto_flush && Self::will_overflow(&self.seq_buf, sequence.len()) 96 | { 97 | result = Some(self.flush()); 98 | } 99 | 100 | let color = 101 | ColorInfo::copy_extra_from(color, &color_extra_buffer, &mut self.extra_buffers.0); 102 | let links = 103 | LinksInfo::copy_extra_from(links, &links_extra_buffer, &mut self.extra_buffers.1); 104 | 105 | self.sequences.push(( 106 | VecSlice::new_extend(&mut self.seq_buf, sequence), 107 | color, 108 | 
links, 109 | match () { 110 | #[cfg(feature = "support_kmer_counters")] 111 | () => abundance, 112 | #[cfg(not(feature = "support_kmer_counters"))] 113 | () => (), 114 | }, 115 | )); 116 | 117 | if let Some(current_index) = &mut self.current_index { 118 | *current_index += 1; 119 | } 120 | 121 | result 122 | } 123 | 124 | pub fn finalize(mut self) -> u64 { 125 | self.flush() 126 | } 127 | } 128 | 129 | impl< 130 | 'a, 131 | ColorInfo: IdentSequenceWriter, 132 | LinksInfo: IdentSequenceWriter, 133 | Backend: StructuredSequenceBackend, 134 | > Drop for FastaWriterConcurrentBuffer<'a, ColorInfo, LinksInfo, Backend> 135 | { 136 | fn drop(&mut self) { 137 | self.flush(); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /crates/io/src/concurrent/structured_sequences/fasta.rs: -------------------------------------------------------------------------------- 1 | use crate::concurrent::structured_sequences::{IdentSequenceWriter, StructuredSequenceBackend}; 2 | use config::{DEFAULT_OUTPUT_BUFFER_SIZE, DEFAULT_PER_CPU_BUFFER_SIZE}; 3 | use dynamic_dispatch::dynamic_dispatch; 4 | use flate2::write::GzEncoder; 5 | use flate2::Compression; 6 | use lz4::{BlockMode, BlockSize, ContentChecksum}; 7 | use std::fs::File; 8 | use std::io::{BufWriter, Write}; 9 | use std::marker::PhantomData; 10 | use std::path::{Path, PathBuf}; 11 | 12 | use super::stream_finish::SequencesWriterWrapper; 13 | 14 | #[cfg(feature = "support_kmer_counters")] 15 | use super::SequenceAbundance; 16 | use super::{StructuredSequenceBackendInit, StructuredSequenceBackendWrapper}; 17 | 18 | pub struct FastaWriterWrapper; 19 | 20 | #[dynamic_dispatch] 21 | impl StructuredSequenceBackendWrapper for FastaWriterWrapper { 22 | type Backend = 23 | FastaWriter; 24 | } 25 | 26 | pub struct FastaWriter { 27 | writer: Box, 28 | path: PathBuf, 29 | _phantom: PhantomData<(ColorInfo, LinksInfo)>, 30 | } 31 | 32 | unsafe impl Send 33 | for FastaWriter 34 | { 35 | } 36 | 37 | 
unsafe impl Sync 38 | for FastaWriter 39 | { 40 | } 41 | 42 | impl StructuredSequenceBackendInit 43 | for FastaWriter 44 | { 45 | fn new_compressed_gzip(path: impl AsRef, level: u32) -> Self { 46 | let compress_stream = GzEncoder::new( 47 | BufWriter::with_capacity(DEFAULT_OUTPUT_BUFFER_SIZE, File::create(&path).unwrap()), 48 | Compression::new(level), 49 | ); 50 | 51 | FastaWriter { 52 | writer: Box::new(SequencesWriterWrapper::new(BufWriter::with_capacity( 53 | DEFAULT_OUTPUT_BUFFER_SIZE, 54 | compress_stream, 55 | ))), 56 | path: path.as_ref().to_path_buf(), 57 | _phantom: PhantomData, 58 | } 59 | } 60 | 61 | fn new_compressed_lz4(path: impl AsRef, level: u32) -> Self { 62 | let compress_stream = lz4::EncoderBuilder::new() 63 | .level(level) 64 | .checksum(ContentChecksum::NoChecksum) 65 | .block_mode(BlockMode::Linked) 66 | .block_size(BlockSize::Max1MB) 67 | .build(BufWriter::with_capacity( 68 | DEFAULT_OUTPUT_BUFFER_SIZE, 69 | File::create(&path).unwrap(), 70 | )) 71 | .unwrap(); 72 | 73 | FastaWriter { 74 | writer: Box::new(SequencesWriterWrapper::new(BufWriter::with_capacity( 75 | DEFAULT_OUTPUT_BUFFER_SIZE, 76 | compress_stream, 77 | ))), 78 | path: path.as_ref().to_path_buf(), 79 | _phantom: PhantomData, 80 | } 81 | } 82 | 83 | fn new_plain(path: impl AsRef) -> Self { 84 | FastaWriter { 85 | writer: Box::new(SequencesWriterWrapper::new(BufWriter::with_capacity( 86 | DEFAULT_OUTPUT_BUFFER_SIZE, 87 | File::create(&path).unwrap(), 88 | ))), 89 | path: path.as_ref().to_path_buf(), 90 | _phantom: PhantomData, 91 | } 92 | } 93 | } 94 | 95 | impl 96 | StructuredSequenceBackend for FastaWriter 97 | { 98 | type SequenceTempBuffer = Vec; 99 | 100 | fn alloc_temp_buffer() -> Self::SequenceTempBuffer { 101 | Vec::with_capacity(DEFAULT_PER_CPU_BUFFER_SIZE.as_bytes()) 102 | } 103 | 104 | fn write_sequence( 105 | _k: usize, 106 | buffer: &mut Self::SequenceTempBuffer, 107 | sequence_index: u64, 108 | sequence: &[u8], 109 | 110 | color_info: ColorInfo, 111 | links_info: 
LinksInfo, 112 | extra_buffers: &(ColorInfo::TempBuffer, LinksInfo::TempBuffer), 113 | 114 | #[cfg(feature = "support_kmer_counters")] abundance: SequenceAbundance, 115 | ) { 116 | #[cfg(feature = "support_kmer_counters")] 117 | write!( 118 | buffer, 119 | ">{} LN:i:{} KC:i:{} km:f:{:.1}", 120 | sequence_index, 121 | sequence.len(), 122 | abundance.sum, 123 | abundance.sum as f64 / (sequence.len() - _k + 1) as f64 124 | ) 125 | .unwrap(); 126 | 127 | #[cfg(not(feature = "support_kmer_counters"))] 128 | write!(buffer, ">{} LN:i:{}", sequence_index, sequence.len(),).unwrap(); 129 | 130 | color_info.write_as_ident(buffer, &extra_buffers.0); 131 | links_info.write_as_ident(buffer, &extra_buffers.1); 132 | buffer.extend_from_slice(b"\n"); 133 | buffer.extend_from_slice(sequence); 134 | buffer.extend_from_slice(b"\n"); 135 | } 136 | 137 | fn get_path(&self) -> PathBuf { 138 | self.path.clone() 139 | } 140 | 141 | fn flush_temp_buffer(&mut self, buffer: &mut Self::SequenceTempBuffer) { 142 | self.writer.write_all(buffer).unwrap(); 143 | buffer.clear(); 144 | } 145 | 146 | fn finalize(self) {} 147 | } 148 | 149 | impl Drop 150 | for FastaWriter 151 | { 152 | fn drop(&mut self) { 153 | self.writer.flush().unwrap(); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /crates/io/src/concurrent/structured_sequences/stream_finish.rs: -------------------------------------------------------------------------------- 1 | use flate2::write::GzEncoder; 2 | use std::{ 3 | fmt::Debug, 4 | fs::File, 5 | io::{BufWriter, Write}, 6 | }; 7 | 8 | pub(crate) trait SequencesFileFinish: Write + Debug { 9 | fn finalize(self); 10 | } 11 | impl SequencesFileFinish for BufWriter { 12 | fn finalize(self) { 13 | self.into_inner().unwrap().finalize(); 14 | } 15 | } 16 | impl SequencesFileFinish for File { 17 | fn finalize(mut self) { 18 | self.flush().unwrap(); 19 | } 20 | } 21 | impl SequencesFileFinish for lz4::Encoder { 22 | fn finalize(self) { 23 
| let (w, err) = self.finish(); 24 | err.unwrap(); 25 | w.finalize(); 26 | } 27 | } 28 | impl SequencesFileFinish for GzEncoder { 29 | fn finalize(self) { 30 | let w = self.finish().unwrap(); 31 | w.finalize(); 32 | } 33 | } 34 | 35 | pub(crate) struct SequencesWriterWrapper { 36 | writer: Option, 37 | } 38 | 39 | impl SequencesWriterWrapper { 40 | pub fn new(writer: W) -> Self { 41 | Self { 42 | writer: Some(writer), 43 | } 44 | } 45 | } 46 | 47 | impl Write for SequencesWriterWrapper { 48 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 49 | unsafe { self.writer.as_mut().unwrap_unchecked() }.write(buf) 50 | } 51 | 52 | fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { 53 | unsafe { self.writer.as_mut().unwrap_unchecked() }.write_all(buf) 54 | } 55 | 56 | fn flush(&mut self) -> std::io::Result<()> { 57 | unsafe { self.writer.as_mut().unwrap_unchecked() }.flush() 58 | } 59 | } 60 | 61 | impl Drop for SequencesWriterWrapper { 62 | fn drop(&mut self) { 63 | self.writer.take().unwrap().finalize(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /crates/io/src/concurrent/temp_reads.rs: -------------------------------------------------------------------------------- 1 | pub mod creads_utils; 2 | pub mod extra_data; 3 | -------------------------------------------------------------------------------- /crates/io/src/lib.rs: -------------------------------------------------------------------------------- 1 | use crate::sequences_stream::general::GeneralSequenceBlockData; 2 | use config::{MAX_BUCKETS_COUNT_LOG, MAX_BUCKET_SIZE, MIN_BUCKETS_COUNT_LOG}; 3 | use parallel_processor::buckets::SingleBucket; 4 | use std::cmp::{max, min}; 5 | use std::path::Path; 6 | 7 | pub mod chunks_writer; 8 | pub mod compressed_read; 9 | pub mod concurrent; 10 | pub mod lines_reader; 11 | // pub mod reads_writer; 12 | pub mod sequences_reader; 13 | pub mod sequences_stream; 14 | pub mod structs; 15 | pub mod varint; 16 | 17 | 
pub fn generate_bucket_names( 18 | root: impl AsRef, 19 | count: usize, 20 | suffix: Option<&str>, 21 | ) -> Vec { 22 | (0..count) 23 | .map(|i| SingleBucket { 24 | index: i, 25 | path: root.as_ref().with_extension(format!( 26 | "{}{}", 27 | i, 28 | match suffix { 29 | None => String::from(""), 30 | Some(s) => format!(".{}", s), 31 | } 32 | )), 33 | }) 34 | .collect() 35 | } 36 | 37 | pub struct FilesStatsInfo { 38 | pub best_buckets_count_log: usize, 39 | // pub best_lz4_compression_level: u32, 40 | } 41 | 42 | pub fn compute_stats_from_input_blocks( 43 | blocks: &[GeneralSequenceBlockData], 44 | ) -> anyhow::Result { 45 | let mut bases_count = 0; 46 | for block in blocks { 47 | bases_count += block.estimated_bases_count()?; 48 | } 49 | 50 | let buckets_count = bases_count / MAX_BUCKET_SIZE; 51 | 52 | let buckets_log = (max(1, buckets_count) - 1).next_power_of_two().ilog2() as usize; 53 | 54 | Ok(FilesStatsInfo { 55 | best_buckets_count_log: min( 56 | MAX_BUCKETS_COUNT_LOG, 57 | max(MIN_BUCKETS_COUNT_LOG, buckets_log), 58 | ), 59 | // best_lz4_compression_level: 0, 60 | }) 61 | } 62 | -------------------------------------------------------------------------------- /crates/io/src/reads_writer.rs: -------------------------------------------------------------------------------- 1 | use crate::sequences_reader::FastaSequence; 2 | use byteorder::WriteBytesExt; 3 | use config::DEFAULT_OUTPUT_BUFFER_SIZE; 4 | use flate2::write::GzEncoder; 5 | use flate2::Compression; 6 | use lz4::{BlockMode, BlockSize, ContentChecksum}; 7 | use std::fs::File; 8 | use std::io::{BufWriter, Write}; 9 | use std::path::{Path, PathBuf}; 10 | 11 | enum WriterChannels { 12 | None, 13 | File(BufWriter), 14 | CompressedFileGzip(BufWriter>>), 15 | CompressedFileLZ4(BufWriter>>), 16 | } 17 | 18 | impl WriterChannels { 19 | fn get_writer(&mut self) -> &mut dyn Write { 20 | match self { 21 | WriterChannels::File(x) => x, 22 | WriterChannels::CompressedFileGzip(x) => x, 23 | 
WriterChannels::CompressedFileLZ4(x) => x, 24 | WriterChannels::None => unreachable!(), 25 | } 26 | } 27 | } 28 | 29 | pub struct ReadsWriter { 30 | writer: WriterChannels, 31 | path: PathBuf, 32 | reads_count: usize, 33 | } 34 | 35 | impl ReadsWriter { 36 | pub fn new_compressed_gzip(path: impl AsRef, level: u32) -> ReadsWriter { 37 | let compress_stream = GzEncoder::new( 38 | BufWriter::with_capacity(DEFAULT_OUTPUT_BUFFER_SIZE, File::create(&path).unwrap()), 39 | Compression::new(level), 40 | ); 41 | 42 | ReadsWriter { 43 | writer: WriterChannels::CompressedFileGzip(BufWriter::with_capacity( 44 | DEFAULT_OUTPUT_BUFFER_SIZE, 45 | compress_stream, 46 | )), 47 | path: path.as_ref().to_path_buf(), 48 | reads_count: 0, 49 | } 50 | } 51 | 52 | pub fn new_compressed_lz4(path: impl AsRef, level: u32) -> ReadsWriter { 53 | let compress_stream = lz4::EncoderBuilder::new() 54 | .level(level) 55 | .checksum(ContentChecksum::NoChecksum) 56 | .block_mode(BlockMode::Linked) 57 | .block_size(BlockSize::Max1MB) 58 | .build(BufWriter::with_capacity( 59 | DEFAULT_OUTPUT_BUFFER_SIZE, 60 | File::create(&path).unwrap(), 61 | )) 62 | .unwrap(); 63 | 64 | ReadsWriter { 65 | writer: WriterChannels::CompressedFileLZ4(BufWriter::with_capacity( 66 | DEFAULT_OUTPUT_BUFFER_SIZE, 67 | compress_stream, 68 | )), 69 | path: path.as_ref().to_path_buf(), 70 | reads_count: 0, 71 | } 72 | } 73 | 74 | pub fn new_plain(path: impl AsRef) -> ReadsWriter { 75 | ReadsWriter { 76 | writer: WriterChannels::File(BufWriter::with_capacity( 77 | DEFAULT_OUTPUT_BUFFER_SIZE, 78 | File::create(&path).unwrap(), 79 | )), 80 | path: path.as_ref().to_path_buf(), 81 | reads_count: 0, 82 | } 83 | } 84 | 85 | pub fn get_reads_count(&mut self) -> usize { 86 | self.reads_count 87 | } 88 | 89 | pub fn add_read(&mut self, read: FastaSequence) { 90 | let writer = self.writer.get_writer(); 91 | writer.write_all(read.ident).unwrap(); 92 | writer.write_all(b"\n").unwrap(); 93 | writer.write_all(read.seq).unwrap(); 94 | if let 
Some(qual) = read.qual { 95 | writer.write_all(b"\n+\n").unwrap(); 96 | writer.write_all(qual).unwrap(); 97 | } 98 | writer.write_u8(b'\n').unwrap(); 99 | 100 | self.reads_count += 1; 101 | } 102 | 103 | #[allow(dead_code)] 104 | pub fn get_path(&self) -> PathBuf { 105 | self.path.clone() 106 | } 107 | 108 | pub fn finalize(self) {} 109 | } 110 | 111 | impl Drop for ReadsWriter { 112 | fn drop(&mut self) { 113 | let writer = std::mem::replace(&mut self.writer, WriterChannels::None); 114 | match writer { 115 | WriterChannels::File(writer) => { 116 | writer.into_inner().unwrap().flush().unwrap(); 117 | } 118 | WriterChannels::CompressedFileGzip(mut writer) => { 119 | writer.flush().unwrap(); 120 | writer 121 | .into_inner() 122 | .unwrap_or_else(|_| panic!("Cannot unwrap!")) 123 | .finish() 124 | .unwrap_or_else(|_| panic!("Cannot unwrap!")) 125 | .flush() 126 | .unwrap(); 127 | } 128 | WriterChannels::CompressedFileLZ4(mut writer) => { 129 | writer.flush().unwrap(); 130 | writer 131 | .into_inner() 132 | .unwrap_or_else(|_| panic!("Cannot unwrap!")) 133 | .finish() 134 | .0 135 | .flush() 136 | .unwrap(); 137 | } 138 | WriterChannels::None => unreachable!(), 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /crates/io/src/sequences_stream.rs: -------------------------------------------------------------------------------- 1 | pub mod fasta; 2 | pub mod general; 3 | 4 | use crate::sequences_reader::DnaSequence; 5 | use config::ColorIndexType; 6 | 7 | #[derive(Copy, Clone)] 8 | pub struct SequenceInfo { 9 | pub color: Option, 10 | } 11 | 12 | pub trait GenericSequencesStream: 'static { 13 | type SequenceBlockData: Sync + Send + 'static; 14 | 15 | fn new() -> Self; 16 | 17 | fn read_block( 18 | &mut self, 19 | block: &Self::SequenceBlockData, 20 | copy_ident_data: bool, 21 | partial_read_copyback: Option, 22 | callback: impl FnMut(DnaSequence, SequenceInfo), 23 | ); 24 | } 25 | 
-------------------------------------------------------------------------------- /crates/io/src/sequences_stream/fasta.rs: -------------------------------------------------------------------------------- 1 | use ggcat_logging::UnrecoverableErrorLogging; 2 | 3 | use crate::sequences_reader::{DnaSequence, SequencesReader}; 4 | use crate::sequences_stream::{GenericSequencesStream, SequenceInfo}; 5 | use std::path::PathBuf; 6 | 7 | pub struct FastaFileSequencesStream { 8 | sequences_reader: SequencesReader, 9 | } 10 | 11 | impl FastaFileSequencesStream { 12 | pub fn get_estimated_bases_count(file: &PathBuf) -> anyhow::Result { 13 | // TODO: Improve this ratio estimation 14 | const COMPRESSED_READS_RATIO: f64 = 0.5; 15 | 16 | let length = std::fs::metadata(file) 17 | .log_unrecoverable_error_with_data("Error while opening file", file.display())? 18 | .len(); 19 | 20 | let file_bases_count = if file 21 | .extension() 22 | .map(|x| x == "gz" || x == "lz4") 23 | .unwrap_or(false) 24 | { 25 | (length as f64 * COMPRESSED_READS_RATIO) as u64 26 | } else { 27 | length 28 | }; 29 | Ok(file_bases_count) 30 | } 31 | } 32 | 33 | impl GenericSequencesStream for FastaFileSequencesStream { 34 | type SequenceBlockData = (PathBuf, Option); 35 | 36 | fn new() -> Self { 37 | Self { 38 | sequences_reader: SequencesReader::new(), 39 | } 40 | } 41 | 42 | fn read_block( 43 | &mut self, 44 | block: &Self::SequenceBlockData, 45 | copy_ident_data: bool, 46 | partial_read_copyback: Option, 47 | mut callback: impl FnMut(DnaSequence, SequenceInfo), 48 | ) { 49 | self.sequences_reader.process_file_extended( 50 | &block.0, 51 | |x| callback(x, SequenceInfo { color: block.1 }), 52 | partial_read_copyback, 53 | copy_ident_data, 54 | false, 55 | ); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /crates/io/src/sequences_stream/general.rs: -------------------------------------------------------------------------------- 1 | use 
crate::sequences_reader::DnaSequence; 2 | use crate::sequences_stream::fasta::FastaFileSequencesStream; 3 | use crate::sequences_stream::{GenericSequencesStream, SequenceInfo}; 4 | use std::sync::Arc; 5 | 6 | pub trait DynamicSequencesStream: Sync + Send + 'static { 7 | fn read_block( 8 | &self, 9 | block: usize, 10 | copy_ident_data: bool, 11 | partial_read_copyback: Option, 12 | callback: &mut dyn FnMut(DnaSequence, SequenceInfo), 13 | ); 14 | 15 | fn estimated_base_count(&self, block: usize) -> u64; 16 | } 17 | 18 | pub enum GeneralSequenceBlockData { 19 | FASTA(::SequenceBlockData), 20 | GFA(), 21 | Dynamic((Arc, usize)), 22 | } 23 | 24 | impl GeneralSequenceBlockData { 25 | pub fn estimated_bases_count(&self) -> anyhow::Result { 26 | match self { 27 | GeneralSequenceBlockData::FASTA(block) => { 28 | FastaFileSequencesStream::get_estimated_bases_count(&block.0) 29 | } 30 | GeneralSequenceBlockData::GFA() => { 31 | todo!() 32 | } 33 | GeneralSequenceBlockData::Dynamic((reader, block)) => { 34 | Ok(reader.estimated_base_count(*block)) 35 | } 36 | } 37 | } 38 | } 39 | 40 | pub struct GeneralSequencesStream { 41 | fasta_file_reader: Option, 42 | } 43 | 44 | impl GenericSequencesStream for GeneralSequencesStream { 45 | type SequenceBlockData = GeneralSequenceBlockData; 46 | 47 | fn new() -> Self { 48 | Self { 49 | fasta_file_reader: None, 50 | } 51 | } 52 | 53 | fn read_block( 54 | &mut self, 55 | block: &Self::SequenceBlockData, 56 | copy_ident_data: bool, 57 | partial_read_copyback: Option, 58 | mut callback: impl FnMut(DnaSequence, SequenceInfo), 59 | ) { 60 | match block { 61 | GeneralSequenceBlockData::FASTA(block) => { 62 | if self.fasta_file_reader.is_none() { 63 | self.fasta_file_reader = Some(FastaFileSequencesStream::new()); 64 | } 65 | self.fasta_file_reader.as_mut().unwrap().read_block( 66 | block, 67 | copy_ident_data, 68 | partial_read_copyback, 69 | callback, 70 | ); 71 | } 72 | GeneralSequenceBlockData::GFA() => { 73 | unimplemented!(); 74 | } 75 | 
GeneralSequenceBlockData::Dynamic((reader, index)) => { 76 | reader.read_block( 77 | *index, 78 | copy_ident_data, 79 | partial_read_copyback, 80 | &mut callback, 81 | ); 82 | } 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /crates/io/src/structs.rs: -------------------------------------------------------------------------------- 1 | pub mod hash_entry; 2 | pub mod unitig_link; 3 | -------------------------------------------------------------------------------- /crates/io/src/structs/hash_entry.rs: -------------------------------------------------------------------------------- 1 | use bincode::{deserialize_from, serialize_into}; 2 | use config::BucketIndexType; 3 | use hashes::HashFunctionFactory; 4 | use parallel_processor::buckets::bucket_writer::BucketItemSerializer; 5 | use parallel_processor::fast_smart_bucket_sort::SortKey; 6 | use serde::de::DeserializeOwned; 7 | use serde::{Deserialize, Serialize}; 8 | use std::io::Read; 9 | use std::marker::PhantomData; 10 | use std::mem::size_of; 11 | 12 | #[derive(Copy, Clone, Eq, PartialEq, Serialize, Deserialize, Debug)] 13 | #[repr(u8)] 14 | pub enum Direction { 15 | Backward, 16 | Forward, 17 | } 18 | 19 | #[derive(Copy, Clone, Serialize, Deserialize, Debug)] 20 | pub struct HashEntry { 21 | pub hash: H, 22 | encoded: u64, 23 | // pub bucket: BucketIndexType, 24 | // pub entry: u64, 25 | // pub direction: Direction, 26 | } 27 | 28 | impl HashEntry { 29 | const ENTRY_OFFSET: usize = (size_of::() * 8) + 1; 30 | const BUCKET_OFFSET: usize = 1; 31 | const DIRECTION_OFFSET: usize = 0; 32 | 33 | pub fn new(hash: H, bucket: BucketIndexType, entry: u64, direction: Direction) -> Self { 34 | Self { 35 | hash, 36 | encoded: (entry << Self::ENTRY_OFFSET) 37 | | ((bucket as u64) << Self::BUCKET_OFFSET) 38 | | ((match direction { 39 | Direction::Forward => 1, 40 | Direction::Backward => 0, 41 | }) << Self::DIRECTION_OFFSET), 42 | } 43 | } 44 | 45 | pub fn entry(&self) 
-> u64 { 46 | self.encoded >> Self::ENTRY_OFFSET 47 | } 48 | 49 | pub fn bucket(&self) -> BucketIndexType { 50 | (self.encoded >> Self::BUCKET_OFFSET) as BucketIndexType 51 | } 52 | 53 | pub fn direction(&self) -> Direction { 54 | if (self.encoded >> Self::DIRECTION_OFFSET) & 0x1 == 0 { 55 | Direction::Backward 56 | } else { 57 | Direction::Forward 58 | } 59 | } 60 | } 61 | 62 | pub struct HashEntrySerializer(PhantomData); 63 | 64 | impl BucketItemSerializer for HashEntrySerializer { 65 | type InputElementType<'a> = HashEntry; 66 | type ExtraData = (); 67 | type ReadBuffer = (); 68 | type ExtraDataBuffer = (); 69 | type ReadType<'a> = HashEntry; 70 | 71 | type CheckpointData = (); 72 | 73 | #[inline(always)] 74 | fn new() -> Self { 75 | Self(PhantomData) 76 | } 77 | 78 | #[inline(always)] 79 | fn reset(&mut self) {} 80 | 81 | #[inline(always)] 82 | fn write_to( 83 | &mut self, 84 | element: &Self::InputElementType<'_>, 85 | bucket: &mut Vec, 86 | _extra_data: &Self::ExtraData, 87 | _: &(), 88 | ) { 89 | serialize_into(bucket, element).unwrap(); 90 | } 91 | 92 | fn read_from<'a, S: Read>( 93 | &mut self, 94 | stream: S, 95 | _read_buffer: &'a mut Self::ReadBuffer, 96 | _: &mut (), 97 | ) -> Option> { 98 | deserialize_from(stream).ok() 99 | } 100 | 101 | #[inline(always)] 102 | fn get_size(&self, _: &Self::InputElementType<'_>, _: &()) -> usize { 103 | size_of::() + size_of::() + 8 + 1 104 | } 105 | } 106 | 107 | pub struct HashCompare { 108 | _phantom: PhantomData, 109 | } 110 | 111 | impl SortKey> for HashCompare { 112 | type KeyType = H::HashTypeUnextendable; 113 | const KEY_BITS: usize = size_of::() * 8; 114 | 115 | #[inline(always)] 116 | fn compare( 117 | left: &HashEntry<::HashTypeUnextendable>, 118 | right: &HashEntry<::HashTypeUnextendable>, 119 | ) -> std::cmp::Ordering { 120 | left.hash.cmp(&right.hash) 121 | } 122 | 123 | #[inline(always)] 124 | fn get_shifted(value: &HashEntry, rhs: u8) -> u8 { 125 | H::get_shifted(value.hash, rhs) as u8 126 | } 127 | } 
/// Maximum encoded size in bytes of a varint carrying packed flags.
pub const VARINT_FLAGS_MAX_SIZE: usize = 10;
/// Maximum encoded size in bytes of a plain varint.
/// NOTE(review): 9 bytes hold 63 payload bits, so values with the top bit set
/// are silently truncated — presumably callers never encode such values (TODO confirm).
pub const VARINT_MAX_SIZE: usize = 9;

/// Encodes `value` as an LEB128-style varint (7 data bits per byte, MSB as the
/// continuation bit) and passes the encoded slice to `write_bytes`, returning
/// its result.
#[inline(always)]
pub fn encode_varint<T>(write_bytes: impl FnOnce(&[u8]) -> T, mut value: u64) -> T {
    // Fix: the original created this buffer with
    // `MaybeUninit::uninit().assume_init()`, which is undefined behavior for
    // integer arrays. Zero-initializing 9 bytes is effectively free and sound.
    let mut bytes = [0u8; VARINT_MAX_SIZE];
    let mut index = 0;
    while index < bytes.len() {
        // Set the continuation bit when more than 7 significant bits remain.
        let continuation = ((value > 127) as u8) << 7;
        bytes[index] = ((value as u8) & 0b1111111) | continuation;
        value >>= 7;
        index += 1;
        if value == 0 {
            break;
        }
    }
    write_bytes(&bytes[..index])
}
/// Decodes a varint previously produced by `encode_varint`.
///
/// `read_byte` supplies successive bytes; decoding stops at the first byte
/// whose continuation (high) bit is clear. Returns `None` if the byte source
/// is exhausted before the varint terminates.
#[inline(always)]
pub fn decode_varint(mut read_byte: impl FnMut() -> Option<u8>) -> Option<u64> {
    let mut accumulator: u64 = 0;
    let mut shift = 0u32;
    loop {
        let byte = read_byte()?;
        // Fold the 7 payload bits into position.
        accumulator |= ((byte & 0b1111111) as u64) << shift;
        if byte & 0b10000000 == 0 {
            return Some(accumulator);
        }
        shift += 7;
    }
}
(i % 4) as u8) 130 | .unwrap(); 131 | let mut cursor = Cursor::new(&result); 132 | assert_eq!( 133 | (i, (i % 4) as u8), 134 | decode_varint_flags::<_, typenum::U2>(|| Some(cursor.read_u8().unwrap())).unwrap() 135 | ); 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /crates/kmers_transform/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_kmers_transform" 3 | version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | 7 | # Config 8 | config = { package = "ggcat_config", path = "../config" } 9 | utils = { package = "ggcat_utils", path = "../utils" } 10 | 11 | # Common libraries 12 | parallel-processor = "0.1.24" 13 | streaming-libdeflate-rs = "0.1.5" 14 | nightly-quirks = "0.1.4" 15 | hashes = { package = "ggcat_hashes", path = "../hashes" } 16 | minimizer_bucketing = { package = "ggcat_minimizer_bucketing", path = "../minimizer_bucketing" } 17 | 18 | # Other libraries 19 | io = { package = "ggcat_io", path = "../io" } 20 | parking_lot = "0.12.3" 21 | typenum = "1.17.0" 22 | replace_with = "0.1.7" 23 | tokio = "1.38.0" 24 | 25 | 26 | # Tracing 27 | instrumenter = "0.1.3" 28 | ggcat-logging = { version = "2.0.0", path = "../logging" } 29 | -------------------------------------------------------------------------------- /crates/kmers_transform/src/debug_bucket_stats.rs: -------------------------------------------------------------------------------- 1 | use config::{ 2 | BucketIndexType, MultiplicityCounterType, DEFAULT_OUTPUT_BUFFER_SIZE, DEFAULT_PREFETCH_AMOUNT, 3 | READ_FLAG_INCL_END, USE_SECOND_BUCKET, 4 | }; 5 | use hashes::default::MNHFactory; 6 | use hashes::{ 7 | ExtendableHashTraitType, HashFunction, HashFunctionFactory, HashableSequence, 8 | MinimizerHashFunctionFactory, 9 | }; 10 | use io::compressed_read::CompressedRead; 11 | use io::concurrent::temp_reads::creads_utils::{ 12 | BucketModeFromBoolean, 
/// Recomputes the bucket a read would be assigned to by locating its minimizer.
///
/// `seq_data` layout: `(flags, second-bucket byte, extra data `C`, read, multiplicity)`;
/// only the flags (`.0`) and the compressed read (`.3`) are consulted here.
/// The result uses `bucket_bits_count` bits after skipping `used_hash_bits`
/// bits of the minimizer hash.
fn get_sequence_bucket<C>(
    k: usize,
    m: usize,
    seq_data: &(u8, u8, C, CompressedRead, MultiplicityCounterType),
    used_hash_bits: usize,
    bucket_bits_count: usize,
) -> BucketIndexType {
    let read = &seq_data.3;
    let flags = seq_data.0;
    // When the read is exactly k bases long and the end-inclusion flag is not
    // set, shift the hashed window one base left (decr_val == 1).
    let decr_val = ((read.bases_count() == k) && (flags & READ_FLAG_INCL_END) == 0) as usize;

    let hashes = MNHFactory::new(read.sub_slice((1 - decr_val)..(k - decr_val)), m);

    // The minimizer is the m-mer whose full minimizer value is smallest.
    let minimizer = hashes
        .iter()
        .min_by_key(|k| MNHFactory::get_full_minimizer(k.to_unextendable()))
        .unwrap();

    MNHFactory::get_bucket(
        used_hash_bits,
        bucket_bits_count,
        minimizer.to_unextendable(),
    )
}
AllowedCheckpointStrategy::DecompressOnly, 83 | thread_handle, 84 | ); 85 | 86 | let mut total_counters = vec![0; second_buckets_max]; 87 | 88 | while let Some((items_iterator, _)) = checkpoints_iterator.get_next_checkpoint() { 89 | while let Some((read_info, _)) = items_iterator.next() { 90 | let orig_bucket = get_sequence_bucket::<()>( 91 | k, 92 | m, 93 | &read_info, 94 | buckets_count.ilog2() as usize, 95 | second_buckets_log_max, 96 | ) as usize; 97 | 98 | let hashes = MH::new(read_info.3, k); 99 | 100 | for hash in hashes.iter() { 101 | total_counters[orig_bucket] += 1; 102 | hash_maps[orig_bucket].insert(hash.to_unextendable()); 103 | } 104 | } 105 | } 106 | 107 | let counters_string = hash_maps 108 | .iter() 109 | .zip(total_counters.iter()) 110 | .map(|(h, t)| format!("({}/{})", h.len(), t)) 111 | .collect::>() 112 | .join(";"); 113 | 114 | let tot_seqs = total_counters.iter().sum::(); 115 | let uniq_seqs = hash_maps.iter().map(|h| h.len()).sum::(); 116 | 117 | ggcat_logging::info!("Stats for bucket: {}", bucket_index); 118 | ggcat_logging::info!( 119 | "FSIZE: {} SEQUENCES: {}/{} UNIQUE_RATIO: {} COMPR_RATIO: {} ", 120 | file_size, 121 | tot_seqs, 122 | uniq_seqs, 123 | (tot_seqs as f64 / uniq_seqs as f64), 124 | (file_size as f64 / tot_seqs as f64) 125 | ); 126 | ggcat_logging::info!("Results: {}", counters_string); 127 | } 128 | -------------------------------------------------------------------------------- /crates/kmers_transform/src/processor.rs: -------------------------------------------------------------------------------- 1 | use crate::reads_buffer::ReadsBuffer; 2 | use crate::{ 3 | KmersTransformContext, KmersTransformExecutorFactory, KmersTransformFinalExecutor, 4 | KmersTransformMapProcessor, 5 | }; 6 | use config::{PRIORITY_SCHEDULING_HIGH, WORKERS_PRIORITY_BASE}; 7 | use parallel_processor::execution_manager::executor::{AsyncExecutor, ExecutorReceiver}; 8 | use parallel_processor::execution_manager::memory_tracker::MemoryTracker; 9 | use 
parallel_processor::execution_manager::objects_pool::PoolObjectTrait; 10 | use parallel_processor::execution_manager::packet::{Packet, PacketTrait}; 11 | use parallel_processor::mt_debug_counters::counter::{AtomicCounter, SumMode}; 12 | use parallel_processor::mt_debug_counters::declare_counter_i64; 13 | use parallel_processor::scheduler::PriorityScheduler; 14 | use std::future::Future; 15 | use std::marker::PhantomData; 16 | use std::path::PathBuf; 17 | use std::sync::atomic::Ordering; 18 | use utils::track; 19 | 20 | pub struct KmersTransformProcessor(PhantomData); 21 | 22 | static ADDR_WAITING_COUNTER: AtomicCounter = 23 | declare_counter_i64!("kt_addr_wait_processor", SumMode, false); 24 | 25 | static PACKET_WAITING_COUNTER: AtomicCounter = 26 | declare_counter_i64!("kt_packet_wait_processor", SumMode, false); 27 | 28 | #[derive(Clone)] 29 | pub struct KmersProcessorInitData { 30 | pub sequences_count: usize, 31 | pub sub_bucket: usize, 32 | pub is_resplitted: bool, 33 | pub bucket_paths: Vec, 34 | } 35 | 36 | impl AsyncExecutor for KmersTransformProcessor { 37 | type InputPacket = ReadsBuffer; 38 | type OutputPacket = (); 39 | type GlobalParams = KmersTransformContext; 40 | type InitData = KmersProcessorInitData; 41 | 42 | fn new() -> Self { 43 | Self(PhantomData) 44 | } 45 | 46 | fn async_executor_main<'a>( 47 | &'a mut self, 48 | global_context: &'a Self::GlobalParams, 49 | mut receiver: ExecutorReceiver, 50 | memory_tracker: MemoryTracker, 51 | ) -> impl Future + 'a { 52 | async move { 53 | let mut map_processor = 54 | F::new_map_processor(&global_context.global_extra_data, memory_tracker.clone()); 55 | let mut final_executor = F::new_final_executor(&global_context.global_extra_data); 56 | 57 | let mut packet = Packet::new_simple( 58 | >::MapStruct::allocate_new(&()), 59 | ); 60 | 61 | let thread_handle = PriorityScheduler::declare_thread(PRIORITY_SCHEDULING_HIGH); 62 | 63 | while let Ok((address, proc_info)) = track!( 64 | receiver 65 | 
.obtain_address_with_priority(WORKERS_PRIORITY_BASE, &thread_handle) 66 | .await, 67 | ADDR_WAITING_COUNTER 68 | ) { 69 | map_processor.process_group_start(packet, &global_context.global_extra_data); 70 | 71 | let mut real_size = 0; 72 | let mut total_kmers = 0; 73 | let mut unique_kmers = 0; 74 | 75 | while let Some(input_packet) = track!( 76 | address.receive_packet(&thread_handle).await, 77 | PACKET_WAITING_COUNTER 78 | ) { 79 | real_size += input_packet.reads.len() as usize; 80 | let stats = map_processor.process_group_batch_sequences( 81 | &global_context.global_extra_data, 82 | &input_packet.reads, 83 | &input_packet.extra_buffer, 84 | &input_packet.reads_buffer, 85 | ); 86 | total_kmers += stats.total_kmers; 87 | unique_kmers += stats.unique_kmers; 88 | } 89 | 90 | if !proc_info.is_resplitted { 91 | global_context 92 | .total_sequences 93 | .fetch_add(real_size as u64, Ordering::Relaxed); 94 | global_context 95 | .total_kmers 96 | .fetch_add(total_kmers, Ordering::Relaxed); 97 | global_context 98 | .unique_kmers 99 | .fetch_add(unique_kmers, Ordering::Relaxed); 100 | } 101 | 102 | packet = map_processor.process_group_finalize(&global_context.global_extra_data); 103 | 104 | // static MAX_PACKET_SIZE: AtomicUsize = AtomicUsize::new(0); 105 | let current_size = packet.get_size(); 106 | 107 | if real_size != proc_info.sequences_count { 108 | //MAX_PACKET_SIZE.fetch_max(current_size, Ordering::Relaxed) < current_size { 109 | ggcat_logging::info!( 110 | "Found bucket with max size {} ==> {} // EXPECTED_SIZE: {} REAL_SIZE: {} SUB: {}", 111 | current_size, 112 | proc_info.bucket_paths[0].display(), 113 | proc_info.sequences_count, 114 | real_size, 115 | proc_info.sub_bucket 116 | ); 117 | } 118 | 119 | packet = final_executor.process_map(&global_context.global_extra_data, packet); 120 | packet.reset(); 121 | // address.packet_send( 122 | // global_context 123 | // .finalizer_address 124 | // .read() 125 | // .as_ref() 126 | // .unwrap() 127 | // .clone(), 128 | // 
packet, 129 | // ); 130 | } 131 | final_executor.finalize(&global_context.global_extra_data); 132 | } 133 | } 134 | } 135 | // const MEMORY_FIELDS_COUNT: usize = 2; 136 | // const MEMORY_FIELDS: &'static [&'static str] = &["MAP_SIZE", "CORRECT_READS"]; 137 | -------------------------------------------------------------------------------- /crates/kmers_transform/src/reads_buffer.rs: -------------------------------------------------------------------------------- 1 | use config::MultiplicityCounterType; 2 | use io::compressed_read::CompressedReadIndipendent; 3 | use io::concurrent::temp_reads::extra_data::SequenceExtraDataTempBufferManagement; 4 | use parallel_processor::execution_manager::objects_pool::PoolObjectTrait; 5 | use parallel_processor::execution_manager::packet::PacketTrait; 6 | use std::mem::size_of; 7 | 8 | pub struct ReadsVector { 9 | reads: Vec, 10 | extra_data: Vec, 11 | flags: Vec, 12 | multiplicities: Vec, 13 | pub total_multiplicity: u64, 14 | } 15 | 16 | impl ReadsVector { 17 | pub fn push( 18 | &mut self, 19 | read: CompressedReadIndipendent, 20 | extra_data: E, 21 | flags: u8, 22 | multiplicity: MultiplicityCounterType, 23 | ) { 24 | if self.reads.len() == self.reads.capacity() { 25 | self.reads.reserve(1); 26 | self.extra_data.reserve(1); 27 | self.flags.reserve(1); 28 | if WITH_MULTIPLICITY { 29 | self.multiplicities.reserve(1); 30 | } 31 | } 32 | 33 | // Unsafe to avoid multiple checks 34 | unsafe { 35 | let index = self.reads.len(); 36 | self.reads.set_len(index + 1); 37 | *self.reads.get_unchecked_mut(index) = read; 38 | 39 | // Increment all the lengths when debug assertions are enabled, else it crashes 40 | if cfg!(debug_assertions) { 41 | self.extra_data.set_len(index + 1); 42 | self.flags.set_len(index + 1); 43 | } 44 | 45 | // Extra data and flags 46 | *self.extra_data.get_unchecked_mut(index) = extra_data; 47 | *self.flags.get_unchecked_mut(index) = flags; 48 | if WITH_MULTIPLICITY { 49 | self.multiplicities.set_len(index + 1); 50 | 
    /// Empties all parallel vectors and resets the multiplicity accumulator.
    fn clear(&mut self) {
        // NOTE(review): `self.reads` is cleared first, so the
        // `set_len(self.reads.len())` calls below always set the length to 0.
        // If the intent of "allow dropping" is to mark release-mode elements
        // (written past `len` in `push`) as live so `clear()` drops them, the
        // count would need to be captured *before* this call — TODO confirm.
        self.reads.clear();
        self.multiplicities.clear();

        unsafe {
            // Allow dropping if needed
            self.extra_data.set_len(self.reads.len());
            self.flags.set_len(self.reads.len());
        }

        self.extra_data.clear();
        self.flags.clear();
        self.total_multiplicity = 0;
    }
ReadsBuffer { 140 | pub reads: ReadsVector, 141 | pub sub_bucket: usize, 142 | pub extra_buffer: E::TempBuffer, 143 | pub reads_buffer: Vec, 144 | } 145 | 146 | impl ReadsBuffer { 147 | pub fn is_full(&self) -> bool { 148 | self.reads.len() == self.reads.capacity() 149 | } 150 | } 151 | 152 | impl PoolObjectTrait for ReadsBuffer { 153 | type InitData = usize; 154 | 155 | fn allocate_new(init_data: &Self::InitData) -> Self { 156 | Self { 157 | reads: ReadsVector { 158 | reads: Vec::with_capacity(*init_data), 159 | extra_data: Vec::with_capacity(*init_data), 160 | flags: Vec::with_capacity(*init_data), 161 | multiplicities: Vec::with_capacity(*init_data), 162 | total_multiplicity: 0, 163 | }, 164 | sub_bucket: 0, 165 | extra_buffer: E::new_temp_buffer(), 166 | reads_buffer: vec![], 167 | } 168 | } 169 | 170 | fn reset(&mut self) { 171 | self.reads.clear(); 172 | self.reads_buffer.clear(); 173 | } 174 | } 175 | 176 | impl PacketTrait for ReadsBuffer { 177 | fn get_size(&self) -> usize { 178 | self.reads.len() * size_of::<(u8, E, CompressedReadIndipendent)>() + self.reads_buffer.len() 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /crates/kmers_transform/src/writer.rs: -------------------------------------------------------------------------------- 1 | // use crate::{ 2 | // KmersTransformContext, KmersTransformExecutorFactory, KmersTransformFinalExecutor, 3 | // KmersTransformMapProcessor, 4 | // }; 5 | // use parallel_processor::mt_debug_counters::counter::{AtomicCounter, SumMode}; 6 | // use parallel_processor::mt_debug_counters::declare_counter_i64; 7 | // use parallel_processor::execution_manager::executor::{AsyncExecutor, ExecutorReceiver}; 8 | // use parallel_processor::execution_manager::memory_tracker::MemoryTracker; 9 | // use parallel_processor::execution_manager::objects_pool::PoolObjectTrait; 10 | // use parallel_processor::execution_manager::packet::Packet; 11 | // use std::future::Future; 12 | // 
use std::marker::PhantomData; 13 | // use std::sync::Arc; 14 | // use utils::track; 15 | // 16 | // static ADDR_WAITING_COUNTER: AtomicCounter = 17 | // declare_counter_i64!("kt_addr_wait_writer", SumMode, false); 18 | // 19 | // static PACKET_WAITING_COUNTER: AtomicCounter = 20 | // declare_counter_i64!("kt_packet_wait_writer", SumMode, false); 21 | // 22 | // static PACKET_ALLOC_COUNTER: AtomicCounter = 23 | // declare_counter_i64!("kt_packet_alloc_writer", SumMode, false); 24 | // 25 | // pub struct KmersTransformWriter(PhantomData); 26 | // 27 | // impl AsyncExecutor for KmersTransformWriter { 28 | // type InputPacket = >::MapStruct; 29 | // type OutputPacket = (); 30 | // type GlobalParams = KmersTransformContext; 31 | // type InitData = (); 32 | // 33 | // 34 | // fn new() -> Self { 35 | // Self(PhantomData) 36 | // } 37 | // 38 | // fn async_executor_main<'a>( 39 | // &'a mut self, 40 | // global_context: &'a Self::GlobalParams, 41 | // mut receiver: ExecutorReceiver, 42 | // memory_tracker: MemoryTracker, 43 | // ) -> impl Future + 'a { 44 | // async move { 45 | // // Only one address 46 | // let (address, _) = track!( 47 | // receiver.obtain_address().await.unwrap(), 48 | // ADDR_WAITING_COUNTER 49 | // ); 50 | // 51 | // let mut spawner = address.make_spawner(); 52 | // 53 | // for _ in 0..global_context.compute_threads_count { 54 | // spawner.spawn_executor(async { 55 | // let mut final_executor = 56 | // F::new_final_executor(&global_context.global_extra_data); 57 | // while let Some(packet) = 58 | // track!(address.receive_packet().await, PACKET_WAITING_COUNTER) 59 | // { 60 | // final_executor.process_map(&global_context.global_extra_data, packet); 61 | // } 62 | // final_executor.finalize(&global_context.global_extra_data); 63 | // }); 64 | // } 65 | // spawner.executors_await().await; 66 | // } 67 | // } 68 | // } 69 | // // const MEMORY_FIELDS_COUNT: usize = 1; 70 | // // const MEMORY_FIELDS: &'static [&'static str] = &["BUFFER_SIZES"]; 71 | 
-------------------------------------------------------------------------------- /crates/logging/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat-logging" 3 | version = "2.0.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | parking_lot = "0.12.3" 8 | 9 | 10 | [features] 11 | stats = [] -------------------------------------------------------------------------------- /crates/logging/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod stats; 2 | 3 | use std::fmt::{Debug, Display}; 4 | 5 | use parking_lot::Mutex; 6 | 7 | #[repr(u8)] 8 | pub enum MessageLevel { 9 | Info = 0, 10 | Warning = 1, 11 | Error = 2, 12 | UnrecoverableError = 3, 13 | } 14 | 15 | static MESSAGES_CALLBACK: Mutex> = Mutex::new(None); 16 | 17 | pub fn setup_logging_callback(callback: fn(MessageLevel, &str)) { 18 | let mut messages_callback = MESSAGES_CALLBACK.lock(); 19 | *messages_callback = Some(callback); 20 | } 21 | 22 | pub fn log(level: MessageLevel, message: &str) { 23 | let messages_callback = MESSAGES_CALLBACK.lock(); 24 | if let Some(callback) = &*messages_callback { 25 | callback(level, message); 26 | } else { 27 | if let MessageLevel::UnrecoverableError = level { 28 | panic!("{}", message); 29 | } else { 30 | println!("{}", message); 31 | } 32 | } 33 | } 34 | 35 | #[macro_export] 36 | macro_rules! info { 37 | ($($arg:tt)*) => { 38 | $crate::log($crate::MessageLevel::Info, &format!($($arg)*)); 39 | }; 40 | } 41 | 42 | #[macro_export] 43 | macro_rules! warn { 44 | ($($arg:tt)*) => { 45 | $crate::log($crate::MessageLevel::Warning, &format!($($arg)*)); 46 | }; 47 | } 48 | 49 | #[macro_export] 50 | macro_rules! 
error { 51 | ($($arg:tt)*) => { 52 | $crate::log($crate::MessageLevel::Error, &format!($($arg)*)); 53 | }; 54 | } 55 | 56 | pub trait UnrecoverableErrorLogging { 57 | fn log_unrecoverable_error(self, message: &str) -> Self; 58 | fn log_unrecoverable_error_with_data(self, message: &str, data: D) -> Self; 59 | } 60 | 61 | impl UnrecoverableErrorLogging for std::result::Result { 62 | fn log_unrecoverable_error(self, message: &str) -> Self { 63 | if let Err(err) = &self { 64 | log( 65 | MessageLevel::UnrecoverableError, 66 | &format!("{}: {:?}", message, err), 67 | ); 68 | } 69 | self 70 | } 71 | 72 | fn log_unrecoverable_error_with_data(self, message: &str, data: D) -> Self { 73 | if let Err(err) = &self { 74 | log( 75 | MessageLevel::UnrecoverableError, 76 | &format!("{} [{}]: {:?}", message, data, err), 77 | ); 78 | } 79 | self 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /crates/logging/src/stats.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | pub struct CompactReport { 4 | pub input_files: Vec, 5 | pub output_file: String, 6 | pub elapsed_time: Duration, 7 | } 8 | 9 | pub struct ProcessBatchReport { 10 | pub total_sequences: usize, 11 | pub elapsed_time: Duration, 12 | } 13 | 14 | pub struct BucketStat { 15 | pub index: usize, 16 | pub chunks_count: usize, 17 | pub total_size: usize, 18 | pub sequences_count: usize, 19 | pub kmers_count: usize, 20 | pub process_batches: Vec, 21 | pub final_execution_repprt: Option, 22 | pub compact_reports: Vec, 23 | pub resplit_info: Vec, 24 | pub rewrite_info: Vec, 25 | } 26 | 27 | pub struct AssemblerStats { 28 | pub bucket_stats: Vec, 29 | } 30 | -------------------------------------------------------------------------------- /crates/minimizer_bucketing/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_minimizer_bucketing" 3 | 
version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | 7 | # Config 8 | config = { package = "ggcat_config", path = "../config" } 9 | 10 | # Common libraries 11 | parallel-processor = "0.1.24" 12 | streaming-libdeflate-rs = "0.1.5" 13 | nightly-quirks = "0.1.4" 14 | 15 | # Pipeline libraries 16 | hashes = { package = "ggcat_hashes", path = "../hashes" } 17 | io = { package = "ggcat_io", path = "../io" } 18 | 19 | # Other libraries 20 | replace_with = "0.1.7" 21 | serde = "1.0.203" 22 | bincode = "1.3.3" 23 | typenum = "1.17.0" 24 | parking_lot = "0.12.3" 25 | ggcat-logging = { version = "2.0.0", path = "../logging" } 26 | tokio = "1.38.0" 27 | 28 | # Utils 29 | utils = { package = "ggcat_utils", path = "../utils" } 30 | colors = { package = "ggcat_colors", path = "../colors" } 31 | byteorder = "1.5.0" 32 | rustc-hash = "2.1.0" 33 | -------------------------------------------------------------------------------- /crates/minimizer_bucketing/src/compactor/extra_data.rs: -------------------------------------------------------------------------------- 1 | // use byteorder::ReadBytesExt; 2 | // use colors::colors_manager::{color_types::PartialUnitigsColorStructure, ColorsManager}; 3 | // use io::{ 4 | // concurrent::temp_reads::extra_data::{ 5 | // SequenceExtraDataConsecutiveCompression, SequenceExtraDataTempBufferManagement, 6 | // }, 7 | // varint::{decode_varint, encode_varint, VARINT_MAX_SIZE}, 8 | // }; 9 | 10 | // #[derive(Clone, Debug)] 11 | // pub struct ExtraCompactedData { 12 | // pub multiplicity: u64, 13 | // pub colors: PartialUnitigsColorStructure, 14 | // } 15 | 16 | // impl SequenceExtraDataTempBufferManagement for ExtraCompactedData { 17 | // type TempBuffer = 18 | // as SequenceExtraDataTempBufferManagement>::TempBuffer; 19 | 20 | // fn new_temp_buffer() -> Self::TempBuffer { 21 | // as SequenceExtraDataTempBufferManagement>::new_temp_buffer( 22 | // ) 23 | // } 24 | 25 | // fn clear_temp_buffer(buffer: &mut Self::TempBuffer) { 26 | // as 
SequenceExtraDataTempBufferManagement>::clear_temp_buffer(buffer); 27 | // } 28 | 29 | // fn copy_temp_buffer(dest: &mut Self::TempBuffer, src: &Self::TempBuffer) { 30 | // as SequenceExtraDataTempBufferManagement>::copy_temp_buffer(dest, src); 31 | // } 32 | 33 | // fn copy_extra_from( 34 | // mut extra: Self, 35 | // src: &Self::TempBuffer, 36 | // dst: &mut Self::TempBuffer, 37 | // ) -> Self { 38 | // extra.colors = as SequenceExtraDataTempBufferManagement>::copy_extra_from(extra.colors, src, dst); 39 | // extra 40 | // } 41 | // } 42 | 43 | // impl SequenceExtraDataConsecutiveCompression for ExtraCompactedData { 44 | // type LastData = 45 | // as SequenceExtraDataConsecutiveCompression>::LastData; 46 | 47 | // fn decode_extended( 48 | // buffer: &mut Self::TempBuffer, 49 | // reader: &mut impl std::io::Read, 50 | // last_data: Self::LastData, 51 | // ) -> Option { 52 | // let multiplicity = decode_varint(|| reader.read_u8().ok())?; 53 | // let colors = 54 | // PartialUnitigsColorStructure::::decode_extended(buffer, reader, last_data)?; 55 | // Some(Self { 56 | // multiplicity, 57 | // colors, 58 | // }) 59 | // } 60 | 61 | // fn encode_extended( 62 | // &self, 63 | // buffer: &Self::TempBuffer, 64 | // writer: &mut impl std::io::Write, 65 | // last_data: Self::LastData, 66 | // ) { 67 | // encode_varint(|b| writer.write_all(b), self.multiplicity).unwrap(); 68 | // self.colors.encode_extended(buffer, writer, last_data); 69 | // } 70 | 71 | // fn obtain_last_data(&self, last_data: Self::LastData) -> Self::LastData { 72 | // self.colors.obtain_last_data(last_data) 73 | // } 74 | 75 | // fn max_size(&self) -> usize { 76 | // self.colors.max_size() + VARINT_MAX_SIZE 77 | // } 78 | // } 79 | -------------------------------------------------------------------------------- /crates/minimizer_bucketing/src/counters_analyzer.rs: -------------------------------------------------------------------------------- 1 | use config::BucketIndexType; 2 | use serde::{Deserialize, 
Serialize}; 3 | use std::fs::File; 4 | use std::io::{BufReader, BufWriter}; 5 | use std::path::Path; 6 | use std::sync::atomic::{AtomicI64, AtomicU64}; 7 | 8 | #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)] 9 | pub struct BucketCounter { 10 | pub count: u64, 11 | } 12 | 13 | #[derive(Serialize, Deserialize)] 14 | pub struct CountersAnalyzer { 15 | counters: Vec>, 16 | compaction_offsets: Vec, 17 | median: u64, 18 | } 19 | 20 | impl CountersAnalyzer { 21 | pub fn new(counters: Vec>, offsets: Vec) -> Self { 22 | let mut sorted_counters: Vec<(u64, usize, usize)> = Vec::new(); 23 | 24 | let counters: Vec> = counters 25 | .into_iter() 26 | .enumerate() 27 | .map(|(bucket, vec)| { 28 | vec.into_iter() 29 | .enumerate() 30 | .map(|(second_bucket, mut a)| { 31 | let count = *a.get_mut(); 32 | if count != 0 { 33 | sorted_counters.push((count, bucket, second_bucket)); 34 | } 35 | BucketCounter { count } 36 | }) 37 | .collect() 38 | }) 39 | .collect(); 40 | sorted_counters.sort_unstable_by(|a, b| b.cmp(a)); 41 | 42 | let median = if sorted_counters.len() > 0 { 43 | sorted_counters[sorted_counters.len() / 2].0 44 | } else { 45 | 0 46 | }; 47 | 48 | let compaction_offsets = offsets.into_iter().map(AtomicI64::into_inner).collect(); 49 | 50 | Self { 51 | counters, 52 | median, 53 | compaction_offsets, 54 | } 55 | } 56 | 57 | pub fn get_counters_for_bucket(&self, bucket: BucketIndexType) -> &Vec { 58 | &self.counters[bucket as usize] 59 | } 60 | 61 | pub fn get_compaction_offset(&self, bucket: BucketIndexType) -> i64 { 62 | self.compaction_offsets[bucket as usize] 63 | } 64 | 65 | pub fn print_debug(&self) { 66 | ggcat_logging::info!("************** BUCKETS DEBUG: **************"); 67 | // for (i, cnt_bucket) in self.counters.iter().enumerate() { 68 | // let mut buffer = String::new(); 69 | // for cnt_sub_bucket in cnt_bucket.iter() { 70 | // buffer.push_str(&format!( 71 | // "{}{} ", 72 | // cnt_sub_bucket.count, 73 | // if cnt_sub_bucket.is_outlier { 
"*" } else { "" }, 74 | // )); 75 | // } 76 | // ggcat_logging::info!("{} SIZES: {}", i, buffer); 77 | // } 78 | ggcat_logging::info!("Sub-bucket median: {}", self.median); 79 | ggcat_logging::info!( 80 | "Sub-bucket maximum: {}", 81 | self.counters 82 | .iter() 83 | .map(|x| x.iter().map(|c| c.count).max().unwrap_or(0)) 84 | .max() 85 | .unwrap_or(0) 86 | ); 87 | } 88 | 89 | pub fn load_from_file(path: impl AsRef, remove: bool) -> Self { 90 | let file = BufReader::new(File::open(&path).unwrap()); 91 | let rval: CountersAnalyzer = bincode::deserialize_from(file).unwrap(); 92 | 93 | // rval.counters.iter_mut().enumerate().for_each(|(bn, x)| { 94 | // x.iter_mut().enumerate().for_each(|(sbn, y)| { 95 | // if y.is_outlier { 96 | // ggcat_logging::info!("Found outlier: vec{}.{}", bn, sbn); 97 | // // y.is_outlier = false 98 | // } 99 | // }) 100 | // }); 101 | 102 | // rval.print_debug(); 103 | 104 | if remove { 105 | let _ = std::fs::remove_file(path); 106 | } 107 | rval 108 | } 109 | 110 | pub fn serialize_to_file(&self, path: impl AsRef) { 111 | let file = BufWriter::new( 112 | File::create(path.as_ref()) 113 | .expect(&format!("Cannot open file {}", path.as_ref().display())), 114 | ); 115 | bincode::serialize_into(file, self).unwrap(); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /crates/minimizer_bucketing/src/queue_data.rs: -------------------------------------------------------------------------------- 1 | use io::sequences_reader::{DnaSequence, DnaSequencesFileType}; 2 | use io::sequences_stream::SequenceInfo; 3 | use parallel_processor::execution_manager::objects_pool::PoolObjectTrait; 4 | use parallel_processor::execution_manager::packet::PacketTrait; 5 | use std::mem::size_of; 6 | 7 | type SequencesType = (usize, usize, usize, DnaSequencesFileType, SequenceInfo); 8 | 9 | pub struct MinimizerBucketingQueueData { 10 | data: Vec, 11 | pub sequences: Vec, 12 | pub stream_info: F, 13 | pub 
start_read_index: u64, 14 | } 15 | 16 | impl MinimizerBucketingQueueData { 17 | pub fn new(capacity: usize, stream_info: F) -> Self { 18 | Self { 19 | data: Vec::with_capacity(capacity), 20 | sequences: Vec::with_capacity(capacity / 512), 21 | stream_info, 22 | start_read_index: 0, 23 | } 24 | } 25 | 26 | pub fn push_sequences(&mut self, seq: DnaSequence, seq_info: SequenceInfo) -> bool { 27 | let ident_len = seq.ident_data.len(); 28 | let seq_len = seq.seq.len(); 29 | 30 | let tot_len = ident_len + seq_len; 31 | 32 | if self.data.len() != 0 && (self.data.capacity() - self.data.len()) < tot_len { 33 | return false; 34 | } 35 | 36 | let start = self.data.len(); 37 | self.data.extend_from_slice(seq.ident_data); 38 | self.data.extend_from_slice(seq.seq); 39 | 40 | self.sequences 41 | .push((start, ident_len, seq_len, seq.format, seq_info)); 42 | 43 | true 44 | } 45 | 46 | pub fn iter_sequences(&self) -> impl Iterator { 47 | self.sequences 48 | .iter() 49 | .map(move |&(start, id_len, seq_len, format, seq_info)| { 50 | let mut start = start; 51 | 52 | let ident_data = &self.data[start..start + id_len]; 53 | start += id_len; 54 | 55 | let seq = &self.data[start..start + seq_len]; 56 | 57 | ( 58 | DnaSequence { 59 | ident_data, 60 | seq, 61 | format, 62 | }, 63 | seq_info, 64 | ) 65 | }) 66 | } 67 | } 68 | 69 | impl PoolObjectTrait 70 | for MinimizerBucketingQueueData 71 | { 72 | type InitData = usize; 73 | 74 | fn allocate_new(init_data: &Self::InitData) -> Self { 75 | Self::new(*init_data, F::default()) 76 | } 77 | 78 | fn reset(&mut self) { 79 | self.data.clear(); 80 | self.sequences.clear(); 81 | } 82 | } 83 | 84 | impl PacketTrait for MinimizerBucketingQueueData { 85 | fn get_size(&self) -> usize { 86 | self.data.len() + (self.sequences.len() * size_of::()) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /crates/minimizer_bucketing/src/resplit_bucket.rs: 
-------------------------------------------------------------------------------- 1 | use config::{BucketIndexType, MultiplicityCounterType}; 2 | use io::compressed_read::CompressedRead; 3 | 4 | pub trait RewriteBucketCompute: Sized + 'static + Send { 5 | fn get_rewrite_bucket( 6 | k: usize, 7 | m: usize, 8 | seq_data: &(u8, u8, C, CompressedRead, MultiplicityCounterType), 9 | used_hash_bits: usize, 10 | bucket_bits_count: usize, 11 | ) -> BucketIndexType; 12 | } 13 | -------------------------------------------------------------------------------- /crates/minimizer_bucketing/src/sequences_splitter.rs: -------------------------------------------------------------------------------- 1 | use io::sequences_reader::DnaSequence; 2 | use std::ops::Range; 3 | 4 | pub struct SequencesSplitter { 5 | k: usize, 6 | pub valid_bases: u64, 7 | } 8 | 9 | impl SequencesSplitter { 10 | pub fn new(k: usize) -> Self { 11 | Self { k, valid_bases: 0 } 12 | } 13 | 14 | #[inline] 15 | pub fn process_sequences( 16 | &mut self, 17 | fasta_seq: &DnaSequence, 18 | mut process_fn: impl FnMut(&[u8], Range), 19 | ) { 20 | let mut start; 21 | let mut end = 0; 22 | 23 | while end < fasta_seq.seq.len() { 24 | start = end; 25 | // Skip all not recognized characters 26 | while start < fasta_seq.seq.len() && fasta_seq.seq[start] == b'N' { 27 | start += 1; 28 | } 29 | end = start; 30 | // Find the last valid character in this sequence 31 | while end < fasta_seq.seq.len() && fasta_seq.seq[end] != b'N' { 32 | end += 1; 33 | } 34 | // If the length of the read is long enough, return it 35 | if end - start >= self.k { 36 | self.valid_bases += (end - start) as u64; 37 | process_fn(&fasta_seq.seq[start..end], start..end); 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /crates/querier/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_querier" 3 | version = "0.1.1" 4 | 
edition = "2021" 5 | [dependencies] 6 | 7 | # Config 8 | config = { package = "ggcat_config", path = "../config" } 9 | 10 | # Utils 11 | utils = { package = "ggcat_utils", path = "../utils" } 12 | 13 | 14 | # Static dispatch 15 | dynamic-dispatch = "0.5.4" 16 | 17 | # Common libraries 18 | parallel-processor = "0.1.24" 19 | streaming-libdeflate-rs = "0.1.5" 20 | nightly-quirks = "0.1.4" 21 | 22 | # Common pipeline libraries 23 | hashes = { package = "ggcat_hashes", path = "../hashes" } 24 | io = { package = "ggcat_io", path = "../io" } 25 | colors = { package = "ggcat_colors", path = "../colors" } 26 | structs = { package = "ggcat_structs", path = "../structs" } 27 | minimizer_bucketing = { package = "ggcat_minimizer_bucketing", path = "../minimizer_bucketing" } 28 | kmers_transform = { package = "ggcat_kmers_transform", path = "../kmers_transform" } 29 | typenum = "1.17.0" 30 | rayon = "1.10.0" 31 | byteorder = "1.5.0" 32 | hashbrown = "0.14.5" 33 | csv = "1.3.0" 34 | parking_lot = "0.12.3" 35 | lz4 = "1.25.0" 36 | flate2 = "1.0.30" 37 | ggcat-logging = { version = "2.0.0", path = "../logging" } 38 | anyhow = "1.0.89" 39 | 40 | [features] 41 | devel-build = [] 42 | -------------------------------------------------------------------------------- /crates/querier/src/pipeline.rs: -------------------------------------------------------------------------------- 1 | pub mod colored_query_output; 2 | pub mod colormap_reading; 3 | pub mod counters_sorting; 4 | pub mod parallel_kmers_query; 5 | pub mod querier_minimizer_bucketing; 6 | -------------------------------------------------------------------------------- /crates/querier/src/structs.rs: -------------------------------------------------------------------------------- 1 | pub mod query_colored_counters; 2 | -------------------------------------------------------------------------------- /crates/querier/src/structs/query_colored_counters.rs: 
-------------------------------------------------------------------------------- 1 | use byteorder::ReadBytesExt; 2 | use colors::storage::run_length::ColorIndexSerializer; 3 | use config::ColorIndexType; 4 | use io::varint::{decode_varint, encode_varint, VARINT_MAX_SIZE}; 5 | use parallel_processor::buckets::bucket_writer::BucketItemSerializer; 6 | use std::io::Read; 7 | use std::ops::Range; 8 | 9 | #[derive(Debug, Clone)] 10 | pub enum ColorsRange { 11 | Range(Range), 12 | } 13 | 14 | impl ColorsRange { 15 | pub fn write_to_vec(self, vec: &mut Vec) { 16 | let ColorsRange::Range(range) = self; 17 | vec.push(range.start); 18 | vec.push(range.end); 19 | } 20 | 21 | pub fn from_slice(slice: &[ColorIndexType]) -> Self { 22 | ColorsRange::Range(slice[0]..slice[1]) 23 | } 24 | } 25 | 26 | #[derive(Debug, Clone)] 27 | pub struct QueryColorDesc { 28 | pub query_index: u64, 29 | pub count: u64, 30 | } 31 | 32 | pub struct QueryColoredCounters<'a> { 33 | pub queries: &'a [QueryColorDesc], 34 | pub colors: &'a [ColorIndexType], 35 | } 36 | 37 | pub struct QueryColoredCountersSerializer; 38 | 39 | impl BucketItemSerializer for QueryColoredCountersSerializer { 40 | type InputElementType<'a> = QueryColoredCounters<'a>; 41 | type ExtraData = (); 42 | type ReadBuffer = (Vec, Vec); 43 | type ExtraDataBuffer = (); 44 | type ReadType<'b> = QueryColoredCounters<'b>; 45 | 46 | type CheckpointData = (); 47 | 48 | fn new() -> Self { 49 | Self 50 | } 51 | 52 | fn reset(&mut self) {} 53 | 54 | fn write_to( 55 | &mut self, 56 | element: &QueryColoredCounters<'_>, 57 | bucket: &mut Vec, 58 | _extra_data: &Self::ExtraData, 59 | _extra_read_buffer: &Self::ExtraDataBuffer, 60 | ) { 61 | encode_varint( 62 | |b| bucket.extend_from_slice(b), 63 | element.queries.len() as u64, 64 | ); 65 | for query in element.queries.iter() { 66 | encode_varint(|b| bucket.extend_from_slice(b), query.query_index); 67 | encode_varint(|b| bucket.extend_from_slice(b), query.count); 68 | } 69 | 70 | 
assert_eq!(element.colors.len() % 2, 0); 71 | ColorIndexSerializer::serialize_colors(bucket, &element.colors); 72 | } 73 | 74 | fn read_from<'b, S: Read>( 75 | &mut self, 76 | mut stream: S, 77 | read_buffer: &'b mut Self::ReadBuffer, 78 | _extra_read_buffer: &mut Self::ExtraDataBuffer, 79 | ) -> Option> { 80 | read_buffer.0.clear(); 81 | read_buffer.1.clear(); 82 | 83 | let queries_count = decode_varint(|| stream.read_u8().ok())?; 84 | for _ in 0..queries_count { 85 | let query_index = decode_varint(|| stream.read_u8().ok())?; 86 | let count = decode_varint(|| stream.read_u8().ok())?; 87 | read_buffer.0.push(QueryColorDesc { query_index, count }); 88 | } 89 | 90 | ColorIndexSerializer::deserialize_colors(stream, &mut read_buffer.1)?; 91 | Some(QueryColoredCounters { 92 | queries: &read_buffer.0, 93 | colors: &read_buffer.1, 94 | }) 95 | } 96 | 97 | fn get_size(&self, element: &Self::InputElementType<'_>, _extra: &Self::ExtraData) -> usize { 98 | (element.colors.len() + element.queries.len() + 1) * VARINT_MAX_SIZE * 4 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /crates/structs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_structs" 3 | version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | bincode = "1.3.3" 7 | byteorder = "1.5.0" 8 | config = { package = "ggcat_config", path = "../config" } 9 | io = { package = "ggcat_io", path = "../io" } 10 | parallel-processor = "0.1.24" 11 | serde = { version = "1.0.203", features = ["derive"] } 12 | 13 | 14 | [features] 15 | support_kmer_counters = [] 16 | -------------------------------------------------------------------------------- /crates/structs/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod map_entry; 2 | #[cfg(feature = "support_kmer_counters")] 3 | pub mod unitigs_counters; 4 | 
-------------------------------------------------------------------------------- /crates/structs/src/map_entry.rs: -------------------------------------------------------------------------------- 1 | use config::{MultiplicityCounterType, READ_FLAG_INCL_BEGIN, READ_FLAG_INCL_END}; 2 | use std::cell::Cell; 3 | use std::mem::size_of; 4 | 5 | const FLAGS_COUNT: usize = 2; 6 | const FLAGS_SHIFT: usize = size_of::() * 8 - FLAGS_COUNT; 7 | const USED_MARKER: usize = 1 << (FLAGS_SHIFT - 1); 8 | const COUNTER_MASK: usize = (1 << (FLAGS_SHIFT - 1)) - 1; 9 | 10 | pub const COUNTER_BITS: usize = FLAGS_SHIFT - 1; 11 | 12 | pub struct MapEntry { 13 | count_flags: Cell, 14 | pub color_index: CHI, 15 | } 16 | 17 | unsafe impl Sync for MapEntry {} 18 | 19 | impl MapEntry { 20 | pub fn new(color_index: CHI) -> Self { 21 | Self { 22 | count_flags: Cell::new(0), 23 | color_index, 24 | } 25 | } 26 | 27 | #[inline(always)] 28 | pub fn incr(&mut self) { 29 | self.count_flags.set(self.count_flags.get() + 1); 30 | } 31 | 32 | // Increment the multiplicity anc check if the threshold is just crossed 33 | #[inline(always)] 34 | pub fn incr_by_and_check( 35 | &mut self, 36 | value: MultiplicityCounterType, 37 | check_threshold: usize, 38 | ) -> bool { 39 | let exceeded = self.count_flags.get() >= check_threshold; 40 | self.count_flags 41 | .set(self.count_flags.get() + value as usize); 42 | let now_exceeded = self.count_flags.get() < check_threshold; 43 | !exceeded && now_exceeded 44 | } 45 | 46 | #[inline(always)] 47 | pub fn set_used(&self) { 48 | self.count_flags.set(self.count_flags.get() | USED_MARKER); 49 | } 50 | 51 | #[inline(always)] 52 | pub fn is_used(&self) -> bool { 53 | (self.count_flags.get() & USED_MARKER) == USED_MARKER 54 | } 55 | 56 | #[inline(always)] 57 | pub fn get_counter(&self) -> usize { 58 | self.count_flags.get() & COUNTER_MASK 59 | } 60 | 61 | #[track_caller] 62 | pub fn set_counter_after_check(&mut self, value: usize) { 63 | self.count_flags 64 | 
.set((self.count_flags.get() & !COUNTER_MASK) | (value & COUNTER_MASK)); 65 | } 66 | 67 | #[inline(always)] 68 | pub fn update_flags(&mut self, flags: u8) { 69 | self.count_flags 70 | .set(self.count_flags.get() | ((flags as usize) << FLAGS_SHIFT)); 71 | } 72 | 73 | #[inline(always)] 74 | pub fn get_flags(&self) -> u8 { 75 | (self.count_flags.get() >> FLAGS_SHIFT) as u8 76 | } 77 | 78 | pub fn get_kmer_multiplicity(&self) -> usize { 79 | // If the current set has both the partial sequences endings, we should divide the counter by 2, 80 | // as all the kmers are counted exactly two times 81 | self.get_counter() 82 | >> ((self.get_flags() == (READ_FLAG_INCL_BEGIN | READ_FLAG_INCL_END)) as u8) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /crates/structs/src/unitigs_counters.rs: -------------------------------------------------------------------------------- 1 | use byteorder::ReadBytesExt; 2 | use io::{ 3 | concurrent::temp_reads::extra_data::{HasEmptyExtraBuffer, SequenceExtraData}, 4 | varint::{decode_varint, encode_varint, VARINT_MAX_SIZE}, 5 | }; 6 | use serde::{Deserialize, Serialize}; 7 | use std::io::{Read, Write}; 8 | 9 | #[derive(Copy, Clone, Serialize, Deserialize, Debug)] 10 | pub struct UnitigsCounters { 11 | pub first: u64, 12 | pub sum: u64, 13 | pub last: u64, 14 | } 15 | 16 | impl UnitigsCounters { 17 | #[inline] 18 | pub fn new() -> Self { 19 | Self { 20 | first: 0, 21 | sum: 0, 22 | last: 0, 23 | } 24 | } 25 | } 26 | 27 | impl HasEmptyExtraBuffer for UnitigsCounters {} 28 | 29 | impl SequenceExtraData for UnitigsCounters { 30 | fn decode_extended(_: &mut Self::TempBuffer, reader: &mut impl Read) -> Option { 31 | let first = decode_varint(|| reader.read_u8().ok())?; 32 | let sum = decode_varint(|| reader.read_u8().ok())?; 33 | let last = decode_varint(|| reader.read_u8().ok())?; 34 | Some(Self { first, sum, last }) 35 | } 36 | 37 | fn encode_extended(&self, _: &Self::TempBuffer, writer: &mut impl 
Write) { 38 | encode_varint(|b| writer.write(b).ok(), self.first).unwrap(); 39 | encode_varint(|b| writer.write(b).ok(), self.sum).unwrap(); 40 | encode_varint(|b| writer.write(b).ok(), self.last).unwrap(); 41 | } 42 | 43 | #[inline(always)] 44 | fn max_size(&self) -> usize { 45 | 3 * VARINT_MAX_SIZE 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /crates/utils/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ggcat_utils" 3 | version = "2.0.0" 4 | edition = "2021" 5 | [dependencies] 6 | parking_lot = "0.12.3" 7 | rand = "0.8.5" 8 | 9 | parallel-processor = "0.1.24" 10 | -------------------------------------------------------------------------------- /crates/utils/src/debug_functions.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | use std::sync::atomic::{AtomicU64, Ordering}; 4 | 5 | pub static KCOUNTER: AtomicU64 = AtomicU64::new(0); 6 | 7 | pub fn debug_increase() { 8 | KCOUNTER.fetch_add(1, Ordering::Relaxed); 9 | } 10 | 11 | pub fn debug_print() { 12 | println!("COUNTER: {:?}", KCOUNTER.load(Ordering::Relaxed)); 13 | } 14 | 15 | #[macro_export] 16 | macro_rules! 
track { 17 | ($code:expr, $tracker:ident) => {{ 18 | use parallel_processor::mt_debug_counters::counter::AtomicCounterGuardSum; 19 | let guard = AtomicCounterGuardSum::new(&$tracker, 1); 20 | $code 21 | }}; 22 | } 23 | 24 | // pub fn debug_minimizers( 25 | // read: R, 26 | // m: usize, 27 | // k: usize, 28 | // ) { 29 | // println!("Debugging sequence: {}", read.debug_to_string()); 30 | // 31 | // let mut queue = RollingMinQueue::::new(k - m); 32 | // 33 | // let hashes = MNHFactory::new(read, m); 34 | // 35 | // let rolling_iter = queue.make_iter(hashes.iter().map(|x| x.to_unextendable())); 36 | // 37 | // for (idx, hash) in rolling_iter.enumerate() { 38 | // println!( 39 | // "Minimizer info for kmer: {}\nHASH: {} UNMASKED_HASH: {} FB: {} SB: {} SH: {}", 40 | // read.get_subslice(idx..(idx + k - 1)).debug_to_string(), 41 | // MNHFactory::get_full_minimizer(hash), 42 | // MNHFactory::get_full_minimizer(hash), 43 | // MNHFactory::get_first_bucket(hash), 44 | // MNHFactory::get_second_bucket(hash), 45 | // MNHFactory::get_sorting_hash(hash), 46 | // ); 47 | // } 48 | // } 49 | -------------------------------------------------------------------------------- /crates/utils/src/fast_rand_bool.rs: -------------------------------------------------------------------------------- 1 | use rand::rngs::ThreadRng; 2 | use rand::{thread_rng, RngCore}; 3 | 4 | // Increasing PROB_ITERS decreases the probability that a true value happens, 5 | // by combining with and multiple random values 6 | pub struct FastRandBool { 7 | random: ThreadRng, 8 | randidx: usize, 9 | randval: u64, 10 | } 11 | 12 | impl FastRandBool { 13 | fn get_random(&mut self) -> u64 { 14 | let mut val = u64::MAX; 15 | for _ in 0..PROB_ITERS { 16 | val &= self.random.next_u64(); 17 | } 18 | 19 | val 20 | } 21 | 22 | pub fn new() -> Self { 23 | let random = thread_rng(); 24 | Self { 25 | random, 26 | randidx: 0, 27 | randval: 0, 28 | } 29 | } 30 | 31 | pub fn get_randbool(&mut self) -> bool { 32 | if self.randidx 
== 0 { 33 | self.randval = self.get_random(); 34 | self.randidx = 64; 35 | } 36 | self.randidx -= 1; 37 | let result = (self.randval & 0x1) == 1; 38 | self.randval >>= 1; 39 | result 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /crates/utils/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | pub mod debug_functions; 3 | pub mod fast_rand_bool; 4 | pub mod owned_drop; 5 | pub mod resource_counter; 6 | pub mod vec_slice; 7 | 8 | use std::cmp::max; 9 | 10 | pub struct Utils; 11 | 12 | const C_INV_LETTERS: [u8; 4] = [b'A', b'C', b'T', b'G']; 13 | 14 | #[macro_export] 15 | macro_rules! panic_debug { 16 | ($($arg:tt)*) => { 17 | #[cfg(feature = "debug")] 18 | panic!($($arg)*); 19 | #[cfg(not(feature = "debug"))] 20 | unsafe { std::hint::unreachable_unchecked() } 21 | }; 22 | } 23 | 24 | pub fn compute_best_m(k: usize) -> usize { 25 | match k { 26 | 0..=13 => max(k / 2, k - 4), 27 | 14..=15 => 9, 28 | 16..=21 => 10, 29 | 22..=30 => 11, 30 | 31..=37 => 12, 31 | 38..=42 => 13, 32 | 43..=64 => 14, 33 | _ => ((k as f64) / 4.0).round() as usize, 34 | } 35 | } 36 | 37 | impl Utils { 38 | #[inline(always)] 39 | pub fn compress_base(base: u8) -> u8 { 40 | (base >> 1) & 0x3 41 | } 42 | 43 | #[inline(always)] 44 | pub fn decompress_base(cbase: u8) -> u8 { 45 | C_INV_LETTERS[cbase as usize] 46 | } 47 | 48 | #[inline(always)] 49 | pub fn conditional_rc_base(cbase: u8, do_rc: bool) -> u8 { 50 | cbase ^ if do_rc { 2 } else { 0 } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /crates/utils/src/owned_drop.rs: -------------------------------------------------------------------------------- 1 | use std::mem::MaybeUninit; 2 | use std::ops::{Deref, DerefMut}; 3 | 4 | pub struct OwnedDrop { 5 | val: MaybeUninit, 6 | was_taken: bool, 7 | } 8 | 9 | impl OwnedDrop { 10 | #[inline(always)] 11 | pub fn new(val: X) -> Self { 12 | 
Self { 13 | val: MaybeUninit::new(val), 14 | was_taken: false, 15 | } 16 | } 17 | 18 | pub unsafe fn take(&mut self) -> X { 19 | self.was_taken = true; 20 | std::ptr::read(self.val.assume_init_mut() as *const X) 21 | } 22 | } 23 | 24 | impl Deref for OwnedDrop { 25 | type Target = X; 26 | 27 | fn deref(&self) -> &Self::Target { 28 | unsafe { self.val.assume_init_ref() } 29 | } 30 | } 31 | 32 | impl DerefMut for OwnedDrop { 33 | fn deref_mut(&mut self) -> &mut Self::Target { 34 | unsafe { self.val.assume_init_mut() } 35 | } 36 | } 37 | 38 | impl Drop for OwnedDrop { 39 | fn drop(&mut self) { 40 | if !self.was_taken { 41 | unsafe { 42 | std::mem::drop(std::ptr::read(self.val.assume_init_ref() as *const X)); 43 | } 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /crates/utils/src/resource_counter.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::{Condvar, Mutex}; 2 | use std::sync::Arc; 3 | 4 | pub struct ResourceCounter { 5 | counter: Mutex, 6 | condvar: Condvar, 7 | } 8 | 9 | impl ResourceCounter { 10 | pub fn new(limit: u64) -> Arc { 11 | Arc::new(Self { 12 | counter: Mutex::new(limit as i64), 13 | condvar: Condvar::new(), 14 | }) 15 | } 16 | 17 | pub fn allocate_blocking(self: &Arc, count: u64) { 18 | let mut counter = self.counter.lock(); 19 | loop { 20 | if *counter >= count as i64 { 21 | *counter -= count as i64; 22 | break; 23 | } else { 24 | self.condvar.wait(&mut counter); 25 | } 26 | } 27 | } 28 | 29 | pub fn allocate_overflow(self: &Arc, count: u64) { 30 | let mut counter = self.counter.lock(); 31 | *counter -= count as i64; 32 | } 33 | 34 | pub fn deallocate(self: &Arc, count: u64, notify_amount: u64) { 35 | let mut counter = self.counter.lock(); 36 | *counter += count as i64; 37 | if *counter >= notify_amount as i64 { 38 | self.condvar.notify_all(); 39 | } 40 | } 41 | } 42 | 
-------------------------------------------------------------------------------- /crates/utils/src/vec_slice.rs: -------------------------------------------------------------------------------- 1 | use std::marker::PhantomData; 2 | 3 | #[derive(Clone, Debug)] 4 | pub struct VecSlice { 5 | pub pos: usize, 6 | len: usize, 7 | _phantom: PhantomData, 8 | } 9 | 10 | impl VecSlice { 11 | pub const EMPTY: Self = Self::new(0, 0); 12 | 13 | pub const fn new(pos: usize, len: usize) -> Self { 14 | Self { 15 | pos, 16 | len, 17 | _phantom: PhantomData, 18 | } 19 | } 20 | 21 | pub fn new_extend_iter(ref_vec: &mut Vec, iter: impl Iterator) -> Self { 22 | let pos = ref_vec.len(); 23 | ref_vec.extend(iter); 24 | Self { 25 | pos, 26 | len: ref_vec.len() - pos, 27 | _phantom: Default::default(), 28 | } 29 | } 30 | 31 | pub fn len(&self) -> usize { 32 | self.len 33 | } 34 | 35 | pub fn iter(&self) -> impl Iterator { 36 | (self.pos..self.pos + self.len).into_iter() 37 | } 38 | 39 | pub fn get_slice<'a>(&self, vec: &'a Vec) -> &'a [T] { 40 | &vec[self.pos..self.pos + self.len] 41 | } 42 | pub fn get_slice_mut<'a>(&self, vec: &'a mut Vec) -> &'a mut [T] { 43 | &mut vec[self.pos..self.pos + self.len] 44 | } 45 | } 46 | 47 | impl VecSlice { 48 | pub fn new_extend(ref_vec: &mut Vec, slice: &[T]) -> Self { 49 | let pos = ref_vec.len(); 50 | ref_vec.extend_from_slice(slice); 51 | Self { 52 | pos, 53 | len: slice.len(), 54 | _phantom: Default::default(), 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /example-inputs/query.fa: -------------------------------------------------------------------------------- 1 | >NODE_29_length_964_cov_255.453_ID_57 2 | 
GTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGATTAGGTCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTACCTTAAAGAAGCGTACTTTGCAGTGCTCACACAGATTGTCTGATGAAAAGTAAATAGCAAGGCGTCTTGCGAAGCAGACTGATACGTCCCCTTCGTCTAGAGGCCCAGGACACCGCCCTTTCACGGCGGTAACAGGGGTTCGAATCCCCTAGGGGACGCCACTTGCTGGTTTGTGAGTGAAAGTCACCTGCCCTAATATCTCAAAACTGACTTACGAGTCACGTTTGAGATATTTGCTCTTTAAAAATCTGGATCAAGCTGAAAATTGAAACACAGAACAACGAAAGTTGTTCGTGAGTCTCTCAAATTTTCGCGACACGATGATGAATCGCGAGAAACATCTTCGGGTTGTGAGGTTAAGCGACTAAGCGTACACGGTGGATGCCCTGGCAGTCAGAGGCGATGAAGGGCGTGCTAATCTGCGATAAGCGCCGGTAAGGTGATATGAACCGTTATAACC 3 | >NODE_33_length_560_cov_85.1478_ID_65 4 | CCGGTGGTGGCATGACAGTGCAGGTGCAGGCGCACCTCAAAACGTTTTTTGATTTCGCTGACCAGCTCATACGCCGCCATCGGTGTAAGAATGCCGGACATATCCTTGATGGCAATCGAGTCAACGCCGGTTTCCAGCAGTTGCTCTGTTAAATCCAGCCAGGTCTGCAACGTGTGCGCCGGACTGGTGGTATAGCTCAGCGTGCCCTGGGCGTGCGCGCCGTGGCTGCGCACCGCCTGCAGGGCGGCTTTCATATTGCGCGGGTCGTTCATGGCATCGAAGACGCGGAACACGTCCATGCCGTTTTTCACTGCCCGCTCAACGAAGCGTTCCACCACGTCATCGGCGTAGTGGCGATAACCGAGCAGGTTCTGACCACGCAGCAGCATCTGCAACGGGGTTTTGGGCATGGCTTTTTTCAGTTCGCGCAGGCGCAGCCACGGGTCTTCGCCGAGGAAACGGATGCAGGCGTCAAAGGTGGCGCCGCCCCAGCACTCCAGCGACCCGTAGCCCACGTCGTCGAGCGCGGCGGCAATCGGCAGCATATCGTCAAGGCGCAG 5 | -------------------------------------------------------------------------------- /example-inputs/sal2.fa: -------------------------------------------------------------------------------- 1 | >NODE_29_length_964_cov_255.453_ID_57 2 | 
GTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGATTAGGTCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTACCTTAAAGAAGCGTACTTTGCAGTGCTCACACAGATTGTCTGATGAAAAGTAAATAGCAAGGCGTCTTGCGAAGCAGACTGATACGTCCCCTTCGTCTAGAGGCCCAGGACACCGCCCTTTCACGGCGGTAACAGGGGTTCGAATCCCCTAGGGGACGCCACTTGCTGGTTTGTGAGTGAAAGTCACCTGCCCTAATATCTCAAAACTGACTTACGAGTCACGTTTGAGATATTTGCTCTTTAAAAATCTGGATCAAGCTGAAAATTGAAACACAGAACAACGAAAGTTGTTCGTGAGTCTCTCAAATTTTCGCGACACGATGATGAATCGCGAGAAACATCTTCGGGTTGTGAGGTTAAGCGACTAAGCGTACACGGTGGATGCCCTGGCAGTCAGAGGCGATGAAGGGCGTGCTAATCTGCGATAAGCGCCGGTAAGGTGATATGAACCGTTATAACC 3 | >NODE_33_length_560_cov_85.1478_ID_65 4 | CCGGTGGTGGCATGACAGTGCAGGTGCAGGCGCACCTCAAAACGTTTTTTGATTTCGCTGACCAGCTCATACGCCGCCATCGGTGTAAGAATGCCGGACATATCCTTGATGGCAATCGAGTCAACGCCGGTTTCCAGCAGTTGCTCTGTTAAATCCAGCCAGGTCTGCAACGTGTGCGCCGGACTGGTGGTATAGCTCAGCGTGCCCTGGGCGTGCGCGCCGTGGCTGCGCACCGCCTGCAGGGCGGCTTTCATATTGCGCGGGTCGTTCATGGCATCGAAGACGCGGAACACGTCCATGCCGTTTTTCACTGCCCGCTCAACGAAGCGTTCCACCACGTCATCGGCGTAGTGGCGATAACCGAGCAGGTTCTGACCACGCAGCAGCATCTGCAACGGGGTTTTGGGCATGGCTTTTTTCAGTTCGCGCAGGCGCAGCCACGGGTCTTCGCCGAGGAAACGGATGCAGGCGTCAAAGGTGGCGCCGCCCCAGCACTCCAGCGACCCGTAGCCCACGTCGTCGAGCGCGGCGGCAATCGGCAGCATATCGTCAAGGCGCAG 5 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /tests/build_checking.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | mkdir -p target/checking 3 
| export CARGO_TARGET_DIR="$PWD/target/checking" 4 | 5 | pushd ../ 6 | 7 | # cargo clean 8 | 9 | cargo check --tests --all 10 | cargo check 11 | cargo check --release 12 | 13 | 14 | export TESTABLE_FEATURES="mem-analysis no-stats process-stats tracing devel-build kmer-counters" 15 | for feature in ${TESTABLE_FEATURES}; do 16 | echo "Checking with feature $feature" 17 | cargo check --tests --features "$feature" 18 | cargo check --features "$feature" 19 | cargo check --release --features "$feature" 20 | done 21 | 22 | pushd crates/api/example/ 23 | cargo check 24 | popd 25 | pushd crates/capi/ggcat-cpp-api/example 26 | make -j 27 | popd 28 | 29 | cargo clean 30 | --------------------------------------------------------------------------------