├── .github
│   └── workflows
│       └── rust.yml
├── .gitignore
├── .gitmodules
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE
├── README.md
├── RELEASENOTE.md
├── extract.png
└── src
    ├── commands.rs
    ├── commands
    │   ├── coverage.rs
    │   ├── depth.rs
    │   ├── extract.rs
    │   ├── index.rs
    │   ├── intersect.rs
    │   ├── sample.rs
    │   └── search.rs
    ├── index_builder.rs
    ├── index_builder
    │   └── core.rs
    ├── index_loader.rs
    ├── index_loader
    │   ├── a2f.rs
    │   ├── core.rs
    │   ├── fts.rs
    │   ├── gof.rs
    │   └── prt.rs
    ├── lib.rs
    ├── main.rs
    ├── utils.rs
    └── utils
        ├── common.rs
        ├── tree.rs
        ├── tree_index.rs
        └── tree_io.rs

/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
  1 | name: Release
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - "v*.*.*"
  7 |   workflow_dispatch: {}
  8 | 
  9 | permissions:
 10 |   contents: write
 11 | 
 12 | concurrency:
 13 |   group: release-${{ github.ref }}
 14 |   cancel-in-progress: false
 15 | 
 16 | env:
 17 |   CARGO_TERM_COLOR: always
 18 |   BIN_NAME: gffx
 19 | 
 20 | jobs:
 21 |   build:
 22 |     name: Build ${{ matrix.target }} on ${{ matrix.os }}
 23 |     runs-on: ${{ matrix.os }}
 24 |     strategy:
 25 |       fail-fast: false
 26 |       matrix:
 27 |         include:
 28 |           # --- Linux (musl / static) ---
 29 |           - os: ubuntu-latest
 30 |             target: x86_64-unknown-linux-musl
 31 |             bin_ext: ""
 32 |             archive: tar.gz
 33 |             use_cross: true
 34 |           - os: ubuntu-latest
 35 |             target: aarch64-unknown-linux-musl
 36 |             bin_ext: ""
 37 |             archive: tar.gz
 38 |             use_cross: true
 39 | 
 40 |           # --- macOS ---
 41 |           - os: macos-latest
 42 |             target: x86_64-apple-darwin
 43 |             bin_ext: ""
 44 |             archive: tar.gz
 45 |             use_cross: false
 46 |           - os: macos-latest
 47 |             target: aarch64-apple-darwin
 48 |             bin_ext: ""
 49 |             archive: tar.gz
 50 |             use_cross: false
 51 | 
 52 |           # --- Windows (MSVC) ---
 53 |           - os: windows-latest
 54 |             target: x86_64-pc-windows-msvc
 55 |             bin_ext: ".exe"
 56 |             archive: zip
 57 |             use_cross: false
 58 | 
 59 | 
 60 |     steps:
 61 |       - uses: actions/checkout@v4
 62 | 
 63 |       - name: Install Rust toolchain + target
 64 |         uses: dtolnay/rust-toolchain@stable
 65 |         with:
 66 |           targets: ${{ matrix.target }}
 67 | 
 68 |       - uses: Swatinem/rust-cache@v2
 69 |         with:
 70 |           prefix-key: ${{ matrix.target }}
 71 | 
 72 |       - name: Install cross
 73 |         if: matrix.use_cross == true
 74 |         uses: taiki-e/install-action@v2
 75 |         with:
 76 |           tool: cross@0.2.5
 77 | 
 78 |       - name: Build (release)
 79 |         shell: bash
 80 |         run: |
 81 |           set -eux
 82 |           if [ "${{ matrix.use_cross }}" = "true" ]; then
 83 |             cross build --release --target ${{ matrix.target }}
 84 |           else
 85 |             cargo build --release --target ${{ matrix.target }}
 86 |           fi
 87 | 
 88 |       - name: Prepare artifact
 89 |         shell: bash
 90 |         run: |
 91 |           set -eux
 92 |           BIN_PATH="target/${{ matrix.target }}/release/${BIN_NAME}${{ matrix.bin_ext }}"
 93 |           OUT_NAME="${BIN_NAME}-${{ matrix.target }}"
 94 | 
 95 |           mkdir -p dist
 96 |           cp "$BIN_PATH" "dist/${OUT_NAME}${{ matrix.bin_ext }}"
 97 | 
 98 |           if [ "${{ matrix.archive }}" = "tar.gz" ]; then
 99 |             tar -czf "dist/${OUT_NAME}.tar.gz" -C dist "${OUT_NAME}${{ matrix.bin_ext }}"
100 |             (sha256sum "dist/${OUT_NAME}.tar.gz" || shasum -a 256 "dist/${OUT_NAME}.tar.gz") > "dist/${OUT_NAME}.tar.gz.sha256"
101 |           else
102 |             (cd dist && 7z a -tzip "${OUT_NAME}.zip" "${OUT_NAME}${{ matrix.bin_ext }}")
103 |             (sha256sum "dist/${OUT_NAME}.zip" || shasum -a 256 "dist/${OUT_NAME}.zip") > "dist/${OUT_NAME}.zip.sha256"
104 |           fi
105 | 
106 |       - name: Upload build artifacts
107 |         uses: actions/upload-artifact@v4
108 |         with:
109 |           name: ${{ matrix.target }}
110 |           path: dist/*
111 | 
112 |   release:
113 |     name: Create GitHub Release
114 |     needs: build
115 |     runs-on: ubuntu-latest
116 |     if: startsWith(github.ref, 'refs/tags/')
117 |     steps:
118 |       - uses: actions/download-artifact@v4
119 |         with:
120 |           path: artifacts
121 | 
122 |       - name: Gather files
123 |         run: |
124 |           set -eux
125 |           mkdir -p upload
126 |           find artifacts -type f \( -name "*.tar.gz" -o -name "*.zip" -o -name "*.sha256" \) -exec cp {} upload/ \;
127 | 
128 |       - name: Publish Release
129 |         uses: softprops/action-gh-release@v2
130 |         with:
131 |           files: upload/*
132 |           generate_release_notes: true
133 |           draft: true
134 |           prerelease: true
135 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # === Rust build output ===
 2 | /target/
 3 | 
 4 | # === IDEs and editors ===
 5 | **/*.swp
 6 | .idea/
 7 | .vscode/
 8 | 
 9 | # === OS files ===
10 | .DS_Store
11 | Thumbs.db
12 | 
13 | # === Jupyter checkpoints ===
14 | .ipynb_checkpoints/
15 | **/.ipynb_checkpoints/
16 | 
17 | # === Dependency lock file (optional, keep if you want reproducibility) ===
18 | Cargo.lock
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "benchmark"]
 2 | 	path = benchmark
 3 | 	url = https://github.com/Baohua-Chen/GFFx_benchmarks.git
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # GFFx Changelog
 2 | 
 3 | ---
 4 | 
 5 | Release v0.4.0:
 6 | 
 7 | ### Changed
 8 | 
 9 | Added two new subcommands: `coverage`, which computes breadth of coverage, and `depth`, which computes depth of coverage over the features of a GFF file, from BAM/SAM/CRAM or BED input.
10 | 
11 | Added a `sample` subcommand for randomly downsampling feature groups from each chromosome at equal ratios.
12 | 
13 | Updated the module organization and source directory layout to conform to Rust 2024 edition guidelines for module visibility (`pub`) and path imports.
14 | 
15 | ---
16 | 
17 | 
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "gffx"
 3 | version = "0.4.0"
 4 | edition = "2024"
 5 | authors = ["Baohua Chen "]
 6 | description = "An ultra-fast and memory-efficient toolkit for querying GFF files, written in Rust"
 7 | license = "MIT OR Apache-2.0"
 8 | readme = "README.md"
 9 | repository = "https://github.com/Baohua-Chen/GFFx"
10 | homepage = "https://github.com/Baohua-Chen"
11 | keywords = ["gff-file", "bioinformatics", "genomics"]
12 | categories = ["command-line-utilities", "science"]
13 | documentation = "https://docs.rs/gffx"
14 | exclude = ["benchmark/**", "target/**"]
15 | 
16 | [dependencies]
17 | regex = "1.11.1"
18 | memchr = "2.7.4"
19 | clap = { version = "4.5.37", features = ["derive"] }
20 | anyhow = "1.0.98"
21 | byteorder = "1.5.0"
22 | memmap2 = "0.9.5"
23 | rayon = "1.10.0"
24 | indexmap = "2.10.0"
25 | rustc-hash = "2.1.1"
26 | bstr = "1.12.0"
27 | lexical-core = "1.0.5"
28 | meminterval = "0.4.1"
29 | serde_json = "1.0.140"
30 | serde = { version = "1.0.219", features = ["derive"] }
31 | bincode2 = "2.0.1"
32 | rust-htslib = "0.50"
33 | num_cpus = "1.17.0"
34 | rand = "0.9.2"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!) The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [2025] [Baohua Chen]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # GFFx Command Line Manual
 3 | 
 4 | **GFFx** is a high-performance, Rust-based toolkit for extracting and querying annotations from GFF3 files. It supports fast indexing and feature retrieval with several subcommands.
 5 | It can be used both as a **command-line tool** and as a **Rust library**.
 6 | 
 7 | <details>
 8 | <summary>Benchmarking results</summary>
 9 | <div align="center">
10 | <img src="extract.png" alt="Benchmarking runtime and memory usage of ID-based feature extraction"/>
11 | </div></details>
 12 | 
 13 | ---
 14 | 
 15 | ## Breaking Changes
 16 | 
 17 | Added two new subcommands: `coverage`, which computes breadth of coverage, and `depth`, which computes depth of coverage over the features of a GFF file, from BAM/SAM/CRAM or BED input.
 18 | 
 19 | Added a `sample` subcommand for randomly downsampling feature groups from each chromosome at equal ratios.
 20 | 
 21 | Updated the module organization and source directory layout to conform to Rust 2024 edition guidelines for module visibility (`pub`) and path imports.
 22 | 
 23 | ---
 24 | 
 25 | ## Table of Contents
 26 | 
 27 | *GFFx version 0.4.0*
 28 | 
 29 | ---
 30 | 
 31 | - [Installation](#installation)
 32 | - [Basic Usage](#basic-usage)
 33 |   - [index](#index) - Build index files
 34 |   - [extract](#extract) - Extract features by ID
 35 |   - [intersect](#intersect) - Extract features by regions
 36 |   - [search](#search) - Search features by attributes
 37 |   - [coverage](#coverage) - Calculate coverage breadth
 38 |   - [depth](#depth) - Calculate coverage depth
 39 |   - [sample](#sample) - Randomly downsample feature groups
 40 | 
 41 | 
 42 | - [Example Use Cases](#example-use-cases)
 43 | - [Using GFFx as a Rust Library](#using-gffx-as-a-rust-library)
 44 | - [Available Public APIs](#available-public-apis)
 45 | - [Index File Types](#index-file-types)
 46 | - [License](#license)
 47 | - [Citation](#citation)
 48 | 
 49 | ---
 50 | ## Installation
 51 | 
 52 | ### Option 1: Install via [crates.io](https://crates.io/crates/gffx)
 53 | 
 54 | ```bash
 55 | cargo install gffx                    # install to default location (~/.cargo/bin)
 56 | cargo install gffx --root /your/path  # optional: install to custom location
 57 | ```
 58 | 
 59 | ### Option 2: Install from source
 60 | 
 61 | ```bash
 62 | git clone https://github.com/Baohua-Chen/GFFx.git
 63 | cd GFFx
 64 | cargo build --release
 65 | # Optionally copy the binary
 66 | cp target/release/gffx /your/path
 67 | ```
 68 | 
 69 | > Requires **Rust 1.70 or later**. You can install or update Rust using [rustup](https://rustup.rs).
 70 | ---
 71 | 
 72 | 
 73 | ## Basic Usage
 74 | 
 75 | ```bash
 76 | gffx <SUBCOMMAND> [OPTIONS]
 77 | ```
 78 | 
 79 | Available subcommands:
 80 | 
 81 | - [index] Build index files
 82 | - [intersect] Extract features by region
 83 | - [extract] Extract features by ID
 84 | - [search] Search features by attribute
 85 | - [coverage] Calculate coverage breadth
 86 | - [depth] Calculate coverage depth
 87 | - [sample] Randomly downsample feature groups
 88 | 
 89 | ---
 90 | 
 91 | ### `index`
 92 | 
 93 | Builds index files from a GFF file to accelerate downstream operations.
 94 | 
 95 | ```bash
 96 | gffx index [OPTIONS] --input <FILE>
 97 | ```
 98 | 
 99 | **Options:**
100 | 
101 | | Option                 | Description                                      |
102 | |------------------------|-------------------------------------------------|
103 | | `-i`, `--input`        | Input GFF file                                  |
104 | | `-a`, `--attribute`    | Attribute key to extract (default: `gene_name`) |
105 | | `-v`, `--verbose`      | Enable verbose output                           |
106 | | `-h`, `--help`         | Print help                                      |
107 | 
108 | ---
109 | 
110 | ### `intersect`
111 | 
112 | Extracts models intersecting with regions from a GFF file, either from a single region or a BED file.
113 | 
114 | ```bash
115 | gffx intersect [OPTIONS] --input <FILE> <--region <REGION>|--bed <BED>>
116 | ```
117 | 
118 | **Options:**
119 | Required
120 | | Option                      | Description                                                   |
121 | | --------------------------- | ------------------------------------------------------------ |
122 | | `-i`, `--input` `<FILE>`    | Input GFF file path                                           |
123 | | `-r`, `--region` `<REGION>` | Single region in `chr:start-end` format                       |
124 | | `-b`, `--bed` `<BED>`       | BED file containing multiple regions                          |
125 | 
126 | > **Note**: Exactly one of `--region` or `--bed` must be specified.
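
For example, the two region-selection modes look like this (`genes.gff` and `regions.bed` are placeholder file names):

```bash
# Single region
gffx intersect -i genes.gff -r chr1:10000-20000

# Multiple regions from a BED file
gffx intersect -i genes.gff -b regions.bed -o hits.gff
```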
127 | 
128 | 
129 | Optional
130 | | Option                    | Description                                                                     |
131 | | ------------------------- | ------------------------------------------------------------------------------- |
132 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                              |
133 | | `-e`, `--entire-group`    | Enable the "entire-group" mode. Return entire gene models or feature groups     |
134 | |                           | for all matched features, instead of only the directly matched features.        |
135 | | `-v`, `--invert`          | Invert selection (exclude matched features)                                     |
136 | | `-T`, `--types` `<TYPES>` | Filter output to include only features of specified types (e.g., `gene,exon`)   |
137 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                 |
138 | | `-V`, `--verbose`         | Enable verbose output                                                           |
139 | | `-h`, `--help`            | Show help message                                                               |
140 | | *(one of)*                |                                                                                  |
141 | | `-c`, `--contained`       | Only keep features fully contained within the region                            |
142 | | `-C`, `--contains-region` | Only keep features that fully contain the region                                |
143 | | `-O`, `--overlap`         | Keep features that partially or fully overlap (default mode)                    |
144 | 
145 | ---
146 | 
147 | ### `extract`
148 | 
149 | Extracts annotation models by feature ID(s), including their parent models.
150 | 
151 | ```bash
152 | gffx extract [OPTIONS] --input <FILE> <--feature-file <FILE>|--feature-id <ID>>
153 | ```
154 | 
155 | **Options:**
156 | 
157 | Required
158 | | Option                          | Description                                         |
159 | | ------------------------------- | --------------------------------------------------- |
160 | | `-i`, `--input` `<FILE>`        | Input GFF file path                                 |
161 | | *(one of)*                      |                                                     |
162 | | `-f`, `--feature-id` `<ID>`     | Extract by a single feature ID                      |
163 | | `-e`, `--feature-file` `<FILE>` | Extract by a text file listing multiple feature IDs |
164 | 
165 | Optional
166 | | Option                    | Description                                                                     |
167 | | ------------------------- | ------------------------------------------------------------------------------- |
168 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                              |
169 | | `-e`, `--entire-group`    | Enable the "entire-group" mode. Return entire gene models or feature groups     |
170 | |                           | for all matched features, instead of only the directly matched features.        |
171 | | `-T`, `--types` `<TYPES>` | Filter output to include only features of specified types (e.g., `gene,exon`)   |
172 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                 |
173 | | `-V`, `--verbose`         | Enable verbose output                                                           |
174 | | `-h`, `--help`            | Show help message                                                               |
175 | 
176 | ---
177 | 
178 | ### `search`
179 | 
180 | Searches for features using a specified attribute value and retrieves the full annotation models.
181 | 
182 | ```bash
183 | gffx search -a geneX -i input.gff
184 | ```
185 | 
186 | **Options:**
187 | 
188 | Required
189 | | Option                       | Description                                              |
190 | | ---------------------------- | -------------------------------------------------------- |
191 | | `-i`, `--input` `<FILE>`     | Input GFF file path                                      |
192 | | *(one of)*                   |                                                          |
193 | | `-a`, `--attr` `<VALUE>`     | Search a single attribute value/pattern                  |
194 | | `-A`, `--attr-list` `<FILE>` | Search attribute values/patterns defined in a text file  |
195 | 
196 | Optional
197 | | Option                    | Description                                                                     |
198 | | ------------------------- | ------------------------------------------------------------------------------- |
199 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                              |
200 | | `-e`, `--entire-group`    | Enable the "entire-group" mode. Return entire gene models or feature groups     |
201 | |                           | for all matched features, instead of only the directly matched features.        |
202 | | `-r`, `--regex`           | Enable regex matching for attribute values                                      |
203 | | `-T`, `--types` `<TYPES>` | Filter output to include only features of specified types (e.g., `gene,exon`)   |
204 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                 |
205 | | `-V`, `--verbose`         | Enable verbose output                                                           |
206 | | `-h`, `--help`            | Show help message                                                               |
207 | 
208 | ---
209 | 
210 | ### `coverage`
211 | 
212 | Compute coverage breadth across genomic features.
213 | 
214 | ```bash
215 | gffx coverage -i input.gff -s source.bam
216 | ```
217 | 
218 | **Options:**
219 | 
220 | Required
221 | | Option                    | Description                                |
222 | | ------------------------- | ------------------------------------------ |
223 | | `-i`, `--input` `<FILE>`  | Input GFF file path                        |
224 | | `-s`, `--source` `<FILE>` | Source file in BAM/SAM/CRAM or BED format  |
225 | 
226 | Optional
227 | | Option                    | Description                         |
228 | | ------------------------- | ----------------------------------- |
229 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)  |
230 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]     |
231 | | `-v`, `--verbose`         | Enable verbose output               |
232 | | `-h`, `--help`            | Show help message                   |
233 | 
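The output is a tab-separated table with one row per feature ID. The six columns below are the ones emitted by `write_breadth_results` in `src/commands/coverage.rs`; the data row is purely illustrative:

```text
id	chr	start	end	breadth	fraction
gene0001	chr1	11873	14409	1530	0.603312
```

`fraction` is `breadth / (end - start)`, i.e. the fraction of the feature's span covered by at least one read or interval.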
234 | ---
235 | 
236 | ### `depth`
237 | 
238 | Compute coverage depth across genomic features.
239 | 
240 | ```bash
241 | gffx depth -i input.gff -s source.bam
242 | ```
243 | 
244 | **Options:**
245 | 
246 | Required
247 | | Option                    | Description                                |
248 | | ------------------------- | ------------------------------------------ |
249 | | `-i`, `--input` `<FILE>`  | Input GFF file path                        |
250 | | `-s`, `--source` `<FILE>` | Source file in BAM/SAM/CRAM or BED format  |
251 | 
252 | Optional
253 | | Option                    | Description                                                                    |
254 | | ------------------------- | ------------------------------------------------------------------------------ |
255 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                             |
256 | | `--bin-shift` `<NUM>`     | Bin width parameter (2^k bp) for spatial bucketing of features and queries.    |
257 | |                           | Choose k so that a typical read and feature span ~1–2 bins [default: 12]       |
258 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                |
259 | | `-v`, `--verbose`         | Enable verbose output                                                          |
260 | | `-h`, `--help`            | Show help message                                                              |
261 | 
262 | ---
263 | 
264 | ### `sample`
265 | 
266 | Randomly downsample feature groups.
267 | 
268 | ```bash
269 | gffx sample -i input.gff -r 0.33
270 | ```
271 | 
272 | **Options:**
273 | 
274 | Required
275 | | Option                    | Description                                  |
276 | | ------------------------- | -------------------------------------------- |
277 | | `-i`, `--input` `<FILE>`  | Input GFF file path                          |
278 | | `-r`, `--ratio` `<NUM>`   | Downsampling ratio; must be between 0 and 1  |
279 | 
280 | Optional
281 | | Option                    | Description                         |
282 | | ------------------------- | ----------------------------------- |
283 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)  |
284 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]     |
285 | | `-V`, `--verbose`         | Enable verbose output               |
286 | | `-h`, `--help`            | Show help message                   |
287 | 
288 | ---
289 | 
290 | ## Example Use Cases
291 | 
292 | ```bash
293 | # Build index
294 | gffx index -i genes.gff -a gene_name
295 | 
296 | # Extract all features overlapping with a region
297 | gffx intersect --region chr1:10000-20000 -i genes.gff -o out.gff
298 | 
299 | # Extract models from a list of gene IDs
300 | gffx extract --feature-file genes.txt -i genes.gff -o subset.gff
301 | 
302 | # Search by gene name and extract the full model
303 | gffx search -a TP53 -i genes.gff -o tp53_model.gff
304 | 
305 | ```
306 | 
307 | ---
308 | 
309 | ## Using GFFx as a Rust Library
310 | 
311 | You can use GFFx as a Rust library in your own project.
312 | 
313 | ### Add to Cargo.toml
314 | 
315 | ```toml
316 | [dependencies]
317 | gffx = "0.4"  # check crates.io for the latest version
318 | ```
319 | 
320 | ### Example: Manually extract a feature's full model using the index files
321 | The following example defines a helper you can call from a `main() -> Result<()>` context:
322 | 
323 | ```rust
324 | use anyhow::{Result, bail};
325 | use std::path::Path;
326 | use gffx::{load_fts, load_prt, load_gof, write_gff_output}; // index APIs (see "Available Public APIs")
327 | 
328 | pub fn extract_one_id_full_model(      // Define a minimal single-ID full-model extractor
329 |     gff_path: &Path,                   // Path to the input GFF file
330 |     feature_id: &str,                  // Target feature ID (string form)
331 |     out_path: &Path                    // Path to the output GFF file
332 | ) -> Result<()> {                      // Return anyhow::Result for error propagation
333 |     let fts = load_fts(gff_path)?;     // Load feature table: maps string IDs <-> numeric fids
334 |     let prt = load_prt(gff_path)?;     // Load parent relations: map child fid -> root fid
335 |     let gof = load_gof(gff_path)?;     // Load offsets: map root fid -> byte ranges in the file
336 | 
337 |     let (fid_set, missing) = fts.map_fnames_to_fids( // Map the feature name to a numeric fid via the batch API
338 |         std::iter::once(feature_id.to_string()).collect(), // Build a one-element set of the feature name
339 |         1                              // Use 1 thread since this is a single lookup
340 |     );
341 |     if !missing.is_empty() {           // Bail out early if the ID is absent from the index
342 |         bail!("feature ID not found: {feature_id}");
343 |     }
344 | 
345 |     let fid = *fid_set.iter().next().unwrap();      // Extract the only fid from the set
346 | 
347 |     let root = prt.map_fids_to_roots(&[fid], 1)[0]; // Map fid -> root fid using the batch API with a single item
348 | 
349 |     let blocks = gof.roots_to_offsets(&[root], 1);  // Convert the root fid to file block offsets (full model span)
350 | 
351 |     write_gff_output(                  // Write the entire blocks (full-model output, no filtering)
352 |         gff_path,                      // Input GFF file path
353 |         &blocks,                       // Byte-range blocks to emit
354 |         out_path,                      // Output file path
355 |         false                          // Verbose: false for minimal logging
356 |     )?;
357 | 
358 |     Ok(())                             // Return success
359 | }
360 | ```
361 | 
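Calling the helper is then a one-liner. A minimal sketch, reusing the imports from the example above; the file names and feature ID are hypothetical:

```rust
fn main() -> anyhow::Result<()> {
    // Extract the full model containing "gene0001" from genes.gff into model.gff
    extract_one_id_full_model(Path::new("genes.gff"), "gene0001", Path::new("model.gff"))
}
```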
362 | ---
363 | 
364 | ## Available Public APIs
365 | 
366 | ### Index building & checking (`index_builder`)
367 | - `build_index`
368 | 
369 | ### Index loading (`index_loader`)
370 | - `load_gof`, `load_prt`, `load_fts`, `load_atn`, `load_a2f`, `load_sqs`
371 | - `safe_mmap_readonly`
372 | - `GofMap`, `PrtMap`, `FtsMap`, `A2fMap`
373 | 
374 | ### Interval querying data structures (`utils::serial_interval_trees`)
375 | - `IntervalTree`, `Interval`
376 | - `save_multiple_trees`, `write_offsets_to_file`
377 | 
378 | ### Other utilities (`utils::common`)
379 | - `CommonArgs`, `append_suffix`
380 | - `write_gff_output`, `write_gff_output_filtered`
381 | - `check_index_files_exist`
382 | 
383 | ---
384 | 
385 | 
386 | ## Index File Types
387 | 
388 | | File Extension | Purpose                                                 |
389 | |----------------|---------------------------------------------------------|
390 | | `.gof`         | Byte offset index for GFF feature blocks                |
391 | | `.fts`         | Feature ID table                                        |
392 | | `.prt`         | Child-to-parent mapping                                 |
393 | | `.a2f`         | Attribute-to-feature ID mapping                         |
394 | | `.atn`         | Attribute value table                                   |
395 | | `.sqs`         | Sequence ID table                                       |
396 | | `.rit`         | Interval tree index                                     |
397 | | `.rix`         | Byte offset index for interval trees in the `.rit` file |
398 | 
399 | ---
400 | 
401 | ## Notes
402 | 
403 | - Make sure you run `gffx index` before using query subcommands such as `intersect`, `extract`, `search`, `coverage`, and `depth`.
404 | 
405 | ---
406 | 
407 | ## License
408 | 
409 | GFFx is released under the MIT or Apache-2.0 license.
410 | 
411 | ---
412 | 
413 | ## Citation
414 | 
415 | If you use **GFFx**, please cite our paper in *GigaScience*:
416 | 
417 | Baohua Chen, Dongya Wu, Guojie Zhang, GFFx: A Rust-based suite of utilities for ultra-fast genomic feature extraction, GigaScience, Volume 14, 2025, giaf124, https://doi.org/10.1093/gigascience/giaf124
418 | 
419 | ---
--------------------------------------------------------------------------------
/RELEASENOTE.md:
--------------------------------------------------------------------------------
 1 | # GFFx v0.4.0 Release Notes
 2 | 
 3 | Added two new subcommands: `coverage`, which computes breadth of coverage, and `depth`, which computes depth of coverage over the features of a GFF file, from BAM/SAM/CRAM or BED input.
 4 | 
 5 | Added a `sample` subcommand for randomly downsampling feature groups from each chromosome at equal ratios.
 6 | 
 7 | Updated the module organization and source directory layout to conform to Rust 2024 edition guidelines for module visibility (`pub`) and path imports.
--------------------------------------------------------------------------------
/extract.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/Baohua-Chen/GFFx/9e889a9eee5536547e63f3fc7ad63306bb8f3b7c/extract.png
--------------------------------------------------------------------------------
/src/commands.rs:
--------------------------------------------------------------------------------
 1 | pub mod index;
 2 | pub mod extract;
 3 | pub mod intersect;
 4 | pub mod search;
 5 | pub mod coverage;
 6 | pub mod depth;
 7 | pub mod sample;
 8 | 
 9 | pub use index::{IndexArgs, run as run_index};
10 | pub use extract::{ExtractArgs, run as run_extract};
11 | pub use intersect::{IntersectArgs, run as run_intersect};
12 | pub use search::{SearchArgs, run as run_search};
13 | pub use coverage::{CoverageArgs, run as run_coverage};
14 | pub use depth::{DepthArgs, run as run_depth};
15 | pub use sample::{SampleArgs, run as run_sample};
--------------------------------------------------------------------------------
/src/commands/coverage.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::{Result, bail, Context};
  2 | use rayon::prelude::*;
  3 | use memmap2::Mmap;
  4 | use rust_htslib::bam::{self, Read};
  5 | use rustc_hash::{FxHashMap, FxHashSet};
  6 | use std::{
  7 |     str,
  8 |     fs::File,
  9 |     path::{Path, PathBuf},
 10 |     io::{BufWriter, Write},
 11 | };
 12 | use clap::Parser;
 13 | use crate::{
 14 |     Interval, TreeIndexData, load_gof, GofMap,
 15 | };
 16 | use std::time::{Instant, Duration};
 17 | use rust_htslib::bam::ext::BamRecordExtensions;
 18 | 
 19 | const MISSING: u64 = u64::MAX; // Sentinel for missing entries
 20 | 
 21 | // Output write buffer size in bytes (32 MiB)
 22 | const WRITE_BUF_SIZE: usize = 32 * 1024 * 1024;
 23 | 
 24 | /// Compute bin index for coordinate `x` given shift k (unused in the new pipeline).
 25 | #[inline]
 26 | fn _bin_of(x: u32, shift: u32) -> u32 {
 27 |     let _ = shift;
 28 |     x
 29 | }
 30 | 
 31 | /// Arguments
 32 | #[derive(Parser, Debug)]
 33 | #[command(
 34 |     about = "Compute coverage breadth across genomic features.",
 35 |     long_about = "This tool computes sequencing coverage breadth and fraction from high-throughput sequencing (HTS) alignment files (SAM/BAM/CRAM) or user-specified genomic intervals (BED)."
 36 | )]
 37 | pub struct CoverageArgs {
 38 |     /// GFF file path (indexed via GOF)
 39 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
 40 |     pub input: PathBuf,
 41 | 
 42 |     /// Source: BAM/SAM/CRAM or BED
 43 |     #[arg(short = 's', long)]
 44 |     pub source: PathBuf,
 45 | 
 46 |     /// Output file (stdout if not provided)
 47 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
 48 |     pub output: Option<PathBuf>,
 49 | 
 50 |     /// Number of threads
 51 |     #[arg(short = 't', long = "threads", default_value_t = 12, value_name = "NUM")]
 52 |     pub threads: usize,
 53 | 
 54 |     /// Verbose logs
 55 |     #[arg(short = 'v', long = "verbose", default_value_t = false, value_name = "BOOL")]
 56 |     pub verbose: bool,
 57 | }
 58 | 
 59 | /// Fast u32 parse
 60 | #[inline(always)]
 61 | fn parse_u32_fast(s: &str) -> Option<u32> {
 62 |     if s.is_empty() { return None; }
 63 |     let mut n: u32 = 0;
 64 |     for b in s.as_bytes() {
 65 |         let d = b.wrapping_sub(b'0');
 66 |         if d > 9 { return None; }
 67 |         n = n.checked_mul(10)?.checked_add(d as u32)?;
 68 |     }
 69 |     Some(n)
 70 | }
 71 | 
 72 | /// Extract `ID=` from GFF attributes quickly
 73 | #[inline(always)]
 74 | fn fast_id(attrs: &str) -> Option<&str> {
 75 |     let bytes = attrs.as_bytes();
 76 |     let mut i = 0;
 77 |     while i + 2 < bytes.len() {
 78 |         if bytes[i] == b'I' && bytes[i+1] == b'D' && bytes[i+2] == b'=' {
 79 |             let mut j = i + 3;
 80 |             while j < bytes.len() && bytes[j] != b';' && bytes[j] != b' ' && bytes[j] != b'\t' {
 81 |                 j += 1;
 82 |             }
 83 |             return std::str::from_utf8(&bytes[i+3..j]).ok();
 84 |         }
 85 |         i += 1;
 86 |     }
 87 |     None
 88 | }
 89 | 
 90 | /// Disjoint union of intervals assumed to be half-open [s, e)
 91 | /// Input may be unsorted and overlapping; output is sorted, non-overlapping.
 92 | fn merge_intervals(mut ivs: Vec<(u32,u32)>) -> Vec<(u32,u32)> {
 93 |     if ivs.is_empty() { return ivs; }
 94 |     ivs.sort_unstable_by_key(|x| x.0);
 95 |     let mut out: Vec<(u32,u32)> = Vec::with_capacity(ivs.len());
 96 |     let (mut cs, mut ce) = ivs[0];
 97 |     for (s,e) in ivs.into_iter().skip(1) {
 98 |         if s <= ce {
 99 |             if e > ce { ce = e; }
100 |         } else {
101 |             out.push((cs, ce));
102 |             cs = s; ce = e;
103 |         }
104 |     }
105 |     out.push((cs, ce));
106 |     out
107 | }
108 | 
109 | /// Union length of intervals (half-open). Input may be unsorted.
110 | fn union_len(mut ivs: Vec<(u32,u32)>) -> usize {
111 |     if ivs.is_empty() { return 0; }
112 |     ivs.sort_unstable_by_key(|x| x.0);
113 |     let mut total: usize = 0;
114 |     let (mut cs, mut ce) = ivs[0];
115 |     for (s, e) in ivs.into_iter().skip(1) {
116 |         if s <= ce { ce = ce.max(e); }
117 |         else { total += (ce - cs) as usize; cs = s; ce = e; }
118 |     }
119 |     total += (ce - cs) as usize;
120 |     total
121 | }
122 | 
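// Editorial sketch: a quick sanity check of the two interval helpers above,
// illustrating their intended semantics; these tests are not part of the
// original file.
#[cfg(test)]
mod interval_tests {
    use super::*;

    #[test]
    fn merge_coalesces_overlapping_intervals() {
        // (0,3) and (2,6) overlap, and (5,10) overlaps the merged run, so all coalesce.
        assert_eq!(merge_intervals(vec![(5, 10), (0, 3), (2, 6)]), vec![(0, 10)]);
    }

    #[test]
    fn union_len_counts_each_base_once() {
        // [0,3) and [2,6) together cover 6 bases; [8,9) adds 1 more.
        assert_eq!(union_len(vec![(0, 3), (2, 6), (8, 9)]), 7);
    }
}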
123 | /// Collect coverage intervals per root (from BAM/SAM/CRAM).
124 | /// We DO NOT read GFF slices here; only group regions by root_fid.
125 | fn collect_by_root_from_bam(
126 |     bam_path: &Path,
127 |     index_data: &TreeIndexData,
128 |     verbose: bool,
129 |     threads: usize,
130 | ) -> Result<FxHashMap<u32, Vec<(u32, u32)>>> {
131 |     let t_open = Instant::now();
132 |     let mut reader = bam::Reader::from_path(bam_path)?;
133 |     reader.set_threads(std::cmp::max(2, threads))?;
134 |     let t_open_elapsed = t_open.elapsed();
135 | 
136 |     let header = reader.header().to_owned();
137 | 
138 |     // Build tid -> chr_id mapping
139 |     let t_map_build = Instant::now();
140 |     let mut tid2num: Vec<Option<u32>> = Vec::with_capacity(header.target_count() as usize);
141 |     for tid in 0..header.target_count() {
142 |         let chrom_bytes = header.tid2name(tid).to_owned();
143 |         let chrom = std::str::from_utf8(&chrom_bytes)?;
144 |         let chr_id = index_data.seqid_to_num.get(chrom).copied();
145 |         tid2num.push(chr_id);
146 |     }
147 |     let t_map_build_elapsed = t_map_build.elapsed();
148 | 
149 |     // Storage: root_fid -> list of raw intervals (to be merged later)
150 |     let mut by_root: FxHashMap<u32, Vec<(u32, u32)>> = FxHashMap::default();
151 | 
152 |     let mut t_parse = Duration::ZERO;
153 |     let mut t_tidmap = Duration::ZERO;
154 |     let t_tree = Duration::ZERO; // Tree-query time is currently folded into the parse+map timer below
155 | 
156 |     let mut hits: Vec<&Interval> = Vec::new();
157 |     for r in reader.records() {
158 |         let t0 = Instant::now();
159 |         let rec = r?;
160 |         if rec.is_unmapped() {
161 |             t_parse += t0.elapsed();
162 |             continue;
163 |         }
164 |         t_parse += t0.elapsed();
165 | 
166 |         let t1 = Instant::now();
167 |         let tid = rec.tid();
168 |         if tid < 0 {
169 |             t_tidmap += t1.elapsed();
170 |             continue;
171 |         }
172 |         if let Some(chr_id) = tid2num[tid as usize] {
173 |             let start_i64 = rec.pos();
174 |             let end_i64 = rec.reference_end();
175 |             if start_i64 >= 0 && end_i64 > start_i64 {
176 |                 let start = (start_i64 as i128).clamp(0, u32::MAX as i128) as u32;
177 |                 let end = (end_i64 as i128).clamp(0, u32::MAX as i128) as u32;
178 | 
179 |                 // Query candidate roots for this region
180 |                 if let Some(tree) = index_data.chr_entries.get(&chr_id) {
181 |                     hits.clear();
182 |                     tree.query_interval(start, end, &mut hits);
183 |                     // De-duplicate roots within a single region
184 |                     let mut seen_in_region: FxHashSet<u32> = FxHashSet::default();
185 |                     for h in &hits {
186 |                         if seen_in_region.insert(h.root_fid) {
187 |                             by_root.entry(h.root_fid).or_default().push((start, end));
188 |                         }
189 |                     }
190 |                 }
191 |             }
192 |         }
193 |         t_tidmap += t1.elapsed();
194 |     }
195 | 
196 |     if verbose {
197 |         eprintln!("[TIMER] (1) Opening BAM: {:.2?}", t_open_elapsed);
198 |         eprintln!("[TIMER] (1b) Build tid2num: {:.2?}", t_map_build_elapsed);
199 |         eprintln!("[TIMER] (2) Parse+map records: {:.2?}", t_parse + t_tidmap);
200 |         eprintln!("[TIMER] (3) Tree queries: {:.2?}", t_tree);
201 |         eprintln!("[INFO] Collected {} roots with coverage", by_root.len());
202 |     }
203 | 
204 |     Ok(by_root)
205 | }
206 | 
207 | /// Collect coverage intervals per root (from BED).
208 | fn collect_by_root_from_bed(
209 |     bed_path: &Path,
210 |     index_data: &TreeIndexData,
211 |     verbose: bool,
212 | ) -> Result<FxHashMap<u32, Vec<(u32, u32)>>> {
213 |     // mmap the entire BED file
214 |     let file = File::open(bed_path)?;
215 |     let mmap = unsafe { Mmap::map(&file)? };
216 |     let data = &mmap[..];
217 | 
218 |     if verbose {
219 |         eprintln!("[INFO] mmap BED file: {} bytes", data.len());
220 |     }
221 | 
222 |     // collect line offsets
223 |     let mut line_offsets = Vec::with_capacity(1_000_000);
224 |     line_offsets.push(0usize);
225 |     for (i, &b) in data.iter().enumerate() {
226 |         if b == b'\n' {
227 |             line_offsets.push(i + 1);
228 |         }
229 |     }
230 |     if *line_offsets.last().unwrap_or(&0) != data.len() {
231 |         line_offsets.push(data.len());
232 |     }
233 | 
234 |     let mut by_root: FxHashMap<u32, Vec<(u32, u32)>> = FxHashMap::default();
235 |     let mut hits: Vec<&Interval> = Vec::new();
236 | 
237 |     for w in line_offsets.windows(2) {
238 |         let start = w[0];
239 |         let end = w[1];
240 |         if start >= end { continue; }
241 |         let line = &data[start..end];
242 |         if line.is_empty() || line[0] == b'#' { continue; }
243 | 
244 |         let fields: Vec<&[u8]> = line
245 |             .split(|&b| b == b'\t' || b == b' ')
246 |             .filter(|f| !f.is_empty())
247 |             .collect();
248 |         if fields.len() < 3 { continue; }
249 | 
250 |         let chrom = match std::str::from_utf8(fields[0]) { Ok(s) => s, Err(_) => continue };
251 |         let s = match std::str::from_utf8(fields[1]).ok().and_then(|x| x.parse::<u32>().ok()) { Some(v) => v, None => continue };
252 |         let e = match std::str::from_utf8(fields[2]).ok().and_then(|x| x.trim_end().parse::<u32>().ok()) { Some(v) => v, None => continue };
253 |         if s >= e { continue; }
254 | 
255 |         let Some(&chr_num) = index_data.seqid_to_num.get(chrom) else { continue };
256 | 
257 |         if let Some(tree) = index_data.chr_entries.get(&chr_num) {
258 |             hits.clear();
259 |             tree.query_interval(s, e, &mut hits);
260 |             let mut seen_in_region: FxHashSet<u32> = FxHashSet::default();
261 |             for h in &hits {
262 |                 if seen_in_region.insert(h.root_fid) {
263 |                     by_root.entry(h.root_fid).or_default().push((s, e));
264 |                 }
265 |             }
266 |         }
267 |     }
268 | 
269 |     if verbose {
270 |         eprintln!("[INFO] Collected {} roots with coverage (BED)", by_root.len());
271 |     }
272 | 
273 |     Ok(by_root)
274 | }
275 | 
276 | /// Compute breadth for all features within a root using pre-merged disjoint coverage.
277 | fn compute_breadth_for_root(
278 |     gff_slice: &[u8],
279 |     cov_merged: &[(u32,u32)], // sorted, non-overlapping coverage intervals
280 | ) -> FxHashMap<String, (String, u32, u32, usize)> {
281 |     #[derive(Clone)]
282 |     struct FeatLine {
283 |         id_idx: u32,
284 |         start0: u32, // 0-based inclusive
285 |         end0: u32,   // 0-based exclusive
286 |     }
287 | 
288 |     // Map ID string -> index
289 |     let mut id_to_idx: FxHashMap<String, u32> = FxHashMap::default();
290 |     let mut id_strings: Vec<String> = Vec::new();
291 |     let mut id_chrom: Vec<String> = Vec::new();
292 | 
293 |     // Collect feature lines (within this root)
294 |     let mut lines: Vec<FeatLine> = Vec::new();
295 | 
296 |     if let Ok(text) = str::from_utf8(gff_slice) {
297 |         for line in text.split_terminator('\n') {
298 |             if line.is_empty() || line.as_bytes()[0] == b'#' { continue; }
299 |             let mut cols = line.splitn(9, '\t');
300 |             let (Some(seqid), _, _, Some(start_s), Some(end_s), _, _, _, Some(attrs)) = (
301 |                 cols.next(), cols.next(), cols.next(),
302 |                 cols.next(), cols.next(), cols.next(),
303 |                 cols.next(), cols.next(), cols.next()
304 |             ) else { continue; };
305 | 
306 |             let (Some(s1), Some(e1)) = (parse_u32_fast(start_s), parse_u32_fast(end_s)) else { continue; };
307 |             if e1 == 0 { continue; }
308 |             let (s1, e1) = if s1 > e1 { (e1, s1) } else { (s1, e1) };
309 |             // GFF is 1-based, inclusive end; convert to half-open 0-based
310 |             let fstart0 = s1.saturating_sub(1);
311 |             let fend0 = e1;
312 | 
313 |             if let Some(id) = fast_id(attrs) {
314 |                 let idx = *id_to_idx.entry(id.to_owned()).or_insert_with(|| {
315 |                     let k = id_strings.len() as u32;
316 |                     id_strings.push(id.to_owned());
317 |                     id_chrom.push(seqid.to_owned());
318 |                     k
319 |                 });
320 |                 lines.push(FeatLine { id_idx: idx, start0: fstart0, end0: fend0 });
321 |             }
322 |         }
323 |     }
324 | 
325 |     if lines.is_empty() || cov_merged.is_empty() {
326 |         return FxHashMap::default();
327 |     }
328 | 
329 |     // Sort features by start for two-pointer sweep against cov_merged
330 |     lines.sort_unstable_by_key(|x| x.start0);
331 | 
332 |     let mut min_s: Vec<u32> = vec![u32::MAX; id_strings.len()];
333 |     let mut max_e: Vec<u32> = vec![0; id_strings.len()];
334 |     // For each ID, collect overlaps with coverage (we'll union at the end per ID)
335 |     let mut id_overlap: Vec<Vec<(u32, u32)>> = vec![Vec::new(); id_strings.len()];
336 | 
337 |     // Two-pointer scan: iterate features in order, and advance cov pointer monotonically
338 |     let mut j = 0usize;
339 |     for fl in &lines {
340 |         // Advance coverage pointer until cov[j].end <= feature.start
341 |         while j < cov_merged.len() && cov_merged[j].1 <= fl.start0 {
342 |             j += 1;
343 |         }
344 |         // Record feature span extents for this ID
345 |         if fl.start0 < min_s[fl.id_idx as usize] { min_s[fl.id_idx as usize] = fl.start0; }
346 |         if fl.end0 > max_e[fl.id_idx as usize] { max_e[fl.id_idx as usize] = fl.end0; }
347 | 
348 |         // Walk through all coverage intervals that might overlap this feature
349 |         let mut k = j;
350 |         while k < cov_merged.len() && cov_merged[k].0 < fl.end0 {
351 |             let s = fl.start0.max(cov_merged[k].0);
352 |             let e = fl.end0.min(cov_merged[k].1);
353 |             if e > s {
354 |                 id_overlap[fl.id_idx as usize].push((s, e));
355 |             }
356 |             if cov_merged[k].1 <= fl.end0 {
357 |                 k += 1;
358 |             } else {
359 |                 break;
360 |             }
361 |         }
362 |     }
363 | 
364 |     // Finalize per-ID breadth (union of all overlap pieces), and produce outputs
365 |     let mut out: FxHashMap<String, (String, u32, u32, usize)> = FxHashMap::default();
366 |     for i in 0..id_strings.len() {
367 |         let length = if max_e[i] > min_s[i] { (max_e[i] - min_s[i]) as usize } else { 0 };
368 |         let breadth = union_len(std::mem::take(&mut id_overlap[i]));
369 |         if length > 0 || breadth > 0 {
370 |             out.insert(
371 |                 id_strings[i].clone(),
372 |                 (id_chrom[i].clone(), min_s[i], max_e[i], breadth),
373 |             );
374 |         }
375 |     }
376 |     out
377 | }
378 | 
379 | /// After collecting raw intervals per root:
380 | /// 1) Merge (union) them into disjoint intervals;
381 | /// 2) Parse GFF slice for that root;
382 | /// 3) Compute breadth/fraction for each feature under this root.
383 | fn finalize_compute_breadth(
384 |     by_root_raw: FxHashMap<u32, Vec<(u32, u32)>>,
385 |     gof: &GofMap,
386 |     gff_mmap: &Mmap,
387 |     threads: usize,
388 |     verbose: bool,
389 | ) -> Result<FxHashMap<String, (String, u32, u32, usize)>> {
390 |     let gff_bytes: &[u8] = &gff_mmap[..];
391 |     let idx = gof.index_cached();
392 | 
393 |     if by_root_raw.is_empty() {
394 |         return Ok(FxHashMap::default());
395 |     }
396 | 
397 |     let roots_iter = by_root_raw.into_iter();
398 | 
399 |     // Parallel per-root processing if threads > 1
400 |     let partials: Vec<FxHashMap<String, (String, u32, u32, usize)>> = if threads > 1 {
401 |         roots_iter.par_bridge().map(|(root, ivs)| {
402 |             // Merge coverage intervals for this root
403 |             let cov = merge_intervals(ivs);
404 |             // Locate GFF slice for this root
405 |             match idx.get(&root) {
406 |                 Some(&(s_off, e_off)) if s_off != MISSING && e_off != MISSING && e_off > s_off => {
407 |                     let su = usize::try_from(s_off).unwrap();
408 |                     let eu = usize::try_from(e_off).unwrap();
409 |                     compute_breadth_for_root(&gff_bytes[su..eu], &cov)
410 |                 }
411 |                 _ => FxHashMap::default(),
412 |             }
413 |         }).collect()
414 |     } else {
415 |         let mut v = Vec::new();
416 |         for (root, ivs) in roots_iter {
417 |             let cov = merge_intervals(ivs);
418 |             match idx.get(&root) {
419 |                 Some(&(s_off, e_off)) if s_off != MISSING && e_off != MISSING && e_off > s_off => {
420 |                     let su = usize::try_from(s_off).unwrap();
421 |                     let eu = usize::try_from(e_off).unwrap();
422 |                     v.push(compute_breadth_for_root(&gff_bytes[su..eu], &cov));
423 |                 }
424 |                 _ => v.push(FxHashMap::default()),
425 |             }
426 |         }
427 |         v
428 |     };
429 | 
430 |     // Merge per-root maps into global results
431 |     let mut global: FxHashMap<String, (String, u32, u32, usize)> = FxHashMap::default();
432 |     for m in partials {
433 |         for (id, (chrom, s, e, b)) in m {
434 |             global.entry(id).and_modify(|(c0, s0, e0, breadth)| {
435 |                 if s < *s0 { *s0 = s; }
436 |                 if e > *e0 { *e0 = e; }
437 |                 // Note: If the same ID appears under multiple roots (rare), breadth is summed.
438 |                 // In well-formed GFF partitioning, one ID should belong to a single root.
439 |                 *breadth += b;
440 |                 let _ = c0;
441 |             }).or_insert((chrom, s, e, b));
442 |         }
443 |     }
444 | 
445 |     if verbose {
446 |         eprintln!("[INFO] Aggregated {} feature IDs", global.len());
447 |     }
448 | 
449 |     Ok(global)
450 | }
451 | 
452 | /// Write "id\tchr\tstart\tend\tbreadth\tfraction" per line.
453 | pub fn write_breadth_results<W: Write>(
454 |     id_map: FxHashMap<String, (String, u32, u32, usize)>,
455 |     mut out: W,
456 |     verbose: bool,
457 | ) -> Result<()> {
458 |     use std::fmt::Write as FmtWrite;
459 |     let mut buf = String::with_capacity(WRITE_BUF_SIZE);
460 |     let mut written = 0usize;
461 | 
462 |     // Header: 6 columns
463 |     writeln!(buf, "id\tchr\tstart\tend\tbreadth\tfraction")?;
464 | 
465 |     for (id, (chr, start, end, breadth)) in id_map {
466 |         let length = end.saturating_sub(start) as usize;
467 |         let fraction = if length > 0 {
468 |             breadth as f64 / length as f64
469 |         } else {
470 |             0.0
471 |         };
472 |         writeln!(buf, "{id}\t{chr}\t{start}\t{end}\t{breadth}\t{:.6}", fraction)?;
473 |         written += 1;
474 | 
475 |         if buf.len() >= WRITE_BUF_SIZE {
476 |             out.write_all(buf.as_bytes())?;
477 |             buf.clear();
478 |         }
479 |     }
480 | 
481 |     if !buf.is_empty() {
482 |         out.write_all(buf.as_bytes())?;
483 |     }
484 |     out.flush()?;
485 | 
486 |     if verbose {
487 |         eprintln!("[INFO] Wrote {written} feature coverage rows.");
488 |     }
489 |     Ok(())
490 | }
491 | 
492 | /// Main
493 | pub fn run(args: &CoverageArgs) -> Result<()> {
494 |     let verbose = args.verbose;
495 |     let threads = if args.threads == 0 {
496 |         std::thread::available_parallelism()
497 |             .map(|n| n.get())
498 |             .unwrap_or(1)
499 |     } else {
500 |         args.threads
501 |     };
502 |     let _ = rayon::ThreadPoolBuilder::new().num_threads(threads).build_global();
503 |     let gff_path = &args.input;
504 | 
505 |     // Step 1: load GOF index + mmap GFF
506 |     let t0 = Instant::now();
507 |     let gof = load_gof(gff_path)?;
508 |     let file = File::open(gff_path).with_context(|| format!("Cannot open GFF file: {:?}", gff_path))?;
509 |     let gff_mmap = unsafe { Mmap::map(&file) }.with_context(|| format!("GFF mmap failed for {:?}", gff_path))?;
510 |     let t_load_fts = t0.elapsed();
511 |     if verbose {
512 |         eprintln!("[TIMER] [run] Step 1: Load GOF & mmap GFF: {:.2?}", t_load_fts);
513 |     }
514 | 
515 |     // Step 2: load interval tree index
516 |     let t1 = Instant::now();
517 |     let index_data = TreeIndexData::load_tree_index(gff_path)?;
518 |     let t_build_index = t1.elapsed();
519 |     if verbose {
520 |         eprintln!("[TIMER] [run] Step 2: Load tree index: {:.2?}", t_build_index);
521 |     }
522 | 
523 |     // Step 3: collect coverage intervals per root
524 |     let t2 = Instant::now();
525 |     let source_path = &args.source;
526 | 
527 |     let ext = source_path
528 |         .extension()
529 |         .and_then(|s| s.to_str())
530 |         .map(|s| s.to_lowercase());
531 | 
532 |     let by_root = match ext.as_deref() {
533 |         Some("bam") | Some("sam") | Some("cram") => {
534 |             collect_by_root_from_bam(source_path.as_path(), &index_data, verbose, threads)?
535 |         }
536 |         Some("bed") => {
537 |             collect_by_root_from_bed(source_path.as_path(), &index_data, verbose)?
538 |         }
539 |         _ => {
540 |             bail!(
541 |                 "Unsupported file type: {:?}. Expected .bam/.sam/.cram or .bed",
542 |                 source_path
543 |             );
544 |         }
545 |     };
546 |     let t_collect = t2.elapsed();
547 |     if verbose {
548 |         eprintln!("[TIMER] [run] Step 3: Collect intervals: {:.2?}", t_collect);
549 |     }
550 | 
551 |     // Step 4: per-root merge & compute breadth over GFF slices
552 |     let t3 = Instant::now();
553 |     let id_map = finalize_compute_breadth(by_root, &gof, &gff_mmap, threads, verbose)?;
554 |     let t_compute = t3.elapsed();
555 |     if verbose {
556 |         eprintln!("[TIMER] [run] Step 4: Compute breadth: {:.2?}", t_compute);
557 |     }
558 | 
559 |     // Step 5: write results
560 |     let t4 = Instant::now();
561 | 
562 |     let out: Box<dyn Write> = match &args.output {
563 |         Some(path) => {
564 |             let file = File::create(path)?;
565 |             Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, file))
566 |         }
567 |         None => {
568 |             let stdout = std::io::stdout();
569 |             let handle = stdout.lock();
570 |             Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, handle))
571 |         }
572 |     };
573 |     write_breadth_results(id_map, out, verbose)?;
574 |     let t_write_out = t4.elapsed();
575 |     if verbose {
576 |         eprintln!("[TIMER] [run] Step 5: Write output: {:.2?}", t_write_out);
577 |         let total = t0.elapsed();
578 |         eprintln!("[TIMER] [run] Total time: {:.2?}", total);
579 |     }
580 | 
581 |     Ok(())
582 | }
583 | 
--------------------------------------------------------------------------------
/src/commands/depth.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::{Result, bail, Context};
  2 | use rayon::prelude::*;
  3 | use memmap2::Mmap;
  4 | use rust_htslib::bam::{self, Read};
  5 | use rust_htslib::bam::ext::BamRecordExtensions;
  6 | use rustc_hash::{FxHashMap, FxHashSet};
  7 | use std::{
  8 |     str,
  9 |     fs::File,
 10 |     path::{Path, PathBuf},
 11 |     io::{BufWriter, Write},
 12 | };
 13 | use clap::Parser;
 14 | use crate::{
 15 |     Interval, TreeIndexData, load_gof, GofMap,
 16 | };
 17 | use std::time::{Instant, Duration};
 18 | 
 19 | // Sentinel for missing entries
 20 | const MISSING: u64 = u64::MAX;
 21 | // Output write buffer size in bytes (32 MiB)
 22 | const WRITE_BUF_SIZE: usize = 32 * 1024 * 1024;
 23 | // Number of records per processing batch
 24 | const BATCH_SIZE: usize = 100_000;
 25 | 
 26 | /// Compute bin index for coordinate `x` given shift k.
 27 | /// Each bin has width 2^k bp. Smaller k = finer bins, larger k = coarser bins.
 28 | #[inline]
 29 | fn bin_of(x: u32, shift: u32) -> u32 {
 30 |     x >> shift
 31 | }
 32 | 
 33 | /// Arguments for `depth` command
 34 | #[derive(Parser, Debug)]
 35 | #[command(
 36 |     about = "Compute coverage depth across genomic features",
 37 |     long_about = "This tool computes sequencing depth (number of overlapping regions/reads per feature) \
 38 |                   from SAM/BAM/CRAM or BED input. It does not compute breadth/fraction coverage."
 39 | )]
 40 | pub struct DepthArgs {
 41 |     /// Input GFF file path
 42 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
 43 |     pub input: PathBuf,
 44 | 
 45 |     /// Input source (BAM/SAM/CRAM or BED)
 46 |     #[arg(short = 's', long)]
 47 |     pub source: PathBuf,
 48 | 
 49 |     /// Output file (stdout if not provided)
 50 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
 51 |     pub output: Option<PathBuf>,
 52 | 
 53 |     /// Bin width parameter (2^k bp) for spatial bucketing of features and queries.
 54 |     /// Choose k so that a typical read and feature span ~1–2 bins.
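    /// For example, with the default k = 12 each bin spans 2^12 = 4096 bp, so a
    /// 150 bp read starting at position 10,000 lies entirely in bin 2
    /// (10_000 >> 12 == 2), as does any feature ending below position 12,288.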
55 |     ///
56 |     /// Typical values:
57 |     ///   Short reads (Illumina 100–150 bp): k=10–11 (1–2 kb bins)
58 |     ///   PacBio HiFi (15–20 kb):            k=13–14 (8–16 kb bins)
59 |     ///   ONT long reads (30–60 kb):         k=14–15 (16–32 kb bins)
60 |     ///
61 |     /// Adjust:
62 |     ///   Longer features → increase k (larger bins, less fragmentation).
63 |     ///   Denser features → decrease k (smaller bins, stronger filtering).
64 |     #[arg(long = "bin-shift", default_value_t = 12)]
65 |     pub bin_shift: u32,
66 |
67 |     /// Number of threads for parallel processing
68 |     #[arg(short = 't', long = "threads", default_value_t = 12)]
69 |     pub threads: usize,
70 |
71 |     /// Enable verbose output
72 |     #[arg(short = 'v', long = "verbose", default_value_t = false)]
73 |     pub verbose: bool,
74 | }
75 |
76 | /// Check half-open overlap: [a1, a2) vs [b1, b2)
77 | #[inline(always)]
78 | fn overlaps(a1: u32, a2: u32, b1: u32, b2: u32) -> bool {
79 |     let left = if a1 > b1 { a1 } else { b1 };
80 |     let right = if a2 < b2 { a2 } else { b2 };
81 |     left < right
82 | }
83 |
84 | /// Parse unsigned int quickly into u32
85 | #[inline(always)]
86 | fn parse_u32_fast(s: &str) -> Option<u32> {
87 |     if s.is_empty() { return None; }
88 |     let mut n: u32 = 0;
89 |     for b in s.as_bytes() {
90 |         let d = b.wrapping_sub(b'0');
91 |         if d > 9 { return None; }
92 |         n = n.checked_mul(10)?.checked_add(d as u32)?;
93 |     }
94 |     Some(n)
95 | }
96 |
97 | #[derive(Clone, Copy)]
98 | struct RegionRef { start: u32, end: u32 }
99 |
100 | #[derive(Clone, Copy)]
101 | struct FeatureInst { start: u32, end: u32, id_idx: u32 }
102 |
103 | #[inline(always)]
104 | fn fast_id(attrs: &str) -> Option<&str> {
105 |     let bytes = attrs.as_bytes();
106 |     let mut i = 0;
107 |     while i + 2 < bytes.len() {
108 |         if bytes[i] == b'I' && bytes[i+1] == b'D' && bytes[i+2] == b'=' {
109 |             let mut j = i + 3;
110 |             while j < bytes.len() && bytes[j] != b';' && bytes[j] != b' ' && bytes[j] != b'\t' {
111 |                 j += 1;
112 |             }
113 |             return std::str::from_utf8(&bytes[i+3..j]).ok();
114 |         }
115 |         i += 1;
116 |     }
117 |     None
118 | }
119 |
120 | /// Parse one GFF slice and count feature *depth* (how many regions overlap it; deduped per region)
121 | fn compute_root_depth(
122 |     gff_slice: &[u8],
123 |     regions: &[RegionRef],
124 |     bin_shift: u32,
125 | ) -> FxHashMap<String, (String, u32, u32, u32)> {
126 |     let mut id_to_idx: FxHashMap<String, u32> = FxHashMap::default();
127 |     let mut id_strings: Vec<String> = Vec::new();
128 |     let mut id_chrom: Vec<String> = Vec::new();
129 |     let mut feats: Vec<FeatureInst> = Vec::new();
130 |
131 |     if let Ok(text) = str::from_utf8(gff_slice) {
132 |         for line in text.split_terminator('\n') {
133 |             if line.is_empty() || line.as_bytes()[0] == b'#' { continue; }
134 |             let mut cols = line.splitn(9, '\t');
135 |             let (Some(seqid), _, _, Some(start_s), Some(end_s), _, _, _, Some(attrs)) = (
136 |                 cols.next(), cols.next(), cols.next(),
137 |                 cols.next(), cols.next(), cols.next(),
138 |                 cols.next(), cols.next(), cols.next()
139 |             ) else { continue; };
140 |
141 |             let (Some(s1), Some(e1)) = (parse_u32_fast(start_s), parse_u32_fast(end_s)) else { continue; };
142 |             if e1 == 0 { continue; }
143 |             let (s1, e1) = if s1 > e1 { (e1, s1) } else { (s1, e1) };
144 |             let fstart0 = s1.saturating_sub(1);
145 |             let fend0 = e1;
146 |
147 |             if let Some(id) = fast_id(attrs) {
148 |                 let idx = *id_to_idx.entry(id.to_owned()).or_insert_with(|| {
149 |                     let k = id_strings.len() as u32;
150 |                     id_strings.push(id.to_owned());
151 |                     id_chrom.push(seqid.to_owned());
152 |                     k
153 |                 });
154 |                 feats.push(FeatureInst { start: fstart0, end: fend0, id_idx: idx });
155 |             }
156 |         }
157 |     }
158 |
159 |     if feats.is_empty() || regions.is_empty() {
160 |         return FxHashMap::default();
161 |     }
162 |
163 |     let max_feat_bin = feats.iter().map(|f| bin_of(f.end.saturating_sub(1), bin_shift)).max().unwrap_or(0);
164 |     let mut feat_bins: Vec<Vec<u32>> = vec![Vec::new(); (max_feat_bin as usize) + 1];
165 |     for (i, f) in feats.iter().enumerate() {
166 |         let b0 = bin_of(f.start, bin_shift);
167 |         let b1 = bin_of(f.end.saturating_sub(1), bin_shift);
168 |         for b in b0..=b1 {
169 |             feat_bins[b as usize].push(i as u32);
170 |         }
171 |     }
172 |
173 |     let n_ids = id_strings.len();
174 |     let mut min_s: Vec<u32> = vec![u32::MAX; n_ids];
175 |     let mut max_e: Vec<u32> = vec![0; n_ids];
176 |     let mut depths: Vec<u32> = vec![0; n_ids];
177 |
178 |     let mut cand: Vec<u32> = Vec::new();
179 |     let mut hit_ids: Vec<u32> = Vec::new();
180 |
181 |     for r in regions {
182 |         cand.clear();
183 |         let rb0 = bin_of(r.start, bin_shift);
184 |         let rb1 = bin_of(r.end.saturating_sub(1), bin_shift);
185 |         for b in rb0..=rb1 {
186 |             if let Some(v) = feat_bins.get(b as usize) {
187 |                 cand.extend_from_slice(v);
188 |             }
189 |         }
190 |         cand.sort_unstable();
191 |         cand.dedup();
192 |
193 |         hit_ids.clear();
194 |         for &fi in &cand {
195 |             let f = feats[fi as usize];
196 |             if overlaps(f.start, f.end, r.start, r.end) {
197 |                 hit_ids.push(f.id_idx);
198 |                 if f.start < min_s[f.id_idx as usize] { min_s[f.id_idx as usize] = f.start; }
199 |                 if f.end > max_e[f.id_idx as usize] { max_e[f.id_idx as usize] = f.end; }
200 |             }
201 |         }
202 |         hit_ids.sort_unstable();
203 |         hit_ids.dedup();
204 |         for &ii in &hit_ids {
205 |             depths[ii as usize] += 1;
206 |         }
207 |     }
208 |
209 |     let mut out: FxHashMap<String, (String, u32, u32, u32)> = FxHashMap::default();
210 |     for (i, &d) in depths.iter().enumerate() {
211 |         if d > 0 {
212 |             let s = if min_s[i] == u32::MAX { 0 } else { min_s[i] };
213 |             out.insert(id_strings[i].clone(), (id_chrom[i].clone(), s, max_e[i], d));
214 |         }
215 |     }
216 |     out
217 | }
218 |
219 | /// Batch API: for a batch of regions, return "feature ID -> (chrom, start, end, depth)".
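///
/// A minimal usage sketch (hypothetical region values; `index_data`, `gof`,
/// and `gff_mmap` are assumed to be loaded as in `run` below):
///
/// ```ignore
/// let regions = vec![(0u32, 1_000, 2_000), (0u32, 1_500, 2_500)];
/// let depths = compute_hit_depth(&index_data, &regions, &gof, &gff_mmap, 12, 4)?;
/// for (id, (chrom, start, end, depth)) in &depths {
///     eprintln!("{id}\t{chrom}\t{start}\t{end}\t{depth}");
/// }
/// ```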
220 | /// 221 | /// - depth = how many regions overlap with the feature (count of regions, deduped per region) 222 | pub fn compute_hit_depth( 223 | index_data: &TreeIndexData, 224 | regions: &[(u32, u32, u32)], 225 | gof: &GofMap, 226 | gff_mmap: &Mmap, 227 | bin_shift: u32, 228 | threads: usize, 229 | ) -> Result> { 230 | let mut by_root: FxHashMap> = FxHashMap::default(); 231 | let idx = gof.index_cached(); 232 | let gff_bytes: &[u8] = &gff_mmap[..]; 233 | 234 | let mut hits: Vec<&Interval> = Vec::new(); 235 | for &(chr, rstart, rend) in regions { 236 | if let Some(tree) = index_data.chr_entries.get(&chr) { 237 | hits.clear(); 238 | tree.query_interval(rstart, rend, &mut hits); 239 | let mut seen_in_region: FxHashSet = FxHashSet::default(); 240 | for h in &hits { 241 | if !seen_in_region.insert(h.root_fid) { continue; } 242 | if let Some(&(s_off, e_off)) = idx.get(&h.root_fid) { 243 | if s_off == MISSING || e_off == MISSING || e_off <= s_off { continue; } 244 | by_root.entry(h.root_fid).or_default().push(RegionRef { start: rstart, end: rend }); 245 | } 246 | } 247 | } 248 | } 249 | 250 | let mut out: FxHashMap = FxHashMap::default(); 251 | if by_root.is_empty() { return Ok(out); } 252 | 253 | let roots_iter = by_root.into_iter(); 254 | if threads > 1 { 255 | // Parallel execution: each root slice is processed independently 256 | let partials: Vec<_> = roots_iter.par_bridge().map(|(root, regs)| { 257 | let (s_off, e_off) = *idx.get(&root).unwrap(); 258 | let su = usize::try_from(s_off).unwrap(); 259 | let eu = usize::try_from(e_off).unwrap(); 260 | compute_root_depth(&gff_bytes[su..eu], ®s, bin_shift) 261 | }).collect(); 262 | 263 | // Merge results from all roots 264 | for m in partials { 265 | for (id, (chrom, s, e, d)) in m { 266 | out.entry(id).and_modify(|(c0, s0, e0, depth)| { 267 | if s < *s0 { *s0 = s; } 268 | if e > *e0 { *e0 = e; } 269 | *depth += d; 270 | let _ = c0; 271 | }).or_insert((chrom, s, e, d)); 272 | } 273 | } 274 | } else { 275 | // Serial execution 276 | for (root, regs) in roots_iter { 277 | let (s_off, e_off) = *idx.get(&root).unwrap(); 278 | let su = usize::try_from(s_off).unwrap(); 279 | let eu = usize::try_from(e_off).unwrap(); 280 | let m = compute_root_depth(&gff_bytes[su..eu], ®s, bin_shift); 281 | for (id, (chrom, s, e, d)) in m { 282 | out.entry(id).and_modify(|(c0, s0, e0, depth)| { 283 | if s < *s0 { *s0 = s; } 284 | if e > *e0 { *e0 = e; } 285 | *depth += d; 286 | let _ = c0; 287 | }).or_insert((chrom, s, e, d)); 288 | } 289 | } 290 | } 291 | 292 | Ok(out) 293 | } 294 | 295 | /// Process BAM input with mmap/htslib and batch queries. 296 | /// Returns: "feature ID -> (chrom, start, end, depth)". 
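///
/// Records are converted to 0-based half-open intervals (`pos()`..`reference_end()`)
/// and buffered into batches of `BATCH_SIZE` regions; each full batch is resolved
/// through `compute_hit_depth` and merged into the global map, so peak memory
/// scales with the batch size rather than with the whole BAM.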
297 | pub fn process_bam( 298 | bam_path: &Path, 299 | index_data: &TreeIndexData, 300 | gof: GofMap, 301 | gff_mmap: Mmap, 302 | bin_shift: u32, 303 | threads: usize, 304 | verbose: bool, 305 | ) -> Result> { 306 | let mut global_id_counts: FxHashMap = FxHashMap::default(); 307 | 308 | let t_open = Instant::now(); 309 | let mut reader = bam::Reader::from_path(bam_path)?; 310 | reader.set_threads(std::cmp::max(2, threads))?; 311 | let t_open_elapsed = t_open.elapsed(); 312 | 313 | let header = reader.header().to_owned(); 314 | 315 | // Build tid -> chr_id mapping 316 | let t_map_build = Instant::now(); 317 | let mut tid2num: Vec> = Vec::with_capacity(header.target_count() as usize); 318 | for tid in 0..header.target_count() { 319 | let chrom_bytes = header.tid2name(tid).to_owned(); 320 | let chrom = std::str::from_utf8(&chrom_bytes)?; 321 | let chr_id = index_data.seqid_to_num.get(chrom).copied(); 322 | tid2num.push(chr_id); 323 | } 324 | let t_map_build_elapsed = t_map_build.elapsed(); 325 | 326 | let mut batch: Vec<(u32, u32, u32)> = Vec::with_capacity(BATCH_SIZE); 327 | 328 | // Timers 329 | let mut t_parse = Duration::ZERO; 330 | let mut t_tidmap = Duration::ZERO; 331 | let mut t_filtermap = Duration::ZERO; 332 | let mut t_tree = Duration::ZERO; 333 | let mut t_depthmap = Duration::ZERO; 334 | 335 | for r in reader.records() { 336 | let t0 = Instant::now(); 337 | let rec = r?; 338 | if rec.is_unmapped() { 339 | t_parse += t0.elapsed(); 340 | continue; 341 | } 342 | t_parse += t0.elapsed(); 343 | 344 | let t1 = Instant::now(); 345 | let tid = rec.tid(); 346 | if tid < 0 { 347 | t_tidmap += t1.elapsed(); 348 | continue; 349 | } 350 | if let Some(chr_id) = tid2num[tid as usize] { 351 | let start_i64 = rec.pos(); 352 | if start_i64 < 0 { 353 | t_tidmap += t1.elapsed(); 354 | continue; 355 | } 356 | let end_i64 = rec.reference_end(); 357 | if end_i64 <= start_i64 { 358 | t_tidmap += t1.elapsed(); 359 | continue; 360 | } 361 | let start = (start_i64 as i128).clamp(0, u32::MAX as i128) as u32; 362 | let end = (end_i64 as i128).clamp(0, u32::MAX as i128) as u32; 363 | 364 | batch.push((chr_id, start, end)); 365 | } 366 | t_tidmap += t1.elapsed(); 367 | 368 | if batch.len() >= BATCH_SIZE { 369 | let t2 = Instant::now(); 370 | let regions = std::mem::take(&mut batch); 371 | t_filtermap += t2.elapsed(); 372 | 373 | let t3 = Instant::now(); 374 | let id_counts = compute_hit_depth(index_data, ®ions, &gof, &gff_mmap, bin_shift, threads)?; 375 | t_tree += t3.elapsed(); 376 | 377 | let t4 = Instant::now(); 378 | for (id, (chrom, s, e, d)) in id_counts { 379 | global_id_counts.entry(id).and_modify(|(c0, s0, e0, depth)| { 380 | if s < *s0 { *s0 = s; } 381 | if e > *e0 { *e0 = e; } 382 | *depth += d; 383 | let _ = c0; 384 | }).or_insert((chrom, s, e, d)); 385 | } 386 | t_depthmap += t4.elapsed(); 387 | batch.clear(); 388 | } 389 | } 390 | 391 | if !batch.is_empty() { 392 | let t2 = Instant::now(); 393 | let regions = std::mem::take(&mut batch); 394 | t_filtermap += t2.elapsed(); 395 | 396 | let t3 = Instant::now(); 397 | let id_counts = compute_hit_depth(index_data, ®ions, &gof, &gff_mmap, bin_shift, threads)?; 398 | t_tree += t3.elapsed(); 399 | 400 | let t4 = Instant::now(); 401 | for (id, (chrom, s, e, d)) in id_counts { 402 | global_id_counts.entry(id).and_modify(|(c0, s0, e0, depth)| { 403 | if s < *s0 { *s0 = s; } 404 | if e > *e0 { *e0 = e; } 405 | *depth += d; 406 | let _ = c0; 407 | }).or_insert((chrom, s, e, d)); 408 | } 409 | t_depthmap += t4.elapsed(); 410 | } 411 | 412 | if verbose { 413 | 
eprintln!("[TIMER] (1) Opening BAM file: {:.2?}", t_open_elapsed); 414 | eprintln!("[TIMER] (1b) Build tid2num map: {:.2?}", t_map_build_elapsed); 415 | eprintln!("[TIMER] (2) Parsing records: {:.2?}", t_parse); 416 | eprintln!("[TIMER] (3) Chrom ID mapping: {:.2?}", t_tidmap); 417 | eprintln!("[TIMER] (4) Batch filter_map: {:.2?}", t_filtermap); 418 | eprintln!("[TIMER] (5) Interval tree query: {:.2?}", t_tree); 419 | eprintln!("[TIMER] (6) DepthMap updates: {:.2?}", t_depthmap); 420 | } 421 | 422 | Ok(global_id_counts) 423 | } 424 | 425 | /// Process BED input with mmap, parallel line parsing, and batch queries. 426 | /// 427 | /// Returns: "feature ID -> (chrom, start, end, depth)". 428 | /// - depth = number of regions overlapping the feature 429 | pub fn process_bed( 430 | bed_path: &Path, 431 | index_data: &TreeIndexData, 432 | gof: GofMap, 433 | gff_mmap: Mmap, 434 | bin_shift: u32, 435 | threads: usize, 436 | verbose: bool, 437 | ) -> Result> { 438 | let mut global_id_counts: FxHashMap = FxHashMap::default(); 439 | 440 | // mmap the entire BED file 441 | let file = File::open(bed_path)?; 442 | let mmap = unsafe { Mmap::map(&file)? }; 443 | let data = &mmap[..]; 444 | 445 | if verbose { 446 | eprintln!("[INFO] mmap BED file: {} bytes", data.len()); 447 | } 448 | 449 | // collect line offsets 450 | let mut line_offsets = Vec::with_capacity(1_000_000); 451 | line_offsets.push(0usize); 452 | for (i, &b) in data.iter().enumerate() { 453 | if b == b'\n' { 454 | line_offsets.push(i + 1); 455 | } 456 | } 457 | if *line_offsets.last().unwrap_or(&0) != data.len() { 458 | line_offsets.push(data.len()); 459 | } 460 | 461 | // process chunks in batches 462 | for chunk in line_offsets.windows(2).collect::>().chunks(BATCH_SIZE) { 463 | // parse BED lines into (chr_id, start, end) 464 | let regions: Vec<(u32, u32, u32)> = chunk 465 | .par_iter() 466 | .filter_map(|w| { 467 | let start = w[0]; 468 | let end = w[1]; 469 | if start >= end { 470 | return None; 471 | } 472 | let line = &data[start..end]; 473 | if line.is_empty() || line[0] == b'#' { 474 | return None; 475 | } 476 | 477 | let fields: Vec<&[u8]> = line 478 | .split(|&b| b == b'\t' || b == b' ') 479 | .filter(|f| !f.is_empty()) 480 | .collect(); 481 | if fields.len() < 3 { 482 | return None; 483 | } 484 | 485 | let chrom = std::str::from_utf8(fields[0]).ok()?; 486 | let s = std::str::from_utf8(fields[1]).ok()?.parse::().ok()?; 487 | let e = std::str::from_utf8(fields[2]).ok()?.trim_end().parse::().ok()?; 488 | if s >= e { 489 | return None; 490 | } 491 | 492 | let &chr_num = index_data.seqid_to_num.get(chrom)?; 493 | Some((chr_num, s, e)) 494 | }) 495 | .collect(); 496 | 497 | // compute depth only 498 | let id_counts = compute_hit_depth(index_data, ®ions, &gof, &gff_mmap, bin_shift, threads)?; 499 | 500 | // merge into global results 501 | for (id, (chrom, s, e, d)) in id_counts { 502 | global_id_counts.entry(id).and_modify(|(c0, s0, e0, depth)| { 503 | if s < *s0 { *s0 = s; } 504 | if e > *e0 { *e0 = e; } 505 | *depth += d; 506 | let _ = c0; // chrom assumed consistent 507 | }).or_insert((chrom, s, e, d)); 508 | } 509 | } 510 | 511 | Ok(global_id_counts) 512 | } 513 | 514 | /// Write "id\tchr\tstart\tend\tdepth" per line to output file. 
515 | pub fn write_depth_results( 516 | id_counts: FxHashMap, 517 | mut out: W, 518 | verbose: bool, 519 | ) -> Result<()> { 520 | use std::fmt::Write as FmtWrite; 521 | let mut buf = String::with_capacity(WRITE_BUF_SIZE); 522 | let mut written = 0usize; 523 | 524 | writeln!(buf, "id\tchr\tstart\tend\tdepth")?; 525 | 526 | for (id, (chr, start, end, depth)) in id_counts { 527 | writeln!(buf, "{id}\t{chr}\t{start}\t{end}\t{depth}")?; 528 | written += 1; 529 | 530 | if buf.len() >= WRITE_BUF_SIZE { 531 | out.write_all(buf.as_bytes())?; 532 | buf.clear(); 533 | } 534 | } 535 | 536 | if !buf.is_empty() { 537 | out.write_all(buf.as_bytes())?; 538 | } 539 | out.flush()?; 540 | 541 | if verbose { 542 | eprintln!("[INFO] Wrote {written} ID depth records"); 543 | } 544 | Ok(()) 545 | } 546 | 547 | /// Main entry for depth pipeline 548 | pub fn run(args: &DepthArgs) -> Result<()> { 549 | let verbose = args.verbose; 550 | let bin_shift = args.bin_shift; 551 | let threads = if args.threads == 0 { 552 | std::thread::available_parallelism() 553 | .map(|n| n.get()) 554 | .unwrap_or(1) 555 | } else { 556 | args.threads 557 | }; 558 | let _ = rayon::ThreadPoolBuilder::new().num_threads(threads).build_global(); 559 | let gff_path = &args.input; 560 | 561 | // Step 1: load GFF index 562 | let t0 = Instant::now(); 563 | let gof = load_gof(&gff_path)?; 564 | let file = File::open(gff_path).with_context(|| format!("Cannot open GFF file: {:?}", gff_path))?; 565 | let gff_mmap = unsafe { Mmap::map(&file) }.with_context(|| format!("GFF mmap failed for {:?}", gff_path))?; 566 | let t_load_fts = t0.elapsed(); 567 | if verbose { 568 | eprintln!("[TIMER] [run] Step 1: Loading index took {:.2?}", t_load_fts); 569 | } 570 | 571 | // Step 2: build interval index 572 | let t1 = Instant::now(); 573 | let index_data = TreeIndexData::load_tree_index(&gff_path)?; 574 | let t_build_index = t1.elapsed(); 575 | if verbose { 576 | eprintln!("[TIMER] [run] Step 2: Building tree index took {:.2?}", t_build_index); 577 | } 578 | 579 | // Step 3: process input file 580 | let t2 = Instant::now(); 581 | let source_path = &args.source; 582 | 583 | let ext = source_path 584 | .extension() 585 | .and_then(|s| s.to_str()) 586 | .map(|s| s.to_lowercase()); 587 | 588 | let id_counts = match ext.as_deref() { 589 | Some("bam") | Some("sam") | Some("cram") => { 590 | process_bam(source_path.as_path(), &index_data, gof, gff_mmap, bin_shift, threads, verbose)? 591 | } 592 | Some("bed") => { 593 | process_bed(source_path.as_path(), &index_data, gof, gff_mmap, bin_shift, threads, verbose)? 594 | } 595 | _ => { 596 | bail!( 597 | "Unsupported file type: {:?}. 
Expected .bam/.sam/.cram or .bed", 598 | source_path 599 | ); 600 | } 601 | }; 602 | let t_process_input = t2.elapsed(); 603 | if verbose { 604 | eprintln!("[TIMER] [run] Step 3: Processing input took {:.2?}", t_process_input); 605 | } 606 | 607 | // Step 4: write results 608 | let t3 = Instant::now(); 609 | 610 | let out: Box = match &args.output { 611 | Some(path) => { 612 | let file = File::create(path)?; 613 | Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, file)) 614 | } 615 | None => { 616 | let stdout = std::io::stdout(); 617 | let handle = stdout.lock(); 618 | Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, handle)) 619 | } 620 | }; 621 | 622 | write_depth_results(id_counts, out, verbose)?; 623 | 624 | let t_write_out = t3.elapsed(); 625 | if verbose { 626 | eprintln!("[TIMER] [run] Step 4: Writing results took {:.2?}", t_write_out); 627 | } 628 | 629 | if verbose { 630 | let total = t0.elapsed(); 631 | eprintln!("[TIMER] [run] Total pipeline time: {:.2?}", total); 632 | } 633 | 634 | Ok(()) 635 | } -------------------------------------------------------------------------------- /src/commands/extract.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | CommonArgs, load_fts, load_gof, load_prt, write_gff_output, write_gff_output_filtered 3 | }; 4 | use anyhow::{Context, Result, bail}; 5 | use clap::Parser; 6 | use rustc_hash::{FxHashMap, FxHashSet}; 7 | use std::{ 8 | fs::File, 9 | io::{BufRead, BufReader}, 10 | path::PathBuf, 11 | time::Instant, 12 | }; 13 | 14 | 15 | /// Extract subtrees from a GFF file by a list of feature names (from --feature-file). 16 | #[derive(Parser, Debug)] 17 | #[command( 18 | about = "Extract models by feature IDs", 19 | long_about = "This tool extracts features and their parent models by feature IDs" 20 | )] 21 | #[clap(group( 22 | clap::ArgGroup::new("feature") 23 | .required(true) 24 | .args(&["feature_file", "feature_id"]) 25 | ))] 26 | pub struct ExtractArgs { 27 | #[clap(flatten)] 28 | pub common: CommonArgs, 29 | 30 | #[arg(short = 'f', long, group = "feature")] 31 | pub feature_id: Option, 32 | 33 | #[arg(short = 'F', long, group = "feature")] 34 | pub feature_file: Option, 35 | } 36 | 37 | pub fn run(args: &ExtractArgs) -> Result<()> { 38 | let gff_path = &args.common.input; 39 | 40 | // Start overall timer 41 | let overall_start = Instant::now(); 42 | let verbose = args.common.verbose; 43 | if verbose { 44 | eprintln!("[DEBUG] Starting processing of {:?}", gff_path); 45 | eprintln!( 46 | "[DEBUG] Thread pool initialized with {} threads", 47 | args.common.effective_threads() 48 | ); 49 | } 50 | 51 | // Load features 52 | let fts = load_fts(gff_path)?; 53 | 54 | // Load parent relations 55 | let prt = load_prt(gff_path)?; 56 | 57 | // Load GFF offsets 58 | let gof = load_gof(gff_path)?; 59 | 60 | // Read feature string IDs (feature name) 61 | let feature_names: FxHashSet = if let Some(ref file_path) = args.feature_file { 62 | let file = File::open(file_path) 63 | .with_context(|| format!("Cannot open feature list: {:?}", file_path))?; 64 | let reader = BufReader::new(file); 65 | reader.lines().try_fold( 66 | FxHashSet::default(), 67 | |mut set, line| -> Result, std::io::Error> { 68 | let s = line?; 69 | let s = s.trim(); 70 | if !s.is_empty() { 71 | set.insert(s.to_owned()); 72 | } 73 | Ok(set) 74 | }, 75 | )? 
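        // Each non-empty, whitespace-trimmed line contributes one candidate
        // feature ID; duplicates collapse automatically in the FxHashSet.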
76 | } else if let Some(ref single_id) = args.feature_id { 77 | [single_id.clone()].into_iter().collect() 78 | } else { 79 | bail!("Either --feature-id (-f) or --feature-file (-F) must be specified"); 80 | }; 81 | 82 | // Phase A: group matches by root 83 | // Phase A: map feature names to numeric fids 84 | let (fids_set, missing) = fts.map_fnames_to_fids( 85 | &feature_names, 86 | args.common.effective_threads() 87 | ); 88 | if !missing.is_empty() { 89 | eprintln!("[WARN] {} feature IDs not found: {:?}", missing.len(), missing); 90 | } 91 | 92 | // Convert set to vec for alignment with roots 93 | let fid_vec: Vec = fids_set.iter().copied().collect(); 94 | 95 | // Use PrtMap fast resolver to map fid -> root (u32::MAX = invalid) 96 | let threads = args.common.effective_threads(); 97 | let roots_vec: Vec = prt.map_fids_to_roots(&fid_vec, threads); 98 | 99 | // Collect invalid fids (print once) and exclude invalid roots 100 | let mut invalid_fids: Vec = fid_vec.iter() 101 | .zip(roots_vec.iter()) 102 | .filter_map(|(&fid, &r)| if r == u32::MAX { Some(fid) } else { None }) 103 | .collect(); 104 | invalid_fids.sort_unstable(); 105 | invalid_fids.dedup(); 106 | if !invalid_fids.is_empty() { 107 | eprintln!( 108 | "[WARN] {} numeric feature IDs are invalid (out-of-range child or parent), skipped: {:?}", 109 | invalid_fids.len(), invalid_fids 110 | ); 111 | } 112 | 113 | // Deduplicate valid roots (exclude u32::MAX) 114 | let mut roots: Vec = roots_vec.iter().copied().filter(|&r| r != u32::MAX).collect(); 115 | roots.sort_unstable(); 116 | roots.dedup(); 117 | 118 | // Phase B: roots -> block offsets 119 | let blocks: Vec<(u32, u64, u64)> = gof.roots_to_offsets(&roots, args.common.effective_threads()); 120 | 121 | if !args.common.entire_group || args.common.types.is_some() { 122 | // Build per_root_matches: root_id -> set of STRING feature IDs 123 | let mut per_root_matches: FxHashMap> = FxHashMap::default(); 124 | per_root_matches.reserve(roots.len()); 125 | 126 | // roots_vec[i] is the root, fid_vec[i] is the numeric fid 127 | for (i, &root) in roots_vec.iter().enumerate() { 128 | if root == u32::MAX { 129 | continue; 130 | } 131 | // Convert numeric fid -> string ID only once here 132 | if let Some(id_str) = fts.ids.get(fid_vec[i] as usize) { 133 | per_root_matches.entry(root).or_default().insert(id_str.clone()); 134 | } 135 | } 136 | 137 | // Emit only exactly matched lines within blocks 138 | write_gff_output_filtered( 139 | gff_path, 140 | &blocks, 141 | &per_root_matches, 142 | "ID", 143 | &args.common.output, 144 | args.common.types.as_deref(), 145 | verbose, 146 | )?; 147 | } else { 148 | // Entire-group mode: emit entire blocks without filtering 149 | write_gff_output( 150 | gff_path, 151 | &blocks, 152 | &args.common.output, 153 | verbose, 154 | )?; 155 | } 156 | 157 | if verbose { 158 | eprintln!("[timing] Total elapsed: {:?}", overall_start.elapsed()); 159 | } 160 | 161 | Ok(()) 162 | } 163 | 164 | -------------------------------------------------------------------------------- /src/commands/index.rs: -------------------------------------------------------------------------------- 1 | use crate::build_index; 2 | use anyhow::Result; 3 | use clap::Parser; 4 | use std::path::PathBuf; 5 | 6 | #[derive(Parser, Debug)] 7 | #[command( 8 | about = "Build index for GFF file", 9 | long_about = "This command builds index files for fast retrieval from a GFF file." 
10 | )] 11 | pub struct IndexArgs { 12 | #[arg(short, long)] 13 | input: PathBuf, 14 | 15 | #[arg(short, long, default_value = "gene_name")] 16 | pub attribute: String, 17 | 18 | #[arg(short, long, default_value = "remark,note,comment,region,gap,assembly_gap,contig,scaffold,source")] 19 | pub skip_types: String, 20 | 21 | #[arg(short, long, default_value_t = false)] 22 | verbose: bool, 23 | } 24 | 25 | pub fn run(args: &IndexArgs) -> Result<()> { 26 | if args.verbose { 27 | println!("Indexing: {}", args.input.display()); 28 | } 29 | 30 | build_index(&args.input, &args.attribute, &args.skip_types, args.verbose)?; 31 | 32 | if args.verbose { 33 | println!("Index created successfully."); 34 | } 35 | 36 | Ok(()) 37 | } 38 | -------------------------------------------------------------------------------- /src/commands/intersect.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; 2 | use clap::{ArgGroup, Parser}; 3 | use lexical_core::parse; 4 | use memchr::memchr; 5 | use memmap2::Mmap; 6 | use rayon::prelude::*; 7 | use rustc_hash::{FxHashMap, FxHashSet}; 8 | use std::{ 9 | fs::File, 10 | io::{self, BufWriter, IoSlice, Write}, 11 | path::{Path, PathBuf}, 12 | }; 13 | 14 | use crate::{ 15 | CommonArgs, Interval, TreeIndexData, load_gof, write_gff_output, 16 | }; 17 | 18 | const MISSING: u64 = u64::MAX; // Set sentinel value for missing entries 19 | 20 | /// Number of IoSlices per batch writer 21 | const IOV_BATCH: usize = 256; 22 | /// BufWriter buffer size 23 | const WRITE_BUF_SIZE: usize = 32 * 1024 * 1024; 24 | 25 | #[derive(Debug, Clone)] 26 | pub struct RootMatched { 27 | pub root: u32, 28 | pub matched: Vec, 29 | } 30 | 31 | /// Arguments for region intersection operations 32 | #[derive(Parser, Debug)] 33 | #[command( 34 | about = "Extract models by a region or regions from a BED file", 35 | long_about = "This tool extracts features and their parent models that intersect with specified regions" 36 | )] 37 | #[clap(group( 38 | ArgGroup::new("regions").required(true).args(&["region", "bed"]) 39 | ))] 40 | #[clap(group( 41 | ArgGroup::new("mode").args(&["contained", "contains_region", "overlap"]) 42 | ))] 43 | pub struct IntersectArgs { 44 | #[clap(flatten)] 45 | pub common: CommonArgs, 46 | 47 | /// Single region in format "chr:start-end" 48 | #[arg(short = 'r', long, group = "regions")] 49 | pub region: Option, 50 | 51 | /// BED file containing regions 52 | #[arg(short = 'b', long, group = "regions")] 53 | pub bed: Option, 54 | 55 | /// Only return features fully contained within regions 56 | #[arg(short = 'c', long, group = "mode")] 57 | pub contained: bool, 58 | 59 | /// Only return features that fully contain the regions 60 | #[arg(short = 'C', long, group = "mode")] 61 | pub contains_region: bool, 62 | 63 | /// Return any overlapping features (default) 64 | #[arg(short = 'O', long, group = "mode")] 65 | pub overlap: bool, 66 | 67 | /// Invert the selection (exclude matching features) 68 | #[arg(short = 'I', long, default_value_t = false)] 69 | pub invert: bool, 70 | } 71 | 72 | /// Overlap detection modes 73 | #[derive(Debug, Clone, Copy)] 74 | pub enum OverlapMode { 75 | Contained, 76 | ContainsRegion, 77 | Overlap, 78 | } 79 | 80 | pub fn gff_type_allowed(line: &[u8], allow: &FxHashSet) -> bool { 81 | // Fast parse the 3rd field (type) without allocations 82 | let mut off = 0usize; 83 | let mut tabs = 0u8; 84 | while tabs < 2 { 85 | match memchr(b'\t', &line[off..]) { 86 | Some(i) => { 87 | off += i + 1; 88 | tabs += 
1; 89 | } 90 | None => return false, 91 | } 92 | } 93 | let i2 = match memchr(b'\t', &line[off..]) { 94 | Some(i) => off + i, 95 | None => return false, 96 | }; 97 | let ty = &line[off..i2]; 98 | match std::str::from_utf8(ty) { 99 | Ok(s) => allow.contains(s), 100 | Err(_) => false, 101 | } 102 | } 103 | 104 | /// Core feature query logic using interval trees 105 | pub fn query_features( 106 | index_data: &TreeIndexData, 107 | regions: &[(u32, u32, u32)], 108 | mode: OverlapMode, 109 | invert: bool, 110 | verbose: bool, 111 | ) -> Result> { 112 | 113 | // Bucket regions by chromosome 114 | let buckets: Vec> = { 115 | let mut b = vec![Vec::new(); index_data.seqid_to_num.len()]; 116 | for (chr, start, end) in regions.iter().copied() { 117 | b[chr as usize].push((chr, start, end)); 118 | } 119 | b 120 | }; 121 | 122 | let mut results = Vec::new(); 123 | { 124 | for (&seq_num, tree) in &index_data.chr_entries { 125 | let chr_regs = &buckets[seq_num as usize]; 126 | if chr_regs.is_empty() { 127 | continue; 128 | } 129 | if verbose { 130 | eprintln!( 131 | "[DEBUG] Querying chromosome {} with {} regions", 132 | seq_num, 133 | chr_regs.len() 134 | ); 135 | } 136 | 137 | let mut hits: Vec<&Interval> = Vec::new(); 138 | 139 | for &(_, rstart, rend) in chr_regs { 140 | hits.clear(); 141 | tree.query_interval(rstart, rend, &mut hits); 142 | 143 | for &iv in &hits { 144 | // Decide whether to keep this feature based on mode 145 | let keep = match mode { 146 | OverlapMode::Contained => { 147 | // Feature must be fully contained in region 148 | iv.start >= rstart && iv.end <= rend 149 | } 150 | OverlapMode::ContainsRegion => { 151 | // Feature must fully contain region 152 | iv.start <= rstart && iv.end >= rend 153 | } 154 | OverlapMode::Overlap => { 155 | // Any overlap is acceptable 156 | true 157 | } 158 | }; 159 | 160 | // Apply invert flag (XOR logic) 161 | if invert ^ keep { 162 | results.push((iv.root_fid, iv.start, iv.end)); 163 | } 164 | } 165 | } 166 | } 167 | } 168 | Ok(results) 169 | } 170 | 171 | /// Parse a single genomic region string (chr:start-end) 172 | pub fn parse_region( 173 | region: &str, 174 | seqid_map: &FxHashMap, 175 | common: &CommonArgs, 176 | ) -> Result<(u32, u32, u32)> { 177 | let (seq, range) = region 178 | .split_once(':') 179 | .context("Invalid region format, expected 'chr:start-end'")?; 180 | let (s, e) = range 181 | .split_once('-') 182 | .context("Invalid range format, expected 'start-end'")?; 183 | let start = s.parse::()?; 184 | let end = e.parse::()?; 185 | let chr = seqid_map 186 | .get(seq) 187 | .with_context(|| format!("Sequence ID not found: {}", seq))?; 188 | if start >= end { 189 | anyhow::bail!("Region start must be less than end ({} >= {})", start, end); 190 | } 191 | if common.verbose { 192 | eprintln!( 193 | "[DEBUG] Parsed region: chr={}, start={}, end={}", 194 | chr, start, end 195 | ); 196 | } 197 | Ok((*chr, start, end)) 198 | } 199 | 200 | /// Parse BED file using mmap zero-copy field splitting 201 | pub fn parse_bed_file( 202 | bed_path: &Path, 203 | seqid_map: &FxHashMap, 204 | ) -> Result> { 205 | let mmap = { 206 | let file = File::open(bed_path)?; 207 | unsafe { Mmap::map(&file)? 
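        // Assumption: the BED file is not modified or truncated while mapped;
        // as with any memmap2 mapping, external mutation is undefined behavior.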
} 208 | }; 209 | let regions = { 210 | let mut regions = Vec::new(); 211 | for line in mmap.split(|&b| b == b'\n') { 212 | if line.is_empty() || line[0] == b'#' { 213 | continue; 214 | } 215 | let line_str = std::str::from_utf8(line)?; 216 | let mut parts = line_str.split_ascii_whitespace(); 217 | let (Some(seq), Some(s), Some(e)) = (parts.next(), parts.next(), parts.next()) else { 218 | continue; 219 | }; 220 | let Some(&chr) = seqid_map.get(seq) else { 221 | continue; 222 | }; 223 | let start = parse::(s.as_bytes())?; 224 | let end = parse::(e.as_bytes())?; 225 | regions.push((chr, start, end)); 226 | } 227 | regions 228 | }; 229 | Ok(regions) 230 | } 231 | 232 | pub fn write_gff_match_only_by_coords( 233 | gff_path: &Path, 234 | blocks: &[(u32, u64, u64)], //Per-block parallel scan to collect (line_start, line_end) offsets 235 | query_ivmap: &FxHashMap>, 236 | types_filter: Option<&str>, 237 | output_path: &Option, 238 | mode: OverlapMode, 239 | verbose: bool, 240 | ) -> Result<()> { 241 | // mmap the whole GFF once 242 | let (mmap, file_len) = { 243 | let file = std::fs::File::open(gff_path) 244 | .with_context(|| format!("Cannot open GFF: {:?}", gff_path))?; 245 | let mmap = unsafe { Mmap::map(&file) } 246 | .with_context(|| format!("mmap failed for {:?}", gff_path))?; 247 | let len = mmap.len(); 248 | (mmap, len) 249 | }; 250 | 251 | // parse type filters to a set once 252 | let type_allow: Option> = { 253 | types_filter.map(|s| { 254 | s.split(',') 255 | .map(|t| t.trim().to_string()) 256 | .filter(|t| !t.is_empty()) 257 | .collect() 258 | }) 259 | }; 260 | 261 | // Parallel scan blocks: produce (block_start, Vec<(line_start,line_end)>) 262 | // Note: we never copy line bytes, only collect offsets. 263 | let mut parts: Vec<(u64, Vec<(u64, u64)>)> = { 264 | let bytes_out = std::sync::atomic::AtomicU64::new(0); 265 | 266 | let parts: Vec<(u64, Vec<(u64, u64)>)> = blocks 267 | .par_iter() 268 | .filter_map(|&(root, start, end)| { 269 | if start == MISSING { 270 | eprintln!("[WARN] skipped fid={} due to sentinel start offset", root); 271 | return None; 272 | } 273 | let s = start as usize; 274 | let e = (end as usize).min(file_len); 275 | if s >= e || e > file_len { 276 | return None; 277 | } 278 | let src = &mmap[s..e]; 279 | 280 | // Collect matched line ranges as global file offsets. 
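                // For example, a line occupying src[pos..nl] in a block that
                // begins at file offset `start` is recorded as the absolute
                // half-open range [start + pos, start + nl), '\n' included.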
281 | let mut matched_offsets: Vec<(u64, u64)> = Vec::with_capacity(256); 282 | let mut pos = 0usize; 283 | 284 | while pos < src.len() { 285 | // Find next newline boundary 286 | let nl = match memchr(b'\n', &src[pos..]) { 287 | Some(i) => pos + i + 1, // include '\n' 288 | None => src.len(), 289 | }; 290 | let line = &src[pos..nl]; 291 | 292 | // Trim trailing '\n' for parsing 293 | let line_nocr = if line.ends_with(b"\n") { 294 | &line[..line.len() - 1] 295 | } else { 296 | line 297 | }; 298 | 299 | if !line_nocr.is_empty() && line_nocr[0] != b'#' { 300 | // Optional: type filter first to early discard 301 | let mut pass = true; 302 | if let Some(allow) = &type_allow 303 | && !gff_type_allowed(line_nocr, allow) 304 | { 305 | pass = false; 306 | } 307 | if pass && gff_line_overlaps_queries(line_nocr, query_ivmap, mode) { 308 | // Record absolute offsets in the file (including '\n') 309 | let abs_start = start + pos as u64; 310 | let abs_end = start + nl as u64; 311 | // Safety: bounds already clamped by file_len 312 | matched_offsets.push((abs_start, abs_end)); 313 | bytes_out.fetch_add( 314 | (abs_end - abs_start) as u64, 315 | std::sync::atomic::Ordering::Relaxed, 316 | ); 317 | } 318 | } 319 | 320 | pos = nl; 321 | } 322 | 323 | if matched_offsets.is_empty() { 324 | None 325 | } else { 326 | Some((start, matched_offsets)) 327 | } 328 | }) 329 | .collect(); 330 | parts 331 | }; 332 | 333 | // Keep global order stable by block start (we don't merge offsets as per user's requirement) 334 | { 335 | parts.sort_unstable_by_key(|(s, _)| *s); 336 | } 337 | 338 | // Helper: write all slices using write_vectored with partial-write handling. 339 | // We construct a temporary Vec per batch; batch size is small (<= IOV_BATCH). 340 | fn write_all_vectored(w: &mut W, mut slices: Vec<&[u8]>) -> io::Result<()> { 341 | // Fast path: nothing to write 342 | if slices.is_empty() { 343 | return Ok(()); 344 | } 345 | 346 | // Keep writing until all slices are fully consumed 347 | while !slices.is_empty() { 348 | // Rebuild IoSlice views for current remainder 349 | let iov: Vec> = slices.iter().map(|s| IoSlice::new(s)).collect(); 350 | 351 | let wrote = w.write_vectored(&iov)?; 352 | if wrote == 0 { 353 | return Err(io::Error::new( 354 | io::ErrorKind::WriteZero, 355 | "write_vectored returned 0", 356 | )); 357 | } 358 | 359 | // Consume 'wrote' bytes from the front of `slices` 360 | let mut remaining = wrote; 361 | let mut drop_count = 0; 362 | 363 | for s in &mut slices { 364 | if remaining == 0 { 365 | break; 366 | } 367 | if remaining >= s.len() { 368 | remaining -= s.len(); 369 | drop_count += 1; 370 | } else { 371 | // Advance within the first partially-written slice 372 | *s = &s[remaining..]; 373 | remaining = 0; 374 | } 375 | } 376 | 377 | if drop_count > 0 { 378 | slices.drain(0..drop_count); 379 | } 380 | } 381 | 382 | Ok(()) 383 | } 384 | 385 | // Write out: use large BufWriter and batch IoSlice slices across consecutive parts. 
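    // Rough effect, assuming IOV_BATCH = 256: emitting ~10,000 matched lines
    // costs on the order of 40 write_vectored calls rather than 10,000
    // individual writes, with the 32 MiB BufWriter smoothing the remainder.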
386 | { 387 | // Assemble and write batches, reusing a small Vec<&[u8]> to avoid reallocs 388 | let mut batch: Vec<&[u8]> = Vec::with_capacity(IOV_BATCH); 389 | 390 | if let Some(p) = output_path { 391 | // File output path: create file and large BufWriter 392 | let file = std::fs::File::create(p)?; 393 | let mut writer = BufWriter::with_capacity(WRITE_BUF_SIZE, file); 394 | 395 | for (_, ranges) in parts.iter() { 396 | for &(ls, le) in ranges { 397 | // Safety: ls/le were validated against file_len earlier 398 | let slice = &mmap[ls as usize..le as usize]; 399 | batch.push(slice); 400 | if batch.len() >= IOV_BATCH { 401 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 402 | } 403 | } 404 | } 405 | if !batch.is_empty() { 406 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 407 | } 408 | writer.flush()?; 409 | } else { 410 | // Stdout path: lock stdout and use large BufWriter 411 | let stdout = std::io::stdout(); 412 | let handle = stdout.lock(); 413 | let mut writer = BufWriter::with_capacity(WRITE_BUF_SIZE, handle); 414 | 415 | for (_, ranges) in parts.iter() { 416 | for &(ls, le) in ranges { 417 | let slice = &mmap[ls as usize..le as usize]; 418 | batch.push(slice); 419 | if batch.len() >= IOV_BATCH { 420 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 421 | } 422 | } 423 | } 424 | if !batch.is_empty() { 425 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 426 | } 427 | writer.flush()?; 428 | } 429 | } 430 | 431 | if verbose { 432 | eprintln!( 433 | "[INFO] match-only by coords completed; minput blocks {}", 434 | blocks.len() 435 | ); 436 | } 437 | Ok(()) 438 | } 439 | 440 | /// Parse GFF line and check if it overlaps with query intervals 441 | pub fn gff_line_overlaps_queries( 442 | line: &[u8], 443 | ivmap: &FxHashMap>, 444 | mode: OverlapMode, 445 | ) -> bool { 446 | // Parse columns: seq, source, type, start, end 447 | let mut off = 0usize; 448 | 449 | let i1 = match memchr(b'\t', &line[off..]) { 450 | Some(i) => off + i, 451 | None => return false, 452 | }; 453 | let seq = &line[off..i1]; 454 | off = i1 + 1; 455 | 456 | // skip source 457 | let i2 = match memchr(b'\t', &line[off..]) { 458 | Some(i) => off + i, 459 | None => return false, 460 | }; 461 | off = i2 + 1; 462 | 463 | // skip type 464 | let i3 = match memchr(b'\t', &line[off..]) { 465 | Some(i) => off + i, 466 | None => return false, 467 | }; 468 | off = i3 + 1; 469 | 470 | // parse start 471 | let i4 = match memchr(b'\t', &line[off..]) { 472 | Some(i) => off + i, 473 | None => return false, 474 | }; 475 | let start = match parse_u32_ascii(&line[off..i4]) { 476 | Some(v) => v, 477 | None => return false, 478 | }; 479 | off = i4 + 1; 480 | 481 | // parse end 482 | let i5 = match memchr(b'\t', &line[off..]) { 483 | Some(i) => off + i, 484 | None => return false, 485 | }; 486 | let end = match parse_u32_ascii(&line[off..i5]) { 487 | Some(v) => v, 488 | None => return false, 489 | }; 490 | 491 | let seq_str = match std::str::from_utf8(seq) { 492 | Ok(s) => s, 493 | Err(_) => return false, 494 | }; 495 | let ivs = match ivmap.get(seq_str) { 496 | Some(v) => v, 497 | None => return false, 498 | }; 499 | 500 | for &(qs, qe) in ivs { 501 | let keep = match mode { 502 | OverlapMode::Contained => { 503 | // feature must be fully inside query 504 | start >= qs && end <= qe 505 | } 506 | OverlapMode::ContainsRegion => { 507 | // feature must fully contain query 508 | start <= qs && end >= qe 509 | } 510 | OverlapMode::Overlap => { 511 | // any overlap 512 | (qs <= start && 
start <= qe) 513 | || (qs <= end && end <= qe) 514 | || (start <= qs && qs <= end) 515 | || (start <= qe && qe <= end) 516 | } 517 | }; 518 | if keep { 519 | return true; 520 | } 521 | } 522 | false 523 | } 524 | 525 | #[inline] 526 | fn parse_u32_ascii(s: &[u8]) -> Option { 527 | let mut v: u32 = 0; 528 | if s.is_empty() { 529 | return None; 530 | } 531 | for &c in s { 532 | if !c.is_ascii_digit() { 533 | return None; 534 | } 535 | v = v.checked_mul(10)?.checked_add((c - b'0') as u32)?; 536 | } 537 | Some(v) 538 | } 539 | 540 | /// Main execution function 541 | pub fn run(args: &IntersectArgs) -> Result<()> { 542 | let verbose = args.common.verbose; 543 | 544 | if verbose { 545 | eprintln!("[DEBUG] Starting processing of {:?}", args.common.input); 546 | eprintln!( 547 | "[DEBUG] Thread pool initialized with {} threads", 548 | args.common.effective_threads() 549 | ); 550 | } 551 | 552 | // Determine overlap mode 553 | let mode = if args.contained { 554 | OverlapMode::Contained 555 | } else if args.contains_region { 556 | OverlapMode::ContainsRegion 557 | } else { 558 | OverlapMode::Overlap 559 | }; 560 | 561 | let index_data = TreeIndexData::load_tree_index(&args.common.input)?; 562 | let seqid_map = &index_data.seqid_to_num; 563 | 564 | let regions = { 565 | if let Some(bed) = &args.bed { 566 | parse_bed_file(bed, seqid_map)? 567 | } else if let Some(r) = &args.region { 568 | vec![parse_region(r, seqid_map, &args.common)?] 569 | } else { 570 | anyhow::bail!("No region specified"); 571 | } 572 | }; 573 | 574 | 575 | if verbose { 576 | eprintln!( 577 | "[DEBUG] Starting query_features with {} regions", 578 | regions.len() 579 | ); 580 | eprintln!( 581 | "[DEBUG] Mode: {:?}", 582 | mode 583 | ); 584 | } 585 | 586 | let feats = { 587 | query_features( 588 | &index_data, 589 | ®ions, 590 | mode, 591 | args.invert, 592 | args.common.verbose, 593 | )? 
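        // Each hit is (root_fid, feature_start, feature_end); roots are
        // grouped and deduplicated below before GOF offsets are resolved.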
594 |     }
595 |     };
596 |
597 |     // Collect IDs for root features only
598 |     let gof = load_gof(&args.common.input)?;
599 |     let root_matches: Vec<RootMatched> = {
600 |         let mut grouped: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
601 |         for (root, _s, _e) in feats {
602 |             grouped.entry(root).or_default().push(root);
603 |         }
604 |         grouped
605 |             .into_iter()
606 |             .map(|(root, matched)| RootMatched { root, matched })
607 |             .collect()
608 |     };
609 |
610 |     let roots: Vec<u32> = {
611 |         let mut s: FxHashSet<u32> = FxHashSet::default();
612 |         for rm in &root_matches {
613 |             s.insert(rm.root);
614 |         }
615 |         s.into_iter().collect()
616 |     };
617 |
618 |     let blocks: Vec<(u32, u64, u64)> = gof.roots_to_offsets(&roots, args.common.effective_threads());
619 |
620 |     if !args.common.entire_group || args.common.types.is_some() {
621 |         // Build query interval map by seq name
622 |         let query_ivmap: FxHashMap<String, Vec<(u32, u32)>> = {
623 |             let mut num_to_seq: FxHashMap<u32, String> = FxHashMap::default();
624 |             for (name, &num) in index_data.seqid_to_num.iter() {
625 |                 num_to_seq.insert(num, name.clone());
626 |             }
627 |             let mut m: FxHashMap<String, Vec<(u32, u32)>> = FxHashMap::default();
628 |             for &(chr_num, s, e) in &regions {
629 |                 if let Some(seq_name) = num_to_seq.get(&chr_num) {
630 |                     m.entry(seq_name.clone()).or_default().push((s, e));
631 |                 }
632 |             }
633 |             m
634 |         };
635 |
636 |         {
637 |             write_gff_match_only_by_coords(
638 |                 args.common.input.as_path(),
639 |                 &blocks,
640 |                 &query_ivmap,
641 |                 args.common.types.as_deref(),
642 |                 &args.common.output,
643 |                 mode,
644 |                 args.common.verbose,
645 |             )?;
646 |         }
647 |     } else {
648 |         write_gff_output(
649 |             args.common.input.as_path(),
650 |             &blocks,
651 |             &args.common.output,
652 |             args.common.verbose,
653 |         )?;
654 |     }
655 |     Ok(())
656 | }
--------------------------------------------------------------------------------
/src/commands/sample.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use clap::Parser;
3 | use rayon::prelude::*;
4 | use rand::seq::IndexedRandom;
5 | use rand::rng;
6 | use std::{
7 |     path::PathBuf,
8 | };
9 | use crate::{load_gof, write_gff_output};
10 |
11 | /// Arguments
12 | #[derive(Parser, Debug)]
13 | #[command(
14 |     about = "Sample feature groups per chromosome",
15 |     long_about = "Sample feature groups per chromosome."
16 | )]
17 | pub struct SampleArgs {
18 |     /// GFF file path (indexed via GOF)
19 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
20 |     pub input: PathBuf,
21 |
22 |     /// Ratio of downsampling
23 |     #[arg(short = 'r', long = "ratio")]
24 |     pub ratio: f32,
25 |
26 |     /// Output file (stdout if not provided)
27 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
28 |     pub output: Option<PathBuf>,
29 |
30 |     /// Number of threads
31 |     #[arg(short = 't', long = "threads", default_value_t = 12, value_name = "NUM")]
32 |     pub threads: usize,
33 |
34 |     /// Verbose logs
35 |     #[arg(short = 'v', long = "verbose", default_value_t = false, value_name = "BOOL")]
36 |     pub verbose: bool,
37 | }
38 |
39 | pub fn run(args: &SampleArgs) -> Result<()> {
40 |     let verbose = args.verbose;
41 |     let threads = if args.threads == 0 {
42 |         std::thread::available_parallelism()
43 |             .map(|n| n.get())
44 |             .unwrap_or(1)
45 |     } else {
46 |         args.threads
47 |     };
48 |     let _ = rayon::ThreadPoolBuilder::new().num_threads(threads).build_global();
49 |     let gff_path = &args.input;
50 |
51 |     let gof = load_gof(&gff_path)?;
52 |
53 |     let blocks: Vec<(u32, u64, u64)> = gof.seqid_index
54 |         .par_iter()
55 |         .flat_map(|(_seqid_num, indices)| {
56 |             let mut rng = rng();
57 |
58 |             // 1. collect all fids for this chromosome
59 |             let fids: Vec<u32> = indices
60 |                 .iter()
61 |                 .map(|&i| gof.entries[i].feature_id)
62 |                 .collect();
63 |
64 |             if fids.is_empty() {
65 |                 return Vec::new();
66 |             }
67 |
68 |             // 2. sample a `ratio` fraction of the fids (rounded up)
69 |             let sample_size = (fids.len() as f32 * args.ratio).ceil() as usize;
70 |             let sampled: Vec<u32> = fids.choose_multiple(&mut rng, sample_size).cloned().collect();
71 |
72 |             // 3. use index_cached() to get offsets
73 |             let idx = gof.index_cached();
74 |             sampled
75 |                 .into_iter()
76 |                 .filter_map(|fid| idx.get(&fid).map(|&(s, e)| (fid, s, e)))
77 |                 .collect::<Vec<_>>()
78 |         })
79 |         .collect();
80 |
81 |     // Step 3: write sampled GFF blocks
82 |     write_gff_output(gff_path, &blocks, &args.output, verbose)?;
83 |     Ok(())
84 | }
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/src/commands/search.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Result, bail};
2 | use clap::{ArgGroup, Parser};
3 | use rustc_hash::{FxHashMap, FxHashSet};
4 | use regex::Regex;
5 | use std::{
6 |     fs::File,
7 |     io::{BufReader, BufRead},
8 |     path::PathBuf,
9 |     time::Instant,
10 | };
11 |
12 |
13 | use crate::{
14 |     CommonArgs, load_gof, load_prt, load_a2f, load_atn,
15 |     write_gff_output, write_gff_output_filtered,
16 | };
17 |
18 | #[derive(Parser, Debug)]
19 | #[command(
20 |     about = "Search features by attribute values",
21 |     group = ArgGroup::new("attr_input")
22 |         .required(true)
23 |         .args(["attr_list", "attr"])
24 | )]
25 | pub struct SearchArgs {
26 |     /// Common input/output/thread arguments
27 |     #[clap(flatten)]
28 |     pub common: CommonArgs,
29 |
30 |     #[arg(
31 |         short = 'A',
32 |         long,
33 |         help = "Attribute list file (one per line)",
34 |         group = "attr_input"
35 |     )]
36 |     attr_list: Option<PathBuf>,
37 |
38 |     #[arg(
39 |         short = 'a',
40 |         long,
41 |         help = "Single attribute value to search",
42 |         group = "attr_input"
43 |     )]
44 |     attr: Option<String>,
45 |
46 |     #[arg(
47 |         short = 'r',
48 |         long,
49 |         help = "Enable regex mode for attribute matching")]
50 |     regex: bool,
51 | }
52 |
53 | /// When in `per-feature` mode: within each root block, emit only lines whose `ID` exactly matches
54 | /// the user-specified features under that root. Optional `types_filter` is applied to column 3.
55 | pub fn run(args: &SearchArgs) -> Result<()> {
56 |     let verbose = args.common.verbose;
57 |     let gff_path = &args.common.input;
58 |
59 |     // Init thread pool
60 |     let overall_start = Instant::now();
61 |     if verbose {
62 |         eprintln!("[DEBUG] Starting processing of {:?}", gff_path);
63 |         eprintln!(
64 |             "[DEBUG] Thread pool initialized with {} threads",
65 |             args.common.effective_threads()
66 |         );
67 |     }
68 |
69 |     // Load index artifacts
70 |     let prt = load_prt(gff_path)?;   // parent pointers (fid -> parent fid)
71 |     let gof = load_gof(gff_path)?;   // GOF offsets (fid -> (start,end))
72 |     let a2f = load_a2f(gff_path)?;   // attribute index -> fid
73 |     let (atn_attr_name, atn_values) = load_atn(gff_path)?; // attribute values table (index-aligned)
74 |
75 |     // Collect attribute values from file or single arg
76 |     let attr_values: Vec<String> = if let Some(file) = &args.attr_list {
77 |         let reader = BufReader::new(File::open(file)?);
78 |         reader
79 |             .lines()
80 |             .map(|r| r.map(|s| s.trim().to_owned()))
81 |             .filter(|r| r.as_ref().map_or(true, |s| !s.is_empty()))
82 |             .collect::<Result<Vec<_>, _>>()?
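        // One attribute value per line; blank lines are dropped after trimming
        // and read errors propagate via the collected Result.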
83 | } else if let Some(val) = &args.attr { 84 | vec![val.clone()] 85 | } else { 86 | bail!("Either --attr-list (-A) or --attr (-a) must be provided."); 87 | }; 88 | 89 | // Step 1: build attribute -> AID list 90 | // In regex mode, match by regex; otherwise exact string match. 91 | let mut attr_to_aids: FxHashMap> = FxHashMap::default(); 92 | if args.regex { 93 | let patterns: Vec = attr_values 94 | .iter() 95 | .map(String::as_str) 96 | .map(Regex::new) 97 | .collect::, _>>()?; 98 | 99 | for (i, val) in atn_values.iter().enumerate() { 100 | if patterns.iter().any(|re| re.is_match(val)) { 101 | attr_to_aids.entry(val.clone()).or_default().push(i as u32); 102 | } 103 | } 104 | } else { 105 | let wanted: FxHashSet<&str> = attr_values.iter().map(String::as_str).collect(); 106 | for (i, val) in atn_values.iter().enumerate() { 107 | if wanted.contains(val.as_str()) { 108 | attr_to_aids.entry(val.clone()).or_default().push(i as u32); 109 | } 110 | } 111 | } 112 | 113 | // Nothing matched → early exit with a helpful error 114 | if attr_to_aids.is_empty() { 115 | bail!("None of the attributes matched."); 116 | } 117 | 118 | if verbose { 119 | eprintln!("[DEBUG] Matched attribute -> AIDs:"); 120 | for (attr_val, aids) in &attr_to_aids { 121 | eprintln!(" {} => {:?}", attr_val, aids); 122 | } 123 | } 124 | 125 | // Step 2: map AIDs -> FIDs via a2f (attribute index to feature id) 126 | // Note: a2f is expected to be indexable by AID (usize). 127 | // We also deduplicate per attribute to keep vectors lean. 128 | let mut attr_to_fids: FxHashMap> = FxHashMap::default(); 129 | 130 | for (attr_val, aids) in &attr_to_aids { 131 | let mut fids = a2f.map_aids_to_fids_vec(aids); 132 | fids.sort_unstable(); 133 | fids.dedup(); 134 | 135 | if !fids.is_empty() { 136 | attr_to_fids.insert(attr_val.clone(), fids); 137 | } 138 | } 139 | 140 | if attr_to_fids.is_empty() { 141 | bail!("No feature IDs (FIDs) resolved from matched attributes."); 142 | } 143 | 144 | if verbose { 145 | eprintln!("[DEBUG] Attribute -> FIDs after a2f mapping:"); 146 | for (attr_val, fids) in &attr_to_fids { 147 | eprintln!(" {} => {:?} ", attr_val, fids); 148 | } 149 | } 150 | 151 | // Step 3: map FIDs -> root FIDs using PrtMap::map_fids_to_roots (fast) 152 | let mut fid_vec: Vec = attr_to_fids 153 | .values() 154 | .flat_map(|v| v.iter().copied()) 155 | .collect(); 156 | fid_vec.sort_unstable(); 157 | fid_vec.dedup(); 158 | 159 | if verbose { 160 | eprintln!("[DEBUG] Total unique FIDs: {}", fid_vec.len()); 161 | } 162 | 163 | let threads = args.common.effective_threads(); 164 | let root: Vec = prt.map_fids_to_roots(&fid_vec, threads); 165 | 166 | // Collect invalid fids (mapped to u32::MAX), and build a unique root list 167 | let mut invalid_fids: Vec = Vec::new(); 168 | let mut roots_effective: Vec = Vec::with_capacity(root.len()); 169 | for (&fid, r) in fid_vec.iter().zip(root.iter()) { 170 | if *r == u32::MAX { 171 | invalid_fids.push(fid); 172 | } else { 173 | roots_effective.push(*r); 174 | } 175 | } 176 | 177 | if !invalid_fids.is_empty() { 178 | invalid_fids.sort_unstable(); 179 | invalid_fids.dedup(); 180 | eprintln!( 181 | "[WARN] {} FIDs have invalid parent chains (or out-of-range): {:?}", 182 | invalid_fids.len(), 183 | invalid_fids 184 | ); 185 | } 186 | roots_effective.sort_unstable(); 187 | roots_effective.dedup(); 188 | 189 | if roots_effective.is_empty() { 190 | bail!("No valid root features resolved from matched attributes."); 191 | } 192 | if verbose { 193 | eprintln!("[DEBUG] Total unique roots: {}", 
roots_effective.len()); 194 | } 195 | 196 | let blocks: Vec<(u32, u64, u64)> = gof.roots_to_offsets(&roots_effective, args.common.effective_threads()); 197 | 198 | if !args.common.entire_group|| args.common.types.is_some() { 199 | let allowed_roots: FxHashSet = roots_effective.iter().copied().collect(); 200 | 201 | let mut fid_to_root: FxHashMap = FxHashMap::default(); 202 | fid_to_root.reserve(fid_vec.len()); 203 | for (fid, r) in fid_vec.iter().copied().zip(root.iter().copied()) { 204 | if r != u32::MAX && allowed_roots.contains(&r) { 205 | fid_to_root.insert(fid, r); 206 | } 207 | } 208 | 209 | let mut per_root_matches: FxHashMap> = FxHashMap::default(); 210 | per_root_matches.reserve(roots_effective.len()); 211 | 212 | for (attr_val, fids) in &attr_to_fids { 213 | for &fid in fids { 214 | if let Some(&r) = fid_to_root.get(&fid) { 215 | per_root_matches.entry(r).or_default().insert(attr_val.clone()); 216 | } 217 | } 218 | } 219 | 220 | write_gff_output_filtered( 221 | gff_path, 222 | &blocks, 223 | &per_root_matches, 224 | &atn_attr_name, 225 | &args.common.output, 226 | args.common.types.as_deref(), 227 | verbose, 228 | )?; 229 | } else { 230 | let mut fid_to_root: FxHashMap = FxHashMap::default(); 231 | for (fid, r) in fid_vec.iter().copied().zip(root.clone().into_iter()) { 232 | if r != u32::MAX { 233 | fid_to_root.insert(fid, r); 234 | } 235 | } 236 | 237 | // Step 4: map roots -> (start, end) offsets from GOF 238 | // Use a cached index to avoid rebuilding a HashMap on every call. 239 | write_gff_output( 240 | gff_path, 241 | &blocks, 242 | &args.common.output, 243 | verbose, 244 | )?; 245 | } 246 | 247 | if verbose { 248 | eprintln!("[timing] Total elapsed: {:?}", overall_start.elapsed()); 249 | } 250 | 251 | Ok(()) 252 | } 253 | -------------------------------------------------------------------------------- /src/index_builder.rs: -------------------------------------------------------------------------------- 1 | pub mod core; 2 | pub use core::{build_index, write_binary_u32, write_gof, write_lines}; 3 | -------------------------------------------------------------------------------- /src/index_builder/core.rs: -------------------------------------------------------------------------------- 1 | use crate::append_suffix; 2 | use crate::{Interval, IntervalTree, save_multiple_trees, write_offsets_to_file}; 3 | use anyhow::anyhow; 4 | use anyhow::{Result, bail}; 5 | use byteorder::{LittleEndian, WriteBytesExt}; 6 | use indexmap::IndexMap; 7 | use memchr::memchr; 8 | use memmap2::Mmap; 9 | use regex::{Regex, escape}; 10 | use std::{fs::File, io::Write, path::PathBuf}; 11 | use rustc_hash::{FxHashMap, FxHashSet}; 12 | 13 | // Writes text lines to a file 14 | pub fn write_lines(path: PathBuf, lines: &[String]) -> Result<()> { 15 | let mut file = File::create(path)?; 16 | for line in lines { 17 | writeln!(file, "{}", line)?; 18 | } 19 | Ok(()) 20 | } 21 | 22 | // Writes u32 values in binary little-endian format 23 | pub fn write_binary_u32(path: PathBuf, values: &[u32]) -> Result<()> { 24 | let mut file = File::create(path)?; 25 | for &v in values { 26 | file.write_u32::(v)?; 27 | } 28 | Ok(()) 29 | } 30 | 31 | // Writes GFF offset records (gof) 32 | pub fn write_gof(file: &mut File, id: u32, seq_num: u32, start: u64, end: u64) -> Result<()> { 33 | file.write_u32::(id)?; // root feature id 34 | file.write_u32::(seq_num)?; // seq numeric id 35 | file.write_u64::(start)?; // start offset in GFF file 36 | file.write_u64::(end)?; // end offset in GFF file 37 | Ok(()) 38 | } 39 | 40 | /// 
Builds various index files for a GFF: .fts, .prt, .a2f, .atn, .sqs, .gof, .rit, .rix 41 | pub fn build_index(gff: &PathBuf, attr_key: &str, skip_types: &str, verbose: bool) -> Result<()> { 42 | // Compile regex patterns 43 | let id_re = Regex::new(r"ID=([^;\s]+)")?; 44 | let parent_re = Regex::new(r"Parent=([^;\s]+)")?; 45 | let attr_re = Regex::new(&format!(r"{}=([^;]+)", escape(attr_key)))?; 46 | 47 | let skip_types_set: FxHashSet<&str> = skip_types.split(',').collect(); 48 | 49 | if verbose { 50 | eprintln!("Building index for {} ...", gff.display()); 51 | } 52 | 53 | // Memory-map input file 54 | let file = File::open(gff)?; 55 | let mmap = unsafe { Mmap::map(&file)? }; 56 | let data = &mmap[..]; 57 | 58 | // First pass: parse raw features 59 | struct RawFeature { 60 | seqid: String, 61 | start: u32, 62 | end: u32, 63 | line_offset: u64, 64 | id: String, 65 | parent: Option, 66 | attr: Option, 67 | } 68 | let mut raw_features = Vec::new(); 69 | let mut offset = 0; 70 | 71 | while offset < data.len() { 72 | let nl_pos = memchr(b'\n', &data[offset..]) 73 | .map(|pos| pos + offset) 74 | .unwrap_or(data.len()); 75 | let line_bytes = &data[offset..nl_pos]; 76 | let line_offset = offset as u64; 77 | offset = nl_pos + 1; 78 | 79 | if line_bytes.is_empty() || line_bytes[0] == b'#' { 80 | continue; 81 | } 82 | let line = std::str::from_utf8(line_bytes)?.trim(); 83 | if line.is_empty() { 84 | continue; 85 | } 86 | 87 | let fields: Vec<&str> = line.split('\t').collect(); 88 | if fields.len() != 9 { 89 | bail!("Invalid GFF line (expected 9 columns): {}", line); 90 | } 91 | 92 | let seqid = fields[0].to_string(); 93 | let ftype = fields[2]; 94 | 95 | if skip_types_set.contains(ftype) { 96 | if verbose { 97 | println!("skip comment feature: {}", ftype); 98 | } 99 | continue; 100 | } 101 | 102 | let s1 = fields[3].parse::()?; 103 | let e1 = fields[4].parse::()?; 104 | if e1 == 0 { 105 | continue; 106 | } 107 | let (s1, e1) = if s1 > e1 { (e1, s1) } else { (s1, e1) }; 108 | let start = s1.saturating_sub(1); 109 | let end = e1; 110 | 111 | // Extract ID 112 | let id = id_re 113 | .captures(line) 114 | .ok_or_else(|| anyhow!("Missing ID in feature: {}", line))?[1] 115 | .to_string(); 116 | // Extract raw Parent (may refer to unseen ID) 117 | let parent = parent_re.captures(line).map(|cap| cap[1].to_string()); 118 | // Extract attribute value 119 | let attr = attr_re.captures(line).map(|cap| { 120 | let val = cap[1].to_string(); 121 | // GFF3 spec: attribute values must be URL-encoded. 122 | // Raw characters such as space, semicolon, or comma are not allowed. 
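        // e.g. `gene_name=ATP%20synthase` is spec-compliant, whereas
        // `gene_name=ATP synthase` triggers the warning below.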
123 |             if val.contains(' ') || val.contains(';') || val.contains(',') {
124 |                 eprintln!("[WARN] Attribute value contains characters (space, ';', ',') that should be URL-encoded: '{}'", val);
125 |             }
126 |             val
127 |         });
128 | 
129 |         raw_features.push(RawFeature {
130 |             seqid,
131 |             start,
132 |             end,
133 |             line_offset,
134 |             id,
135 |             parent,
136 |             attr,
137 |         });
138 |     }
139 | 
140 |     // Build feature_map: string ID -> numeric ID
141 |     let mut feature_map: FxHashMap<String, u32> = FxHashMap::default();
142 |     for (i, rf) in raw_features.iter().enumerate() {
143 |         feature_map.insert(rf.id.clone(), i as u32);
144 |     }
145 | 
146 |     // Open output files
147 |     let mut fts_file = File::create(append_suffix(gff, ".fts"))?;
148 |     let mut prt_entries = Vec::with_capacity(raw_features.len());
149 |     let mut a2f_entries = Vec::with_capacity(raw_features.len());
150 |     let mut atn_entries = Vec::new();
151 |     let mut attr_value_to_id: FxHashMap<String, u32> = FxHashMap::default();
152 |     let mut gof_file = File::create(append_suffix(gff, ".gof"))?;
153 |     let mut seqid_to_num: IndexMap<String, u32> = IndexMap::new();
154 |     let mut trees_input: IndexMap<u32, Vec<(u32, u32, u32)>> = IndexMap::new();
155 |     let mut next_seqid_num: u32 = 0;
156 |     let mut current_root: Option<(u32, u64, u32)> = None;
157 | 
158 |     // Write .fts and build .prt, .a2f, .gof, and seqid intervals
159 |     for rf in &raw_features {
160 |         let fid = feature_map[&rf.id];
161 |         writeln!(fts_file, "{}", rf.id)?;
162 |         // Resolve parent (fallback to self if missing)
163 |         let parent_id = rf
164 |             .parent
165 |             .as_ref()
166 |             .and_then(|p| feature_map.get(p).cloned())
167 |             .unwrap_or(fid);
168 |         prt_entries.push(parent_id);
169 |         // Record roots for GOF and intervals
170 |         if parent_id == fid {
171 |             let seqid_num = *seqid_to_num.entry(rf.seqid.clone()).or_insert_with(|| {
172 |                 let id = next_seqid_num;
173 |                 next_seqid_num += 1;
174 |                 id
175 |             });
176 | 
177 |             trees_input
178 |                 .entry(seqid_num)
179 |                 .or_default()
180 |                 .push((rf.start, rf.end, fid));
181 | 
182 |             if let Some((old_id, old_off, old_seqid_num)) = current_root.take() {
183 |                 write_gof(&mut gof_file, old_id, old_seqid_num, old_off, rf.line_offset)?;
184 |             }
185 |             current_root = Some((fid, rf.line_offset, seqid_num));
186 |         }
187 | 
188 |         // Attribute mapping
189 |         if let Some(val) = &rf.attr {
190 |             let aid = *attr_value_to_id.entry(val.clone()).or_insert_with(|| {
191 |                 let a = atn_entries.len() as u32;
192 |                 atn_entries.push(val.clone());
193 |                 a
194 |             });
195 |             a2f_entries.push(aid);
196 |         } else {
197 |             a2f_entries.push(u32::MAX);
198 |         }
199 |     }
200 |     // Write final GOF record
201 |     if let Some((last_id, last_off, last_seqid_num)) = current_root {
202 |         write_gof(&mut gof_file, last_id, last_seqid_num, last_off, data.len() as u64)?;
203 |     }
204 | 
205 |     // Build interval trees per seqid
206 |     let mut trees = Vec::with_capacity(seqid_to_num.len());
207 |     for (_seqid, seqid_num) in &seqid_to_num {
208 |         let ivs = &trees_input[seqid_num];
209 |         let iv_structs: Vec<Interval<u32>> = ivs
210 |             .iter()
211 |             .map(|&(start, end, fid)| Interval {
212 |                 start,
213 |                 end,
214 |                 root_fid: fid,
215 |             })
216 |             .collect();
217 |         trees.push(IntervalTree::new(iv_structs));
218 |     }
219 | 
220 |     // Write .rit and .rix
221 |     let rit = append_suffix(gff, ".rit");
222 |     let rix = append_suffix(gff, ".rix");
223 |     let offsets = save_multiple_trees(&trees, rit.as_path())?;
224 |     write_offsets_to_file(&offsets, rix.as_path())?;
225 | 
226 |     // Write .sqs (sequence list)
227 |     let seqids: Vec<String> = seqid_to_num.keys().cloned().collect();
228 |     write_lines(append_suffix(gff, ".sqs"), &seqids)?;
229 | 
230 |     // Write .atn, .a2f, .prt
231 |     let mut atn_out = Vec::with_capacity(atn_entries.len() + 1);
232 |     atn_out.push(format!("#attribute={}", attr_key));
233 |     atn_out.extend(atn_entries.clone());
234 |     write_lines(append_suffix(gff, ".atn"), &atn_out)?;
235 |     write_binary_u32(append_suffix(gff, ".a2f"), &a2f_entries)?;
236 |     write_binary_u32(append_suffix(gff, ".prt"), &prt_entries)?;
237 | 
238 |     if verbose {
239 |         eprintln!("Index built successfully for {}", gff.display());
240 |     }
241 |     Ok(())
242 | }
243 | 
--------------------------------------------------------------------------------
/src/index_loader.rs:
--------------------------------------------------------------------------------
1 | pub mod core;
2 | pub mod gof;
3 | pub mod fts;
4 | pub mod prt;
5 | pub mod a2f;
6 | 
7 | pub use core::{load_atn, load_sqs, safe_mmap_readonly};
8 | pub use gof::{GofMap, load_gof};
9 | pub use fts::{FtsMap, load_fts};
10 | pub use prt::{PrtMap, load_prt};
11 | pub use a2f::{A2fMap, load_a2f};
12 | 
--------------------------------------------------------------------------------
/src/index_loader/a2f.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{bail, Context, Result};
2 | use byteorder::{ByteOrder, LittleEndian};
3 | use rustc_hash::{FxHashMap, FxHashSet};
4 | use std::path::Path;
5 | 
6 | use crate::{append_suffix, safe_mmap_readonly};
7 | 
8 | /// A2fMap stores two indexes:
9 | /// - `aid_to_fids`: mapping from Attribute ID (AID) to all Feature IDs (FIDs) that reference it.
10 | /// - `fid_to_aid`: reverse mapping from Feature ID (FID) to its Attribute ID (if any).
11 | ///
12 | /// The `.a2f` file on disk is encoded as `fid -> aid` (one u32 per fid).
13 | /// During loading, we construct the reverse map `aid -> fids` for efficient queries.
14 | #[derive(Debug)]
15 | pub struct A2fMap {
16 |     aid_to_fids: FxHashMap<u32, Vec<u32>>,
17 |     fid_to_aid: Vec<Option<u32>>,
18 | }
19 | 
20 | impl A2fMap {
21 |     pub fn new(aid_to_fids: FxHashMap<u32, Vec<u32>>, fid_to_aid: Vec<Option<u32>>) -> Self {
22 |         Self { aid_to_fids, fid_to_aid }
23 |     }
24 | 
25 |     /// Get all FIDs associated with a given AID.
26 |     #[inline]
27 |     pub fn fids_for_aid(&self, aid: u32) -> Option<&[u32]> {
28 |         self.aid_to_fids.get(&aid).map(|v| v.as_slice())
29 |     }
30 | 
31 |     /// Get the AID associated with a given FID (if any).
32 |     #[inline]
33 |     pub fn aid_for_fid(&self, fid: u32) -> Option<u32> {
34 |         self.fid_to_aid.get(fid as usize).and_then(|x| *x)
35 |     }
36 | 
37 |     /// Map a set of AIDs into a set of FIDs (deduplicated, no order guaranteed).
38 |     #[inline]
39 |     pub fn map_aids_to_fids_set(&self, aids: &FxHashSet<u32>) -> FxHashSet<u32> {
40 |         let mut out = FxHashSet::default();
41 |         for &aid in aids {
42 |             if let Some(fids) = self.aid_to_fids.get(&aid) {
43 |                 out.extend(fids.iter().copied());
44 |             } else {
45 |                 eprintln!("[WARN] AID {} not found (no FIDs).", aid);
46 |             }
47 |         }
48 |         out
49 |     }
50 | 
51 |     /// Map a list of AIDs into a combined Vec of FIDs.
52 |     /// The caller should sort/deduplicate if a stable order is needed.
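    ///
    /// A minimal usage sketch (hypothetical AIDs; assumes the `.a2f` file was
    /// built by `gffx index` for the given GFF):
    /// ```no_run
    /// # fn main() -> anyhow::Result<()> {
    /// let a2f = gffx::load_a2f("genes.gff")?;
    /// let mut fids = a2f.map_aids_to_fids_vec(&[0, 3]);
    /// fids.sort_unstable();
    /// fids.dedup();
    /// # Ok(()) }
    /// ```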
53 |     #[inline]
54 |     pub fn map_aids_to_fids_vec(&self, aids: &[u32]) -> Vec<u32> {
55 |         let mut out = Vec::new();
56 |         for &aid in aids {
57 |             if let Some(fids) = self.aid_to_fids.get(&aid) {
58 |                 out.extend_from_slice(fids);
59 |             } else {
60 |                 eprintln!("[WARN] AID {} not found (no FIDs).", aid);
61 |             }
62 |         }
63 |         out
64 |     }
65 | 
66 |     #[inline]
67 |     pub fn len_fids(&self) -> usize { self.fid_to_aid.len() }
68 | 
69 |     #[inline]
70 |     pub fn is_empty(&self) -> bool { self.fid_to_aid.is_empty() }
71 | }
72 | 
73 | /// Load `.a2f` file and build A2fMap:
74 | /// - On disk: each 4-byte little-endian u32 represents the AID for a given FID.
75 | /// - Value `u32::MAX` means "no attribute" (None).
76 | /// - In memory: build both `fid -> aid` (vector) and `aid -> fids` (hashmap).
77 | pub fn load_a2f<P: AsRef<Path>>(gff_path: P) -> Result<A2fMap> {
78 |     let path = gff_path.as_ref();
79 |     let a2f_path = append_suffix(path, ".a2f");
80 | 
81 |     let mmap = safe_mmap_readonly(&a2f_path)
82 |         .with_context(|| format!("Failed to mmap {}", a2f_path.display()))?;
83 | 
84 |     if mmap.len() % 4 != 0 {
85 |         bail!(
86 |             "Corrupted A2F ({}): length {} not aligned to u32",
87 |             a2f_path.display(),
88 |             mmap.len()
89 |         );
90 |     }
91 | 
92 |     let n = mmap.len() / 4;
93 |     let mut fid_to_aid: Vec<Option<u32>> = Vec::with_capacity(n);
94 |     let mut aid_to_fids: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
95 | 
96 |     for (fid, chunk) in mmap.chunks_exact(4).enumerate() {
97 |         let raw = LittleEndian::read_u32(chunk);
98 |         let aid = if raw == u32::MAX { None } else { Some(raw) };
99 |         fid_to_aid.push(aid);
100 | 
101 |         if let Some(aid_val) = aid {
102 |             aid_to_fids.entry(aid_val).or_default().push(fid as u32);
103 |         }
104 |     }
105 | 
106 |     // Optional: deduplicate and sort FID lists
107 |     for fids in aid_to_fids.values_mut() {
108 |         fids.sort_unstable();
109 |         fids.dedup();
110 |     }
111 | 
112 |     Ok(A2fMap::new(aid_to_fids, fid_to_aid))
113 | }
114 | 
--------------------------------------------------------------------------------
/src/index_loader/core.rs:
--------------------------------------------------------------------------------
1 | use crate::append_suffix;
2 | use anyhow::{Context, Result, bail};
3 | use memmap2::Mmap;
4 | use rustc_hash::FxHashMap;
5 | use std::{
6 |     // collections::HashMap,
7 |     fs::File,
8 |     io::{BufRead, BufReader},
9 |     path::Path,
10 | };
11 | 
12 | 
13 | 
14 | pub fn safe_mmap_readonly(path: &Path) -> Result<Mmap> {
15 |     let file = File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
16 |     unsafe { Mmap::map(&file) }.with_context(|| format!("Failed to mmap file: {:?}", path))
17 | }
18 | 
19 | pub fn load_sqs<P: AsRef<Path>>(path: P) -> Result<(Vec<String>, FxHashMap<String, u32>)> {
20 |     let path = path.as_ref();
21 |     let sqs_path = append_suffix(path, ".sqs");
22 |     let file = File::open(&sqs_path)
23 |         .with_context(|| format!("Failed to open SQS file: {:?}", &sqs_path))?;
24 |     let reader = BufReader::new(file);
25 | 
26 |     let id_to_name: Vec<String> = reader.lines().collect::<Result<_, _>>()?;
27 |     let name_to_id: FxHashMap<_, _> = id_to_name
28 |         .iter()
29 |         .enumerate()
30 |         .map(|(id, name)| (name.clone(), id as u32))
31 |         .collect();
32 | 
33 |     Ok((id_to_name, name_to_id))
34 | }
35 | 
36 | 
37 | pub fn load_atn(path: &Path) -> Result<(String, Vec<String>)> {
38 |     let atn_path = append_suffix(path, ".atn");
39 |     let mmap = safe_mmap_readonly(&atn_path)?;
40 |     let data = &mmap[..];
41 | 
42 |     let mut values = Vec::new();
43 |     let mut attr_name: Option<String> = None;
44 | 
45 |     // Helper to process a single line (without trailing '\n')
46 |     let mut push_line = |bytes: &[u8]| -> Result<()> {
47 |         if bytes.is_empty() {
48 |             return Ok(());
49 |         }
50 |         let mut line = std::str::from_utf8(bytes)
51 |             .context("ATN contains invalid UTF-8")?
52 |             .trim();
53 | 
54 |         // Strip UTF-8 BOM if it appears at the start of the very first line
55 |         if attr_name.is_none() && line.starts_with('\u{feff}') {
56 |             line = &line['\u{feff}'.len_utf8()..];
57 |         }
58 | 
59 |         if let Some(rest) = line.strip_prefix("#attribute=") {
60 |             // Only a single header is allowed
61 |             if attr_name.is_some() {
62 |                 bail!("Multiple #attribute= headers found in .atn file");
63 |             }
64 |             attr_name = Some(rest.to_string());
65 |         } else if !line.is_empty() && !line.starts_with('#') {
66 |             // Collect non-empty, non-comment value lines
67 |             values.push(line.to_string());
68 |         }
69 |         Ok(())
70 |     };
71 | 
72 |     // Scan by '\n'
73 |     let mut start = 0usize;
74 |     for (i, &b) in data.iter().enumerate() {
75 |         if b == b'\n' {
76 |             push_line(&data[start..i])?;
77 |             start = i + 1;
78 |         }
79 |     }
80 |     // Handle a trailing line without '\n'
81 |     if start < data.len() {
82 |         push_line(&data[start..])?;
83 |     }
84 | 
85 |     let attr_name = attr_name
86 |         .ok_or_else(|| anyhow::anyhow!("Missing #attribute=... header in .atn file"))?;
87 | 
88 |     Ok((attr_name, values))
89 | }
--------------------------------------------------------------------------------
/src/index_loader/fts.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Context, Result};
2 | use rustc_hash::{FxHashMap, FxHashSet};
3 | use std::path::Path;
4 | use std::sync::OnceLock; // lazy cache
5 | use rayon::prelude::*;
6 | use crate::{append_suffix, safe_mmap_readonly};
7 | 
8 | #[derive(Debug)]
9 | pub struct FtsMap {
10 |     pub ids: Vec<String>,
11 |     /// String -> numeric ID (u32)
12 |     index_fwd: OnceLock<FxHashMap<String, u32>>,
13 | }
14 | 
15 | impl FtsMap {
16 |     fn build_fwd(&self) -> FxHashMap<String, u32> {
17 |         let mut m = FxHashMap::with_capacity_and_hasher(self.ids.len(), Default::default());
18 |         for (i, s) in self.ids.iter().enumerate() {
19 |             m.insert(s.clone(), i as u32);
20 |         }
21 |         m
22 |     }
23 | 
24 |     /// Access forward index (String -> u32)
25 |     pub fn index_fwd(&self) -> &FxHashMap<String, u32> {
26 |         self.index_fwd.get_or_init(|| self.build_fwd())
27 |     }
28 | 
29 |     /// Convert string ID to numeric fid
30 |     pub fn get_fid(&self, id: &str) -> Option<u32> {
31 |         self.index_fwd().get(id).copied()
32 |     }
33 | 
34 |     /// Convert numeric fid to string ID (direct from ids vec)
35 |     pub fn get_id(&self, fid: u32) -> Option<&str> {
36 |         self.ids.get(fid as usize).map(|s| s.as_str())
37 |     }
38 | 
39 |     /// Map a set of feature names to their fids.
40 |     /// Returns: (found_fids, missing_names)
41 |     pub fn map_fnames_to_fids(
42 |         &self,
43 |         feature_names: &FxHashSet<String>,
44 |         threads: usize,
45 |     ) -> (FxHashSet<u32>, Vec<String>) {
46 |         enum Either<L, R> { Left(L), Right(R) }
47 | 
48 |         let idx = self.index_fwd();
49 | 
50 |         let mapper = |fname: &String| {
51 |             if let Some(&fid) = idx.get(fname) {
52 |                 Either::Left(fid)
53 |             } else {
54 |                 Either::Right(fname.clone())
55 |             }
56 |         };
57 | 
58 |         if threads > 1 && feature_names.len() > 2 {
59 |             feature_names
60 |                 .par_iter()
61 |                 .map(mapper)
62 |                 .fold(
63 |                     || (FxHashSet::default(), Vec::new()),
64 |                     |mut acc, e| {
65 |                         match e {
66 |                             Either::Left(n) => { acc.0.insert(n); }
67 |                             Either::Right(s) => { acc.1.push(s); }
68 |                         }
69 |                         acc
70 |                     },
71 |                 )
72 |                 .reduce(
73 |                     || (FxHashSet::default(), Vec::new()),
74 |                     |mut a, b| {
75 |                         a.0.extend(b.0);
76 |                         a.1.extend(b.1);
77 |                         a
78 |                     },
79 |                 )
80 |         } else {
81 |             let mut set = FxHashSet::default();
82 |             set.reserve(feature_names.len());
83 |             let mut miss = Vec::new();
84 |             for fname in feature_names {
85 |                 if let Some(&fid) = idx.get(fname) {
86 |                     set.insert(fid);
87 |                 } else {
88 |                     miss.push(fname.clone());
89 |                 }
90 |             }
91 |             (set, miss)
92 |         }
93 |     }
94 | }
95 | 
96 | /// Load `.fts` file into FtsMap
97 | pub fn load_fts<P: AsRef<Path>>(gff_path: P) -> Result<FtsMap> {
98 |     let path = gff_path.as_ref();
99 |     let fts_path = append_suffix(path, ".fts");
100 | 
101 |     let mmap = safe_mmap_readonly(&fts_path)
102 |         .with_context(|| format!("Failed to mmap {}", fts_path.display()))?;
103 |     let data = &mmap[..];
104 | 
105 |     let mut lines = Vec::new();
106 |     let mut start = 0;
107 | 
108 |     for (i, &b) in data.iter().enumerate() {
109 |         if b == b'\n' {
110 |             let mut slice = &data[start..i];
111 |             if !slice.is_empty() {
112 |                 if slice.ends_with(b"\r") {
113 |                     slice = &slice[..slice.len() - 1];
114 |                 }
115 |                 let s = std::str::from_utf8(slice)
116 |                     .with_context(|| format!("FTS contains invalid UTF-8 at byte {}", start))?;
117 |                 lines.push(s.to_string());
118 |             }
119 |             start = i + 1;
120 |         }
121 |     }
122 |     if start < data.len() {
123 |         let mut slice = &data[start..];
124 |         if !slice.is_empty() {
125 |             if slice.ends_with(b"\r") {
126 |                 slice = &slice[..slice.len() - 1];
127 |             }
128 |             let s = std::str::from_utf8(slice)
129 |                 .with_context(|| format!("FTS contains invalid UTF-8 at byte {}", start))?;
130 |             lines.push(s.to_string());
131 |         }
132 |     }
133 | 
134 |     Ok(FtsMap {
135 |         ids: lines,
136 |         index_fwd: OnceLock::new(),
137 |     })
138 | }
139 | 
--------------------------------------------------------------------------------
/src/index_loader/gof.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{bail, Context, Result};
2 | use byteorder::{ByteOrder, LittleEndian};
3 | use rustc_hash::FxHashMap;
4 | use std::{path::Path, sync::OnceLock};
5 | use crate::{append_suffix, safe_mmap_readonly};
6 | 
7 | const MISSING: u64 = u64::MAX; // Sentinel value for missing entries
8 | 
9 | #[derive(Debug)]
10 | pub struct GofEntry {
11 |     pub feature_id: u32,
12 |     pub seqid_num: u32,
13 |     pub start_offset: u64,
14 |     pub end_offset: u64,
15 | }
16 | 
17 | 
18 | 
19 | #[derive(Debug)]
20 | pub struct GofMap {
21 |     /// Flat list of (feature_id, seqid_num, start, end) entries as read from .gof
22 |     pub entries: Vec<GofEntry>,
23 |     /// Lazy, thread-safe cache: feature_id -> (start, end)
24 |     index_cache: OnceLock<FxHashMap<u32, (u64, u64)>>,
25 |     pub seqid_index: FxHashMap<u32, Vec<usize>>, // seqid_num -> entry indices
26 | 
27 | }
28 | 
29 | impl GofMap {
30 |     /// Build a transient index (allocates on every call).
31 |     /// Prefer `index_cached()` for hot paths.
32 |     pub fn index(&self) -> FxHashMap<u32, (u64, u64)> {
33 |         self.entries
34 |             .iter()
35 |             .map(|e| (e.feature_id, (e.start_offset, e.end_offset)))
36 |             .collect()
37 |     }
38 | 
39 |     /// Get (or build once) the cached index.
40 |     #[inline]
41 |     pub fn index_cached(&self) -> &FxHashMap<u32, (u64, u64)> {
42 |         self.index_cache.get_or_init(|| self.index())
43 |     }
44 | 
45 |     /// O(1) lookup using the cached index.
46 |     #[inline]
47 |     pub fn get(&self, fid: u32) -> Option<&(u64, u64)> {
48 |         self.index_cached().get(&fid)
49 |     }
50 | 
51 |     /// Map a list of root IDs to their (start, end) offsets using the cached index.
52 |     /// Roots missing from the index are kept in the output, with `u64::MAX` sentinel offsets.
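    ///
    /// A minimal sketch (hypothetical root IDs; assumes a `.gof` file built by
    /// `gffx index`):
    /// ```no_run
    /// # fn main() -> anyhow::Result<()> {
    /// let gof = gffx::load_gof("genes.gff")?;
    /// for (fid, start, end) in gof.roots_to_offsets(&[0, 42], 1) {
    ///     if start != u64::MAX {
    ///         println!("root {fid} spans bytes {start}..{end}");
    ///     }
    /// }
    /// # Ok(()) }
    /// ```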
53 |     #[inline]
54 |     pub fn roots_to_offsets(
55 |         &self,
56 |         roots: &[u32],
57 |         threads: usize,
58 |     ) -> Vec<(u32, u64, u64)> {
59 |         let idx = self.index_cached();
60 | 
61 |         // Simple heuristic: parallelize only for large inputs
62 |         let should_parallel = threads > 1 && roots.len() > 2048;
63 | 
64 |         if should_parallel {
65 |             use rayon::prelude::*;
66 |             roots
67 |                 .par_iter()
68 |                 .map(|&r| match idx.get(&r) {
69 |                     Some(&(s, e)) => (r, s, e),
70 |                     None => (r, MISSING, MISSING), // use u64::MAX as sentinel value
71 |                 })
72 |                 .collect()
73 |         } else {
74 |             let mut out = Vec::with_capacity(roots.len());
75 |             for &r in roots {
76 |                 if let Some(&(s, e)) = idx.get(&r) {
77 |                     out.push((r, s, e));
78 |                 } else {
79 |                     out.push((r, MISSING, MISSING)); // use u64::MAX as sentinel value
80 |                 }
81 |             }
82 |             out
83 |         }
84 |     }
85 | 
86 |     pub fn roots_for_seqid(&self, seqid_num: u32) -> Vec<&GofEntry> {
87 |         match self.seqid_index.get(&seqid_num) {
88 |             Some(indices) => indices.iter().map(|&i| &self.entries[i]).collect(),
89 |             None => Vec::new(),
90 |         }
91 |     }
92 | }
93 | 
94 | /// Load a `.gof` file containing (u32 fid, u32 seqid_num, u64 start, u64 end) records.
95 | pub fn load_gof<P: AsRef<Path>>(gff_path: P) -> Result<GofMap> {
96 |     let path = gff_path.as_ref();
97 |     let gof_path = append_suffix(path, ".gof");
98 |     let mmap = safe_mmap_readonly(&gof_path)
99 |         .with_context(|| format!("Failed to mmap {}", gof_path.display()))?;
100 |     let bytes = &mmap[..];
101 |     const REC_SIZE: usize = 4 + 4 + 8 + 8;
102 | 
103 |     if bytes.len() % REC_SIZE != 0 {
104 |         bail!(
105 |             "Corrupted GOF ({}): length {} not multiple of {}",
106 |             gof_path.display(),
107 |             bytes.len(),
108 |             REC_SIZE
109 |         );
110 |     }
111 | 
112 |     let mut entries = Vec::with_capacity(bytes.len() / REC_SIZE);
113 |     let mut seqid_index: FxHashMap<u32, Vec<usize>> = FxHashMap::default();
114 |     for (i, rec) in bytes.chunks_exact(REC_SIZE).enumerate() {
115 |         let fid = LittleEndian::read_u32(&rec[0..4]);
116 |         let seqid_num = LittleEndian::read_u32(&rec[4..8]);
117 |         let start = LittleEndian::read_u64(&rec[8..16]);
118 |         let end = LittleEndian::read_u64(&rec[16..24]);
119 |         entries.push(GofEntry { feature_id: fid, seqid_num, start_offset: start, end_offset: end });
120 |         seqid_index.entry(seqid_num).or_default().push(i);
121 |     }
122 | 
123 |     Ok(GofMap {
124 |         entries,
125 |         index_cache: OnceLock::new(),
126 |         seqid_index,
127 |     })
128 | }
129 | 
--------------------------------------------------------------------------------
/src/index_loader/prt.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{bail, Result};
2 | use byteorder::{ByteOrder, LittleEndian};
3 | use rustc_hash::FxHashMap;
4 | use std::{path::Path, sync::OnceLock};
5 | use rayon::prelude::*; // Parallel iteration (no feature gate)
6 | 
7 | use crate::{append_suffix, safe_mmap_readonly};
8 | 
9 | #[derive(Debug, Clone, Copy)]
10 | pub struct PrtEntry {
11 |     /// Child node id
12 |     pub child: u32,
13 |     /// Parent node id
14 |     pub parent: u32,
15 | }
16 | 
17 | #[derive(Debug)]
18 | pub struct PrtMap {
19 |     /// Flat list of (child -> parent) entries, where index i is the child id
20 |     pub entries: Vec<PrtEntry>,
21 |     /// Lazy, thread-safe cache for the child -> parent index
22 |     index_cache: OnceLock<FxHashMap<u32, u32>>,
23 | }
24 | 
25 | impl PrtMap {
26 |     /// Construct a PrtMap from a list of entries.
27 |     pub fn new(entries: Vec<PrtEntry>) -> Self {
28 |         Self {
29 |             entries,
30 |             index_cache: OnceLock::new(),
31 |         }
32 |     }
33 | 
34 |     /// Build a child -> parent hashmap for O(1) lookups (allocates on every call).
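    ///
    /// A minimal sketch with hand-built entries (illustrative IDs only):
    /// ```
    /// use gffx::index_loader::prt::{PrtEntry, PrtMap};
    /// let prt = PrtMap::new(vec![
    ///     PrtEntry { child: 0, parent: 0 }, // self-parented root
    ///     PrtEntry { child: 1, parent: 0 },
    /// ]);
    /// assert_eq!(prt.index_cached().get(&1), Some(&0));
    /// ```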
35 |     pub fn index(&self) -> FxHashMap<u32, u32> {
36 |         self.entries.iter().map(|e| (e.child, e.parent)).collect()
37 |     }
38 | 
39 |     /// Get the cached child -> parent index, building it once on first use.
40 |     pub fn index_cached(&self) -> &FxHashMap<u32, u32> {
41 |         self.index_cache.get_or_init(|| self.index())
42 |     }
43 | 
44 |     /// Small helper: get the parent of a child via linear scan (O(n)).
45 |     pub fn get_parent(&self, child: u32) -> Option<u32> {
46 |         self.entries
47 |             .iter()
48 |             .find(|e| e.child == child)
49 |             .map(|e| e.parent)
50 |     }
51 | 
52 |     /// Resolve the root of a node by following parent pointers via *array access* (fast path).
53 |     #[inline]
54 |     fn resolve_root(&self, start: u32) -> (u32, bool) {
55 |         let n = self.entries.len() as u32;
56 |         let mut cur = start;
57 | 
58 |         loop {
59 |             if cur >= n {
60 |                 return (u32::MAX, true);
61 |             }
62 |             let p = self.entries[cur as usize].parent;
63 | 
64 |             if p == cur {
65 |                 return (cur, false);
66 |             }
67 |             if p >= n {
68 |                 return (u32::MAX, true);
69 |             }
70 |             cur = p;
71 |         }
72 |     }
73 | 
74 |     /// Map a Vec<u32> of FIDs to a Vec<u32> of root FIDs using the fast resolver.
75 |     /// - If a FID equals `u32::MAX`, keep it as-is (sentinel).
76 |     /// - Output order matches the input order.
77 |     #[inline]
78 |     pub fn map_fids_to_roots(&self, fids: &Vec<u32>, threads: usize) -> Vec<u32> {
79 |         let should_parallel = threads > 1 && fids.len() > 256; // tune threshold as needed
80 | 
81 |         if should_parallel {
82 |             fids.par_iter()
83 |                 .map(|&fid| {
84 |                     if fid == u32::MAX {
85 |                         u32::MAX
86 |                     } else {
87 |                         self.resolve_root(fid).0
88 |                     }
89 |                 })
90 |                 .collect()
91 |         } else {
92 |             let mut out = Vec::with_capacity(fids.len());
93 |             for &fid in fids {
94 |                 if fid == u32::MAX {
95 |                     out.push(u32::MAX);
96 |                 } else {
97 |                     out.push(self.resolve_root(fid).0);
98 |                 }
99 |             }
100 |             out
101 |         }
102 |     }
103 | }
104 | 
105 | /// Load a `.prt` file that encodes parent pointers as a u32 array.
106 | /// Each 4-byte little-endian word is the parent id of the child at the same index.
107 | /// For child i, parent = data[i].
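///
/// A minimal usage sketch (hypothetical path; `u32::MAX` inputs pass through
/// as sentinels):
/// ```no_run
/// # fn main() -> anyhow::Result<()> {
/// let prt = gffx::load_prt("genes.gff")?;
/// let roots = prt.map_fids_to_roots(&vec![0, 7, u32::MAX], 1);
/// # Ok(()) }
/// ```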
108 | pub fn load_prt<P: AsRef<Path>>(gff_path: P) -> Result<PrtMap> {
109 |     let path = gff_path.as_ref();
110 |     let prt_path = append_suffix(path, ".prt");
111 |     let mmap = safe_mmap_readonly(&prt_path)?;
112 |     if mmap.len() % 4 != 0 {
113 |         bail!("Corrupted PRT: not aligned to u32");
114 |     }
115 | 
116 |     let mut entries = Vec::with_capacity(mmap.len() / 4);
117 |     for (i, chunk) in mmap.chunks_exact(4).enumerate() {
118 |         let parent = LittleEndian::read_u32(chunk);
119 |         entries.push(PrtEntry {
120 |             child: i as u32,
121 |             parent,
122 |         });
123 |     }
124 |     Ok(PrtMap::new(entries))
125 | }
126 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | // src/lib.rs
2 | pub mod commands;
3 | pub mod index_builder;
4 | pub mod index_loader;
5 | pub mod utils;
6 | 
7 | pub use index_builder::core::build_index;
8 | pub use index_loader::{
9 |     core::{load_atn, load_sqs, safe_mmap_readonly},
10 |     gof::{GofMap, load_gof},
11 |     fts::{FtsMap, load_fts},
12 |     prt::{PrtMap, load_prt},
13 |     a2f::{A2fMap, load_a2f}
14 | };
15 | 
16 | 
17 | pub use utils::common::{
18 |     CommonArgs, append_suffix, check_index_files_exist, write_gff_output, write_gff_output_filtered,
19 | };
20 | pub use utils::tree_io::{save_multiple_trees, write_offsets_to_file};
21 | pub use utils::tree::{Interval, IntervalTree};
22 | pub use utils::tree_index::TreeIndexData;
23 | 
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use clap::{Parser, Subcommand};
3 | use gffx::commands::*;
4 | 
5 | #[derive(Parser)]
6 | #[command(
7 |     name = "gffx",
8 |     version,
9 |     about = concat!("GFFx: An ultra-fast feature extractor for GFF files\nVersion: ", env!("CARGO_PKG_VERSION")),
10 |     propagate_version = true
11 | )]
12 | struct Cli {
13 |     #[command(subcommand)]
14 |     command: Commands,
15 | }
16 | 
17 | #[derive(Subcommand)]
18 | enum Commands {
19 |     Index(IndexArgs),
20 |     Intersect(IntersectArgs),
21 |     Extract(ExtractArgs),
22 |     Search(SearchArgs),
23 |     Coverage(CoverageArgs),
24 |     Depth(DepthArgs),
25 |     Sample(SampleArgs),
26 | }
27 | 
28 | fn main() -> Result<()> {
29 |     let cli = Cli::parse();
30 | 
31 |     match cli.command {
32 |         Commands::Index(args) => run_index(&args)?,
33 |         Commands::Intersect(args) => run_intersect(&args)?,
34 |         Commands::Extract(args) => run_extract(&args)?,
35 |         Commands::Search(args) => run_search(&args)?,
36 |         Commands::Coverage(args) => run_coverage(&args)?,
37 |         Commands::Depth(args) => run_depth(&args)?,
38 |         Commands::Sample(args) => run_sample(&args)?,
39 |     }
40 | 
41 |     Ok(())
42 | }
43 | 
--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
1 | pub mod common;
2 | pub mod tree;
3 | pub mod tree_io;
4 | pub mod tree_index;
5 | 
6 | pub use tree::{Interval, IntervalTree};
7 | pub use tree_index::TreeIndexData;
--------------------------------------------------------------------------------
/src/utils/common.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Result, Context};
2 | use clap::{Parser, CommandFactory};
3 | use clap::error::ErrorKind;
4 | use memchr::{memchr, memmem};
5 | use memmap2::Mmap;
6 | use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
7 | use rustc_hash::{FxHashMap, FxHashSet};
8 | use std::{
9 |     fs::File,
10 |     io::{BufWriter, IoSlice, Write, stdout},
11 |     path::{Path, PathBuf},
12 |     str,
13 | };
14 | 
15 | const MISSING: u64 = u64::MAX; // Sentinel value for missing entries
16 | 
17 | #[derive(Debug, Clone, Parser)]
18 | pub struct CommonArgs {
19 |     /// Input GFF file path
20 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
21 |     pub input: PathBuf,
22 | 
23 |     /// Output file (stdout if not provided)
24 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
25 |     pub output: Option<PathBuf>,
26 | 
27 |     /// Return the entire feature group for each match (entire-group mode); default: only the matched feature (per-feature mode).
28 |     #[arg(short = 'e', long = "entire_group", default_value_t = false)]
29 |     pub entire_group: bool,
30 | 
31 |     /// Comma-separated feature types to retain (e.g. exon,gene); only effective in per-feature mode
32 |     #[arg(short = 'T', long = "types", value_name = "TYPES")]
33 |     pub types: Option<String>,
34 | 
35 |     /// Number of threads for parallel processing
36 |     #[arg(
37 |         short = 't',
38 |         long = "threads",
39 |         default_value_t = 12,
40 |         value_name = "NUM",
41 |     )]
42 |     pub threads: usize,
43 | 
44 |     /// Enable verbose output
45 |     #[arg(
46 |         short = 'v',
47 |         long = "verbose",
48 |         default_value_t = false,
49 |         value_name = "BOOL",
50 |     )]
51 |     pub verbose: bool,
52 | }
53 | 
54 | impl CommonArgs {
55 |     /// Return the number of effective threads:
56 |     /// - If the user sets `--threads 0`, use all available cores
57 |     /// - Otherwise, use the user-specified number
58 |     #[inline]
59 |     pub fn effective_threads(&self) -> usize {
60 |         if self.threads == 0 {
61 |             std::thread::available_parallelism()
62 |                 .map(|n| n.get())
63 |                 .unwrap_or(1)
64 |         } else {
65 |             self.threads
66 |         }
67 |     }
68 | 
69 |     /// Initialize the rayon global thread pool
70 |     /// - Uses `effective_threads()` to decide the number of threads
71 |     /// - Prints info/warning if verbose mode is enabled
72 |     pub fn init_rayon(&self) {
73 |         let n = self.effective_threads();
74 |         match rayon::ThreadPoolBuilder::new().num_threads(n).build_global() {
75 |             Ok(()) => {
76 |                 if self.verbose {
77 |                     eprintln!("[INFO] rayon threads = {}", n);
78 |                 }
79 |             }
80 |             Err(e) => {
81 |                 if self.verbose {
82 |                     eprintln!("[WARN] rayon global pool already initialized: {e}");
83 |                 }
84 |             }
85 |         }
86 |     }
87 | 
88 |     /// Post-parse hook:
89 |     /// - Validate argument combinations
90 |     /// - Print info messages
91 |     /// - Initialize rayon
92 |     pub fn post_parse(&self) -> Result<(), clap::Error> {
93 |         // Custom conflict error: entire-group mode cannot be combined with --types
94 |         if self.entire_group && self.types.is_some() {
95 |             return Err(
96 |                 clap::Error::raw(
97 |                     ErrorKind::ArgumentConflict,
98 |                     "Entire-group mode does not support filtering by feature types (-T/--types).",
99 |                 )
100 |                 .with_cmd(&Self::command())
101 |             );
102 |         }
103 | 
104 |         // Initialize rayon after validation
105 |         self.init_rayon();
106 | 
107 |         Ok(())
108 |     }
109 | 
110 |     /// Combined parse + post-parse helper:
111 |     /// - Parses CLI arguments
112 |     /// - Runs `post_parse()`
113 |     /// - Exits with an error if validation fails
114 |     pub fn parse_and_init() -> Self {
115 |         let args = Self::parse();
116 |         if let Err(e) = args.post_parse() {
117 |             e.exit();
118 |         }
119 |         args
120 |     }
121 | }
122 | 
123 | pub fn append_suffix(path: &Path, suffix: &str) -> PathBuf {
124 |     let parent = path.parent().unwrap_or_else(|| Path::new(""));
125 |     let filename = path.file_name().unwrap_or_default().to_string_lossy();
126 |     parent.join(format!("{filename}{suffix}"))
127 | }
128 | 
129 | /// Write GFF header lines (starting with '#') to output.
130 | /// Returns the byte position after the header.
131 | pub fn write_gff_header<W: Write>(writer: &mut W, gff_buf: &[u8]) -> Result<usize> {
132 |     let mut pos = 0;
133 |     while pos < gff_buf.len() && gff_buf[pos] == b'#' {
134 |         if let Some(nl) = gff_buf[pos..].iter().position(|&b| b == b'\n') {
135 |             let end = pos + nl + 1;
136 |             writer.write_all(&gff_buf[pos..end])?;
137 |             pos = end;
138 |         } else {
139 |             break;
140 |         }
141 |     }
142 |     Ok(pos)
143 | }
144 | 
145 | /// Check if all expected index files for a given GFF exist.
146 | ///
147 | /// Expected suffixes: `.gof`, `.fts`, `.prt`, `.sqs`, `.atn`, `.a2f`, `.rit`, `.rix`.
148 | ///
149 | /// If any are missing, print their suffixes to stderr and return `Ok(false)`;
150 | /// otherwise, return `Ok(true)`.
151 | pub fn check_index_files_exist(gff: &PathBuf) -> Result<bool> {
152 |     let expected_suffixes = [
153 |         ".gof", ".fts", ".prt", ".sqs", ".atn", ".a2f", ".rit", ".rix",
154 |     ];
155 |     let mut missing = Vec::new();
156 | 
157 |     for ext in &expected_suffixes {
158 |         let path = append_suffix(gff, ext);
159 |         if !path.exists() {
160 |             missing.push(ext.to_string());
161 |         }
162 |     }
163 | 
164 |     if !missing.is_empty() {
165 |         eprintln!("Missing index file(s): {:?}", missing);
166 |         Ok(false)
167 |     } else {
168 |         Ok(true)
169 |     }
170 | }
171 | 
172 | /// Write selected byte ranges ("blocks") of a GFF file to an output file or stdout.
173 | ///
174 | /// Features:
175 | /// - Uses memory-mapped I/O for efficiency.
176 | /// - Merges adjacent or overlapping ranges before writing.
177 | /// - Uses vectored I/O (`write_vectored`) to minimize syscalls.
178 | ///
179 | /// # Arguments
180 | /// - `gff_path`: Path to the source GFF file.
181 | /// - `blocks`: A list of `(fid, start, end)` byte ranges to extract; entries whose start is the `u64::MAX` sentinel are skipped with a warning.
182 | /// - `output_path`: Output file path. If `None`, writes to stdout.
183 | /// - `verbose`: Whether to print diagnostic output.
184 | /// (Filtering by feature type is handled by `write_gff_output_filtered`, not here.)
185 | ///
186 | /// # Errors
187 | /// Returns any I/O or mmap errors.
188 | pub fn write_gff_output(
189 |     gff_path: &Path,
190 |     blocks: &[(u32, u64, u64)],
191 |     output_path: &Option<PathBuf>,
192 |     verbose: bool,
193 | ) -> Result<()> {
194 |     let file = File::open(gff_path)?;
195 |     let mmap = unsafe { Mmap::map(&file)? };
196 |     let file_len = mmap.len();
197 | 
198 |     // sort and merge blocks
199 |     let mut sorted: Vec<(u64, u64)> = {
200 |         let mut v = Vec::with_capacity(blocks.len());
201 |         for &(fid, s, e) in blocks {
202 |             if s == MISSING {
203 |                 eprintln!("[WARN] skipped fid={} due to sentinel start offset", fid);
204 |                 continue;
205 |             }
206 |             v.push((s, e));
207 |         }
208 |         v
209 |     };
210 |     sorted.sort_unstable_by_key(|&(s, _)| s);
211 | 
212 |     let mut merged: Vec<(u64, u64)> = Vec::with_capacity(sorted.len());
213 |     let mut it = sorted.into_iter();
214 |     if let Some((mut cs, mut ce)) = it.next() {
215 |         for (s, e) in it {
216 |             if s <= ce {
217 |                 ce = ce.max(e);
218 |             } else {
219 |                 if cs < ce {
220 |                     merged.push((cs, ce));
221 |                 }
222 |                 cs = s;
223 |                 ce = e;
224 |             }
225 |         }
226 |         if cs < ce {
227 |             merged.push((cs, ce));
228 |         }
229 |     }
230 | 
231 |     // build IoSlice list
232 |     let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(merged.len());
233 |     for &(so, eo) in &merged {
234 |         if so >= eo {
235 |             continue;
236 |         }
237 |         let (start, end) = (so as usize, eo as usize);
238 |         if end > file_len {
239 |             continue;
240 |         }
241 |         slices.push(IoSlice::new(&mmap[start..end]));
242 |     }
243 | 
244 |     // Write in batches
245 |     let mut writer: Box<dyn Write> = match output_path {
246 |         Some(p) => Box::new(BufWriter::new(File::create(p)?)),
247 |         None => Box::new(BufWriter::new(stdout())),
248 |     };
249 | 
250 |     const MAX_IOV: usize = 1024;
251 |     let mut base = 0;
252 |     while base < slices.len() {
253 |         let end = (base + MAX_IOV).min(slices.len());
254 |         let batch = &slices[base..end];
255 | 
256 |         let nw = writer.write_vectored(batch)?;
257 |         let mut remaining = nw;
258 |         let mut i = 0;
259 | 
260 |         while i < batch.len() && remaining >= batch[i].len() {
261 |             remaining -= batch[i].len();
262 |             i += 1;
263 |         }
264 | 
265 |         if i < batch.len() && remaining > 0 {
266 |             let cur = &batch[i];
267 |             writer.write_all(&cur[remaining..])?;
268 |             i += 1;
269 |         }
270 | 
271 |         for s in &batch[i..] {
272 |             writer.write_all(s)?;
273 |         }
274 | 
275 |         base = end;
276 |     }
277 | 
278 |     writer.flush()?;
279 | 
280 |     if verbose {
281 |         eprintln!(
282 |             "Wrote {} merged GFF block(s) with vectored I/O",
283 |             merged.len()
284 |         );
285 |     }
286 |     Ok(())
287 | }
288 | 
289 | pub fn write_gff_output_filtered(
290 |     gff_path: &PathBuf,
291 |     blocks: &[(u32, u64, u64)],
292 |     per_root_matches: &FxHashMap<u32, FxHashSet<String>>,
293 |     atn_attr_name: &str,
294 |     output_path: &Option<PathBuf>,
295 |     types_filter: Option<&str>,
296 |     verbose: bool,
297 | ) -> Result<()> {
298 |     // mmap GFF
299 |     let file =
300 |         File::open(gff_path).with_context(|| format!("Cannot open GFF file: {:?}", gff_path))?;
301 |     let mmap =
302 |         unsafe { Mmap::map(&file) }.with_context(|| format!("mmap failed for {:?}", gff_path))?;
303 |     let file_len = mmap.len();
304 | 
305 |     // parse optional type filter: comma-separated into a HashSet
306 |     let type_allow: Option<FxHashSet<String>> = types_filter.map(|s| {
307 |         s.split(',')
308 |             .map(|t| t.trim().to_string())
309 |             .filter(|t| !t.is_empty())
310 |             .collect()
311 |     });
312 | 
313 |     let bkey: Vec<u8> = {
314 |         let mut k = atn_attr_name.as_bytes().to_vec();
315 |         k.push(b'=');
316 |         k
317 |     };
318 |     let bkey_finder = memmem::Finder::new(&bkey);
319 | 
320 |     // Process blocks in parallel; each task returns (block_start, matched_bytes)
321 |     let mut parts: Vec<(u64, Vec<u8>)> = blocks
322 |         .par_iter()
323 |         .filter_map(|&(root, start, end)| {
324 |             // root -> set of string IDs to keep
325 |             let keep: &FxHashSet<String> = per_root_matches.get(&root)?;
326 |             if keep.is_empty() {
327 |                 return None;
328 |             }
329 | 
330 |             let s = start as usize;
331 |             let e = end.min(file_len as u64) as usize;
332 |             if s >= e || e > file_len {
333 |                 return None;
334 |             }
335 |             let window = &mmap[s..e];
336 | 
337 |             // Output buffer for this block
338 |             let mut out = Vec::<u8>::with_capacity(1024);
339 |             let mut pos = 0usize;
340 | 
341 |             // Iterate lines in [s, e)
342 |             let next_line = |from: usize| -> Option<(usize, usize, usize)> {
343 |                 if from >= window.len() {
344 |                     return None;
345 |                 }
346 |                 // '\n' inclusive; rel is the index after '\n' or end-of-window
347 |                 let rel = memchr(b'\n', &window[from..])
348 |                     .map(|i| from + i + 1)
349 |                     .unwrap_or(window.len());
350 |                 // strip trailing '\n' and optional '\r'
351 |                 let mut end_no_nl = rel;
352 |                 if end_no_nl > from && window[end_no_nl - 1] == b'\n' {
353 |                     end_no_nl -= 1;
354 |                 }
355 |                 if end_no_nl > from && window[end_no_nl - 1] == b'\r' {
356 |                     end_no_nl -= 1;
357 |                 }
358 |                 Some((from, rel, end_no_nl))
359 |             };
360 | 
361 |             // If a type filter is supplied, check that column 3 equals one of the allowed types
362 |             let type_ok = |line: &[u8]| -> bool {
363 |                 if let Some(allow) = &type_allow {
364 |                     // find first three tabs
365 |                     let i1 = match memchr(b'\t', line) {
366 |                         Some(i) => i,
367 |                         None => return false,
368 |                     };
369 |                     let i2 = match memchr(b'\t', &line[i1 + 1..]) {
370 |                         Some(x) => i1 + 1 + x,
371 |                         None => return false,
372 |                     };
373 |                     let i3 = match memchr(b'\t', &line[i2 + 1..]) {
374 |                         Some(x) => i2 + 1 + x,
375 |                         None => return false,
376 |                     };
377 |                     let ty = &line[i2 + 1..i3];
378 |                     if let Ok(ty_str) = std::str::from_utf8(ty) {
379 |                         allow.contains(ty_str)
380 |                     } else {
381 |                         false
382 |                     }
383 |                 } else {
384 |                     true
385 |                 }
386 |             };
387 | 
388 |             // Return true if the attributes column contains the target key (`<attr>=`) with a value in `keep`
389 |             let id_hits_keep = |line_no_crlf: &[u8]| -> bool {
390 |                 // move to 9th field (attributes)
391 |                 let mut off = 0usize;
392 |                 let mut tabs = 0u8;
393 |                 while tabs < 8 {
394 |                     match memchr(b'\t', &line_no_crlf[off..]) {
395 |                         Some(i) => {
396 |                             off += i + 1;
397 |                             tabs += 1;
398 |                         }
399 |                         None => return false,
400 |                     }
401 |                 }
402 |                 let attr = &line_no_crlf[off..];
403 |                 if let Some(p) = bkey_finder.find(attr) {
404 |                     let vstart = p + bkey.len();
405 |                     // value ends at ';' or end-of-line
406 |                     let vend = memchr(b';', &attr[vstart..])
407 |                         .map(|i| vstart + i)
408 |                         .unwrap_or(attr.len());
409 |                     let id_slice = &attr[vstart..vend];
410 |                     if let Ok(id_str) = std::str::from_utf8(id_slice) {
411 |                         return keep.contains(id_str);
412 |                     }
413 |                 }
414 |                 false
415 |             };
416 | 
417 |             // Scan lines in this block window
418 |             while let Some((ls, le, ln_end)) = next_line(pos) {
419 |                 pos = le;
420 |                 let line = &window[ls..le];
421 |                 if !line.is_empty() && line[0] == b'#' {
422 |                     continue; // skip comments
423 |                 }
424 |                 let line_no_crlf = &window[ls..ln_end];
425 | 
426 |                 if !type_ok(line_no_crlf) {
427 |                     continue;
428 |                 }
429 |                 if id_hits_keep(line_no_crlf) {
430 |                     out.extend_from_slice(line);
431 |                 }
432 |             }
433 | 
434 |             if verbose {
435 |                 let matched_lines = out.iter().filter(|&&b| b == b'\n').count();
436 |                 eprintln!(
437 |                     "[filter] root={} block=[{}..{}] keep_ids={} matched_lines={}",
438 |                     root, start, end, keep.len(), matched_lines
439 |                 );
440 |             }
441 | 
442 |             if out.is_empty() {
443 |                 None
444 |             } else {
445 |                 Some((start, out))
446 |             }
447 |         })
448 |         .collect();
449 | 
450 |     // Keep original block order
451 |     parts.sort_unstable_by_key(|(s, _)| *s);
452 | 
453 |     // Write output (stdout or file)
454 |     let raw: Box<dyn Write> = match output_path {
455 |         Some(p) => Box::new(File::create(p).with_context(|| format!("Cannot create output: {:?}", p))?),
456 |         None => Box::new(std::io::stdout()),
457 |     };
458 |     // Bigger buffer reduces syscalls; tune as needed
459 |     let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, raw);
460 |     for (_, buf) in parts {
461 |         writer.write_all(&buf)?;
462 |     }
463 |     writer.flush()?;
464 |     Ok(())
465 | }
466 | 
467 | 
468 | 
--------------------------------------------------------------------------------
/src/utils/tree.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | 
3 | /// Closed interval on [start, end] for point queries.
4 | /// For range queries we use half-open logic.
5 | #[derive(Debug, Clone, Serialize, Deserialize)]
6 | pub struct Interval<T> {
7 |     pub start: T,
8 |     pub end: T,
9 |     pub root_fid: u32,
10 | }
11 | 
12 | #[derive(Debug, Serialize, Deserialize)]
13 | pub struct IntervalTree<T> {
14 |     root: Option<Box<Node<T>>>,
15 | }
16 | 
17 | #[derive(Debug, Serialize, Deserialize)]
18 | struct Node<T> {
19 |     center: T,
20 |     intervals: Vec<Interval<T>>,
21 |     left: Option<Box<Node<T>>>,
22 |     right: Option<Box<Node<T>>>,
23 | }
24 | 
25 | impl<T> IntervalTree<T>
26 | where
27 |     T: Ord + Copy + Serialize + for<'de> Deserialize<'de>,
28 | {
29 |     /// Build a tree from a list of intervals.
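    ///
    /// A minimal sketch with hand-built intervals (illustrative coordinates):
    /// ```
    /// use gffx::{Interval, IntervalTree};
    /// let tree = IntervalTree::new(vec![
    ///     Interval { start: 0u32, end: 100, root_fid: 1 },
    ///     Interval { start: 50, end: 150, root_fid: 2 },
    /// ]);
    /// assert_eq!(tree.query_point(75).len(), 2); // both intervals cover 75
    /// ```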
30 |     pub fn new(intervals: Vec<Interval<T>>) -> Self {
31 |         let root = Self::build(intervals);
32 |         Self { root }
33 |     }
34 | 
35 |     fn build(mut intervals: Vec<Interval<T>>) -> Option<Box<Node<T>>> {
36 |         if intervals.is_empty() {
37 |             return None;
38 |         }
39 | 
40 |         intervals.sort_by_key(|iv| iv.start);
41 |         let mid = intervals.len() / 2;
42 |         let center = intervals[mid].start;
43 | 
44 |         let mut left = Vec::new();
45 |         let mut right = Vec::new();
46 |         let mut center_ivs = Vec::new();
47 | 
48 |         for iv in intervals {
49 |             if iv.end < center {
50 |                 left.push(iv);
51 |             } else if iv.start > center {
52 |                 right.push(iv);
53 |             } else {
54 |                 center_ivs.push(iv);
55 |             }
56 |         }
57 | 
58 |         Some(Box::new(Node {
59 |             center,
60 |             intervals: center_ivs,
61 |             left: Self::build(left),
62 |             right: Self::build(right),
63 |         }))
64 |     }
65 | 
66 |     /// Point query: returns all intervals covering `point` (closed semantics on [start, end]).
67 |     pub fn query_point(&self, point: T) -> Vec<&Interval<T>> {
68 |         let mut result = Vec::new();
69 |         Self::query_point_rec(&self.root, point, &mut result);
70 |         result
71 |     }
72 | 
73 |     fn query_point_rec<'a>(
74 |         node: &'a Option<Box<Node<T>>>,
75 |         point: T,
76 |         result: &mut Vec<&'a Interval<T>>,
77 |     ) {
78 |         if let Some(n) = node {
79 |             for iv in &n.intervals {
80 |                 if iv.start <= point && point <= iv.end {
81 |                     result.push(iv);
82 |                 }
83 |             }
84 |             if point < n.center {
85 |                 Self::query_point_rec(&n.left, point, result);
86 |             } else if point > n.center {
87 |                 Self::query_point_rec(&n.right, point, result);
88 |             } else {
89 |                 // Equal to center: search both sides
90 |                 Self::query_point_rec(&n.left, point, result);
91 |                 Self::query_point_rec(&n.right, point, result);
92 |             }
93 |         }
94 |     }
95 | 
96 |     /// Interval query (half-open semantics): returns intervals `iv` where
97 |     /// `iv.start < end && iv.end > start`.
98 |     pub fn query_interval<'a>(&'a self, start: T, end: T, out: &mut Vec<&'a Interval<T>>) {
99 |         Self::query_interval_rec(&self.root, start, end, out);
100 |     }
101 | 
102 |     fn query_interval_rec<'a>(
103 |         node: &'a Option<Box<Node<T>>>,
104 |         start: T,
105 |         end: T,
106 |         out: &mut Vec<&'a Interval<T>>,
107 |     ) {
108 |         if let Some(n) = node {
109 |             for iv in &n.intervals {
110 |                 if iv.start < end && iv.end > start {
111 |                     out.push(iv);
112 |                 }
113 |             }
114 |             if start < n.center {
115 |                 Self::query_interval_rec(&n.left, start, end, out);
116 |             }
117 |             if end > n.center {
118 |                 Self::query_interval_rec(&n.right, start, end, out);
119 |             }
120 |         }
121 |     }
122 | }
--------------------------------------------------------------------------------
/src/utils/tree_index.rs:
--------------------------------------------------------------------------------
1 | use crate::{IntervalTree, load_sqs, append_suffix};
2 | use anyhow::{bail, Context, Result};
3 | use bincode2::deserialize;
4 | use memmap2::MmapOptions;
5 | use rustc_hash::FxHashMap;
6 | use std::{fs::File, path::Path};
7 | 
8 | /// Application-facing structure:
9 | /// - per-sequence interval trees
10 | /// - string -> numeric ID mapping
11 | #[derive(Debug)]
12 | pub struct TreeIndexData {
13 |     pub chr_entries: FxHashMap<u32, IntervalTree<u32>>,
14 |     pub seqid_to_num: FxHashMap<String, u32>,
15 |     pub num_to_seqid: Vec<String>,
16 | }
17 | 
18 | impl TreeIndexData {
19 |     /// Construct TreeIndexData for an indexed GFF file, deriving the
20 |     /// `.sqs`, `.rit`, and `.rix` paths from `gff_path`.
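    ///
    /// A minimal usage sketch (hypothetical path and seqid; assumes `.sqs`,
    /// `.rit`, and `.rix` were built by `gffx index`):
    /// ```no_run
    /// # fn main() -> anyhow::Result<()> {
    /// let idx = gffx::TreeIndexData::load_tree_index("genes.gff")?;
    /// if let Some(&num) = idx.seqid_to_num.get("chr1") {
    ///     let mut hits = Vec::new();
    ///     idx.chr_entries[&num].query_interval(1_000u32, 5_000, &mut hits);
    /// }
    /// # Ok(()) }
    /// ```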
21 |     pub fn load_tree_index<P: AsRef<Path>>(gff_path: P) -> Result<Self> {
22 |         let path = gff_path.as_ref();
23 |         let (num_to_seqid, seqid_to_num) = load_sqs(path)?;
24 |         let rit_path = append_suffix(path, ".rit");
25 |         let rix_path = append_suffix(path, ".rix");
26 | 
27 |         let chr_entries = Self::load_region_index(&rit_path, &rix_path)?;
28 | 
29 |         Ok(Self {
30 |             chr_entries,
31 |             seqid_to_num,
32 |             num_to_seqid
33 |         })
34 |     }
35 | 
36 |     fn load_region_index(
37 |         rit_path: &Path,
38 |         rix_path: &Path,
39 |     ) -> Result<FxHashMap<u32, IntervalTree<u32>>> {
40 |         let file = File::open(rit_path).with_context(|| format!("open {}", rit_path.display()))?;
41 |         let mmap = unsafe { MmapOptions::new().map(&file) }
42 |             .with_context(|| format!("mmap {}", rit_path.display()))?;
43 |         let buf: &[u8] = &mmap;
44 | 
45 |         let offsets: Vec<u64> = {
46 |             let f = File::open(rix_path).with_context(|| format!("open {}", rix_path.display()))?;
47 |             serde_json::from_reader::<_, Vec<u64>>(f)
48 |                 .with_context(|| format!("parse json {}", rix_path.display()))?
49 |         };
50 |         if offsets.is_empty() {
51 |             return Ok(FxHashMap::default());
52 |         }
53 | 
54 |         for w in offsets.windows(2) {
55 |             if w[0] > w[1] {
56 |                 bail!("offsets not sorted ascending: {:?} > {:?}", w[0], w[1]);
57 |             }
58 |         }
59 |         let last = *offsets.last().unwrap() as usize;
60 |         if last > buf.len() {
61 |             bail!("last offset {} out of file size {}", last, buf.len());
62 |         }
63 | 
64 |         let mut map = FxHashMap::with_capacity_and_hasher(offsets.len(), Default::default());
65 |         for (i, start_u64) in offsets.iter().copied().enumerate() {
66 |             let start = start_u64 as usize;
67 |             let end = if i + 1 < offsets.len() {
68 |                 offsets[i + 1] as usize
69 |             } else {
70 |                 buf.len()
71 |             };
72 |             if end < start || end > buf.len() {
73 |                 bail!("bad slice range: {}..{} (file len {})", start, end, buf.len());
74 |             }
75 |             let slice = &buf[start..end];
76 |             let tree: IntervalTree<u32> = deserialize(slice).with_context(|| {
77 |                 format!("bincode2 deserialize tree #{} ({}..{})", i, start, end)
78 |             })?;
79 |             map.insert(i as u32, tree);
80 |         }
81 |         Ok(map)
82 |     }
83 | }
84 | 
--------------------------------------------------------------------------------
/src/utils/tree_io.rs:
--------------------------------------------------------------------------------
1 | use crate::utils::tree::IntervalTree;
2 | use anyhow::{bail, Context, Result};
3 | use bincode2::{deserialize, deserialize_from, serialize, serialize_into};
4 | 
5 | use memmap2::MmapOptions;
6 | use serde::de::DeserializeOwned;
7 | use std::{
8 |     fs::{self, File},
9 |     io::{BufReader, Read, Seek, SeekFrom, BufWriter, Write},
10 |     path::Path,
11 | };
12 | 
13 | impl<T> IntervalTree<T>
14 | where
15 |     T: Ord + Copy + serde::Serialize + for<'de> serde::Deserialize<'de>,
16 | {
17 |     /// Serialize the whole tree to a file via bincode2.
18 |     pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> {
19 |         let encoded = serialize(self).map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
20 |         fs::write(path, encoded)?;
21 |         Ok(())
22 |     }
23 | 
24 |     /// Deserialize a tree from a file (whole-file read).
25 |     pub fn load_from_file(path: &Path) -> std::io::Result<Self> {
26 |         let mut file = File::open(path)?;
27 |         let mut buf = Vec::new();
28 |         file.read_to_end(&mut buf)?;
29 |         let tree: Self = deserialize(&buf)
30 |             .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
31 |         Ok(tree)
32 |     }
33 | }
34 | 
35 | /// Save multiple trees back-to-back into a single `.rit` file.
36 | /// Offsets of each tree (in bytes) should be recorded separately.
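///
/// A minimal round-trip sketch (hypothetical output paths):
/// ```no_run
/// # fn main() -> anyhow::Result<()> {
/// use gffx::{Interval, IntervalTree, save_multiple_trees, write_offsets_to_file};
/// use std::path::Path;
/// let trees = vec![IntervalTree::new(vec![
///     Interval { start: 0u32, end: 10, root_fid: 0 },
/// ])];
/// let offsets = save_multiple_trees(&trees, Path::new("genes.gff.rit"))?;
/// write_offsets_to_file(&offsets, Path::new("genes.gff.rix"))?;
/// # Ok(()) }
/// ```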
37 | pub fn save_multiple_trees<T>(trees: &[IntervalTree<T>], rit_path: &Path) -> Result<Vec<u64>>
38 | where
39 |     T: Ord + Copy + serde::Serialize + for<'de> serde::Deserialize<'de>,
40 | {
41 |     let mut offsets = Vec::with_capacity(trees.len());
42 |     let file = File::create(rit_path)?;
43 |     let mut writer = BufWriter::new(file);
44 | 
45 |     for tree in trees {
46 |         // Record current offset
47 |         let pos = writer.seek(SeekFrom::Current(0))?;
48 |         offsets.push(pos);
49 |         // Write tree
50 |         serialize_into(&mut writer, tree)?;
51 |     }
52 |     writer.flush()?;
53 |     Ok(offsets)
54 | }
55 | 
56 | /// Write offsets (Vec<u64>) as JSON into the `.rix` file.
57 | pub fn write_offsets_to_file(offsets: &[u64], rix_path: &Path) -> Result<()> {
58 |     let file = File::create(rix_path)?;
59 |     let mut writer = BufWriter::new(file);
60 |     serde_json::to_writer(&mut writer, offsets)?;
61 |     writer.flush()?;
62 |     Ok(())
63 | }
64 | 
65 | /// Load multiple trees with streaming (using BufReader).
66 | pub fn load_trees_streaming<T>(rit_path: &Path, rix_path: &Path) -> Result<Vec<IntervalTree<T>>>
67 | where
68 |     T: Ord + Copy + DeserializeOwned,
69 | {
70 |     let mut reader = BufReader::new(File::open(rit_path)?);
71 |     let offsets: Vec<u64> = {
72 |         let f = File::open(rix_path)?;
73 |         serde_json::from_reader(f)?
74 |     };
75 | 
76 |     let mut trees = Vec::with_capacity(offsets.len());
77 |     for &off in &offsets {
78 |         reader.seek(SeekFrom::Start(off))?;
79 |         let tree: IntervalTree<T> = deserialize_from(&mut reader)
80 |             .map_err(|e| anyhow::anyhow!("Deserializing failed: {}", e))?;
81 |         trees.push(tree);
82 |     }
83 |     Ok(trees)
84 | }
85 | 
86 | /// Load multiple trees via memory-mapping.
87 | pub fn load_trees_mmap<T>(rit_path: &Path, rix_path: &Path) -> Result<Vec<IntervalTree<T>>>
88 | where
89 |     T: Ord + Copy + for<'de> serde::Deserialize<'de>,
90 | {
91 |     let file = File::open(rit_path).with_context(|| format!("open {}", rit_path.display()))?;
92 |     let mmap = unsafe { MmapOptions::new().map(&file) }
93 |         .with_context(|| format!("mmap {}", rit_path.display()))?;
94 |     let buf: &[u8] = &mmap;
95 | 
96 |     let offsets: Vec<u64> = {
97 |         let f = File::open(rix_path).with_context(|| format!("open {}", rix_path.display()))?;
98 |         serde_json::from_reader::<_, Vec<u64>>(f)
99 |             .with_context(|| format!("parse json {}", rix_path.display()))?
100 |     };
101 | 
102 |     if offsets.is_empty() {
103 |         return Ok(Vec::new());
104 |     }
105 | 
106 |     for w in offsets.windows(2) {
107 |         if w[0] > w[1] {
108 |             bail!("offsets not sorted ascending: {:?} > {:?}", w[0], w[1]);
109 |         }
110 |     }
111 |     let last = *offsets.last().unwrap() as usize;
112 |     if last > buf.len() {
113 |         bail!("last offset {} out of file size {}", last, buf.len());
114 |     }
115 | 
116 |     let mut out = Vec::with_capacity(offsets.len());
117 |     for (i, start_u64) in offsets.iter().copied().enumerate() {
118 |         let start = start_u64 as usize;
119 |         let end = if i + 1 < offsets.len() {
120 |             offsets[i + 1] as usize
121 |         } else {
122 |             buf.len()
123 |         };
124 |         if end < start || end > buf.len() {
125 |             bail!("bad slice range: {}..{} (file len {})", start, end, buf.len());
126 |         }
127 |         let slice = &buf[start..end];
128 |         let tree: IntervalTree<T> = deserialize(slice)
129 |             .with_context(|| format!("bincode2 deserialize tree #{} ({}..{})", i, start, end))?;
130 |         out.push(tree);
131 |     }
132 | 
133 |     Ok(out)
134 | }
135 | 
--------------------------------------------------------------------------------