├── .github
│   └── workflows
│       └── rust.yml
├── .gitignore
├── .gitmodules
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE
├── README.md
├── RELEASENOTE.md
├── extract.png
└── src
    ├── commands.rs
    ├── commands
    │   ├── coverage.rs
    │   ├── depth.rs
    │   ├── extract.rs
    │   ├── index.rs
    │   ├── intersect.rs
    │   ├── sample.rs
    │   └── search.rs
    ├── index_builder.rs
    ├── index_builder
    │   └── core.rs
    ├── index_loader.rs
    ├── index_loader
    │   ├── a2f.rs
    │   ├── core.rs
    │   ├── fts.rs
    │   ├── gof.rs
    │   └── prt.rs
    ├── lib.rs
    ├── main.rs
    ├── utils.rs
    └── utils
        ├── common.rs
        ├── tree.rs
        ├── tree_index.rs
        └── tree_io.rs

/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
  1 | name: Release
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - "v*.*.*"
  7 |   workflow_dispatch: {}
  8 | 
  9 | permissions:
 10 |   contents: write
 11 | 
 12 | concurrency:
 13 |   group: release-${{ github.ref }}
 14 |   cancel-in-progress: false
 15 | 
 16 | env:
 17 |   CARGO_TERM_COLOR: always
 18 |   BIN_NAME: gffx
 19 | 
 20 | jobs:
 21 |   build:
 22 |     name: Build ${{ matrix.target }} on ${{ matrix.os }}
 23 |     runs-on: ${{ matrix.os }}
 24 |     strategy:
 25 |       fail-fast: false
 26 |       matrix:
 27 |         include:
 28 |           # --- Linux (musl / static) ---
 29 |           - os: ubuntu-latest
 30 |             target: x86_64-unknown-linux-musl
 31 |             bin_ext: ""
 32 |             archive: tar.gz
 33 |             use_cross: true
 34 |           - os: ubuntu-latest
 35 |             target: aarch64-unknown-linux-musl
 36 |             bin_ext: ""
 37 |             archive: tar.gz
 38 |             use_cross: true
 39 | 
 40 |           # --- macOS ---
 41 |           - os: macos-latest
 42 |             target: x86_64-apple-darwin
 43 |             bin_ext: ""
 44 |             archive: tar.gz
 45 |             use_cross: false
 46 |           - os: macos-latest
 47 |             target: aarch64-apple-darwin
 48 |             bin_ext: ""
 49 |             archive: tar.gz
 50 |             use_cross: false
 51 | 
 52 |           # --- Windows (MSVC) ---
 53 |           - os: windows-latest
 54 |             target: x86_64-pc-windows-msvc
 55 |             bin_ext: ".exe"
 56 |             archive: zip
 57 |             use_cross: false
 58 | 
 59 | 
 60 |     steps:
 61 |       - uses: actions/checkout@v4
 62 | 
 63 |       - name: Install Rust toolchain + target
 64 |         uses: dtolnay/rust-toolchain@stable
 65 |         with:
 66 |           targets: ${{ matrix.target }}
 67 | 
 68 |       - uses: Swatinem/rust-cache@v2
 69 |         with:
 70 |           prefix-key: ${{ matrix.target }}
 71 | 
 72 |       - name: Install cross
 73 |         if: matrix.use_cross == true
 74 |         uses: taiki-e/install-action@v2
 75 |         with:
 76 |           tool: cross@0.2.5
 77 | 
 78 |       - name: Build (release)
 79 |         shell: bash
 80 |         run: |
 81 |           set -eux
 82 |           if [ "${{ matrix.use_cross }}" = "true" ]; then
 83 |             cross build --release --target ${{ matrix.target }}
 84 |           else
 85 |             cargo build --release --target ${{ matrix.target }}
 86 |           fi
 87 | 
 88 |       - name: Prepare artifact
 89 |         shell: bash
 90 |         run: |
 91 |           set -eux
 92 |           BIN_PATH="target/${{ matrix.target }}/release/${BIN_NAME}${{ matrix.bin_ext }}"
 93 |           OUT_NAME="${BIN_NAME}-${{ matrix.target }}"
 94 | 
 95 |           mkdir -p dist
 96 |           cp "$BIN_PATH" "dist/${OUT_NAME}${{ matrix.bin_ext }}"
 97 | 
 98 |           if [ "${{ matrix.archive }}" = "tar.gz" ]; then
 99 |             tar -czf "dist/${OUT_NAME}.tar.gz" -C dist "${OUT_NAME}${{ matrix.bin_ext }}"
100 |             (sha256sum "dist/${OUT_NAME}.tar.gz" || shasum -a 256 "dist/${OUT_NAME}.tar.gz") > "dist/${OUT_NAME}.tar.gz.sha256"
101 |           else
102 |             (cd dist && 7z a -tzip "${OUT_NAME}.zip" "${OUT_NAME}${{ matrix.bin_ext }}")
103 |             (sha256sum "dist/${OUT_NAME}.zip" || shasum -a 256 "dist/${OUT_NAME}.zip") > "dist/${OUT_NAME}.zip.sha256"
104 |           fi
105 | 
106 |       - name: Upload build artifacts
107 |         uses: actions/upload-artifact@v4
108 |         with:
109 |           name: ${{ matrix.target }}
110 |           path: dist/*
111 | 
112 |   release:
113 |     name: Create GitHub Release
114 |     needs: build
115 |     runs-on: ubuntu-latest
116 |     if: startsWith(github.ref, 'refs/tags/')
117 |     steps:
118 |       - uses: actions/download-artifact@v4
119 |         with:
120 |           path: artifacts
121 | 
122 |       - name: Gather files
123 |         run: |
124 |           set -eux
125 |           mkdir -p upload
126 |           find artifacts -type f \( -name "*.tar.gz" -o -name "*.zip" -o -name "*.sha256" \) -exec cp {} upload/ \;
127 | 
128 |       - name: Publish Release
129 |         uses: softprops/action-gh-release@v2
130 |         with:
131 |           files: upload/*
132 |           generate_release_notes: true
133 |           draft: true
134 |           prerelease: true
135 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # === Rust build output ===
 2 | /target/
 3 | 
 4 | # === IDEs and editors ===
 5 | **/*.swp
 6 | .idea/
 7 | .vscode/
 8 | 
 9 | # === OS files ===
10 | .DS_Store
11 | Thumbs.db
12 | 
13 | # === Jupyter checkpoints ===
14 | .ipynb_checkpoints/
15 | **/.ipynb_checkpoints/
16 | 
17 | # === Dependency lock file (optional, keep if you want reproducibility) ===
18 | Cargo.lock
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "benchmark"]
 2 | 	path = benchmark
 3 | 	url = https://github.com/Baohua-Chen/GFFx_benchmarks.git
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # GFFx Changelog
 2 | 
 3 | ---
 4 | 
 5 | Release v0.4.0:
 6 | 
 7 | ### Changed
 8 | 
 9 | Added two new subcommands: `coverage`, which computes breadth of coverage, and `depth`, which computes depth of coverage over the features of a GFF file, from BAM/SAM/CRAM or BED input.
10 | 
11 | Added a `sample` subcommand for randomly downsampling feature groups from each chromosome at equal ratios.
12 | 
13 | Updated the module organization and source directory layout to conform to Rust 2024 edition guidelines for module visibility (`pub`) and path imports.
14 | 
15 | ---
16 | 
17 | 
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "gffx"
 3 | version = "0.4.0"
 4 | edition = "2024"
 5 | authors = ["Baohua Chen "]
 6 | description = "An ultra-fast and memory-efficient toolkit for querying GFF files, written in Rust"
 7 | license = "MIT OR Apache-2.0"
 8 | readme = "README.md"
 9 | repository = "https://github.com/Baohua-Chen/GFFx"
10 | homepage = "https://github.com/Baohua-Chen"
11 | keywords = ["gff-file", "bioinformatics", "genomics"]
12 | categories = ["command-line-utilities", "science"]
13 | documentation = "https://docs.rs/gffx"
14 | exclude = ["benchmark/**", "target/**"]
15 | 
16 | [dependencies]
17 | regex = "1.11.1"
18 | memchr = "2.7.4"
19 | clap = { version = "4.5.37", features = ["derive"] }
20 | anyhow = "1.0.98"
21 | byteorder = "1.5.0"
22 | memmap2 = "0.9.5"
23 | rayon = "1.10.0"
24 | indexmap = "2.10.0"
25 | rustc-hash = "2.1.1"
26 | bstr = "1.12.0"
27 | lexical-core = "1.0.5"
28 | meminterval = "0.4.1"
29 | serde_json = "1.0.140"
30 | serde = { version = "1.0.219", features = ["derive"] }
31 | bincode2 = "2.0.1"
32 | rust-htslib = "0.50"
33 | num_cpus = "1.17.0"
34 | rand = "0.9.2"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!) The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [2025] [Baohua Chen]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # GFFx Command Line Manual
 3 | 
 4 | **GFFx** is a high-performance, Rust-based toolkit for extracting and querying annotations from GFF3 files. It supports fast indexing and feature retrieval with several subcommands.
 5 | It can be used both as a **command-line tool** and as a **Rust library**.
 6 | 
 7 | <details>
 8 | <summary>Benchmarking results</summary>
 9 | <div align="center">
10 | <img src="extract.png" alt="Benchmarking runtime and memory usage of ID-based feature extraction"/>
11 | </div></details>
 12 | 
 13 | ---
 14 | 
 15 | ## Breaking Changes
 16 | 
 17 | Added two new subcommands: `coverage`, which computes breadth of coverage, and `depth`, which computes depth of coverage over the features of a GFF file, from BAM/SAM/CRAM or BED input.
 18 | 
 19 | Added a `sample` subcommand for randomly downsampling feature groups from each chromosome at equal ratios.
 20 | 
 21 | Updated the module organization and source directory layout to conform to Rust 2024 edition guidelines for module visibility (`pub`) and path imports.
 22 | 
 23 | ---
 24 | 
 25 | ## Table of Contents
 26 | 
 27 | *GFFx version 0.4.0*
 28 | 
 29 | ---
 30 | 
 31 | - [Installation](#installation)
 32 | - [Basic Usage](#basic-usage)
 33 |   - [index](#index) - Build index files
 34 |   - [extract](#extract) - Extract features by ID
 35 |   - [intersect](#intersect) - Extract features by regions
 36 |   - [search](#search) - Search features by attributes
 37 |   - [coverage](#coverage) - Calculate coverage breadth
 38 |   - [depth](#depth) - Calculate coverage depth
 39 |   - [sample](#sample) - Randomly downsample feature groups
 40 | 
 41 | 
 42 | - [Example Use Cases](#example-use-cases)
 43 | - [Using GFFx as a Rust Library](#using-gffx-as-a-rust-library)
 44 | - [Available Public APIs](#available-public-apis)
 45 | - [Index File Types](#index-file-types)
 46 | - [License](#license)
 47 | - [Citation](#citation)
 48 | 
 49 | ---
 50 | ## Installation
 51 | 
 52 | ### Option 1: Install via [crates.io](https://crates.io/crates/gffx)
 53 | 
 54 | ```bash
 55 | cargo install gffx                    # install to default location (~/.cargo/bin)
 56 | cargo install gffx --root /your/path  # optional: install to custom location
 57 | ```
 58 | 
 59 | ### Option 2: Install from source
 60 | 
 61 | ```bash
 62 | git clone https://github.com/Baohua-Chen/GFFx.git
 63 | cd GFFx
 64 | cargo build --release
 65 | # Optionally copy the binary
 66 | cp target/release/gffx /your/path
 67 | ```
 68 | 
 69 | > Requires **Rust 1.70 or later**. You can install or update Rust using [rustup](https://rustup.rs).
 70 | ---
 71 | 
 72 | 
 73 | ## Basic Usage
 74 | 
 75 | ```bash
 76 | gffx <SUBCOMMAND> [OPTIONS]
 77 | ```
 78 | 
 79 | Available subcommands:
 80 | 
 81 | - [index] Build index files
 82 | - [intersect] Extract features by region
 83 | - [extract] Extract features by ID
 84 | - [search] Search features by attribute
 85 | - [coverage] Calculate coverage breadth
 86 | - [depth] Calculate coverage depth
 87 | - [sample] Randomly downsample feature groups
 88 | 
 89 | ---
 90 | 
 91 | ### `index`
 92 | 
 93 | Builds index files from a GFF file to accelerate downstream operations.
 94 | 
 95 | ```bash
 96 | gffx index [OPTIONS] --input <FILE>
 97 | ```
 98 | 
 99 | **Options:**
100 | 
101 | | Option                 | Description                                      |
102 | |------------------------|-------------------------------------------------|
103 | | `-i`, `--input`        | Input GFF file                                  |
104 | | `-a`, `--attribute`    | Attribute key to extract (default: `gene_name`) |
105 | | `-v`, `--verbose`      | Enable verbose output                           |
106 | | `-h`, `--help`         | Print help                                      |
107 | 
108 | ---
109 | 
110 | ### `intersect`
111 | 
112 | Extracts models intersecting with regions from a GFF file, either from a single region or a BED file.
113 | 
114 | ```bash
115 | gffx intersect [OPTIONS] --input <FILE> <--region <REGION>|--bed <BED>>
116 | ```
117 | 
118 | **Options:**
119 | Required
120 | | Option                      | Description                                                   |
121 | | --------------------------- | ------------------------------------------------------------ |
122 | | `-i`, `--input` `<FILE>`    | Input GFF file path                                           |
123 | | `-r`, `--region` `<REGION>` | Single region in `chr:start-end` format                       |
124 | | `-b`, `--bed` `<BED>`       | BED file containing multiple regions                          |
125 | 
126 | > **Note**: Exactly one of `--region` or `--bed` must be specified.
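
For example, the two region-selection modes look like this (`genes.gff` and `regions.bed` are placeholder file names):

```bash
# Single region
gffx intersect -i genes.gff -r chr1:10000-20000

# Multiple regions from a BED file
gffx intersect -i genes.gff -b regions.bed -o hits.gff
```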
127 | 
128 | 
129 | Optional
130 | | Option                    | Description                                                                     |
131 | | ------------------------- | ------------------------------------------------------------------------------- |
132 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                              |
133 | | `-e`, `--entire-group`    | Enable the "entire-group" mode. Return entire gene models or feature groups     |
134 | |                           | for all matched features, instead of only the directly matched features.        |
135 | | `-v`, `--invert`          | Invert selection (exclude matched features)                                     |
136 | | `-T`, `--types` `<TYPES>` | Filter output to include only features of specified types (e.g., `gene,exon`)   |
137 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                 |
138 | | `-V`, `--verbose`         | Enable verbose output                                                           |
139 | | `-h`, `--help`            | Show help message                                                               |
140 | | *(one of)*                |                                                                                  |
141 | | `-c`, `--contained`       | Only keep features fully contained within the region                            |
142 | | `-C`, `--contains-region` | Only keep features that fully contain the region                                |
143 | | `-O`, `--overlap`         | Keep features that partially or fully overlap (default mode)                    |
144 | 
145 | ---
146 | 
147 | ### `extract`
148 | 
149 | Extracts annotation models by feature ID(s), including their parent models.
150 | 
151 | ```bash
152 | gffx extract [OPTIONS] --input <FILE> <--feature-file <FILE>|--feature-id <ID>>
153 | ```
154 | 
155 | **Options:**
156 | 
157 | Required
158 | | Option                          | Description                                         |
159 | | ------------------------------- | --------------------------------------------------- |
160 | | `-i`, `--input` `<FILE>`        | Input GFF file path                                 |
161 | | *(one of)*                      |                                                     |
162 | | `-f`, `--feature-id` `<ID>`     | Extract by a single feature ID                      |
163 | | `-e`, `--feature-file` `<FILE>` | Extract by a text file listing multiple feature IDs |
164 | 
165 | Optional
166 | | Option                    | Description                                                                     |
167 | | ------------------------- | ------------------------------------------------------------------------------- |
168 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                              |
169 | | `-e`, `--entire-group`    | Enable the "entire-group" mode. Return entire gene models or feature groups     |
170 | |                           | for all matched features, instead of only the directly matched features.        |
171 | | `-T`, `--types` `<TYPES>` | Filter output to include only features of specified types (e.g., `gene,exon`)   |
172 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                 |
173 | | `-V`, `--verbose`         | Enable verbose output                                                           |
174 | | `-h`, `--help`            | Show help message                                                               |
175 | 
176 | ---
177 | 
178 | ### `search`
179 | 
180 | Searches for features using a specified attribute value and retrieves the full annotation models.
181 | 
182 | ```bash
183 | gffx search -a geneX -i input.gff
184 | ```
185 | 
186 | **Options:**
187 | 
188 | Required
189 | | Option                       | Description                                              |
190 | | ---------------------------- | -------------------------------------------------------- |
191 | | `-i`, `--input` `<FILE>`     | Input GFF file path                                      |
192 | | *(one of)*                   |                                                          |
193 | | `-a`, `--attr` `<VALUE>`     | Search a single attribute value/pattern                  |
194 | | `-A`, `--attr-list` `<FILE>` | Search attribute values/patterns defined in a text file  |
195 | 
196 | Optional
197 | | Option                    | Description                                                                     |
198 | | ------------------------- | ------------------------------------------------------------------------------- |
199 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                              |
200 | | `-e`, `--entire-group`    | Enable the "entire-group" mode. Return entire gene models or feature groups     |
201 | |                           | for all matched features, instead of only the directly matched features.        |
202 | | `-r`, `--regex`           | Enable regex matching for attribute values                                      |
203 | | `-T`, `--types` `<TYPES>` | Filter output to include only features of specified types (e.g., `gene,exon`)   |
204 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                 |
205 | | `-V`, `--verbose`         | Enable verbose output                                                           |
206 | | `-h`, `--help`            | Show help message                                                               |
207 | 
208 | ---
209 | 
210 | ### `coverage`
211 | 
212 | Compute coverage breadth across genomic features.
213 | 
214 | ```bash
215 | gffx coverage -i input.gff -s source.bam
216 | ```
217 | 
218 | **Options:**
219 | 
220 | Required
221 | | Option                    | Description                                |
222 | | ------------------------- | ------------------------------------------ |
223 | | `-i`, `--input` `<FILE>`  | Input GFF file path                        |
224 | | `-s`, `--source` `<FILE>` | Source file in BAM/SAM/CRAM or BED format  |
225 | 
226 | Optional
227 | | Option                    | Description                         |
228 | | ------------------------- | ----------------------------------- |
229 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)  |
230 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]     |
231 | | `-v`, `--verbose`         | Enable verbose output               |
232 | | `-h`, `--help`            | Show help message                   |
233 | 
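The output is a tab-separated table with one row per feature ID. The six columns below are the ones emitted by `write_breadth_results` in `src/commands/coverage.rs`; the data row is purely illustrative:

```text
id	chr	start	end	breadth	fraction
gene0001	chr1	11873	14409	1530	0.603312
```

`fraction` is `breadth / (end - start)`, i.e. the fraction of the feature's span covered by at least one read or interval.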
234 | ---
235 | 
236 | ### `depth`
237 | 
238 | Compute coverage depth across genomic features.
239 | 
240 | ```bash
241 | gffx depth -i input.gff -s source.bam
242 | ```
243 | 
244 | **Options:**
245 | 
246 | Required
247 | | Option                    | Description                                |
248 | | ------------------------- | ------------------------------------------ |
249 | | `-i`, `--input` `<FILE>`  | Input GFF file path                        |
250 | | `-s`, `--source` `<FILE>` | Source file in BAM/SAM/CRAM or BED format  |
251 | 
252 | Optional
253 | | Option                    | Description                                                                    |
254 | | ------------------------- | ------------------------------------------------------------------------------ |
255 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)                                             |
256 | | `--bin-shift` `<NUM>`     | Bin width parameter (2^k bp) for spatial bucketing of features and queries.    |
257 | |                           | Choose k so that a typical read and feature span ~1–2 bins [default: 12]       |
258 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]                                                |
259 | | `-v`, `--verbose`         | Enable verbose output                                                          |
260 | | `-h`, `--help`            | Show help message                                                              |
261 | 
262 | ---
263 | 
264 | ### `sample`
265 | 
266 | Randomly downsample feature groups.
267 | 
268 | ```bash
269 | gffx sample -i input.gff -r 0.33
270 | ```
271 | 
272 | **Options:**
273 | 
274 | Required
275 | | Option                    | Description                                  |
276 | | ------------------------- | -------------------------------------------- |
277 | | `-i`, `--input` `<FILE>`  | Input GFF file path                          |
278 | | `-r`, `--ratio` `<NUM>`   | Downsampling ratio; must be between 0 and 1  |
279 | 
280 | Optional
281 | | Option                    | Description                         |
282 | | ------------------------- | ----------------------------------- |
283 | | `-o`, `--output` `<FILE>` | Output file path (default: stdout)  |
284 | | `-t`, `--threads` `<NUM>` | Number of threads [default: 12]     |
285 | | `-V`, `--verbose`         | Enable verbose output               |
286 | | `-h`, `--help`            | Show help message                   |
287 | 
288 | ---
289 | 
290 | ## Example Use Cases
291 | 
292 | ```bash
293 | # Build index
294 | gffx index -i genes.gff -a gene_name
295 | 
296 | # Extract all features overlapping with a region
297 | gffx intersect --region chr1:10000-20000 -i genes.gff -o out.gff
298 | 
299 | # Extract models from a list of gene IDs
300 | gffx extract --feature-file genes.txt -i genes.gff -o subset.gff
301 | 
302 | # Search by gene name and extract the full model
303 | gffx search -a TP53 -i genes.gff -o tp53_model.gff
304 | 
305 | ```
306 | 
307 | ---
308 | 
309 | ## Using GFFx as a Rust Library
310 | 
311 | You can use GFFx as a Rust library in your own project.
312 | 
313 | ### Add to Cargo.toml
314 | 
315 | ```toml
316 | [dependencies]
317 | gffx = "0.4"  # check crates.io for the latest version
318 | ```
319 | 
320 | ### Example: Manually extract a feature's full model using the index files
321 | The following example defines a helper you can call from a `main() -> Result<()>` context:
322 | 
323 | ```rust
324 | use anyhow::{Result, bail};
325 | use std::path::Path;
326 | use gffx::{load_fts, load_prt, load_gof, write_gff_output}; // index APIs (see "Available Public APIs")
327 | 
328 | pub fn extract_one_id_full_model(      // Define a minimal single-ID full-model extractor
329 |     gff_path: &Path,                   // Path to the input GFF file
330 |     feature_id: &str,                  // Target feature ID (string form)
331 |     out_path: &Path                    // Path to the output GFF file
332 | ) -> Result<()> {                      // Return anyhow::Result for error propagation
333 |     let fts = load_fts(gff_path)?;     // Load feature table: maps string IDs <-> numeric fids
334 |     let prt = load_prt(gff_path)?;     // Load parent relations: map child fid -> root fid
335 |     let gof = load_gof(gff_path)?;     // Load offsets: map root fid -> byte ranges in the file
336 | 
337 |     let (fid_set, missing) = fts.map_fnames_to_fids( // Map the feature name to a numeric fid via the batch API
338 |         std::iter::once(feature_id.to_string()).collect(), // Build a one-element set of the feature name
339 |         1                              // Use 1 thread since this is a single lookup
340 |     );
341 |     if !missing.is_empty() {           // Bail out early if the ID is absent from the index
342 |         bail!("feature ID not found: {feature_id}");
343 |     }
344 | 
345 |     let fid = *fid_set.iter().next().unwrap();      // Extract the only fid from the set
346 | 
347 |     let root = prt.map_fids_to_roots(&[fid], 1)[0]; // Map fid -> root fid using the batch API with a single item
348 | 
349 |     let blocks = gof.roots_to_offsets(&[root], 1);  // Convert the root fid to file block offsets (full model span)
350 | 
351 |     write_gff_output(                  // Write the entire blocks (full-model output, no filtering)
352 |         gff_path,                      // Input GFF file path
353 |         &blocks,                       // Byte-range blocks to emit
354 |         out_path,                      // Output file path
355 |         false                          // Verbose: false for minimal logging
356 |     )?;
357 | 
358 |     Ok(())                             // Return success
359 | }
360 | ```
361 | 
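Calling the helper is then a one-liner. A minimal sketch, reusing the imports from the example above; the file names and feature ID are hypothetical:

```rust
fn main() -> anyhow::Result<()> {
    // Extract the full model containing "gene0001" from genes.gff into model.gff
    extract_one_id_full_model(Path::new("genes.gff"), "gene0001", Path::new("model.gff"))
}
```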
362 | ---
363 | 
364 | ## Available Public APIs
365 | 
366 | ### Index building & checking (`index_builder`)
367 | - `build_index`
368 | 
369 | ### Index loading (`index_loader`)
370 | - `load_gof`, `load_prt`, `load_fts`, `load_atn`, `load_a2f`, `load_sqs`
371 | - `safe_mmap_readonly`
372 | - `GofMap`, `PrtMap`, `FtsMap`, `A2fMap`
373 | 
374 | ### Interval querying data structures (`utils::serial_interval_trees`)
375 | - `IntervalTree`, `Interval`
376 | - `save_multiple_trees`, `write_offsets_to_file`
377 | 
378 | ### Other utilities (`utils::common`)
379 | - `CommonArgs`, `append_suffix`
380 | - `write_gff_output`, `write_gff_output_filtered`
381 | - `check_index_files_exist`
382 | 
383 | ---
384 | 
385 | 
386 | ## Index File Types
387 | 
388 | | File Extension | Purpose                                                 |
389 | |----------------|---------------------------------------------------------|
390 | | `.gof`         | Byte offset index for GFF feature blocks                |
391 | | `.fts`         | Feature ID table                                        |
392 | | `.prt`         | Child-to-parent mapping                                 |
393 | | `.a2f`         | Attribute-to-feature ID mapping                         |
394 | | `.atn`         | Attribute value table                                   |
395 | | `.sqs`         | Sequence ID table                                       |
396 | | `.rit`         | Interval tree index                                     |
397 | | `.rix`         | Byte offset index for interval trees in the `.rit` file |
398 | 
399 | ---
400 | 
401 | ## Notes
402 | 
403 | - Make sure you run `gffx index` before using query subcommands such as `intersect`, `extract`, `search`, `coverage`, and `depth`.
404 | 
405 | ---
406 | 
407 | ## License
408 | 
409 | GFFx is released under the MIT or Apache-2.0 license.
410 | 
411 | ---
412 | 
413 | ## Citation
414 | 
415 | If you use **GFFx**, please cite our paper in *GigaScience*:
416 | 
417 | Baohua Chen, Dongya Wu, Guojie Zhang, GFFx: A Rust-based suite of utilities for ultra-fast genomic feature extraction, GigaScience, Volume 14, 2025, giaf124, https://doi.org/10.1093/gigascience/giaf124
418 | 
419 | ---
--------------------------------------------------------------------------------
/RELEASENOTE.md:
--------------------------------------------------------------------------------
 1 | # GFFx v0.4.0 Release Notes
 2 | 
 3 | Added two new subcommands: `coverage`, which computes breadth of coverage, and `depth`, which computes depth of coverage over the features of a GFF file, from BAM/SAM/CRAM or BED input.
 4 | 
 5 | Added a `sample` subcommand for randomly downsampling feature groups from each chromosome at equal ratios.
 6 | 
 7 | Updated the module organization and source directory layout to conform to Rust 2024 edition guidelines for module visibility (`pub`) and path imports.
--------------------------------------------------------------------------------
/extract.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/Baohua-Chen/GFFx/9e889a9eee5536547e63f3fc7ad63306bb8f3b7c/extract.png
--------------------------------------------------------------------------------
/src/commands.rs:
--------------------------------------------------------------------------------
 1 | pub mod index;
 2 | pub mod extract;
 3 | pub mod intersect;
 4 | pub mod search;
 5 | pub mod coverage;
 6 | pub mod depth;
 7 | pub mod sample;
 8 | 
 9 | pub use index::{IndexArgs, run as run_index};
10 | pub use extract::{ExtractArgs, run as run_extract};
11 | pub use intersect::{IntersectArgs, run as run_intersect};
12 | pub use search::{SearchArgs, run as run_search};
13 | pub use coverage::{CoverageArgs, run as run_coverage};
14 | pub use depth::{DepthArgs, run as run_depth};
15 | pub use sample::{SampleArgs, run as run_sample};
--------------------------------------------------------------------------------
/src/commands/coverage.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::{Result, bail, Context};
  2 | use rayon::prelude::*;
  3 | use memmap2::Mmap;
  4 | use rust_htslib::bam::{self, Read};
  5 | use rustc_hash::{FxHashMap, FxHashSet};
  6 | use std::{
  7 |     str,
  8 |     fs::File,
  9 |     path::{Path, PathBuf},
 10 |     io::{BufWriter, Write},
 11 | };
 12 | use clap::Parser;
 13 | use crate::{
 14 |     Interval, TreeIndexData, load_gof, GofMap,
 15 | };
 16 | use std::time::{Instant, Duration};
 17 | use rust_htslib::bam::ext::BamRecordExtensions;
 18 | 
 19 | const MISSING: u64 = u64::MAX; // Sentinel for missing entries
 20 | 
 21 | // Output write buffer size in bytes (32 MiB)
 22 | const WRITE_BUF_SIZE: usize = 32 * 1024 * 1024;
 23 | 
 24 | /// Compute bin index for coordinate `x` given shift k (unused in the new pipeline).
 25 | #[inline]
 26 | fn _bin_of(x: u32, shift: u32) -> u32 {
 27 |     let _ = shift;
 28 |     x
 29 | }
 30 | 
 31 | /// Arguments
 32 | #[derive(Parser, Debug)]
 33 | #[command(
 34 |     about = "Compute coverage breadth across genomic features.",
 35 |     long_about = "This tool computes sequencing coverage breadth and fraction from high-throughput sequencing (HTS) alignment files (SAM/BAM/CRAM) or user-specified genomic intervals (BED)."
 36 | )]
 37 | pub struct CoverageArgs {
 38 |     /// GFF file path (indexed via GOF)
 39 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
 40 |     pub input: PathBuf,
 41 | 
 42 |     /// Source: BAM/SAM/CRAM or BED
 43 |     #[arg(short = 's', long)]
 44 |     pub source: PathBuf,
 45 | 
 46 |     /// Output file (stdout if not provided)
 47 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
 48 |     pub output: Option<PathBuf>,
 49 | 
 50 |     /// Number of threads
 51 |     #[arg(short = 't', long = "threads", default_value_t = 12, value_name = "NUM")]
 52 |     pub threads: usize,
 53 | 
 54 |     /// Verbose logs
 55 |     #[arg(short = 'v', long = "verbose", default_value_t = false, value_name = "BOOL")]
 56 |     pub verbose: bool,
 57 | }
 58 | 
 59 | /// Fast u32 parse
 60 | #[inline(always)]
 61 | fn parse_u32_fast(s: &str) -> Option<u32> {
 62 |     if s.is_empty() { return None; }
 63 |     let mut n: u32 = 0;
 64 |     for b in s.as_bytes() {
 65 |         let d = b.wrapping_sub(b'0');
 66 |         if d > 9 { return None; }
 67 |         n = n.checked_mul(10)?.checked_add(d as u32)?;
 68 |     }
 69 |     Some(n)
 70 | }
 71 | 
 72 | /// Extract `ID=` from GFF attributes quickly
 73 | #[inline(always)]
 74 | fn fast_id(attrs: &str) -> Option<&str> {
 75 |     let bytes = attrs.as_bytes();
 76 |     let mut i = 0;
 77 |     while i + 2 < bytes.len() {
 78 |         if bytes[i] == b'I' && bytes[i+1] == b'D' && bytes[i+2] == b'=' {
 79 |             let mut j = i + 3;
 80 |             while j < bytes.len() && bytes[j] != b';' && bytes[j] != b' ' && bytes[j] != b'\t' {
 81 |                 j += 1;
 82 |             }
 83 |             return std::str::from_utf8(&bytes[i+3..j]).ok();
 84 |         }
 85 |         i += 1;
 86 |     }
 87 |     None
 88 | }
 89 | 
 90 | /// Disjoint union of intervals assumed to be half-open [s, e)
 91 | /// Input may be unsorted and overlapping; output is sorted, non-overlapping.
 92 | fn merge_intervals(mut ivs: Vec<(u32,u32)>) -> Vec<(u32,u32)> {
 93 |     if ivs.is_empty() { return ivs; }
 94 |     ivs.sort_unstable_by_key(|x| x.0);
 95 |     let mut out: Vec<(u32,u32)> = Vec::with_capacity(ivs.len());
 96 |     let (mut cs, mut ce) = ivs[0];
 97 |     for (s,e) in ivs.into_iter().skip(1) {
 98 |         if s <= ce {
 99 |             if e > ce { ce = e; }
100 |         } else {
101 |             out.push((cs, ce));
102 |             cs = s; ce = e;
103 |         }
104 |     }
105 |     out.push((cs, ce));
106 |     out
107 | }
108 | 
109 | /// Union length of intervals (half-open). Input may be unsorted.
110 | fn union_len(mut ivs: Vec<(u32,u32)>) -> usize {
111 |     if ivs.is_empty() { return 0; }
112 |     ivs.sort_unstable_by_key(|x| x.0);
113 |     let mut total: usize = 0;
114 |     let (mut cs, mut ce) = ivs[0];
115 |     for (s, e) in ivs.into_iter().skip(1) {
116 |         if s <= ce { ce = ce.max(e); }
117 |         else { total += (ce - cs) as usize; cs = s; ce = e; }
118 |     }
119 |     total += (ce - cs) as usize;
120 |     total
121 | }
122 | 
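// Editorial sketch: a quick sanity check of the two interval helpers above,
// illustrating their intended semantics; these tests are not part of the
// original file.
#[cfg(test)]
mod interval_tests {
    use super::*;

    #[test]
    fn merge_coalesces_overlapping_intervals() {
        // (0,3) and (2,6) overlap, and (5,10) overlaps the merged run, so all coalesce.
        assert_eq!(merge_intervals(vec![(5, 10), (0, 3), (2, 6)]), vec![(0, 10)]);
    }

    #[test]
    fn union_len_counts_each_base_once() {
        // [0,3) and [2,6) together cover 6 bases; [8,9) adds 1 more.
        assert_eq!(union_len(vec![(0, 3), (2, 6), (8, 9)]), 7);
    }
}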
123 | /// Collect coverage intervals per root (from BAM/SAM/CRAM).
124 | /// We DO NOT read GFF slices here; only group regions by root_fid.
125 | fn collect_by_root_from_bam(
126 |     bam_path: &Path,
127 |     index_data: &TreeIndexData,
128 |     verbose: bool,
129 |     threads: usize,
130 | ) -> Result<FxHashMap<u32, Vec<(u32, u32)>>> {
131 |     let t_open = Instant::now();
132 |     let mut reader = bam::Reader::from_path(bam_path)?;
133 |     reader.set_threads(std::cmp::max(2, threads))?;
134 |     let t_open_elapsed = t_open.elapsed();
135 | 
136 |     let header = reader.header().to_owned();
137 | 
138 |     // Build tid -> chr_id mapping
139 |     let t_map_build = Instant::now();
140 |     let mut tid2num: Vec<Option<u32>> = Vec::with_capacity(header.target_count() as usize);
141 |     for tid in 0..header.target_count() {
142 |         let chrom_bytes = header.tid2name(tid).to_owned();
143 |         let chrom = std::str::from_utf8(&chrom_bytes)?;
144 |         let chr_id = index_data.seqid_to_num.get(chrom).copied();
145 |         tid2num.push(chr_id);
146 |     }
147 |     let t_map_build_elapsed = t_map_build.elapsed();
148 | 
149 |     // Storage: root_fid -> list of raw intervals (to be merged later)
150 |     let mut by_root: FxHashMap<u32, Vec<(u32, u32)>> = FxHashMap::default();
151 | 
152 |     let mut t_parse = Duration::ZERO;
153 |     let mut t_tidmap = Duration::ZERO;
154 |     let t_tree = Duration::ZERO; // Tree-query time is currently folded into the parse+map timer below
155 | 
156 |     let mut hits: Vec<&Interval> = Vec::new();
157 |     for r in reader.records() {
158 |         let t0 = Instant::now();
159 |         let rec = r?;
160 |         if rec.is_unmapped() {
161 |             t_parse += t0.elapsed();
162 |             continue;
163 |         }
164 |         t_parse += t0.elapsed();
165 | 
166 |         let t1 = Instant::now();
167 |         let tid = rec.tid();
168 |         if tid < 0 {
169 |             t_tidmap += t1.elapsed();
170 |             continue;
171 |         }
172 |         if let Some(chr_id) = tid2num[tid as usize] {
173 |             let start_i64 = rec.pos();
174 |             let end_i64 = rec.reference_end();
175 |             if start_i64 >= 0 && end_i64 > start_i64 {
176 |                 let start = (start_i64 as i128).clamp(0, u32::MAX as i128) as u32;
177 |                 let end = (end_i64 as i128).clamp(0, u32::MAX as i128) as u32;
178 | 
179 |                 // Query candidate roots for this region
180 |                 if let Some(tree) = index_data.chr_entries.get(&chr_id) {
181 |                     hits.clear();
182 |                     tree.query_interval(start, end, &mut hits);
183 |                     // De-duplicate roots within a single region
184 |                     let mut seen_in_region: FxHashSet<u32> = FxHashSet::default();
185 |                     for h in &hits {
186 |                         if seen_in_region.insert(h.root_fid) {
187 |                             by_root.entry(h.root_fid).or_default().push((start, end));
188 |                         }
189 |                     }
190 |                 }
191 |             }
192 |         }
193 |         t_tidmap += t1.elapsed();
194 |     }
195 | 
196 |     if verbose {
197 |         eprintln!("[TIMER] (1) Opening BAM: {:.2?}", t_open_elapsed);
198 |         eprintln!("[TIMER] (1b) Build tid2num: {:.2?}", t_map_build_elapsed);
199 |         eprintln!("[TIMER] (2) Parse+map records: {:.2?}", t_parse + t_tidmap);
200 |         eprintln!("[TIMER] (3) Tree queries: {:.2?}", t_tree);
201 |         eprintln!("[INFO] Collected {} roots with coverage", by_root.len());
202 |     }
203 | 
204 |     Ok(by_root)
205 | }
206 | 
207 | /// Collect coverage intervals per root (from BED).
208 | fn collect_by_root_from_bed(
209 |     bed_path: &Path,
210 |     index_data: &TreeIndexData,
211 |     verbose: bool,
212 | ) -> Result<FxHashMap<u32, Vec<(u32, u32)>>> {
213 |     // mmap the entire BED file
214 |     let file = File::open(bed_path)?;
215 |     let mmap = unsafe { Mmap::map(&file)? };
216 |     let data = &mmap[..];
217 | 
218 |     if verbose {
219 |         eprintln!("[INFO] mmap BED file: {} bytes", data.len());
220 |     }
221 | 
222 |     // collect line offsets
223 |     let mut line_offsets = Vec::with_capacity(1_000_000);
224 |     line_offsets.push(0usize);
225 |     for (i, &b) in data.iter().enumerate() {
226 |         if b == b'\n' {
227 |             line_offsets.push(i + 1);
228 |         }
229 |     }
230 |     if *line_offsets.last().unwrap_or(&0) != data.len() {
231 |         line_offsets.push(data.len());
232 |     }
233 | 
234 |     let mut by_root: FxHashMap<u32, Vec<(u32, u32)>> = FxHashMap::default();
235 |     let mut hits: Vec<&Interval> = Vec::new();
236 | 
237 |     for w in line_offsets.windows(2) {
238 |         let start = w[0];
239 |         let end = w[1];
240 |         if start >= end { continue; }
241 |         let line = &data[start..end];
242 |         if line.is_empty() || line[0] == b'#' { continue; }
243 | 
244 |         let fields: Vec<&[u8]> = line
245 |             .split(|&b| b == b'\t' || b == b' ')
246 |             .filter(|f| !f.is_empty())
247 |             .collect();
248 |         if fields.len() < 3 { continue; }
249 | 
250 |         let chrom = match std::str::from_utf8(fields[0]) { Ok(s) => s, Err(_) => continue };
251 |         let s = match std::str::from_utf8(fields[1]).ok().and_then(|x| x.parse::<u32>().ok()) { Some(v) => v, None => continue };
252 |         let e = match std::str::from_utf8(fields[2]).ok().and_then(|x| x.trim_end().parse::<u32>().ok()) { Some(v) => v, None => continue };
253 |         if s >= e { continue; }
254 | 
255 |         let Some(&chr_num) = index_data.seqid_to_num.get(chrom) else { continue };
256 | 
257 |         if let Some(tree) = index_data.chr_entries.get(&chr_num) {
258 |             hits.clear();
259 |             tree.query_interval(s, e, &mut hits);
260 |             let mut seen_in_region: FxHashSet<u32> = FxHashSet::default();
261 |             for h in &hits {
262 |                 if seen_in_region.insert(h.root_fid) {
263 |                     by_root.entry(h.root_fid).or_default().push((s, e));
264 |                 }
265 |             }
266 |         }
267 |     }
268 | 
269 |     if verbose {
270 |         eprintln!("[INFO] Collected {} roots with coverage (BED)", by_root.len());
271 |     }
272 | 
273 |     Ok(by_root)
274 | }
275 | 
276 | /// Compute breadth for all features within a root using pre-merged disjoint coverage.
277 | fn compute_breadth_for_root(
278 |     gff_slice: &[u8],
279 |     cov_merged: &[(u32,u32)], // sorted, non-overlapping coverage intervals
280 | ) -> FxHashMap<String, (String, u32, u32, usize)> {
281 |     #[derive(Clone)]
282 |     struct FeatLine {
283 |         id_idx: u32,
284 |         start0: u32, // 0-based inclusive
285 |         end0: u32,   // 0-based exclusive
286 |     }
287 | 
288 |     // Map ID string -> index
289 |     let mut id_to_idx: FxHashMap<String, u32> = FxHashMap::default();
290 |     let mut id_strings: Vec<String> = Vec::new();
291 |     let mut id_chrom: Vec<String> = Vec::new();
292 | 
293 |     // Collect feature lines (within this root)
294 |     let mut lines: Vec<FeatLine> = Vec::new();
295 | 
296 |     if let Ok(text) = str::from_utf8(gff_slice) {
297 |         for line in text.split_terminator('\n') {
298 |             if line.is_empty() || line.as_bytes()[0] == b'#' { continue; }
299 |             let mut cols = line.splitn(9, '\t');
300 |             let (Some(seqid), _, _, Some(start_s), Some(end_s), _, _, _, Some(attrs)) = (
301 |                 cols.next(), cols.next(), cols.next(),
302 |                 cols.next(), cols.next(), cols.next(),
303 |                 cols.next(), cols.next(), cols.next()
304 |             ) else { continue; };
305 | 
306 |             let (Some(s1), Some(e1)) = (parse_u32_fast(start_s), parse_u32_fast(end_s)) else { continue; };
307 |             if e1 == 0 { continue; }
308 |             let (s1, e1) = if s1 > e1 { (e1, s1) } else { (s1, e1) };
309 |             // GFF is 1-based, inclusive end; convert to half-open 0-based
310 |             let fstart0 = s1.saturating_sub(1);
311 |             let fend0 = e1;
312 | 
313 |             if let Some(id) = fast_id(attrs) {
314 |                 let idx = *id_to_idx.entry(id.to_owned()).or_insert_with(|| {
315 |                     let k = id_strings.len() as u32;
316 |                     id_strings.push(id.to_owned());
317 |                     id_chrom.push(seqid.to_owned());
318 |                     k
319 |                 });
320 |                 lines.push(FeatLine { id_idx: idx, start0: fstart0, end0: fend0 });
321 |             }
322 |         }
323 |     }
324 | 
325 |     if lines.is_empty() || cov_merged.is_empty() {
326 |         return FxHashMap::default();
327 |     }
328 | 
329 |     // Sort features by start for two-pointer sweep against cov_merged
330 |     lines.sort_unstable_by_key(|x| x.start0);
331 | 
332 |     let mut min_s: Vec<u32> = vec![u32::MAX; id_strings.len()];
333 |     let mut max_e: Vec<u32> = vec![0; id_strings.len()];
334 |     // For each ID, collect overlaps with coverage (we'll union at the end per ID)
335 |     let mut id_overlap: Vec<Vec<(u32, u32)>> = vec![Vec::new(); id_strings.len()];
336 | 
337 |     // Two-pointer scan: iterate features in order, and advance cov pointer monotonically
338 |     let mut j = 0usize;
339 |     for fl in &lines {
340 |         // Advance coverage pointer until cov[j].end <= feature.start
341 |         while j < cov_merged.len() && cov_merged[j].1 <= fl.start0 {
342 |             j += 1;
343 |         }
344 |         // Record feature span extents for this ID
345 |         if fl.start0 < min_s[fl.id_idx as usize] { min_s[fl.id_idx as usize] = fl.start0; }
346 |         if fl.end0 > max_e[fl.id_idx as usize] { max_e[fl.id_idx as usize] = fl.end0; }
347 | 
348 |         // Walk through all coverage intervals that might overlap this feature
349 |         let mut k = j;
350 |         while k < cov_merged.len() && cov_merged[k].0 < fl.end0 {
351 |             let s = fl.start0.max(cov_merged[k].0);
352 |             let e = fl.end0.min(cov_merged[k].1);
353 |             if e > s {
354 |                 id_overlap[fl.id_idx as usize].push((s, e));
355 |             }
356 |             if cov_merged[k].1 <= fl.end0 {
357 |                 k += 1;
358 |             } else {
359 |                 break;
360 |             }
361 |         }
362 |     }
363 | 
364 |     // Finalize per-ID breadth (union of all overlap pieces), and produce outputs
365 |     let mut out: FxHashMap<String, (String, u32, u32, usize)> = FxHashMap::default();
366 |     for i in 0..id_strings.len() {
367 |         let length = if max_e[i] > min_s[i] { (max_e[i] - min_s[i]) as usize } else { 0 };
368 |         let breadth = union_len(std::mem::take(&mut id_overlap[i]));
369 |         if length > 0 || breadth > 0 {
370 |             out.insert(
371 |                 id_strings[i].clone(),
372 |                 (id_chrom[i].clone(), min_s[i], max_e[i], breadth),
373 |             );
374 |         }
375 |     }
376 |     out
377 | }
378 | 
379 | /// After collecting raw intervals per root:
380 | /// 1) Merge (union) them into disjoint intervals;
381 | /// 2) Parse GFF slice for that root;
382 | /// 3) Compute breadth/fraction for each feature under this root.
383 | fn finalize_compute_breadth(
384 |     by_root_raw: FxHashMap<u32, Vec<(u32, u32)>>,
385 |     gof: &GofMap,
386 |     gff_mmap: &Mmap,
387 |     threads: usize,
388 |     verbose: bool,
389 | ) -> Result<FxHashMap<String, (String, u32, u32, usize)>> {
390 |     let gff_bytes: &[u8] = &gff_mmap[..];
391 |     let idx = gof.index_cached();
392 | 
393 |     if by_root_raw.is_empty() {
394 |         return Ok(FxHashMap::default());
395 |     }
396 | 
397 |     let roots_iter = by_root_raw.into_iter();
398 | 
399 |     // Parallel per-root processing if threads > 1
400 |     let partials: Vec<FxHashMap<String, (String, u32, u32, usize)>> = if threads > 1 {
401 |         roots_iter.par_bridge().map(|(root, ivs)| {
402 |             // Merge coverage intervals for this root
403 |             let cov = merge_intervals(ivs);
404 |             // Locate GFF slice for this root
405 |             match idx.get(&root) {
406 |                 Some(&(s_off, e_off)) if s_off != MISSING && e_off != MISSING && e_off > s_off => {
407 |                     let su = usize::try_from(s_off).unwrap();
408 |                     let eu = usize::try_from(e_off).unwrap();
409 |                     compute_breadth_for_root(&gff_bytes[su..eu], &cov)
410 |                 }
411 |                 _ => FxHashMap::default(),
412 |             }
413 |         }).collect()
414 |     } else {
415 |         let mut v = Vec::new();
416 |         for (root, ivs) in roots_iter {
417 |             let cov = merge_intervals(ivs);
418 |             match idx.get(&root) {
419 |                 Some(&(s_off, e_off)) if s_off != MISSING && e_off != MISSING && e_off > s_off => {
420 |                     let su = usize::try_from(s_off).unwrap();
421 |                     let eu = usize::try_from(e_off).unwrap();
422 |                     v.push(compute_breadth_for_root(&gff_bytes[su..eu], &cov));
423 |                 }
424 |                 _ => v.push(FxHashMap::default()),
425 |             }
426 |         }
427 |         v
428 |     };
429 | 
430 |     // Merge per-root maps into global results
431 |     let mut global: FxHashMap<String, (String, u32, u32, usize)> = FxHashMap::default();
432 |     for m in partials {
433 |         for (id, (chrom, s, e, b)) in m {
434 |             global.entry(id).and_modify(|(c0, s0, e0, breadth)| {
435 |                 if s < *s0 { *s0 = s; }
436 |                 if e > *e0 { *e0 = e; }
437 |                 // Note: If the same ID appears under multiple roots (rare), breadth is summed.
438 |                 // In well-formed GFF partitioning, one ID should belong to a single root.
439 |                 *breadth += b;
440 |                 let _ = c0;
441 |             }).or_insert((chrom, s, e, b));
442 |         }
443 |     }
444 | 
445 |     if verbose {
446 |         eprintln!("[INFO] Aggregated {} feature IDs", global.len());
447 |     }
448 | 
449 |     Ok(global)
450 | }
451 | 
452 | /// Write "id\tchr\tstart\tend\tbreadth\tfraction" per line.
453 | pub fn write_breadth_results<W: Write>(
454 |     id_map: FxHashMap<String, (String, u32, u32, usize)>,
455 |     mut out: W,
456 |     verbose: bool,
457 | ) -> Result<()> {
458 |     use std::fmt::Write as FmtWrite;
459 |     let mut buf = String::with_capacity(WRITE_BUF_SIZE);
460 |     let mut written = 0usize;
461 | 
462 |     // Header: 6 columns
463 |     writeln!(buf, "id\tchr\tstart\tend\tbreadth\tfraction")?;
464 | 
465 |     for (id, (chr, start, end, breadth)) in id_map {
466 |         let length = end.saturating_sub(start) as usize;
467 |         let fraction = if length > 0 {
468 |             breadth as f64 / length as f64
469 |         } else {
470 |             0.0
471 |         };
472 |         writeln!(buf, "{id}\t{chr}\t{start}\t{end}\t{breadth}\t{:.6}", fraction)?;
473 |         written += 1;
474 | 
475 |         if buf.len() >= WRITE_BUF_SIZE {
476 |             out.write_all(buf.as_bytes())?;
477 |             buf.clear();
478 |         }
479 |     }
480 | 
481 |     if !buf.is_empty() {
482 |         out.write_all(buf.as_bytes())?;
483 |     }
484 |     out.flush()?;
485 | 
486 |     if verbose {
487 |         eprintln!("[INFO] Wrote {written} feature coverage rows.");
488 |     }
489 |     Ok(())
490 | }
491 | 
492 | /// Main
493 | pub fn run(args: &CoverageArgs) -> Result<()> {
494 |     let verbose = args.verbose;
495 |     let threads = if args.threads == 0 {
496 |         std::thread::available_parallelism()
497 |             .map(|n| n.get())
498 |             .unwrap_or(1)
499 |     } else {
500 |         args.threads
501 |     };
502 |     let _ = rayon::ThreadPoolBuilder::new().num_threads(threads).build_global();
503 |     let gff_path = &args.input;
504 | 
505 |     // Step 1: load GOF index + mmap GFF
506 |     let t0 = Instant::now();
507 |     let gof = load_gof(gff_path)?;
508 |     let file = File::open(gff_path).with_context(|| format!("Cannot open GFF file: {:?}", gff_path))?;
509 |     let gff_mmap = unsafe { Mmap::map(&file) }.with_context(|| format!("GFF mmap failed for {:?}", gff_path))?;
510 |     let t_load_fts = t0.elapsed();
511 |     if verbose {
512 |         eprintln!("[TIMER] [run] Step 1: Load GOF & mmap GFF: {:.2?}", t_load_fts);
513 |     }
514 | 
515 |     // Step 2: load interval tree index
516 |     let t1 = Instant::now();
517 |     let index_data = TreeIndexData::load_tree_index(gff_path)?;
518 |     let t_build_index = t1.elapsed();
519 |     if verbose {
520 |         eprintln!("[TIMER] [run] Step 2: Load tree index: {:.2?}", t_build_index);
521 |     }
522 | 
523 |     // Step 3: collect coverage intervals per root
524 |     let t2 = Instant::now();
525 |     let source_path = &args.source;
526 | 
527 |     let ext = source_path
528 |         .extension()
529 |         .and_then(|s| s.to_str())
530 |         .map(|s| s.to_lowercase());
531 | 
532 |     let by_root = match ext.as_deref() {
533 |         Some("bam") | Some("sam") | Some("cram") => {
534 |             collect_by_root_from_bam(source_path.as_path(), &index_data, verbose, threads)?
535 |         }
536 |         Some("bed") => {
537 |             collect_by_root_from_bed(source_path.as_path(), &index_data, verbose)?
538 |         }
539 |         _ => {
540 |             bail!(
541 |                 "Unsupported file type: {:?}. Expected .bam/.sam/.cram or .bed",
542 |                 source_path
543 |             );
544 |         }
545 |     };
546 |     let t_collect = t2.elapsed();
547 |     if verbose {
548 |         eprintln!("[TIMER] [run] Step 3: Collect intervals: {:.2?}", t_collect);
549 |     }
550 | 
551 |     // Step 4: per-root merge & compute breadth over GFF slices
552 |     let t3 = Instant::now();
553 |     let id_map = finalize_compute_breadth(by_root, &gof, &gff_mmap, threads, verbose)?;
554 |     let t_compute = t3.elapsed();
555 |     if verbose {
556 |         eprintln!("[TIMER] [run] Step 4: Compute breadth: {:.2?}", t_compute);
557 |     }
558 | 
559 |     // Step 5: write results
560 |     let t4 = Instant::now();
561 | 
562 |     let out: Box<dyn Write> = match &args.output {
563 |         Some(path) => {
564 |             let file = File::create(path)?;
565 |             Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, file))
566 |         }
567 |         None => {
568 |             let stdout = std::io::stdout();
569 |             let handle = stdout.lock();
570 |             Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, handle))
571 |         }
572 |     };
573 |     write_breadth_results(id_map, out, verbose)?;
574 |     let t_write_out = t4.elapsed();
575 |     if verbose {
576 |         eprintln!("[TIMER] [run] Step 5: Write output: {:.2?}", t_write_out);
577 |         let total = t0.elapsed();
578 |         eprintln!("[TIMER] [run] Total time: {:.2?}", total);
579 |     }
580 | 
581 |     Ok(())
582 | }
583 | 
--------------------------------------------------------------------------------
/src/commands/depth.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::{Result, bail, Context};
  2 | use rayon::prelude::*;
  3 | use memmap2::Mmap;
  4 | use rust_htslib::bam::{self, Read};
  5 | use rust_htslib::bam::ext::BamRecordExtensions;
  6 | use rustc_hash::{FxHashMap, FxHashSet};
  7 | use std::{
  8 |     str,
  9 |     fs::File,
 10 |     path::{Path, PathBuf},
 11 |     io::{BufWriter, Write},
 12 | };
 13 | use clap::Parser;
 14 | use crate::{
 15 |     Interval, TreeIndexData, load_gof, GofMap,
 16 | };
 17 | use std::time::{Instant, Duration};
 18 | 
 19 | // Sentinel for missing entries
 20 | const MISSING: u64 = u64::MAX;
 21 | // Output write buffer size in bytes (32 MiB)
 22 | const WRITE_BUF_SIZE: usize = 32 * 1024 * 1024;
 23 | // Number of records per processing batch
 24 | const BATCH_SIZE: usize = 100_000;
 25 | 
 26 | /// Compute bin index for coordinate `x` given shift k.
 27 | /// Each bin has width 2^k bp. Smaller k = finer bins, larger k = coarser bins.
 28 | #[inline]
 29 | fn bin_of(x: u32, shift: u32) -> u32 {
 30 |     x >> shift
 31 | }
 32 | 
 33 | /// Arguments for `depth` command
 34 | #[derive(Parser, Debug)]
 35 | #[command(
 36 |     about = "Compute coverage depth across genomic features",
 37 |     long_about = "This tool computes sequencing depth (number of overlapping regions/reads per feature) \
 38 |                   from SAM/BAM/CRAM or BED input. It does not compute breadth/fraction coverage."
 39 | )]
 40 | pub struct DepthArgs {
 41 |     /// Input GFF file path
 42 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
 43 |     pub input: PathBuf,
 44 | 
 45 |     /// Input source (BAM/SAM/CRAM or BED)
 46 |     #[arg(short = 's', long)]
 47 |     pub source: PathBuf,
 48 | 
 49 |     /// Output file (stdout if not provided)
 50 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
 51 |     pub output: Option<PathBuf>,
 52 | 
 53 |     /// Bin width parameter (2^k bp) for spatial bucketing of features and queries.
 54 |     /// Choose k so that a typical read and feature span ~1–2 bins.
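    /// For example, with the default k = 12 each bin spans 2^12 = 4096 bp, so a
    /// 150 bp read starting at position 10,000 lies entirely in bin 2
    /// (10_000 >> 12 == 2), as does any feature ending below position 12,288.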
55 |     ///
56 |     /// Typical values:
57 |     ///   Short reads (Illumina 100–150 bp): k=10–11 (1–2 kb bins)
58 |     ///   PacBio HiFi (15–20 kb):            k=13–14 (8–16 kb bins)
59 |     ///   ONT long reads (30–60 kb):         k=14–15 (16–32 kb bins)
60 |     ///
61 |     /// Adjust:
62 |     ///   Longer features → increase k (larger bins, less fragmentation).
63 |     ///   Denser features → decrease k (smaller bins, stronger filtering).
64 |     #[arg(long = "bin-shift", default_value_t = 12)]
65 |     pub bin_shift: u32,
66 |
67 |     /// Number of threads for parallel processing
68 |     #[arg(short = 't', long = "threads", default_value_t = 12)]
69 |     pub threads: usize,
70 |
71 |     /// Enable verbose output
72 |     #[arg(short = 'v', long = "verbose", default_value_t = false)]
73 |     pub verbose: bool,
74 | }
75 |
76 | /// Check half-open overlap: [a1, a2) vs [b1, b2)
77 | #[inline(always)]
78 | fn overlaps(a1: u32, a2: u32, b1: u32, b2: u32) -> bool {
79 |     let left = if a1 > b1 { a1 } else { b1 };
80 |     let right = if a2 < b2 { a2 } else { b2 };
81 |     left < right
82 | }
83 |
84 | /// Parse unsigned int quickly into u32
85 | #[inline(always)]
86 | fn parse_u32_fast(s: &str) -> Option<u32> {
87 |     if s.is_empty() { return None; }
88 |     let mut n: u32 = 0;
89 |     for b in s.as_bytes() {
90 |         let d = b.wrapping_sub(b'0');
91 |         if d > 9 { return None; }
92 |         n = n.checked_mul(10)?.checked_add(d as u32)?;
93 |     }
94 |     Some(n)
95 | }
96 |
97 | #[derive(Clone, Copy)]
98 | struct RegionRef { start: u32, end: u32 }
99 |
100 | #[derive(Clone, Copy)]
101 | struct FeatureInst { start: u32, end: u32, id_idx: u32 }
102 |
103 | #[inline(always)]
104 | fn fast_id(attrs: &str) -> Option<&str> {
105 |     let bytes = attrs.as_bytes();
106 |     let mut i = 0;
107 |     while i + 2 < bytes.len() {
108 |         if bytes[i] == b'I' && bytes[i+1] == b'D' && bytes[i+2] == b'=' {
109 |             let mut j = i + 3;
110 |             while j < bytes.len() && bytes[j] != b';' && bytes[j] != b' ' && bytes[j] != b'\t' {
111 |                 j += 1;
112 |             }
113 |             return std::str::from_utf8(&bytes[i+3..j]).ok();
114 |         }
115 |         i += 1;
116 |     }
117 |     None
118 | }
119 |
120 | /// Parse one GFF slice and count feature *depth* (how many regions overlap it; deduped per region)
121 | fn compute_root_depth(
122 |     gff_slice: &[u8],
123 |     regions: &[RegionRef],
124 |     bin_shift: u32,
125 | ) -> FxHashMap<String, (String, u32, u32, u32)> {
126 |     let mut id_to_idx: FxHashMap<String, u32> = FxHashMap::default();
127 |     let mut id_strings: Vec<String> = Vec::new();
128 |     let mut id_chrom: Vec<String> = Vec::new();
129 |     let mut feats: Vec<FeatureInst> = Vec::new();
130 |
131 |     if let Ok(text) = str::from_utf8(gff_slice) {
132 |         for line in text.split_terminator('\n') {
133 |             if line.is_empty() || line.as_bytes()[0] == b'#' { continue; }
134 |             let mut cols = line.splitn(9, '\t');
135 |             let (Some(seqid), _, _, Some(start_s), Some(end_s), _, _, _, Some(attrs)) = (
136 |                 cols.next(), cols.next(), cols.next(),
137 |                 cols.next(), cols.next(), cols.next(),
138 |                 cols.next(), cols.next(), cols.next()
139 |             ) else { continue; };
140 |
141 |             let (Some(s1), Some(e1)) = (parse_u32_fast(start_s), parse_u32_fast(end_s)) else { continue; };
142 |             if e1 == 0 { continue; }
143 |             let (s1, e1) = if s1 > e1 { (e1, s1) } else { (s1, e1) };
144 |             let fstart0 = s1.saturating_sub(1);
145 |             let fend0 = e1;
146 |
147 |             if let Some(id) = fast_id(attrs) {
148 |                 let idx = *id_to_idx.entry(id.to_owned()).or_insert_with(|| {
149 |                     let k = id_strings.len() as u32;
150 |                     id_strings.push(id.to_owned());
151 |                     id_chrom.push(seqid.to_owned());
152 |                     k
153 |                 });
154 |                 feats.push(FeatureInst { start: fstart0, end: fend0, id_idx: idx });
155 |             }
156 |         }
157 |     }
158 |
159 |     if feats.is_empty() || regions.is_empty() {
160 |         return FxHashMap::default();
161 |     }
162 |
163 |     let max_feat_bin = feats.iter().map(|f| bin_of(f.end.saturating_sub(1), bin_shift)).max().unwrap_or(0);
164 |     let mut feat_bins: Vec<Vec<u32>> = vec![Vec::new(); (max_feat_bin as usize) + 1];
165 |     for (i, f) in feats.iter().enumerate() {
166 |         let b0 = bin_of(f.start, bin_shift);
167 |         let b1 = bin_of(f.end.saturating_sub(1), bin_shift);
168 |         for b in b0..=b1 {
169 |             feat_bins[b as usize].push(i as u32);
170 |         }
171 |     }
172 |
173 |     let n_ids = id_strings.len();
174 |     let mut min_s: Vec<u32> = vec![u32::MAX; n_ids];
175 |     let mut max_e: Vec<u32> = vec![0; n_ids];
176 |     let mut depths: Vec<u32> = vec![0; n_ids];
177 |
178 |     let mut cand: Vec<u32> = Vec::new();
179 |     let mut hit_ids: Vec<u32> = Vec::new();
180 |
181 |     for r in regions {
182 |         cand.clear();
183 |         let rb0 = bin_of(r.start, bin_shift);
184 |         let rb1 = bin_of(r.end.saturating_sub(1), bin_shift);
185 |         for b in rb0..=rb1 {
186 |             if let Some(v) = feat_bins.get(b as usize) {
187 |                 cand.extend_from_slice(v);
188 |             }
189 |         }
190 |         cand.sort_unstable();
191 |         cand.dedup();
192 |
193 |         hit_ids.clear();
194 |         for &fi in &cand {
195 |             let f = feats[fi as usize];
196 |             if overlaps(f.start, f.end, r.start, r.end) {
197 |                 hit_ids.push(f.id_idx);
198 |                 if f.start < min_s[f.id_idx as usize] { min_s[f.id_idx as usize] = f.start; }
199 |                 if f.end > max_e[f.id_idx as usize] { max_e[f.id_idx as usize] = f.end; }
200 |             }
201 |         }
202 |         hit_ids.sort_unstable();
203 |         hit_ids.dedup();
204 |         for &ii in &hit_ids {
205 |             depths[ii as usize] += 1;
206 |         }
207 |     }
208 |
209 |     let mut out: FxHashMap<String, (String, u32, u32, u32)> = FxHashMap::default();
210 |     for (i, &d) in depths.iter().enumerate() {
211 |         if d > 0 {
212 |             let s = if min_s[i] == u32::MAX { 0 } else { min_s[i] };
213 |             out.insert(id_strings[i].clone(), (id_chrom[i].clone(), s, max_e[i], d));
214 |         }
215 |     }
216 |     out
217 | }
218 |
219 | /// Batch API: for a batch of regions, return "feature ID -> (chrom, start, end, depth)".
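///
/// A minimal usage sketch (hypothetical region values; `index_data`, `gof`,
/// and `gff_mmap` are assumed to be loaded as in `run` below):
///
/// ```ignore
/// let regions = vec![(0u32, 1_000, 2_000), (0u32, 1_500, 2_500)];
/// let depths = compute_hit_depth(&index_data, &regions, &gof, &gff_mmap, 12, 4)?;
/// for (id, (chrom, start, end, depth)) in &depths {
///     eprintln!("{id}\t{chrom}\t{start}\t{end}\t{depth}");
/// }
/// ```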
220 | /// 221 | /// - depth = how many regions overlap with the feature (count of regions, deduped per region) 222 | pub fn compute_hit_depth( 223 | index_data: &TreeIndexData, 224 | regions: &[(u32, u32, u32)], 225 | gof: &GofMap, 226 | gff_mmap: &Mmap, 227 | bin_shift: u32, 228 | threads: usize, 229 | ) -> Result> { 230 | let mut by_root: FxHashMap> = FxHashMap::default(); 231 | let idx = gof.index_cached(); 232 | let gff_bytes: &[u8] = &gff_mmap[..]; 233 | 234 | let mut hits: Vec<&Interval> = Vec::new(); 235 | for &(chr, rstart, rend) in regions { 236 | if let Some(tree) = index_data.chr_entries.get(&chr) { 237 | hits.clear(); 238 | tree.query_interval(rstart, rend, &mut hits); 239 | let mut seen_in_region: FxHashSet = FxHashSet::default(); 240 | for h in &hits { 241 | if !seen_in_region.insert(h.root_fid) { continue; } 242 | if let Some(&(s_off, e_off)) = idx.get(&h.root_fid) { 243 | if s_off == MISSING || e_off == MISSING || e_off <= s_off { continue; } 244 | by_root.entry(h.root_fid).or_default().push(RegionRef { start: rstart, end: rend }); 245 | } 246 | } 247 | } 248 | } 249 | 250 | let mut out: FxHashMap = FxHashMap::default(); 251 | if by_root.is_empty() { return Ok(out); } 252 | 253 | let roots_iter = by_root.into_iter(); 254 | if threads > 1 { 255 | // Parallel execution: each root slice is processed independently 256 | let partials: Vec<_> = roots_iter.par_bridge().map(|(root, regs)| { 257 | let (s_off, e_off) = *idx.get(&root).unwrap(); 258 | let su = usize::try_from(s_off).unwrap(); 259 | let eu = usize::try_from(e_off).unwrap(); 260 | compute_root_depth(&gff_bytes[su..eu], ®s, bin_shift) 261 | }).collect(); 262 | 263 | // Merge results from all roots 264 | for m in partials { 265 | for (id, (chrom, s, e, d)) in m { 266 | out.entry(id).and_modify(|(c0, s0, e0, depth)| { 267 | if s < *s0 { *s0 = s; } 268 | if e > *e0 { *e0 = e; } 269 | *depth += d; 270 | let _ = c0; 271 | }).or_insert((chrom, s, e, d)); 272 | } 273 | } 274 | } else { 275 | // Serial execution 276 | for (root, regs) in roots_iter { 277 | let (s_off, e_off) = *idx.get(&root).unwrap(); 278 | let su = usize::try_from(s_off).unwrap(); 279 | let eu = usize::try_from(e_off).unwrap(); 280 | let m = compute_root_depth(&gff_bytes[su..eu], ®s, bin_shift); 281 | for (id, (chrom, s, e, d)) in m { 282 | out.entry(id).and_modify(|(c0, s0, e0, depth)| { 283 | if s < *s0 { *s0 = s; } 284 | if e > *e0 { *e0 = e; } 285 | *depth += d; 286 | let _ = c0; 287 | }).or_insert((chrom, s, e, d)); 288 | } 289 | } 290 | } 291 | 292 | Ok(out) 293 | } 294 | 295 | /// Process BAM input with mmap/htslib and batch queries. 296 | /// Returns: "feature ID -> (chrom, start, end, depth)". 
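///
/// Records are converted to 0-based half-open intervals (`pos()`..`reference_end()`)
/// and buffered into batches of `BATCH_SIZE` regions; each full batch is resolved
/// through `compute_hit_depth` and merged into the global map, so peak memory
/// scales with the batch size rather than with the whole BAM.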
297 | pub fn process_bam( 298 | bam_path: &Path, 299 | index_data: &TreeIndexData, 300 | gof: GofMap, 301 | gff_mmap: Mmap, 302 | bin_shift: u32, 303 | threads: usize, 304 | verbose: bool, 305 | ) -> Result> { 306 | let mut global_id_counts: FxHashMap = FxHashMap::default(); 307 | 308 | let t_open = Instant::now(); 309 | let mut reader = bam::Reader::from_path(bam_path)?; 310 | reader.set_threads(std::cmp::max(2, threads))?; 311 | let t_open_elapsed = t_open.elapsed(); 312 | 313 | let header = reader.header().to_owned(); 314 | 315 | // Build tid -> chr_id mapping 316 | let t_map_build = Instant::now(); 317 | let mut tid2num: Vec> = Vec::with_capacity(header.target_count() as usize); 318 | for tid in 0..header.target_count() { 319 | let chrom_bytes = header.tid2name(tid).to_owned(); 320 | let chrom = std::str::from_utf8(&chrom_bytes)?; 321 | let chr_id = index_data.seqid_to_num.get(chrom).copied(); 322 | tid2num.push(chr_id); 323 | } 324 | let t_map_build_elapsed = t_map_build.elapsed(); 325 | 326 | let mut batch: Vec<(u32, u32, u32)> = Vec::with_capacity(BATCH_SIZE); 327 | 328 | // Timers 329 | let mut t_parse = Duration::ZERO; 330 | let mut t_tidmap = Duration::ZERO; 331 | let mut t_filtermap = Duration::ZERO; 332 | let mut t_tree = Duration::ZERO; 333 | let mut t_depthmap = Duration::ZERO; 334 | 335 | for r in reader.records() { 336 | let t0 = Instant::now(); 337 | let rec = r?; 338 | if rec.is_unmapped() { 339 | t_parse += t0.elapsed(); 340 | continue; 341 | } 342 | t_parse += t0.elapsed(); 343 | 344 | let t1 = Instant::now(); 345 | let tid = rec.tid(); 346 | if tid < 0 { 347 | t_tidmap += t1.elapsed(); 348 | continue; 349 | } 350 | if let Some(chr_id) = tid2num[tid as usize] { 351 | let start_i64 = rec.pos(); 352 | if start_i64 < 0 { 353 | t_tidmap += t1.elapsed(); 354 | continue; 355 | } 356 | let end_i64 = rec.reference_end(); 357 | if end_i64 <= start_i64 { 358 | t_tidmap += t1.elapsed(); 359 | continue; 360 | } 361 | let start = (start_i64 as i128).clamp(0, u32::MAX as i128) as u32; 362 | let end = (end_i64 as i128).clamp(0, u32::MAX as i128) as u32; 363 | 364 | batch.push((chr_id, start, end)); 365 | } 366 | t_tidmap += t1.elapsed(); 367 | 368 | if batch.len() >= BATCH_SIZE { 369 | let t2 = Instant::now(); 370 | let regions = std::mem::take(&mut batch); 371 | t_filtermap += t2.elapsed(); 372 | 373 | let t3 = Instant::now(); 374 | let id_counts = compute_hit_depth(index_data, ®ions, &gof, &gff_mmap, bin_shift, threads)?; 375 | t_tree += t3.elapsed(); 376 | 377 | let t4 = Instant::now(); 378 | for (id, (chrom, s, e, d)) in id_counts { 379 | global_id_counts.entry(id).and_modify(|(c0, s0, e0, depth)| { 380 | if s < *s0 { *s0 = s; } 381 | if e > *e0 { *e0 = e; } 382 | *depth += d; 383 | let _ = c0; 384 | }).or_insert((chrom, s, e, d)); 385 | } 386 | t_depthmap += t4.elapsed(); 387 | batch.clear(); 388 | } 389 | } 390 | 391 | if !batch.is_empty() { 392 | let t2 = Instant::now(); 393 | let regions = std::mem::take(&mut batch); 394 | t_filtermap += t2.elapsed(); 395 | 396 | let t3 = Instant::now(); 397 | let id_counts = compute_hit_depth(index_data, ®ions, &gof, &gff_mmap, bin_shift, threads)?; 398 | t_tree += t3.elapsed(); 399 | 400 | let t4 = Instant::now(); 401 | for (id, (chrom, s, e, d)) in id_counts { 402 | global_id_counts.entry(id).and_modify(|(c0, s0, e0, depth)| { 403 | if s < *s0 { *s0 = s; } 404 | if e > *e0 { *e0 = e; } 405 | *depth += d; 406 | let _ = c0; 407 | }).or_insert((chrom, s, e, d)); 408 | } 409 | t_depthmap += t4.elapsed(); 410 | } 411 | 412 | if verbose { 413 | 
eprintln!("[TIMER] (1) Opening BAM file: {:.2?}", t_open_elapsed); 414 | eprintln!("[TIMER] (1b) Build tid2num map: {:.2?}", t_map_build_elapsed); 415 | eprintln!("[TIMER] (2) Parsing records: {:.2?}", t_parse); 416 | eprintln!("[TIMER] (3) Chrom ID mapping: {:.2?}", t_tidmap); 417 | eprintln!("[TIMER] (4) Batch filter_map: {:.2?}", t_filtermap); 418 | eprintln!("[TIMER] (5) Interval tree query: {:.2?}", t_tree); 419 | eprintln!("[TIMER] (6) DepthMap updates: {:.2?}", t_depthmap); 420 | } 421 | 422 | Ok(global_id_counts) 423 | } 424 | 425 | /// Process BED input with mmap, parallel line parsing, and batch queries. 426 | /// 427 | /// Returns: "feature ID -> (chrom, start, end, depth)". 428 | /// - depth = number of regions overlapping the feature 429 | pub fn process_bed( 430 | bed_path: &Path, 431 | index_data: &TreeIndexData, 432 | gof: GofMap, 433 | gff_mmap: Mmap, 434 | bin_shift: u32, 435 | threads: usize, 436 | verbose: bool, 437 | ) -> Result> { 438 | let mut global_id_counts: FxHashMap = FxHashMap::default(); 439 | 440 | // mmap the entire BED file 441 | let file = File::open(bed_path)?; 442 | let mmap = unsafe { Mmap::map(&file)? }; 443 | let data = &mmap[..]; 444 | 445 | if verbose { 446 | eprintln!("[INFO] mmap BED file: {} bytes", data.len()); 447 | } 448 | 449 | // collect line offsets 450 | let mut line_offsets = Vec::with_capacity(1_000_000); 451 | line_offsets.push(0usize); 452 | for (i, &b) in data.iter().enumerate() { 453 | if b == b'\n' { 454 | line_offsets.push(i + 1); 455 | } 456 | } 457 | if *line_offsets.last().unwrap_or(&0) != data.len() { 458 | line_offsets.push(data.len()); 459 | } 460 | 461 | // process chunks in batches 462 | for chunk in line_offsets.windows(2).collect::>().chunks(BATCH_SIZE) { 463 | // parse BED lines into (chr_id, start, end) 464 | let regions: Vec<(u32, u32, u32)> = chunk 465 | .par_iter() 466 | .filter_map(|w| { 467 | let start = w[0]; 468 | let end = w[1]; 469 | if start >= end { 470 | return None; 471 | } 472 | let line = &data[start..end]; 473 | if line.is_empty() || line[0] == b'#' { 474 | return None; 475 | } 476 | 477 | let fields: Vec<&[u8]> = line 478 | .split(|&b| b == b'\t' || b == b' ') 479 | .filter(|f| !f.is_empty()) 480 | .collect(); 481 | if fields.len() < 3 { 482 | return None; 483 | } 484 | 485 | let chrom = std::str::from_utf8(fields[0]).ok()?; 486 | let s = std::str::from_utf8(fields[1]).ok()?.parse::().ok()?; 487 | let e = std::str::from_utf8(fields[2]).ok()?.trim_end().parse::().ok()?; 488 | if s >= e { 489 | return None; 490 | } 491 | 492 | let &chr_num = index_data.seqid_to_num.get(chrom)?; 493 | Some((chr_num, s, e)) 494 | }) 495 | .collect(); 496 | 497 | // compute depth only 498 | let id_counts = compute_hit_depth(index_data, ®ions, &gof, &gff_mmap, bin_shift, threads)?; 499 | 500 | // merge into global results 501 | for (id, (chrom, s, e, d)) in id_counts { 502 | global_id_counts.entry(id).and_modify(|(c0, s0, e0, depth)| { 503 | if s < *s0 { *s0 = s; } 504 | if e > *e0 { *e0 = e; } 505 | *depth += d; 506 | let _ = c0; // chrom assumed consistent 507 | }).or_insert((chrom, s, e, d)); 508 | } 509 | } 510 | 511 | Ok(global_id_counts) 512 | } 513 | 514 | /// Write "id\tchr\tstart\tend\tdepth" per line to output file. 
515 | pub fn write_depth_results( 516 | id_counts: FxHashMap, 517 | mut out: W, 518 | verbose: bool, 519 | ) -> Result<()> { 520 | use std::fmt::Write as FmtWrite; 521 | let mut buf = String::with_capacity(WRITE_BUF_SIZE); 522 | let mut written = 0usize; 523 | 524 | writeln!(buf, "id\tchr\tstart\tend\tdepth")?; 525 | 526 | for (id, (chr, start, end, depth)) in id_counts { 527 | writeln!(buf, "{id}\t{chr}\t{start}\t{end}\t{depth}")?; 528 | written += 1; 529 | 530 | if buf.len() >= WRITE_BUF_SIZE { 531 | out.write_all(buf.as_bytes())?; 532 | buf.clear(); 533 | } 534 | } 535 | 536 | if !buf.is_empty() { 537 | out.write_all(buf.as_bytes())?; 538 | } 539 | out.flush()?; 540 | 541 | if verbose { 542 | eprintln!("[INFO] Wrote {written} ID depth records"); 543 | } 544 | Ok(()) 545 | } 546 | 547 | /// Main entry for depth pipeline 548 | pub fn run(args: &DepthArgs) -> Result<()> { 549 | let verbose = args.verbose; 550 | let bin_shift = args.bin_shift; 551 | let threads = if args.threads == 0 { 552 | std::thread::available_parallelism() 553 | .map(|n| n.get()) 554 | .unwrap_or(1) 555 | } else { 556 | args.threads 557 | }; 558 | let _ = rayon::ThreadPoolBuilder::new().num_threads(threads).build_global(); 559 | let gff_path = &args.input; 560 | 561 | // Step 1: load GFF index 562 | let t0 = Instant::now(); 563 | let gof = load_gof(&gff_path)?; 564 | let file = File::open(gff_path).with_context(|| format!("Cannot open GFF file: {:?}", gff_path))?; 565 | let gff_mmap = unsafe { Mmap::map(&file) }.with_context(|| format!("GFF mmap failed for {:?}", gff_path))?; 566 | let t_load_fts = t0.elapsed(); 567 | if verbose { 568 | eprintln!("[TIMER] [run] Step 1: Loading index took {:.2?}", t_load_fts); 569 | } 570 | 571 | // Step 2: build interval index 572 | let t1 = Instant::now(); 573 | let index_data = TreeIndexData::load_tree_index(&gff_path)?; 574 | let t_build_index = t1.elapsed(); 575 | if verbose { 576 | eprintln!("[TIMER] [run] Step 2: Building tree index took {:.2?}", t_build_index); 577 | } 578 | 579 | // Step 3: process input file 580 | let t2 = Instant::now(); 581 | let source_path = &args.source; 582 | 583 | let ext = source_path 584 | .extension() 585 | .and_then(|s| s.to_str()) 586 | .map(|s| s.to_lowercase()); 587 | 588 | let id_counts = match ext.as_deref() { 589 | Some("bam") | Some("sam") | Some("cram") => { 590 | process_bam(source_path.as_path(), &index_data, gof, gff_mmap, bin_shift, threads, verbose)? 591 | } 592 | Some("bed") => { 593 | process_bed(source_path.as_path(), &index_data, gof, gff_mmap, bin_shift, threads, verbose)? 594 | } 595 | _ => { 596 | bail!( 597 | "Unsupported file type: {:?}. 
Expected .bam/.sam/.cram or .bed", 598 | source_path 599 | ); 600 | } 601 | }; 602 | let t_process_input = t2.elapsed(); 603 | if verbose { 604 | eprintln!("[TIMER] [run] Step 3: Processing input took {:.2?}", t_process_input); 605 | } 606 | 607 | // Step 4: write results 608 | let t3 = Instant::now(); 609 | 610 | let out: Box = match &args.output { 611 | Some(path) => { 612 | let file = File::create(path)?; 613 | Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, file)) 614 | } 615 | None => { 616 | let stdout = std::io::stdout(); 617 | let handle = stdout.lock(); 618 | Box::new(BufWriter::with_capacity(WRITE_BUF_SIZE, handle)) 619 | } 620 | }; 621 | 622 | write_depth_results(id_counts, out, verbose)?; 623 | 624 | let t_write_out = t3.elapsed(); 625 | if verbose { 626 | eprintln!("[TIMER] [run] Step 4: Writing results took {:.2?}", t_write_out); 627 | } 628 | 629 | if verbose { 630 | let total = t0.elapsed(); 631 | eprintln!("[TIMER] [run] Total pipeline time: {:.2?}", total); 632 | } 633 | 634 | Ok(()) 635 | } -------------------------------------------------------------------------------- /src/commands/extract.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | CommonArgs, load_fts, load_gof, load_prt, write_gff_output, write_gff_output_filtered 3 | }; 4 | use anyhow::{Context, Result, bail}; 5 | use clap::Parser; 6 | use rustc_hash::{FxHashMap, FxHashSet}; 7 | use std::{ 8 | fs::File, 9 | io::{BufRead, BufReader}, 10 | path::PathBuf, 11 | time::Instant, 12 | }; 13 | 14 | 15 | /// Extract subtrees from a GFF file by a list of feature names (from --feature-file). 16 | #[derive(Parser, Debug)] 17 | #[command( 18 | about = "Extract models by feature IDs", 19 | long_about = "This tool extracts features and their parent models by feature IDs" 20 | )] 21 | #[clap(group( 22 | clap::ArgGroup::new("feature") 23 | .required(true) 24 | .args(&["feature_file", "feature_id"]) 25 | ))] 26 | pub struct ExtractArgs { 27 | #[clap(flatten)] 28 | pub common: CommonArgs, 29 | 30 | #[arg(short = 'f', long, group = "feature")] 31 | pub feature_id: Option, 32 | 33 | #[arg(short = 'F', long, group = "feature")] 34 | pub feature_file: Option, 35 | } 36 | 37 | pub fn run(args: &ExtractArgs) -> Result<()> { 38 | let gff_path = &args.common.input; 39 | 40 | // Start overall timer 41 | let overall_start = Instant::now(); 42 | let verbose = args.common.verbose; 43 | if verbose { 44 | eprintln!("[DEBUG] Starting processing of {:?}", gff_path); 45 | eprintln!( 46 | "[DEBUG] Thread pool initialized with {} threads", 47 | args.common.effective_threads() 48 | ); 49 | } 50 | 51 | // Load features 52 | let fts = load_fts(gff_path)?; 53 | 54 | // Load parent relations 55 | let prt = load_prt(gff_path)?; 56 | 57 | // Load GFF offsets 58 | let gof = load_gof(gff_path)?; 59 | 60 | // Read feature string IDs (feature name) 61 | let feature_names: FxHashSet = if let Some(ref file_path) = args.feature_file { 62 | let file = File::open(file_path) 63 | .with_context(|| format!("Cannot open feature list: {:?}", file_path))?; 64 | let reader = BufReader::new(file); 65 | reader.lines().try_fold( 66 | FxHashSet::default(), 67 | |mut set, line| -> Result, std::io::Error> { 68 | let s = line?; 69 | let s = s.trim(); 70 | if !s.is_empty() { 71 | set.insert(s.to_owned()); 72 | } 73 | Ok(set) 74 | }, 75 | )? 
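        // Each non-empty, whitespace-trimmed line contributes one candidate
        // feature ID; duplicates collapse automatically in the FxHashSet.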
76 | } else if let Some(ref single_id) = args.feature_id { 77 | [single_id.clone()].into_iter().collect() 78 | } else { 79 | bail!("Either --feature-id (-f) or --feature-file (-F) must be specified"); 80 | }; 81 | 82 | // Phase A: group matches by root 83 | // Phase A: map feature names to numeric fids 84 | let (fids_set, missing) = fts.map_fnames_to_fids( 85 | &feature_names, 86 | args.common.effective_threads() 87 | ); 88 | if !missing.is_empty() { 89 | eprintln!("[WARN] {} feature IDs not found: {:?}", missing.len(), missing); 90 | } 91 | 92 | // Convert set to vec for alignment with roots 93 | let fid_vec: Vec = fids_set.iter().copied().collect(); 94 | 95 | // Use PrtMap fast resolver to map fid -> root (u32::MAX = invalid) 96 | let threads = args.common.effective_threads(); 97 | let roots_vec: Vec = prt.map_fids_to_roots(&fid_vec, threads); 98 | 99 | // Collect invalid fids (print once) and exclude invalid roots 100 | let mut invalid_fids: Vec = fid_vec.iter() 101 | .zip(roots_vec.iter()) 102 | .filter_map(|(&fid, &r)| if r == u32::MAX { Some(fid) } else { None }) 103 | .collect(); 104 | invalid_fids.sort_unstable(); 105 | invalid_fids.dedup(); 106 | if !invalid_fids.is_empty() { 107 | eprintln!( 108 | "[WARN] {} numeric feature IDs are invalid (out-of-range child or parent), skipped: {:?}", 109 | invalid_fids.len(), invalid_fids 110 | ); 111 | } 112 | 113 | // Deduplicate valid roots (exclude u32::MAX) 114 | let mut roots: Vec = roots_vec.iter().copied().filter(|&r| r != u32::MAX).collect(); 115 | roots.sort_unstable(); 116 | roots.dedup(); 117 | 118 | // Phase B: roots -> block offsets 119 | let blocks: Vec<(u32, u64, u64)> = gof.roots_to_offsets(&roots, args.common.effective_threads()); 120 | 121 | if !args.common.entire_group || args.common.types.is_some() { 122 | // Build per_root_matches: root_id -> set of STRING feature IDs 123 | let mut per_root_matches: FxHashMap> = FxHashMap::default(); 124 | per_root_matches.reserve(roots.len()); 125 | 126 | // roots_vec[i] is the root, fid_vec[i] is the numeric fid 127 | for (i, &root) in roots_vec.iter().enumerate() { 128 | if root == u32::MAX { 129 | continue; 130 | } 131 | // Convert numeric fid -> string ID only once here 132 | if let Some(id_str) = fts.ids.get(fid_vec[i] as usize) { 133 | per_root_matches.entry(root).or_default().insert(id_str.clone()); 134 | } 135 | } 136 | 137 | // Emit only exactly matched lines within blocks 138 | write_gff_output_filtered( 139 | gff_path, 140 | &blocks, 141 | &per_root_matches, 142 | "ID", 143 | &args.common.output, 144 | args.common.types.as_deref(), 145 | verbose, 146 | )?; 147 | } else { 148 | // Entire-group mode: emit entire blocks without filtering 149 | write_gff_output( 150 | gff_path, 151 | &blocks, 152 | &args.common.output, 153 | verbose, 154 | )?; 155 | } 156 | 157 | if verbose { 158 | eprintln!("[timing] Total elapsed: {:?}", overall_start.elapsed()); 159 | } 160 | 161 | Ok(()) 162 | } 163 | 164 | -------------------------------------------------------------------------------- /src/commands/index.rs: -------------------------------------------------------------------------------- 1 | use crate::build_index; 2 | use anyhow::Result; 3 | use clap::Parser; 4 | use std::path::PathBuf; 5 | 6 | #[derive(Parser, Debug)] 7 | #[command( 8 | about = "Build index for GFF file", 9 | long_about = "This command builds index files for fast retrieval from a GFF file." 
10 | )] 11 | pub struct IndexArgs { 12 | #[arg(short, long)] 13 | input: PathBuf, 14 | 15 | #[arg(short, long, default_value = "gene_name")] 16 | pub attribute: String, 17 | 18 | #[arg(short, long, default_value = "remark,note,comment,region,gap,assembly_gap,contig,scaffold,source")] 19 | pub skip_types: String, 20 | 21 | #[arg(short, long, default_value_t = false)] 22 | verbose: bool, 23 | } 24 | 25 | pub fn run(args: &IndexArgs) -> Result<()> { 26 | if args.verbose { 27 | println!("Indexing: {}", args.input.display()); 28 | } 29 | 30 | build_index(&args.input, &args.attribute, &args.skip_types, args.verbose)?; 31 | 32 | if args.verbose { 33 | println!("Index created successfully."); 34 | } 35 | 36 | Ok(()) 37 | } 38 | -------------------------------------------------------------------------------- /src/commands/intersect.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; 2 | use clap::{ArgGroup, Parser}; 3 | use lexical_core::parse; 4 | use memchr::memchr; 5 | use memmap2::Mmap; 6 | use rayon::prelude::*; 7 | use rustc_hash::{FxHashMap, FxHashSet}; 8 | use std::{ 9 | fs::File, 10 | io::{self, BufWriter, IoSlice, Write}, 11 | path::{Path, PathBuf}, 12 | }; 13 | 14 | use crate::{ 15 | CommonArgs, Interval, TreeIndexData, load_gof, write_gff_output, 16 | }; 17 | 18 | const MISSING: u64 = u64::MAX; // Set sentinel value for missing entries 19 | 20 | /// Number of IoSlices per batch writer 21 | const IOV_BATCH: usize = 256; 22 | /// BufWriter buffer size 23 | const WRITE_BUF_SIZE: usize = 32 * 1024 * 1024; 24 | 25 | #[derive(Debug, Clone)] 26 | pub struct RootMatched { 27 | pub root: u32, 28 | pub matched: Vec, 29 | } 30 | 31 | /// Arguments for region intersection operations 32 | #[derive(Parser, Debug)] 33 | #[command( 34 | about = "Extract models by a region or regions from a BED file", 35 | long_about = "This tool extracts features and their parent models that intersect with specified regions" 36 | )] 37 | #[clap(group( 38 | ArgGroup::new("regions").required(true).args(&["region", "bed"]) 39 | ))] 40 | #[clap(group( 41 | ArgGroup::new("mode").args(&["contained", "contains_region", "overlap"]) 42 | ))] 43 | pub struct IntersectArgs { 44 | #[clap(flatten)] 45 | pub common: CommonArgs, 46 | 47 | /// Single region in format "chr:start-end" 48 | #[arg(short = 'r', long, group = "regions")] 49 | pub region: Option, 50 | 51 | /// BED file containing regions 52 | #[arg(short = 'b', long, group = "regions")] 53 | pub bed: Option, 54 | 55 | /// Only return features fully contained within regions 56 | #[arg(short = 'c', long, group = "mode")] 57 | pub contained: bool, 58 | 59 | /// Only return features that fully contain the regions 60 | #[arg(short = 'C', long, group = "mode")] 61 | pub contains_region: bool, 62 | 63 | /// Return any overlapping features (default) 64 | #[arg(short = 'O', long, group = "mode")] 65 | pub overlap: bool, 66 | 67 | /// Invert the selection (exclude matching features) 68 | #[arg(short = 'I', long, default_value_t = false)] 69 | pub invert: bool, 70 | } 71 | 72 | /// Overlap detection modes 73 | #[derive(Debug, Clone, Copy)] 74 | pub enum OverlapMode { 75 | Contained, 76 | ContainsRegion, 77 | Overlap, 78 | } 79 | 80 | pub fn gff_type_allowed(line: &[u8], allow: &FxHashSet) -> bool { 81 | // Fast parse the 3rd field (type) without allocations 82 | let mut off = 0usize; 83 | let mut tabs = 0u8; 84 | while tabs < 2 { 85 | match memchr(b'\t', &line[off..]) { 86 | Some(i) => { 87 | off += i + 1; 88 | tabs += 
1; 89 | } 90 | None => return false, 91 | } 92 | } 93 | let i2 = match memchr(b'\t', &line[off..]) { 94 | Some(i) => off + i, 95 | None => return false, 96 | }; 97 | let ty = &line[off..i2]; 98 | match std::str::from_utf8(ty) { 99 | Ok(s) => allow.contains(s), 100 | Err(_) => false, 101 | } 102 | } 103 | 104 | /// Core feature query logic using interval trees 105 | pub fn query_features( 106 | index_data: &TreeIndexData, 107 | regions: &[(u32, u32, u32)], 108 | mode: OverlapMode, 109 | invert: bool, 110 | verbose: bool, 111 | ) -> Result> { 112 | 113 | // Bucket regions by chromosome 114 | let buckets: Vec> = { 115 | let mut b = vec![Vec::new(); index_data.seqid_to_num.len()]; 116 | for (chr, start, end) in regions.iter().copied() { 117 | b[chr as usize].push((chr, start, end)); 118 | } 119 | b 120 | }; 121 | 122 | let mut results = Vec::new(); 123 | { 124 | for (&seq_num, tree) in &index_data.chr_entries { 125 | let chr_regs = &buckets[seq_num as usize]; 126 | if chr_regs.is_empty() { 127 | continue; 128 | } 129 | if verbose { 130 | eprintln!( 131 | "[DEBUG] Querying chromosome {} with {} regions", 132 | seq_num, 133 | chr_regs.len() 134 | ); 135 | } 136 | 137 | let mut hits: Vec<&Interval> = Vec::new(); 138 | 139 | for &(_, rstart, rend) in chr_regs { 140 | hits.clear(); 141 | tree.query_interval(rstart, rend, &mut hits); 142 | 143 | for &iv in &hits { 144 | // Decide whether to keep this feature based on mode 145 | let keep = match mode { 146 | OverlapMode::Contained => { 147 | // Feature must be fully contained in region 148 | iv.start >= rstart && iv.end <= rend 149 | } 150 | OverlapMode::ContainsRegion => { 151 | // Feature must fully contain region 152 | iv.start <= rstart && iv.end >= rend 153 | } 154 | OverlapMode::Overlap => { 155 | // Any overlap is acceptable 156 | true 157 | } 158 | }; 159 | 160 | // Apply invert flag (XOR logic) 161 | if invert ^ keep { 162 | results.push((iv.root_fid, iv.start, iv.end)); 163 | } 164 | } 165 | } 166 | } 167 | } 168 | Ok(results) 169 | } 170 | 171 | /// Parse a single genomic region string (chr:start-end) 172 | pub fn parse_region( 173 | region: &str, 174 | seqid_map: &FxHashMap, 175 | common: &CommonArgs, 176 | ) -> Result<(u32, u32, u32)> { 177 | let (seq, range) = region 178 | .split_once(':') 179 | .context("Invalid region format, expected 'chr:start-end'")?; 180 | let (s, e) = range 181 | .split_once('-') 182 | .context("Invalid range format, expected 'start-end'")?; 183 | let start = s.parse::()?; 184 | let end = e.parse::()?; 185 | let chr = seqid_map 186 | .get(seq) 187 | .with_context(|| format!("Sequence ID not found: {}", seq))?; 188 | if start >= end { 189 | anyhow::bail!("Region start must be less than end ({} >= {})", start, end); 190 | } 191 | if common.verbose { 192 | eprintln!( 193 | "[DEBUG] Parsed region: chr={}, start={}, end={}", 194 | chr, start, end 195 | ); 196 | } 197 | Ok((*chr, start, end)) 198 | } 199 | 200 | /// Parse BED file using mmap zero-copy field splitting 201 | pub fn parse_bed_file( 202 | bed_path: &Path, 203 | seqid_map: &FxHashMap, 204 | ) -> Result> { 205 | let mmap = { 206 | let file = File::open(bed_path)?; 207 | unsafe { Mmap::map(&file)? 
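        // Assumption: the BED file is not modified or truncated while mapped;
        // as with any memmap2 mapping, external mutation is undefined behavior.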
} 208 | }; 209 | let regions = { 210 | let mut regions = Vec::new(); 211 | for line in mmap.split(|&b| b == b'\n') { 212 | if line.is_empty() || line[0] == b'#' { 213 | continue; 214 | } 215 | let line_str = std::str::from_utf8(line)?; 216 | let mut parts = line_str.split_ascii_whitespace(); 217 | let (Some(seq), Some(s), Some(e)) = (parts.next(), parts.next(), parts.next()) else { 218 | continue; 219 | }; 220 | let Some(&chr) = seqid_map.get(seq) else { 221 | continue; 222 | }; 223 | let start = parse::(s.as_bytes())?; 224 | let end = parse::(e.as_bytes())?; 225 | regions.push((chr, start, end)); 226 | } 227 | regions 228 | }; 229 | Ok(regions) 230 | } 231 | 232 | pub fn write_gff_match_only_by_coords( 233 | gff_path: &Path, 234 | blocks: &[(u32, u64, u64)], //Per-block parallel scan to collect (line_start, line_end) offsets 235 | query_ivmap: &FxHashMap>, 236 | types_filter: Option<&str>, 237 | output_path: &Option, 238 | mode: OverlapMode, 239 | verbose: bool, 240 | ) -> Result<()> { 241 | // mmap the whole GFF once 242 | let (mmap, file_len) = { 243 | let file = std::fs::File::open(gff_path) 244 | .with_context(|| format!("Cannot open GFF: {:?}", gff_path))?; 245 | let mmap = unsafe { Mmap::map(&file) } 246 | .with_context(|| format!("mmap failed for {:?}", gff_path))?; 247 | let len = mmap.len(); 248 | (mmap, len) 249 | }; 250 | 251 | // parse type filters to a set once 252 | let type_allow: Option> = { 253 | types_filter.map(|s| { 254 | s.split(',') 255 | .map(|t| t.trim().to_string()) 256 | .filter(|t| !t.is_empty()) 257 | .collect() 258 | }) 259 | }; 260 | 261 | // Parallel scan blocks: produce (block_start, Vec<(line_start,line_end)>) 262 | // Note: we never copy line bytes, only collect offsets. 263 | let mut parts: Vec<(u64, Vec<(u64, u64)>)> = { 264 | let bytes_out = std::sync::atomic::AtomicU64::new(0); 265 | 266 | let parts: Vec<(u64, Vec<(u64, u64)>)> = blocks 267 | .par_iter() 268 | .filter_map(|&(root, start, end)| { 269 | if start == MISSING { 270 | eprintln!("[WARN] skipped fid={} due to sentinel start offset", root); 271 | return None; 272 | } 273 | let s = start as usize; 274 | let e = (end as usize).min(file_len); 275 | if s >= e || e > file_len { 276 | return None; 277 | } 278 | let src = &mmap[s..e]; 279 | 280 | // Collect matched line ranges as global file offsets. 
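                // For example, a line occupying src[pos..nl] in a block that
                // begins at file offset `start` is recorded as the absolute
                // half-open range [start + pos, start + nl), '\n' included.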
281 | let mut matched_offsets: Vec<(u64, u64)> = Vec::with_capacity(256); 282 | let mut pos = 0usize; 283 | 284 | while pos < src.len() { 285 | // Find next newline boundary 286 | let nl = match memchr(b'\n', &src[pos..]) { 287 | Some(i) => pos + i + 1, // include '\n' 288 | None => src.len(), 289 | }; 290 | let line = &src[pos..nl]; 291 | 292 | // Trim trailing '\n' for parsing 293 | let line_nocr = if line.ends_with(b"\n") { 294 | &line[..line.len() - 1] 295 | } else { 296 | line 297 | }; 298 | 299 | if !line_nocr.is_empty() && line_nocr[0] != b'#' { 300 | // Optional: type filter first to early discard 301 | let mut pass = true; 302 | if let Some(allow) = &type_allow 303 | && !gff_type_allowed(line_nocr, allow) 304 | { 305 | pass = false; 306 | } 307 | if pass && gff_line_overlaps_queries(line_nocr, query_ivmap, mode) { 308 | // Record absolute offsets in the file (including '\n') 309 | let abs_start = start + pos as u64; 310 | let abs_end = start + nl as u64; 311 | // Safety: bounds already clamped by file_len 312 | matched_offsets.push((abs_start, abs_end)); 313 | bytes_out.fetch_add( 314 | (abs_end - abs_start) as u64, 315 | std::sync::atomic::Ordering::Relaxed, 316 | ); 317 | } 318 | } 319 | 320 | pos = nl; 321 | } 322 | 323 | if matched_offsets.is_empty() { 324 | None 325 | } else { 326 | Some((start, matched_offsets)) 327 | } 328 | }) 329 | .collect(); 330 | parts 331 | }; 332 | 333 | // Keep global order stable by block start (we don't merge offsets as per user's requirement) 334 | { 335 | parts.sort_unstable_by_key(|(s, _)| *s); 336 | } 337 | 338 | // Helper: write all slices using write_vectored with partial-write handling. 339 | // We construct a temporary Vec per batch; batch size is small (<= IOV_BATCH). 340 | fn write_all_vectored(w: &mut W, mut slices: Vec<&[u8]>) -> io::Result<()> { 341 | // Fast path: nothing to write 342 | if slices.is_empty() { 343 | return Ok(()); 344 | } 345 | 346 | // Keep writing until all slices are fully consumed 347 | while !slices.is_empty() { 348 | // Rebuild IoSlice views for current remainder 349 | let iov: Vec> = slices.iter().map(|s| IoSlice::new(s)).collect(); 350 | 351 | let wrote = w.write_vectored(&iov)?; 352 | if wrote == 0 { 353 | return Err(io::Error::new( 354 | io::ErrorKind::WriteZero, 355 | "write_vectored returned 0", 356 | )); 357 | } 358 | 359 | // Consume 'wrote' bytes from the front of `slices` 360 | let mut remaining = wrote; 361 | let mut drop_count = 0; 362 | 363 | for s in &mut slices { 364 | if remaining == 0 { 365 | break; 366 | } 367 | if remaining >= s.len() { 368 | remaining -= s.len(); 369 | drop_count += 1; 370 | } else { 371 | // Advance within the first partially-written slice 372 | *s = &s[remaining..]; 373 | remaining = 0; 374 | } 375 | } 376 | 377 | if drop_count > 0 { 378 | slices.drain(0..drop_count); 379 | } 380 | } 381 | 382 | Ok(()) 383 | } 384 | 385 | // Write out: use large BufWriter and batch IoSlice slices across consecutive parts. 
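    // Rough effect, assuming IOV_BATCH = 256: emitting ~10,000 matched lines
    // costs on the order of 40 write_vectored calls rather than 10,000
    // individual writes, with the 32 MiB BufWriter smoothing the remainder.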
386 | { 387 | // Assemble and write batches, reusing a small Vec<&[u8]> to avoid reallocs 388 | let mut batch: Vec<&[u8]> = Vec::with_capacity(IOV_BATCH); 389 | 390 | if let Some(p) = output_path { 391 | // File output path: create file and large BufWriter 392 | let file = std::fs::File::create(p)?; 393 | let mut writer = BufWriter::with_capacity(WRITE_BUF_SIZE, file); 394 | 395 | for (_, ranges) in parts.iter() { 396 | for &(ls, le) in ranges { 397 | // Safety: ls/le were validated against file_len earlier 398 | let slice = &mmap[ls as usize..le as usize]; 399 | batch.push(slice); 400 | if batch.len() >= IOV_BATCH { 401 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 402 | } 403 | } 404 | } 405 | if !batch.is_empty() { 406 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 407 | } 408 | writer.flush()?; 409 | } else { 410 | // Stdout path: lock stdout and use large BufWriter 411 | let stdout = std::io::stdout(); 412 | let handle = stdout.lock(); 413 | let mut writer = BufWriter::with_capacity(WRITE_BUF_SIZE, handle); 414 | 415 | for (_, ranges) in parts.iter() { 416 | for &(ls, le) in ranges { 417 | let slice = &mmap[ls as usize..le as usize]; 418 | batch.push(slice); 419 | if batch.len() >= IOV_BATCH { 420 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 421 | } 422 | } 423 | } 424 | if !batch.is_empty() { 425 | write_all_vectored(&mut writer, std::mem::take(&mut batch))?; 426 | } 427 | writer.flush()?; 428 | } 429 | } 430 | 431 | if verbose { 432 | eprintln!( 433 | "[INFO] match-only by coords completed; minput blocks {}", 434 | blocks.len() 435 | ); 436 | } 437 | Ok(()) 438 | } 439 | 440 | /// Parse GFF line and check if it overlaps with query intervals 441 | pub fn gff_line_overlaps_queries( 442 | line: &[u8], 443 | ivmap: &FxHashMap>, 444 | mode: OverlapMode, 445 | ) -> bool { 446 | // Parse columns: seq, source, type, start, end 447 | let mut off = 0usize; 448 | 449 | let i1 = match memchr(b'\t', &line[off..]) { 450 | Some(i) => off + i, 451 | None => return false, 452 | }; 453 | let seq = &line[off..i1]; 454 | off = i1 + 1; 455 | 456 | // skip source 457 | let i2 = match memchr(b'\t', &line[off..]) { 458 | Some(i) => off + i, 459 | None => return false, 460 | }; 461 | off = i2 + 1; 462 | 463 | // skip type 464 | let i3 = match memchr(b'\t', &line[off..]) { 465 | Some(i) => off + i, 466 | None => return false, 467 | }; 468 | off = i3 + 1; 469 | 470 | // parse start 471 | let i4 = match memchr(b'\t', &line[off..]) { 472 | Some(i) => off + i, 473 | None => return false, 474 | }; 475 | let start = match parse_u32_ascii(&line[off..i4]) { 476 | Some(v) => v, 477 | None => return false, 478 | }; 479 | off = i4 + 1; 480 | 481 | // parse end 482 | let i5 = match memchr(b'\t', &line[off..]) { 483 | Some(i) => off + i, 484 | None => return false, 485 | }; 486 | let end = match parse_u32_ascii(&line[off..i5]) { 487 | Some(v) => v, 488 | None => return false, 489 | }; 490 | 491 | let seq_str = match std::str::from_utf8(seq) { 492 | Ok(s) => s, 493 | Err(_) => return false, 494 | }; 495 | let ivs = match ivmap.get(seq_str) { 496 | Some(v) => v, 497 | None => return false, 498 | }; 499 | 500 | for &(qs, qe) in ivs { 501 | let keep = match mode { 502 | OverlapMode::Contained => { 503 | // feature must be fully inside query 504 | start >= qs && end <= qe 505 | } 506 | OverlapMode::ContainsRegion => { 507 | // feature must fully contain query 508 | start <= qs && end >= qe 509 | } 510 | OverlapMode::Overlap => { 511 | // any overlap 512 | (qs <= start && 
start <= qe) 513 | || (qs <= end && end <= qe) 514 | || (start <= qs && qs <= end) 515 | || (start <= qe && qe <= end) 516 | } 517 | }; 518 | if keep { 519 | return true; 520 | } 521 | } 522 | false 523 | } 524 | 525 | #[inline] 526 | fn parse_u32_ascii(s: &[u8]) -> Option { 527 | let mut v: u32 = 0; 528 | if s.is_empty() { 529 | return None; 530 | } 531 | for &c in s { 532 | if !c.is_ascii_digit() { 533 | return None; 534 | } 535 | v = v.checked_mul(10)?.checked_add((c - b'0') as u32)?; 536 | } 537 | Some(v) 538 | } 539 | 540 | /// Main execution function 541 | pub fn run(args: &IntersectArgs) -> Result<()> { 542 | let verbose = args.common.verbose; 543 | 544 | if verbose { 545 | eprintln!("[DEBUG] Starting processing of {:?}", args.common.input); 546 | eprintln!( 547 | "[DEBUG] Thread pool initialized with {} threads", 548 | args.common.effective_threads() 549 | ); 550 | } 551 | 552 | // Determine overlap mode 553 | let mode = if args.contained { 554 | OverlapMode::Contained 555 | } else if args.contains_region { 556 | OverlapMode::ContainsRegion 557 | } else { 558 | OverlapMode::Overlap 559 | }; 560 | 561 | let index_data = TreeIndexData::load_tree_index(&args.common.input)?; 562 | let seqid_map = &index_data.seqid_to_num; 563 | 564 | let regions = { 565 | if let Some(bed) = &args.bed { 566 | parse_bed_file(bed, seqid_map)? 567 | } else if let Some(r) = &args.region { 568 | vec![parse_region(r, seqid_map, &args.common)?] 569 | } else { 570 | anyhow::bail!("No region specified"); 571 | } 572 | }; 573 | 574 | 575 | if verbose { 576 | eprintln!( 577 | "[DEBUG] Starting query_features with {} regions", 578 | regions.len() 579 | ); 580 | eprintln!( 581 | "[DEBUG] Mode: {:?}", 582 | mode 583 | ); 584 | } 585 | 586 | let feats = { 587 | query_features( 588 | &index_data, 589 | ®ions, 590 | mode, 591 | args.invert, 592 | args.common.verbose, 593 | )? 
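        // Each hit is (root_fid, feature_start, feature_end); roots are
        // grouped and deduplicated below before GOF offsets are resolved.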
594 |     }
595 |     };
596 |
597 |     // Collect IDs for root features only
598 |     let gof = load_gof(&args.common.input)?;
599 |     let root_matches: Vec<RootMatched> = {
600 |         let mut grouped: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
601 |         for (root, _s, _e) in feats {
602 |             grouped.entry(root).or_default().push(root);
603 |         }
604 |         grouped
605 |             .into_iter()
606 |             .map(|(root, matched)| RootMatched { root, matched })
607 |             .collect()
608 |     };
609 |
610 |     let roots: Vec<u32> = {
611 |         let mut s: FxHashSet<u32> = FxHashSet::default();
612 |         for rm in &root_matches {
613 |             s.insert(rm.root);
614 |         }
615 |         s.into_iter().collect()
616 |     };
617 |
618 |     let blocks: Vec<(u32, u64, u64)> = gof.roots_to_offsets(&roots, args.common.effective_threads());
619 |
620 |     if !args.common.entire_group || args.common.types.is_some() {
621 |         // Build query interval map by seq name
622 |         let query_ivmap: FxHashMap<String, Vec<(u32, u32)>> = {
623 |             let mut num_to_seq: FxHashMap<u32, String> = FxHashMap::default();
624 |             for (name, &num) in index_data.seqid_to_num.iter() {
625 |                 num_to_seq.insert(num, name.clone());
626 |             }
627 |             let mut m: FxHashMap<String, Vec<(u32, u32)>> = FxHashMap::default();
628 |             for &(chr_num, s, e) in &regions {
629 |                 if let Some(seq_name) = num_to_seq.get(&chr_num) {
630 |                     m.entry(seq_name.clone()).or_default().push((s, e));
631 |                 }
632 |             }
633 |             m
634 |         };
635 |
636 |         {
637 |             write_gff_match_only_by_coords(
638 |                 args.common.input.as_path(),
639 |                 &blocks,
640 |                 &query_ivmap,
641 |                 args.common.types.as_deref(),
642 |                 &args.common.output,
643 |                 mode,
644 |                 args.common.verbose,
645 |             )?;
646 |         }
647 |     } else {
648 |         write_gff_output(
649 |             args.common.input.as_path(),
650 |             &blocks,
651 |             &args.common.output,
652 |             args.common.verbose,
653 |         )?;
654 |     }
655 |     Ok(())
656 | }
--------------------------------------------------------------------------------
/src/commands/sample.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use clap::Parser;
3 | use rayon::prelude::*;
4 | use rand::seq::IndexedRandom;
5 | use rand::rng;
6 | use std::{
7 |     path::PathBuf,
8 | };
9 | use crate::{load_gof, write_gff_output};
10 |
11 | /// Arguments
12 | #[derive(Parser, Debug)]
13 | #[command(
14 |     about = "Sample feature groups per chromosome",
15 |     long_about = "Sample feature groups per chromosome."
16 | )]
17 | pub struct SampleArgs {
18 |     /// GFF file path (indexed via GOF)
19 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
20 |     pub input: PathBuf,
21 |
22 |     /// Ratio of downsampling
23 |     #[arg(short = 'r', long = "ratio")]
24 |     pub ratio: f32,
25 |
26 |     /// Output file (stdout if not provided)
27 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
28 |     pub output: Option<PathBuf>,
29 |
30 |     /// Number of threads
31 |     #[arg(short = 't', long = "threads", default_value_t = 12, value_name = "NUM")]
32 |     pub threads: usize,
33 |
34 |     /// Verbose logs
35 |     #[arg(short = 'v', long = "verbose", default_value_t = false, value_name = "BOOL")]
36 |     pub verbose: bool,
37 | }
38 |
39 | pub fn run(args: &SampleArgs) -> Result<()> {
40 |     let verbose = args.verbose;
41 |     let threads = if args.threads == 0 {
42 |         std::thread::available_parallelism()
43 |             .map(|n| n.get())
44 |             .unwrap_or(1)
45 |     } else {
46 |         args.threads
47 |     };
48 |     let _ = rayon::ThreadPoolBuilder::new().num_threads(threads).build_global();
49 |     let gff_path = &args.input;
50 |
51 |     let gof = load_gof(&gff_path)?;
52 |
53 |     let blocks: Vec<(u32, u64, u64)> = gof.seqid_index
54 |         .par_iter()
55 |         .flat_map(|(_seqid_num, indices)| {
56 |             let mut rng = rng();
57 |
58 |             // 1. collect all fids for this chromosome
59 |             let fids: Vec<u32> = indices
60 |                 .iter()
61 |                 .map(|&i| gof.entries[i].feature_id)
62 |                 .collect();
63 |
64 |             if fids.is_empty() {
65 |                 return Vec::new();
66 |             }
67 |
68 |             // 2. sample a `ratio` fraction of the fids (rounded up)
69 |             let sample_size = (fids.len() as f32 * args.ratio).ceil() as usize;
70 |             let sampled: Vec<u32> = fids.choose_multiple(&mut rng, sample_size).cloned().collect();
71 |
72 |             // 3. use index_cached() to get offsets
73 |             let idx = gof.index_cached();
74 |             sampled
75 |                 .into_iter()
76 |                 .filter_map(|fid| idx.get(&fid).map(|&(s, e)| (fid, s, e)))
77 |                 .collect::<Vec<_>>()
78 |         })
79 |         .collect();
80 |
81 |     // Step 3: write sampled GFF blocks
82 |     write_gff_output(gff_path, &blocks, &args.output, verbose)?;
83 |     Ok(())
84 | }
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/src/commands/search.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Result, bail};
2 | use clap::{ArgGroup, Parser};
3 | use rustc_hash::{FxHashMap, FxHashSet};
4 | use regex::Regex;
5 | use std::{
6 |     fs::File,
7 |     io::{BufReader, BufRead},
8 |     path::PathBuf,
9 |     time::Instant,
10 | };
11 |
12 |
13 | use crate::{
14 |     CommonArgs, load_gof, load_prt, load_a2f, load_atn,
15 |     write_gff_output, write_gff_output_filtered,
16 | };
17 |
18 | #[derive(Parser, Debug)]
19 | #[command(
20 |     about = "Search features by attribute values",
21 |     group = ArgGroup::new("attr_input")
22 |         .required(true)
23 |         .args(["attr_list", "attr"])
24 | )]
25 | pub struct SearchArgs {
26 |     /// Common input/output/thread arguments
27 |     #[clap(flatten)]
28 |     pub common: CommonArgs,
29 |
30 |     #[arg(
31 |         short = 'A',
32 |         long,
33 |         help = "Attribute list file (one per line)",
34 |         group = "attr_input"
35 |     )]
36 |     attr_list: Option<PathBuf>,
37 |
38 |     #[arg(
39 |         short = 'a',
40 |         long,
41 |         help = "Single attribute value to search",
42 |         group = "attr_input"
43 |     )]
44 |     attr: Option<String>,
45 |
46 |     #[arg(
47 |         short = 'r',
48 |         long,
49 |         help = "Enable regex mode for attribute matching")]
50 |     regex: bool,
51 | }
52 |
53 | /// When in `per-feature` mode: within each root block, emit only lines whose `ID` exactly matches
54 | /// the user-specified features under that root. Optional `types_filter` is applied to column 3.
55 | pub fn run(args: &SearchArgs) -> Result<()> {
56 |     let verbose = args.common.verbose;
57 |     let gff_path = &args.common.input;
58 |
59 |     // Init thread pool
60 |     let overall_start = Instant::now();
61 |     if verbose {
62 |         eprintln!("[DEBUG] Starting processing of {:?}", gff_path);
63 |         eprintln!(
64 |             "[DEBUG] Thread pool initialized with {} threads",
65 |             args.common.effective_threads()
66 |         );
67 |     }
68 |
69 |     // Load index artifacts
70 |     let prt = load_prt(gff_path)?;   // parent pointers (fid -> parent fid)
71 |     let gof = load_gof(gff_path)?;   // GOF offsets (fid -> (start,end))
72 |     let a2f = load_a2f(gff_path)?;   // attribute index -> fid
73 |     let (atn_attr_name, atn_values) = load_atn(gff_path)?; // attribute values table (index-aligned)
74 |
75 |     // Collect attribute values from file or single arg
76 |     let attr_values: Vec<String> = if let Some(file) = &args.attr_list {
77 |         let reader = BufReader::new(File::open(file)?);
78 |         reader
79 |             .lines()
80 |             .map(|r| r.map(|s| s.trim().to_owned()))
81 |             .filter(|r| r.as_ref().map_or(true, |s| !s.is_empty()))
82 |             .collect::<Result<Vec<_>, _>>()?
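        // One attribute value per line; blank lines are dropped after trimming
        // and read errors propagate via the collected Result.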
83 | } else if let Some(val) = &args.attr { 84 | vec![val.clone()] 85 | } else { 86 | bail!("Either --attr-list (-A) or --attr (-a) must be provided."); 87 | }; 88 | 89 | // Step 1: build attribute -> AID list 90 | // In regex mode, match by regex; otherwise exact string match. 91 | let mut attr_to_aids: FxHashMap> = FxHashMap::default(); 92 | if args.regex { 93 | let patterns: Vec = attr_values 94 | .iter() 95 | .map(String::as_str) 96 | .map(Regex::new) 97 | .collect::, _>>()?; 98 | 99 | for (i, val) in atn_values.iter().enumerate() { 100 | if patterns.iter().any(|re| re.is_match(val)) { 101 | attr_to_aids.entry(val.clone()).or_default().push(i as u32); 102 | } 103 | } 104 | } else { 105 | let wanted: FxHashSet<&str> = attr_values.iter().map(String::as_str).collect(); 106 | for (i, val) in atn_values.iter().enumerate() { 107 | if wanted.contains(val.as_str()) { 108 | attr_to_aids.entry(val.clone()).or_default().push(i as u32); 109 | } 110 | } 111 | } 112 | 113 | // Nothing matched → early exit with a helpful error 114 | if attr_to_aids.is_empty() { 115 | bail!("None of the attributes matched."); 116 | } 117 | 118 | if verbose { 119 | eprintln!("[DEBUG] Matched attribute -> AIDs:"); 120 | for (attr_val, aids) in &attr_to_aids { 121 | eprintln!(" {} => {:?}", attr_val, aids); 122 | } 123 | } 124 | 125 | // Step 2: map AIDs -> FIDs via a2f (attribute index to feature id) 126 | // Note: a2f is expected to be indexable by AID (usize). 127 | // We also deduplicate per attribute to keep vectors lean. 128 | let mut attr_to_fids: FxHashMap> = FxHashMap::default(); 129 | 130 | for (attr_val, aids) in &attr_to_aids { 131 | let mut fids = a2f.map_aids_to_fids_vec(aids); 132 | fids.sort_unstable(); 133 | fids.dedup(); 134 | 135 | if !fids.is_empty() { 136 | attr_to_fids.insert(attr_val.clone(), fids); 137 | } 138 | } 139 | 140 | if attr_to_fids.is_empty() { 141 | bail!("No feature IDs (FIDs) resolved from matched attributes."); 142 | } 143 | 144 | if verbose { 145 | eprintln!("[DEBUG] Attribute -> FIDs after a2f mapping:"); 146 | for (attr_val, fids) in &attr_to_fids { 147 | eprintln!(" {} => {:?} ", attr_val, fids); 148 | } 149 | } 150 | 151 | // Step 3: map FIDs -> root FIDs using PrtMap::map_fids_to_roots (fast) 152 | let mut fid_vec: Vec = attr_to_fids 153 | .values() 154 | .flat_map(|v| v.iter().copied()) 155 | .collect(); 156 | fid_vec.sort_unstable(); 157 | fid_vec.dedup(); 158 | 159 | if verbose { 160 | eprintln!("[DEBUG] Total unique FIDs: {}", fid_vec.len()); 161 | } 162 | 163 | let threads = args.common.effective_threads(); 164 | let root: Vec = prt.map_fids_to_roots(&fid_vec, threads); 165 | 166 | // Collect invalid fids (mapped to u32::MAX), and build a unique root list 167 | let mut invalid_fids: Vec = Vec::new(); 168 | let mut roots_effective: Vec = Vec::with_capacity(root.len()); 169 | for (&fid, r) in fid_vec.iter().zip(root.iter()) { 170 | if *r == u32::MAX { 171 | invalid_fids.push(fid); 172 | } else { 173 | roots_effective.push(*r); 174 | } 175 | } 176 | 177 | if !invalid_fids.is_empty() { 178 | invalid_fids.sort_unstable(); 179 | invalid_fids.dedup(); 180 | eprintln!( 181 | "[WARN] {} FIDs have invalid parent chains (or out-of-range): {:?}", 182 | invalid_fids.len(), 183 | invalid_fids 184 | ); 185 | } 186 | roots_effective.sort_unstable(); 187 | roots_effective.dedup(); 188 | 189 | if roots_effective.is_empty() { 190 | bail!("No valid root features resolved from matched attributes."); 191 | } 192 | if verbose { 193 | eprintln!("[DEBUG] Total unique roots: {}", 
roots_effective.len()); 194 | } 195 | 196 | let blocks: Vec<(u32, u64, u64)> = gof.roots_to_offsets(&roots_effective, args.common.effective_threads()); 197 | 198 | if !args.common.entire_group|| args.common.types.is_some() { 199 | let allowed_roots: FxHashSet = roots_effective.iter().copied().collect(); 200 | 201 | let mut fid_to_root: FxHashMap = FxHashMap::default(); 202 | fid_to_root.reserve(fid_vec.len()); 203 | for (fid, r) in fid_vec.iter().copied().zip(root.iter().copied()) { 204 | if r != u32::MAX && allowed_roots.contains(&r) { 205 | fid_to_root.insert(fid, r); 206 | } 207 | } 208 | 209 | let mut per_root_matches: FxHashMap> = FxHashMap::default(); 210 | per_root_matches.reserve(roots_effective.len()); 211 | 212 | for (attr_val, fids) in &attr_to_fids { 213 | for &fid in fids { 214 | if let Some(&r) = fid_to_root.get(&fid) { 215 | per_root_matches.entry(r).or_default().insert(attr_val.clone()); 216 | } 217 | } 218 | } 219 | 220 | write_gff_output_filtered( 221 | gff_path, 222 | &blocks, 223 | &per_root_matches, 224 | &atn_attr_name, 225 | &args.common.output, 226 | args.common.types.as_deref(), 227 | verbose, 228 | )?; 229 | } else { 230 | let mut fid_to_root: FxHashMap = FxHashMap::default(); 231 | for (fid, r) in fid_vec.iter().copied().zip(root.clone().into_iter()) { 232 | if r != u32::MAX { 233 | fid_to_root.insert(fid, r); 234 | } 235 | } 236 | 237 | // Step 4: map roots -> (start, end) offsets from GOF 238 | // Use a cached index to avoid rebuilding a HashMap on every call. 239 | write_gff_output( 240 | gff_path, 241 | &blocks, 242 | &args.common.output, 243 | verbose, 244 | )?; 245 | } 246 | 247 | if verbose { 248 | eprintln!("[timing] Total elapsed: {:?}", overall_start.elapsed()); 249 | } 250 | 251 | Ok(()) 252 | } 253 | -------------------------------------------------------------------------------- /src/index_builder.rs: -------------------------------------------------------------------------------- 1 | pub mod core; 2 | pub use core::{build_index, write_binary_u32, write_gof, write_lines}; 3 | -------------------------------------------------------------------------------- /src/index_builder/core.rs: -------------------------------------------------------------------------------- 1 | use crate::append_suffix; 2 | use crate::{Interval, IntervalTree, save_multiple_trees, write_offsets_to_file}; 3 | use anyhow::anyhow; 4 | use anyhow::{Result, bail}; 5 | use byteorder::{LittleEndian, WriteBytesExt}; 6 | use indexmap::IndexMap; 7 | use memchr::memchr; 8 | use memmap2::Mmap; 9 | use regex::{Regex, escape}; 10 | use std::{fs::File, io::Write, path::PathBuf}; 11 | use rustc_hash::{FxHashMap, FxHashSet}; 12 | 13 | // Writes text lines to a file 14 | pub fn write_lines(path: PathBuf, lines: &[String]) -> Result<()> { 15 | let mut file = File::create(path)?; 16 | for line in lines { 17 | writeln!(file, "{}", line)?; 18 | } 19 | Ok(()) 20 | } 21 | 22 | // Writes u32 values in binary little-endian format 23 | pub fn write_binary_u32(path: PathBuf, values: &[u32]) -> Result<()> { 24 | let mut file = File::create(path)?; 25 | for &v in values { 26 | file.write_u32::(v)?; 27 | } 28 | Ok(()) 29 | } 30 | 31 | // Writes GFF offset records (gof) 32 | pub fn write_gof(file: &mut File, id: u32, seq_num: u32, start: u64, end: u64) -> Result<()> { 33 | file.write_u32::(id)?; // root feature id 34 | file.write_u32::(seq_num)?; // seq numeric id 35 | file.write_u64::(start)?; // start offset in GFF file 36 | file.write_u64::(end)?; // end offset in GFF file 37 | Ok(()) 38 | } 39 | 40 | /// 
Builds various index files for a GFF: .fts, .prt, .a2f, .atn, .sqs, .gof, .rit, .rix 41 | pub fn build_index(gff: &PathBuf, attr_key: &str, skip_types: &str, verbose: bool) -> Result<()> { 42 | // Compile regex patterns 43 | let id_re = Regex::new(r"ID=([^;\s]+)")?; 44 | let parent_re = Regex::new(r"Parent=([^;\s]+)")?; 45 | let attr_re = Regex::new(&format!(r"{}=([^;]+)", escape(attr_key)))?; 46 | 47 | let skip_types_set: FxHashSet<&str> = skip_types.split(',').collect(); 48 | 49 | if verbose { 50 | eprintln!("Building index for {} ...", gff.display()); 51 | } 52 | 53 | // Memory-map input file 54 | let file = File::open(gff)?; 55 | let mmap = unsafe { Mmap::map(&file)? }; 56 | let data = &mmap[..]; 57 | 58 | // First pass: parse raw features 59 | struct RawFeature { 60 | seqid: String, 61 | start: u32, 62 | end: u32, 63 | line_offset: u64, 64 | id: String, 65 | parent: Option, 66 | attr: Option, 67 | } 68 | let mut raw_features = Vec::new(); 69 | let mut offset = 0; 70 | 71 | while offset < data.len() { 72 | let nl_pos = memchr(b'\n', &data[offset..]) 73 | .map(|pos| pos + offset) 74 | .unwrap_or(data.len()); 75 | let line_bytes = &data[offset..nl_pos]; 76 | let line_offset = offset as u64; 77 | offset = nl_pos + 1; 78 | 79 | if line_bytes.is_empty() || line_bytes[0] == b'#' { 80 | continue; 81 | } 82 | let line = std::str::from_utf8(line_bytes)?.trim(); 83 | if line.is_empty() { 84 | continue; 85 | } 86 | 87 | let fields: Vec<&str> = line.split('\t').collect(); 88 | if fields.len() != 9 { 89 | bail!("Invalid GFF line (expected 9 columns): {}", line); 90 | } 91 | 92 | let seqid = fields[0].to_string(); 93 | let ftype = fields[2]; 94 | 95 | if skip_types_set.contains(ftype) { 96 | if verbose { 97 | println!("skip comment feature: {}", ftype); 98 | } 99 | continue; 100 | } 101 | 102 | let s1 = fields[3].parse::()?; 103 | let e1 = fields[4].parse::()?; 104 | if e1 == 0 { 105 | continue; 106 | } 107 | let (s1, e1) = if s1 > e1 { (e1, s1) } else { (s1, e1) }; 108 | let start = s1.saturating_sub(1); 109 | let end = e1; 110 | 111 | // Extract ID 112 | let id = id_re 113 | .captures(line) 114 | .ok_or_else(|| anyhow!("Missing ID in feature: {}", line))?[1] 115 | .to_string(); 116 | // Extract raw Parent (may refer to unseen ID) 117 | let parent = parent_re.captures(line).map(|cap| cap[1].to_string()); 118 | // Extract attribute value 119 | let attr = attr_re.captures(line).map(|cap| { 120 | let val = cap[1].to_string(); 121 | // GFF3 spec: attribute values must be URL-encoded. 122 | // Raw characters such as space, semicolon, or comma are not allowed. 
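        // e.g. `gene_name=ATP%20synthase` is spec-compliant, whereas
        // `gene_name=ATP synthase` triggers the warning below.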
123 |             if val.contains(' ') || val.contains(';') || val.contains(',') {
124 |                 eprintln!("[WARN] Attribute value contains characters (space, ';', ',') that should be URL-encoded: '{}'", val);
125 |             }
126 |             val
127 |         });
128 | 
129 |         raw_features.push(RawFeature {
130 |             seqid,
131 |             start,
132 |             end,
133 |             line_offset,
134 |             id,
135 |             parent,
136 |             attr,
137 |         });
138 |     }
139 | 
140 |     // Build feature_map: string ID -> numeric ID
141 |     let mut feature_map: FxHashMap<String, u32> = FxHashMap::default();
142 |     for (i, rf) in raw_features.iter().enumerate() {
143 |         feature_map.insert(rf.id.clone(), i as u32);
144 |     }
145 | 
146 |     // Open output files
147 |     let mut fts_file = File::create(append_suffix(gff, ".fts"))?;
148 |     let mut prt_entries = Vec::with_capacity(raw_features.len());
149 |     let mut a2f_entries = Vec::with_capacity(raw_features.len());
150 |     let mut atn_entries = Vec::new();
151 |     let mut attr_value_to_id: FxHashMap<String, u32> = FxHashMap::default();
152 |     let mut gof_file = File::create(append_suffix(gff, ".gof"))?;
153 |     let mut seqid_to_num: IndexMap<String, u32> = IndexMap::new();
154 |     let mut trees_input: IndexMap<u32, Vec<(u32, u32, u32)>> = IndexMap::new();
155 |     let mut next_seqid_num: u32 = 0;
156 |     let mut current_root: Option<(u32, u64, u32)> = None;
157 | 
158 |     // Write .fts and build .prt, .a2f, .gof, and seqid intervals
159 |     for rf in &raw_features {
160 |         let fid = feature_map[&rf.id];
161 |         writeln!(fts_file, "{}", rf.id)?;
162 |         // Resolve parent (fallback to self if missing)
163 |         let parent_id = rf
164 |             .parent
165 |             .as_ref()
166 |             .and_then(|p| feature_map.get(p).cloned())
167 |             .unwrap_or(fid);
168 |         prt_entries.push(parent_id);
169 |         // Record roots for GOF and intervals
170 |         if parent_id == fid {
171 |             let seqid_num = *seqid_to_num.entry(rf.seqid.clone()).or_insert_with(|| {
172 |                 let id = next_seqid_num;
173 |                 next_seqid_num += 1;
174 |                 id
175 |             });
176 | 
177 |             trees_input
178 |                 .entry(seqid_num)
179 |                 .or_default()
180 |                 .push((rf.start, rf.end, fid));
181 | 
182 |             if let Some((old_id, old_off, old_seqid_num)) = current_root.take() {
183 |                 write_gof(&mut gof_file, old_id, old_seqid_num, old_off, rf.line_offset)?;
184 |             }
185 |             current_root = Some((fid, rf.line_offset, seqid_num));
186 |         }
187 | 
188 |         // Attribute mapping
189 |         if let Some(val) = &rf.attr {
190 |             let aid = *attr_value_to_id.entry(val.clone()).or_insert_with(|| {
191 |                 let a = atn_entries.len() as u32;
192 |                 atn_entries.push(val.clone());
193 |                 a
194 |             });
195 |             a2f_entries.push(aid);
196 |         } else {
197 |             a2f_entries.push(u32::MAX);
198 |         }
199 |     }
200 |     // Write final GOF record
201 |     if let Some((last_id, last_off, last_seqid_num)) = current_root {
202 |         write_gof(&mut gof_file, last_id, last_seqid_num, last_off, data.len() as u64)?;
203 |     }
204 | 
205 |     // Build interval trees per seqid
206 |     let mut trees = Vec::with_capacity(seqid_to_num.len());
207 |     for (_seqid, seqid_num) in &seqid_to_num {
208 |         let ivs = &trees_input[seqid_num];
209 |         let iv_structs: Vec<Interval<u32>> = ivs
210 |             .iter()
211 |             .map(|&(start, end, fid)| Interval {
212 |                 start,
213 |                 end,
214 |                 root_fid: fid,
215 |             })
216 |             .collect();
217 |         trees.push(IntervalTree::new(iv_structs));
218 |     }
219 | 
220 |     // Write .rit and .rix
221 |     let rit = append_suffix(gff, ".rit");
222 |     let rix = append_suffix(gff, ".rix");
223 |     let offsets = save_multiple_trees(&trees, rit.as_path())?;
224 |     write_offsets_to_file(&offsets, rix.as_path())?;
225 | 
226 |     // Write .sqs (sequence list)
227 |     let seqids: Vec<String> = seqid_to_num.keys().cloned().collect();
228 |     write_lines(append_suffix(gff, ".sqs"), &seqids)?;
229 | 
230 |     // Write .atn, .a2f, .prt
231 |     let mut atn_out = Vec::with_capacity(atn_entries.len() + 1);
232 |     atn_out.push(format!("#attribute={}", attr_key));
233 |     atn_out.extend(atn_entries.clone());
234 |     write_lines(append_suffix(gff, ".atn"), &atn_out)?;
235 |     write_binary_u32(append_suffix(gff, ".a2f"), &a2f_entries)?;
236 |     write_binary_u32(append_suffix(gff, ".prt"), &prt_entries)?;
237 | 
238 |     if verbose {
239 |         eprintln!("Index built successfully for {}", gff.display());
240 |     }
241 |     Ok(())
242 | }
243 | 
--------------------------------------------------------------------------------
/src/index_loader.rs:
--------------------------------------------------------------------------------
1 | pub mod core;
2 | pub mod gof;
3 | pub mod fts;
4 | pub mod prt;
5 | pub mod a2f;
6 | 
7 | pub use core::{load_atn, load_sqs, safe_mmap_readonly};
8 | pub use gof::{GofMap, load_gof};
9 | pub use fts::{FtsMap, load_fts};
10 | pub use prt::{PrtMap, load_prt};
11 | pub use a2f::{A2fMap, load_a2f};
12 | 
--------------------------------------------------------------------------------
/src/index_loader/a2f.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{bail, Context, Result};
2 | use byteorder::{ByteOrder, LittleEndian};
3 | use rustc_hash::{FxHashMap, FxHashSet};
4 | use std::path::Path;
5 | 
6 | use crate::{append_suffix, safe_mmap_readonly};
7 | 
8 | /// A2fMap stores two indexes:
9 | /// - `aid_to_fids`: mapping from Attribute ID (AID) to all Feature IDs (FIDs) that reference it.
10 | /// - `fid_to_aid`: reverse mapping from Feature ID (FID) to its Attribute ID (if any).
11 | ///
12 | /// The `.a2f` file on disk is encoded as `fid -> aid` (one u32 per fid).
13 | /// During loading, we construct the reverse map `aid -> fids` for efficient queries.
14 | #[derive(Debug)]
15 | pub struct A2fMap {
16 |     aid_to_fids: FxHashMap<u32, Vec<u32>>,
17 |     fid_to_aid: Vec<Option<u32>>,
18 | }
19 | 
20 | impl A2fMap {
21 |     pub fn new(aid_to_fids: FxHashMap<u32, Vec<u32>>, fid_to_aid: Vec<Option<u32>>) -> Self {
22 |         Self { aid_to_fids, fid_to_aid }
23 |     }
24 | 
25 |     /// Get all FIDs associated with a given AID.
26 |     #[inline]
27 |     pub fn fids_for_aid(&self, aid: u32) -> Option<&[u32]> {
28 |         self.aid_to_fids.get(&aid).map(|v| v.as_slice())
29 |     }
30 | 
31 |     /// Get the AID associated with a given FID (if any).
32 |     #[inline]
33 |     pub fn aid_for_fid(&self, fid: u32) -> Option<u32> {
34 |         self.fid_to_aid.get(fid as usize).and_then(|x| *x)
35 |     }
36 | 
37 |     /// Map a set of AIDs into a set of FIDs (deduplicated, no order guaranteed).
38 |     #[inline]
39 |     pub fn map_aids_to_fids_set(&self, aids: &FxHashSet<u32>) -> FxHashSet<u32> {
40 |         let mut out = FxHashSet::default();
41 |         for &aid in aids {
42 |             if let Some(fids) = self.aid_to_fids.get(&aid) {
43 |                 out.extend(fids.iter().copied());
44 |             } else {
45 |                 eprintln!("[WARN] AID {} not found (no FIDs).", aid);
46 |             }
47 |         }
48 |         out
49 |     }
50 | 
51 |     /// Map a list of AIDs into a combined Vec of FIDs.
52 |     /// The caller should sort/deduplicate if a stable order is needed.
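    ///
    /// A minimal usage sketch (hypothetical AIDs; assumes the `.a2f` file was
    /// built by `gffx index` for the given GFF):
    /// ```no_run
    /// # fn main() -> anyhow::Result<()> {
    /// let a2f = gffx::load_a2f("genes.gff")?;
    /// let mut fids = a2f.map_aids_to_fids_vec(&[0, 3]);
    /// fids.sort_unstable();
    /// fids.dedup();
    /// # Ok(()) }
    /// ```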
53 |     #[inline]
54 |     pub fn map_aids_to_fids_vec(&self, aids: &[u32]) -> Vec<u32> {
55 |         let mut out = Vec::new();
56 |         for &aid in aids {
57 |             if let Some(fids) = self.aid_to_fids.get(&aid) {
58 |                 out.extend_from_slice(fids);
59 |             } else {
60 |                 eprintln!("[WARN] AID {} not found (no FIDs).", aid);
61 |             }
62 |         }
63 |         out
64 |     }
65 | 
66 |     #[inline]
67 |     pub fn len_fids(&self) -> usize { self.fid_to_aid.len() }
68 | 
69 |     #[inline]
70 |     pub fn is_empty(&self) -> bool { self.fid_to_aid.is_empty() }
71 | }
72 | 
73 | /// Load `.a2f` file and build A2fMap:
74 | /// - On disk: each 4-byte little-endian u32 represents the AID for a given FID.
75 | /// - Value `u32::MAX` means "no attribute" (None).
76 | /// - In memory: build both `fid -> aid` (vector) and `aid -> fids` (hashmap).
77 | pub fn load_a2f<P: AsRef<Path>>(gff_path: P) -> Result<A2fMap> {
78 |     let path = gff_path.as_ref();
79 |     let a2f_path = append_suffix(path, ".a2f");
80 | 
81 |     let mmap = safe_mmap_readonly(&a2f_path)
82 |         .with_context(|| format!("Failed to mmap {}", a2f_path.display()))?;
83 | 
84 |     if mmap.len() % 4 != 0 {
85 |         bail!(
86 |             "Corrupted A2F ({}): length {} not aligned to u32",
87 |             a2f_path.display(),
88 |             mmap.len()
89 |         );
90 |     }
91 | 
92 |     let n = mmap.len() / 4;
93 |     let mut fid_to_aid: Vec<Option<u32>> = Vec::with_capacity(n);
94 |     let mut aid_to_fids: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
95 | 
96 |     for (fid, chunk) in mmap.chunks_exact(4).enumerate() {
97 |         let raw = LittleEndian::read_u32(chunk);
98 |         let aid = if raw == u32::MAX { None } else { Some(raw) };
99 |         fid_to_aid.push(aid);
100 | 
101 |         if let Some(aid_val) = aid {
102 |             aid_to_fids.entry(aid_val).or_default().push(fid as u32);
103 |         }
104 |     }
105 | 
106 |     // Optional: deduplicate and sort FID lists
107 |     for fids in aid_to_fids.values_mut() {
108 |         fids.sort_unstable();
109 |         fids.dedup();
110 |     }
111 | 
112 |     Ok(A2fMap::new(aid_to_fids, fid_to_aid))
113 | }
114 | 
--------------------------------------------------------------------------------
/src/index_loader/core.rs:
--------------------------------------------------------------------------------
1 | use crate::append_suffix;
2 | use anyhow::{Context, Result, bail};
3 | use memmap2::Mmap;
4 | use rustc_hash::FxHashMap;
5 | use std::{
6 |     // collections::HashMap,
7 |     fs::File,
8 |     io::{BufRead, BufReader},
9 |     path::Path,
10 | };
11 | 
12 | 
13 | 
14 | pub fn safe_mmap_readonly(path: &Path) -> Result<Mmap> {
15 |     let file = File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
16 |     unsafe { Mmap::map(&file) }.with_context(|| format!("Failed to mmap file: {:?}", path))
17 | }
18 | 
19 | pub fn load_sqs<P: AsRef<Path>>(path: P) -> Result<(Vec<String>, FxHashMap<String, u32>)> {
20 |     let path = path.as_ref();
21 |     let sqs_path = append_suffix(path, ".sqs");
22 |     let file = File::open(&sqs_path)
23 |         .with_context(|| format!("Failed to open SQS file: {:?}", &sqs_path))?;
24 |     let reader = BufReader::new(file);
25 | 
26 |     let id_to_name: Vec<String> = reader.lines().collect::<Result<_, _>>()?;
27 |     let name_to_id: FxHashMap<_, _> = id_to_name
28 |         .iter()
29 |         .enumerate()
30 |         .map(|(id, name)| (name.clone(), id as u32))
31 |         .collect();
32 | 
33 |     Ok((id_to_name, name_to_id))
34 | }
35 | 
36 | 
37 | pub fn load_atn(path: &Path) -> Result<(String, Vec<String>)> {
38 |     let atn_path = append_suffix(path, ".atn");
39 |     let mmap = safe_mmap_readonly(&atn_path)?;
40 |     let data = &mmap[..];
41 | 
42 |     let mut values = Vec::new();
43 |     let mut attr_name: Option<String> = None;
44 | 
45 |     // Helper to process a single line (without trailing '\n')
46 |     let mut push_line = |bytes: &[u8]| -> Result<()> {
47 |         if bytes.is_empty() {
48 |             return Ok(());
49 |         }
50 |         let mut line = std::str::from_utf8(bytes)
51 |             .context("ATN contains invalid UTF-8")?
52 |             .trim();
53 | 
54 |         // Strip UTF-8 BOM if it appears at the start of the very first line
55 |         if attr_name.is_none() && line.starts_with('\u{feff}') {
56 |             line = &line['\u{feff}'.len_utf8()..];
57 |         }
58 | 
59 |         if let Some(rest) = line.strip_prefix("#attribute=") {
60 |             // Only a single header is allowed
61 |             if attr_name.is_some() {
62 |                 bail!("Multiple #attribute= headers found in .atn file");
63 |             }
64 |             attr_name = Some(rest.to_string());
65 |         } else if !line.is_empty() && !line.starts_with('#') {
66 |             // Collect non-empty, non-comment value lines
67 |             values.push(line.to_string());
68 |         }
69 |         Ok(())
70 |     };
71 | 
72 |     // Scan by '\n'
73 |     let mut start = 0usize;
74 |     for (i, &b) in data.iter().enumerate() {
75 |         if b == b'\n' {
76 |             push_line(&data[start..i])?;
77 |             start = i + 1;
78 |         }
79 |     }
80 |     // Handle a trailing line without '\n'
81 |     if start < data.len() {
82 |         push_line(&data[start..])?;
83 |     }
84 | 
85 |     let attr_name = attr_name
86 |         .ok_or_else(|| anyhow::anyhow!("Missing #attribute=... header in .atn file"))?;
87 | 
88 |     Ok((attr_name, values))
89 | }
--------------------------------------------------------------------------------
/src/index_loader/fts.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Context, Result};
2 | use rustc_hash::{FxHashMap, FxHashSet};
3 | use std::path::Path;
4 | use std::sync::OnceLock; // lazy cache
5 | use rayon::prelude::*;
6 | use crate::{append_suffix, safe_mmap_readonly};
7 | 
8 | #[derive(Debug)]
9 | pub struct FtsMap {
10 |     pub ids: Vec<String>,
11 |     /// String -> numeric ID (u32)
12 |     index_fwd: OnceLock<FxHashMap<String, u32>>,
13 | }
14 | 
15 | impl FtsMap {
16 |     fn build_fwd(&self) -> FxHashMap<String, u32> {
17 |         let mut m = FxHashMap::with_capacity_and_hasher(self.ids.len(), Default::default());
18 |         for (i, s) in self.ids.iter().enumerate() {
19 |             m.insert(s.clone(), i as u32);
20 |         }
21 |         m
22 |     }
23 | 
24 |     /// Access forward index (String -> u32)
25 |     pub fn index_fwd(&self) -> &FxHashMap<String, u32> {
26 |         self.index_fwd.get_or_init(|| self.build_fwd())
27 |     }
28 | 
29 |     /// Convert string ID to numeric fid
30 |     pub fn get_fid(&self, id: &str) -> Option<u32> {
31 |         self.index_fwd().get(id).copied()
32 |     }
33 | 
34 |     /// Convert numeric fid to string ID (direct from ids vec)
35 |     pub fn get_id(&self, fid: u32) -> Option<&str> {
36 |         self.ids.get(fid as usize).map(|s| s.as_str())
37 |     }
38 | 
39 |     /// Map a set of feature names to their fids.
40 |     /// Returns: (found_fids, missing_names)
41 |     pub fn map_fnames_to_fids(
42 |         &self,
43 |         feature_names: &FxHashSet<String>,
44 |         threads: usize,
45 |     ) -> (FxHashSet<u32>, Vec<String>) {
46 |         enum Either<L, R> { Left(L), Right(R) }
47 | 
48 |         let idx = self.index_fwd();
49 | 
50 |         let mapper = |fname: &String| {
51 |             if let Some(&fid) = idx.get(fname) {
52 |                 Either::Left(fid)
53 |             } else {
54 |                 Either::Right(fname.clone())
55 |             }
56 |         };
57 | 
58 |         if threads > 1 && feature_names.len() > 2 {
59 |             feature_names
60 |                 .par_iter()
61 |                 .map(mapper)
62 |                 .fold(
63 |                     || (FxHashSet::default(), Vec::new()),
64 |                     |mut acc, e| {
65 |                         match e {
66 |                             Either::Left(n) => { acc.0.insert(n); }
67 |                             Either::Right(s) => { acc.1.push(s); }
68 |                         }
69 |                         acc
70 |                     },
71 |                 )
72 |                 .reduce(
73 |                     || (FxHashSet::default(), Vec::new()),
74 |                     |mut a, b| {
75 |                         a.0.extend(b.0);
76 |                         a.1.extend(b.1);
77 |                         a
78 |                     },
79 |                 )
80 |         } else {
81 |             let mut set = FxHashSet::default();
82 |             set.reserve(feature_names.len());
83 |             let mut miss = Vec::new();
84 |             for fname in feature_names {
85 |                 if let Some(&fid) = idx.get(fname) {
86 |                     set.insert(fid);
87 |                 } else {
88 |                     miss.push(fname.clone());
89 |                 }
90 |             }
91 |             (set, miss)
92 |         }
93 |     }
94 | }
95 | 
96 | /// Load `.fts` file into FtsMap
97 | pub fn load_fts<P: AsRef<Path>>(gff_path: P) -> Result<FtsMap> {
98 |     let path = gff_path.as_ref();
99 |     let fts_path = append_suffix(path, ".fts");
100 | 
101 |     let mmap = safe_mmap_readonly(&fts_path)
102 |         .with_context(|| format!("Failed to mmap {}", fts_path.display()))?;
103 |     let data = &mmap[..];
104 | 
105 |     let mut lines = Vec::new();
106 |     let mut start = 0;
107 | 
108 |     for (i, &b) in data.iter().enumerate() {
109 |         if b == b'\n' {
110 |             let mut slice = &data[start..i];
111 |             if !slice.is_empty() {
112 |                 if slice.ends_with(b"\r") {
113 |                     slice = &slice[..slice.len() - 1];
114 |                 }
115 |                 let s = std::str::from_utf8(slice)
116 |                     .with_context(|| format!("FTS contains invalid UTF-8 at byte {}", start))?;
117 |                 lines.push(s.to_string());
118 |             }
119 |             start = i + 1;
120 |         }
121 |     }
122 |     if start < data.len() {
123 |         let mut slice = &data[start..];
124 |         if !slice.is_empty() {
125 |             if slice.ends_with(b"\r") {
126 |                 slice = &slice[..slice.len() - 1];
127 |             }
128 |             let s = std::str::from_utf8(slice)
129 |                 .with_context(|| format!("FTS contains invalid UTF-8 at byte {}", start))?;
130 |             lines.push(s.to_string());
131 |         }
132 |     }
133 | 
134 |     Ok(FtsMap {
135 |         ids: lines,
136 |         index_fwd: OnceLock::new(),
137 |     })
138 | }
139 | 
--------------------------------------------------------------------------------
/src/index_loader/gof.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{bail, Context, Result};
2 | use byteorder::{ByteOrder, LittleEndian};
3 | use rustc_hash::FxHashMap;
4 | use std::{path::Path, sync::OnceLock};
5 | use crate::{append_suffix, safe_mmap_readonly};
6 | 
7 | const MISSING: u64 = u64::MAX; // Sentinel value for missing entries
8 | 
9 | #[derive(Debug)]
10 | pub struct GofEntry {
11 |     pub feature_id: u32,
12 |     pub seqid_num: u32,
13 |     pub start_offset: u64,
14 |     pub end_offset: u64,
15 | }
16 | 
17 | 
18 | 
19 | #[derive(Debug)]
20 | pub struct GofMap {
21 |     /// Flat list of (feature_id, seqid_num, start, end) entries as read from .gof
22 |     pub entries: Vec<GofEntry>,
23 |     /// Lazy, thread-safe cache: feature_id -> (start, end)
24 |     index_cache: OnceLock<FxHashMap<u32, (u64, u64)>>,
25 |     pub seqid_index: FxHashMap<u32, Vec<usize>>, // seqid_num -> entry indices
26 | 
27 | }
28 | 
29 | impl GofMap {
30 |     /// Build a transient index (allocates on every call).
31 |     /// Prefer `index_cached()` for hot paths.
32 |     pub fn index(&self) -> FxHashMap<u32, (u64, u64)> {
33 |         self.entries
34 |             .iter()
35 |             .map(|e| (e.feature_id, (e.start_offset, e.end_offset)))
36 |             .collect()
37 |     }
38 | 
39 |     /// Get (or build once) the cached index.
40 |     #[inline]
41 |     pub fn index_cached(&self) -> &FxHashMap<u32, (u64, u64)> {
42 |         self.index_cache.get_or_init(|| self.index())
43 |     }
44 | 
45 |     /// O(1) lookup using the cached index.
46 |     #[inline]
47 |     pub fn get(&self, fid: u32) -> Option<&(u64, u64)> {
48 |         self.index_cached().get(&fid)
49 |     }
50 | 
51 |     /// Map a list of root IDs to their (start, end) offsets using the cached index.
52 |     /// Roots missing from the index are kept in the output, with `u64::MAX` sentinel offsets.
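    ///
    /// A minimal sketch (hypothetical root IDs; assumes a `.gof` file built by
    /// `gffx index`):
    /// ```no_run
    /// # fn main() -> anyhow::Result<()> {
    /// let gof = gffx::load_gof("genes.gff")?;
    /// for (fid, start, end) in gof.roots_to_offsets(&[0, 42], 1) {
    ///     if start != u64::MAX {
    ///         println!("root {fid} spans bytes {start}..{end}");
    ///     }
    /// }
    /// # Ok(()) }
    /// ```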
53 |     #[inline]
54 |     pub fn roots_to_offsets(
55 |         &self,
56 |         roots: &[u32],
57 |         threads: usize,
58 |     ) -> Vec<(u32, u64, u64)> {
59 |         let idx = self.index_cached();
60 | 
61 |         // Simple heuristic: parallelize only for large inputs
62 |         let should_parallel = threads > 1 && roots.len() > 2048;
63 | 
64 |         if should_parallel {
65 |             use rayon::prelude::*;
66 |             roots
67 |                 .par_iter()
68 |                 .map(|&r| match idx.get(&r) {
69 |                     Some(&(s, e)) => (r, s, e),
70 |                     None => (r, MISSING, MISSING), // use u64::MAX as sentinel value
71 |                 })
72 |                 .collect()
73 |         } else {
74 |             let mut out = Vec::with_capacity(roots.len());
75 |             for &r in roots {
76 |                 if let Some(&(s, e)) = idx.get(&r) {
77 |                     out.push((r, s, e));
78 |                 } else {
79 |                     out.push((r, MISSING, MISSING)); // use u64::MAX as sentinel value
80 |                 }
81 |             }
82 |             out
83 |         }
84 |     }
85 | 
86 |     pub fn roots_for_seqid(&self, seqid_num: u32) -> Vec<&GofEntry> {
87 |         match self.seqid_index.get(&seqid_num) {
88 |             Some(indices) => indices.iter().map(|&i| &self.entries[i]).collect(),
89 |             None => Vec::new(),
90 |         }
91 |     }
92 | }
93 | 
94 | /// Load a `.gof` file containing (u32 fid, u32 seqid_num, u64 start, u64 end) records.
95 | pub fn load_gof<P: AsRef<Path>>(gff_path: P) -> Result<GofMap> {
96 |     let path = gff_path.as_ref();
97 |     let gof_path = append_suffix(path, ".gof");
98 |     let mmap = safe_mmap_readonly(&gof_path)
99 |         .with_context(|| format!("Failed to mmap {}", gof_path.display()))?;
100 |     let bytes = &mmap[..];
101 |     const REC_SIZE: usize = 4 + 4 + 8 + 8;
102 | 
103 |     if bytes.len() % REC_SIZE != 0 {
104 |         bail!(
105 |             "Corrupted GOF ({}): length {} not multiple of {}",
106 |             gof_path.display(),
107 |             bytes.len(),
108 |             REC_SIZE
109 |         );
110 |     }
111 | 
112 |     let mut entries = Vec::with_capacity(bytes.len() / REC_SIZE);
113 |     let mut seqid_index: FxHashMap<u32, Vec<usize>> = FxHashMap::default();
114 |     for (i, rec) in bytes.chunks_exact(REC_SIZE).enumerate() {
115 |         let fid = LittleEndian::read_u32(&rec[0..4]);
116 |         let seqid_num = LittleEndian::read_u32(&rec[4..8]);
117 |         let start = LittleEndian::read_u64(&rec[8..16]);
118 |         let end = LittleEndian::read_u64(&rec[16..24]);
119 |         entries.push(GofEntry { feature_id: fid, seqid_num, start_offset: start, end_offset: end });
120 |         seqid_index.entry(seqid_num).or_default().push(i);
121 |     }
122 | 
123 |     Ok(GofMap {
124 |         entries,
125 |         index_cache: OnceLock::new(),
126 |         seqid_index,
127 |     })
128 | }
129 | 
--------------------------------------------------------------------------------
/src/index_loader/prt.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{bail, Result};
2 | use byteorder::{ByteOrder, LittleEndian};
3 | use rustc_hash::FxHashMap;
4 | use std::{path::Path, sync::OnceLock};
5 | use rayon::prelude::*; // Parallel iteration (no feature gate)
6 | 
7 | use crate::{append_suffix, safe_mmap_readonly};
8 | 
9 | #[derive(Debug, Clone, Copy)]
10 | pub struct PrtEntry {
11 |     /// Child node id
12 |     pub child: u32,
13 |     /// Parent node id
14 |     pub parent: u32,
15 | }
16 | 
17 | #[derive(Debug)]
18 | pub struct PrtMap {
19 |     /// Flat list of (child -> parent) entries, where index i is the child id
20 |     pub entries: Vec<PrtEntry>,
21 |     /// Lazy, thread-safe cache for the child -> parent index
22 |     index_cache: OnceLock<FxHashMap<u32, u32>>,
23 | }
24 | 
25 | impl PrtMap {
26 |     /// Construct a PrtMap from a list of entries.
27 |     pub fn new(entries: Vec<PrtEntry>) -> Self {
28 |         Self {
29 |             entries,
30 |             index_cache: OnceLock::new(),
31 |         }
32 |     }
33 | 
34 |     /// Build a child -> parent hashmap for O(1) lookups (allocates on every call).
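    ///
    /// A minimal sketch with hand-built entries (illustrative IDs only):
    /// ```
    /// use gffx::index_loader::prt::{PrtEntry, PrtMap};
    /// let prt = PrtMap::new(vec![
    ///     PrtEntry { child: 0, parent: 0 }, // self-parented root
    ///     PrtEntry { child: 1, parent: 0 },
    /// ]);
    /// assert_eq!(prt.index_cached().get(&1), Some(&0));
    /// ```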
35 |     pub fn index(&self) -> FxHashMap<u32, u32> {
36 |         self.entries.iter().map(|e| (e.child, e.parent)).collect()
37 |     }
38 | 
39 |     /// Get the cached child -> parent index, building it once on first use.
40 |     pub fn index_cached(&self) -> &FxHashMap<u32, u32> {
41 |         self.index_cache.get_or_init(|| self.index())
42 |     }
43 | 
44 |     /// Small helper: get the parent of a child via linear scan (O(n)).
45 |     pub fn get_parent(&self, child: u32) -> Option<u32> {
46 |         self.entries
47 |             .iter()
48 |             .find(|e| e.child == child)
49 |             .map(|e| e.parent)
50 |     }
51 | 
52 |     /// Resolve the root of a node by following parent pointers via *array access* (fast path).
53 |     #[inline]
54 |     fn resolve_root(&self, start: u32) -> (u32, bool) {
55 |         let n = self.entries.len() as u32;
56 |         let mut cur = start;
57 | 
58 |         loop {
59 |             if cur >= n {
60 |                 return (u32::MAX, true);
61 |             }
62 |             let p = self.entries[cur as usize].parent;
63 | 
64 |             if p == cur {
65 |                 return (cur, false);
66 |             }
67 |             if p >= n {
68 |                 return (u32::MAX, true);
69 |             }
70 |             cur = p;
71 |         }
72 |     }
73 | 
74 |     /// Map a Vec<u32> of FIDs to a Vec<u32> of root FIDs using the fast resolver.
75 |     /// - If a FID equals `u32::MAX`, keep it as-is (sentinel).
76 |     /// - Output order matches the input order.
77 |     #[inline]
78 |     pub fn map_fids_to_roots(&self, fids: &Vec<u32>, threads: usize) -> Vec<u32> {
79 |         let should_parallel = threads > 1 && fids.len() > 256; // tune threshold as needed
80 | 
81 |         if should_parallel {
82 |             fids.par_iter()
83 |                 .map(|&fid| {
84 |                     if fid == u32::MAX {
85 |                         u32::MAX
86 |                     } else {
87 |                         self.resolve_root(fid).0
88 |                     }
89 |                 })
90 |                 .collect()
91 |         } else {
92 |             let mut out = Vec::with_capacity(fids.len());
93 |             for &fid in fids {
94 |                 if fid == u32::MAX {
95 |                     out.push(u32::MAX);
96 |                 } else {
97 |                     out.push(self.resolve_root(fid).0);
98 |                 }
99 |             }
100 |             out
101 |         }
102 |     }
103 | }
104 | 
105 | /// Load a `.prt` file that encodes parent pointers as a u32 array.
106 | /// Each 4-byte little-endian word is the parent id of the child at the same index.
107 | /// For child i, parent = data[i].
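///
/// A minimal usage sketch (hypothetical path; `u32::MAX` inputs pass through
/// as sentinels):
/// ```no_run
/// # fn main() -> anyhow::Result<()> {
/// let prt = gffx::load_prt("genes.gff")?;
/// let roots = prt.map_fids_to_roots(&vec![0, 7, u32::MAX], 1);
/// # Ok(()) }
/// ```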
108 | pub fn load_prt<P: AsRef<Path>>(gff_path: P) -> Result<PrtMap> {
109 |     let path = gff_path.as_ref();
110 |     let prt_path = append_suffix(path, ".prt");
111 |     let mmap = safe_mmap_readonly(&prt_path)?;
112 |     if mmap.len() % 4 != 0 {
113 |         bail!("Corrupted PRT: not aligned to u32");
114 |     }
115 | 
116 |     let mut entries = Vec::with_capacity(mmap.len() / 4);
117 |     for (i, chunk) in mmap.chunks_exact(4).enumerate() {
118 |         let parent = LittleEndian::read_u32(chunk);
119 |         entries.push(PrtEntry {
120 |             child: i as u32,
121 |             parent,
122 |         });
123 |     }
124 |     Ok(PrtMap::new(entries))
125 | }
126 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | // src/lib.rs
2 | pub mod commands;
3 | pub mod index_builder;
4 | pub mod index_loader;
5 | pub mod utils;
6 | 
7 | pub use index_builder::core::build_index;
8 | pub use index_loader::{
9 |     core::{load_atn, load_sqs, safe_mmap_readonly},
10 |     gof::{GofMap, load_gof},
11 |     fts::{FtsMap, load_fts},
12 |     prt::{PrtMap, load_prt},
13 |     a2f::{A2fMap, load_a2f}
14 | };
15 | 
16 | 
17 | pub use utils::common::{
18 |     CommonArgs, append_suffix, check_index_files_exist, write_gff_output, write_gff_output_filtered,
19 | };
20 | pub use utils::tree_io::{save_multiple_trees, write_offsets_to_file};
21 | pub use utils::tree::{Interval, IntervalTree};
22 | pub use utils::tree_index::TreeIndexData;
23 | 
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use clap::{Parser, Subcommand};
3 | use gffx::commands::*;
4 | 
5 | #[derive(Parser)]
6 | #[command(
7 |     name = "gffx",
8 |     version,
9 |     about = concat!("GFFx: An ultra-fast feature extractor for GFF files\nVersion: ", env!("CARGO_PKG_VERSION")),
10 |     propagate_version = true
11 | )]
12 | struct Cli {
13 |     #[command(subcommand)]
14 |     command: Commands,
15 | }
16 | 
17 | #[derive(Subcommand)]
18 | enum Commands {
19 |     Index(IndexArgs),
20 |     Intersect(IntersectArgs),
21 |     Extract(ExtractArgs),
22 |     Search(SearchArgs),
23 |     Coverage(CoverageArgs),
24 |     Depth(DepthArgs),
25 |     Sample(SampleArgs),
26 | }
27 | 
28 | fn main() -> Result<()> {
29 |     let cli = Cli::parse();
30 | 
31 |     match cli.command {
32 |         Commands::Index(args) => run_index(&args)?,
33 |         Commands::Intersect(args) => run_intersect(&args)?,
34 |         Commands::Extract(args) => run_extract(&args)?,
35 |         Commands::Search(args) => run_search(&args)?,
36 |         Commands::Coverage(args) => run_coverage(&args)?,
37 |         Commands::Depth(args) => run_depth(&args)?,
38 |         Commands::Sample(args) => run_sample(&args)?,
39 |     }
40 | 
41 |     Ok(())
42 | }
43 | 
--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
1 | pub mod common;
2 | pub mod tree;
3 | pub mod tree_io;
4 | pub mod tree_index;
5 | 
6 | pub use tree::{Interval, IntervalTree};
7 | pub use tree_index::TreeIndexData;
--------------------------------------------------------------------------------
/src/utils/common.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Result, Context};
2 | use clap::{Parser, CommandFactory};
3 | use clap::error::ErrorKind;
4 | use memchr::{memchr, memmem};
5 | use memmap2::Mmap;
6 | use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
7 | use rustc_hash::{FxHashMap, FxHashSet};
8 | use std::{
9 |     fs::File,
10 |     io::{BufWriter, IoSlice, Write, stdout},
11 |     path::{Path, PathBuf},
12 |     str,
13 | };
14 | 
15 | const MISSING: u64 = u64::MAX; // Sentinel value for missing entries
16 | 
17 | #[derive(Debug, Clone, Parser)]
18 | pub struct CommonArgs {
19 |     /// Input GFF file path
20 |     #[arg(short = 'i', long = "input", value_name = "FILE")]
21 |     pub input: PathBuf,
22 | 
23 |     /// Output file (stdout if not provided)
24 |     #[arg(short = 'o', long = "output", value_name = "FILE")]
25 |     pub output: Option<PathBuf>,
26 | 
27 |     /// Return the entire feature group for each match (entire-group mode); default: only the matched feature (per-feature mode).
28 |     #[arg(short = 'e', long = "entire_group", default_value_t = false)]
29 |     pub entire_group: bool,
30 | 
31 |     /// Comma-separated feature types to retain (e.g. exon,gene); only effective in per-feature mode
32 |     #[arg(short = 'T', long = "types", value_name = "TYPES")]
33 |     pub types: Option<String>,
34 | 
35 |     /// Number of threads for parallel processing
36 |     #[arg(
37 |         short = 't',
38 |         long = "threads",
39 |         default_value_t = 12,
40 |         value_name = "NUM",
41 |     )]
42 |     pub threads: usize,
43 | 
44 |     /// Enable verbose output
45 |     #[arg(
46 |         short = 'v',
47 |         long = "verbose",
48 |         default_value_t = false,
49 |         value_name = "BOOL",
50 |     )]
51 |     pub verbose: bool,
52 | }
53 | 
54 | impl CommonArgs {
55 |     /// Return the number of effective threads:
56 |     /// - If the user sets `--threads 0`, use all available cores
57 |     /// - Otherwise, use the user-specified number
58 |     #[inline]
59 |     pub fn effective_threads(&self) -> usize {
60 |         if self.threads == 0 {
61 |             std::thread::available_parallelism()
62 |                 .map(|n| n.get())
63 |                 .unwrap_or(1)
64 |         } else {
65 |             self.threads
66 |         }
67 |     }
68 | 
69 |     /// Initialize the rayon global thread pool
70 |     /// - Uses `effective_threads()` to decide the number of threads
71 |     /// - Prints info/warning if verbose mode is enabled
72 |     pub fn init_rayon(&self) {
73 |         let n = self.effective_threads();
74 |         match rayon::ThreadPoolBuilder::new().num_threads(n).build_global() {
75 |             Ok(()) => {
76 |                 if self.verbose {
77 |                     eprintln!("[INFO] rayon threads = {}", n);
78 |                 }
79 |             }
80 |             Err(e) => {
81 |                 if self.verbose {
82 |                     eprintln!("[WARN] rayon global pool already initialized: {e}");
83 |                 }
84 |             }
85 |         }
86 |     }
87 | 
88 |     /// Post-parse hook:
89 |     /// - Validate argument combinations
90 |     /// - Print info messages
91 |     /// - Initialize rayon
92 |     pub fn post_parse(&self) -> Result<(), clap::Error> {
93 |         // Custom conflict error: entire-group mode cannot be combined with --types
94 |         if self.entire_group && self.types.is_some() {
95 |             return Err(
96 |                 clap::Error::raw(
97 |                     ErrorKind::ArgumentConflict,
98 |                     "Entire-group mode does not support filtering by feature types (-T/--types).",
99 |                 )
100 |                 .with_cmd(&Self::command())
101 |             );
102 |         }
103 | 
104 |         // Initialize rayon after validation
105 |         self.init_rayon();
106 | 
107 |         Ok(())
108 |     }
109 | 
110 |     /// Combined parse + post-parse helper:
111 |     /// - Parses CLI arguments
112 |     /// - Runs `post_parse()`
113 |     /// - Exits with an error if validation fails
114 |     pub fn parse_and_init() -> Self {
115 |         let args = Self::parse();
116 |         if let Err(e) = args.post_parse() {
117 |             e.exit();
118 |         }
119 |         args
120 |     }
121 | }
122 | 
123 | pub fn append_suffix(path: &Path, suffix: &str) -> PathBuf {
124 |     let parent = path.parent().unwrap_or_else(|| Path::new(""));
125 |     let filename = path.file_name().unwrap_or_default().to_string_lossy();
126 |     parent.join(format!("{filename}{suffix}"))
127 | }
128 | 
129 | /// Write GFF header lines (starting with '#') to output.
130 | /// Returns the byte position after the header.
131 | pub fn write_gff_header<W: Write>(writer: &mut W, gff_buf: &[u8]) -> Result<usize> {
132 |     let mut pos = 0;
133 |     while pos < gff_buf.len() && gff_buf[pos] == b'#' {
134 |         if let Some(nl) = gff_buf[pos..].iter().position(|&b| b == b'\n') {
135 |             let end = pos + nl + 1;
136 |             writer.write_all(&gff_buf[pos..end])?;
137 |             pos = end;
138 |         } else {
139 |             break;
140 |         }
141 |     }
142 |     Ok(pos)
143 | }
144 | 
145 | /// Check if all expected index files for a given GFF exist.
146 | ///
147 | /// Expected suffixes: `.gof`, `.fts`, `.prt`, `.sqs`, `.atn`, `.a2f`, `.rit`, `.rix`.
148 | ///
149 | /// If any are missing, print their suffixes to stderr and return `Ok(false)`;
150 | /// otherwise, return `Ok(true)`.
151 | pub fn check_index_files_exist(gff: &PathBuf) -> Result<bool> {
152 |     let expected_suffixes = [
153 |         ".gof", ".fts", ".prt", ".sqs", ".atn", ".a2f", ".rit", ".rix",
154 |     ];
155 |     let mut missing = Vec::new();
156 | 
157 |     for ext in &expected_suffixes {
158 |         let path = append_suffix(gff, ext);
159 |         if !path.exists() {
160 |             missing.push(ext.to_string());
161 |         }
162 |     }
163 | 
164 |     if !missing.is_empty() {
165 |         eprintln!("Missing index file(s): {:?}", missing);
166 |         Ok(false)
167 |     } else {
168 |         Ok(true)
169 |     }
170 | }
171 | 
172 | /// Write selected byte ranges ("blocks") of a GFF file to an output file or stdout.
173 | ///
174 | /// Features:
175 | /// - Uses memory-mapped I/O for efficiency.
176 | /// - Merges adjacent or overlapping ranges before writing.
177 | /// - Uses vectored I/O (`write_vectored`) to minimize syscalls.
178 | ///
179 | /// # Arguments
180 | /// - `gff_path`: Path to the source GFF file.
181 | /// - `blocks`: A list of `(fid, start, end)` byte ranges to extract; entries whose start is the `u64::MAX` sentinel are skipped with a warning.
182 | /// - `output_path`: Output file path. If `None`, writes to stdout.
183 | /// - `verbose`: Whether to print diagnostic output.
184 | /// (Filtering by feature type is handled by `write_gff_output_filtered`, not here.)
185 | ///
186 | /// # Errors
187 | /// Returns any I/O or mmap errors.
188 | pub fn write_gff_output(
189 |     gff_path: &Path,
190 |     blocks: &[(u32, u64, u64)],
191 |     output_path: &Option<PathBuf>,
192 |     verbose: bool,
193 | ) -> Result<()> {
194 |     let file = File::open(gff_path)?;
195 |     let mmap = unsafe { Mmap::map(&file)? };
196 |     let file_len = mmap.len();
197 | 
198 |     // sort and merge blocks
199 |     let mut sorted: Vec<(u64, u64)> = {
200 |         let mut v = Vec::with_capacity(blocks.len());
201 |         for &(fid, s, e) in blocks {
202 |             if s == MISSING {
203 |                 eprintln!("[WARN] skipped fid={} due to sentinel start offset", fid);
204 |                 continue;
205 |             }
206 |             v.push((s, e));
207 |         }
208 |         v
209 |     };
210 |     sorted.sort_unstable_by_key(|&(s, _)| s);
211 | 
212 |     let mut merged: Vec<(u64, u64)> = Vec::with_capacity(sorted.len());
213 |     let mut it = sorted.into_iter();
214 |     if let Some((mut cs, mut ce)) = it.next() {
215 |         for (s, e) in it {
216 |             if s <= ce {
217 |                 ce = ce.max(e);
218 |             } else {
219 |                 if cs < ce {
220 |                     merged.push((cs, ce));
221 |                 }
222 |                 cs = s;
223 |                 ce = e;
224 |             }
225 |         }
226 |         if cs < ce {
227 |             merged.push((cs, ce));
228 |         }
229 |     }
230 | 
231 |     // build IoSlice list
232 |     let mut slices: Vec<IoSlice<'_>> = Vec::with_capacity(merged.len());
233 |     for &(so, eo) in &merged {
234 |         if so >= eo {
235 |             continue;
236 |         }
237 |         let (start, end) = (so as usize, eo as usize);
238 |         if end > file_len {
239 |             continue;
240 |         }
241 |         slices.push(IoSlice::new(&mmap[start..end]));
242 |     }
243 | 
244 |     // Write in batches
245 |     let mut writer: Box<dyn Write> = match output_path {
246 |         Some(p) => Box::new(BufWriter::new(File::create(p)?)),
247 |         None => Box::new(BufWriter::new(stdout())),
248 |     };
249 | 
250 |     const MAX_IOV: usize = 1024;
251 |     let mut base = 0;
252 |     while base < slices.len() {
253 |         let end = (base + MAX_IOV).min(slices.len());
254 |         let batch = &slices[base..end];
255 | 
256 |         let nw = writer.write_vectored(batch)?;
257 |         let mut remaining = nw;
258 |         let mut i = 0;
259 | 
260 |         while i < batch.len() && remaining >= batch[i].len() {
261 |             remaining -= batch[i].len();
262 |             i += 1;
263 |         }
264 | 
265 |         if i < batch.len() && remaining > 0 {
266 |             let cur = &batch[i];
267 |             writer.write_all(&cur[remaining..])?;
268 |             i += 1;
269 |         }
270 | 
271 |         for s in &batch[i..] {
272 |             writer.write_all(s)?;
273 |         }
274 | 
275 |         base = end;
276 |     }
277 | 
278 |     writer.flush()?;
279 | 
280 |     if verbose {
281 |         eprintln!(
282 |             "Wrote {} merged GFF block(s) with vectored I/O",
283 |             merged.len()
284 |         );
285 |     }
286 |     Ok(())
287 | }
288 | 
289 | pub fn write_gff_output_filtered(
290 |     gff_path: &PathBuf,
291 |     blocks: &[(u32, u64, u64)],
292 |     per_root_matches: &FxHashMap<u32, FxHashSet<String>>,
293 |     atn_attr_name: &str,
294 |     output_path: &Option<PathBuf>,
295 |     types_filter: Option<&str>,
296 |     verbose: bool,
297 | ) -> Result<()> {
298 |     // mmap GFF
299 |     let file =
300 |         File::open(gff_path).with_context(|| format!("Cannot open GFF file: {:?}", gff_path))?;
301 |     let mmap =
302 |         unsafe { Mmap::map(&file) }.with_context(|| format!("mmap failed for {:?}", gff_path))?;
303 |     let file_len = mmap.len();
304 | 
305 |     // parse optional type filter: comma-separated into a HashSet
306 |     let type_allow: Option<FxHashSet<String>> = types_filter.map(|s| {
307 |         s.split(',')
308 |             .map(|t| t.trim().to_string())
309 |             .filter(|t| !t.is_empty())
310 |             .collect()
311 |     });
312 | 
313 |     let bkey: Vec<u8> = {
314 |         let mut k = atn_attr_name.as_bytes().to_vec();
315 |         k.push(b'=');
316 |         k
317 |     };
318 |     let bkey_finder = memmem::Finder::new(&bkey);
319 | 
320 |     // Process blocks in parallel; each task returns (block_start, matched_bytes)
321 |     let mut parts: Vec<(u64, Vec<u8>)> = blocks
322 |         .par_iter()
323 |         .filter_map(|&(root, start, end)| {
324 |             // root -> set of string IDs to keep
325 |             let keep: &FxHashSet<String> = per_root_matches.get(&root)?;
326 |             if keep.is_empty() {
327 |                 return None;
328 |             }
329 | 
330 |             let s = start as usize;
331 |             let e = end.min(file_len as u64) as usize;
332 |             if s >= e || e > file_len {
333 |                 return None;
334 |             }
335 |             let window = &mmap[s..e];
336 | 
337 |             // Output buffer for this block
338 |             let mut out = Vec::<u8>::with_capacity(1024);
339 |             let mut pos = 0usize;
340 | 
341 |             // Iterate lines in [s, e)
342 |             let next_line = |from: usize| -> Option<(usize, usize, usize)> {
343 |                 if from >= window.len() {
344 |                     return None;
345 |                 }
346 |                 // '\n' inclusive; rel is the index after '\n' or end-of-window
347 |                 let rel = memchr(b'\n', &window[from..])
348 |                     .map(|i| from + i + 1)
349 |                     .unwrap_or(window.len());
350 |                 // strip trailing '\n' and optional '\r'
351 |                 let mut end_no_nl = rel;
352 |                 if end_no_nl > from && window[end_no_nl - 1] == b'\n' {
353 |                     end_no_nl -= 1;
354 |                 }
355 |                 if end_no_nl > from && window[end_no_nl - 1] == b'\r' {
356 |                     end_no_nl -= 1;
357 |                 }
358 |                 Some((from, rel, end_no_nl))
359 |             };
360 | 
361 |             // If a type filter is supplied, check that column 3 equals one of the allowed types
362 |             let type_ok = |line: &[u8]| -> bool {
363 |                 if let Some(allow) = &type_allow {
364 |                     // find first three tabs
365 |                     let i1 = match memchr(b'\t', line) {
366 |                         Some(i) => i,
367 |                         None => return false,
368 |                     };
369 |                     let i2 = match memchr(b'\t', &line[i1 + 1..]) {
370 |                         Some(x) => i1 + 1 + x,
371 |                         None => return false,
372 |                     };
373 |                     let i3 = match memchr(b'\t', &line[i2 + 1..]) {
374 |                         Some(x) => i2 + 1 + x,
375 |                         None => return false,
376 |                     };
377 |                     let ty = &line[i2 + 1..i3];
378 |                     if let Ok(ty_str) = std::str::from_utf8(ty) {
379 |                         allow.contains(ty_str)
380 |                     } else {
381 |                         false
382 |                     }
383 |                 } else {
384 |                     true
385 |                 }
386 |             };
387 | 
388 |             // Return true if the attributes column contains the target key (`<attr>=`) with a value in `keep`
389 |             let id_hits_keep = |line_no_crlf: &[u8]| -> bool {
390 |                 // move to 9th field (attributes)
391 |                 let mut off = 0usize;
392 |                 let mut tabs = 0u8;
393 |                 while tabs < 8 {
394 |                     match memchr(b'\t', &line_no_crlf[off..]) {
395 |                         Some(i) => {
396 |                             off += i + 1;
397 |                             tabs += 1;
398 |                         }
399 |                         None => return false,
400 |                     }
401 |                 }
402 |                 let attr = &line_no_crlf[off..];
403 |                 if let Some(p) = bkey_finder.find(attr) {
404 |                     let vstart = p + bkey.len();
405 |                     // value ends at ';' or end-of-line
406 |                     let vend = memchr(b';', &attr[vstart..])
407 |                         .map(|i| vstart + i)
408 |                         .unwrap_or(attr.len());
409 |                     let id_slice = &attr[vstart..vend];
410 |                     if let Ok(id_str) = std::str::from_utf8(id_slice) {
411 |                         return keep.contains(id_str);
412 |                     }
413 |                 }
414 |                 false
415 |             };
416 | 
417 |             // Scan lines in this block window
418 |             while let Some((ls, le, ln_end)) = next_line(pos) {
419 |                 pos = le;
420 |                 let line = &window[ls..le];
421 |                 if !line.is_empty() && line[0] == b'#' {
422 |                     continue; // skip comments
423 |                 }
424 |                 let line_no_crlf = &window[ls..ln_end];
425 | 
426 |                 if !type_ok(line_no_crlf) {
427 |                     continue;
428 |                 }
429 |                 if id_hits_keep(line_no_crlf) {
430 |                     out.extend_from_slice(line);
431 |                 }
432 |             }
433 | 
434 |             if verbose {
435 |                 let matched_lines = out.iter().filter(|&&b| b == b'\n').count();
436 |                 eprintln!(
437 |                     "[filter] root={} block=[{}..{}] keep_ids={} matched_lines={}",
438 |                     root, start, end, keep.len(), matched_lines
439 |                 );
440 |             }
441 | 
442 |             if out.is_empty() {
443 |                 None
444 |             } else {
445 |                 Some((start, out))
446 |             }
447 |         })
448 |         .collect();
449 | 
450 |     // Keep original block order
451 |     parts.sort_unstable_by_key(|(s, _)| *s);
452 | 
453 |     // Write output (stdout or file)
454 |     let raw: Box<dyn Write> = match output_path {
455 |         Some(p) => Box::new(File::create(p).with_context(|| format!("Cannot create output: {:?}", p))?),
456 |         None => Box::new(std::io::stdout()),
457 |     };
458 |     // Bigger buffer reduces syscalls; tune as needed
459 |     let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, raw);
460 |     for (_, buf) in parts {
461 |         writer.write_all(&buf)?;
462 |     }
463 |     writer.flush()?;
464 |     Ok(())
465 | }
466 | 
467 | 
468 | 
--------------------------------------------------------------------------------
/src/utils/tree.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | 
3 | /// Closed interval on [start, end] for point queries.
4 | /// For range queries we use half-open logic.
5 | #[derive(Debug, Clone, Serialize, Deserialize)]
6 | pub struct Interval<T> {
7 |     pub start: T,
8 |     pub end: T,
9 |     pub root_fid: u32,
10 | }
11 | 
12 | #[derive(Debug, Serialize, Deserialize)]
13 | pub struct IntervalTree<T> {
14 |     root: Option<Box<Node<T>>>,
15 | }
16 | 
17 | #[derive(Debug, Serialize, Deserialize)]
18 | struct Node<T> {
19 |     center: T,
20 |     intervals: Vec<Interval<T>>,
21 |     left: Option<Box<Node<T>>>,
22 |     right: Option<Box<Node<T>>>,
23 | }
24 | 
25 | impl<T> IntervalTree<T>
26 | where
27 |     T: Ord + Copy + Serialize + for<'de> Deserialize<'de>,
28 | {
29 |     /// Build a tree from a list of intervals.
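    ///
    /// A minimal sketch with hand-built intervals (illustrative coordinates):
    /// ```
    /// use gffx::{Interval, IntervalTree};
    /// let tree = IntervalTree::new(vec![
    ///     Interval { start: 0u32, end: 100, root_fid: 1 },
    ///     Interval { start: 50, end: 150, root_fid: 2 },
    /// ]);
    /// assert_eq!(tree.query_point(75).len(), 2); // both intervals cover 75
    /// ```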
30 |     pub fn new(intervals: Vec<Interval<T>>) -> Self {
31 |         let root = Self::build(intervals);
32 |         Self { root }
33 |     }
34 | 
35 |     fn build(mut intervals: Vec<Interval<T>>) -> Option<Box<Node<T>>> {
36 |         if intervals.is_empty() {
37 |             return None;
38 |         }
39 | 
40 |         intervals.sort_by_key(|iv| iv.start);
41 |         let mid = intervals.len() / 2;
42 |         let center = intervals[mid].start;
43 | 
44 |         let mut left = Vec::new();
45 |         let mut right = Vec::new();
46 |         let mut center_ivs = Vec::new();
47 | 
48 |         for iv in intervals {
49 |             if iv.end < center {
50 |                 left.push(iv);
51 |             } else if iv.start > center {
52 |                 right.push(iv);
53 |             } else {
54 |                 center_ivs.push(iv);
55 |             }
56 |         }
57 | 
58 |         Some(Box::new(Node {
59 |             center,
60 |             intervals: center_ivs,
61 |             left: Self::build(left),
62 |             right: Self::build(right),
63 |         }))
64 |     }
65 | 
66 |     /// Point query: returns all intervals covering `point` (closed semantics on [start, end]).
67 |     pub fn query_point(&self, point: T) -> Vec<&Interval<T>> {
68 |         let mut result = Vec::new();
69 |         Self::query_point_rec(&self.root, point, &mut result);
70 |         result
71 |     }
72 | 
73 |     fn query_point_rec<'a>(
74 |         node: &'a Option<Box<Node<T>>>,
75 |         point: T,
76 |         result: &mut Vec<&'a Interval<T>>,
77 |     ) {
78 |         if let Some(n) = node {
79 |             for iv in &n.intervals {
80 |                 if iv.start <= point && point <= iv.end {
81 |                     result.push(iv);
82 |                 }
83 |             }
84 |             if point < n.center {
85 |                 Self::query_point_rec(&n.left, point, result);
86 |             } else if point > n.center {
87 |                 Self::query_point_rec(&n.right, point, result);
88 |             } else {
89 |                 // Equal to center: search both sides
90 |                 Self::query_point_rec(&n.left, point, result);
91 |                 Self::query_point_rec(&n.right, point, result);
92 |             }
93 |         }
94 |     }
95 | 
96 |     /// Interval query (half-open semantics): returns intervals `iv` where
97 |     /// `iv.start < end && iv.end > start`.
98 |     pub fn query_interval<'a>(&'a self, start: T, end: T, out: &mut Vec<&'a Interval<T>>) {
99 |         Self::query_interval_rec(&self.root, start, end, out);
100 |     }
101 | 
102 |     fn query_interval_rec<'a>(
103 |         node: &'a Option<Box<Node<T>>>,
104 |         start: T,
105 |         end: T,
106 |         out: &mut Vec<&'a Interval<T>>,
107 |     ) {
108 |         if let Some(n) = node {
109 |             for iv in &n.intervals {
110 |                 if iv.start < end && iv.end > start {
111 |                     out.push(iv);
112 |                 }
113 |             }
114 |             if start < n.center {
115 |                 Self::query_interval_rec(&n.left, start, end, out);
116 |             }
117 |             if end > n.center {
118 |                 Self::query_interval_rec(&n.right, start, end, out);
119 |             }
120 |         }
121 |     }
122 | }
--------------------------------------------------------------------------------
/src/utils/tree_index.rs:
--------------------------------------------------------------------------------
1 | use crate::{IntervalTree, load_sqs, append_suffix};
2 | use anyhow::{bail, Context, Result};
3 | use bincode2::deserialize;
4 | use memmap2::MmapOptions;
5 | use rustc_hash::FxHashMap;
6 | use std::{fs::File, path::Path};
7 | 
8 | /// Application-facing structure:
9 | /// - per-sequence interval trees
10 | /// - string -> numeric ID mapping
11 | #[derive(Debug)]
12 | pub struct TreeIndexData {
13 |     pub chr_entries: FxHashMap<u32, IntervalTree<u32>>,
14 |     pub seqid_to_num: FxHashMap<String, u32>,
15 |     pub num_to_seqid: Vec<String>,
16 | }
17 | 
18 | impl TreeIndexData {
19 |     /// Construct TreeIndexData for an indexed GFF file, deriving the
20 |     /// `.sqs`, `.rit`, and `.rix` paths from `gff_path`.
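    ///
    /// A minimal usage sketch (hypothetical path and seqid; assumes `.sqs`,
    /// `.rit`, and `.rix` were built by `gffx index`):
    /// ```no_run
    /// # fn main() -> anyhow::Result<()> {
    /// let idx = gffx::TreeIndexData::load_tree_index("genes.gff")?;
    /// if let Some(&num) = idx.seqid_to_num.get("chr1") {
    ///     let mut hits = Vec::new();
    ///     idx.chr_entries[&num].query_interval(1_000u32, 5_000, &mut hits);
    /// }
    /// # Ok(()) }
    /// ```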
21 |     pub fn load_tree_index<P: AsRef<Path>>(gff_path: P) -> Result<Self> {
22 |         let path = gff_path.as_ref();
23 |         let (num_to_seqid, seqid_to_num) = load_sqs(path)?;
24 |         let rit_path = append_suffix(path, ".rit");
25 |         let rix_path = append_suffix(path, ".rix");
26 | 
27 |         let chr_entries = Self::load_region_index(&rit_path, &rix_path)?;
28 | 
29 |         Ok(Self {
30 |             chr_entries,
31 |             seqid_to_num,
32 |             num_to_seqid
33 |         })
34 |     }
35 | 
36 |     fn load_region_index(
37 |         rit_path: &Path,
38 |         rix_path: &Path,
39 |     ) -> Result<FxHashMap<u32, IntervalTree<u32>>> {
40 |         let file = File::open(rit_path).with_context(|| format!("open {}", rit_path.display()))?;
41 |         let mmap = unsafe { MmapOptions::new().map(&file) }
42 |             .with_context(|| format!("mmap {}", rit_path.display()))?;
43 |         let buf: &[u8] = &mmap;
44 | 
45 |         let offsets: Vec<u64> = {
46 |             let f = File::open(rix_path).with_context(|| format!("open {}", rix_path.display()))?;
47 |             serde_json::from_reader::<_, Vec<u64>>(f)
48 |                 .with_context(|| format!("parse json {}", rix_path.display()))?
49 |         };
50 |         if offsets.is_empty() {
51 |             return Ok(FxHashMap::default());
52 |         }
53 | 
54 |         for w in offsets.windows(2) {
55 |             if w[0] > w[1] {
56 |                 bail!("offsets not sorted ascending: {:?} > {:?}", w[0], w[1]);
57 |             }
58 |         }
59 |         let last = *offsets.last().unwrap() as usize;
60 |         if last > buf.len() {
61 |             bail!("last offset {} out of file size {}", last, buf.len());
62 |         }
63 | 
64 |         let mut map = FxHashMap::with_capacity_and_hasher(offsets.len(), Default::default());
65 |         for (i, start_u64) in offsets.iter().copied().enumerate() {
66 |             let start = start_u64 as usize;
67 |             let end = if i + 1 < offsets.len() {
68 |                 offsets[i + 1] as usize
69 |             } else {
70 |                 buf.len()
71 |             };
72 |             if end < start || end > buf.len() {
73 |                 bail!("bad slice range: {}..{} (file len {})", start, end, buf.len());
74 |             }
75 |             let slice = &buf[start..end];
76 |             let tree: IntervalTree<u32> = deserialize(slice).with_context(|| {
77 |                 format!("bincode2 deserialize tree #{} ({}..{})", i, start, end)
78 |             })?;
79 |             map.insert(i as u32, tree);
80 |         }
81 |         Ok(map)
82 |     }
83 | }
84 | 
--------------------------------------------------------------------------------
/src/utils/tree_io.rs:
--------------------------------------------------------------------------------
1 | use crate::utils::tree::IntervalTree;
2 | use anyhow::{bail, Context, Result};
3 | use bincode2::{deserialize, deserialize_from, serialize, serialize_into};
4 | 
5 | use memmap2::MmapOptions;
6 | use serde::de::DeserializeOwned;
7 | use std::{
8 |     fs::{self, File},
9 |     io::{BufReader, Read, Seek, SeekFrom, BufWriter, Write},
10 |     path::Path,
11 | };
12 | 
13 | impl<T> IntervalTree<T>
14 | where
15 |     T: Ord + Copy + serde::Serialize + for<'de> serde::Deserialize<'de>,
16 | {
17 |     /// Serialize the whole tree to a file via bincode2.
18 |     pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> {
19 |         let encoded = serialize(self).map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
20 |         fs::write(path, encoded)?;
21 |         Ok(())
22 |     }
23 | 
24 |     /// Deserialize a tree from a file (whole-file read).
25 |     pub fn load_from_file(path: &Path) -> std::io::Result<Self> {
26 |         let mut file = File::open(path)?;
27 |         let mut buf = Vec::new();
28 |         file.read_to_end(&mut buf)?;
29 |         let tree: Self = deserialize(&buf)
30 |             .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
31 |         Ok(tree)
32 |     }
33 | }
34 | 
35 | /// Save multiple trees back-to-back into a single `.rit` file.
36 | /// Offsets of each tree (in bytes) should be recorded separately.
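///
/// A minimal round-trip sketch (hypothetical output paths):
/// ```no_run
/// # fn main() -> anyhow::Result<()> {
/// use gffx::{Interval, IntervalTree, save_multiple_trees, write_offsets_to_file};
/// use std::path::Path;
/// let trees = vec![IntervalTree::new(vec![
///     Interval { start: 0u32, end: 10, root_fid: 0 },
/// ])];
/// let offsets = save_multiple_trees(&trees, Path::new("genes.gff.rit"))?;
/// write_offsets_to_file(&offsets, Path::new("genes.gff.rix"))?;
/// # Ok(()) }
/// ```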
37 | pub fn save_multiple_trees<T>(trees: &[IntervalTree<T>], rit_path: &Path) -> Result<Vec<u64>>
38 | where
39 |     T: Ord + Copy + serde::Serialize + for<'de> serde::Deserialize<'de>,
40 | {
41 |     let mut offsets = Vec::with_capacity(trees.len());
42 |     let file = File::create(rit_path)?;
43 |     let mut writer = BufWriter::new(file);
44 | 
45 |     for tree in trees {
46 |         // Record current offset
47 |         let pos = writer.seek(SeekFrom::Current(0))?;
48 |         offsets.push(pos);
49 |         // Write tree
50 |         serialize_into(&mut writer, tree)?;
51 |     }
52 |     writer.flush()?;
53 |     Ok(offsets)
54 | }
55 | 
56 | /// Write offsets (Vec<u64>) as JSON into the `.rix` file.
57 | pub fn write_offsets_to_file(offsets: &[u64], rix_path: &Path) -> Result<()> {
58 |     let file = File::create(rix_path)?;
59 |     let mut writer = BufWriter::new(file);
60 |     serde_json::to_writer(&mut writer, offsets)?;
61 |     writer.flush()?;
62 |     Ok(())
63 | }
64 | 
65 | /// Load multiple trees with streaming (using BufReader).
66 | pub fn load_trees_streaming<T>(rit_path: &Path, rix_path: &Path) -> Result<Vec<IntervalTree<T>>>
67 | where
68 |     T: Ord + Copy + DeserializeOwned,
69 | {
70 |     let mut reader = BufReader::new(File::open(rit_path)?);
71 |     let offsets: Vec<u64> = {
72 |         let f = File::open(rix_path)?;
73 |         serde_json::from_reader(f)?
74 |     };
75 | 
76 |     let mut trees = Vec::with_capacity(offsets.len());
77 |     for &off in &offsets {
78 |         reader.seek(SeekFrom::Start(off))?;
79 |         let tree: IntervalTree<T> = deserialize_from(&mut reader)
80 |             .map_err(|e| anyhow::anyhow!("Deserializing failed: {}", e))?;
81 |         trees.push(tree);
82 |     }
83 |     Ok(trees)
84 | }
85 | 
86 | /// Load multiple trees via memory-mapping.
87 | pub fn load_trees_mmap<T>(rit_path: &Path, rix_path: &Path) -> Result<Vec<IntervalTree<T>>>
88 | where
89 |     T: Ord + Copy + for<'de> serde::Deserialize<'de>,
90 | {
91 |     let file = File::open(rit_path).with_context(|| format!("open {}", rit_path.display()))?;
92 |     let mmap = unsafe { MmapOptions::new().map(&file) }
93 |         .with_context(|| format!("mmap {}", rit_path.display()))?;
94 |     let buf: &[u8] = &mmap;
95 | 
96 |     let offsets: Vec<u64> = {
97 |         let f = File::open(rix_path).with_context(|| format!("open {}", rix_path.display()))?;
98 |         serde_json::from_reader::<_, Vec<u64>>(f)
99 |             .with_context(|| format!("parse json {}", rix_path.display()))?
100 |     };
101 | 
102 |     if offsets.is_empty() {
103 |         return Ok(Vec::new());
104 |     }
105 | 
106 |     for w in offsets.windows(2) {
107 |         if w[0] > w[1] {
108 |             bail!("offsets not sorted ascending: {:?} > {:?}", w[0], w[1]);
109 |         }
110 |     }
111 |     let last = *offsets.last().unwrap() as usize;
112 |     if last > buf.len() {
113 |         bail!("last offset {} out of file size {}", last, buf.len());
114 |     }
115 | 
116 |     let mut out = Vec::with_capacity(offsets.len());
117 |     for (i, start_u64) in offsets.iter().copied().enumerate() {
118 |         let start = start_u64 as usize;
119 |         let end = if i + 1 < offsets.len() {
120 |             offsets[i + 1] as usize
121 |         } else {
122 |             buf.len()
123 |         };
124 |         if end < start || end > buf.len() {
125 |             bail!("bad slice range: {}..{} (file len {})", start, end, buf.len());
126 |         }
127 |         let slice = &buf[start..end];
128 |         let tree: IntervalTree<T> = deserialize(slice)
129 |             .with_context(|| format!("bincode2 deserialize tree #{} ({}..{})", i, start, end))?;
130 |         out.push(tree);
131 |     }
132 | 
133 |     Ok(out)
134 | }
135 | 
--------------------------------------------------------------------------------