├── MANIFEST.in ├── src ├── main.rs ├── bindings │ ├── mod.rs │ ├── numpy_bindings │ │ ├── mod.rs │ │ ├── tile_numpy.rs │ │ ├── cluster_numpy.rs │ │ ├── max_disjoint_numpy.rs │ │ ├── count_overlaps_numpy.rs │ │ ├── extend_numpy.rs │ │ ├── merge_numpy.rs │ │ ├── boundary_numpy.rs │ │ ├── split_numpy.rs │ │ ├── complement_overlaps_numpy.rs │ │ ├── subtract_numpy.rs │ │ ├── window_numpy.rs │ │ ├── group_cumsum_numpy.rs │ │ ├── overlaps_numpy.rs │ │ ├── nearest_numpy.rs │ │ ├── sort_intervals_numpy.rs │ │ ├── genome_bounds_numpy.rs │ │ ├── complement_numpy.rs │ │ ├── overlaps_simple_numpy.rs │ │ ├── map_to_global_numpy.rs │ │ └── spliced_subsequence_numpy.rs │ └── polars_bindings.rs ├── helpers.rs ├── lib.rs ├── cluster.rs ├── merge.rs ├── boundary.rs ├── max_disjoint.rs ├── group_cumsum.rs ├── extend.rs ├── outside_bounds.rs ├── complement.rs ├── split.rs ├── complement_single.rs ├── ruranges_structs.rs ├── map_to_global.rs ├── subtract.rs ├── overlaps_simple.rs ├── multiprocessing.rs ├── spliced_subsequence.rs ├── sorts.rs ├── tile.rs ├── numpy_bindings.rs └── nearest.rs ├── Cargo.toml ├── pyproject.toml ├── .gitignore ├── LICENSE ├── .github └── workflows │ ├── build_and_publish.yml │ └── CI.yml ├── README.md └── Cargo.lock /MANIFEST.in: -------------------------------------------------------------------------------- 1 | LICENSE 2 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("Hello, world!"); 3 | } -------------------------------------------------------------------------------- /src/bindings/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod polars_bindings; 2 | pub mod numpy_bindings; 3 | -------------------------------------------------------------------------------- /src/helpers.rs: 
-------------------------------------------------------------------------------- 1 | use rustc_hash::FxHashSet; 2 | 3 | use crate::ruranges_structs::OverlapPair; 4 | 5 | 6 | pub fn keep_last_by_idx(pairs: &mut Vec) { 7 | let mut seen_idx = FxHashSet::default(); 8 | pairs.reverse(); 9 | pairs.retain(|pair| seen_idx.insert(pair.idx)); 10 | pairs.reverse(); 11 | } 12 | 13 | 14 | pub fn keep_first_by_idx(pairs: &mut Vec) { 15 | let mut seen_idx = FxHashSet::default(); 16 | pairs.retain(|pair| seen_idx.insert(pair.idx)); 17 | } -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod boundary; 2 | pub mod cluster; 3 | pub mod complement; 4 | pub mod complement_single; 5 | pub mod extend; 6 | pub mod max_disjoint; 7 | pub mod merge; 8 | pub mod nearest; 9 | pub mod outside_bounds; 10 | pub mod overlaps; 11 | pub mod overlaps_simple; 12 | pub mod ruranges_structs; 13 | pub mod sorts; 14 | pub mod spliced_subsequence; 15 | pub mod split; 16 | pub mod subtract; 17 | pub mod tile; 18 | pub mod group_cumsum; 19 | pub mod map_to_global; 20 | 21 | pub mod helpers; 22 | 23 | pub mod bindings; 24 | pub mod numpy_bindings; 25 | 26 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod nearest_numpy; 2 | pub mod overlaps_numpy; 3 | pub mod overlaps_simple_numpy; 4 | pub mod subtract_numpy; 5 | pub mod complement_overlaps_numpy; 6 | pub mod count_overlaps_numpy; 7 | pub mod sort_intervals_numpy; 8 | pub mod cluster_numpy; 9 | pub mod merge_numpy; 10 | pub mod window_numpy; 11 | pub mod tile_numpy; 12 | pub mod complement_numpy; 13 | pub mod boundary_numpy; 14 | pub mod extend_numpy; 15 | pub mod max_disjoint_numpy; 16 | pub mod spliced_subsequence_numpy; 17 | pub mod split_numpy; 18 | pub mod 
genome_bounds_numpy; 19 | pub mod group_cumsum_numpy; 20 | pub mod map_to_global_numpy; 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ruranges" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | [lib] 8 | name = "ruranges" 9 | crate-type = ["cdylib", "rlib"] 10 | 11 | [dependencies] 12 | radsort = "0.1.1" 13 | rustc-hash = "2.1.0" 14 | num-traits = "0.2.19" 15 | 16 | pyo3 = { version = "0.26.0", features = ["extension-module"], optional = false } 17 | numpy = { version = "0.26.0", optional = false } 18 | 19 | # polars = { version = "0.46.0", features = ["csv", "lazy", "dtype-categorical"], optional = false} 20 | # rust-htslib = { version = "0.49.0", optional = false} 21 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.7,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "ruranges" 7 | dependencies = ["numpy"] 8 | requires-python = ">=3.12" 9 | # `authors` is PEP 621 project metadata and belongs in [project]; it was previously nested under [tool.maturin]. 10 | authors = [ 11 | { name = "Endre Bakken Stovner", email = "endrebak@pm.me" } 12 | ] 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "Programming Language :: Python :: 3.12", 16 | "Programming Language :: Rust", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | version = "0.0.15" 22 | 23 | [tool.maturin] 24 | module-name = "ruranges" 25 | features = ["pyo3/extension-module"] 26 | 27 | bindings = "pyo3" 28 | sdist = true 29 | manylinux = "manylinux_2_28" 30 | strip = true 31 | 32 | [project.urls] 33 | Repository = "https://github.com/pyranges/ruranges" 34 |
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | .hypothesis 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | .pytest_cache/ 8 | *.py[cod] 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | .venv/ 16 | env/ 17 | bin/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | include/ 28 | man/ 29 | venv/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | pip-selfcheck.json 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | 47 | # Translations 48 | *.mo 49 | 50 | # Mr Developer 51 | .mr.developer.cfg 52 | .project 53 | .pydevproject 54 | 55 | # Rope 56 | .ropeproject 57 | 58 | # Django stuff: 59 | *.log 60 | *.pot 61 | 62 | .DS_Store 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyCharm 68 | .idea/ 69 | 70 | # VSCode 71 | .vscode/ 72 | 73 | # Pyenv 74 | .python-version 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Endre Bakken Stovner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/cluster.rs: -------------------------------------------------------------------------------- 1 | use crate::{ruranges_structs::{GroupType, PositionType}, sorts}; 2 | 3 | pub fn sweep_line_cluster( 4 | chrs: &[G], 5 | starts: &[T], 6 | ends: &[T], 7 | slack: T, 8 | ) -> (Vec, Vec) { 9 | let mut indices = Vec::with_capacity(chrs.len()); 10 | let mut cluster_ids = Vec::with_capacity(chrs.len()); 11 | 12 | if chrs.is_empty() { 13 | return (cluster_ids, indices); 14 | }; 15 | 16 | let events = sorts::build_sorted_events_single_collection(chrs, starts, ends, slack); 17 | 18 | let mut current_chr = events.first().unwrap().chr; 19 | let mut current_cluster = 0; 20 | let mut active_intervals = 0; 21 | 22 | for e in events { 23 | if e.chr != current_chr { 24 | current_cluster += 1; 25 | active_intervals = 0; 26 | current_chr = e.chr; 27 | } 28 | 29 | if e.is_start { 30 | indices.push(e.idx); 31 | cluster_ids.push(current_cluster); 32 | active_intervals += 1; 33 | } else { 34 | active_intervals -= 1; 35 | if active_intervals == 0 { 36 | current_cluster += 1; 37 | } 38 | } 39 | } 40 | 41 | (cluster_ids, indices) 42 | } 43 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/tile_numpy.rs: -------------------------------------------------------------------------------- 1 
| use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::tile::tile; 5 | 6 | 7 | macro_rules! define_tile_numpy { 8 | ($fname:ident, $pos_ty:ty) => { 9 | #[pyfunction] 10 | #[pyo3(signature = (starts, ends, negative_strand, tile_size))] 11 | pub fn $fname( 12 | starts: PyReadonlyArray1<$pos_ty>, 13 | ends: PyReadonlyArray1<$pos_ty>, 14 | negative_strand: PyReadonlyArray1, 15 | tile_size: $pos_ty, 16 | py: Python<'_>, 17 | ) -> PyResult<( 18 | Py>, // indices 19 | Py>, // tile starts 20 | Py>, // tile ends 21 | Py>, // overlap fraction 22 | )> { 23 | let (t_starts, t_ends, idx, frac) = tile( 24 | starts.as_slice()?, 25 | ends.as_slice()?, 26 | negative_strand.as_slice()?, 27 | tile_size, 28 | ); 29 | Ok(( 30 | idx .into_pyarray(py).to_owned().into(), 31 | t_starts.into_pyarray(py).to_owned().into(), 32 | t_ends .into_pyarray(py).to_owned().into(), 33 | frac .into_pyarray(py).to_owned().into(), 34 | )) 35 | } 36 | }; 37 | } 38 | 39 | // ── concrete instantiations ──────────────────────────────────────────── 40 | define_tile_numpy!(tile_numpy_i64, i64); 41 | define_tile_numpy!(tile_numpy_i32, i32); 42 | define_tile_numpy!(tile_numpy_i16, i16); -------------------------------------------------------------------------------- /src/merge.rs: -------------------------------------------------------------------------------- 1 | use crate::{ruranges_structs::{GroupType, PositionType}, sorts}; 2 | 3 | pub fn sweep_line_merge( 4 | chrs: &[G], 5 | starts: &[T], 6 | ends: &[T], 7 | slack: T, 8 | ) -> (Vec, Vec, Vec, Vec) { 9 | let mut out_indices = Vec::with_capacity(chrs.len()); 10 | let mut out_starts = Vec::with_capacity(chrs.len()); 11 | let mut out_ends = Vec::with_capacity(chrs.len()); 12 | let mut counts = Vec::with_capacity(chrs.len()); 13 | 14 | if chrs.is_empty() { 15 | return (out_indices, out_starts, out_ends, counts); 16 | }; 17 | 18 | let events = sorts::build_sorted_events_single_collection(chrs, 
starts, ends, slack); 19 | 20 | let mut current_chr = events.first().unwrap().chr; 21 | let mut current_start: T = T::zero(); 22 | let mut active_count = 0; 23 | let mut current_cluster_count = 0; 24 | 25 | for e in events { 26 | if e.chr != current_chr { 27 | active_count = 0; 28 | current_cluster_count = 0; 29 | current_chr = e.chr; 30 | } 31 | 32 | if active_count == 0 { 33 | current_start = e.pos; 34 | current_cluster_count = 0; 35 | } 36 | 37 | if e.is_start { 38 | active_count += 1; 39 | current_cluster_count += 1; 40 | } else { 41 | active_count -= 1; 42 | if active_count == 0 { 43 | out_indices.push(e.idx); 44 | out_starts.push(current_start); 45 | out_ends.push(e.pos - slack); 46 | counts.push(current_cluster_count); 47 | } 48 | } 49 | } 50 | 51 | (out_indices, out_starts, out_ends, counts) 52 | } 53 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/cluster_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::cluster::sweep_line_cluster; 5 | 6 | 7 | macro_rules! 
define_cluster_numpy { 8 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 9 | #[pyfunction] 10 | #[pyo3(signature = (chrs, starts, ends, slack = 0))] 11 | #[allow(non_snake_case)] 12 | pub fn $fname( 13 | chrs: PyReadonlyArray1<$chr_ty>, 14 | starts: PyReadonlyArray1<$pos_ty>, 15 | ends: PyReadonlyArray1<$pos_ty>, 16 | slack: $pos_ty, 17 | py: Python<'_>, 18 | ) -> PyResult<(Py>, Py>)> { 19 | let (cluster_ids, idx) = sweep_line_cluster( 20 | chrs.as_slice()?, starts.as_slice()?, ends.as_slice()?, slack, 21 | ); 22 | Ok(( 23 | cluster_ids.into_pyarray(py).to_owned().into(), 24 | idx.into_pyarray(py).to_owned().into(), 25 | )) 26 | } 27 | }; 28 | } 29 | 30 | // ── concrete instantiations ──────────────────────────────────────────── 31 | define_cluster_numpy!(cluster_numpy_u64_i64, u64, i64); 32 | define_cluster_numpy!(cluster_numpy_u32_i64, u32, i64); 33 | define_cluster_numpy!(cluster_numpy_u32_i32, u32, i32); 34 | define_cluster_numpy!(cluster_numpy_u32_i16, u32, i16); 35 | define_cluster_numpy!(cluster_numpy_u16_i64, u16, i64); 36 | define_cluster_numpy!(cluster_numpy_u16_i32, u16, i32); 37 | define_cluster_numpy!(cluster_numpy_u16_i16, u16, i16); 38 | define_cluster_numpy!(cluster_numpy_u8_i64, u8, i64); 39 | define_cluster_numpy!(cluster_numpy_u8_i32, u8, i32); 40 | define_cluster_numpy!(cluster_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/max_disjoint_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::max_disjoint::max_disjoint; 5 | 6 | macro_rules! 
define_max_disjoint_numpy { 7 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[pyo3(signature = (chrs, starts, ends, slack = 0))] 10 | #[allow(non_snake_case)] 11 | pub fn $fname( 12 | chrs: PyReadonlyArray1<$chr_ty>, 13 | starts: PyReadonlyArray1<$pos_ty>, 14 | ends: PyReadonlyArray1<$pos_ty>, 15 | slack: $pos_ty, 16 | py: Python<'_>, 17 | ) -> PyResult>> { 18 | let idx = max_disjoint( 19 | chrs.as_slice()?, 20 | starts.as_slice()?, 21 | ends.as_slice()?, 22 | slack, 23 | ); 24 | Ok(idx.into_pyarray(py).to_owned().into()) 25 | } 26 | }; 27 | } 28 | 29 | // ── concrete instantiations ──────────────────────────────────────────── 30 | define_max_disjoint_numpy!(max_disjoint_numpy_u64_i64, u64, i64); 31 | define_max_disjoint_numpy!(max_disjoint_numpy_u32_i64, u32, i64); 32 | define_max_disjoint_numpy!(max_disjoint_numpy_u32_i32, u32, i32); 33 | define_max_disjoint_numpy!(max_disjoint_numpy_u32_i16, u32, i16); 34 | define_max_disjoint_numpy!(max_disjoint_numpy_u16_i64, u16, i64); 35 | define_max_disjoint_numpy!(max_disjoint_numpy_u16_i32, u16, i32); 36 | define_max_disjoint_numpy!(max_disjoint_numpy_u16_i16, u16, i16); 37 | define_max_disjoint_numpy!(max_disjoint_numpy_u8_i64, u8, i64); 38 | define_max_disjoint_numpy!(max_disjoint_numpy_u8_i32, u8, i32); 39 | define_max_disjoint_numpy!(max_disjoint_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/boundary.rs: -------------------------------------------------------------------------------- 1 | use crate::{ruranges_structs::{GroupType, PositionType}, sorts}; 2 | 3 | /// Collapses each run of same-`chr` events into a single boundary record. 4 | /// 5 | /// Events are built by `sorts::build_sorted_events_single_collection` with zero slack and 6 | /// walked in the order it returns them. One record is emitted every time `chr` changes and 7 | /// one more for the final run. Returns `(indices, starts, ends, counts)`: the `idx` of the 8 | /// run's last event, the `pos` of its first and last events, and the number of events in 9 | /// the run. Empty input returns four empty vectors. 10 | /// 11 | /// NOTE(review): `counts` tallies events, not intervals; if the builder emits a start and an 12 | /// end event per interval this is twice the interval count - confirm that is intended. 13 | pub fn sweep_line_boundary( 14 | chrs: &[G], 15 | starts: &[T], 16 | ends: &[T], 17 | ) -> (Vec, Vec, Vec, Vec) { 18 | let mut out_indices: Vec = Vec::with_capacity(chrs.len()); 19 | let mut out_starts = Vec::with_capacity(chrs.len()); 20 | let mut out_ends = Vec::with_capacity(chrs.len()); 21 | let mut counts = Vec::with_capacity(chrs.len()); 22 | 23 | if chrs.is_empty() { 24 |
return (out_indices, out_starts, out_ends, counts); 25 | }; 26 | 27 | let events = sorts::build_sorted_events_single_collection(chrs, starts, ends, T::zero()); 28 | 29 | let mut current_chr = events.first().unwrap().chr; 30 | let mut current_start = events.first().unwrap().pos; 31 | let final_idx = events.last().unwrap().idx; 32 | let final_end = events.last().unwrap().pos; 33 | let mut prev_pos = T::zero(); 34 | let mut prev_idx = 0; 35 | let mut current_cluster_count = 0; 36 | 37 | for e in events { 38 | if e.chr != current_chr { 39 | // Flush the finished run BEFORE resetting its counter; resetting first made every 40 | // non-final run report a count of 0. 41 | out_indices.push(prev_idx); 42 | out_starts.push(current_start); 43 | out_ends.push(prev_pos); 44 | counts.push(current_cluster_count); 45 | current_cluster_count = 0; 46 | current_chr = e.chr; 47 | current_start = e.pos; 48 | } 49 | 50 | prev_pos = e.pos; 51 | prev_idx = e.idx; 52 | current_cluster_count += 1; 53 | } 54 | 55 | out_indices.push(final_idx); 56 | out_starts.push(current_start); 57 | out_ends.push(final_end); 58 | counts.push(current_cluster_count); 59 | 60 | (out_indices, out_starts, out_ends, counts) 61 | } 62 | -------------------------------------------------------------------------------- /src/max_disjoint.rs: -------------------------------------------------------------------------------- 1 | use radsort::sort; 2 | 3 | use crate::{ruranges_structs::{GroupType, PositionType}, sorts::build_sorted_intervals}; 4 | 5 | pub fn max_disjoint( 6 | groups: &[G], 7 | starts: &[T], 8 | ends: &[T], 9 | slack: T, 10 | ) -> Vec 11 | where 12 | G: GroupType, 13 | T: PositionType, 14 | { 15 | // Ensure the input slices all have the same length. 16 | assert_eq!(groups.len(), starts.len()); 17 | assert_eq!(starts.len(), ends.len()); 18 | 19 | // Build and sort intervals (group ➜ start ➜ end). 20 | let intervals = build_sorted_intervals(groups, starts, ends, None, slack, true); 21 | 22 | if intervals.is_empty() { 23 | return Vec::new(); 24 | } 25 | 26 | // Output indices of the chosen, mutually disjoint intervals.
27 | let mut output: Vec = Vec::with_capacity(intervals.len()); 28 | 29 | // Always accept the first interval of the first group. 30 | let mut current_group = intervals[0].group; 31 | let mut last_end = intervals[0].end; 32 | output.push(intervals[0].idx as u32); 33 | 34 | // Walk through the remaining intervals. 35 | for interval in intervals.iter().skip(1) { 36 | 37 | // NEW: different groups are automatically disjoint – start a fresh tracker. 38 | if interval.group != current_group { 39 | current_group = interval.group; 40 | last_end = interval.end; 41 | output.push(interval.idx as u32); 42 | continue; 43 | } 44 | 45 | // Same group: test true overlap. 46 | if interval.start > last_end + slack { 47 | last_end = interval.end; 48 | output.push(interval.idx as u32); 49 | } 50 | } 51 | 52 | 53 | sort(&mut output); 54 | output 55 | } -------------------------------------------------------------------------------- /src/group_cumsum.rs: -------------------------------------------------------------------------------- 1 | use radsort::sort_by_key; 2 | 3 | use crate::{ruranges_structs::{GroupType, MinInterval, PositionType}, sorts::build_subsequence_intervals}; 4 | 5 | 6 | pub fn sweep_line_cumsum( 7 | chrs: &[G], 8 | starts: &[T], 9 | ends: &[T], 10 | strand_flags: &[bool], 11 | sort: bool, 12 | ) -> (Vec, Vec, Vec) 13 | where 14 | G: GroupType, 15 | T: PositionType, 16 | { 17 | let mut ivals = build_subsequence_intervals(chrs, starts, ends, strand_flags); 18 | 19 | sort_by_key(&mut ivals, |iv| (iv.chr, iv.start)); 20 | 21 | let mut results= Vec::with_capacity(chrs.len()); 22 | 23 | if ivals.is_empty() { 24 | return (Vec::with_capacity(chrs.len()),Vec::with_capacity(chrs.len()), Vec::with_capacity(chrs.len())); 25 | } 26 | 27 | let mut current_chr = ivals[0].chr; 28 | let mut running_total = T::zero(); 29 | 30 | for iv in ivals { 31 | if iv.chr != current_chr { 32 | running_total = T::zero(); 33 | current_chr = iv.chr; 34 | } 35 | 36 | let len = if iv.end >= iv.start { 
iv.end - iv.start } else { iv.start - iv.end }; 37 | 38 | let s = running_total; 39 | let e = running_total + len; 40 | 41 | results.push(MinInterval {idx: iv.idx, start: s, end: e}); 42 | running_total = e; 43 | } 44 | 45 | if sort { 46 | sort_by_key(&mut results, |i| i.idx); 47 | } 48 | 49 | let mut out_idxs = Vec::with_capacity(results.len()); 50 | let mut out_starts = Vec::with_capacity(results.len()); 51 | let mut out_ends = Vec::with_capacity(results.len()); 52 | 53 | for rec in results { 54 | out_idxs.push(rec.idx); 55 | out_starts.push(rec.start); 56 | out_ends.push(rec.end); 57 | } 58 | 59 | (out_idxs, out_starts, out_ends) 60 | } 61 | -------------------------------------------------------------------------------- /.github/workflows/build_and_publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v3 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: "3.12" 17 | 18 | - name: Set up QEMU (for ARM64 emulation) 19 | uses: docker/setup-qemu-action@v3 20 | with: 21 | platforms: "linux/arm64/v8" 22 | 23 | - name: Install build tools (Maturin & cibuildwheel) 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install maturin cibuildwheel 27 | 28 | - name: Build wheels (ManyLinux 2_28 for x86_64 & aarch64) 29 | env: 30 | CIBW_ARCHS: "x86_64 aarch64" # target architectures 31 | CIBW_BUILD: "cp312-* cp313-*" # build only CPython 3.12 and 3.13 32 | CIBW_SKIP: "pp* *-musllinux_*" # skip PyPy and musllinux builds 33 | CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 # use latest ManyLinux 2_28 image (x86_64) 34 | CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 # use latest ManyLinux 2_28 image (aarch64) 35 | CIBW_BEFORE_ALL_LINUX: dnf install -y cargo rustc # install Rust toolchain in 
container 36 | run: python -m cibuildwheel --platform linux --output-dir dist 37 | 38 | - name: Build sdist 39 | run: | 40 | python -m pip install --upgrade pip 41 | python -m pip install build 42 | python -m build --sdist --outdir dist 43 | 44 | - name: Publish to PyPI 45 | uses: pypa/gh-action-pypi-publish@release/v1 46 | with: 47 | user: __token__ 48 | password: ${{ secrets.PYPI_API_TOKEN }} 49 | packages-dir: dist # the action takes a directory via `packages-dir`; `packages` is not one of its inputs 50 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/count_overlaps_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::overlaps::count_overlaps; 5 | 6 | macro_rules! define_count_overlaps_numpy { 7 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[allow(non_snake_case)] 10 | pub fn $fname( 11 | py: Python<'_>, 12 | chrs: PyReadonlyArray1<$chr_ty>, 13 | starts: PyReadonlyArray1<$pos_ty>, 14 | ends: PyReadonlyArray1<$pos_ty>, 15 | chrs2: PyReadonlyArray1<$chr_ty>, 16 | starts2: PyReadonlyArray1<$pos_ty>, 17 | ends2: PyReadonlyArray1<$pos_ty>, 18 | slack: $pos_ty, 19 | ) -> PyResult>> { 20 | let counts = count_overlaps( 21 | chrs.as_slice()?, starts.as_slice()?, ends.as_slice()?, 22 | chrs2.as_slice()?, starts2.as_slice()?, ends2.as_slice()?, 23 | slack, 24 | ); 25 | Ok(counts.into_pyarray(py).to_owned().into()) 26 | } 27 | }; 28 | } 29 | 30 | // ── concrete instantiations ──────────────────────────────────────────── 31 | define_count_overlaps_numpy!(count_overlaps_numpy_u64_i64, u64, i64); 32 | define_count_overlaps_numpy!(count_overlaps_numpy_u32_i64, u32, i64); 33 | define_count_overlaps_numpy!(count_overlaps_numpy_u32_i32, u32, i32); 34 | define_count_overlaps_numpy!(count_overlaps_numpy_u32_i16, u32, i16); 35 | define_count_overlaps_numpy!(count_overlaps_numpy_u16_i64, u16, i64); 36 |
define_count_overlaps_numpy!(count_overlaps_numpy_u16_i32, u16, i32); 37 | define_count_overlaps_numpy!(count_overlaps_numpy_u16_i16, u16, i16); 38 | define_count_overlaps_numpy!(count_overlaps_numpy_u8_i64, u8, i64); 39 | define_count_overlaps_numpy!(count_overlaps_numpy_u8_i32, u8, i32); 40 | define_count_overlaps_numpy!(count_overlaps_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/extend_numpy.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use numpy::{IntoPyArray, PyReadonlyArray1, PyArray1}; 3 | 4 | use crate::extend; 5 | 6 | macro_rules! define_extend_numpy { 7 | ($fname:ident, $grp_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[pyo3(signature = ( 10 | groups, 11 | starts, 12 | ends, 13 | negative_strand, // required: this signature declares no default for it 14 | ext_3, 15 | ext_5 16 | ))] 17 | pub fn $fname( 18 | groups: PyReadonlyArray1<$grp_ty>, 19 | starts: PyReadonlyArray1<$pos_ty>, 20 | ends: PyReadonlyArray1<$pos_ty>, 21 | negative_strand: PyReadonlyArray1, 22 | ext_3: $pos_ty, 23 | ext_5: $pos_ty, 24 | py: Python<'_>, 25 | ) -> PyResult<(Py>, Py>)> { 26 | 27 | let (new_starts, new_ends) = extend::extend_grp( 28 | groups.as_slice()?, starts.as_slice()?, ends.as_slice()?, 29 | negative_strand.as_slice()?, ext_3, ext_5, 30 | ); 31 | 32 | Ok(( 33 | new_starts.into_pyarray(py).to_owned().into(), 34 | new_ends .into_pyarray(py).to_owned().into(), 35 | )) 36 | } 37 | }; 38 | } 39 | 40 | define_extend_numpy!(extend_numpy_u64_i64, u64, i64); 41 | define_extend_numpy!(extend_numpy_u32_i64, u32, i64); 42 | define_extend_numpy!(extend_numpy_u32_i32, u32, i32); 43 | define_extend_numpy!(extend_numpy_u32_i16, u32, i16); 44 | define_extend_numpy!(extend_numpy_u16_i64, u16, i64); 45 | define_extend_numpy!(extend_numpy_u16_i32, u16, i32); 46 | define_extend_numpy!(extend_numpy_u16_i16, u16, i16); 47 |
define_extend_numpy!(extend_numpy_u8_i64, u8, i64); 48 | define_extend_numpy!(extend_numpy_u8_i32, u8, i32); 49 | define_extend_numpy!(extend_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/merge_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::merge::sweep_line_merge; 5 | 6 | 7 | macro_rules! define_merge_numpy { 8 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 9 | #[pyfunction] 10 | #[pyo3(signature = (chrs, starts, ends, slack = 0))] 11 | #[allow(non_snake_case)] 12 | pub fn $fname( 13 | chrs: PyReadonlyArray1<$chr_ty>, 14 | starts: PyReadonlyArray1<$pos_ty>, 15 | ends: PyReadonlyArray1<$pos_ty>, 16 | slack: $pos_ty, 17 | py: Python<'_>, 18 | ) -> PyResult<( 19 | Py>, 20 | Py>, 21 | Py>, 22 | Py>, 23 | )> { 24 | let (idx, m_starts, m_ends, counts) = sweep_line_merge( 25 | chrs.as_slice()?, starts.as_slice()?, ends.as_slice()?, slack, 26 | ); 27 | Ok(( 28 | idx .into_pyarray(py).to_owned().into(), 29 | m_starts .into_pyarray(py).to_owned().into(), 30 | m_ends .into_pyarray(py).to_owned().into(), 31 | counts .into_pyarray(py).to_owned().into(), 32 | )) 33 | } 34 | }; 35 | } 36 | 37 | // ── concrete instantiations ──────────────────────────────────────────── 38 | define_merge_numpy!(merge_numpy_u64_i64, u64, i64); 39 | define_merge_numpy!(merge_numpy_u32_i64, u32, i64); 40 | define_merge_numpy!(merge_numpy_u32_i32, u32, i32); 41 | define_merge_numpy!(merge_numpy_u32_i16, u32, i16); 42 | define_merge_numpy!(merge_numpy_u16_i64, u16, i64); 43 | define_merge_numpy!(merge_numpy_u16_i32, u16, i32); 44 | define_merge_numpy!(merge_numpy_u16_i16, u16, i16); 45 | define_merge_numpy!(merge_numpy_u8_i64, u8, i64); 46 | define_merge_numpy!(merge_numpy_u8_i32, u8, i32); 47 | define_merge_numpy!(merge_numpy_u8_i16, u8, i16); 
-------------------------------------------------------------------------------- /src/bindings/numpy_bindings/boundary_numpy.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use numpy::{IntoPyArray, PyReadonlyArray1, PyArray1}; 3 | 4 | use crate::boundary::sweep_line_boundary; 5 | 6 | macro_rules! define_boundary_numpy { 7 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[allow(non_snake_case)] 10 | pub fn $fname( 11 | py: Python<'_>, 12 | chrs: PyReadonlyArray1<$chr_ty>, 13 | starts: PyReadonlyArray1<$pos_ty>, 14 | ends: PyReadonlyArray1<$pos_ty>, 15 | ) -> PyResult<( 16 | Py>, // indices 17 | Py>, // boundary starts 18 | Py>, // boundary ends 19 | Py>, // counts 20 | )> { 21 | let (idx, b_starts, b_ends, counts) = sweep_line_boundary( 22 | chrs.as_slice()?, starts.as_slice()?, ends.as_slice()?, 23 | ); 24 | Ok(( 25 | idx .into_pyarray(py).to_owned().into(), 26 | b_starts.into_pyarray(py).to_owned().into(), 27 | b_ends .into_pyarray(py).to_owned().into(), 28 | counts .into_pyarray(py).to_owned().into(), 29 | )) 30 | } 31 | }; 32 | } 33 | 34 | // ── concrete instantiations ──────────────────────────────────────────── 35 | define_boundary_numpy!(boundary_numpy_u64_i64, u64, i64); 36 | define_boundary_numpy!(boundary_numpy_u32_i64, u32, i64); 37 | define_boundary_numpy!(boundary_numpy_u32_i32, u32, i32); 38 | define_boundary_numpy!(boundary_numpy_u32_i16, u32, i16); 39 | define_boundary_numpy!(boundary_numpy_u16_i64, u16, i64); 40 | define_boundary_numpy!(boundary_numpy_u16_i32, u16, i32); 41 | define_boundary_numpy!(boundary_numpy_u16_i16, u16, i16); 42 | define_boundary_numpy!(boundary_numpy_u8_i64, u8, i64); 43 | define_boundary_numpy!(boundary_numpy_u8_i32, u8, i32); 44 | define_boundary_numpy!(boundary_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/split_numpy.rs: 
-------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use numpy::{IntoPyArray, PyReadonlyArray1, PyArray1}; 3 | 4 | use crate::split::sweep_line_split; 5 | 6 | macro_rules! define_split_numpy { 7 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[pyo3(signature = (chrs, starts, ends, slack = 0, between = false))] 10 | #[allow(non_snake_case)] 11 | pub fn $fname( 12 | chrs: PyReadonlyArray1<$chr_ty>, 13 | starts: PyReadonlyArray1<$pos_ty>, 14 | ends: PyReadonlyArray1<$pos_ty>, 15 | slack: $pos_ty, 16 | between: bool, 17 | py: Python<'_>, 18 | ) -> PyResult<( 19 | Py>, // indices 20 | Py>, // split starts 21 | Py>, // split ends 22 | )> { 23 | let (idx, s_starts, s_ends) = sweep_line_split( 24 | chrs.as_slice()?, 25 | starts.as_slice()?, 26 | ends.as_slice()?, 27 | slack, 28 | between, 29 | ); 30 | Ok(( 31 | idx .into_pyarray(py).to_owned().into(), 32 | s_starts .into_pyarray(py).to_owned().into(), 33 | s_ends .into_pyarray(py).to_owned().into(), 34 | )) 35 | } 36 | }; 37 | } 38 | 39 | // ── concrete instantiations ──────────────────────────────────────────── 40 | define_split_numpy!(split_numpy_u64_i64, u64, i64); 41 | define_split_numpy!(split_numpy_u32_i64, u32, i64); 42 | define_split_numpy!(split_numpy_u32_i32, u32, i32); 43 | define_split_numpy!(split_numpy_u32_i16, u32, i16); 44 | define_split_numpy!(split_numpy_u16_i64, u16, i64); 45 | define_split_numpy!(split_numpy_u16_i32, u16, i32); 46 | define_split_numpy!(split_numpy_u16_i16, u16, i16); 47 | define_split_numpy!(split_numpy_u8_i64, u8, i64); 48 | define_split_numpy!(split_numpy_u8_i32, u8, i32); 49 | define_split_numpy!(split_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/complement_overlaps_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | 
use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::complement::sweep_line_non_overlaps; 5 | 6 | 7 | macro_rules! define_complement_overlaps_numpy { 8 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 9 | #[pyfunction] 10 | #[allow(non_snake_case)] 11 | pub fn $fname( 12 | py: Python<'_>, 13 | chrs: PyReadonlyArray1<$chr_ty>, 14 | starts: PyReadonlyArray1<$pos_ty>, 15 | ends: PyReadonlyArray1<$pos_ty>, 16 | chrs2: PyReadonlyArray1<$chr_ty>, 17 | starts2: PyReadonlyArray1<$pos_ty>, 18 | ends2: PyReadonlyArray1<$pos_ty>, 19 | slack: $pos_ty, 20 | ) -> PyResult>> { 21 | let idx = sweep_line_non_overlaps( 22 | chrs.as_slice()?, starts.as_slice()?, ends.as_slice()?, 23 | chrs2.as_slice()?, starts2.as_slice()?, ends2.as_slice()?, 24 | slack, 25 | ); 26 | Ok(idx.into_pyarray(py).to_owned().into()) 27 | } 28 | }; 29 | } 30 | 31 | // ── concrete instantiations ──────────────────────────────────────────── 32 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u64_i64, u64, i64); 33 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u32_i64, u32, i64); 34 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u32_i32, u32, i32); 35 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u32_i16, u32, i16); 36 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u16_i64, u16, i64); 37 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u16_i32, u16, i32); 38 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u16_i16, u16, i16); 39 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u8_i64, u8, i64); 40 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u8_i32, u8, i32); 41 | define_complement_overlaps_numpy!(complement_overlaps_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/subtract_numpy.rs: -------------------------------------------------------------------------------- 1 | use 
numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::subtract::sweep_line_subtract; 5 | 6 | macro_rules! define_subtract_numpy { 7 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[allow(non_snake_case)] 10 | pub fn $fname( 11 | py: Python<'_>, 12 | chrs: PyReadonlyArray1<$chr_ty>, 13 | starts: PyReadonlyArray1<$pos_ty>, 14 | ends: PyReadonlyArray1<$pos_ty>, 15 | chrs2: PyReadonlyArray1<$chr_ty>, 16 | starts2: PyReadonlyArray1<$pos_ty>, 17 | ends2: PyReadonlyArray1<$pos_ty>, 18 | ) -> PyResult<(Py>, 19 | Py>, 20 | Py>)> { 21 | let (idx, new_starts, new_ends) = sweep_line_subtract( 22 | chrs.as_slice()?, starts.as_slice()?, ends.as_slice()?, 23 | chrs2.as_slice()?, starts2.as_slice()?, ends2.as_slice()?, 24 | ); 25 | 26 | Ok(( 27 | idx .into_pyarray(py).to_owned().into(), 28 | new_starts .into_pyarray(py).to_owned().into(), 29 | new_ends .into_pyarray(py).to_owned().into(), 30 | )) 31 | } 32 | }; 33 | } 34 | 35 | // ── concrete instantiations ──────────────────────────────────────────── 36 | define_subtract_numpy!(subtract_numpy_u64_i64, u64, i64); 37 | define_subtract_numpy!(subtract_numpy_u32_i64, u32, i64); 38 | define_subtract_numpy!(subtract_numpy_u32_i32, u32, i32); 39 | define_subtract_numpy!(subtract_numpy_u32_i16, u32, i16); 40 | define_subtract_numpy!(subtract_numpy_u16_i64, u16, i64); 41 | define_subtract_numpy!(subtract_numpy_u16_i32, u16, i32); 42 | define_subtract_numpy!(subtract_numpy_u16_i16, u16, i16); 43 | define_subtract_numpy!(subtract_numpy_u8_i64, u8, i64); 44 | define_subtract_numpy!(subtract_numpy_u8_i32, u8, i32); 45 | define_subtract_numpy!(subtract_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/window_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use 
pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::tile::window_grouped; 5 | 6 | macro_rules! define_window_numpy { 7 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[pyo3(signature = (chrs, starts, ends, negative_strand, window_size))] 10 | pub fn $fname( 11 | chrs: PyReadonlyArray1<$chr_ty>, 12 | starts: PyReadonlyArray1<$pos_ty>, 13 | ends: PyReadonlyArray1<$pos_ty>, 14 | negative_strand: PyReadonlyArray1, 15 | window_size: $pos_ty, 16 | py: Python<'_>, 17 | ) -> PyResult<( 18 | Py>, // indices 19 | Py>, // windowed starts 20 | Py>, // windowed ends 21 | )> { 22 | // NB: backend returns (starts, ends, indices) 23 | let (w_starts, w_ends, idx) = window_grouped( 24 | chrs.as_slice()?, 25 | starts.as_slice()?, 26 | ends.as_slice()?, 27 | negative_strand.as_slice()?, 28 | window_size, 29 | ); 30 | 31 | Ok(( 32 | idx .into_pyarray(py).to_owned().into(), 33 | w_starts .into_pyarray(py).to_owned().into(), 34 | w_ends .into_pyarray(py).to_owned().into(), 35 | )) 36 | } 37 | }; 38 | } 39 | 40 | // ── concrete instantiations ──────────────────────────────────────────── 41 | define_window_numpy!(window_numpy_u64_i64, u64, i64); 42 | define_window_numpy!(window_numpy_u32_i64, u32, i64); 43 | define_window_numpy!(window_numpy_u32_i32, u32, i32); 44 | define_window_numpy!(window_numpy_u32_i16, u32, i16); 45 | define_window_numpy!(window_numpy_u16_i64, u16, i64); 46 | define_window_numpy!(window_numpy_u16_i32, u16, i32); 47 | define_window_numpy!(window_numpy_u16_i16, u16, i16); 48 | define_window_numpy!(window_numpy_u8_i64, u8, i64); 49 | define_window_numpy!(window_numpy_u8_i32, u8, i32); 50 | define_window_numpy!(window_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/group_cumsum_numpy.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use numpy::{IntoPyArray, PyReadonlyArray1, PyArray1}; 3 | 4 | 
use crate::group_cumsum::sweep_line_cumsum; 5 | 6 | macro_rules! define_cumsum_numpy { 7 | ($fname:ident, $grp_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[pyo3(signature = ( 10 | groups, 11 | starts, 12 | ends, 13 | negative_strand = None, 14 | sort = true, 15 | ))] 16 | pub fn $fname( 17 | groups: PyReadonlyArray1<$grp_ty>, 18 | starts: PyReadonlyArray1<$pos_ty>, 19 | ends: PyReadonlyArray1<$pos_ty>, 20 | negative_strand: Option>, 21 | sort: bool, 22 | py: Python<'_>, 23 | ) -> PyResult<( 24 | Py>, 25 | Py>, 26 | Py>, 27 | )> 28 | { 29 | use pyo3::exceptions::PyValueError; 30 | 31 | let neg = negative_strand 32 | .ok_or_else(|| PyValueError::new_err("negative_strand is required"))?; 33 | 34 | let (idxs, cumsum_starts, cumsum_ends) = sweep_line_cumsum( 35 | groups.as_slice()?, starts.as_slice()?, ends.as_slice()?, 36 | neg.as_slice()?, sort, 37 | ); 38 | 39 | Ok(( 40 | idxs .into_pyarray(py).to_owned().into(), 41 | cumsum_starts .into_pyarray(py).to_owned().into(), 42 | cumsum_ends .into_pyarray(py).to_owned().into(), 43 | )) 44 | } 45 | }; 46 | } 47 | 48 | define_cumsum_numpy!(group_cumsum_numpy_u64_i64, u64, i64); 49 | define_cumsum_numpy!(group_cumsum_numpy_u32_i64, u32, i64); 50 | define_cumsum_numpy!(group_cumsum_numpy_u32_i32, u32, i32); 51 | define_cumsum_numpy!(group_cumsum_numpy_u32_i16, u32, i16); 52 | define_cumsum_numpy!(group_cumsum_numpy_u16_i64, u16, i64); 53 | define_cumsum_numpy!(group_cumsum_numpy_u16_i32, u16, i32); 54 | define_cumsum_numpy!(group_cumsum_numpy_u16_i16, u16, i16); 55 | define_cumsum_numpy!(group_cumsum_numpy_u8_i64, u8, i64); 56 | define_cumsum_numpy!(group_cumsum_numpy_u8_i32, u8, i32); 57 | define_cumsum_numpy!(group_cumsum_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/extend.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use crate::ruranges_structs::{GroupType, PositionType}; 4 | 5 
| fn check_ext_options( 6 | ext: Option, 7 | ext_3: Option, 8 | ext_5: Option, 9 | ) -> Result<(), &'static str> { 10 | // The condition below is true when either both ext and (ext_3 or ext_5) are provided, 11 | // or when neither is provided. 12 | if ext.is_some() == (ext_3.is_some() || ext_5.is_some()) { 13 | Err("Must use at least one and not both of ext and ext3 or ext5.") 14 | } else { 15 | Ok(()) 16 | } 17 | } 18 | 19 | 20 | /// Extend each group's intervals by modifying only the row with the minimal start 21 | /// and the row with the maximal end for that group. 22 | /// 23 | /// Returns `(group_ids, new_starts, new_ends)`. 24 | pub fn extend_grp( 25 | group_ids: &[G], 26 | starts: &[T], 27 | ends: &[T], 28 | negative_strand: &[bool], 29 | ext_3: T, 30 | ext_5: T, 31 | ) -> (Vec, Vec) { 32 | /* ─── 0. Basic sanity ─────────────────────────────────────────────────── */ 33 | assert_eq!(group_ids.len(), starts.len()); 34 | assert_eq!(starts.len(), ends.len()); 35 | assert_eq!(ends.len(), negative_strand.len()); 36 | 37 | let n = starts.len(); 38 | let mut new_start = starts.to_vec(); 39 | let mut new_end = ends.to_vec(); 40 | 41 | let mut extrema: HashMap = 42 | HashMap::with_capacity(n); 43 | 44 | for i in 0..n { 45 | extrema 46 | .entry(group_ids[i]) 47 | .and_modify(|(min_i, max_i)| { 48 | if starts[i] < starts[*min_i] { *min_i = i; } 49 | if ends [i] > ends [*max_i] { *max_i = i; } 50 | }) 51 | .or_insert((i, i)); 52 | } 53 | 54 | for (_gid, (min_i, max_i)) in extrema { 55 | if negative_strand[min_i] { 56 | new_end [max_i] = new_end [max_i] + ext_5; 57 | new_start[min_i] = new_start[min_i] - ext_3; 58 | } else { 59 | new_start[min_i] = new_start[min_i] - ext_5; 60 | new_end [max_i] = new_end [max_i] + ext_3; 61 | } 62 | } 63 | 64 | (new_start, new_end) 65 | } 66 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/overlaps_numpy.rs: 
-------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::overlaps::overlaps; 5 | 6 | 7 | macro_rules! define_chromsweep_numpy { 8 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 9 | #[pyfunction] 10 | #[allow(non_snake_case)] 11 | pub fn $fname( 12 | py: Python, 13 | chrs: PyReadonlyArray1<$chr_ty>, 14 | starts: PyReadonlyArray1<$pos_ty>, 15 | ends: PyReadonlyArray1<$pos_ty>, 16 | chrs2: PyReadonlyArray1<$chr_ty>, 17 | starts2: PyReadonlyArray1<$pos_ty>, 18 | ends2: PyReadonlyArray1<$pos_ty>, 19 | slack: $pos_ty, 20 | overlap_type: &str, 21 | sort_output: bool, 22 | contained: bool, 23 | ) -> PyResult<(Py>, Py>)> { 24 | let chrs_slice = chrs.as_slice()?; 25 | let starts_slice = starts.as_slice()?; 26 | let ends_slice = ends.as_slice()?; 27 | let chrs_slice2 = chrs2.as_slice()?; 28 | let starts_slice2 = starts2.as_slice()?; 29 | let ends_slice2 = ends2.as_slice()?; 30 | 31 | let (idx1, idx2) = overlaps( 32 | chrs_slice, 33 | starts_slice, 34 | ends_slice, 35 | chrs_slice2, 36 | starts_slice2, 37 | ends_slice2, 38 | slack, 39 | overlap_type, 40 | sort_output, 41 | contained, 42 | ); 43 | Ok(( 44 | idx1.into_pyarray(py).to_owned().into(), 45 | idx2.into_pyarray(py).to_owned().into(), 46 | )) 47 | } 48 | } 49 | } 50 | 51 | define_chromsweep_numpy!(chromsweep_numpy_u64_i64, u64, i64); 52 | define_chromsweep_numpy!(chromsweep_numpy_u32_i64, u32, i64); 53 | define_chromsweep_numpy!(chromsweep_numpy_u32_i32, u32, i32); 54 | define_chromsweep_numpy!(chromsweep_numpy_u32_i16, u32, i16); 55 | define_chromsweep_numpy!(chromsweep_numpy_u16_i64, u16, i64); 56 | define_chromsweep_numpy!(chromsweep_numpy_u16_i32, u16, i32); 57 | define_chromsweep_numpy!(chromsweep_numpy_u16_i16, u16, i16); 58 | define_chromsweep_numpy!(chromsweep_numpy_u8_i64, u8, i64); 59 | define_chromsweep_numpy!(chromsweep_numpy_u8_i32, u8, i32); 60 | 
define_chromsweep_numpy!(chromsweep_numpy_u8_i16, u8, i16); 61 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/nearest_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::nearest::nearest; 5 | 6 | 7 | macro_rules! define_nearest_numpy { 8 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 9 | #[pyfunction] 10 | #[pyo3(signature = ( 11 | chrs, starts, ends, 12 | chrs2, starts2, ends2, 13 | slack = 0, // <$pos_ty>::from(0) at call-site 14 | k = 1, 15 | include_overlaps = true, 16 | direction = "any" 17 | ))] 18 | #[allow(non_snake_case)] 19 | pub fn $fname( 20 | py: Python<'_>, 21 | chrs: PyReadonlyArray1<$chr_ty>, 22 | starts: PyReadonlyArray1<$pos_ty>, 23 | ends: PyReadonlyArray1<$pos_ty>, 24 | chrs2: PyReadonlyArray1<$chr_ty>, 25 | starts2: PyReadonlyArray1<$pos_ty>, 26 | ends2: PyReadonlyArray1<$pos_ty>, 27 | slack: $pos_ty, 28 | k: usize, 29 | include_overlaps: bool, 30 | direction: &str, 31 | ) -> PyResult<(Py>, 32 | Py>, 33 | Py>)> { 34 | let (idx1, idx2, dist) = nearest( 35 | chrs.as_slice()?, starts.as_slice()?, ends.as_slice()?, 36 | chrs2.as_slice()?, starts2.as_slice()?, ends2.as_slice()?, 37 | slack, k, include_overlaps, direction, 38 | ); 39 | 40 | Ok(( 41 | idx1.into_pyarray(py).to_owned().into(), 42 | idx2.into_pyarray(py).to_owned().into(), 43 | dist.into_pyarray(py).to_owned().into(), 44 | )) 45 | } 46 | }; 47 | } 48 | 49 | // ── concrete instantiations ──────────────────────────────────────────── 50 | define_nearest_numpy!(nearest_numpy_u64_i64, u64, i64); 51 | define_nearest_numpy!(nearest_numpy_u32_i64, u32, i64); 52 | define_nearest_numpy!(nearest_numpy_u32_i32, u32, i32); 53 | define_nearest_numpy!(nearest_numpy_u32_i16, u32, i16); 54 | define_nearest_numpy!(nearest_numpy_u16_i64, u16, i64); 55 | 
define_nearest_numpy!(nearest_numpy_u16_i32, u16, i32); 56 | define_nearest_numpy!(nearest_numpy_u16_i16, u16, i16); 57 | define_nearest_numpy!(nearest_numpy_u8_i64, u8, i64); 58 | define_nearest_numpy!(nearest_numpy_u8_i32, u8, i32); 59 | define_nearest_numpy!(nearest_numpy_u8_i16, u8, i16); 60 | -------------------------------------------------------------------------------- /src/outside_bounds.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use crate::ruranges_structs::{GroupType, PositionType}; 4 | 5 | pub fn outside_bounds( 6 | groups: &[G], 7 | starts: &[T], 8 | ends: &[T], 9 | chrom_lens: &[T], 10 | clip: bool, 11 | only_right: bool, 12 | ) -> Result<(Vec, Vec, Vec), String> { 13 | 14 | if starts.len() != ends.len() 15 | || groups.len() != starts.len() 16 | || chrom_lens.len() != starts.len() 17 | { 18 | return Err("All input slices must have the same length".into()); 19 | } 20 | 21 | let n = starts.len(); 22 | let mut idx = Vec::with_capacity(n); 23 | let mut out_starts = Vec::with_capacity(n); 24 | let mut out_ends = Vec::with_capacity(n); 25 | 26 | for i in 0..n { 27 | let size = chrom_lens[i]; 28 | let orig_start = starts[i]; 29 | let orig_end = ends[i]; 30 | 31 | if !clip { 32 | // ===== Removal mode ========================================= 33 | let skip = if only_right { 34 | orig_end > size 35 | } else { 36 | orig_end > size || orig_start < T::zero() 37 | }; 38 | if skip { continue; } 39 | 40 | idx.push(i); 41 | out_starts.push(orig_start); 42 | out_ends.push(orig_end); 43 | } else { 44 | // ===== Clipping mode ======================================== 45 | if only_right { 46 | // whole interval right of the chromosome 47 | if orig_start >= size { continue; } 48 | 49 | let clipped_end = if orig_end > size { size } else { orig_end }; 50 | 51 | idx.push(i); 52 | out_starts.push(orig_start); 53 | out_ends.push(clipped_end); 54 | } else { 55 | // clip on both sides 56 | if 
orig_start >= size || orig_end <= T::zero() { continue; } 57 | 58 | let clipped_start = if orig_start < T::zero() { T::zero() } else { orig_start }; 59 | let clipped_end = if orig_end > size { size } else { orig_end }; 60 | 61 | idx.push(i); 62 | out_starts.push(clipped_start); 63 | out_ends.push(clipped_end); 64 | } 65 | } 66 | } 67 | 68 | let idx_u32: Vec = idx.into_iter().map(|x| x as u32).collect(); 69 | 70 | Ok((idx_u32, out_starts, out_ends)) 71 | } 72 | -------------------------------------------------------------------------------- /src/complement.rs: -------------------------------------------------------------------------------- 1 | use crate::{ruranges_structs::{GroupType, PositionType}, sorts}; 2 | 3 | use rustc_hash::FxHashSet; 4 | 5 | pub fn sweep_line_non_overlaps( 6 | chrs: &[G], 7 | starts: &[T], 8 | ends: &[T], 9 | chrs2: &[G], 10 | starts2: &[T], 11 | ends2: &[T], 12 | slack: T, 13 | ) -> Vec { 14 | let mut no_overlaps = Vec::new(); 15 | 16 | // If either set is empty, none can overlap; return everything as “non-overlapping”. 
17 | if chrs.is_empty() || chrs2.is_empty() { 18 | // Just return all indices as non-overlapping 19 | return no_overlaps.to_vec(); 20 | } 21 | 22 | // Build up the event list in ascending order (same as before) 23 | let events = sorts::build_sorted_events_idxs(chrs, starts, ends, chrs2, starts2, ends2, slack); 24 | 25 | let mut overlapped = FxHashSet::default(); 26 | 27 | // Active sets 28 | let mut active1 = FxHashSet::default(); 29 | let mut active2 = FxHashSet::default(); 30 | 31 | // Assume the first event determines the “current” chr 32 | let mut current_chr = events.first().unwrap().chr; 33 | 34 | for e in events { 35 | // If chromosome changed, clear active sets 36 | if e.chr != current_chr { 37 | active1.clear(); 38 | active2.clear(); 39 | current_chr = e.chr; 40 | } 41 | 42 | if e.is_start { 43 | // Interval is starting 44 | if e.first_set { 45 | // Overlaps with all currently active intervals in set2 46 | if !active2.is_empty() { 47 | overlapped.insert(e.idx); 48 | } 49 | // Insert into active1 50 | active1.insert(e.idx); 51 | } else { 52 | // Overlaps with all currently active intervals in set1 53 | for &idx1 in active1.iter() { 54 | overlapped.insert(idx1); 55 | } 56 | // Insert into active2 57 | active2.insert(e.idx); 58 | } 59 | } else { 60 | // Interval is ending 61 | if e.first_set { 62 | active1.remove(&e.idx); 63 | if !overlapped.contains(&e.idx) { 64 | no_overlaps.push(e.idx); 65 | } 66 | } else { 67 | active2.remove(&e.idx); 68 | } 69 | 70 | overlapped.remove(&e.idx); 71 | } 72 | } 73 | 74 | radsort::sort(&mut no_overlaps); 75 | no_overlaps 76 | } 77 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/sort_intervals_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::{pyfunction, Py, PyResult, Python}; 3 | 4 | use crate::sorts; 5 | 6 | macro_rules! 
define_sort_intervals_numpy { 7 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 8 | #[pyfunction] 9 | #[pyo3(signature = (chrs, starts, ends, sort_reverse_direction = None))] 10 | #[allow(non_snake_case)] 11 | pub fn $fname( 12 | chrs: PyReadonlyArray1<$chr_ty>, 13 | starts: PyReadonlyArray1<$pos_ty>, 14 | ends: PyReadonlyArray1<$pos_ty>, 15 | sort_reverse_direction: Option>, 16 | py: Python<'_>, 17 | ) -> PyResult>> { 18 | let idx = sorts::sort_order_idx( 19 | chrs.as_slice()?, 20 | starts.as_slice()?, 21 | ends.as_slice()?, 22 | match &sort_reverse_direction { 23 | Some(arr) => Some(arr.as_slice()?), 24 | None => None, 25 | }, 26 | ); 27 | Ok(idx.into_pyarray(py).to_owned().into()) 28 | } 29 | }; 30 | } 31 | 32 | macro_rules! define_sort_groups_numpy { 33 | ($fname:ident, $chr_ty:ty) => { 34 | #[pyfunction] 35 | #[pyo3(signature = (chrs))] 36 | #[allow(non_snake_case)] 37 | pub fn $fname( 38 | chrs: PyReadonlyArray1<$chr_ty>, 39 | py: Python<'_>, 40 | ) -> PyResult>> { 41 | let idx = sorts::build_sorted_groups( 42 | chrs.as_slice()?, 43 | ); 44 | Ok(idx.into_pyarray(py).to_owned().into()) 45 | } 46 | }; 47 | } 48 | 49 | define_sort_intervals_numpy!(sort_intervals_numpy_u64_i64, u64, i64); 50 | define_sort_intervals_numpy!(sort_intervals_numpy_u32_i64, u32, i64); 51 | define_sort_intervals_numpy!(sort_intervals_numpy_u32_i32, u32, i32); 52 | define_sort_intervals_numpy!(sort_intervals_numpy_u32_i16, u32, i16); 53 | define_sort_intervals_numpy!(sort_intervals_numpy_u16_i64, u16, i64); 54 | define_sort_intervals_numpy!(sort_intervals_numpy_u16_i32, u16, i32); 55 | define_sort_intervals_numpy!(sort_intervals_numpy_u16_i16, u16, i16); 56 | define_sort_intervals_numpy!(sort_intervals_numpy_u8_i64, u8, i64); 57 | define_sort_intervals_numpy!(sort_intervals_numpy_u8_i32, u8, i32); 58 | define_sort_intervals_numpy!(sort_intervals_numpy_u8_i16, u8, i16); 59 | 60 | define_sort_groups_numpy!(sort_groups_numpy_u64, u64); 61 | define_sort_groups_numpy!(sort_groups_numpy_u32, 
u32); 62 | define_sort_groups_numpy!(sort_groups_numpy_u16, u16); 63 | define_sort_groups_numpy!(sort_groups_numpy_u8, u8); -------------------------------------------------------------------------------- /src/split.rs: -------------------------------------------------------------------------------- 1 | use crate::{ruranges_structs::{GroupType, PositionType}, sorts}; 2 | 3 | pub fn sweep_line_split( 4 | chrs: &[G], 5 | starts: &[T], 6 | ends: &[T], 7 | slack: T, 8 | between: bool, 9 | ) -> (Vec, Vec, Vec) { 10 | let events = sorts::build_sorted_events_single_collection(chrs, starts, ends, slack); 11 | 12 | // These will hold the output arrays: each emitted subinterval’s 13 | // (original_idx, start, end). 14 | let mut idxs_out = Vec::new(); 15 | let mut starts_out = Vec::new(); 16 | let mut ends_out = Vec::new(); 17 | 18 | // Edge case: no intervals 19 | if events.is_empty() { 20 | return (idxs_out, starts_out, ends_out); 21 | } 22 | 23 | // State for the sweep line 24 | let mut current_chr = events[0].chr; 25 | // We initialize coverage to 0, then we will “process” each event, 26 | // but we need a “last_pos” to track from where we last emitted. 27 | let mut active_count: u32 = 0; 28 | let mut last_pos = events[0].pos; 29 | let mut last_idx = events[0].idx; // you can store whichever index you like 30 | 31 | // Decide whether coverage is “on” at the very first position: 32 | // If the first event is a start, coverage goes from 0 → 1 at that point. 33 | if events[0].is_start { 34 | active_count = 1; 35 | } 36 | 37 | // We iterate from the *second* event onward. 38 | // At each new event, we emit from last_pos → e.pos if either coverage was > 0 or `between = true`. 39 | for e_i in 1..events.len() { 40 | let e = &events[e_i]; 41 | 42 | // If chromosome changes, we “jump” to a new chromosome 43 | // and do *not* produce an interval bridging old->new. 
44 | if e.chr != current_chr { 45 | // reset 46 | current_chr = e.chr; 47 | active_count = if e.is_start { 1 } else { 0 }; 48 | last_pos = e.pos; 49 | last_idx = e.idx; 50 | continue; 51 | } 52 | 53 | // same chromosome => we may emit from last_pos..e.pos if it's > 0 length 54 | // and either coverage>0 or we want the gap (between = true). 55 | if e.pos > last_pos { 56 | // If we were in coverage or want gaps, emit the subinterval. 57 | if active_count > 0 || between { 58 | idxs_out.push(last_idx); 59 | starts_out.push(last_pos); 60 | ends_out.push(e.pos); 61 | } 62 | last_pos = e.pos; 63 | last_idx = e.idx; // you might prefer to keep the same idx as “first covering interval” 64 | } 65 | 66 | // Now handle the event itself (this flips coverage up or down). 67 | if e.is_start { 68 | active_count += 1; 69 | } else { 70 | // is an end 71 | if active_count > 0 { 72 | active_count -= 1; 73 | } 74 | } 75 | } 76 | 77 | (idxs_out, starts_out, ends_out) 78 | } 79 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/genome_bounds_numpy.rs: -------------------------------------------------------------------------------- 1 | 2 | use pyo3::{exceptions::PyValueError, prelude::*}; 3 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 4 | use pyo3::{pyfunction, Py, PyResult, Python}; 5 | 6 | use crate::outside_bounds::outside_bounds; 7 | 8 | macro_rules! 
define_genome_bounds_numpy { 9 | ($fname:ident, $grp_ty:ty, $pos_ty:ty) => { 10 | #[pyfunction] 11 | #[pyo3(signature=( 12 | groups, 13 | starts, 14 | ends, 15 | chrom_lengths, // <-- single vector, same length as rows 16 | clip = false, 17 | only_right = false 18 | ))] 19 | #[allow(non_snake_case)] 20 | pub fn $fname( 21 | groups: PyReadonlyArray1<$grp_ty>, 22 | starts: PyReadonlyArray1<$pos_ty>, 23 | ends: PyReadonlyArray1<$pos_ty>, 24 | chrom_lengths: PyReadonlyArray1<$pos_ty>, 25 | clip: bool, 26 | only_right: bool, 27 | py: Python<'_>, 28 | ) -> PyResult<( 29 | Py>, // kept identical return signature 30 | Py>, 31 | Py>, 32 | )> { 33 | use pyo3::exceptions::PyValueError; 34 | 35 | // Fast length consistency check while we still hold the gil. 36 | let n = starts.len()?; 37 | if ends.len()? != n || groups.len()? != n || chrom_lengths.len()? != n { 38 | return Err(PyValueError::new_err( 39 | "`groups`, `starts`, `ends`, and `chrom_lengths` must all have the same length", 40 | )); 41 | } 42 | 43 | let (idx, new_starts, new_ends) = outside_bounds( 44 | groups.as_slice()?, 45 | starts.as_slice()?, 46 | ends.as_slice()?, 47 | chrom_lengths.as_slice()?, 48 | clip, 49 | only_right, 50 | ) 51 | .map_err(PyValueError::new_err)?; 52 | 53 | // Convert the three Vecs back to NumPy arrays. 
54 | Ok(( 55 | idx.into_pyarray(py).to_owned().into(), 56 | new_starts.into_pyarray(py).to_owned().into(), 57 | new_ends.into_pyarray(py).to_owned().into(), 58 | )) 59 | } 60 | }; 61 | } 62 | 63 | // ── concrete instantiations ──────────────────────────────────────────── 64 | define_genome_bounds_numpy!(genome_bounds_numpy_u64_i64, u64, i64); 65 | define_genome_bounds_numpy!(genome_bounds_numpy_u32_i64, u32, i64); 66 | define_genome_bounds_numpy!(genome_bounds_numpy_u32_i32, u32, i32); 67 | define_genome_bounds_numpy!(genome_bounds_numpy_u32_i16, u32, i16); 68 | define_genome_bounds_numpy!(genome_bounds_numpy_u16_i64, u16, i64); 69 | define_genome_bounds_numpy!(genome_bounds_numpy_u16_i32, u16, i32); 70 | define_genome_bounds_numpy!(genome_bounds_numpy_u16_i16, u16, i16); 71 | define_genome_bounds_numpy!(genome_bounds_numpy_u8_i64, u8, i64); 72 | define_genome_bounds_numpy!(genome_bounds_numpy_u8_i32, u8, i32); 73 | define_genome_bounds_numpy!(genome_bounds_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/complement_numpy.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use numpy::{IntoPyArray, PyReadonlyArray1, PyArray1}; 3 | use rustc_hash::FxHashMap; 4 | 5 | use crate::complement_single::sweep_line_complement; 6 | 7 | macro_rules! 
define_complement_numpy { 8 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 9 | #[pyfunction] 10 | #[pyo3(signature = ( 11 | groups, 12 | starts, 13 | ends, 14 | chrom_len_ids, 15 | chrom_lens, 16 | slack = 0, 17 | include_first_interval = false 18 | ))] 19 | #[allow(non_snake_case)] 20 | pub fn $fname( 21 | py: Python<'_>, 22 | groups: PyReadonlyArray1<$chr_ty>, 23 | starts: PyReadonlyArray1<$pos_ty>, 24 | ends: PyReadonlyArray1<$pos_ty>, 25 | chrom_len_ids: PyReadonlyArray1<$chr_ty>, 26 | chrom_lens: PyReadonlyArray1<$pos_ty>, 27 | slack: $pos_ty, 28 | include_first_interval: bool, 29 | ) -> PyResult<( 30 | Py>, 31 | Py>, 32 | Py>, 33 | Py>, 34 | )> { 35 | let keys = chrom_len_ids.as_slice()?; 36 | let vals = chrom_lens.as_slice()?; 37 | if keys.len() != vals.len() { 38 | return Err(pyo3::exceptions::PyValueError::new_err( 39 | "chrom_len_ids and chrom_lens must have identical length", 40 | )); 41 | } 42 | 43 | let mut lens_map: FxHashMap<$chr_ty, $pos_ty> = 44 | FxHashMap::with_capacity_and_hasher(keys.len(), Default::default()); 45 | for (&k, &v) in keys.iter().zip(vals.iter()) { 46 | lens_map.insert(k, v); 47 | } 48 | 49 | let (out_chrs, out_starts, out_ends, out_idx) = sweep_line_complement( 50 | groups.as_slice()?, 51 | starts.as_slice()?, 52 | ends.as_slice()?, 53 | slack, 54 | &lens_map, 55 | include_first_interval, 56 | ); 57 | 58 | Ok(( 59 | out_chrs .into_pyarray(py).to_owned().into(), 60 | out_starts.into_pyarray(py).to_owned().into(), 61 | out_ends .into_pyarray(py).to_owned().into(), 62 | out_idx .into_pyarray(py).to_owned().into(), 63 | )) 64 | } 65 | }; 66 | } 67 | 68 | // ── concrete instantiations ─────────────────────────────────────────── 69 | define_complement_numpy!(complement_numpy_u64_i64, u64, i64); 70 | define_complement_numpy!(complement_numpy_u32_i64, u32, i64); 71 | define_complement_numpy!(complement_numpy_u32_i32, u32, i32); 72 | define_complement_numpy!(complement_numpy_u32_i16, u32, i16); 73 | 
define_complement_numpy!(complement_numpy_u16_i64, u16, i64); 74 | define_complement_numpy!(complement_numpy_u16_i32, u16, i32); 75 | define_complement_numpy!(complement_numpy_u16_i16, u16, i16); 76 | define_complement_numpy!(complement_numpy_u8_i64, u8, i64); 77 | define_complement_numpy!(complement_numpy_u8_i32, u8, i32); 78 | define_complement_numpy!(complement_numpy_u8_i16, u8, i16); -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/overlaps_simple_numpy.rs: -------------------------------------------------------------------------------- 1 | 2 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 3 | use pyo3::{pyfunction, Py, PyResult, Python}; 4 | use crate::ruranges_structs::OverlapType; 5 | 6 | use crate::overlaps_simple::sweep_line_overlaps; // adjust module path if needed 7 | 8 | macro_rules! define_sweepline_numpy { 9 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 10 | #[pyfunction] 11 | #[allow(non_snake_case)] 12 | pub fn $fname( 13 | py: Python, 14 | chrs: PyReadonlyArray1<$chr_ty>, 15 | starts: PyReadonlyArray1<$pos_ty>, 16 | ends: PyReadonlyArray1<$pos_ty>, 17 | chrs2: PyReadonlyArray1<$chr_ty>, 18 | starts2: PyReadonlyArray1<$pos_ty>, 19 | ends2: PyReadonlyArray1<$pos_ty>, 20 | slack: $pos_ty, 21 | overlap_type: &str, 22 | contained: bool, 23 | no_checks: bool, 24 | ) -> PyResult<(Py>, Py>)> { 25 | let chrs_slice = chrs.as_slice()?; 26 | let starts_slice = starts.as_slice()?; 27 | let ends_slice = ends.as_slice()?; 28 | 29 | let chrs_slice2 = chrs2.as_slice()?; 30 | let starts_slice2 = starts2.as_slice()?; 31 | let ends_slice2 = ends2.as_slice()?; 32 | 33 | // Optional sanity checks (cheap, helps catch Python-side misuse) 34 | debug_assert!(chrs_slice.len() == starts_slice.len() && starts_slice.len() == ends_slice.len()); 35 | debug_assert!(chrs_slice2.len() == starts_slice2.len() && starts_slice2.len() == ends_slice2.len()); 36 | 37 | let (idx1_usize, idx2_usize) = sweep_line_overlaps( 
38 | chrs_slice, 39 | starts_slice, 40 | ends_slice, 41 | chrs_slice2, 42 | starts_slice2, 43 | ends_slice2, 44 | slack, 45 | overlap_type, 46 | contained, 47 | no_checks, 48 | ); 49 | 50 | // Convert to u32 for numpy (matching your existing API) 51 | debug_assert!(idx1_usize.iter().all(|&x| x <= u32::MAX as usize)); 52 | debug_assert!(idx2_usize.iter().all(|&x| x <= u32::MAX as usize)); 53 | 54 | let idx1: Vec = idx1_usize.into_iter().map(|x| x as u32).collect(); 55 | let idx2: Vec = idx2_usize.into_iter().map(|x| x as u32).collect(); 56 | 57 | Ok(( 58 | idx1.into_pyarray(py).to_owned().into(), 59 | idx2.into_pyarray(py).to_owned().into(), 60 | )) 61 | } 62 | }; 63 | } 64 | 65 | // Same set of instantiations as your chromsweep wrapper 66 | define_sweepline_numpy!(sweepline_numpy_u64_i64, u64, i64); 67 | define_sweepline_numpy!(sweepline_numpy_u32_i64, u32, i64); 68 | define_sweepline_numpy!(sweepline_numpy_u32_i32, u32, i32); 69 | define_sweepline_numpy!(sweepline_numpy_u32_i16, u32, i16); 70 | define_sweepline_numpy!(sweepline_numpy_u16_i64, u16, i64); 71 | define_sweepline_numpy!(sweepline_numpy_u16_i32, u16, i32); 72 | define_sweepline_numpy!(sweepline_numpy_u16_i16, u16, i16); 73 | define_sweepline_numpy!(sweepline_numpy_u8_i64, u8, i64); 74 | define_sweepline_numpy!(sweepline_numpy_u8_i32, u8, i32); 75 | define_sweepline_numpy!(sweepline_numpy_u8_i16, u8, i16); 76 | 77 | -------------------------------------------------------------------------------- /src/complement_single.rs: -------------------------------------------------------------------------------- 1 | use rustc_hash::FxHashMap; 2 | 3 | use crate::{ruranges_structs::{GroupType, PositionType}, sorts}; 4 | 5 | pub fn sweep_line_complement( 6 | chrs: &[G], 7 | starts: &[T], 8 | ends: &[T], 9 | slack: T, 10 | chrom_lens: &FxHashMap, 11 | include_first_interval: bool, // <-- new parameter 12 | ) -> (Vec, Vec, Vec, Vec) { 13 | let mut out_chrs = Vec::with_capacity(chrs.len()); 14 | let mut out_starts = 
Vec::with_capacity(chrs.len()); 15 | let mut out_ends = Vec::with_capacity(chrs.len()); 16 | let mut out_idxs = Vec::with_capacity(chrs.len()); 17 | 18 | // Early return if no input 19 | if chrs.is_empty() { 20 | return (out_chrs, out_starts, out_ends, out_idxs); 21 | } 22 | 23 | // Build your events array, sorted by chr and pos 24 | let events = sorts::build_sorted_events_single_collection(chrs, starts, ends, slack); 25 | 26 | // Initialize 27 | let mut current_chr = events[0].chr; 28 | let mut active_count = 0_i64; 29 | // Whether we start "in a hole" (i.e., complement) depends on `include_first_interval` 30 | let mut in_complement = include_first_interval; 31 | // Start the first hole at position 0 of the chromosome (only matters if `in_complement == true`) 32 | let mut current_start = T::zero(); 33 | let mut current_index = 0_u32; 34 | 35 | for e in events { 36 | // If we hit a new chromosome 37 | if e.chr != current_chr { 38 | // If we ended the previous chromosome still in a hole, 39 | // optionally close it out at the chromosome’s end 40 | if let Some(chlen) = chrom_lens.get(&current_chr) { 41 | if in_complement { 42 | out_chrs.push(current_chr); 43 | out_starts.push(current_start); 44 | out_ends.push(*chlen); 45 | out_idxs.push(current_index); 46 | } 47 | } 48 | 49 | // Reset for new chromosome 50 | current_chr = e.chr; 51 | active_count = 0; 52 | in_complement = include_first_interval; 53 | current_start = T::zero(); 54 | current_index = e.idx; 55 | } 56 | 57 | // Process this event 58 | if e.is_start { 59 | // coverage X → X + 1 60 | active_count += 1; 61 | // If coverage was zero, we just ended a hole 62 | if active_count == 1 && in_complement && current_start != e.pos { 63 | // That hole ends at e.pos 64 | out_chrs.push(current_chr); 65 | out_starts.push(current_start); 66 | out_ends.push(e.pos); 67 | out_idxs.push(current_index); 68 | 69 | // We're no longer in a hole 70 | in_complement = false; 71 | } 72 | } else { 73 | // coverage X → X - 1 74 | 
active_count -= 1; 75 | // If coverage has just dropped back to zero, 76 | // we start a new hole here 77 | if active_count == 0 { 78 | in_complement = true; 79 | current_start = e.pos; 80 | } 81 | } 82 | } 83 | 84 | // End of all events: if we finished in a hole and have chromosome lengths 85 | if let Some(chlen) = chrom_lens.get(&current_chr) { 86 | if in_complement { 87 | out_chrs.push(current_chr); 88 | out_starts.push(current_start); 89 | out_ends.push(*chlen); 90 | out_idxs.push(current_index); 91 | } 92 | } 93 | 94 | (out_chrs, out_starts, out_ends, out_idxs) 95 | } 96 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/map_to_global_numpy.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 2 | use pyo3::prelude::*; 3 | 4 | use crate::map_to_global::map_to_global; // core algorithm 5 | 6 | /* ======================================================================= 7 | Macro: expose map_to_global_() functions to Python/NumPy 8 | ======================================================================= 9 | 10 | `_dispatch_binary("map_to_global_numpy", …)` sends the arguments in 11 | this order: 12 | 13 | (groups starts ends) (groups2 starts2 ends2) 14 | └ left table = exons ┘ └ right table = queries ┘ 15 | extra: ex_chr_code ex_genome_start ex_genome_end ex_fwd q_fwd 16 | ------------------------------------------------------------------------ */ 17 | macro_rules! 
define_map_to_global_numpy { 18 | ($fname:ident, $code_ty:ty, $pos_ty:ty) => { 19 | #[pyfunction] 20 | #[allow(non_snake_case)] 21 | pub fn $fname<'py>( 22 | py: Python<'py>, 23 | /* ---------- exon (annotation) table — left side ---------- */ 24 | ex_tx: PyReadonlyArray1<$code_ty>, 25 | ex_local_start: PyReadonlyArray1<$pos_ty>, 26 | ex_local_end: PyReadonlyArray1<$pos_ty>, 27 | /* ---------- query (local) table — right side ------------ */ 28 | q_tx: PyReadonlyArray1<$code_ty>, 29 | q_start: PyReadonlyArray1<$pos_ty>, 30 | q_end: PyReadonlyArray1<$pos_ty>, 31 | /* ---------- extra parameters in Rust order -------------- */ 32 | ex_chr_code: PyReadonlyArray1<$code_ty>, 33 | ex_genome_start: PyReadonlyArray1<$pos_ty>, 34 | ex_genome_end: PyReadonlyArray1<$pos_ty>, 35 | ex_fwd: PyReadonlyArray1, 36 | q_fwd: PyReadonlyArray1, 37 | ) -> PyResult<( 38 | Py>, // indices back into query table 39 | Py>, // genomic start 40 | Py>, // genomic end 41 | Py>, // strand (+ = True) 42 | )> { 43 | let (idx, g_start, g_end, strand) = map_to_global( 44 | /* exons first (left triple) */ 45 | ex_tx.as_slice()?, 46 | ex_local_start.as_slice()?, 47 | ex_local_end.as_slice()?, 48 | /* queries second (right triple) */ 49 | q_tx.as_slice()?, 50 | q_start.as_slice()?, 51 | q_end.as_slice()?, 52 | /* extras in declared order */ 53 | ex_chr_code.as_slice()?, 54 | ex_genome_start.as_slice()?, 55 | ex_genome_end.as_slice()?, 56 | ex_fwd.as_slice()?, 57 | q_fwd.as_slice()?, 58 | ); 59 | 60 | Ok(( 61 | idx .into_pyarray(py).to_owned().into(), 62 | g_start .into_pyarray(py).to_owned().into(), 63 | g_end .into_pyarray(py).to_owned().into(), 64 | strand .into_pyarray(py).to_owned().into(), 65 | )) 66 | } 67 | }; 68 | } 69 | 70 | /* --------------------------------------------------------------------- 71 | Concrete instantiations – extend as required 72 | ------------------------------------------------------------------- */ 73 | define_map_to_global_numpy!(map_to_global_numpy_u64_i64, u64, i64); 74 
| define_map_to_global_numpy!(map_to_global_numpy_u32_i64, u32, i64); 75 | define_map_to_global_numpy!(map_to_global_numpy_u32_i32, u32, i32); 76 | define_map_to_global_numpy!(map_to_global_numpy_u32_i16, u32, i16); 77 | define_map_to_global_numpy!(map_to_global_numpy_u16_i64, u16, i64); 78 | define_map_to_global_numpy!(map_to_global_numpy_u16_i32, u16, i32); 79 | define_map_to_global_numpy!(map_to_global_numpy_u16_i16, u16, i16); 80 | define_map_to_global_numpy!(map_to_global_numpy_u8_i64, u8, i64); 81 | define_map_to_global_numpy!(map_to_global_numpy_u8_i32, u8, i32); 82 | define_map_to_global_numpy!(map_to_global_numpy_u8_i16, u8, i16); 83 | -------------------------------------------------------------------------------- /src/ruranges_structs.rs: -------------------------------------------------------------------------------- 1 | use num_traits::{PrimInt, Signed, ToPrimitive, Zero}; 2 | use numpy::Element; // You'll need the num-traits crate 3 | use std::{hash::Hash, str::FromStr}; 4 | 5 | pub trait PositionType: PrimInt + Signed + Hash + Copy + radsort::Key + Element + Copy + PartialOrd + ToPrimitive + Zero + std::fmt::Display + std::fmt::Debug {} 6 | impl PositionType for T where T: PrimInt + Signed + Hash + Copy + radsort::Key + Element + Copy + PartialOrd + ToPrimitive + Zero + std::fmt::Display + std::fmt::Debug {} 7 | pub trait GroupType: PrimInt + Hash + Copy + radsort::Key + Zero + std::fmt::Debug {} 8 | impl GroupType for T where T: PrimInt + Hash + Copy + radsort::Key + Zero + std::fmt::Debug {} 9 | 10 | pub struct GenomicData { 11 | pub chroms: Vec, 12 | pub starts: Vec

, 13 | pub ends: Vec

, 14 | pub strands: Option>, 15 | } 16 | 17 | #[derive(Debug, Clone)] 18 | pub struct MinInterval { 19 | pub start: T, 20 | pub end: T, 21 | pub idx: u32, 22 | } 23 | 24 | #[derive(Debug, Clone)] 25 | pub struct StrandInterval { 26 | pub start: T, 27 | pub end: T, 28 | pub idx: u32, 29 | pub fwd: bool, 30 | } 31 | 32 | #[derive(Debug, Clone)] 33 | pub struct Interval { 34 | pub group: C, 35 | pub start: T, 36 | pub end: T, 37 | pub idx: u32, 38 | } 39 | 40 | #[derive(Debug, Clone, Hash)] 41 | pub struct EventUsize { 42 | pub chr: i64, 43 | pub pos: i64, 44 | pub is_start: bool, 45 | pub first_set: bool, 46 | pub idx: usize, 47 | } 48 | /// An "event" in the sweep line: 49 | /// - `pos`: the coordinate (start or end of an interval) 50 | /// - `is_start`: true if it's a start event, false if it's an end event 51 | /// - `set_id`: which set does this interval belong to? (1 or 2) 52 | /// - `idx`: the interval's ID/index 53 | #[derive(Debug, Clone, Hash)] 54 | pub struct Event { 55 | pub chr: C, 56 | pub pos: T, 57 | pub is_start: bool, 58 | pub first_set: bool, 59 | pub idx: u32, 60 | } 61 | 62 | #[derive(Debug, Clone, Hash)] 63 | pub struct MaxEvent { 64 | pub chr: C, 65 | pub pos: T, 66 | pub start: T, 67 | pub end: T, 68 | pub is_start: bool, 69 | pub first_set: bool, 70 | pub idx: u32, 71 | } 72 | 73 | #[derive(Debug, Clone, Hash)] 74 | pub struct MinEvent { 75 | pub chr: C, 76 | pub pos: T, 77 | pub idx: u32, 78 | } 79 | 80 | #[derive(Debug, Clone, Hash)] 81 | pub struct GroupStruct { 82 | pub chr: C, 83 | pub idx: u32, 84 | } 85 | 86 | #[derive(Debug, Clone, Hash)] 87 | pub struct OverlapPair { 88 | pub idx: u32, 89 | pub idx2: u32, 90 | } 91 | 92 | #[derive(Debug, Clone, Hash, Copy)] 93 | pub struct Nearest { 94 | pub distance: T, 95 | pub idx: u32, 96 | pub idx2: u32, 97 | } 98 | 99 | #[derive(Debug, Clone)] 100 | pub struct SplicedSubsequenceInterval { 101 | /// Encoded chromosome (or chrom+strand+gene) ID. 
102 | pub chr: G, 103 | 104 | /// The genomic start coordinate. 105 | pub start: T, 106 | 107 | /// The genomic end coordinate. 108 | pub end: T, 109 | 110 | pub idx: u32, 111 | 112 | pub forward_strand: bool, 113 | 114 | /// Temporary: length = (end - start). 115 | pub temp_length: T, 116 | 117 | /// Temporary: cumulative sum of lengths within this chrom group. 118 | pub temp_cumsum: T, 119 | } 120 | 121 | /// A simple struct to hold each interval's data for "subsequence" logic. 122 | #[derive(Clone)] 123 | pub struct SubsequenceInterval { 124 | pub group_id: i64, // grouping ID 125 | pub start: i64, // genomic start 126 | pub end: i64, // genomic end 127 | pub idx: i64, // e.g. row index or something else 128 | pub forward_strand: bool, // true => + strand, false => - strand 129 | } 130 | 131 | pub struct GenericEvent { 132 | pub chr: C, 133 | pub pos: T, 134 | pub is_start: bool, 135 | pub first_set: bool, 136 | pub idx: u32, 137 | } 138 | 139 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 140 | pub enum OverlapType { 141 | First, 142 | Last, 143 | All, 144 | } 145 | 146 | impl FromStr for OverlapType { 147 | type Err = &'static str; 148 | 149 | fn from_str(s: &str) -> Result { 150 | match s.to_lowercase().as_str() { 151 | "all" => Ok(OverlapType::All), 152 | "first" => Ok(OverlapType::First), 153 | "last" => Ok(OverlapType::Last), 154 | _ => Err("Invalid direction string"), 155 | } 156 | } 157 | } 158 | 159 | 160 | pub struct SplicedRecord { 161 | pub idx: u32, 162 | pub start: T, 163 | pub end: T, 164 | pub strand: bool, 165 | } 166 | -------------------------------------------------------------------------------- /src/map_to_global.rs: -------------------------------------------------------------------------------- 1 | 2 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 3 | use pyo3::prelude::*; 4 | use radsort::sort_by_key; 5 | 6 | use crate::ruranges_structs::{GroupType, PositionType, StrandInterval}; 7 | 8 | 9 | #[allow(clippy::too_many_arguments)] 
10 | pub fn map_to_global( 11 | /* left table (exons / annotation) */ 12 | ex_tx: &[G], 13 | ex_local_start: &[T], 14 | ex_local_end: &[T], 15 | 16 | /* right table (queries / local intervals) */ 17 | q_tx: &[G], 18 | q_start: &[T], 19 | q_end: &[T], 20 | 21 | /* extra, still in strict order */ 22 | ex_chr_code: &[G], 23 | ex_genome_start: &[T], 24 | ex_genome_end: &[T], 25 | ex_fwd: &[bool], 26 | q_fwd: &[bool], 27 | ) -> (Vec, Vec, Vec, Vec) { 28 | // ------------------- sanity checks (debug-only) ------------------------ 29 | debug_assert_eq!(ex_tx.len(), ex_local_start.len()); 30 | debug_assert_eq!(ex_tx.len(), ex_local_end.len()); 31 | debug_assert_eq!(ex_tx.len(), ex_chr_code.len()); 32 | debug_assert_eq!(ex_tx.len(), ex_genome_start.len()); 33 | debug_assert_eq!(ex_tx.len(), ex_genome_end.len()); 34 | debug_assert_eq!(ex_tx.len(), ex_fwd.len()); 35 | 36 | debug_assert_eq!(q_tx.len(), q_start.len()); 37 | debug_assert_eq!(q_tx.len(), q_end.len()); 38 | debug_assert_eq!(q_tx.len(), q_fwd.len()); 39 | 40 | // ------------------- output buffers ----------------------------------- 41 | let mut results = Vec::new(); 42 | 43 | // ------------------- two-pointer sweep --------------------------------- 44 | let mut ei = 0usize; // exon pointer 45 | let mut qi = 0usize; // query pointer 46 | let ex_n = ex_tx.len(); 47 | let q_n = q_tx.len(); 48 | 49 | while qi < q_n { 50 | let tx_code = q_tx[qi]; 51 | 52 | // move exon pointer to this transcript (or beyond) 53 | while ei < ex_n && ex_tx[ei] < tx_code { 54 | ei += 1; 55 | } 56 | 57 | // if no exons for this transcript, skip its queries 58 | if ei >= ex_n || ex_tx[ei] != tx_code { 59 | while qi < q_n && q_tx[qi] == tx_code { 60 | qi += 1; 61 | } 62 | continue; 63 | } 64 | 65 | // ------------------------------------------------------------ 66 | // process all queries with transcript == tx_code 67 | // ------------------------------------------------------------ 68 | let mut ej = ei; // exon cursor inside tx 69 | 70 | 
while qi < q_n && q_tx[qi] == tx_code { 71 | let mut l = q_start[qi]; 72 | let lend = q_end[qi]; 73 | let idx = qi as u32; // row number into query table 74 | let local_f = q_fwd[qi]; 75 | 76 | // advance exon cursor until its end is after l 77 | while ej < ex_n && ex_tx[ej] == tx_code && ex_local_end[ej] <= l { 78 | ej += 1; 79 | } 80 | 81 | let mut ek = ej; 82 | while l < lend && ek < ex_n && ex_tx[ek] == tx_code { 83 | let el_start = ex_local_start[ek]; 84 | let el_end = ex_local_end[ek]; 85 | 86 | if l >= el_end { 87 | ek += 1; 88 | continue; 89 | } 90 | 91 | // clip to current exon 92 | let seg_end_local = if lend < el_end { lend } else { el_end }; 93 | 94 | // translate to genome 95 | let offset1 = l - el_start; 96 | let offset2 = seg_end_local - el_start; 97 | 98 | let (g_start, g_end) = if ex_fwd[ek] { 99 | ( 100 | ex_genome_start[ek] + offset1, 101 | ex_genome_start[ek] + offset2, 102 | ) 103 | } else { 104 | ( 105 | ex_genome_end[ek] - offset2, 106 | ex_genome_end[ek] - offset1, 107 | ) 108 | }; 109 | 110 | // push result 111 | results.push(StrandInterval {start: g_start, end: g_end, idx: idx, fwd: local_f == ex_fwd[ek]}); 112 | 113 | // advance inside query 114 | l = seg_end_local; 115 | if l >= lend { 116 | break; 117 | } 118 | ek += 1; 119 | } 120 | 121 | qi += 1; // next query row 122 | } 123 | 124 | // skip remaining exons of this transcript 125 | while ei < ex_n && ex_tx[ei] == tx_code { 126 | ei += 1; 127 | } 128 | } 129 | 130 | sort_by_key(&mut results, |i| i.idx); 131 | 132 | let mut out_idxs = Vec::with_capacity(results.len()); 133 | let mut out_starts = Vec::with_capacity(results.len()); 134 | let mut out_ends = Vec::with_capacity(results.len()); 135 | let mut out_strands = Vec::with_capacity(results.len()); 136 | 137 | for rec in results { 138 | out_idxs.push(rec.idx); 139 | out_starts.push(rec.start); 140 | out_ends.push(rec.end); 141 | out_strands.push(rec.fwd); 142 | } 143 | 144 | (out_idxs, out_starts, out_ends, out_strands) 145 | } 
-------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.7.8 2 | # To update, run 3 | # 4 | # maturin generate-ci github 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | - master 13 | tags: 14 | - '*' 15 | pull_request: 16 | workflow_dispatch: 17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | linux: 23 | runs-on: ${{ matrix.platform.runner }} 24 | strategy: 25 | matrix: 26 | platform: 27 | - runner: ubuntu-22.04 28 | target: x86_64 29 | - runner: ubuntu-22.04 30 | target: x86 31 | - runner: ubuntu-22.04 32 | target: aarch64 33 | - runner: ubuntu-22.04 34 | target: armv7 35 | - runner: ubuntu-22.04 36 | target: s390x 37 | - runner: ubuntu-22.04 38 | target: ppc64le 39 | steps: 40 | - uses: actions/checkout@v4 41 | - uses: actions/setup-python@v5 42 | with: 43 | python-version: 3.x 44 | - name: Build wheels 45 | uses: PyO3/maturin-action@v1 46 | with: 47 | target: ${{ matrix.platform.target }} 48 | args: --release --out dist --find-interpreter 49 | sccache: 'true' 50 | manylinux: auto 51 | - name: Upload wheels 52 | uses: actions/upload-artifact@v4 53 | with: 54 | name: wheels-linux-${{ matrix.platform.target }} 55 | path: dist 56 | 57 | musllinux: 58 | runs-on: ${{ matrix.platform.runner }} 59 | strategy: 60 | matrix: 61 | platform: 62 | - runner: ubuntu-22.04 63 | target: x86_64 64 | - runner: ubuntu-22.04 65 | target: x86 66 | - runner: ubuntu-22.04 67 | target: aarch64 68 | - runner: ubuntu-22.04 69 | target: armv7 70 | steps: 71 | - uses: actions/checkout@v4 72 | - uses: actions/setup-python@v5 73 | with: 74 | python-version: 3.x 75 | - name: Build wheels 76 | uses: PyO3/maturin-action@v1 77 | with: 78 | target: ${{ matrix.platform.target }} 79 | args: --release --out dist --find-interpreter 80 | sccache: 'true' 81 | manylinux: musllinux_1_2 82 | - 
name: Upload wheels 83 | uses: actions/upload-artifact@v4 84 | with: 85 | name: wheels-musllinux-${{ matrix.platform.target }} 86 | path: dist 87 | 88 | windows: 89 | runs-on: ${{ matrix.platform.runner }} 90 | strategy: 91 | matrix: 92 | platform: 93 | - runner: windows-latest 94 | target: x64 95 | - runner: windows-latest 96 | target: x86 97 | steps: 98 | - uses: actions/checkout@v4 99 | - uses: actions/setup-python@v5 100 | with: 101 | python-version: 3.x 102 | architecture: ${{ matrix.platform.target }} 103 | - name: Build wheels 104 | uses: PyO3/maturin-action@v1 105 | with: 106 | target: ${{ matrix.platform.target }} 107 | args: --release --out dist --find-interpreter 108 | sccache: 'true' 109 | - name: Upload wheels 110 | uses: actions/upload-artifact@v4 111 | with: 112 | name: wheels-windows-${{ matrix.platform.target }} 113 | path: dist 114 | 115 | macos: 116 | runs-on: ${{ matrix.platform.runner }} 117 | strategy: 118 | matrix: 119 | platform: 120 | - runner: macos-13 121 | target: x86_64 122 | - runner: macos-14 123 | target: aarch64 124 | steps: 125 | - uses: actions/checkout@v4 126 | - uses: actions/setup-python@v5 127 | with: 128 | python-version: 3.x 129 | - name: Build wheels 130 | uses: PyO3/maturin-action@v1 131 | with: 132 | target: ${{ matrix.platform.target }} 133 | args: --release --out dist --find-interpreter 134 | sccache: 'true' 135 | - name: Upload wheels 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: wheels-macos-${{ matrix.platform.target }} 139 | path: dist 140 | 141 | sdist: 142 | runs-on: ubuntu-latest 143 | steps: 144 | - uses: actions/checkout@v4 145 | - name: Build sdist 146 | uses: PyO3/maturin-action@v1 147 | with: 148 | command: sdist 149 | args: --out dist 150 | - name: Upload sdist 151 | uses: actions/upload-artifact@v4 152 | with: 153 | name: wheels-sdist 154 | path: dist 155 | 156 | release: 157 | name: Release 158 | runs-on: ubuntu-latest 159 | if: ${{ startsWith(github.ref, 'refs/tags/') || 
github.event_name == 'workflow_dispatch' }} 160 | needs: [linux, musllinux, windows, macos, sdist] 161 | permissions: 162 | # Use to sign the release artifacts 163 | id-token: write 164 | # Used to upload release artifacts 165 | contents: write 166 | # Used to generate artifact attestation 167 | attestations: write 168 | steps: 169 | - uses: actions/download-artifact@v4 170 | - name: Generate artifact attestation 171 | uses: actions/attest-build-provenance@v1 172 | with: 173 | subject-path: 'wheels-*/*' 174 | - name: Publish to PyPI 175 | if: ${{ startsWith(github.ref, 'refs/tags/') }} 176 | uses: PyO3/maturin-action@v1 177 | env: 178 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 179 | with: 180 | command: upload 181 | args: --non-interactive --skip-existing wheels-*/* 182 | -------------------------------------------------------------------------------- /src/bindings/numpy_bindings/spliced_subsequence_numpy.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use numpy::{IntoPyArray, PyReadonlyArray1, PyArray1}; 3 | 4 | use crate::spliced_subsequence::{spliced_subseq, spliced_subseq_multi}; 5 | 6 | /// ------------------------------------------------------------------------- 7 | /// single-slice wrappers 8 | /// ------------------------------------------------------------------------- 9 | macro_rules! 
define_spliced_subsequence_numpy { 10 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 11 | #[pyfunction] 12 | #[pyo3(signature = ( 13 | chrs, 14 | starts, 15 | ends, 16 | strand_flags, 17 | start, 18 | end = None, 19 | force_plus_strand = false 20 | ))] 21 | #[allow(non_snake_case)] 22 | pub fn $fname( 23 | chrs: PyReadonlyArray1<$chr_ty>, 24 | starts: PyReadonlyArray1<$pos_ty>, 25 | ends: PyReadonlyArray1<$pos_ty>, 26 | strand_flags: PyReadonlyArray1, 27 | start: $pos_ty, 28 | end: Option<$pos_ty>, 29 | force_plus_strand: bool, 30 | py: Python<'_>, 31 | ) -> PyResult<( 32 | Py>, // indices 33 | Py>, // new starts 34 | Py>, // new ends 35 | Py>, // strand True='+', False='-' 36 | )> { 37 | let (idx, new_starts, new_ends, strands) = spliced_subseq( 38 | chrs.as_slice()?, 39 | starts.as_slice()?, 40 | ends.as_slice()?, 41 | strand_flags.as_slice()?, 42 | start, 43 | end, 44 | force_plus_strand, 45 | ); 46 | 47 | Ok(( 48 | idx .into_pyarray(py).to_owned().into(), 49 | new_starts .into_pyarray(py).to_owned().into(), 50 | new_ends .into_pyarray(py).to_owned().into(), 51 | strands .into_pyarray(py).to_owned().into(), 52 | )) 53 | } 54 | }; 55 | } 56 | 57 | // concrete instantiations 58 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u64_i64, u64, i64); 59 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u32_i64, u32, i64); 60 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u32_i32, u32, i32); 61 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u32_i16, u32, i16); 62 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u16_i64, u16, i64); 63 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u16_i32, u16, i32); 64 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u16_i16, u16, i16); 65 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u8_i64, u8, i64); 66 | define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u8_i32, u8, i32); 67 | 
define_spliced_subsequence_numpy!(spliced_subsequence_numpy_u8_i16, u8, i16); 68 | 69 | macro_rules! define_spliced_subsequence_multi_numpy { 70 | ($fname:ident, $chr_ty:ty, $pos_ty:ty) => { 71 | #[pyfunction] 72 | #[pyo3(signature = ( 73 | chrs, 74 | starts, 75 | ends, 76 | strand_flags, 77 | slice_starts, 78 | slice_ends, 79 | force_plus_strand = false 80 | ))] 81 | #[allow(non_snake_case)] 82 | pub fn $fname( 83 | chrs: PyReadonlyArray1<$chr_ty>, 84 | starts: PyReadonlyArray1<$pos_ty>, 85 | ends: PyReadonlyArray1<$pos_ty>, 86 | strand_flags: PyReadonlyArray1, 87 | slice_starts: PyReadonlyArray1<$pos_ty>, 88 | slice_ends: PyReadonlyArray1<$pos_ty>, 89 | force_plus_strand: bool, 90 | py: Python<'_>, 91 | ) -> PyResult<( 92 | Py>, 93 | Py>, 94 | Py>, 95 | Py>, 96 | )> { 97 | let ends_opt: Vec> = slice_ends 98 | .as_slice()? 99 | .iter() 100 | .map(|&v| Some(v)) 101 | .collect(); 102 | 103 | let (idx, new_starts, new_ends, strands) = spliced_subseq_multi( 104 | chrs.as_slice()?, 105 | starts.as_slice()?, 106 | ends.as_slice()?, 107 | strand_flags.as_slice()?, 108 | slice_starts.as_slice()?, 109 | ends_opt.as_slice(), 110 | force_plus_strand, 111 | ); 112 | 113 | Ok(( 114 | idx.into_pyarray(py).to_owned().into(), 115 | new_starts.into_pyarray(py).to_owned().into(), 116 | new_ends.into_pyarray(py).to_owned().into(), 117 | strands.into_pyarray(py).to_owned().into(), 118 | )) 119 | } 120 | }; 121 | } 122 | 123 | 124 | // concrete instantiations 125 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u64_i64, u64, i64); 126 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u32_i64, u32, i64); 127 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u32_i32, u32, i32); 128 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u32_i16, u32, i16); 129 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u16_i64, u16, i64); 130 | 
define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u16_i32, u16, i32); 131 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u16_i16, u16, i16); 132 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u8_i64, u8, i64); 133 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u8_i32, u8, i32); 134 | define_spliced_subsequence_multi_numpy!(spliced_subsequence_multi_numpy_u8_i16, u8, i16); 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ruranges - blazing-fast interval algebra for NumPy 2 | 3 | ruranges is a thin Python wrapper around a set of Rust kernels that implement common genomic / interval algorithms at native speed. All public functions accept and return plain NumPy arrays so you can drop the results straight into your existing Python data-science stack. 4 | 5 | --- 6 | 7 | ## Why ruranges? 8 | 9 | * Speed: heavy kernels in Rust compiled with --release. 10 | * Zero copy: results are numpy views whenever possible. 11 | * Flexible dtypes: unsigned int8/16/32/64 for group ids, signed ints for coordinates. The wrapper chooses the smallest safe dtype automatically. 12 | * Stateless: plain functions, no classes. 
13 | 14 | --- 15 | 16 | ## Installation 17 | 18 | ```bash 19 | pip install ruranges # PyPI 20 | # or 21 | pip install git+https://github.com/your-org/ruranges.git 22 | ``` 23 | 24 | --- 25 | 26 | ## Cheat sheet 27 | 28 | | Category | Function | What it does | 29 | | --------------------- | ------------------------------------------ | ----------------------------------------------- | 30 | | Overlap and proximity | overlaps | all overlapping pairs between two sets | 31 | | | nearest | k nearest intervals with optional strand filter | 32 | | | count\_overlaps | how many rows in B overlap each row in A | 33 | | Set algebra | subtract | A minus B | 34 | | | complement | gaps within chromosome bounds | 35 | | | merge, cluster, max\_disjoint | collapse or filter overlaps | 36 | | Utility | sort\_intervals, window, tile, extend, ... | assorted helpers | 37 | 38 | Below are the three most common calls: overlaps, nearest, subtract. 39 | 40 | --- 41 | 42 | ## 1. overlaps 43 | 44 | Simple example: 45 | 46 | ```python 47 | import pandas as pd 48 | import numpy as np 49 | from ruranges import overlaps 50 | 51 | df1 = pd.DataFrame({ 52 | "chr": ["chr1", "chr1", "chr2"], 53 | "strand": ["+", "+", "-"], 54 | "start": [1, 10, 30], 55 | "end": [5, 15, 35], 56 | }) 57 | 58 | df2 = pd.DataFrame({ 59 | "chr": ["chr1", "chr2", "chr2"], 60 | "strand": ["+", "-", "-"], 61 | "start": [3, -50, 0], 62 | "end": [6, 50, 2], 63 | }) 64 | 65 | print("Inputs:") 66 | 67 | print(df1) 68 | print(df2) 69 | 70 | 71 | # Vectorised: concatenate, then ngroup 72 | combo = pd.concat([df1[["chr", "strand"]], df2[["chr", "strand"]]], ignore_index=True) 73 | labels = combo.groupby(["chr", "strand"], sort=False).ngroup().astype(np.uint32).to_numpy() 74 | 75 | groups = labels[:len(df1)] 76 | groups2 = labels[len(df1):] 77 | 78 | idx1, idx2 = overlaps( 79 | starts=df1["start"].to_numpy(np.int32), 80 | ends=df1["end"].to_numpy(np.int32), 81 | starts2=df2["start"].to_numpy(np.int32), 82 | 
ends2=df2["end"].to_numpy(np.int32), 83 | groups=groups, 84 | groups2=groups2, 85 | ) 86 | 87 | 88 | print("Output:") 89 | print(idx1, idx2) 90 | 91 | print("Extracts rows:") 92 | print(df1.iloc[idx1]) 93 | print(df2.iloc[idx2]) 94 | 95 | # Inputs: 96 | # chr strand start end 97 | # 0 chr1 + 1 5 98 | # 1 chr1 + 10 15 99 | # 2 chr2 - 30 35 100 | # chr strand start end 101 | # 0 chr1 + 3 6 102 | # 1 chr2 - -50 50 103 | # 2 chr2 - 0 2 104 | # Output: 105 | # [0 2] [0 1] 106 | # Extracts rows: 107 | # chr strand start end 108 | # 0 chr1 + 1 5 109 | # 2 chr2 - 30 35 110 | # chr strand start end 111 | # 0 chr1 + 3 6 112 | # 1 chr2 - -50 50 113 | ``` 114 | 115 | ## 2. nearest 116 | 117 | ```python 118 | import numpy as np 119 | from ruranges import nearest 120 | 121 | starts = np.array([1, 10, 30], dtype=np.int32) 122 | ends = np.array([5, 15, 35], dtype=np.int32) 123 | starts2 = np.array([3, 20, 28], dtype=np.int32) 124 | ends2 = np.array([6, 25, 32], dtype=np.int32) 125 | 126 | idx1, idx2, dist = nearest( 127 | starts=starts, ends=ends, 128 | starts2=starts2, ends2=ends2, 129 | k=2, 130 | include_overlaps=False, 131 | direction="any", 132 | ) 133 | 134 | for a, b, d in zip(idx1, idx2, dist): 135 | print(f"query[{a}] <-> ref[{b}] : {d} bp") 136 | 137 | # query[0] <-> ref[1] : 16 bp 138 | # query[0] <-> ref[2] : 24 bp 139 | # query[1] <-> ref[0] : 5 bp 140 | # query[1] <-> ref[1] : 6 bp 141 | # query[2] <-> ref[1] : 6 bp 142 | # query[2] <-> ref[0] : 25 bp 143 | ``` 144 | 145 | Set direction to "forward" or "backward" to restrict to one side. 146 | 147 | --- 148 | 149 | ## 3. 
subtract 150 | 151 | ```python 152 | import numpy as np 153 | from ruranges import subtract 154 | 155 | starts = np.array([0, 10], dtype=np.int32) 156 | ends = np.array([10, 20], dtype=np.int32) 157 | starts2 = np.array([5, 12], dtype=np.int32) 158 | ends2 = np.array([15, 18], dtype=np.int32) 159 | 160 | idx_keep, sub_starts, sub_ends = subtract( 161 | starts, ends, 162 | starts2, ends2, 163 | ) 164 | 165 | print(idx_keep) 166 | print(sub_starts) 167 | print(sub_ends) 168 | # [0 1] 169 | # [ 0 18] 170 | # [ 5 20] 171 | ``` 172 | 173 | Here each input interval is trimmed to a single surviving piece, so every index appears once in idx\_keep. An interval that is split into several pieces is listed once per piece. 174 | 175 | --- 176 | 177 | ## FAQ 178 | 179 | ### Supported dtypes 180 | 181 | * Groups: uint8, uint16, uint32, uint64 182 | * Coordinates: int8, int16, int32, int64 183 | 184 | ### Do I need sorted intervals? 185 | 186 | No. Functions sort internally where needed and return index permutations so you can restore the original order. 187 | 188 | ### How to encode strand? 189 | 190 | Any function that needs strand expects a boolean array: True for the plus (forward) strand, False for the minus (reverse) strand. 191 | 192 | --- 193 | 194 | ## License 195 | 196 | Apache 2.0. See LICENSE for details. 197 | 198 | 199 | -------------------------------------------------------------------------------- /src/subtract.rs: -------------------------------------------------------------------------------- 1 | use num_traits::{PrimInt, Signed, Zero}; 2 | use radsort::sort_by_key; 3 | use rustc_hash::FxHashMap; 4 | use std::hash::Hash; 5 | 6 | use crate::{ruranges_structs::{GroupType, Interval, MinEvent, MinInterval, PositionType}, sorts}; 7 | 8 | pub fn sweep_line_subtract( 9 | chrs1: &[G], 10 | starts1: &[T], 11 | ends1: &[T], 12 | chrs2: &[G], 13 | starts2: &[T], 14 | ends2: &[T], 15 | ) -> (Vec, Vec, Vec) { 16 | // If either set is empty, set1 is unchanged (or trivially subtracted). 
17 | if chrs1.is_empty() || chrs2.is_empty() { 18 | return ( 19 | (0..chrs1.len() as u32).collect(), 20 | starts1.to_vec(), 21 | ends1.to_vec(), 22 | ); 23 | } 24 | 25 | // Build sorted events 26 | let events = 27 | sorts::build_sorted_events_idxs(chrs1, starts1, ends1, chrs2, starts2, ends2, T::zero()); 28 | 29 | let mut out_events = Vec::new(); 30 | 31 | 32 | // Track how many set2 intervals are active 33 | let mut active2_count: i64 = 0; 34 | 35 | // For each active interval in set1, store the position at which 36 | // we last started a "valid" sub-interval (when active2_count == 0). 37 | // i.e. active1[idx] = Some(position) means we are currently capturing 38 | // a sub-interval for that idx that started at `position`. 39 | let mut active1: FxHashMap> = FxHashMap::default(); 40 | 41 | let mut current_chr = events.first().unwrap().chr; 42 | 43 | // We'll sweep in ascending order 44 | for e in events.iter() { 45 | // If we jumped to a new chromosome, close out everything 46 | // because intervals do not cross chromosome boundaries. 47 | if e.chr != current_chr { 48 | // for any active sub-interval in the old chromosome, we close them at the last event pos 49 | // but in typical coordinate intervals, they should already be ended by the end event. 50 | // We'll do a final cleanup if you want. Usually, each interval on the old chr 51 | // has presumably ended with an event, but if not, you can decide to finalize them here. 52 | 53 | // Clear everything 54 | active1.clear(); 55 | active2_count = 0; 56 | current_chr = e.chr; 57 | } 58 | 59 | let pos = e.pos; 60 | 61 | // --- 1. If we have *just arrived* at a new position, and `active2_count == 0`, 62 | // we are "continuing" sub-intervals for all active1. 63 | // 64 | // But typically, the actual writing out of intervals 65 | // occurs at the event boundaries (start or end). 66 | // We'll handle that logic around the transitions. 67 | 68 | // --- 2. 
Now handle the event itself: 69 | 70 | if e.first_set { 71 | // This event is from set1 72 | if e.is_start { 73 | // A set1 interval starts 74 | // If we are outside set2 (active2_count==0), 75 | // that means we can immediately start capturing a sub-interval. 76 | if active2_count == 0 { 77 | active1.insert(e.idx, Some(pos)); 78 | } else { 79 | // set2 is active, so we do not start capturing yet 80 | active1.insert(e.idx, None); 81 | } 82 | } else { 83 | // A set1 interval ends 84 | // If we have been capturing a sub-interval for this idx, close it 85 | if let Some(start_pos) = active1.get(&e.idx).cloned().unwrap_or(None) { 86 | // We are capturing. End the sub-interval at e.pos 87 | if start_pos < pos { 88 | out_events.push(MinInterval {start: start_pos, end: pos, idx: e.idx}); 89 | } 90 | } 91 | // Remove it from active1 92 | active1.remove(&e.idx); 93 | } 94 | } else { 95 | // This event is from set2 96 | if e.is_start { 97 | // set2 interval starts 98 | active2_count += 1; 99 | 100 | // If we just went from 0 -> 1, that means we need to close 101 | // *all currently capturing intervals in set1* right at this boundary. 102 | if active2_count == 1 { 103 | // close everyone 104 | for (&idx1, &maybe_start) in active1.iter() { 105 | if let Some(start_pos) = maybe_start { 106 | // Close at current event pos (exclusive or inclusive depends on your semantics) 107 | if start_pos < pos { 108 | out_events.push(MinInterval {start: start_pos, end: pos, idx: idx1}); 109 | } 110 | } 111 | } 112 | // Now, set them all to None, since we cannot capture while set2 is active 113 | for v in active1.values_mut() { 114 | *v = None; 115 | } 116 | } 117 | } else { 118 | // set2 interval ends 119 | active2_count -= 1; 120 | 121 | // If we just went from 1 -> 0, that means we can *resume capturing* 122 | // for all set1 intervals that are still active. 
123 | if active2_count == 0 { 124 | // For every set1 interval that is active, we set the start to the boundary 125 | // so we resume capturing at e.pos 126 | for (_idx1, v) in active1.iter_mut() { 127 | if v.is_none() { 128 | *v = Some(pos); 129 | } 130 | } 131 | } 132 | } 133 | } 134 | 135 | // Optionally, you can look ahead to the next event's position 136 | // to handle the "between events" region if needed. 137 | // But typically, the creation of sub-intervals at boundaries is enough. 138 | 139 | // 3. Move on to the next event 140 | } 141 | sort_by_key(&mut out_events, |i| i.idx); 142 | 143 | // No final cleanup is strictly necessary if every set1 interval has a corresponding end event. 144 | let mut out_idxs = Vec::with_capacity(out_events.len()); 145 | let mut out_starts = Vec::with_capacity(out_events.len()); 146 | let mut out_ends = Vec::with_capacity(out_events.len()); 147 | 148 | for rec in out_events { 149 | out_idxs.push(rec.idx); 150 | out_starts.push(rec.start); 151 | out_ends.push(rec.end); 152 | } 153 | 154 | (out_idxs, out_starts, out_ends) 155 | } 156 | -------------------------------------------------------------------------------- /src/bindings/polars_bindings.rs: -------------------------------------------------------------------------------- 1 | // use std::str::FromStr; 2 | // 3 | // use polars::prelude::*; 4 | // use pyo3::exceptions::PyException; 5 | // use pyo3::prelude::*; 6 | // use pyo3_polars::PySeries; 7 | // 8 | // use crate::cluster::sweep_line_cluster; 9 | // use crate::merge::{self, sweep_line_merge}; 10 | // use crate::numpy_bindings::{keep_first_by_idx, OverlapType}; 11 | // use crate::overlaps::{self, sweep_line_overlaps_set1}; 12 | // use crate::ruranges_structs::OverlapPair; 13 | // 14 | // /// Helper function to convert a PySeries into a contiguous slice of u32. 15 | // fn pyseries_to_u32_slice(pyseries: PySeries) -> PyResult> { 16 | // // Access the inner Series from the PySeries tuple-struct. 
17 | // let series = (pyseries.0).rechunk(); 18 | // // Get the UInt32Chunked, mapping any Polars error into a PyException. 19 | // let ca = series 20 | // .u32() 21 | // .map_err(|e| PyException::new_err(e.to_string()))?; 22 | // // cont_slice() returns an Option<&[u32]>; convert it to a PyResult. 23 | // let slice = ca 24 | // .cont_slice() 25 | // .map_err(|e| PyException::new_err(e.to_string()))?; 26 | // Ok(slice.to_vec()) 27 | // } 28 | // 29 | // fn pyseries_to_i32_slice(pyseries: PySeries) -> PyResult> { 30 | // let series = (pyseries.0).rechunk(); 31 | // let ca = series 32 | // .i32() 33 | // .map_err(|e| PyException::new_err(e.to_string()))?; 34 | // let slice = ca 35 | // .cont_slice() 36 | // .map_err(|e| PyException::new_err(e.to_string()))?; 37 | // Ok(slice.to_vec()) 38 | // } 39 | // 40 | // /// PyO3 wrapper function that accepts PySeries objects, 41 | // /// converts them to slices using the helper functions, 42 | // /// calls your native sweep_line_overlaps_set1, and returns the result. 
43 | // #[pyfunction] 44 | // pub fn sweep_line_overlaps_set1_polars( 45 | // chrs: PySeries, 46 | // starts: PySeries, 47 | // ends: PySeries, 48 | // chrs2: PySeries, 49 | // starts2: PySeries, 50 | // ends2: PySeries, 51 | // slack: i32, 52 | // ) -> PyResult { 53 | // let chrs_slice = pyseries_to_u32_slice(chrs)?; 54 | // let starts_slice = pyseries_to_i32_slice(starts)?; 55 | // let ends_slice = pyseries_to_i32_slice(ends)?; 56 | // let chrs2_slice = pyseries_to_u32_slice(chrs2)?; 57 | // let starts2_slice = pyseries_to_i32_slice(starts2)?; 58 | // let ends2_slice = pyseries_to_i32_slice(ends2)?; 59 | // 60 | // let overlaps = sweep_line_overlaps_set1( 61 | // &chrs_slice, 62 | // &starts_slice, 63 | // &ends_slice, 64 | // &chrs2_slice, 65 | // &starts2_slice, 66 | // &ends2_slice, 67 | // slack, 68 | // ); 69 | // let out_series = Series::new("overlaps".into(), overlaps); 70 | // Ok(PySeries(out_series)) 71 | // } 72 | // 73 | // #[pyfunction] 74 | // pub fn cluster_polars( 75 | // chrs: PySeries, 76 | // starts: PySeries, 77 | // ends: PySeries, 78 | // slack: i32, 79 | // ) -> PyResult<(PySeries, PySeries)> { 80 | // let chrs_slice = pyseries_to_u32_slice(chrs)?; 81 | // let starts_slice = pyseries_to_i32_slice(starts)?; 82 | // let ends_slice = pyseries_to_i32_slice(ends)?; 83 | // 84 | // let (cluster_ids, row_nmb) = sweep_line_cluster(&chrs_slice, &starts_slice, &ends_slice, slack); 85 | // let idx_series = Series::new("row_nmb".into(), row_nmb); 86 | // let cluster_series = Series::new("cluster_id".into(), cluster_ids); 87 | // Ok((PySeries(cluster_series), PySeries(idx_series))) 88 | // } 89 | // 90 | // #[pyfunction] 91 | // pub fn chromsweep_polars( 92 | // _py: Python, 93 | // chrs: PySeries, 94 | // starts: PySeries, 95 | // ends: PySeries, 96 | // chrs2: PySeries, 97 | // starts2: PySeries, 98 | // ends2: PySeries, 99 | // slack: i32, 100 | // overlap_type: &str, 101 | // contained: bool, 102 | // ) -> PyResult<(PySeries, PySeries)> { 103 | // 
let chrs_slice = &pyseries_to_u32_slice(chrs)?; 104 | // let starts_slice = &pyseries_to_i32_slice(starts)?; 105 | // let ends_slice = &pyseries_to_i32_slice(ends)?; 106 | // let chrs2_slice = &pyseries_to_u32_slice(chrs2)?; 107 | // let starts2_slice = &pyseries_to_i32_slice(starts2)?; 108 | // let ends2_slice = &pyseries_to_i32_slice(ends2)?; 109 | // 110 | // let overlap_type = OverlapType::from_str(overlap_type).unwrap(); 111 | // let result = overlaps::sweep_line_overlaps( 112 | // chrs_slice, 113 | // starts_slice, 114 | // ends_slice, 115 | // chrs2_slice, 116 | // starts2_slice, 117 | // ends2_slice, 118 | // slack, 119 | // ); 120 | // 121 | // // let result: (Vec, Vec) = if !contained { 122 | // // let (sorted_starts, sorted_ends) = overlaps::compute_sorted_events( 123 | // // chrs_slice, 124 | // // starts_slice, 125 | // // ends_slice, 126 | // // slack, 127 | // // invert, 128 | // // ); 129 | // // let (sorted_starts2, sorted_ends2) = 130 | // // overlaps::compute_sorted_events(chrs2_slice, starts2_slice, ends2_slice, 0, invert); 131 | // 132 | // // let mut pairs = overlaps::sweep_line_overlaps_overlap_pair( 133 | // // &sorted_starts, 134 | // // &sorted_ends, 135 | // // &sorted_starts2, 136 | // // &sorted_ends2, 137 | // // ); 138 | // // eprintln!("indices found: {:?}", pairs.len()); 139 | // // if overlap_type != OverlapType::All { 140 | // // keep_first_by_idx(&mut pairs); 141 | // // } 142 | // // radsort::sort_by_key(&mut pairs, |p| (p.idx, p.idx2)); 143 | // // pairs.into_iter().map(|pair| (pair.idx, pair.idx2)).unzip() 144 | // // } else { 145 | // // let maxevents = overlaps::compute_sorted_maxevents( 146 | // // chrs_slice, 147 | // // starts_slice, 148 | // // ends_slice, 149 | // // chrs2_slice, 150 | // // starts2_slice, 151 | // // ends2_slice, 152 | // // slack, 153 | // // invert, 154 | // // ); 155 | // // let mut pairs = overlaps::sweep_line_overlaps_containment(maxevents); 156 | // // if overlap_type != OverlapType::All { 157 | 
// // keep_first_by_idx(&mut pairs); 158 | // // } 159 | // // radsort::sort_by_key(&mut pairs, |p| (p.idx, p.idx2)); 160 | // // pairs.into_iter().map(|pair| (pair.idx, pair.idx2)).unzip() 161 | // // }; 162 | // 163 | // let idx_series = Series::new("idx".into(), result.0); 164 | // let idx2_series = Series::new("idx2".into(), result.1); 165 | // Ok((PySeries(idx_series), PySeries(idx2_series))) 166 | // } 167 | // // #[pymodule] 168 | // // fn ruranges(m: &Bound<'_, PyModule>) -> PyResult<()> { 169 | // // // Use add_wrapped in this version of pyo3 170 | // // Ok(()) 171 | // // } 172 | // -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "autocfg" 7 | version = "1.4.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 10 | 11 | [[package]] 12 | name = "heck" 13 | version = "0.5.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 16 | 17 | [[package]] 18 | name = "indoc" 19 | version = "2.0.5" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" 22 | 23 | [[package]] 24 | name = "libc" 25 | version = "0.2.169" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" 28 | 29 | [[package]] 30 | name = "matrixmultiply" 31 | version = "0.3.9" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" 34 | 
dependencies = [ 35 | "autocfg", 36 | "rawpointer", 37 | ] 38 | 39 | [[package]] 40 | name = "memoffset" 41 | version = "0.9.1" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" 44 | dependencies = [ 45 | "autocfg", 46 | ] 47 | 48 | [[package]] 49 | name = "ndarray" 50 | version = "0.16.1" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" 53 | dependencies = [ 54 | "matrixmultiply", 55 | "num-complex", 56 | "num-integer", 57 | "num-traits", 58 | "portable-atomic", 59 | "portable-atomic-util", 60 | "rawpointer", 61 | ] 62 | 63 | [[package]] 64 | name = "num-complex" 65 | version = "0.4.6" 66 | source = "registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" 68 | dependencies = [ 69 | "num-traits", 70 | ] 71 | 72 | [[package]] 73 | name = "num-integer" 74 | version = "0.1.46" 75 | source = "registry+https://github.com/rust-lang/crates.io-index" 76 | checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" 77 | dependencies = [ 78 | "num-traits", 79 | ] 80 | 81 | [[package]] 82 | name = "num-traits" 83 | version = "0.2.19" 84 | source = "registry+https://github.com/rust-lang/crates.io-index" 85 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 86 | dependencies = [ 87 | "autocfg", 88 | ] 89 | 90 | [[package]] 91 | name = "numpy" 92 | version = "0.26.0" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "9b2dba356160b54f5371b550575b78130a54718b4c6e46b3f33a6da74a27e78b" 95 | dependencies = [ 96 | "libc", 97 | "ndarray", 98 | "num-complex", 99 | "num-integer", 100 | "num-traits", 101 | "pyo3", 102 | "pyo3-build-config", 103 | "rustc-hash", 104 | ] 105 | 106 | [[package]] 107 | name = "once_cell" 108 | 
version = "1.21.3" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" 111 | 112 | [[package]] 113 | name = "portable-atomic" 114 | version = "1.10.0" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" 117 | 118 | [[package]] 119 | name = "portable-atomic-util" 120 | version = "0.2.4" 121 | source = "registry+https://github.com/rust-lang/crates.io-index" 122 | checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" 123 | dependencies = [ 124 | "portable-atomic", 125 | ] 126 | 127 | [[package]] 128 | name = "proc-macro2" 129 | version = "1.0.92" 130 | source = "registry+https://github.com/rust-lang/crates.io-index" 131 | checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" 132 | dependencies = [ 133 | "unicode-ident", 134 | ] 135 | 136 | [[package]] 137 | name = "pyo3" 138 | version = "0.26.0" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" 141 | dependencies = [ 142 | "indoc", 143 | "libc", 144 | "memoffset", 145 | "once_cell", 146 | "portable-atomic", 147 | "pyo3-build-config", 148 | "pyo3-ffi", 149 | "pyo3-macros", 150 | "unindent", 151 | ] 152 | 153 | [[package]] 154 | name = "pyo3-build-config" 155 | version = "0.26.0" 156 | source = "registry+https://github.com/rust-lang/crates.io-index" 157 | checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" 158 | dependencies = [ 159 | "target-lexicon", 160 | ] 161 | 162 | [[package]] 163 | name = "pyo3-ffi" 164 | version = "0.26.0" 165 | source = "registry+https://github.com/rust-lang/crates.io-index" 166 | checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" 167 | dependencies = [ 168 | "libc", 169 | 
"pyo3-build-config", 170 | ] 171 | 172 | [[package]] 173 | name = "pyo3-macros" 174 | version = "0.26.0" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" 177 | dependencies = [ 178 | "proc-macro2", 179 | "pyo3-macros-backend", 180 | "quote", 181 | "syn", 182 | ] 183 | 184 | [[package]] 185 | name = "pyo3-macros-backend" 186 | version = "0.26.0" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" 189 | dependencies = [ 190 | "heck", 191 | "proc-macro2", 192 | "pyo3-build-config", 193 | "quote", 194 | "syn", 195 | ] 196 | 197 | [[package]] 198 | name = "quote" 199 | version = "1.0.37" 200 | source = "registry+https://github.com/rust-lang/crates.io-index" 201 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 202 | dependencies = [ 203 | "proc-macro2", 204 | ] 205 | 206 | [[package]] 207 | name = "radsort" 208 | version = "0.1.1" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "019b4b213425016d7d84a153c4c73afb0946fbb4840e4eece7ba8848b9d6da22" 211 | 212 | [[package]] 213 | name = "rawpointer" 214 | version = "0.2.1" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" 217 | 218 | [[package]] 219 | name = "ruranges" 220 | version = "0.1.0" 221 | dependencies = [ 222 | "num-traits", 223 | "numpy", 224 | "pyo3", 225 | "radsort", 226 | "rustc-hash", 227 | ] 228 | 229 | [[package]] 230 | name = "rustc-hash" 231 | version = "2.1.0" 232 | source = "registry+https://github.com/rust-lang/crates.io-index" 233 | checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" 234 | 235 | [[package]] 236 | name = "syn" 237 | version = "2.0.91" 238 | source = 
/// Returns `true` when `[a_start, a_end)` and `[b_start, b_end)` overlap after each end
/// coordinate has been extended by `slack`.
///
/// With `slack == 0` this is the usual half-open test, so intervals that merely touch
/// (`a_end == b_start`) do not overlap.
///
/// NOTE(review): the `<T>` parameter list was missing from the copy of this file under
/// review; the bounds below are the minimal ones the body needs. Confirm against the crate.
#[inline(always)]
fn overlaps_with_slack<T>(a_start: T, a_end: T, b_start: T, b_end: T, slack: T) -> bool
where
    T: Copy + PartialOrd + Add<Output = T>,
{
    a_start < (b_end + slack) && b_start < (a_end + slack)
}

/// Returns `true` when the inner interval lies within the outer one, allowing the inner
/// interval to stick out by up to `slack` on either side.
///
/// Bounds are inclusive: an interval contains itself.
#[inline(always)]
fn contains_with_slack<T>(
    outer_start: T,
    outer_end: T,
    inner_start: T,
    inner_end: T,
    slack: T,
) -> bool
where
    T: Copy + PartialOrd + Add<Output = T>,
{
    outer_start <= (inner_start + slack) && inner_end <= (outer_end + slack)
}
grp/end length mismatch"); 28 | 29 | if grp.is_empty() { 30 | return; 31 | } 32 | 33 | for i in 1..grp.len() { 34 | let g_prev = grp[i - 1]; 35 | let g_cur = grp[i]; 36 | if g_cur == g_prev { 37 | let s_prev = start[i - 1]; 38 | let s_cur = start[i]; 39 | if s_cur < s_prev { 40 | panic!( 41 | "{label}: not sorted by start within group at index {i}: grp={:?} prev_start={:?} cur_start={:?}", 42 | g_cur, s_prev, s_cur 43 | ); 44 | } 45 | } 46 | } 47 | } 48 | 49 | pub fn sweep_line_overlaps( 50 | grp1: &[C], 51 | start1: &[T], 52 | end1: &[T], 53 | grp2: &[C], 54 | start2: &[T], 55 | end2: &[T], 56 | slack: T, 57 | overlap_type: &str, 58 | contained: bool, 59 | no_checks: bool, 60 | ) -> (Vec, Vec) { 61 | let multiple = OverlapType::from_str(overlap_type) 62 | .expect("invalid overlap_type string"); 63 | 64 | if !no_checks { 65 | assert_sorted_by_group_then_start(grp1, start1, end1, "Left collection"); 66 | assert_sorted_by_group_then_start(grp2, start2, end2, "Right collection"); 67 | } 68 | let n1 = grp1.len(); 69 | let n2 = grp2.len(); 70 | 71 | let mut out1: Vec = Vec::new(); 72 | let mut out2: Vec = Vec::new(); 73 | 74 | // Pointer into right collection (by group). 75 | let mut j: usize = 0; 76 | 77 | // Active set of right indices for current group. 78 | let mut active: Vec = Vec::new(); 79 | let mut active_head: usize = 0; // logical head (retired items remain until occasional compaction) 80 | 81 | #[inline(always)] 82 | fn clear_active(active: &mut Vec, active_head: &mut usize) { 83 | active.clear(); 84 | *active_head = 0; 85 | } 86 | 87 | let mut i: usize = 0; 88 | while i < n1 && j < n2 { 89 | // Align groups 90 | let g1 = grp1[i]; 91 | let g2 = grp2[j]; 92 | 93 | if g1 < g2 { 94 | let gg = g1; 95 | while i < n1 && grp1[i] == gg { 96 | i += 1; 97 | } 98 | continue; 99 | } else if g2 < g1 { 100 | let gg = g2; 101 | while j < n2 && grp2[j] == gg { 102 | j += 1; 103 | } 104 | continue; 105 | } 106 | 107 | // Groups equal: process this group chunk. 
108 | let grp = g1; 109 | 110 | // Group ranges [i0, i1) and [j0, j1). 111 | let i0 = i; 112 | while i < n1 && grp1[i] == grp { 113 | i += 1; 114 | } 115 | let i1 = i; 116 | 117 | let j0 = j; 118 | while j < n2 && grp2[j] == grp { 119 | j += 1; 120 | } 121 | let j1 = j; 122 | 123 | // Reset sweep state for this group. 124 | clear_active(&mut active, &mut active_head); 125 | 126 | // Right pointer within this group. 127 | let mut jr = j0; 128 | 129 | // Sweep left intervals in this group. 130 | for il in i0..i1 { 131 | let a_start = start1[il]; 132 | let a_end = end1[il]; 133 | 134 | // Add to active: all right intervals whose start < a_end + slack. 135 | let a_end_slack = a_end + slack; 136 | while jr < j1 && start2[jr] < a_end_slack { 137 | active.push(jr); 138 | jr += 1; 139 | } 140 | 141 | // Retire: any right interval that is certainly too far left (end + slack <= a_start). 142 | while active_head < active.len() { 143 | let k = active[active_head]; 144 | if (end2[k] + slack) <= a_start { 145 | active_head += 1; 146 | } else { 147 | break; 148 | } 149 | } 150 | 151 | // Occasional compaction (cheap amortized). 
152 | if active_head > 0 && active_head * 2 >= active.len() { 153 | active.drain(0..active_head); 154 | active_head = 0; 155 | } 156 | 157 | match multiple { 158 | OverlapType::All => { 159 | for idx in active_head..active.len() { 160 | let r = active[idx]; 161 | let b_start = start2[r]; 162 | let b_end = end2[r]; 163 | 164 | if !overlaps_with_slack(a_start, a_end, b_start, b_end, slack) { 165 | continue; 166 | } 167 | if contained && !contained_either_direction(a_start, a_end, b_start, b_end, slack) { 168 | continue; 169 | } 170 | 171 | out1.push(il); 172 | out2.push(r); 173 | } 174 | } 175 | OverlapType::First => { 176 | for idx in active_head..active.len() { 177 | let r = active[idx]; 178 | let b_start = start2[r]; 179 | let b_end = end2[r]; 180 | 181 | if !overlaps_with_slack(a_start, a_end, b_start, b_end, slack) { 182 | continue; 183 | } 184 | if contained && !contained_either_direction(a_start, a_end, b_start, b_end, slack) { 185 | continue; 186 | } 187 | 188 | out1.push(il); 189 | out2.push(r); 190 | break; 191 | } 192 | } 193 | OverlapType::Last => { 194 | let mut last_r: Option = None; 195 | 196 | for idx in active_head..active.len() { 197 | let r = active[idx]; 198 | let b_start = start2[r]; 199 | let b_end = end2[r]; 200 | 201 | if !overlaps_with_slack(a_start, a_end, b_start, b_end, slack) { 202 | continue; 203 | } 204 | if contained && !contained_either_direction(a_start, a_end, b_start, b_end, slack) { 205 | continue; 206 | } 207 | 208 | last_r = Some(r); 209 | } 210 | 211 | if let Some(r) = last_r { 212 | out1.push(il); 213 | out2.push(r); 214 | } 215 | } 216 | } 217 | } 218 | } 219 | 220 | (out1, out2) 221 | } 222 | 223 | #[inline(always)] 224 | fn contained_either_direction(a_start: T, a_end: T, b_start: T, b_end: T, slack: T) -> bool { 225 | // Default interpretation: keep if A contains B OR B contains A (with slack). 
226 | contains_with_slack(a_start, a_end, b_start, b_end, slack) 227 | || contains_with_slack(b_start, b_end, a_start, a_end, slack) 228 | } 229 | -------------------------------------------------------------------------------- /src/multiprocessing.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use crate::ruranges_structs::MinEvent; 4 | 5 | 6 | pub fn find_chr_boundaries_minevents(data: &[MinEvent]) -> Vec { 7 | let mut boundaries = Vec::new(); 8 | 9 | // Start boundary (beginning of first chromosome group) 10 | boundaries.push(0); 11 | 12 | // Identify every index `i` where the chromosome changes 13 | for i in 1..data.len() { 14 | if data[i].chr != data[i - 1].chr { 15 | boundaries.push(i); 16 | } 17 | } 18 | 19 | // Final boundary (end of the last chromosome group) 20 | boundaries.push(data.len()); 21 | 22 | boundaries 23 | } 24 | 25 | /// Holds combined boundaries for a single chromosome across two vectors. 26 | #[derive(Debug, Clone)] 27 | pub struct ChrBound { 28 | pub chr: i64, 29 | pub start1: usize, 30 | pub end1: usize, 31 | pub start2: usize, 32 | pub end2: usize, 33 | } 34 | 35 | impl fmt::Display for ChrBound { 36 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 37 | // Customize the output format as desired. 38 | write!(f, "ChrBound {{ chr: {}, start1: {}, end1: {}, start2: {}, end2: {}, len1: {}, len2: {}, }}", 39 | self.chr, self.start1, self.end1, self.start2, self.end2, self.end1 - self.start1, self.end2 - self.start2) 40 | } 41 | } 42 | 43 | /// Returns boundary indices [0, ..., data.len()] whenever `chr` changes. 44 | /// E.g. if `data` has chr=1 for indices [0..2), chr=2 for [2..5), etc., 45 | /// then you might get [0, 2, 5] (and finally data.len()). 
46 | fn find_chr_boundaries(data: &[MinEvent]) -> Vec { 47 | let mut boundaries = Vec::new(); 48 | if data.is_empty() { 49 | return boundaries; 50 | } 51 | 52 | // Always push the start index 0 53 | boundaries.push(0); 54 | 55 | // Mark where the chromosome changes 56 | for i in 1..data.len() { 57 | if data[i].chr != data[i - 1].chr { 58 | boundaries.push(i); 59 | } 60 | } 61 | 62 | // Push the final end 63 | boundaries.push(data.len()); 64 | boundaries 65 | } 66 | 67 | /// Converts boundary indices into a list of (chr, start_index, end_index) blocks. 68 | /// Each block covers all MinEvents for a single chromosome in `data`. 69 | fn build_chr_blocks(data: &[MinEvent], boundaries: &[usize]) -> Vec<(i64, usize, usize)> { 70 | let mut blocks = Vec::new(); 71 | for w in boundaries.windows(2) { 72 | let start = w[0]; 73 | let end = w[1]; 74 | // If start < end, we have a real block 75 | if start < end { 76 | let chrom = data[start].chr; 77 | blocks.push((chrom, start, end)); 78 | } 79 | } 80 | blocks 81 | } 82 | 83 | 84 | /// A small helper struct for the final results. 85 | /// Each partition covers [start1..end1) in `sorted_starts` 86 | /// and [start2..end2) in `sorted_starts2`. 87 | #[derive(Debug)] 88 | pub struct PartitionIndex { 89 | pub start1: usize, 90 | pub end1: usize, 91 | pub start2: usize, 92 | pub end2: usize, 93 | } 94 | 95 | /// A helper struct to store the range of indices for a contiguous 96 | /// set of events on a single chromosome. 97 | #[derive(Debug)] 98 | struct ChromRange { 99 | chr: i64, 100 | start_idx: usize, 101 | end_idx: usize, // end_idx is exclusive 102 | } 103 | 104 | /// Given a sorted slice of MinEvents, group them by chromosome 105 | /// and return a Vec of (chr, start_idx, end_idx). 
106 | fn group_by_chromosome(events: &[MinEvent]) -> Vec { 107 | if events.is_empty() { 108 | return vec![]; 109 | } 110 | 111 | let mut ranges = Vec::new(); 112 | 113 | let mut current_chr = events[0].chr; 114 | let mut current_start = 0usize; 115 | 116 | for i in 1..events.len() { 117 | if events[i].chr != current_chr { 118 | // We've hit a new chromosome, close out the old range 119 | ranges.push(ChromRange { 120 | chr: current_chr, 121 | start_idx: current_start, 122 | end_idx: i, 123 | }); 124 | // start a new range 125 | current_chr = events[i].chr; 126 | current_start = i; 127 | } 128 | } 129 | // close the final range 130 | ranges.push(ChromRange { 131 | chr: current_chr, 132 | start_idx: current_start, 133 | end_idx: events.len(), 134 | }); 135 | 136 | ranges 137 | } 138 | 139 | /// Partition a single sorted slice (grouped by chromosome) into N partitions. 140 | /// Each partition is represented as (start_index, end_index) into the original slice. 141 | fn partition_chrom_ranges( 142 | events: &[MinEvent], 143 | num_partitions: usize, 144 | ) -> Vec<(usize, usize)> { 145 | if events.is_empty() { 146 | // If no events, return num_partitions empty partitions 147 | return (0..num_partitions).map(|_| (0, 0)).collect(); 148 | } 149 | if num_partitions == 0 { 150 | return vec![]; 151 | } 152 | 153 | let chrom_ranges = group_by_chromosome(events); 154 | 155 | // total events 156 | let total_len = events.len(); 157 | let target_chunk_size = (total_len as f64 / num_partitions as f64).ceil() as usize; 158 | 159 | let mut partitions = Vec::with_capacity(num_partitions); 160 | let mut current_start = chrom_ranges[0].start_idx; 161 | let mut accumulated = 0; // count of events in the current partition 162 | let mut partition_count = 1; 163 | 164 | for (i, chr_range) in chrom_ranges.iter().enumerate() { 165 | let chr_range_size = chr_range.end_idx - chr_range.start_idx; 166 | let tentative_new_size = accumulated + chr_range_size; 167 | 168 | // If adding this chromosome 
range exceeds target_chunk_size 169 | // and we still have space for more partitions, then we close 170 | // the current partition before adding this chromosome. 171 | if partition_count < num_partitions && // we can only "cut" if we still have partitions left to form 172 | accumulated > 0 && // avoid zero-length partition in normal logic 173 | (tentative_new_size > target_chunk_size) 174 | { 175 | // close out the previous partition 176 | partitions.push((current_start, chr_range.start_idx)); 177 | partition_count += 1; 178 | 179 | // start a new partition with this chromosome 180 | current_start = chr_range.start_idx; 181 | accumulated = 0; 182 | } 183 | 184 | accumulated += chr_range_size; 185 | 186 | // if this is the last chromosome or if we have formed 187 | // the last partition, we close it automatically 188 | if i == chrom_ranges.len() - 1 { 189 | // close out final partition 190 | partitions.push((current_start, chr_range.end_idx)); 191 | } 192 | } 193 | 194 | // If we still don't have exactly num_partitions, we can pad out 195 | // or merge the last ones. This naive approach just merges any 196 | // extras at the end if we created more than needed (which can 197 | // happen if lots of single-chr partitions blow up). 198 | // In many real-world scenarios, you might do a more sophisticated 199 | // balancing, but here we keep it simple. 
200 | if partitions.len() > num_partitions { 201 | // Merge the extra partitions into the last one 202 | let mut merged = partitions[0..(num_partitions - 1)].to_vec(); 203 | // The last partition we merge everything from the leftover 204 | let last_start = partitions[num_partitions - 1].0; 205 | let last_end = partitions.last().unwrap().1; 206 | merged.push((last_start, last_end)); 207 | partitions = merged; 208 | } else if partitions.len() < num_partitions { 209 | // If fewer partitions, we can just duplicate the last range as no-ops 210 | while partitions.len() < num_partitions { 211 | let last = *partitions.last().unwrap(); 212 | partitions.push(last); 213 | } 214 | } 215 | 216 | partitions 217 | } 218 | 219 | /// Create `num_partitions` partitions for *both* slices, ensuring no chromosome boundaries 220 | /// are crossed in either slice. Each returned element describes the start/end in slice1 221 | /// and the start/end in slice2. 222 | pub fn partition_two_arrays( 223 | sorted_starts: &[MinEvent], 224 | sorted_starts2: &[MinEvent], 225 | num_partitions: usize, 226 | ) -> Vec { 227 | let parts1 = partition_chrom_ranges(sorted_starts, num_partitions); 228 | let parts2 = partition_chrom_ranges(sorted_starts2, num_partitions); 229 | 230 | // Zip them into a single vector of PartitionIndex 231 | parts1 232 | .into_iter() 233 | .zip(parts2.into_iter()) 234 | .map(|((start1, end1), (start2, end2))| PartitionIndex { 235 | start1, 236 | end1, 237 | start2, 238 | end2, 239 | }) 240 | .collect() 241 | } 242 | 243 | #[cfg(test)] 244 | mod tests { 245 | use super::*; 246 | 247 | #[test] 248 | fn test_partition_two_arrays() { 249 | // A small mock dataset with two chromosomes, 5 events on chr1, 250 | // then 4 events on chr2, for each slice. 
251 | let ev1 = vec![ 252 | MinEvent { chr: 1, pos: 10, idx: 0 }, 253 | MinEvent { chr: 1, pos: 20, idx: 1 }, 254 | MinEvent { chr: 1, pos: 30, idx: 2 }, 255 | MinEvent { chr: 1, pos: 40, idx: 3 }, 256 | MinEvent { chr: 1, pos: 50, idx: 4 }, 257 | MinEvent { chr: 2, pos: 10, idx: 5 }, 258 | MinEvent { chr: 2, pos: 20, idx: 6 }, 259 | MinEvent { chr: 2, pos: 30, idx: 7 }, 260 | MinEvent { chr: 2, pos: 40, idx: 8 }, 261 | ]; 262 | let ev2 = vec![ 263 | MinEvent { chr: 1, pos: 15, idx: 0 }, 264 | MinEvent { chr: 1, pos: 25, idx: 1 }, 265 | MinEvent { chr: 1, pos: 35, idx: 2 }, 266 | MinEvent { chr: 2, pos: 5, idx: 3 }, 267 | MinEvent { chr: 2, pos: 15, idx: 4 }, 268 | MinEvent { chr: 2, pos: 25, idx: 5 }, 269 | ]; 270 | 271 | // Let's request 3 partitions 272 | let results = partition_two_arrays(&ev1, &ev2, 3); 273 | 274 | for (i, part) in results.iter().enumerate() { 275 | println!("Partition {}: {:?}", i, part); 276 | } 277 | 278 | // Here we only check that we got exactly 3 partitions: 279 | assert_eq!(results.len(), 3); 280 | 281 | // Additional checks or asserts can verify the boundaries do not cross chrs, etc. 
282 | } 283 | } 284 | -------------------------------------------------------------------------------- /src/spliced_subsequence.rs: -------------------------------------------------------------------------------- 1 | use radsort::sort_by_key; 2 | 3 | use crate::{ 4 | ruranges_structs::{GroupType, PositionType, SplicedSubsequenceInterval}, 5 | sorts::build_sorted_subsequence_intervals, 6 | }; 7 | 8 | /// (idxs, starts, ends, strands) for exactly one (start,end) slice 9 | fn global_shift(starts: &[T], ends: &[T]) -> T { 10 | let mut min_coord = T::zero(); 11 | for &v in starts { if v < min_coord { min_coord = v; } } 12 | for &v in ends { if v < min_coord { min_coord = v; } } 13 | if min_coord < T::zero() { -min_coord } else { T::zero() } 14 | } 15 | 16 | /// (idxs, starts, ends, strands) for **one** (start,end) slice 17 | pub fn spliced_subseq( 18 | chrs: &[G], 19 | starts: &[T], 20 | ends: &[T], 21 | strand_flags: &[bool], 22 | start: T, 23 | end: Option, 24 | force_plus_strand: bool, 25 | ) -> (Vec, Vec, Vec, Vec) { 26 | 27 | // ────────────────────────── 1. pre-processing: apply global shift ───── 28 | let shift = global_shift(starts, ends); 29 | 30 | // Either borrow the original slices (shift == 0) or build shifted copies. 31 | // `tmp_storage` keeps the vectors alive for as long as we need the slices. 
32 | let (starts_slice, ends_slice); 33 | let _tmp_storage: Option<(Vec, Vec)>; 34 | 35 | if shift > T::zero() { 36 | let mut s = Vec::with_capacity(starts.len()); 37 | let mut e = Vec::with_capacity(ends.len()); 38 | for i in 0..starts.len() { 39 | s.push(starts[i] + shift); 40 | e.push(ends [i] + shift); 41 | } 42 | _tmp_storage = Some((s, e)); 43 | let (s_ref, e_ref) = _tmp_storage.as_ref().unwrap(); 44 | starts_slice = s_ref.as_slice(); 45 | ends_slice = e_ref.as_slice(); 46 | } else { 47 | _tmp_storage = None; 48 | starts_slice = starts; 49 | ends_slice = ends; 50 | } 51 | // ─────────────────────────────────────────────────────────────────────── 52 | 53 | // ────────────── helper struct local to this function ─────────────────── 54 | struct OutRec { 55 | idx: u32, 56 | start: T, 57 | end: T, 58 | strand: bool, 59 | } 60 | 61 | // Build sorted interval vector (caller guarantees same grouping rules). 62 | let mut intervals = build_sorted_subsequence_intervals( 63 | chrs, 64 | starts_slice, 65 | ends_slice, 66 | strand_flags, 67 | ); 68 | 69 | // Early-exit when nothing to do 70 | if intervals.is_empty() { 71 | return (Vec::new(), Vec::new(), Vec::new(), Vec::new()); 72 | } 73 | 74 | let mut out_recs: Vec> = Vec::with_capacity(intervals.len()); 75 | 76 | let mut group_buf: Vec> = Vec::new(); 77 | let mut current_chr = intervals[0].chr; 78 | let mut running_sum = T::zero(); 79 | 80 | // ───────── helper: finalise one transcript/group ─────────────────────── 81 | let mut finalize_group = |group: &mut [SplicedSubsequenceInterval]| { 82 | if group.is_empty() { return; } 83 | 84 | // total spliced length 85 | let total_len = group.last().unwrap().temp_cumsum; 86 | let end_val = end.unwrap_or(total_len); 87 | 88 | // translate negative offsets 89 | let global_start = if start < T::zero() { total_len + start } else { start }; 90 | let global_end = if end_val < T::zero() { total_len + end_val } else { end_val }; 91 | 92 | let group_forward = group[0].forward_strand; 93 
| 94 | // per-exon closure so we don’t duplicate maths 95 | let mut process_iv = |iv: &mut SplicedSubsequenceInterval| { 96 | let cumsum_start = iv.temp_cumsum - iv.temp_length; 97 | let cumsum_end = iv.temp_cumsum; 98 | 99 | let mut st = iv.start; 100 | let mut en = iv.end; 101 | 102 | // coordinate arithmetic orientation 103 | let processed_forward = 104 | force_plus_strand || iv.forward_strand; 105 | 106 | if processed_forward { 107 | let shift = global_start - cumsum_start; 108 | if shift > T::zero() { st = st + shift; } 109 | let shift = cumsum_end - global_end; 110 | if shift > T::zero() { en = en - shift; } 111 | } else { 112 | let shift = global_start - cumsum_start; 113 | if shift > T::zero() { en = en - shift; } 114 | let shift = cumsum_end - global_end; 115 | if shift > T::zero() { st = st + shift; } 116 | } 117 | 118 | // keep only non-empty pieces 119 | if st < en { 120 | out_recs.push(OutRec { 121 | idx: iv.idx, 122 | start: st, 123 | end: en, 124 | strand: iv.forward_strand == processed_forward, // (+)*(+) or (−)*(−) → '+' 125 | }); 126 | } 127 | }; 128 | 129 | // walk exons in transcription order 130 | if group_forward { 131 | for iv in group.iter_mut() { process_iv(iv); } 132 | } else { 133 | for iv in group.iter_mut().rev() { process_iv(iv); } 134 | } 135 | }; 136 | // ─────────────────────────────────────────────────────────────────────── 137 | 138 | // single linear scan over all exons 139 | for mut iv in intervals.into_iter() { 140 | iv.start = iv.start.abs(); 141 | iv.end = iv.end.abs(); 142 | 143 | // new chromosome ⇒ flush buffer 144 | if iv.chr != current_chr { 145 | finalize_group(&mut group_buf); 146 | group_buf.clear(); 147 | running_sum = T::zero(); 148 | current_chr = iv.chr; 149 | } 150 | 151 | iv.temp_length = iv.end - iv.start; 152 | iv.temp_cumsum = running_sum + iv.temp_length; 153 | running_sum = iv.temp_cumsum; 154 | 155 | group_buf.push(iv); 156 | } 157 | finalize_group(&mut group_buf); 158 | 159 | // restore original row order 
160 | sort_by_key(&mut out_recs, |r| r.idx); 161 | 162 | // ───────── explode OutRec list into parallel result vectors ──────────── 163 | let mut out_idxs = Vec::with_capacity(out_recs.len()); 164 | let mut out_starts = Vec::with_capacity(out_recs.len()); 165 | let mut out_ends = Vec::with_capacity(out_recs.len()); 166 | let mut out_strands = Vec::with_capacity(out_recs.len()); 167 | 168 | for rec in out_recs { 169 | out_idxs.push(rec.idx); 170 | out_starts.push(rec.start); 171 | out_ends.push(rec.end); 172 | out_strands.push(rec.strand); 173 | } 174 | 175 | // ─────────────────────────── 3. post-processing: undo shift ──────────── 176 | if shift > T::zero() { 177 | for v in &mut out_starts { *v = *v - shift; } 178 | for v in &mut out_ends { *v = *v - shift; } 179 | } 180 | // ─────────────────────────────────────────────────────────────────────── 181 | 182 | (out_idxs, out_starts, out_ends, out_strands) 183 | } 184 | 185 | pub fn spliced_subseq_multi( 186 | chrs: &[G], 187 | starts: &[T], 188 | ends: &[T], 189 | strand_flags: &[bool], 190 | slice_starts: &[T], 191 | slice_ends: &[Option], 192 | force_plus_strand: bool, 193 | ) -> (Vec, Vec, Vec, Vec) { 194 | assert_eq!(chrs.len(), starts.len()); 195 | assert_eq!(starts.len(), ends.len()); 196 | assert_eq!(ends.len(), strand_flags.len()); 197 | assert_eq!(strand_flags.len(), slice_starts.len()); 198 | assert_eq!(slice_starts.len(), slice_ends.len()); 199 | 200 | let shift = global_shift(starts, ends); 201 | 202 | let (starts_slice, ends_slice); 203 | let _tmp_storage: Option<(Vec, Vec)>; 204 | if shift > T::zero() { 205 | let mut s = Vec::with_capacity(starts.len()); 206 | let mut e = Vec::with_capacity(ends.len()); 207 | for i in 0..starts.len() { 208 | s.push(starts[i] + shift); 209 | e.push(ends[i] + shift); 210 | } 211 | _tmp_storage = Some((s, e)); 212 | let (s_ref, e_ref) = _tmp_storage.as_ref().unwrap(); 213 | starts_slice = s_ref.as_slice(); 214 | ends_slice = e_ref.as_slice(); 215 | } else { 216 | 
_tmp_storage = None; 217 | starts_slice = starts; 218 | ends_slice = ends; 219 | } 220 | 221 | struct OutRec { 222 | idx: u32, 223 | start: T, 224 | end: T, 225 | strand: bool, 226 | } 227 | 228 | let mut intervals = 229 | build_sorted_subsequence_intervals(chrs, starts_slice, ends_slice, strand_flags); 230 | 231 | if intervals.is_empty() { 232 | return (Vec::new(), Vec::new(), Vec::new(), Vec::new()); 233 | } 234 | 235 | let mut out_recs: Vec> = Vec::with_capacity(intervals.len()); 236 | let mut group_buf: Vec> = Vec::new(); 237 | let mut current_chr = intervals[0].chr; 238 | let mut running_sum = T::zero(); 239 | let mut current_slice_start: T = slice_starts[intervals[0].idx as usize]; 240 | let mut current_slice_end: Option = slice_ends[intervals[0].idx as usize]; 241 | 242 | let mut finalize_group = |group: &mut [SplicedSubsequenceInterval], 243 | slice_start: T, 244 | slice_end: Option| { 245 | if group.is_empty() { 246 | return; 247 | } 248 | 249 | let total_len = group.last().unwrap().temp_cumsum; 250 | let end_val = slice_end.unwrap_or(total_len); 251 | 252 | let global_start = if slice_start < T::zero() { 253 | total_len + slice_start 254 | } else { 255 | slice_start 256 | }; 257 | let global_end = if end_val < T::zero() { 258 | total_len + end_val 259 | } else { 260 | end_val 261 | }; 262 | 263 | let group_forward = group[0].forward_strand; 264 | 265 | let mut process_iv = |iv: &mut SplicedSubsequenceInterval| { 266 | let cumsum_start = iv.temp_cumsum - iv.temp_length; 267 | let cumsum_end = iv.temp_cumsum; 268 | 269 | let mut st = iv.start; 270 | let mut en = iv.end; 271 | 272 | let processed_forward = force_plus_strand || iv.forward_strand; 273 | 274 | if processed_forward { 275 | let shift = global_start - cumsum_start; 276 | if shift > T::zero() { 277 | st = st + shift; 278 | } 279 | let shift = cumsum_end - global_end; 280 | if shift > T::zero() { 281 | en = en - shift; 282 | } 283 | } else { 284 | let shift = global_start - cumsum_start; 285 | if 
shift > T::zero() { 286 | en = en - shift; 287 | } 288 | let shift = cumsum_end - global_end; 289 | if shift > T::zero() { 290 | st = st + shift; 291 | } 292 | } 293 | 294 | if st < en { 295 | out_recs.push(OutRec { 296 | idx: iv.idx, 297 | start: st, 298 | end: en, 299 | strand: iv.forward_strand == processed_forward, 300 | }); 301 | } 302 | }; 303 | 304 | if group_forward { 305 | for iv in group.iter_mut() { 306 | process_iv(iv); 307 | } 308 | } else { 309 | for iv in group.iter_mut().rev() { 310 | process_iv(iv); 311 | } 312 | } 313 | }; 314 | 315 | for mut iv in intervals.into_iter() { 316 | iv.start = iv.start.abs(); 317 | iv.end = iv.end.abs(); 318 | 319 | if iv.chr != current_chr { 320 | finalize_group(&mut group_buf, current_slice_start, current_slice_end); 321 | group_buf.clear(); 322 | running_sum = T::zero(); 323 | current_chr = iv.chr; 324 | current_slice_start = slice_starts[iv.idx as usize]; 325 | current_slice_end = slice_ends[iv.idx as usize]; 326 | } 327 | 328 | iv.temp_length = iv.end - iv.start; 329 | iv.temp_cumsum = running_sum + iv.temp_length; 330 | running_sum = iv.temp_cumsum; 331 | 332 | group_buf.push(iv); 333 | } 334 | finalize_group(&mut group_buf, current_slice_start, current_slice_end); 335 | 336 | sort_by_key(&mut out_recs, |r| r.idx); 337 | 338 | let mut out_idxs = Vec::with_capacity(out_recs.len()); 339 | let mut out_starts = Vec::with_capacity(out_recs.len()); 340 | let mut out_ends = Vec::with_capacity(out_recs.len()); 341 | let mut out_strands = Vec::with_capacity(out_recs.len()); 342 | 343 | for rec in out_recs { 344 | out_idxs.push(rec.idx); 345 | out_starts.push(rec.start); 346 | out_ends.push(rec.end); 347 | out_strands.push(rec.strand); 348 | } 349 | 350 | if shift > T::zero() { 351 | for v in &mut out_starts { 352 | *v = *v - shift; 353 | } 354 | for v in &mut out_ends { 355 | *v = *v - shift; 356 | } 357 | } 358 | 359 | (out_idxs, out_starts, out_ends, out_strands) 360 | } 361 | 
-------------------------------------------------------------------------------- /src/sorts.rs: -------------------------------------------------------------------------------- 1 | use radsort::sort_by_key; 2 | 3 | use crate::ruranges_structs::Event; 4 | use crate::ruranges_structs::GenericEvent; 5 | use crate::ruranges_structs::GroupStruct; 6 | use crate::ruranges_structs::GroupType; 7 | use crate::ruranges_structs::Interval; 8 | use crate::ruranges_structs::MaxEvent; 9 | use crate::ruranges_structs::MinEvent; 10 | use crate::ruranges_structs::PositionType; 11 | use crate::ruranges_structs::SplicedSubsequenceInterval; 12 | use crate::ruranges_structs::SubsequenceInterval; 13 | 14 | pub fn build_intervals( 15 | chrs: &[C], 16 | starts: &[T], 17 | ends: &[T], 18 | sort_reverse_direction: Option<&[bool]>, 19 | slack: T, 20 | ) -> Vec> { 21 | let mut intervals = Vec::with_capacity(chrs.len()); 22 | match sort_reverse_direction { 23 | Some(reverse) => { 24 | for i in 0..chrs.len() { 25 | intervals.push(Interval { 26 | group: chrs[i], 27 | start: if reverse[i] { 28 | -(starts[i] - slack) 29 | } else { 30 | starts[i] - slack 31 | }, 32 | end: if reverse[i] { 33 | -(ends[i] + slack) 34 | } else { 35 | ends[i] + slack 36 | }, 37 | idx: i as u32, 38 | }); 39 | } 40 | } 41 | None => { 42 | for i in 0..chrs.len() { 43 | intervals.push(Interval { 44 | group: chrs[i], 45 | start: starts[i] - slack, 46 | end: ends[i] + slack, 47 | idx: i as u32, 48 | }); 49 | } 50 | } 51 | }; 52 | 53 | intervals 54 | } 55 | 56 | pub fn build_subsequence_intervals( 57 | chrs: &[G], 58 | starts: &[T], 59 | ends: &[T], 60 | strand_flags: &[bool], 61 | ) -> Vec> { 62 | let mut intervals = Vec::with_capacity(chrs.len()); 63 | for i in 0..chrs.len() { 64 | intervals.push(SplicedSubsequenceInterval { 65 | chr: chrs[i], 66 | start: if strand_flags[i] { 67 | starts[i] 68 | } else { 69 | -starts[i] 70 | }, // so that negative strand intervals are sorted in the correct direction 71 | end: if 
strand_flags[i] { ends[i] } else { -ends[i] }, // we will find the absolute value when using them 72 | idx: i as u32, 73 | forward_strand: strand_flags[i], 74 | temp_cumsum: T::zero(), 75 | temp_length: T::zero(), 76 | }); 77 | } 78 | 79 | intervals 80 | } 81 | 82 | pub fn build_sequence_intervals( 83 | chrs: &[i64], 84 | starts: &[i64], 85 | ends: &[i64], 86 | idxs: &[i64], 87 | strand_flags: &[bool], 88 | force_plus_strand: bool, 89 | ) -> Vec { 90 | let mut intervals: Vec = Vec::with_capacity(chrs.len()); 91 | for i in 0..chrs.len() { 92 | intervals.push(SubsequenceInterval { 93 | group_id: chrs[i], 94 | start: if force_plus_strand || strand_flags[i] { 95 | starts[i] 96 | } else { 97 | -starts[i] 98 | }, // so that negative strand intervals are sorted in the correct direction 99 | end: if force_plus_strand || strand_flags[i] { 100 | ends[i] 101 | } else { 102 | -ends[i] 103 | }, // we will find the absolute value when using them 104 | idx: idxs[i], 105 | forward_strand: strand_flags[i], 106 | }); 107 | } 108 | 109 | intervals 110 | } 111 | 112 | pub fn build_sorted_intervals( 113 | chrs: &[C], 114 | starts: &[T], 115 | ends: &[T], 116 | sort_reverse_direction: Option<&[bool]>, 117 | slack: T, 118 | sort_on_ends_too: bool, 119 | ) -> Vec> { 120 | let mut intervals = build_intervals(chrs, starts, ends, sort_reverse_direction, slack); 121 | 122 | if sort_on_ends_too { 123 | sort_by_key(&mut intervals, |i| i.end); 124 | }; 125 | sort_by_key(&mut intervals, |i| i.start); 126 | sort_by_key(&mut intervals, |i| i.group); 127 | 128 | intervals 129 | } 130 | 131 | pub fn build_sorted_subsequence_intervals( 132 | chrs: &[G], 133 | starts: &[T], 134 | ends: &[T], 135 | strand_flags: &[bool], 136 | ) -> Vec> { 137 | let mut intervals = build_subsequence_intervals(chrs, starts, ends, strand_flags); 138 | 139 | sort_by_key(&mut intervals, |i| i.end); 140 | sort_by_key(&mut intervals, |i| i.start); 141 | sort_by_key(&mut intervals, |i| i.chr); 142 | 143 | intervals 144 | } 145 
| 146 | pub fn build_sorted_sequence_intervals( 147 | chrs: &[i64], 148 | starts: &[i64], 149 | ends: &[i64], 150 | idxs: &[i64], 151 | strand_flags: &[bool], 152 | force_plus_strand: bool, 153 | ) -> Vec { 154 | let mut intervals = 155 | build_sequence_intervals(chrs, starts, ends, idxs, strand_flags, force_plus_strand); 156 | 157 | sort_by_key(&mut intervals, |i| i.end); 158 | sort_by_key(&mut intervals, |i| i.start); 159 | sort_by_key(&mut intervals, |i| i.group_id); 160 | 161 | intervals 162 | } 163 | 164 | pub fn sort_order_idx( 165 | chrs: &[G], 166 | starts: &[T], 167 | ends: &[T], 168 | sort_reverse_direction: Option<&[bool]>, 169 | ) -> Vec { 170 | build_sorted_intervals(chrs, starts, ends, sort_reverse_direction, T::zero(), true) 171 | .iter() 172 | .map(|i| i.idx) 173 | .collect() 174 | } 175 | 176 | pub fn build_sorted_events_single_position( 177 | chrs: &[C], 178 | pos: &[T], 179 | start: bool, 180 | first_set: bool, 181 | negative_position: bool, 182 | slack: T, 183 | ) -> Vec> { 184 | let mut events = Vec::with_capacity(2 * (chrs.len())); 185 | 186 | // Convert set1 intervals into events 187 | for i in 0..chrs.len() { 188 | let pos = if start { 189 | pos[i] - slack 190 | } else { 191 | pos[i] + slack 192 | }; 193 | events.push(Event { 194 | chr: chrs[i], 195 | pos: if negative_position { -pos } else { pos }, 196 | is_start: start, 197 | first_set: first_set, 198 | idx: i as u32, 199 | }); 200 | } 201 | 202 | sort_by_key(&mut events, |e| (e.chr, e.pos, e.is_start)); 203 | 204 | events 205 | } 206 | 207 | pub fn build_sorted_events_single_collection( 208 | chrs: &[C], 209 | starts: &[T], 210 | ends: &[T], 211 | slack: T, 212 | ) -> Vec> { 213 | let mut events = Vec::with_capacity(2 * (chrs.len())); 214 | 215 | // Convert set1 intervals into events 216 | for i in 0..chrs.len() { 217 | events.push(Event { 218 | chr: chrs[i], 219 | pos: starts[i], 220 | is_start: true, 221 | first_set: true, 222 | idx: i as u32, 223 | }); 224 | events.push(Event { 225 | 
chr: chrs[i], 226 | pos: ends[i] + slack, 227 | is_start: false, 228 | first_set: true, 229 | idx: i as u32, 230 | }); 231 | } 232 | 233 | // Sort events by: 234 | // 1. pos (ascending) 235 | // 2. is_start before is_end (if pos ties) 236 | // (We don't strictly need to tie-break by set_id or idx, but we can.) 237 | 238 | sort_by_key(&mut events, |e| e.is_start); 239 | sort_by_key(&mut events, |e| e.pos); 240 | sort_by_key(&mut events, |e| e.chr); 241 | 242 | events 243 | } 244 | 245 | pub fn build_sorted_events_single_collection_separate_outputs( 246 | chrs: &[C], 247 | pos: &[T], 248 | slack: T, 249 | ) -> Vec> { 250 | let mut out_pos: Vec> = Vec::with_capacity(chrs.len()); 251 | 252 | // Convert set1 intervals into events 253 | for i in 0..chrs.len() { 254 | out_pos.push(MinEvent { 255 | chr: chrs[i], 256 | pos: pos[i] - slack, 257 | idx: i as u32, 258 | }); 259 | } 260 | 261 | sort_by_key(&mut out_pos, |e| e.pos); 262 | sort_by_key(&mut out_pos, |e| e.chr); 263 | 264 | out_pos 265 | } 266 | 267 | pub fn build_sorted_groups( 268 | chrs: &[C], 269 | ) -> Vec { 270 | let mut out: Vec> = (0..chrs.len()) 271 | .map(|i| GroupStruct { chr: chrs[i], idx: i as u32 }) 272 | .collect(); 273 | 274 | out.sort_by_key(|e| e.chr); 275 | 276 | // take the chromosome field, cast to u32, collect ----------------------- 277 | out.into_iter().map(|e| e.idx).collect() 278 | } 279 | 280 | pub fn build_sorted_events_with_starts_ends( 281 | chrs: &[C], 282 | pos: &[T], 283 | slack: T, 284 | ) -> Vec> { 285 | let mut out_pos = Vec::with_capacity(chrs.len()); 286 | 287 | // Convert set1 intervals into events 288 | for i in 0..chrs.len() { 289 | out_pos.push(MinEvent { 290 | chr: chrs[i], 291 | pos: pos[i] - slack, 292 | idx: i as u32, 293 | }); 294 | } 295 | 296 | sort_by_key(&mut out_pos, |e| e.pos); 297 | sort_by_key(&mut out_pos, |e| e.chr); 298 | 299 | out_pos 300 | } 301 | 302 | pub fn build_sorted_events( 303 | chrs: &[C], 304 | starts: &[T], 305 | ends: &[T], 306 | chrs2: &[C], 
307 | starts2: &[T], 308 | ends2: &[T], 309 | slack: T, 310 | ) -> Vec> { 311 | let mut events = Vec::with_capacity(2 * (chrs.len() + chrs2.len())); 312 | 313 | // Convert set1 intervals into events 314 | for i in 0..chrs.len() { 315 | events.push(GenericEvent { 316 | chr: chrs[i], 317 | pos: if slack < starts[i] { 318 | starts[i] - slack 319 | } else { 320 | T::zero() 321 | }, 322 | is_start: true, 323 | first_set: true, 324 | idx: i as u32, 325 | }); 326 | events.push(GenericEvent { 327 | chr: chrs[i], 328 | pos: ends[i].saturating_add(slack), 329 | is_start: false, 330 | first_set: true, 331 | idx: i as u32, 332 | }); 333 | } 334 | 335 | for j in 0..chrs2.len() { 336 | events.push(GenericEvent { 337 | chr: chrs2[j], 338 | pos: starts2[j], 339 | is_start: true, 340 | first_set: false, 341 | idx: j as u32, 342 | }); 343 | events.push(GenericEvent { 344 | chr: chrs2[j], 345 | pos: ends2[j], 346 | is_start: false, 347 | first_set: false, 348 | idx: j as u32, 349 | }); 350 | } 351 | 352 | sort_by_key(&mut events, |e| e.is_start); 353 | sort_by_key(&mut events, |e| e.pos); 354 | sort_by_key(&mut events, |e| e.chr); 355 | 356 | events 357 | } 358 | 359 | pub fn build_sorted_maxevents_with_starts_ends( 360 | chrs: &[C], 361 | starts: &[T], 362 | ends: &[T], 363 | chrs2: &[C], 364 | starts2: &[T], 365 | ends2: &[T], 366 | slack: T, 367 | ) -> Vec> { 368 | let mut events = Vec::with_capacity(2 * (chrs.len() + chrs2.len())); 369 | 370 | // Convert set1 intervals into events 371 | for i in 0..chrs.len() { 372 | events.push(MaxEvent { 373 | chr: chrs[i], 374 | pos: starts[i] - slack, 375 | start: starts[i] - slack, 376 | end: ends[i] + slack, 377 | is_start: true, 378 | first_set: true, 379 | idx: i as u32, 380 | }); 381 | events.push(MaxEvent { 382 | chr: chrs[i], 383 | pos: ends[i] + slack, 384 | end: ends[i] + slack, 385 | start: starts[i] - slack, 386 | is_start: false, 387 | first_set: true, 388 | idx: i as u32, 389 | }); 390 | } 391 | 392 | for i in 0..chrs2.len() { 
393 | events.push(MaxEvent { 394 | chr: chrs2[i], 395 | pos: starts2[i], 396 | start: starts2[i], 397 | end: ends2[i], 398 | is_start: true, 399 | first_set: false, 400 | idx: i as u32, 401 | }); 402 | events.push(MaxEvent { 403 | chr: chrs2[i], 404 | pos: ends2[i], 405 | start: starts2[i], 406 | end: ends2[i], 407 | is_start: false, 408 | first_set: false, 409 | idx: i as u32, 410 | }); 411 | } 412 | 413 | sort_by_key(&mut events, |e| e.is_start); 414 | sort_by_key(&mut events, |e| e.pos); 415 | sort_by_key(&mut events, |e| e.chr); 416 | 417 | events 418 | } 419 | 420 | pub fn build_sorted_events_idxs( 421 | chrs: &[C], 422 | starts: &[T], 423 | ends: &[T], 424 | chrs2: &[C], 425 | starts2: &[T], 426 | ends2: &[T], 427 | slack: T, 428 | ) -> Vec> { 429 | let mut events = Vec::with_capacity(2 * (chrs.len() + chrs2.len())); 430 | 431 | // Convert set1 intervals into events 432 | for i in 0..chrs.len() { 433 | events.push(Event { 434 | chr: chrs[i], 435 | pos: starts[i] - slack, 436 | is_start: true, 437 | first_set: true, 438 | idx: i as u32, 439 | }); 440 | events.push(Event { 441 | chr: chrs[i], 442 | pos: ends[i] + slack, 443 | is_start: false, 444 | first_set: true, 445 | idx: i as u32, 446 | }); 447 | } 448 | 449 | for j in 0..chrs2.len() { 450 | events.push(Event { 451 | chr: chrs2[j], 452 | pos: starts2[j], 453 | is_start: true, 454 | first_set: false, 455 | idx: j as u32, 456 | }); 457 | events.push(Event { 458 | chr: chrs2[j], 459 | pos: ends2[j], 460 | is_start: false, 461 | first_set: false, 462 | idx: j as u32, 463 | }); 464 | } 465 | 466 | sort_by_key(&mut events, |e| e.is_start); 467 | sort_by_key(&mut events, |e| e.pos); 468 | sort_by_key(&mut events, |e| e.chr); 469 | 470 | events 471 | } 472 | 473 | pub fn build_sorted_events_from_intervals( 474 | intervals1: &mut [Interval], 475 | intervals2: &mut [Interval], 476 | ) -> Vec> { 477 | let mut events = Vec::with_capacity(2 * (intervals1.len() + intervals2.len())); 478 | 479 | // Convert set1 intervals 
into events 480 | for interval in intervals1 { 481 | events.push(Event { 482 | chr: interval.group, 483 | pos: interval.start, 484 | is_start: true, 485 | first_set: true, 486 | idx: interval.idx, 487 | }); 488 | events.push(Event { 489 | chr: interval.group, 490 | pos: interval.end, 491 | is_start: false, 492 | first_set: true, 493 | idx: interval.idx, 494 | }); 495 | } 496 | 497 | for interval in intervals2 { 498 | events.push(Event { 499 | chr: interval.group, 500 | pos: interval.start, 501 | is_start: true, 502 | first_set: false, 503 | idx: interval.idx, 504 | }); 505 | events.push(Event { 506 | chr: interval.group, 507 | pos: interval.end, 508 | is_start: false, 509 | first_set: false, 510 | idx: interval.idx, 511 | }); 512 | } 513 | 514 | // Sort events by: 515 | // 1. pos (ascending) 516 | // 2. is_start before is_end (if pos ties) 517 | // (We don't strictly need to tie-break by set_id or idx, but we can.) 518 | sort_by_key(&mut events, |e| e.is_start); 519 | sort_by_key(&mut events, |e| e.pos); 520 | 521 | events 522 | } 523 | -------------------------------------------------------------------------------- /src/tile.rs: -------------------------------------------------------------------------------- 1 | use crate::ruranges_structs::{GroupType, PositionType}; 2 | 3 | pub fn tile_grouped( 4 | chrs: &[C], 5 | starts: &[T], 6 | ends: &[T], 7 | negative_strand: &[bool], 8 | tile_size: T, 9 | ) -> (Vec, Vec, Vec, Vec) 10 | where 11 | T: PositionType, // signed integer-like 12 | C: GroupType + PartialEq, // unsigned integer-like; equality for boundaries 13 | { 14 | assert_eq!(starts.len(), ends.len()); 15 | assert_eq!(starts.len(), negative_strand.len()); 16 | assert_eq!(starts.len(), chrs.len()); 17 | 18 | let n = starts.len(); 19 | let mut out_starts = Vec::new(); 20 | let mut out_ends = Vec::new(); 21 | let mut out_indices = Vec::new(); 22 | let mut out_overlaps = Vec::new(); 23 | 24 | if n == 0 { 25 | return (out_starts, out_ends, out_indices, out_overlaps); 
26 | } 27 | 28 | let denom = tile_size.to_f64().unwrap(); 29 | 30 | // Walk groups of equal `chrs` (assumed sorted so equal keys are contiguous) 31 | let mut g_start = 0usize; 32 | while g_start < n { 33 | let mut g_end = g_start + 1; 34 | while g_end < n && chrs[g_end] == chrs[g_start] { 35 | g_end += 1; 36 | } 37 | 38 | // Process this group exactly like the original function (no cross-group state) 39 | for i in g_start..g_end { 40 | let s = starts[i]; 41 | let e = ends[i]; 42 | 43 | // Skip invalid intervals. 44 | if e <= s { 45 | continue; 46 | } 47 | 48 | if !negative_strand[i] { 49 | // === Forward direction (same as original) === 50 | 51 | // First tile boundary <= s (works for negatives too) 52 | let mut tile_start = if s >= T::zero() { 53 | (s / tile_size) * tile_size 54 | } else { 55 | let mut multiple = s / tile_size; 56 | if s % tile_size != T::zero() { 57 | multiple = multiple - T::one(); // round toward -inf 58 | } 59 | multiple * tile_size 60 | }; 61 | 62 | // Step forward over tiles and keep overlaps with [s, e) 63 | while tile_start < e { 64 | let tile_end = tile_start + tile_size; 65 | if tile_end > s && tile_start < e { 66 | let num: f64 = (tile_end.min(e) - tile_start.max(s)).to_f64().unwrap(); 67 | let overlap_fraction = num / denom; 68 | 69 | out_starts.push(tile_start); 70 | out_ends.push(tile_end); 71 | out_indices.push(i); 72 | out_overlaps.push(overlap_fraction); 73 | } 74 | tile_start = tile_start + tile_size; 75 | } 76 | } else { 77 | // === Reverse direction (emit right→left like original) === 78 | 79 | // First tile boundary >= e 80 | let mut tile_end = if e > T::zero() { 81 | // ceil(e / tile_size) * tile_size without using floating-point 82 | let div = (e - T::one()) / tile_size; // ensure exact multiples stay at e 83 | (div + T::one()) * tile_size 84 | } else { 85 | // e <= 0 86 | let mut multiple = e / tile_size; 87 | if e % tile_size != T::zero() { 88 | multiple = multiple - T::one(); // fix: was - T::zero() 89 | } 90 | multiple * 
tile_size 91 | }; 92 | 93 | // Walk backward over tiles and keep overlaps with [s, e) 94 | while tile_end > s { 95 | let tile_start = tile_end - tile_size; 96 | if tile_start < e && tile_end > s { 97 | let num: f64 = (tile_end.min(e) - tile_start.max(s)).to_f64().unwrap(); 98 | let overlap_fraction = num / denom; 99 | 100 | out_starts.push(tile_start); 101 | out_ends.push(tile_end); 102 | out_indices.push(i); 103 | out_overlaps.push(overlap_fraction); 104 | } 105 | tile_end = tile_end - tile_size; 106 | } 107 | } 108 | } 109 | 110 | g_start = g_end; 111 | } 112 | 113 | (out_starts, out_ends, out_indices, out_overlaps) 114 | } 115 | 116 | 117 | /// Returns tiled intervals along with the original row index and the tile overlap as a fraction of tile size. 118 | /// 119 | /// For each interval defined by `starts[i]` and `ends[i]`, the function splits the genome into 120 | /// fixed-size tiles of length `tile_size` (e.g., [tile_start, tile_start + tile_size)) and computes 121 | /// the fraction of each tile that overlaps the original interval. 122 | /// 123 | /// # Examples 124 | /// 125 | /// - For an interval 99–100 with tile size 100, the tile [0,100) gets an overlap fraction of 0.01. 126 | /// - For an interval 100–250 with tile size 100: 127 | /// - The tile [100,200) gets an overlap fraction of 1.0, 128 | /// - The tile [200,300) gets an overlap fraction of 0.5. 
129 | pub fn tile( 130 | starts: &[T], 131 | ends: &[T], 132 | negative_strand: &[bool], 133 | tile_size: T, 134 | ) -> (Vec, Vec, Vec, Vec) where T: PositionType { 135 | assert_eq!(starts.len(), ends.len()); 136 | assert_eq!(starts.len(), negative_strand.len()); 137 | 138 | let mut out_starts = Vec::new(); 139 | let mut out_ends = Vec::new(); 140 | let mut out_indices = Vec::new(); 141 | let mut out_overlaps = Vec::new(); 142 | let denom = tile_size.to_f64().unwrap(); 143 | 144 | for (i, ((&s, &e), &is_neg)) in starts 145 | .iter() 146 | .zip(ends.iter()) 147 | .zip(negative_strand.iter()) 148 | .enumerate() 149 | { 150 | // Skip invalid intervals. 151 | if e <= s { 152 | continue; 153 | } 154 | 155 | if !is_neg { 156 | // === Forward direction (same as original) === // 157 | 158 | // Determine the first tile boundary that is <= s. 159 | let mut tile_start = if s >= T::zero() { 160 | (s / tile_size) * tile_size 161 | } else { 162 | let mut multiple = s / tile_size; 163 | if s % tile_size != T::zero() { 164 | multiple = multiple - T::one(); 165 | } 166 | multiple * tile_size 167 | }; 168 | 169 | // Process each tile that may overlap [s, e). 170 | while tile_start < e { 171 | let tile_end = tile_start + tile_size; 172 | if tile_end > s && tile_start < e { 173 | // Calculate overlap fraction 174 | let num: f64 = (tile_end.min(e) - tile_start.max(s)).to_f64().unwrap(); 175 | let denom: f64 = tile_size.to_f64().unwrap(); 176 | let overlap_fraction = num / denom; 177 | out_starts.push(tile_start); 178 | out_ends.push(tile_end); 179 | out_indices.push(i); 180 | out_overlaps.push(overlap_fraction); 181 | } 182 | tile_start = tile_start + tile_size; 183 | } 184 | } else { 185 | // === Reverse direction === // 186 | 187 | // We want to find the first tile boundary >= e. 188 | // Because e could be negative or positive, we handle it similarly to the forward code, 189 | // but in reverse. 
190 | // 191 | // Example logic: 192 | // if e = 787 and tile_size = 100, 193 | // the first boundary >= 787 is 800 194 | // 195 | // For negative e, we do a similar approach but be mindful of rounding. 196 | let mut tile_end = if e > T::zero() { 197 | // Round up to nearest multiple 198 | let div = (e - T::one()) / tile_size; // subtract 1 so that exact multiple doesn't push us one step further 199 | (div + T::one()) * tile_size 200 | } else { 201 | // e is negative or 0 202 | let mut multiple = e / tile_size; 203 | if e % tile_size != T::zero() { 204 | multiple = multiple - T::zero(); // go one step "earlier" in negative direction 205 | } 206 | multiple * tile_size 207 | }; 208 | 209 | // Walk backward until the tile_end <= s 210 | while tile_end > s { 211 | let tile_start = tile_end - tile_size; 212 | // Still check for overlap with [s, e). 213 | if tile_start < e && tile_end > s { 214 | let num= (tile_end.min(e) - tile_start.max(s)).to_f64().unwrap(); 215 | let overlap_fraction = num / denom; 216 | // We keep intervals with the smaller coordinate as start: 217 | out_starts.push(tile_start); 218 | out_ends.push(tile_end); 219 | out_indices.push(i); 220 | out_overlaps.push(overlap_fraction); 221 | } 222 | tile_end = tile_end - tile_size; 223 | } 224 | } 225 | } 226 | 227 | (out_starts, out_ends, out_indices, out_overlaps) 228 | } 229 | 230 | 231 | use std::cmp::min; 232 | 233 | pub fn window_grouped( 234 | chrs: &[C], 235 | starts: &[T], 236 | ends: &[T], 237 | negative_strand: &[bool], 238 | window_size: T, 239 | ) -> (Vec, Vec, Vec) 240 | where 241 | T: PositionType, // PrimInt + Signed + Zero + etc. 
242 | C: GroupType + PartialEq, // PrimInt + Zero + equality to find boundaries 243 | { 244 | assert_eq!(starts.len(), ends.len()); 245 | assert_eq!(starts.len(), negative_strand.len()); 246 | assert_eq!(starts.len(), chrs.len()); 247 | assert!(window_size > T::zero()); 248 | 249 | let n = starts.len(); 250 | let mut out_starts = Vec::new(); 251 | let mut out_ends = Vec::new(); 252 | let mut out_indices = Vec::new(); 253 | 254 | if n == 0 { 255 | return (out_starts, out_ends, out_indices); 256 | } 257 | 258 | let mut g_start = 0usize; 259 | while g_start < n { 260 | // ----- find end of current group (maximal run of equal chrs) ----- 261 | let mut g_end = g_start + 1; 262 | while g_end < n && chrs[g_end] == chrs[g_start] { 263 | g_end += 1; 264 | } 265 | 266 | // ----- per-group state ----- 267 | // PLUS: carry how much we've filled of the current left->right window 268 | let mut carry_plus = T::zero(); 269 | 270 | // MINUS: how many bases we need at the LEFT of the next minus interval 271 | // to complete the current RIGHT-anchored window across this group 272 | let mut total_minus_len = T::zero(); 273 | for i in g_start..g_end { 274 | if negative_strand[i] { 275 | let len = ends[i] - starts[i]; 276 | if len > T::zero() { 277 | total_minus_len = total_minus_len + len; 278 | } 279 | } 280 | } 281 | let mut minus_needed = if total_minus_len.is_zero() { 282 | T::zero() 283 | } else { 284 | total_minus_len % window_size 285 | }; 286 | 287 | // ----- process intervals in the group ----- 288 | for i in g_start..g_end { 289 | let s = starts[i]; 290 | let e = ends[i]; 291 | if e <= s { 292 | continue; 293 | } 294 | 295 | if !negative_strand[i] { 296 | // ================= PLUS strand ================= 297 | let mut cur = s; 298 | let mut remaining = e - s; 299 | 300 | // 1) If we have a carry, complete that pending window first 301 | if !carry_plus.is_zero() { 302 | let need = window_size - carry_plus; 303 | let take = min(need, remaining); 304 | if take > T::zero() { 305 
| let seg_end = cur + take; 306 | out_starts.push(cur); 307 | out_ends.push(seg_end); 308 | out_indices.push(i); 309 | 310 | cur = seg_end; 311 | remaining = remaining - take; 312 | carry_plus = carry_plus + take; 313 | 314 | if carry_plus == window_size { 315 | carry_plus = T::zero(); // phase boundary reached 316 | } 317 | if remaining.is_zero() { 318 | continue; 319 | } 320 | } 321 | } 322 | 323 | // 2) Full windows 324 | while remaining >= window_size { 325 | let seg_end = cur + window_size; 326 | out_starts.push(cur); 327 | out_ends.push(seg_end); 328 | out_indices.push(i); 329 | 330 | cur = seg_end; 331 | remaining = remaining - window_size; 332 | } 333 | 334 | // 3) Tail becomes next carry 335 | if remaining > T::zero() { 336 | let seg_end = e; // cur + remaining 337 | out_starts.push(cur); 338 | out_ends.push(seg_end); 339 | out_indices.push(i); 340 | carry_plus = remaining; // read at start of next interval in this group 341 | } else { 342 | carry_plus = T::zero(); 343 | } 344 | } else { 345 | // ================= MINUS strand ================= 346 | // We’ll *collect* segments left→right, then emit them right→left 347 | // to match legacy ordering (rightmost segment first). 
348 | let mut cur = s; 349 | let mut remaining = e - s; 350 | let mut segs: Vec<(T, T)> = Vec::new(); 351 | 352 | // 1) Consume initial needed-at-left to align right-anchored phase 353 | if minus_needed > T::zero() { 354 | let take = min(minus_needed, remaining); 355 | let seg_end = cur + take; 356 | segs.push((cur, seg_end)); 357 | 358 | cur = seg_end; 359 | remaining = remaining - take; 360 | minus_needed = minus_needed - take; 361 | 362 | if remaining.is_zero() { 363 | // emit collected (just the leftmost piece) in reverse (trivial here) 364 | for (st, en) in segs.into_iter().rev() { 365 | out_starts.push(st); 366 | out_ends.push(en); 367 | out_indices.push(i); 368 | } 369 | continue; // still need more from next minus interval 370 | } 371 | } 372 | 373 | // 2) Full windows (collect left→right) 374 | while remaining >= window_size { 375 | let seg_end = cur + window_size; 376 | segs.push((cur, seg_end)); 377 | 378 | cur = seg_end; 379 | remaining = remaining - window_size; 380 | } 381 | 382 | // 3) Tail at RIGHT; set need for LEFT of next minus interval 383 | if remaining > T::zero() { 384 | let seg_end = e; // cur + remaining 385 | segs.push((cur, seg_end)); 386 | 387 | let tail = remaining; // 0 < tail < window_size here 388 | let rem = tail % window_size; // == tail 389 | minus_needed = if rem.is_zero() { T::zero() } else { window_size - rem }; 390 | } else { 391 | minus_needed = T::zero(); 392 | } 393 | 394 | // Emit minus-interval segments in reverse order (right→left) 395 | for (st, en) in segs.into_iter().rev() { 396 | out_starts.push(st); 397 | out_ends.push(en); 398 | out_indices.push(i); 399 | } 400 | } 401 | } 402 | 403 | g_start = g_end; 404 | } 405 | 406 | (out_starts, out_ends, out_indices) 407 | } -------------------------------------------------------------------------------- /src/numpy_bindings.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::str::FromStr; 3 | 4 | 
use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1}; 5 | use pyo3::exceptions::PyValueError; 6 | use pyo3::prelude::*; 7 | use pyo3::wrap_pyfunction; 8 | 9 | use bindings::numpy_bindings::overlaps_numpy::*; 10 | use bindings::numpy_bindings::overlaps_simple_numpy::*; 11 | use bindings::numpy_bindings::nearest_numpy::*; 12 | use bindings::numpy_bindings::subtract_numpy::*; 13 | use bindings::numpy_bindings::complement_overlaps_numpy::*; 14 | use bindings::numpy_bindings::count_overlaps_numpy::*; 15 | use bindings::numpy_bindings::sort_intervals_numpy::*; 16 | use bindings::numpy_bindings::cluster_numpy::*; 17 | use bindings::numpy_bindings::merge_numpy::*; 18 | use bindings::numpy_bindings::window_numpy::*; 19 | use bindings::numpy_bindings::tile_numpy::*; 20 | use bindings::numpy_bindings::max_disjoint_numpy::*; 21 | use bindings::numpy_bindings::extend_numpy::*; 22 | use bindings::numpy_bindings::complement_numpy::*; 23 | use bindings::numpy_bindings::boundary_numpy::*; 24 | use bindings::numpy_bindings::spliced_subsequence_numpy::*; 25 | use bindings::numpy_bindings::split_numpy::*; 26 | use bindings::numpy_bindings::genome_bounds_numpy::*; 27 | use bindings::numpy_bindings::group_cumsum_numpy::*; 28 | use bindings::numpy_bindings::map_to_global_numpy::*; 29 | 30 | use crate::bindings; 31 | 32 | 33 | #[derive(Debug, PartialEq)] 34 | enum Direction { 35 | Forward, 36 | Backward, 37 | Any, 38 | } 39 | 40 | impl FromStr for Direction { 41 | type Err = String; 42 | 43 | fn from_str(s: &str) -> Result { 44 | match s.to_lowercase().as_str() { 45 | "forward" => Ok(Direction::Forward), 46 | "backward" => Ok(Direction::Backward), 47 | "any" => Ok(Direction::Any), 48 | _ => Err(format!("Invalid direction: {}", s)), 49 | } 50 | } 51 | } 52 | 53 | 54 | #[pymodule] 55 | #[pyo3(name = "ruranges")] 56 | fn ruranges(m: &Bound<'_, PyModule>) -> PyResult<()> { 57 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u64_i64, m)?)?; 58 | 
m.add_function(wrap_pyfunction!(map_to_global_numpy_u32_i64, m)?)?; 59 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u32_i32, m)?)?; 60 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u32_i16, m)?)?; 61 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u16_i64, m)?)?; 62 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u16_i32, m)?)?; 63 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u16_i16, m)?)?; 64 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u8_i64, m)?)?; 65 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u8_i32, m)?)?; 66 | m.add_function(wrap_pyfunction!(map_to_global_numpy_u8_i16, m)?)?; 67 | 68 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u64_i64, m)?)?; 69 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u32_i64, m)?)?; 70 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u32_i32, m)?)?; 71 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u32_i16, m)?)?; 72 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u16_i64, m)?)?; 73 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u16_i32, m)?)?; 74 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u16_i16, m)?)?; 75 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u8_i64, m)?)?; 76 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u8_i32, m)?)?; 77 | m.add_function(wrap_pyfunction!(chromsweep_numpy_u8_i16, m)?)?; 78 | 79 | m.add_function(wrap_pyfunction!(sweepline_numpy_u64_i64, m)?)?; 80 | m.add_function(wrap_pyfunction!(sweepline_numpy_u32_i64, m)?)?; 81 | m.add_function(wrap_pyfunction!(sweepline_numpy_u32_i32, m)?)?; 82 | m.add_function(wrap_pyfunction!(sweepline_numpy_u32_i16, m)?)?; 83 | m.add_function(wrap_pyfunction!(sweepline_numpy_u16_i64, m)?)?; 84 | m.add_function(wrap_pyfunction!(sweepline_numpy_u16_i32, m)?)?; 85 | m.add_function(wrap_pyfunction!(sweepline_numpy_u16_i16, m)?)?; 86 | m.add_function(wrap_pyfunction!(sweepline_numpy_u8_i64, m)?)?; 87 | m.add_function(wrap_pyfunction!(sweepline_numpy_u8_i32, m)?)?; 88 | 
m.add_function(wrap_pyfunction!(sweepline_numpy_u8_i16, m)?)?; 89 | 90 | m.add_function(wrap_pyfunction!(nearest_numpy_u64_i64, m)?)?; 91 | m.add_function(wrap_pyfunction!(nearest_numpy_u32_i64, m)?)?; 92 | m.add_function(wrap_pyfunction!(nearest_numpy_u32_i32, m)?)?; 93 | m.add_function(wrap_pyfunction!(nearest_numpy_u32_i16, m)?)?; 94 | m.add_function(wrap_pyfunction!(nearest_numpy_u16_i64, m)?)?; 95 | m.add_function(wrap_pyfunction!(nearest_numpy_u16_i32, m)?)?; 96 | m.add_function(wrap_pyfunction!(nearest_numpy_u16_i16, m)?)?; 97 | m.add_function(wrap_pyfunction!(nearest_numpy_u8_i64, m)?)?; 98 | m.add_function(wrap_pyfunction!(nearest_numpy_u8_i32, m)?)?; 99 | m.add_function(wrap_pyfunction!(nearest_numpy_u8_i16, m)?)?; 100 | 101 | m.add_function(wrap_pyfunction!(subtract_numpy_u64_i64, m)?)?; 102 | m.add_function(wrap_pyfunction!(subtract_numpy_u32_i64, m)?)?; 103 | m.add_function(wrap_pyfunction!(subtract_numpy_u32_i32, m)?)?; 104 | m.add_function(wrap_pyfunction!(subtract_numpy_u32_i16, m)?)?; 105 | m.add_function(wrap_pyfunction!(subtract_numpy_u16_i64, m)?)?; 106 | m.add_function(wrap_pyfunction!(subtract_numpy_u16_i32, m)?)?; 107 | m.add_function(wrap_pyfunction!(subtract_numpy_u16_i16, m)?)?; 108 | m.add_function(wrap_pyfunction!(subtract_numpy_u8_i64, m)?)?; 109 | m.add_function(wrap_pyfunction!(subtract_numpy_u8_i32, m)?)?; 110 | m.add_function(wrap_pyfunction!(subtract_numpy_u8_i16, m)?)?; 111 | 112 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u64_i64, m)?)?; 113 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u32_i64, m)?)?; 114 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u32_i32, m)?)?; 115 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u32_i16, m)?)?; 116 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u16_i64, m)?)?; 117 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u16_i32, m)?)?; 118 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u16_i16, m)?)?; 119 | 
m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u8_i64, m)?)?; 120 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u8_i32, m)?)?; 121 | m.add_function(wrap_pyfunction!(complement_overlaps_numpy_u8_i16, m)?)?; 122 | 123 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u64_i64, m)?)?; 124 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u32_i64, m)?)?; 125 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u32_i32, m)?)?; 126 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u32_i16, m)?)?; 127 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u16_i64, m)?)?; 128 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u16_i32, m)?)?; 129 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u16_i16, m)?)?; 130 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u8_i64, m)?)?; 131 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u8_i32, m)?)?; 132 | m.add_function(wrap_pyfunction!(count_overlaps_numpy_u8_i16, m)?)?; 133 | 134 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u64_i64, m)?)?; 135 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u32_i64, m)?)?; 136 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u32_i32, m)?)?; 137 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u32_i16, m)?)?; 138 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u16_i64, m)?)?; 139 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u16_i32, m)?)?; 140 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u16_i16, m)?)?; 141 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u8_i64, m)?)?; 142 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u8_i32, m)?)?; 143 | m.add_function(wrap_pyfunction!(sort_intervals_numpy_u8_i16, m)?)?; 144 | 145 | m.add_function(wrap_pyfunction!(cluster_numpy_u64_i64, m)?)?; 146 | m.add_function(wrap_pyfunction!(cluster_numpy_u32_i64, m)?)?; 147 | m.add_function(wrap_pyfunction!(cluster_numpy_u32_i32, m)?)?; 148 | 
m.add_function(wrap_pyfunction!(cluster_numpy_u32_i16, m)?)?; 149 | m.add_function(wrap_pyfunction!(cluster_numpy_u16_i64, m)?)?; 150 | m.add_function(wrap_pyfunction!(cluster_numpy_u16_i32, m)?)?; 151 | m.add_function(wrap_pyfunction!(cluster_numpy_u16_i16, m)?)?; 152 | m.add_function(wrap_pyfunction!(cluster_numpy_u8_i64, m)?)?; 153 | m.add_function(wrap_pyfunction!(cluster_numpy_u8_i32, m)?)?; 154 | m.add_function(wrap_pyfunction!(cluster_numpy_u8_i16, m)?)?; 155 | 156 | m.add_function(wrap_pyfunction!(merge_numpy_u64_i64, m)?)?; 157 | m.add_function(wrap_pyfunction!(merge_numpy_u32_i64, m)?)?; 158 | m.add_function(wrap_pyfunction!(merge_numpy_u32_i32, m)?)?; 159 | m.add_function(wrap_pyfunction!(merge_numpy_u32_i16, m)?)?; 160 | m.add_function(wrap_pyfunction!(merge_numpy_u16_i64, m)?)?; 161 | m.add_function(wrap_pyfunction!(merge_numpy_u16_i32, m)?)?; 162 | m.add_function(wrap_pyfunction!(merge_numpy_u16_i16, m)?)?; 163 | m.add_function(wrap_pyfunction!(merge_numpy_u8_i64, m)?)?; 164 | m.add_function(wrap_pyfunction!(merge_numpy_u8_i32, m)?)?; 165 | m.add_function(wrap_pyfunction!(merge_numpy_u8_i16, m)?)?; 166 | 167 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u64_i64, m)?)?; 168 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u32_i64, m)?)?; 169 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u32_i32, m)?)?; 170 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u32_i16, m)?)?; 171 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u16_i64, m)?)?; 172 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u16_i32, m)?)?; 173 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u16_i16, m)?)?; 174 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u8_i64, m)?)?; 175 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u8_i32, m)?)?; 176 | m.add_function(wrap_pyfunction!(max_disjoint_numpy_u8_i16, m)?)?; 177 | 178 | m.add_function(wrap_pyfunction!(complement_numpy_u64_i64, m)?)?; 179 | 
m.add_function(wrap_pyfunction!(complement_numpy_u32_i64, m)?)?; 180 | m.add_function(wrap_pyfunction!(complement_numpy_u32_i32, m)?)?; 181 | m.add_function(wrap_pyfunction!(complement_numpy_u32_i16, m)?)?; 182 | m.add_function(wrap_pyfunction!(complement_numpy_u16_i64, m)?)?; 183 | m.add_function(wrap_pyfunction!(complement_numpy_u16_i32, m)?)?; 184 | m.add_function(wrap_pyfunction!(complement_numpy_u16_i16, m)?)?; 185 | m.add_function(wrap_pyfunction!(complement_numpy_u8_i64, m)?)?; 186 | m.add_function(wrap_pyfunction!(complement_numpy_u8_i32, m)?)?; 187 | m.add_function(wrap_pyfunction!(complement_numpy_u8_i16, m)?)?; 188 | 189 | 190 | m.add_function(wrap_pyfunction!(window_numpy_u64_i64, m)?)?; 191 | m.add_function(wrap_pyfunction!(window_numpy_u32_i64, m)?)?; 192 | m.add_function(wrap_pyfunction!(window_numpy_u32_i32, m)?)?; 193 | m.add_function(wrap_pyfunction!(window_numpy_u32_i16, m)?)?; 194 | m.add_function(wrap_pyfunction!(window_numpy_u16_i64, m)?)?; 195 | m.add_function(wrap_pyfunction!(window_numpy_u16_i32, m)?)?; 196 | m.add_function(wrap_pyfunction!(window_numpy_u16_i16, m)?)?; 197 | m.add_function(wrap_pyfunction!(window_numpy_u8_i64, m)?)?; 198 | m.add_function(wrap_pyfunction!(window_numpy_u8_i32, m)?)?; 199 | m.add_function(wrap_pyfunction!(window_numpy_u8_i16, m)?)?; 200 | 201 | m.add_function(wrap_pyfunction!(tile_numpy_i64, m)?)?; 202 | m.add_function(wrap_pyfunction!(tile_numpy_i32, m)?)?; 203 | m.add_function(wrap_pyfunction!(tile_numpy_i16, m)?)?; 204 | 205 | m.add_function(wrap_pyfunction!(boundary_numpy_u64_i64, m)?)?; 206 | m.add_function(wrap_pyfunction!(boundary_numpy_u32_i64, m)?)?; 207 | m.add_function(wrap_pyfunction!(boundary_numpy_u32_i32, m)?)?; 208 | m.add_function(wrap_pyfunction!(boundary_numpy_u32_i16, m)?)?; 209 | m.add_function(wrap_pyfunction!(boundary_numpy_u16_i64, m)?)?; 210 | m.add_function(wrap_pyfunction!(boundary_numpy_u16_i32, m)?)?; 211 | m.add_function(wrap_pyfunction!(boundary_numpy_u16_i16, m)?)?; 212 | 
m.add_function(wrap_pyfunction!(boundary_numpy_u8_i64, m)?)?; 213 | m.add_function(wrap_pyfunction!(boundary_numpy_u8_i32, m)?)?; 214 | m.add_function(wrap_pyfunction!(boundary_numpy_u8_i16, m)?)?; 215 | 216 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u64_i64, m)?)?; 217 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u32_i64, m)?)?; 218 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u32_i32, m)?)?; 219 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u32_i16, m)?)?; 220 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u16_i64, m)?)?; 221 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u16_i32, m)?)?; 222 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u16_i16, m)?)?; 223 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u8_i64, m)?)?; 224 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u8_i32, m)?)?; 225 | m.add_function(wrap_pyfunction!(spliced_subsequence_numpy_u8_i16, m)?)?; 226 | 227 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u64_i64, m)?)?; 228 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u32_i64, m)?)?; 229 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u32_i32, m)?)?; 230 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u32_i16, m)?)?; 231 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u16_i64, m)?)?; 232 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u16_i32, m)?)?; 233 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u16_i16, m)?)?; 234 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u8_i64, m)?)?; 235 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u8_i32, m)?)?; 236 | m.add_function(wrap_pyfunction!(spliced_subsequence_multi_numpy_u8_i16, m)?)?; 237 | 238 | m.add_function(wrap_pyfunction!(extend_numpy_u64_i64, m)?)?; 239 | m.add_function(wrap_pyfunction!(extend_numpy_u32_i64, 
m)?)?; 240 | m.add_function(wrap_pyfunction!(extend_numpy_u32_i32, m)?)?; 241 | m.add_function(wrap_pyfunction!(extend_numpy_u32_i16, m)?)?; 242 | m.add_function(wrap_pyfunction!(extend_numpy_u16_i64, m)?)?; 243 | m.add_function(wrap_pyfunction!(extend_numpy_u16_i32, m)?)?; 244 | m.add_function(wrap_pyfunction!(extend_numpy_u16_i16, m)?)?; 245 | m.add_function(wrap_pyfunction!(extend_numpy_u8_i64, m)?)?; 246 | m.add_function(wrap_pyfunction!(extend_numpy_u8_i32, m)?)?; 247 | m.add_function(wrap_pyfunction!(extend_numpy_u8_i16, m)?)?; 248 | 249 | m.add_function(wrap_pyfunction!(split_numpy_u64_i64, m)?)?; 250 | m.add_function(wrap_pyfunction!(split_numpy_u32_i64, m)?)?; 251 | m.add_function(wrap_pyfunction!(split_numpy_u32_i32, m)?)?; 252 | m.add_function(wrap_pyfunction!(split_numpy_u32_i16, m)?)?; 253 | m.add_function(wrap_pyfunction!(split_numpy_u16_i64, m)?)?; 254 | m.add_function(wrap_pyfunction!(split_numpy_u16_i32, m)?)?; 255 | m.add_function(wrap_pyfunction!(split_numpy_u16_i16, m)?)?; 256 | m.add_function(wrap_pyfunction!(split_numpy_u8_i64, m)?)?; 257 | m.add_function(wrap_pyfunction!(split_numpy_u8_i32, m)?)?; 258 | m.add_function(wrap_pyfunction!(split_numpy_u8_i16, m)?)?; 259 | 260 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u64_i64, m)?)?; 261 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u32_i64, m)?)?; 262 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u32_i32, m)?)?; 263 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u32_i16, m)?)?; 264 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u16_i64, m)?)?; 265 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u16_i32, m)?)?; 266 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u16_i16, m)?)?; 267 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u8_i64, m)?)?; 268 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u8_i32, m)?)?; 269 | m.add_function(wrap_pyfunction!(genome_bounds_numpy_u8_i16, m)?)?; 270 | 271 | 
m.add_function(wrap_pyfunction!(group_cumsum_numpy_u64_i64, m)?)?; 272 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u32_i64, m)?)?; 273 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u32_i32, m)?)?; 274 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u32_i16, m)?)?; 275 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u16_i64, m)?)?; 276 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u16_i32, m)?)?; 277 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u16_i16, m)?)?; 278 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u8_i64, m)?)?; 279 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u8_i32, m)?)?; 280 | m.add_function(wrap_pyfunction!(group_cumsum_numpy_u8_i16, m)?)?; 281 | 282 | Ok(()) 283 | } 284 | -------------------------------------------------------------------------------- /src/nearest.rs: -------------------------------------------------------------------------------- 1 | use std::{str::FromStr, time::Instant}; 2 | 3 | use radsort::sort_by_key; 4 | 5 | use crate::{ 6 | overlaps::{self, sweep_line_overlaps, sweep_line_overlaps_overlap_pair}, 7 | ruranges_structs::{GroupType, MinEvent, Nearest, OverlapPair, PositionType}, 8 | sorts::build_sorted_events_single_collection_separate_outputs, 9 | }; 10 | 11 | /// For each MinEvent in `sorted_ends`, find up to `k` *unique positions* 12 | /// in `sorted_starts2` that lie to the right (including equal position on the 13 | /// same chromosome). If multiple entries in `sorted_starts2` share the same 14 | /// position, they all get reported, but they count as one unique position. 15 | pub fn nearest_intervals_to_the_right( 16 | sorted_ends: Vec>, 17 | sorted_starts2: Vec>, 18 | k: usize, 19 | ) -> Vec> { 20 | // We might need more than `sorted_ends.len()` because each end could 21 | // contribute up to `k` *unique positions* (potentially multiplied by the 22 | // number of intervals sharing those positions). So we set capacity 23 | // accordingly. 
24 | // This is not strictly required, but it helps performance to reserve enough space. 25 | let mut output = Vec::with_capacity(sorted_ends.len().saturating_mul(k)); 26 | 27 | let n_starts = sorted_starts2.len(); 28 | 29 | // `j` will track our position in sorted_starts2 as we move through sorted_ends. 30 | let mut j = 0usize; 31 | 32 | // Iterate over each 'end' event 33 | for end in &sorted_ends { 34 | let end_chr = end.chr; 35 | let end_pos = end.pos; 36 | 37 | // Advance `j` so that sorted_starts2[j] is the first start 38 | // that is >= end_pos on the same chrom (or beyond). 39 | // Because both arrays are sorted, we never need to move `j` backward. 40 | while j < n_starts { 41 | let start = &sorted_starts2[j]; 42 | if start.chr < end_chr { 43 | // still on a smaller chromosome; move j forward 44 | j += 1; 45 | } else if start.chr == end_chr && start.pos < end_pos { 46 | // same chrom but still to the left; move j forward 47 | j += 1; 48 | } else { 49 | // now start.chr > end_chr (i.e. next chromosome) OR 50 | // start.chr == end_chr && start.pos >= end_pos 51 | // -> we've reached a region that is "to the right" or next chrom 52 | break; 53 | } 54 | } 55 | 56 | // Now collect up to k unique positions (on the same chromosome). 57 | let mut unique_count = 0; 58 | let mut last_pos: Option = None; 59 | 60 | // We'll scan from `j` onward, but we do NOT move `j` itself 61 | // because the next 'end' might need a similar or slightly advanced position. 62 | // Instead, we use `local_idx` to look ahead for this specific end. 63 | let mut local_idx = j; 64 | while local_idx < n_starts { 65 | let start = &sorted_starts2[local_idx]; 66 | 67 | // If we've passed beyond the chromosome of this end, we won't find 68 | // any more right-side intervals for this end. 
69 | if start.chr != end_chr { 70 | break; 71 | } 72 | 73 | // Check if we're at a new unique position 74 | if last_pos.map_or(true, |lp| start.pos != lp) { 75 | unique_count += 1; 76 | if unique_count > k { 77 | // we've reached the limit of k unique positions 78 | break; 79 | } 80 | last_pos = Some(start.pos); 81 | } 82 | 83 | // This start is included in the results 84 | let distance = start.pos - end_pos + T::one(); // can be 0 or positive 85 | output.push(Nearest { 86 | distance, 87 | idx: end.idx, 88 | idx2: start.idx, 89 | }); 90 | 91 | local_idx += 1; 92 | } 93 | } 94 | 95 | output 96 | } 97 | 98 | /// For each MinEvent in `sorted_ends`, find up to `k` *unique positions* 99 | /// in `sorted_starts2` that lie to the left (strictly smaller position on 100 | /// the same chromosome). If multiple entries in `sorted_starts2` share 101 | /// the same position, they all get reported, but they count as one 102 | /// unique position in the limit `k`. 103 | pub fn nearest_intervals_to_the_left( 104 | sorted_ends: Vec>, 105 | sorted_starts2: Vec>, 106 | k: usize, 107 | ) -> Vec> { 108 | // The max possible size is (number of ends) * (k + duplicates at each of those k positions). 109 | // We reserve a rough upper bound for efficiency. 110 | let mut output = Vec::with_capacity(sorted_ends.len().saturating_mul(k)); 111 | 112 | let n_starts = sorted_starts2.len(); 113 | let mut j = 0_usize; // Points into sorted_starts2 114 | 115 | for end in &sorted_ends { 116 | let end_chr = end.chr; 117 | let end_pos = end.pos; 118 | 119 | // Move `j` forward so that: 120 | // - All start events at indices < j have start.chr < end_chr 121 | // OR (start.chr == end_chr && start.pos < end_pos). 122 | // - Equivalently, sorted_starts2[j] is the *first* event that is NOT 123 | // strictly to the left of `end`. 
124 | while j < n_starts { 125 | let start = &sorted_starts2[j]; 126 | if start.chr < end_chr { 127 | // still a smaller chromosome => definitely to the left 128 | j += 1; 129 | } else if start.chr == end_chr && start.pos <= end_pos { 130 | // same chrom, smaller position => to the left 131 | j += 1; 132 | } else { 133 | // we've reached a start that is not to the left 134 | break; 135 | } 136 | } 137 | 138 | // Now, everything in [0..j) is strictly to the left of `end`. 139 | // We'll look backwards from j-1 to gather up to k unique positions 140 | // on the same chromosome. 141 | if j == 0 { 142 | // No intervals to the left; skip 143 | continue; 144 | } 145 | 146 | let mut local_idx = j - 1; 147 | let mut unique_count = 0; 148 | let mut last_pos: Option = None; 149 | 150 | // Descend from j-1 down to 0 (or until we break). 151 | loop { 152 | let start = &sorted_starts2[local_idx]; 153 | 154 | // Must match the same chromosome 155 | if start.chr != end_chr { 156 | break; 157 | } 158 | 159 | // Check if we have a new (unique) position 160 | if last_pos.map_or(true, |lp| start.pos != lp) { 161 | unique_count += 1; 162 | if unique_count > k { 163 | break; 164 | } 165 | last_pos = Some(start.pos); 166 | } 167 | 168 | // Calculate the distance (end.pos - start.pos) 169 | // Here, start.pos < end.pos by definition if we get here. 
170 | let distance = end_pos - start.pos + T::one(); 171 | output.push(Nearest { 172 | distance, 173 | idx: end.idx, // the 'end' event's idx 174 | idx2: start.idx, // the 'start' event's idx 175 | }); 176 | 177 | if local_idx == 0 { 178 | break; 179 | } 180 | local_idx -= 1; 181 | } 182 | } 183 | 184 | output 185 | } 186 | 187 | /// Merges th 188 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] 189 | pub enum Direction { 190 | Forward, 191 | Backward, 192 | Any, 193 | } 194 | 195 | impl FromStr for Direction { 196 | type Err = &'static str; 197 | 198 | fn from_str(s: &str) -> Result { 199 | match s.to_lowercase().as_str() { 200 | "forward" => Ok(Direction::Forward), 201 | "backward" => Ok(Direction::Backward), 202 | "any" => Ok(Direction::Any), 203 | _ => Err("Invalid direction string"), 204 | } 205 | } 206 | } 207 | 208 | pub fn nearest( 209 | chrs: &[C], 210 | starts: &[T], 211 | ends: &[T], 212 | chrs2: &[C], 213 | starts2: &[T], 214 | ends2: &[T], 215 | slack: T, 216 | k: usize, 217 | include_overlaps: bool, 218 | direction: &str, 219 | ) -> (Vec, Vec, Vec) { 220 | let dir = Direction::from_str(direction).unwrap(); 221 | 222 | let sorted_starts = build_sorted_events_single_collection_separate_outputs(chrs, starts, slack); 223 | let sorted_ends = build_sorted_events_single_collection_separate_outputs(chrs, ends, slack); 224 | 225 | let sorted_starts2 = build_sorted_events_single_collection_separate_outputs(chrs2, starts2, T::zero()); 226 | let sorted_ends2 = build_sorted_events_single_collection_separate_outputs(chrs2, ends2, T::zero()); 227 | 228 | let overlaps = if include_overlaps { 229 | sweep_line_overlaps_overlap_pair( 230 | &sorted_starts, 231 | &sorted_ends, 232 | &sorted_starts2, 233 | &sorted_ends2, 234 | ) 235 | } else { 236 | Vec::new() 237 | }; 238 | let nearest_left = if dir == Direction::Backward || dir == Direction::Any { 239 | let mut tmp = nearest_intervals_to_the_left(sorted_starts, sorted_ends2, k); 240 | radsort::sort_by_key(&mut tmp, |n| 
(n.idx, n.distance)); 241 | tmp 242 | } else { 243 | Vec::new() 244 | }; 245 | let nearest_right = if dir == Direction::Forward || dir == Direction::Any { 246 | let mut tmp = nearest_intervals_to_the_right(sorted_ends, sorted_starts2, k); 247 | radsort::sort_by_key(&mut tmp, |n| (n.idx, n.distance)); 248 | tmp 249 | } else { 250 | Vec::new() 251 | }; 252 | 253 | let merged = merge_three_way_by_index_distance(&overlaps, &nearest_left, &nearest_right, k); 254 | merged 255 | } 256 | 257 | /// Merges overlaps, left-nearest and right-nearest records, grouped by their `idx` field. 258 | /// For each unique `idx`, it keeps up to `k` *distinct* distances (including 259 | /// all records at those distances). Overlaps are always treated as distance zero. 260 | /// 261 | /// `overlaps` is assumed to be sorted ascending by `idx`, and both nearest lists by `(idx, distance)`. Returns three parallel vectors (`idx`, `idx2`, `distance`) built from records ordered by `sort_by_key` on `(idx, distance, idx2)`. 262 | pub fn merge_three_way_by_index_distance( 263 | overlaps: &[OverlapPair], // sorted by idx 264 | nearest_left: &[Nearest], // sorted by (idx, distance) 265 | nearest_right: &[Nearest], // sorted by (idx, distance) 266 | k: usize, 267 | ) -> (Vec, Vec, Vec) { 268 | // Matches are first collected as `Nearest` records 269 | // and only split into the three output vectors at the very end. 
270 | let mut results = Vec::new(); 271 | 272 | // Cursors into overlaps, nearest_left and nearest_right respectively 273 | let (mut i, mut j, mut r) = (0_usize, 0_usize, 0_usize); 274 | 275 | // Outer loop: one pass per distinct index, always taking the smallest index still pending in any list 276 | while i < overlaps.len() || j < nearest_left.len() || r < nearest_right.len() { 277 | // Index at each cursor (None if that list is exhausted) 278 | let idx_o = overlaps.get(i).map(|o| o.idx); 279 | let idx_l = nearest_left.get(j).map(|n| n.idx); 280 | let idx_r = nearest_right.get(r).map(|n| n.idx); 281 | 282 | // Minimum over whichever cursors are still live; if all three are None, we're done 283 | let current_idx = match (idx_o, idx_l, idx_r) { 284 | (None, None, None) => break, 285 | (Some(a), Some(b), Some(c)) => a.min(b.min(c)), 286 | (Some(a), Some(b), None) => a.min(b), 287 | (Some(a), None, Some(c)) => a.min(c), 288 | (None, Some(b), Some(c)) => b.min(c), 289 | (Some(a), None, None) => a, 290 | (None, Some(b), None) => b, 291 | (None, None, Some(c)) => c, 292 | }; 293 | 294 | // Advance the overlaps cursor past the run of entries with current_idx 295 | let i_start = i; 296 | while i < overlaps.len() && overlaps[i].idx == current_idx { 297 | i += 1; 298 | } 299 | let overlaps_slice = &overlaps[i_start..i]; 300 | 301 | // Advance the nearest_left cursor past the run of entries with current_idx 302 | let j_start = j; 303 | while j < nearest_left.len() && nearest_left[j].idx == current_idx { 304 | j += 1; 305 | } 306 | let left_slice = &nearest_left[j_start..j]; 307 | 308 | // Advance the nearest_right cursor past the run of entries with current_idx 309 | let r_start = r; 310 | while r < nearest_right.len() && nearest_right[r].idx == current_idx { 311 | r += 1; 312 | } 313 | let right_slice = &nearest_right[r_start..r]; 314 | 315 | // Now we have three slices for this index, each expected to be ordered by distance: 316 | // 1) overlaps_slice (every entry counts as distance zero) 317 | // 2) left_slice (sorted ascending by distance) 318 | // 3) right_slice (sorted ascending by distance) 319 | // 320 | // We'll do a 3-way merge *by distance*, collecting up to k *distinct* distances. 
321 | // OverlapPair has only the `idx` and `idx2` fields (see the destructuring below), 322 | // so an overlap has no stored distance and is always reported at distance zero. 323 | 324 | let mut used_distances = std::collections::HashSet::new(); 325 | let mut distinct_count = 0; 326 | 327 | let (mut oi, mut lj, mut rr) = (0, 0, 0); 328 | 329 | // Helper closures to peek the distance at a position in each slice 330 | let overlap_dist = |_ix: usize| -> T { 331 | // The position is ignored: 332 | // every overlap is at distance zero. 333 | T::zero() 334 | }; 335 | let left_dist = |ix: usize| -> T { left_slice[ix].distance }; 336 | let right_dist = |ix: usize| -> T { right_slice[ix].distance }; 337 | 338 | // Inner loop: pick the next *smallest* distance among the three slices 339 | while oi < overlaps_slice.len() || lj < left_slice.len() || rr < right_slice.len() { 340 | // Peek the next distance of each slice (T::max_value() stands in for an exhausted slice) 341 | let d_o = if oi < overlaps_slice.len() { 342 | overlap_dist(oi) 343 | } else { 344 | T::max_value() 345 | }; 346 | let d_l = if lj < left_slice.len() { 347 | left_dist(lj) 348 | } else { 349 | T::max_value() 350 | }; 351 | let d_r = if rr < right_slice.len() { 352 | right_dist(rr) 353 | } else { 354 | T::max_value() 355 | }; 356 | 357 | let smallest = d_o.min(d_l.min(d_r)); 358 | if smallest == T::max_value() { 359 | // no more items. NOTE(review): a genuine distance equal to T::max_value() is indistinguishable from the exhausted sentinel and would be dropped here. 360 | break; 361 | } 362 | 363 | // We'll pull everything from Overlaps that has distance == smallest 364 | while oi < overlaps_slice.len() { 365 | let dcur = overlap_dist(oi); 366 | if dcur == smallest { 367 | // If this is a *new* distance (not in used_distances), 368 | // we check if it would exceed k distinct distances. NOTE(review): with distance-sorted slices `smallest` should grow on every pass, so the set looks redundant with a per-pass counter; confirm before simplifying. 369 | if !used_distances.contains(&dcur) { 370 | distinct_count += 1; 371 | if distinct_count > k { 372 | // this would be the (k+1)-th distinct distance: stop without emitting it 373 | break; 374 | } 375 | used_distances.insert(dcur); 376 | } 377 | // Emit the overlap as a Nearest record with distance zero 378 | let OverlapPair { idx, idx2 } = overlaps_slice[oi]; 379 | results.push(Nearest { 
idx: idx, idx2: idx2, distance: T::zero() }); 380 | oi += 1; 381 | } else { 382 | break; 383 | } 384 | } 385 | if distinct_count > k { 386 | break; 387 | } 388 | 389 | // Pull everything from Left that has distance == smallest (same k-distinct bookkeeping as for overlaps) 390 | while lj < left_slice.len() { 391 | let dcur = left_dist(lj); 392 | if dcur == smallest { 393 | if !used_distances.contains(&dcur) { 394 | distinct_count += 1; 395 | if distinct_count > k { 396 | break; 397 | } 398 | used_distances.insert(dcur); 399 | } 400 | results.push(left_slice[lj]); 401 | lj += 1; 402 | } else { 403 | break; 404 | } 405 | } 406 | if distinct_count > k { 407 | break; 408 | } 409 | 410 | // Pull everything from Right that has distance == smallest (same k-distinct bookkeeping as for overlaps) 411 | while rr < right_slice.len() { 412 | let dcur = right_dist(rr); 413 | if dcur == smallest { 414 | if !used_distances.contains(&dcur) { 415 | distinct_count += 1; 416 | if distinct_count > k { 417 | break; 418 | } 419 | used_distances.insert(dcur); 420 | } 421 | results.push(right_slice[rr]); 422 | rr += 1; 423 | } else { 424 | break; 425 | } 426 | } 427 | if distinct_count > k { 428 | break; 429 | } 430 | } 431 | // done collecting up to k distinct distances for this index; any records left in the three slices are skipped because the outer cursors have already moved past them 432 | } 433 | 434 | sort_by_key(&mut results, |n| (n.idx, n.distance, n.idx2)); 435 | 436 | let mut out_idxs = Vec::with_capacity(results.len()); 437 | let mut out_idxs2 = Vec::with_capacity(results.len()); 438 | let mut out_distances = Vec::with_capacity(results.len()); 439 | 440 | for rec in results { 441 | out_idxs.push(rec.idx); 442 | out_idxs2.push(rec.idx2); 443 | out_distances.push(rec.distance); 444 | } 445 | 446 | (out_idxs, out_idxs2, out_distances) 447 | } 448 | --------------------------------------------------------------------------------