├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── build.sh ├── src ├── cached_faidx.rs ├── main.rs └── processor.rs └── test ├── test.bam ├── test.bam.bai ├── test_cram.fa └── test_cram.fa.fai /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb 15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pbr" 3 | version = "0.2.1" 4 | edition = "2021" 5 | authors = ["Brent Pedersen"] 6 | description = "pileups filtered with lua expressions" 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | anyhow = "1.0.71" 12 | bio = "1.1.0" 13 | clap = {version="4.3.0", features=["derive", "help"]} 14 | mimalloc = "0.1.37" 15 | mlua = {version = 
"0.10.3", features=["luau", "send"]} 16 | perbase = {git = "https://github.com/brentp/perbase", rev="2b96a92"} 17 | #rust-htslib = {git = "https://github.com/brentp/rust-htslib", branch = "faidx-sl", features=["static"]} 18 | rust-htslib = {git = "https://github.com/brentp/rust-htslib", rev = "b130834", features=["static"]} 19 | rust-lapper = "1.1.0" 20 | 21 | [profile.release] 22 | codegen-units=1 23 | 24 | [profile.dev] 25 | opt-level = 1 26 | [profile.dev.package."*"] 27 | opt-level = 3 28 | 29 | 30 | [dev-dependencies] 31 | tempfile = "3.10.1" 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Brent Pedersen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pbr 2 | 3 | [![Rust](https://github.com/brentp/pbr/actions/workflows/rust.yml/badge.svg)](https://github.com/brentp/pbr/actions/workflows/rust.yml) 4 | drunk on [perbase](https://github.com/sstadick/perbase) pileups and [lua](https://github.com/khvzak/mlua/) expressions. 5 | 6 | An example use is in calculating mutation rates. Variant callers have filters such as these 7 | that affect the number of mutations called; we also want to scale the denominator in a similar 8 | way so we can compare rates across samples. 9 | 10 | ## Expression 11 | 12 | The following attributes are available on the `read` object in lua expressions: 13 | 14 | ``` 15 | mapping_quality 16 | flags # integer. must use bit32 lua module (builtin here) to do operations 17 | tid 18 | start 19 | stop 20 | # qpos gives the position of the pileup within the current read; distance_from_[5/3]prime are provided for convenience 21 | qpos 22 | distance_from_5prime 23 | distance_from_3prime 24 | insert_size 25 | qname 26 | bq # base_quality at current site 27 | length # length of the read sequence 28 | sequence 29 | n_proportion_5_prime(bases:number) 30 | n_proportion_3_prime(bases:number) 31 | indel_count 32 | soft_clips_3_prime 33 | soft_clips_5_prime 34 | tag(name: string) 35 | ``` 36 | 37 | An example expression could be: 38 | 39 | ```lua 40 | -- high mapping-q and base-quality for this column 41 | read.mapping_quality > 10 and read.bq > 20 \ 42 | -- and not within 10 bases of left end or right end 43 | and read.distance_from_5prime > 10 and read.distance_from_3prime > 10 \ 44 | -- and exclude read if unmapped, not primary, qc_fail, or duplicate. 
45 | and bit32.band(read.flags, bit32.bor(4, 256, 512, 1024)) == 0 \ 46 | -- and exclude read if it has more than 5% N's in the sequence 47 | and string_count(read.sequence, 'N') < 0.05 * read.length 48 | ``` 49 | 50 | This runs as: 51 | 52 | ``` 53 | pbr $bam "return $expression" > out.pileup 54 | ``` 55 | 56 | where the $expression argument is the lua expression. 57 | 58 | - Note that we can use, e.g. `print(read.qname, read.flags); return $expression` to help with debugging. 59 | - Note that the expression _must_ contain **'return'** 60 | 61 | # Usage 62 | 63 | ``` 64 | pileups filtered with lua expressions 65 | 66 | Usage: pbr [OPTIONS] <BAM_PATH> <EXPRESSION> 67 | 68 | Arguments: 69 | <BAM_PATH> Path to the bamfile 70 | <EXPRESSION> Lua expression to evaluate 71 | 72 | Options: 73 | -t, --threads <THREADS> Number of threads to use [default: 2] 74 | -m, --max-depth <MAX_DEPTH> maximum depth in the pileup [default: 100000] 75 | -b, --bedfile <BEDFILE> optional path to the BED of include regions 76 | -f, --fasta <FASTA> optional path to the reference fasta file 77 | -e, --exclude <EXCLUDE> optional path to BED of exclude regions 78 | --mate-fix adjust depth to not double count overlapping mates 79 | -p, --pile-expression <PILE_EXPRESSION> optional expression required for the pileup 80 | -h, --help Print help 81 | -V, --version Print version 82 | ``` 83 | 84 | ## PileExpression 85 | 86 | Note that the pile-expression is also a lua expression; it is applied to the Pileup (column) rather than to the reads. 87 | The available attributes on the `pile` object are: 88 | 89 | ``` 90 | depth,a,c,g,t,n,fail,ins,del,ref_skip,pos 91 | ``` 92 | 93 | An example --pile-expression would look like: 94 | 95 | ``` 96 | return pile.n / pile.depth < 0.05 97 | ``` 98 | 99 | This requires that fewer than 5% of the reads in the pile are 'N'. Positions that do not pass this expression will **not** be printed. 
100 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | set -eu 2 | 3 | #target=x86_64-unknown-linux-gnu 4 | #export RUSTFLAGS="-C target-feature=-crt-static -C relocation-model=pie" 5 | #cargo test --release --target $target 6 | #RUSTFLAGS="-C target-feature=-crt-static -C relocation-model=pie" cargo build --release --target $target 7 | #ls -lh ./target/$target/release/pbr 8 | #exit 9 | 10 | target=x86_64-unknown-linux-musl 11 | cargo clean 12 | cross build --target=$target --release 13 | ls -lhd ./target/$target/release/pbr 14 | -------------------------------------------------------------------------------- /src/cached_faidx.rs: -------------------------------------------------------------------------------- 1 | use rust_htslib::errors::Result; 2 | use rust_htslib::faidx; 3 | use std::path::Path; 4 | 5 | /// CachedFaidx uses rust-htslib faidx reader 6 | /// and caches the results to reduce disk access. 7 | /// It does not do anything smart but should work well for 8 | /// single consecutive bases as used in pbr. 
9 | pub struct CachedFaidx { 10 | faidx: faidx::Reader, 11 | cache: Vec, 12 | chrom: String, 13 | start: usize, 14 | } 15 | 16 | impl CachedFaidx { 17 | pub fn new>(fasta_path: P) -> Result { 18 | let faidx = faidx::Reader::from_path(fasta_path)?; 19 | let cache = vec![0; 1000]; 20 | Ok(CachedFaidx { 21 | faidx, 22 | cache, 23 | chrom: String::new(), 24 | start: 0, 25 | }) 26 | } 27 | 28 | //pub fn n_seqs(&self) -> u64 { 29 | // self.faidx.n_seqs() 30 | //} 31 | 32 | fn fetch_into_cache>( 33 | &mut self, 34 | chrom: N, 35 | start: usize, 36 | end: usize, 37 | ) -> Result<()> { 38 | let r = self.faidx.fetch_seq(chrom.as_ref(), start, end)?; 39 | self.chrom = String::from(chrom.as_ref()); 40 | self.start = start; 41 | self.cache.clear(); 42 | self.cache.extend_from_slice(&r); 43 | Ok(()) 44 | } 45 | 46 | #[allow(dead_code)] 47 | pub fn fetch_seq_string + std::cmp::PartialEq>( 48 | &mut self, 49 | chrom: N, 50 | start: usize, 51 | end: usize, 52 | ) -> Result { 53 | let bytes = self.fetch_seq(chrom, start, end)?; 54 | Ok(std::str::from_utf8(bytes).unwrap().to_owned()) 55 | } 56 | 57 | pub fn fetch_seq + std::cmp::PartialEq>( 58 | &mut self, 59 | chrom: N, 60 | start: usize, 61 | end: usize, 62 | ) -> Result<&[u8]> { 63 | if chrom.as_ref() == self.chrom 64 | && start >= self.start 65 | && end < self.start + self.cache.len() 66 | { 67 | let cstart = start - self.start; 68 | let cend = end - self.start; 69 | return Ok(&self.cache[cstart..cend + 1]); 70 | } 71 | self.fetch_into_cache(chrom, start, std::cmp::max(end, start + 1000))?; 72 | Ok(&self.cache[0..std::cmp::min(self.cache.len(), (end - start) + 1)]) 73 | } 74 | } 75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use super::*; 79 | 80 | fn open_reader() -> CachedFaidx { 81 | CachedFaidx::new(format!("{}/test/test_cram.fa", env!("CARGO_MANIFEST_DIR"))) 82 | .ok() 83 | .unwrap() 84 | } 85 | #[test] 86 | fn faidx_open() { 87 | open_reader(); 88 | } 89 | 90 | #[test] 91 | fn faidx_read_chr_first_base() { 92 | let mut r = 
open_reader(); 93 | 94 | let bseq = r.fetch_seq("chr1", 0, 0).unwrap(); 95 | assert_eq!(bseq.len(), 1); 96 | assert_eq!(bseq, b"G"); 97 | } 98 | 99 | #[test] 100 | fn faidx_read_chr_start() { 101 | let mut r = open_reader(); 102 | 103 | let bseq = r.fetch_seq("chr1", 0, 9).unwrap(); 104 | assert_eq!(bseq.len(), 10); 105 | assert_eq!(bseq, b"GGGCACAGCC"); 106 | } 107 | 108 | #[test] 109 | fn faidx_read_chr_between() { 110 | let mut r = open_reader(); 111 | 112 | let bseq = r.fetch_seq("chr1", 4, 14).unwrap(); 113 | assert_eq!(bseq.len(), 11); 114 | assert_eq!(bseq, b"ACAGCCTCACC"); 115 | 116 | let seq = r.fetch_seq_string("chr1", 4, 14).unwrap(); 117 | assert_eq!(seq.len(), 11); 118 | assert_eq!(seq, "ACAGCCTCACC"); 119 | } 120 | 121 | #[test] 122 | fn faidx_read_chr_end() { 123 | let mut r = open_reader(); 124 | 125 | let bseq = r.fetch_seq("chr1", 110, 120).unwrap(); 126 | assert_eq!(bseq.len(), 10); 127 | assert_eq!(bseq, b"CCCCTCCGTG"); 128 | } 129 | 130 | #[test] 131 | fn faidx_read_twice_bytes() { 132 | let mut r = open_reader(); 133 | let seq = r.fetch_seq("chr1", 110, 120).unwrap(); 134 | assert_eq!(seq.len(), 10); 135 | assert_eq!(seq, b"CCCCTCCGTG"); 136 | 137 | let seq = r.fetch_seq("chr1", 5, 9).unwrap(); 138 | assert_eq!(seq.len(), 5); 139 | assert_eq!(seq, b"CAGCC"); 140 | } 141 | 142 | #[test] 143 | #[ignore] 144 | fn faidx_position_too_large() { 145 | let mut r = open_reader(); 146 | let position_too_large = i64::MAX as usize; 147 | let res = r.fetch_seq("chr1", position_too_large - 1, position_too_large); 148 | assert_eq!(res, Err(rust_htslib::errors::Error::FaidxPositionTooLarge)); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #[global_allocator] 2 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 3 | 4 | mod cached_faidx; 5 | mod processor; 6 | 7 | use anyhow::Result; 8 | use 
cached_faidx::CachedFaidx; 9 | use clap::Parser; 10 | use processor::{excluded, BasicProcessor}; 11 | 12 | use mlua::prelude::*; 13 | use mlua::{Function, Value}; 14 | use perbase_lib::{ 15 | par_granges::{self, RegionProcessor}, 16 | position::pileup_position::PileupPosition, 17 | read_filter::ReadFilter, 18 | }; 19 | use rust_htslib::bam::{ 20 | self, 21 | pileup::Alignment, 22 | record::{Aux, Cigar, Record}, 23 | Read, 24 | }; 25 | use std::path::PathBuf; 26 | 27 | struct LuaReadFilter<'a> { 28 | lua: &'a Lua, 29 | filter_func: Function, 30 | } 31 | 32 | impl<'a> LuaReadFilter<'a> { 33 | // Create a new LuaReadFilter instance with the given expression 34 | fn new(expression: &str, lua: &'a Lua) -> Result { 35 | let filter_func = lua.load(expression).into_function()?; 36 | lua.register_userdata_type::(|reg| { 37 | reg.add_field_method_get("mapping_quality", |_, this| Ok(this.mapq())); 38 | reg.add_field_method_get("flags", |_, this| Ok(this.flags())); 39 | reg.add_field_method_get("tid", |_, this| Ok(this.tid())); 40 | reg.add_field_method_get("start", |_, this| Ok(this.pos())); 41 | reg.add_field_method_get("stop", |_, this| Ok(this.cigar().end_pos())); 42 | reg.add_field_method_get("length", |_, this| Ok(this.seq_len())); 43 | reg.add_field_method_get("insert_size", |_, this| Ok(this.insert_size())); 44 | reg.add_field_method_get("qname", |_, this| { 45 | let q = this.qname(); 46 | Ok(std::str::from_utf8(q).unwrap_or("").to_string()) 47 | }); 48 | reg.add_field_method_get("sequence", |_, this| { 49 | let seq = this.seq(); 50 | Ok(std::str::from_utf8(&seq.as_bytes()) 51 | .unwrap_or("") 52 | .to_string()) 53 | }); 54 | reg.add_function("qpos", |_, this: mlua::AnyUserData| { 55 | let r: Result = this.named_user_value("qpos"); 56 | r 57 | }); 58 | reg.add_field_function_get("bq", |_, this: mlua::AnyUserData| { 59 | let qpos: usize = match this.named_user_value::("qpos") { 60 | Ok(qpos) => qpos, 61 | Err(_) => { 62 | return Ok(-1); 63 | } 64 | }; 65 | 
this.borrow_scoped::(|r| match qpos { 66 | usize::MAX => -1, 67 | _ => r.qual()[qpos] as i32, 68 | }) 69 | }); 70 | reg.add_field_function_get("distance_from_5prime", |_, this| { 71 | let qpos: usize = match this.named_user_value("qpos") { 72 | Ok(qpos) => qpos, 73 | Err(_) => { 74 | return Ok(-1); 75 | } 76 | }; 77 | this.borrow_scoped::(|r| { 78 | if r.is_reverse() { 79 | r.seq_len() as i32 - qpos as i32 80 | } else { 81 | qpos as i32 82 | } 83 | }) 84 | }); 85 | reg.add_field_function_get("distance_from_3prime", |_, this| { 86 | let qpos: usize = match this.named_user_value("qpos") { 87 | Ok(qpos) => qpos, 88 | Err(_) => { 89 | return Ok(usize::MAX); 90 | } 91 | }; 92 | this.borrow_scoped::(|r| { 93 | if r.is_reverse() { 94 | qpos 95 | } else { 96 | r.seq_len() - qpos 97 | } 98 | }) 99 | }); 100 | 101 | reg.add_method("n_proportion_3_prime", |_, this, n_bases: usize| { 102 | let seq = this.seq(); 103 | let mut count = 0; 104 | let reverse = this.is_reverse(); 105 | for i in 0..n_bases { 106 | let base = 107 | seq[if reverse { i } else { seq.len() - 1 - i }].to_ascii_uppercase(); 108 | if base == b'N' { 109 | count += 1; 110 | } 111 | } 112 | Ok(count as f64 / n_bases as f64) 113 | }); 114 | 115 | reg.add_method("n_proportion_5_prime", |_, this, n_bases: usize| { 116 | let seq = this.seq(); 117 | let mut count = 0; 118 | let reverse = this.is_reverse(); 119 | for i in 0..n_bases { 120 | let base = 121 | seq[if reverse { seq.len() - 1 - i } else { i }].to_ascii_uppercase(); 122 | if base == b'N' { 123 | count += 1; 124 | } 125 | } 126 | Ok(count as f64 / n_bases as f64) 127 | }); 128 | 129 | reg.add_field_method_get("indel_count", |_, this| { 130 | let cigar = this.cigar(); 131 | let mut count = 0; 132 | for op in cigar.iter() { 133 | match op { 134 | Cigar::Ins(_) | Cigar::Del(_) => { 135 | count += 1; 136 | } 137 | _ => {} 138 | } 139 | } 140 | Ok(count) 141 | }); 142 | 143 | reg.add_field_method_get("soft_clips_3_prime", |_, this| { 144 | let cigar = 
this.cigar(); 145 | if this.is_reverse() { 146 | Ok(cigar.leading_softclips()) 147 | } else { 148 | Ok(cigar.trailing_softclips()) 149 | } 150 | }); 151 | reg.add_field_method_get("soft_clips_5_prime", |_, this| { 152 | let cigar = this.cigar(); 153 | if this.is_reverse() { 154 | Ok(cigar.trailing_softclips()) 155 | } else { 156 | Ok(cigar.leading_softclips()) 157 | } 158 | }); 159 | reg.add_field_method_get("average_base_quality", |_, this| { 160 | let qual = this.qual(); 161 | let sum = qual.iter().map(|q| *q as u64).sum::(); 162 | let count = qual.len(); 163 | Ok(sum as f64 / count as f64) 164 | }); 165 | 166 | reg.add_method("tag", |lua, this: &Record, tag: String| { 167 | let tag = tag.as_bytes(); 168 | let aux = this.aux(tag).map_err(LuaError::external)?; 169 | let lua_val: Value = match aux { 170 | Aux::Char(v) => Value::String(lua.create_string(&[v])?), 171 | Aux::I8(v) => Value::Number(v as f64), 172 | Aux::U8(v) => Value::Number(v as f64), 173 | Aux::I16(v) => Value::Number(v as f64), 174 | Aux::U16(v) => Value::Number(v as f64), 175 | Aux::I32(v) => Value::Number(v as f64), 176 | Aux::U32(v) => Value::Number(v as f64), 177 | Aux::Float(v) => Value::Number(v as f64), 178 | Aux::Double(v) => Value::Number(v as f64), 179 | Aux::String(v) => Value::String(lua.create_string(&v)?), 180 | Aux::ArrayFloat(v) => { 181 | let mut arr = Vec::new(); 182 | for i in 0..v.len() { 183 | arr.push(v.get(i).unwrap_or(f32::NAN) as f32); 184 | } 185 | Value::Table(lua.create_sequence_from(arr)?) 186 | } 187 | Aux::ArrayI32(v) => { 188 | let mut arr = Vec::new(); 189 | for i in 0..v.len() { 190 | arr.push(v.get(i).unwrap_or(i32::MIN) as i32); 191 | } 192 | Value::Table(lua.create_sequence_from(arr)?) 193 | } 194 | Aux::ArrayI8(v) => { 195 | let mut arr = Vec::new(); 196 | for i in 0..v.len() { 197 | arr.push(v.get(i).unwrap_or(i8::MIN) as i8); 198 | } 199 | Value::Table(lua.create_sequence_from(arr)?) 
200 | } 201 | Aux::ArrayU8(v) => { 202 | let mut arr = Vec::new(); 203 | for i in 0..v.len() { 204 | arr.push(v.get(i).unwrap_or(u8::MIN) as u8); 205 | } 206 | Value::Table(lua.create_sequence_from(arr)?) 207 | } 208 | Aux::ArrayU16(v) => { 209 | let mut arr = Vec::new(); 210 | for i in 0..v.len() { 211 | arr.push(v.get(i).unwrap_or(u16::MIN) as u16); 212 | } 213 | Value::Table(lua.create_sequence_from(arr)?) 214 | } 215 | Aux::ArrayU32(v) => { 216 | let mut arr = Vec::new(); 217 | for i in 0..v.len() { 218 | arr.push(v.get(i).unwrap_or(u32::MIN) as u32); 219 | } 220 | Value::Table(lua.create_sequence_from(arr)?) 221 | } 222 | Aux::ArrayI16(v) => { 223 | let mut arr = Vec::new(); 224 | for i in 0..v.len() { 225 | arr.push(v.get(i).unwrap_or(i16::MIN) as i16); 226 | } 227 | Value::Table(lua.create_sequence_from(arr)?) 228 | } 229 | Aux::HexByteArray(v) => { 230 | let lstr = String::from_utf8_lossy(v.as_bytes()).to_string(); 231 | Value::String(lua.create_string(&lstr)?) 232 | } 233 | }; 234 | Ok(Some(lua_val)) 235 | }) 236 | })?; 237 | Ok(Self { lua, filter_func }) 238 | } 239 | } 240 | 241 | fn register_pile(lua: &Lua) -> mlua::Result<()> { 242 | lua.register_userdata_type::(|reg| { 243 | reg.add_field_method_get("depth", |_, this| Ok(this.depth)); 244 | reg.add_field_method_get("a", |_, this| Ok(this.a)); 245 | reg.add_field_method_get("c", |_, this| Ok(this.c)); 246 | reg.add_field_method_get("g", |_, this| Ok(this.g)); 247 | reg.add_field_method_get("t", |_, this| Ok(this.t)); 248 | reg.add_field_method_get("n", |_, this| Ok(this.n)); 249 | reg.add_field_method_get("fail", |_, this| Ok(this.fail)); 250 | reg.add_field_method_get("ins", |_, this| Ok(this.ins)); 251 | reg.add_field_method_get("del", |_, this| Ok(this.del)); 252 | reg.add_field_method_get("ref_skip", |_, this| Ok(this.ref_skip)); 253 | reg.add_field_method_get("pos", |_, this| Ok(this.pos)); 254 | }) 255 | } 256 | 257 | impl<'a> ReadFilter for LuaReadFilter<'a> { 258 | /// Filter reads based user 
expression. 259 | #[inline] 260 | fn filter_read(&self, read: &Record, alignment: Option<&Alignment>) -> bool { 261 | let r = self.lua.scope(|scope| { 262 | let globals = self.lua.globals(); 263 | let ud = scope.create_any_userdata_ref(read)?; 264 | ud.set_named_user_value("qpos", alignment.unwrap().qpos().unwrap_or(usize::MAX))?; 265 | 266 | globals.set("read", ud).expect("error setting read"); 267 | 268 | self.filter_func.call::(()) 269 | }); 270 | 271 | match r { 272 | Ok(r) => r, 273 | Err(e) => { 274 | eprintln!("Error evaluating expression: {}", e); 275 | false 276 | } 277 | } 278 | } 279 | } 280 | 281 | impl RegionProcessor for BasicProcessor { 282 | type P = PileupPosition; 283 | 284 | // This function receives an interval to examine. 285 | fn process_region(&self, tid: u32, start: u32, stop: u32) -> Vec { 286 | let mut reader = bam::IndexedReader::from_path(&self.bamfile).expect("Indexed reader"); 287 | let mut fai = if let Some(fasta) = &self.fasta_path { 288 | reader.set_reference(fasta).expect("reference"); 289 | Some(CachedFaidx::new(fasta).expect("error reading fasta")) 290 | } else { 291 | None 292 | }; 293 | 294 | let header = reader.header().to_owned(); 295 | let lua = Lua::new(); 296 | 297 | let rf = LuaReadFilter::new(&self.expression, &lua).unwrap_or_else(|_| { 298 | panic!( 299 | "error creating lua read filter with expression {}", 300 | &self.expression 301 | ) 302 | }); 303 | 304 | let exclude_intervals = self.exclude_regions.as_ref().map(|regions_bed| { 305 | Self::bed_to_intervals(&header, regions_bed, true).expect("BED file") 306 | }); 307 | 308 | let string_count = rf 309 | .lua 310 | .create_function(|_, (haystack, needle): (String, String)| { 311 | assert!(needle.len() == 1); 312 | let needle = needle.chars().next().unwrap(); 313 | Ok(haystack.chars().filter(|c| *c == needle).count()) 314 | }) 315 | .expect("eror creating function"); 316 | rf.lua 317 | .globals() 318 | .set("string_count", string_count) 319 | .expect("error setting 
string_count"); 320 | 321 | // fetch the region 322 | reader.fetch((tid, start, stop)).expect("Fetched ROI"); 323 | // Walk over pileups 324 | let mut p = reader.pileup(); 325 | let chrom = unsafe { std::str::from_utf8_unchecked(header.target_names()[tid as usize]) }; 326 | p.set_max_depth(self.max_depth); 327 | let mut result: Vec = p 328 | .flat_map(|p| { 329 | let pileup = p.expect("Extracted a pileup"); 330 | // Verify that we are within the bounds of the chunk we are iterating on 331 | // Since pileup will pull reads that overhang edges. 332 | if pileup.pos() >= start 333 | && pileup.pos() < stop 334 | // and check if this position is excluded. 335 | && !excluded(&exclude_intervals, &pileup) 336 | { 337 | if self.mate_fix { 338 | Some(PileupPosition::from_pileup_mate_aware( 339 | pileup, &header, &rf, None, 340 | )) 341 | } else { 342 | Some(PileupPosition::from_pileup(pileup, &header, &rf, None)) 343 | } 344 | } else { 345 | None 346 | } 347 | }) 348 | .collect(); 349 | if let Some(fai) = &mut fai { 350 | result.iter_mut().for_each(|p| { 351 | let s = fai 352 | .fetch_seq(chrom, p.pos as usize, (p.pos + 1) as usize) 353 | .expect("error extracting reference base"); 354 | p.ref_base = Some(s[0] as char); 355 | }); 356 | } 357 | result 358 | } 359 | } 360 | 361 | #[derive(Parser, Default, Debug)] 362 | #[command(author, version, about, long_about = None)] 363 | struct Args { 364 | #[arg(help = "Path to the bamfile")] 365 | bam_path: PathBuf, 366 | #[clap(help = "Lua expression to evaluate")] 367 | expression: String, 368 | #[clap(short, long, default_value = "2", help = "Number of threads to use")] 369 | threads: usize, 370 | #[clap( 371 | short, 372 | long, 373 | default_value_t = 100000, 374 | help = "maximum depth in the pileup" 375 | )] 376 | max_depth: u32, 377 | #[clap(short, long, help = "optional path to the BED of include regions")] 378 | bedfile: Option, 379 | #[clap(short, long, help = "optional path to the reference fasta file")] 380 | fasta: 
Option, 381 | #[clap(short, long, help = "optional path to BED of exclude regions")] 382 | exclude: Option, 383 | 384 | #[clap( 385 | long, 386 | help = "adjust depth to not double count overlapping mates", 387 | long_help = "note that for now this is much slower than the default" 388 | )] 389 | mate_fix: bool, 390 | 391 | #[clap(short, long, help = "optional expression required for the pileup")] 392 | pile_expression: Option, 393 | } 394 | 395 | fn main() -> Result<()> { 396 | let opts = Args::parse(); 397 | 398 | if !opts.expression.contains("return") { 399 | eprintln!("Expression '{}' must contain 'return'", opts.expression); 400 | std::process::exit(1); 401 | } 402 | 403 | let basic_processor = BasicProcessor { 404 | bamfile: PathBuf::from(&opts.bam_path), 405 | expression: String::from("") + opts.expression.as_str(), 406 | max_depth: opts.max_depth, 407 | exclude_regions: opts.exclude, 408 | mate_fix: opts.mate_fix, 409 | fasta_path: opts.fasta.clone(), 410 | }; 411 | 412 | let par_granges_runner = par_granges::ParGranges::new( 413 | opts.bam_path, // pass in bam 414 | opts.fasta, // optional ref fasta 415 | opts.bedfile, // bedfile to narrow regions 416 | None, // optional bcf/vcf file to specify positions of interest 417 | true, // Merge any overlapping regions in the BED file 418 | Some(opts.threads), // optional allowed number of threads, defaults to max 419 | None, // optional chunksize modification 420 | None, // optional modifier on the size of the channel for sending Positions 421 | basic_processor, 422 | ); 423 | 424 | let pile_lua = Lua::new(); 425 | register_pile(&pile_lua)?; 426 | 427 | let pile_expression: Option = if let Some(expression) = opts.pile_expression { 428 | Some(pile_lua.load(expression.as_str()).into_function()?) 
429 | } else { 430 | None 431 | }; 432 | 433 | // Run the processor 434 | let receiver = par_granges_runner.process()?; 435 | println!("#chrom\tpos0\tref_base\tdepth\ta\tc\tg\tt\tn"); 436 | // Pull the in-order results from the receiver channel 437 | receiver 438 | .into_iter() 439 | .filter(|p| p.depth > 0) 440 | // filter on the pile expression 441 | .filter(|p| { 442 | if let Some(pile_expression) = &pile_expression { 443 | let r = pile_lua.scope(|scope| { 444 | let globals = pile_lua.globals(); 445 | let ud = scope.create_any_userdata_ref(p)?; 446 | globals.set("pile", ud).expect("error setting pile"); 447 | 448 | pile_expression.call::(()) 449 | }); 450 | match r { 451 | Ok(r) => r, 452 | Err(e) => { 453 | eprintln!("Error evaluating expression: {}", e); 454 | std::process::exit(1); 455 | } 456 | } 457 | } else { 458 | true 459 | } 460 | }) 461 | .for_each(|p: PileupPosition| { 462 | //p:PileupPosition { ref_seq: "chr2", pos: 196, ref_base: None, depth: 1, a: 1, c: 0, g: 0, t: 0, n: 0, ins: 0, del: 0, ref_skip: 0, fail: 1, near_max_depth: false } 463 | println!( 464 | "{chrom}\t{pos}\t{ref_base}\t{depth}\t{a}\t{c}\t{g}\t{t}\t{n}", 465 | chrom = p.ref_seq, 466 | pos = p.pos, 467 | depth = p.depth, 468 | ref_base = p.ref_base.unwrap_or('.'), 469 | a = p.a, 470 | c = p.c, 471 | g = p.g, 472 | t = p.t, 473 | n = p.n 474 | ); 475 | }); 476 | 477 | Ok(()) 478 | } 479 | 480 | #[cfg(test)] 481 | mod tests { 482 | 483 | use super::*; 484 | use mlua::Lua; 485 | use rust_htslib::bam; 486 | use rust_htslib::bam::pileup::Pileup; 487 | use rust_htslib::bam::record::Record; 488 | use rust_htslib::bam::{header::HeaderRecord, Header, HeaderView, IndexedReader, Read}; 489 | use tempfile::NamedTempFile; 490 | 491 | #[test] 492 | fn test_read_bq() -> Result<()> { 493 | // Create a header with chr1 494 | let mut header = Header::new(); 495 | let mut sq = HeaderRecord::new(b"SQ"); 496 | sq.push_tag(b"SN", "chr1"); 497 | sq.push_tag(b"LN", &1000000u32); 498 | 
header.push_record(&sq); 499 | let header_view = HeaderView::from_header(&header); 500 | 501 | // Create a test BAM record using SAM format 502 | let record = Record::from_sam( 503 | &header_view, 504 | b"test_read\t0\tchr1\t100\t30\t4M\t*\t0\t0\tACGT\t&&&&\tRG:Z:test", // Use standard ASCII for qual 505 | ) 506 | .expect("Failed to create record from SAM"); 507 | 508 | // Write record to a temporary BAM file 509 | let tmp = NamedTempFile::new()?; 510 | let path = tmp.path(); 511 | { 512 | let mut writer = bam::Writer::from_path(path, &header, bam::Format::Bam)?; 513 | writer.write(&record)?; 514 | } 515 | bam::index::build(path, None, bam::index::Type::Bai, 1)?; 516 | 517 | // Create pileup 518 | let mut reader = IndexedReader::from_path(path)?; 519 | reader.fetch(("chr1", 100, 101))?; // Fetch the position of the record 520 | let mut pileups = reader.pileup(); 521 | let pileup: Pileup = pileups.next().unwrap().expect("Failed to get pileup"); 522 | 523 | // Get the first alignment from the pileup 524 | let alignment = pileup 525 | .alignments() 526 | .next() 527 | .expect("No alignment found in pileup"); 528 | 529 | let lua = Lua::new(); 530 | let rf = LuaReadFilter::new( 531 | "return read.bq > 0 and read.distance_from_5prime == 0 and read.distance_from_3prime > 0", 532 | &lua, 533 | )?; // Example expression 534 | 535 | // Test the bq functionality using the alignment from the pileup 536 | let result = rf.filter_read(&alignment.record(), Some(&alignment)); 537 | assert!(result); 538 | 539 | Ok(()) 540 | } 541 | 542 | #[test] 543 | fn test_pileup_position() -> mlua::Result<()> { 544 | let pileup_position = PileupPosition { 545 | depth: 10, 546 | a: 1, 547 | c: 2, 548 | g: 3, 549 | t: 4, 550 | n: 5, 551 | fail: 6, 552 | ins: 7, 553 | del: 8, 554 | ref_skip: 9, 555 | pos: 10, 556 | ..Default::default() 557 | }; 558 | 559 | let lua = Lua::new(); 560 | register_pile(&lua)?; 561 | let globals = lua.globals(); 562 | for (expected, expression) in [ 563 | (true, "pile.g 
> 3"), 564 | (true, "pile.a > 0"), 565 | (false, "pile.a > 10"), 566 | (false, "pile.ref_skip == 100"), 567 | (true, "pile.ref_skip == 9"), 568 | ] { 569 | eprintln!("Testing expression: {}", expression); 570 | lua.scope(|scope| { 571 | let p = scope 572 | .create_any_userdata_ref(&pileup_position) 573 | .expect("error creating user data"); 574 | globals.set("pile", p)?; 575 | let f = lua 576 | .load(&(String::from("return ") + expression)) 577 | .into_function()?; 578 | let result: bool = f.call(())?; 579 | Ok(result == expected) 580 | }) 581 | .expect("error evaluating expression"); 582 | } 583 | Ok(()) 584 | } 585 | } 586 | -------------------------------------------------------------------------------- /src/processor.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{anyhow, Context, Result}; 2 | use bio::io::bed; 3 | use rust_htslib::bam::{pileup::Pileup, HeaderView}; 4 | use rust_lapper::{Interval, Lapper}; 5 | 6 | use std::path::PathBuf; 7 | 8 | pub(crate) struct BasicProcessor { 9 | // An indexed bamfile to query for the region we were passed 10 | pub(crate) bamfile: PathBuf, 11 | pub(crate) expression: String, 12 | pub(crate) max_depth: u32, 13 | pub(crate) exclude_regions: Option, 14 | pub(crate) mate_fix: bool, 15 | pub(crate) fasta_path: Option, 16 | } 17 | 18 | impl BasicProcessor { 19 | /// copied verbatim from perbase 20 | pub(crate) fn bed_to_intervals( 21 | header: &HeaderView, 22 | bed_file: &PathBuf, 23 | merge: bool, 24 | ) -> Result>> { 25 | let mut bed_reader = bed::Reader::from_file(bed_file)?; 26 | let mut intervals = vec![vec![]; header.target_count() as usize]; 27 | for (i, record) in bed_reader.records().enumerate() { 28 | let record = record?; 29 | let tid = header 30 | .tid(record.chrom().as_bytes()) 31 | .expect("Chromosome not found in BAM/CRAM header"); 32 | let start = record 33 | .start() 34 | .try_into() 35 | .with_context(|| format!("BED record {} is invalid: unable to parse 
start", i))?; 36 | let stop = record 37 | .end() 38 | .try_into() 39 | .with_context(|| format!("BED record {} is invalid: unable to parse stop", i))?; 40 | if stop < start { 41 | return Err(anyhow!("BED record {} is invalid: stop < start", i)); 42 | } 43 | intervals[tid as usize].push(Interval { 44 | start, 45 | stop, 46 | val: (), 47 | }); 48 | } 49 | 50 | Ok(intervals 51 | .into_iter() 52 | .map(|ivs| { 53 | let mut lapper = Lapper::new(ivs); 54 | if merge { 55 | lapper.merge_overlaps(); 56 | } 57 | lapper 58 | }) 59 | .collect()) 60 | } 61 | } 62 | 63 | #[inline] 64 | pub(crate) fn excluded(exclude_intervals: &Option>>, p: &Pileup) -> bool { 65 | match exclude_intervals { 66 | Some(ref intervals) => { 67 | if p.tid() as usize >= intervals.len() { 68 | return false; 69 | } 70 | let ivs = &intervals[p.tid() as usize]; 71 | let pos = p.pos(); 72 | ivs.count(pos, pos + 1) > 0 73 | } 74 | None => false, 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /test/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brentp/pbr/d1a7ed5c33a8f568bffabf1f9d06298a7ea67d56/test/test.bam -------------------------------------------------------------------------------- /test/test.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brentp/pbr/d1a7ed5c33a8f568bffabf1f9d06298a7ea67d56/test/test.bam.bai -------------------------------------------------------------------------------- /test/test_cram.fa: -------------------------------------------------------------------------------- 1 | >chr1 2 | GGGCACAGCCTCACCCAGGAAAGCAGCTGGGGGTCCACTGGGCTCAGGGAAGACCCCCTG 3 | CCAGGGAGACCCCAGGCGCCTGAATGGCCACGGGAAGGAAAACCTACCAGCCCCTCCGTG 4 | >chr2 5 | AAGAAATAACTGCTAATTTAAAATTGAAGACTTCTGCTCTGCAAAAGACATTGTTAAGAT 6 | AATGAAAAGACAAGCCAAAGACTTGTAGAAAGTATTTGAAAAATAATCTCTGATAAATGG 7 | >chr3 8 | 
CCAACAAGCATTGGTGTGGCATTTCAGTGGAGAAGGAAACTTGGGGGGAAAAAGCCCATC 9 | AAGGTTGTAAGAAGACTCCCAATTTAACTGTCCCTTTCCCTATTTATCCACCATCCAAGA 10 | -------------------------------------------------------------------------------- /test/test_cram.fa.fai: -------------------------------------------------------------------------------- 1 | chr1 120 6 60 61 2 | chr2 120 134 60 61 3 | chr3 120 262 60 61 4 | --------------------------------------------------------------------------------