├── .env ├── .gitattributes ├── .github ├── helpers │ └── vcpkg.json └── workflows │ ├── test-static.yaml │ └── test.yaml ├── .gitignore ├── .vscode ├── launch.json └── tasks.json ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── Justfile ├── LICENSE ├── README.md ├── benches └── mzml.rs ├── cliff.toml ├── crates └── mzdata-spectra │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ └── src │ └── lib.rs ├── cv ├── extract_activation.py ├── extract_component.py ├── extract_cv_metadata.py ├── extract_energy.py ├── extract_file_formats.py ├── extract_native_ids.py ├── extract_software.py ├── psi-ms.obo.gz └── requirements.txt ├── docs ├── img │ ├── denoised_spec.png │ ├── raw_spec.png │ └── to_uri.py ├── reader_tutorial.md ├── spectrum_tutorial.md └── writer_tutorial.md ├── examples ├── async_mzcat.rs ├── averaging_writer.rs ├── compressed_mzml.rs ├── describe_instrument.rs ├── from_stdin.rs ├── get_scan_by.rs ├── infer_format.rs ├── msn_target_mapping.rs ├── mzcat.rs ├── mzconvert.rs ├── mzinfo.rs ├── random_access_iter.rs └── readme.rs ├── pipeconvert.sh ├── src ├── io │ ├── compression.rs │ ├── infer_format │ │ ├── dispatch.rs │ │ ├── inference.rs │ │ ├── mod.rs │ │ └── pipeline.rs │ ├── mgf.rs │ ├── mgf │ │ ├── async_reader.rs │ │ ├── reader.rs │ │ └── writer.rs │ ├── mod.rs │ ├── mzml.rs │ ├── mzml │ │ ├── async_reader.rs │ │ ├── reader.rs │ │ ├── reading_shared.rs │ │ └── writer.rs │ ├── mzmlb.rs │ ├── mzmlb │ │ ├── common.rs │ │ ├── reader.rs │ │ └── writer.rs │ ├── offset_index.rs │ ├── proxi.rs │ ├── shorthand.rs │ ├── tdf │ │ ├── arrays.rs │ │ ├── constants.rs │ │ ├── mod.rs │ │ ├── reader.rs │ │ └── sql.rs │ ├── thermo.rs │ ├── thermo │ │ ├── async_reader.rs │ │ ├── instruments.rs │ │ └── reader.rs │ ├── traits.rs │ ├── traits │ │ ├── chromatogram.rs │ │ ├── frame.rs │ │ ├── spectrum.rs │ │ └── util.rs │ ├── usi.rs │ └── utils.rs ├── lib.rs ├── main.rs ├── meta.rs ├── meta │ ├── activation.rs │ ├── data_processing.rs │ ├── file_description.rs │ ├── instrument.rs │ ├── run.rs │ ├── sample.rs │ ├── software.rs │ └── traits.rs ├── params.rs ├── prelude.rs ├── spectrum.rs ├── spectrum │ ├── bindata.rs │ ├── bindata │ │ ├── array.rs │ │ ├── conversion.rs │ │ ├── encodings.rs │ │ ├── map.rs │ │ └── traits.rs │ ├── chromatogram.rs │ ├── frame.rs │ ├── group.rs │ ├── group │ │ ├── flat.rs │ │ ├── frame.rs │ │ ├── mse_iter.rs │ │ ├── spectrum.rs │ │ └── util.rs │ ├── peaks.rs │ ├── scan_properties.rs │ ├── spectrum_types.rs │ └── utils.rs ├── tutorial.rs ├── tutorial │ ├── reading.rs │ ├── spectrum.rs │ └── writing.rs └── utils.rs └── test └── data ├── 20200204_BU_8B8egg_1ug_uL_7charges_60_min_Slot2-11_1_244.mzML.gz ├── batching_test.mzML ├── diaPASEF.d ├── analysis.tdf └── analysis.tdf_bin ├── im_f64_zstd_base64.txt ├── mz_f64_zlib_bas64.txt ├── processed_batch.mgf.gz ├── read_index_of.mzML ├── small.RAW ├── small.mgf ├── small.mzML ├── small.mzML.gz ├── small.mzMLb └── three_test_scans.mzML /.env: -------------------------------------------------------------------------------- 1 | RUST_LOG=info -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.rs text 4 | *.md text 5 | *.sh text eol=lf 6 | 7 | *.mzML binary 8 | *.mzml binary 9 | *.gz binary 10 | *.mgf binary -------------------------------------------------------------------------------- /.github/helpers/vcpkg.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg.schema.json", 3 | "dependencies": [ 4 | { 5 | "name": "hdf5", 6 | "version>=": "1.10.1" 7 | } 8 | ], 9 | "overrides": [{ "name": "hdf5", "version": "1.10.1" }], 10 | "builtin-baseline": "e57b2167e66c847f991bd6bce1355b85acd944e8" 11 | } 12 | -------------------------------------------------------------------------------- /.github/workflows/test-static.yaml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | env: 4 | RUST_BACKTRACE: full 5 | RUST_LOG: debug 6 | CARGO_PROFILE_TEST_BUILD_OVERRIDE_DEBUG: true 7 | 8 | name: Test (HDF5 Static Compilation) 9 | jobs: 10 | 11 | test: 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, windows-latest] 15 | name: Test Suite 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - name: Checkout sources 19 | uses: actions/checkout@v4 20 | 21 | - name: Install stable toolchain 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | profile: minimal 25 | toolchain: stable 26 | override: true 27 | - name: Run cargo test 28 | uses: actions-rs/cargo@v1 29 | with: 30 | command: test 31 | args: --features hdf5_static,zlib -- --nocapture --show-output -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | env: 4 | RUST_BACKTRACE: full 5 | RUST_LOG: debug 6 | CARGO_PROFILE_TEST_BUILD_OVERRIDE_DEBUG: true 7 | 8 | name: Test 9 | jobs: 10 | 11 | test: 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, windows-latest] 15 | name: Test Suite 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - name: Checkout sources 19 | uses: actions/checkout@v4 20 | 21 | - name: Install stable toolchain 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | profile: minimal 25 | toolchain: stable 26 | override: true 27 | - name: Rust Cache 28 | uses: Swatinem/rust-cache@v2 29 | - name: Run cargo test 30 | uses: actions-rs/cargo@v1 31 | with: 32 | command: test 33 | args: --features nalgebra,parallelism,async,mzsignal,thermo,numpress -- --nocapture --show-output 34 | 35 | test-remote: 36 | name: PROXI Test Suite 37 | runs-on: ubuntu-latest 38 | steps: 39 | - name: Checkout sources 40 | uses: actions/checkout@v4 41 | 42 | - name: Install stable toolchain 43 | uses: actions-rs/toolchain@v1 44 | with: 45 | profile: minimal 46 | toolchain: stable 47 | override: true 48 | - name: Rust Cache 49 | uses: Swatinem/rust-cache@v2 50 | - name: Run cargo test 51 | continue-on-error: true 52 | uses: actions-rs/cargo@v1 53 | with: 54 | command: test 55 | args: --features async,proxi,proxi-async -- --nocapture --show-output 56 | 57 | test-spectra-subset: 58 | name: mzdata-spectra 59 | runs-on: ubuntu-latest 60 | steps: 61 | - name: Checkout sources 62 | uses: actions/checkout@v4 63 | 64 | - name: Install stable toolchain 65 | uses: actions-rs/toolchain@v1 66 | with: 67 | profile: minimal 68 | toolchain: stable 69 | override: true 70 | - name: Rust Cache 71 | uses: Swatinem/rust-cache@v2 72 | 73 | - name: Run cargo test 74 | continue-on-error: true 75 | run: | 76 | cd crates/mzdata-spectra && cargo build --features mzsignal,nalgebra,serde -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | /target 2 | test/data/*.index.json 3 | .vscode/* 4 | Untitled* 5 | test.mzML 6 | tmp.mzML 7 | *.sh 8 | *.py -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "(Windows) Launch", 9 | "type": "cppvsdbg", 10 | "request": "launch", 11 | "program": "${workspaceFolder}/target/debug/mzdata.exe", 12 | "args": [], 13 | "stopAtEntry": false, 14 | "cwd": "${workspaceRoot}", 15 | "environment": [], 16 | "console": "internalConsole" 17 | }, 18 | { 19 | "name": "Run Test Debugger", 20 | "type": "cppvsdbg", 21 | "request": "launch", 22 | "program": "${workspaceRoot}/target/debug/deps/mzdata-d4d3354e8c37294a.exe", 23 | "args": [], 24 | "stopAtEntry": false, 25 | "cwd": "${workspaceRoot}", 26 | "environment": [], 27 | "console": "internalConsole" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "type": "shell", 4 | "label": "cargo test build", 5 | "command": "cargo", 6 | "args": [ 7 | "test", 8 | "--no-run" 9 | ], 10 | "problemMatcher": [ 11 | "$rustc" 12 | ] 13 | } -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mzdata" 3 | version = "0.55.0" 4 | edition = "2021" 5 | keywords = ['mass-spectrometry', 'mzml', 'mgf'] 6 | 7 | categories = ["science", "parser-implementations", "data-structures"] 8 | 9 | description = "A library to read mass spectrometry data formats and a data model for mass spectra" 10 | 11 | license = "Apache-2.0" 12 | 13 | repository = "https://github.com/mobiusklein/mzdata" 14 | documentation = "https://docs.rs/mzdata" 15 | 16 | exclude = ["tmp/*", "test/data/*"] 17 | 18 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 19 | [[bin]] 20 | name = "mzdata" 21 | # src = "src/main.rs" 22 | 23 | [[example]] 24 | name = "async_mzcat" 25 | required-features = ["async"] 26 | 27 | [[example]] 28 | name = "mzcat" 29 | required-features = ["parallelism"] 30 | 31 | [[example]] 32 | name = "averaging_writer" 33 | required-features = ["parallelism", "mzsignal", "nalgebra"] 34 | 35 | [[example]] 36 | name = "random_access_iter" 37 | required-features = ["nalgebra"] 38 | 39 | [lib] 40 | name = "mzdata" 41 | # src = "src/lib.rs" 42 | 43 | [profile.release] 44 | lto = true 45 | debug = true 46 | 47 | [features] 48 | # default = ["nalgebra", "parallelism", "mzsignal", "zlib-ng-compat"] 49 | default = ["zlib-ng-compat", "mgf", "mzml"] 50 | 51 | checksum = ["dep:md5", "dep:sha1", "dep:base16ct"] 52 | 53 | mgf = [] 54 | mzml = ["dep:quick-xml", "checksum", "dep:memchr"] 55 | 56 | 57 | openblas = ["mzsignal", "mzsignal/openblas"] 58 | netlib = ["mzsignal", "mzsignal/netlib"] 59 | intel-mkl = ["mzsignal", "mzsignal/intel-mkl"] 60 | nalgebra = ["mzsignal", "mzsignal/nalgebra"] 61 | parallelism = ["rayon", "mzsignal?/parallelism"] 62 | 63 | # The 
zlib feature makes the code faster in unoptimized builds, but with LTO the default 64 | # algorithm is faster on tests. Odds are that on real data this will vary with the 65 | # size of the data being shuttled back and forth, and on the quality of the data 66 | # (de)compression. 67 | zlib = ["flate2/zlib"] 68 | zlib-ng-compat = ["flate2/zlib-ng-compat"] 69 | zlib-ng = ["flate2/zlib-ng"] 70 | miniz_oxide = ["flate2/rust_backend"] 71 | 72 | # Enables reading mzMLb 73 | mzmlb = ["mzml", "dep:hdf5", "dep:ndarray", "dep:hdf5-sys"] 74 | 75 | # Enable compiling and statically linking HDF5, which requires building 76 | # libz-sys in static mode, which conflicts with flate2/zlib-ng-compat 77 | # but not flate2/zlib 78 | hdf5_static = ["mzmlb", "hdf5-sys/static", "hdf5-sys/zlib", "dep:libz-sys"] 79 | 80 | thermo = [ 81 | "dep:thermorawfilereader", 82 | "thermorawfilereader/net8_0", 83 | "thermorawfilereader/nethost-download", 84 | "checksum", 85 | ] 86 | 87 | bruker_tdf = [ 88 | "dep:timsrust", 89 | "dep:parking_lot", 90 | "dep:rusqlite", 91 | "mzsignal", 92 | "checksum", 93 | ] 94 | 95 | doc-only = [] 96 | 97 | serde = [ 98 | "dep:serde", 99 | "mzpeaks/serde", 100 | "mzsignal?/serde", 101 | "chrono/serde", 102 | "dep:serde_json", 103 | "dep:serde_with", 104 | ] 105 | 106 | async = ["async_partial", "tokio/fs"] 107 | async_partial = [ 108 | "dep:tokio", 109 | "quick-xml/async-tokio", 110 | "dep:futures", 111 | "dep:pin-project-lite", 112 | ] 113 | 114 | proxi = ["dep:reqwest", "serde"] 115 | proxi-async = ["proxi", "dep:futures"] 116 | libloading = ["dep:libloading"] 117 | zstd = ["dep:zstd"] 118 | 119 | [dependencies] 120 | regex = "1" 121 | serde = { version = "1.0", features = ["derive"], optional = true } 122 | serde_json = { version = "1.0", optional = true } 123 | serde_with = { version = "3.12.0", optional = true } 124 | 125 | quick-xml = { version = "0.30", features = ["serialize"], optional = true } 126 | num-traits = "0.2" 127 | log = "0.4.20" 128 | indexmap = { version = "2.0.0", features = ["serde"] } 129 | chrono = "0.4.37" 130 | bitflags = "2.5.0" 131 | identity-hash = "0.1.0" 132 | 133 | thiserror = "2.0.2" 134 | 135 | 136 | mzpeaks = { version = ">=1.0.6,<1.1.0" } 137 | 138 | # Internal parallelism 139 | rayon = { version = ">=1.8.0,<2.0", optional = true } 140 | 141 | # Internal signal processing 142 | mzsignal = { version = ">=1.1.2,<1.2.0", default-features = false, optional = true, features = [ 143 | 'avx', 144 | ] } 145 | 146 | # Checksums and hashing 147 | sha1 = { version = "0.10.6", optional = true } 148 | md5 = { version = "0.7.0", optional = true } 149 | base16ct = { version = "0.2.0", features = ["alloc"], optional = true } 150 | 151 | # Bytes and compression 152 | flate2 = { version = "1.0.20" } 153 | numpress = { version = "1.1.0", optional = true, package = "numpress-rs" } 154 | bytemuck = { version = "1.18.0", features = ["extern_crate_alloc", "min_const_generics"] } 155 | base64-simd = "0.8.0" 156 | 157 | # Async reader features 158 | tokio = { version = "1.42", optional = true, features = ["macros"] } 159 | 160 | # mzMLb-related features 161 | hdf5 = { version = "0.8.1", optional = true, features = ["blosc", "lzf"] } 162 | hdf5-sys = { version = "0.8.1", optional = true } 163 | libz-sys = { version = "1.1", default-features = false, features = [ 164 | "static", 165 | ], optional = true } 166 | ndarray = { version = "0.15.6", optional = true } 167 | filename = { version = "0.1.1", optional = true } 168 | 169 | # PROXI-related behaviors 170 | reqwest = { version = "0.12", features
= ["json", "blocking"], optional = true } 171 | futures = { version = "0.3", optional = true } 172 | 173 | # Thermo RAW-related features 174 | thermorawfilereader = { version = "0.5.1", default-features = false, optional = true } 175 | 176 | # Bruker TDF-related features 177 | rusqlite = { version = "0.31.0", optional = true } 178 | timsrust = { version = "0.4.1", default-features = false, features = [ 179 | "tdf", 180 | ], optional = true } 181 | 182 | parking_lot = { version = "0.12.3", optional = true } 183 | pin-project-lite = { version = "0.2.16", optional = true } 184 | memchr = { version = "2.7.4", optional = true } 185 | libloading = { version = "0.8.6", optional = true } 186 | zstd = { version = "0.13.3", optional = true } 187 | 188 | 189 | [dev-dependencies] 190 | criterion = { version = "0.5.1", features = ["html_reports"] } 191 | test-log = "0.2.12 " 192 | env_logger = "0.11.6" 193 | tempfile = "3.10" 194 | clap = { version = "4.4.11", features = ["derive"] } 195 | tokio = { version = "1.42", features = ["macros", "fs", "rt-multi-thread"] } 196 | 197 | 198 | [[bench]] 199 | name = "mzml" 200 | harness = false 201 | 202 | 203 | [package.metadata.docs.rs] 204 | features = [ 205 | "mzml", 206 | "mgf", 207 | "parallelism", 208 | "mzsignal", 209 | "nalgebra", 210 | "mzmlb", 211 | "async", 212 | "proxi", 213 | "proxi-async", 214 | # "thermo", 215 | "doc-only", 216 | "bruker_tdf", 217 | ] 218 | no-default-features = true 219 | -------------------------------------------------------------------------------- /Justfile: -------------------------------------------------------------------------------- 1 | set dotenv-load := true 2 | 3 | test-units: 4 | cargo nextest run --lib --features nalgebra,parallelism,mzsignal,zlib-ng-compat,thermo,async,numpress 5 | 6 | test-coverage: 7 | cargo llvm-cov --lib --tests nextest --features nalgebra,parallelism,mzsignal,zlib-ng-compat,thermo,mzmlb,async,numpress --html 8 | 9 | alias t := test-units 10 | 11 | test-units-more: 12 | cargo nextest run --lib --features nalgebra,parallelism,mzsignal,zlib-ng-compat,thermo,async,numpress 13 | 14 | quick-docs: 15 | cargo doc --no-deps -p mzdata 16 | 17 | docs: 18 | cargo doc --no-deps --features nalgebra,parallelism,mzsignal,mzmlb,zlib-ng-compat,thermo,async,proxi,bruker_tdf -p mzdata -p mzsignal -p mzpeaks 19 | 20 | install-mzdata: 21 | cargo install --path . 
--features nalgebra,parallelism,mzsignal,mzmlb,zlib-ng-compat,hdf5_static 22 | 23 | update-cv: 24 | curl --insecure \ 25 | --location \ 26 | https://github.com/HUPO-PSI/psi-ms-CV/releases/latest/download/psi-ms.obo | gzip -c > cv/psi-ms.obo.gz 27 | 28 | gzip -d -c cv/psi-ms.obo.gz | head -n 5 29 | 30 | update-cv-terms: 31 | cog -c -r -U src/meta/software.rs src/meta/instrument.rs src/meta/file_description.rs src/io/mzml/writer.rs src/meta/activation.rs 32 | 33 | changelog version: 34 | #!/usr/bin/env python 35 | 36 | import subprocess 37 | import re 38 | 39 | new_content = subprocess.check_output(['git', 'cliff', '-s', 'all', '-u', '-t', '{{version}}'], stderr=subprocess.DEVNULL).decode() 40 | 41 | new_version = "{{version}}" 42 | 43 | buffer = open('CHANGELOG.md').read() 44 | 45 | buffer = buffer.replace("## ", f"{new_content}## ", 1).splitlines() 46 | 47 | offset = buffer.index("") + 1 48 | line_to_patch = buffer[offset + 1] 49 | previous_version = re.search(r"(v\d+\.\d+\.\d+[^\.]*)", line_to_patch).group(1) 50 | buffer[offset] = re.sub(r"v\d+\.\d+\.\d+[^\.]*", new_version, line_to_patch) 51 | 52 | version_link_template = buffer[offset + 2] 53 | version_link_template = re.sub( 54 | r"\d+\.\d+\.\d+[^\.]*(?=\])", new_version[1:], version_link_template 55 | ) 56 | version_link_template = version_link_template.rsplit("/", 1)[0] + f"/{previous_version}...{new_version}" 57 | buffer[offset + 1] = version_link_template 58 | 59 | buffer.insert(offset, '') 60 | buffer = '\n'.join(buffer) 61 | open('CHANGELOG.md', 'wt').write(buffer) 62 | print(buffer) 63 | 64 | 65 | release tag: (patch-version) (changelog tag) 66 | #!/usr/bin/env bash 67 | 68 | git add crates 69 | git add CHANGELOG.md 70 | git commit -m "chore: update changelog" 71 | git tag {{tag}} 72 | 73 | cargo publish 74 | cd crates/mzdata-spectra && cargo publish 75 | 76 | 77 | patch-version: 78 | #!/usr/bin/env python 79 | import sys 80 | import re 81 | 82 | 83 | ref_toml = "Cargo.toml" 84 | target_toml = "crates/mzdata-spectra/Cargo.toml" 85 | 86 | pattern = re.compile(r"^version\s*=\s*\"(.+?)\"") 87 | dep_pattern = re.compile(r"version\s*=\s*\"(.+?)\"") 88 | 89 | version = None 90 | 91 | with open(ref_toml) as fh: 92 | for line in fh: 93 | if match := pattern.match(line): 94 | version = match.string 95 | break 96 | 97 | if not version: 98 | raise ValueError("Version not found in reference") 99 | 100 | 101 | buffer = [] 102 | with open(target_toml) as fh: 103 | for line in fh: 104 | if pattern.match(line): 105 | line = version 106 | if line.startswith("mzdata"): 107 | line = dep_pattern.sub(version.strip(), line) 108 | 109 | buffer.append(line.strip()) 110 | 111 | with open(target_toml, 'w') as fh: 112 | fh.write('\n'.join(buffer)) 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mzdata 2 | [![Latest Version](https://img.shields.io/crates/v/mzdata?style=for-the-badge&color=mediumpurple&logo=rust)](https://crates.io/crates/mzdata) 3 | [![docs.rs](https://img.shields.io/docsrs/mzdata?style=for-the-badge&logo=docs.rs&color=mediumseagreen)](https://docs.rs/mzdata/latest/mzdata/) 4 | 5 | A Rust library for reading mass spectrometry data file formats. 
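To pull it into a project, add it to your `Cargo.toml` (a minimal sketch; the version shown matches this repository's manifest, and optional capabilities such as `mzmlb`, `thermo`, or `bruker_tdf` are enabled through Cargo features):

```toml
[dependencies]
mzdata = "0.55"
```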
6 | 7 | ## Quickstart 8 | ```rust 9 | use std::fs; 10 | use mzdata::prelude::*; 11 | use mzpeaks::Tolerance; 12 | use mzdata::MzMLReader; 13 | use mzdata::spectrum::SignalContinuity; 14 | 15 | fn main() { 16 | let mut ms1_count = 0; 17 | let mut msn_count = 0; 18 | let reader = MzMLReader::new(fs::File::open("./test/data/small.mzML").unwrap()); 19 | for spectrum in reader { 20 | if spectrum.ms_level() == 1 { 21 | ms1_count += 1; 22 | } else { 23 | msn_count += 1; 24 | } 25 | println!("Scan {} => BP {}", spectrum.id(), spectrum.peaks().base_peak().mz); 26 | if spectrum.signal_continuity() == SignalContinuity::Centroid { 27 | let peak_picked = spectrum.into_centroid().unwrap(); 28 | println!("Matches for 579.155: {:?}", peak_picked.peaks.all_peaks_for(579.155, Tolerance::Da(0.02))); 29 | } 30 | } 31 | println!("MS1 Count: {}\nMSn Count: {}", ms1_count, msn_count); 32 | assert_eq!(ms1_count, 14); 33 | assert_eq!(msn_count, 34); 34 | } 35 | 36 | 37 | ``` 38 | 39 | ## Supported Formats 40 | 1. `mzML` and `indexedmzML` 41 | 2. `MGF` 42 | 3. `mzMLb` 43 | 4. Thermo RAW 5. Bruker TDF (with the `bruker_tdf` feature) 44 | 45 | ## Disclaimer 46 | This library was made in part to learn Rust, so it may not use the preferred idioms, 47 | patterns, or libraries. Any recommendations are welcome. -------------------------------------------------------------------------------- /benches/mzml.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 2 | 3 | use std::fs; 4 | 5 | use mzdata::io::mzml::MzMLReader; 6 | use mzdata::prelude::*; 7 | 8 | fn serial(file_path: &str) { 9 | let file = fs::File::open(file_path).unwrap(); 10 | let reader = MzMLReader::new(file); 11 | let total: usize = reader.map(|s| s.arrays.unwrap().mzs().unwrap().len()).sum(); 12 | assert_eq!(total, 305213); 13 | } 14 | 15 | fn serial_with_index(file_path: &str) { 16 | let file = fs::File::open(file_path).unwrap(); 17 | let reader = MzMLReader::new_indexed(file); 18 | let total: usize = reader.map(|s| s.arrays.unwrap().mzs().unwrap().len()).sum(); 19 | assert_eq!(total, 305213); 20 | } 21 | 22 | fn serial_with_external_iterator(file_path: &str) { 23 | let file = fs::File::open(file_path).unwrap(); 24 | let mut reader = MzMLReader::new_indexed(file); 25 | let total: usize = reader 26 | .iter() 27 | .map(|s| s.arrays.unwrap().mzs().unwrap().len()) 28 | .sum(); 29 | assert_eq!(total, 305213); 30 | } 31 | 32 | fn mzml_totaling(c: &mut Criterion) { 33 | c.bench_function("serial_execution", |b| { 34 | b.iter(|| serial(black_box("./test/data/small.mzML"))) 35 | }); 36 | c.bench_function("serial_execution_with_index", |b| { 37 | b.iter(|| serial_with_index(black_box("./test/data/small.mzML"))) 38 | }); 39 | c.bench_function("serial_with_external_iterator", |b| { 40 | b.iter(|| serial_with_external_iterator(black_box("./test/data/small.mzML"))) 41 | }); 42 | } 43 | 44 | criterion_group!(benches, mzml_totaling); 45 | criterion_main!(benches); 46 | -------------------------------------------------------------------------------- /cliff.toml: -------------------------------------------------------------------------------- 1 | # git-cliff ~ configuration file 2 | # https://git-cliff.org/docs/configuration 3 | 4 | [changelog] 5 | # changelog header 6 | header = """ 7 | # Changelog\n 8 | All notable changes to this project will be documented in this file.
9 | 10 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 11 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n 12 | """ 13 | # template for the changelog body 14 | # https://keats.github.io/tera/docs/#introduction 15 | body = """ 16 | {% if version -%} 17 | ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} 18 | {% else -%} 19 | ## [Unreleased] 20 | {% endif -%} 21 | {% for group, commits in commits | group_by(attribute="group") %} 22 | ### {{ group | upper_first }} 23 | {% for commit in commits %} 24 | - {{ commit.message | upper_first }}\ 25 | {% if commit.body %} 26 | {{ commit.body }}\ 27 | {% endif %}\ 28 | {% endfor %} 29 | {% endfor %}\n 30 | """ 31 | # template for the changelog footer 32 | footer = """ 33 | {% for release in releases -%} 34 | {% if release.version -%} 35 | {% if release.previous.version -%} 36 | [{{ release.version | trim_start_matches(pat="v") }}]: \ 37 | https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }}\ 38 | /compare/{{ release.previous.version }}..{{ release.version }} 39 | {% endif -%} 40 | {% else -%} 41 | [unreleased]: https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }}\ 42 | /compare/{{ release.previous.version }}..HEAD 43 | {% endif -%} 44 | {% endfor %} 45 | 46 | """ 47 | # remove the leading and trailing whitespace from the templates 48 | trim = true 49 | 50 | [git] 51 | # parse the commits based on https://www.conventionalcommits.org 52 | conventional_commits = true 53 | # filter out the commits that are not conventional 54 | filter_unconventional = true 55 | # process each line of a commit as an individual commit 56 | split_commits = false 57 | # regex for parsing and grouping commits 58 | commit_parsers = [ 59 | { message = "^chore:", skip = true }, 60 | { message = "^.*: a|Add", group = "Added" }, 61 | { message = "^.*: S|support", group = "Added" }, 62 | { message = "^.*: R|remove", group = "Removed" }, 63 | { message = "^.*: D|delete", group = "Removed" }, 64 | { message = "^test", group = "Fixed" }, 65 | { message = "^f|Fix", group = "Fixed" }, 66 | { message = "^.*: fix", group = "Fixed" }, 67 | { message = "^doc", group = "Documentation" }, 68 | { message = "^.*: C|change(d?)*", group = "Changed" }, 69 | { message = "^.*: U|upgrade(d?)*", group = "Changed" }, 70 | ] 71 | # protect breaking changes from being skipped due to matching a skipping commit_parser 72 | protect_breaking_commits = false 73 | # filter out the commits that are not matched by commit parsers 74 | filter_commits = true 75 | # regex for matching git tags 76 | tag_pattern = "v[0-9].*" 77 | # regex for skipping tags 78 | skip_tags = "v0.1.0-beta.1" 79 | # regex for ignoring tags 80 | ignore_tags = "" 81 | # sort the tags topologically 82 | topo_order = false 83 | # sort the commits inside sections by oldest/newest order 84 | sort_commits = "oldest" -------------------------------------------------------------------------------- /crates/mzdata-spectra/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | Cargo.lock -------------------------------------------------------------------------------- /crates/mzdata-spectra/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mzdata-spectra" 3 | version = "0.54.0" 4 | edition = "2021" 5 | 6 | keywords = ['mass-spectrometry'] 7 | 8 | license = "Apache-2.0" 9 | 
repository = "https://github.com/mobiusklein/mzdata" 10 | documentation = "https://docs.rs/mzdata" 11 | 12 | description = "A subset of `mzdata`'s traits and spectrum data model" 13 | 14 | categories = ["science", "data-structures"] 15 | 16 | [features] 17 | 18 | mzsignal = ["mzdata/mzsignal"] 19 | openblas = ["mzdata/openblas"] 20 | netlib = ["mzdata/netlib"] 21 | intel-mkl = ["mzdata/intel-mkl"] 22 | nalgebra = ["mzdata/nalgebra"] 23 | 24 | parallelism = ["mzdata/parallelism"] 25 | 26 | serde = ["mzdata/serde"] 27 | 28 | 29 | [dependencies] 30 | mzdata = { path = "../../", default-features = false, features = [], version = "0.54.0" } -------------------------------------------------------------------------------- /crates/mzdata-spectra/README.md: -------------------------------------------------------------------------------- 1 | # mzdata-spectra 2 | 3 | [![Latest Version](https://img.shields.io/crates/v/mzdata?style=for-the-badge&color=mediumpurple&logo=rust)](https://crates.io/crates/mzdata-spectra) 4 | [![docs.rs](https://img.shields.io/docsrs/mzdata?style=for-the-badge&logo=docs.rs&color=mediumseagreen)](https://docs.rs/mzdata/latest/mzdata/) 5 | 6 | 7 | This re-exports a subset of the [`mzdata`](https://github.com/mobiusklein/mzdata) crate, specifically 8 | by not enabling the default features that include the mzML and MGF reading and writing components. 9 | 10 | This cuts the minimum number of dependencies considerably, but retains the metadata mappings and 11 | spectrum data model, as well as all the traits and "helper" types under `mzdata::io`. For more detail, 12 | please see `mzdata`'s source code and documentation. -------------------------------------------------------------------------------- /crates/mzdata-spectra/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! `mzdata` provides basic access to raw and processed mass spectrometry data formats in 2 | //! Rust. 3 | //! 4 | //! `mzdata` requires a lot of parsing machinery which can make it a weighty dependency
6 | 7 | pub use mzdata::*; 8 | 9 | -------------------------------------------------------------------------------- /cv/extract_activation.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import io 4 | import itertools 5 | import re 6 | 7 | from typing import Tuple, Dict, Set, List 8 | 9 | import fastobo 10 | from fastobo.term import ( 11 | TermFrame, 12 | IsAClause, 13 | NameClause, 14 | DefClause, 15 | ) 16 | 17 | from fastobo.doc import OboDoc 18 | 19 | from fastobo.id import PrefixedIdent 20 | 21 | ROOT_TERM = PrefixedIdent("MS", "1000044") 22 | 23 | segment_pattern = re.compile(r"(_[a-zA-Z])") 24 | 25 | 26 | def collect_components( 27 | cv: OboDoc, base_term: PrefixedIdent 28 | ) -> Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]: 29 | term: TermFrame 30 | id_to_clause = {} 31 | component_ids = {base_term} 32 | # Make multiple passes 33 | for term in itertools.chain(cv, cv): 34 | id_to_clause[term.id] = term 35 | for clause in term: 36 | if isinstance(clause, IsAClause): 37 | if clause.term in component_ids: 38 | component_ids.add(term.id) 39 | return component_ids, id_to_clause 40 | 41 | 42 | def format_name(match: re.Match) -> str: 43 | return match.group(1)[-1].upper() 44 | 45 | 46 | def find_name(term: TermFrame): 47 | for clause in term: 48 | if isinstance(clause, NameClause): 49 | name = str(clause.name) 50 | return name 51 | else: 52 | raise LookupError(f"Term name not found for {term.id!s}") 53 | 54 | 55 | def make_entry_for(term: TermFrame): 56 | name = None 57 | parents = [] 58 | descr = "" 59 | for clause in term: 60 | if isinstance(clause, NameClause): 61 | name = str(clause.name) 62 | if isinstance(clause, IsAClause): 63 | parents.append(str(clause.term)) 64 | if isinstance(clause, DefClause): 65 | descr = re.sub( 66 | r"(\[|\])", 67 | lambda m: "\\\\" + m.group(1), 68 | str(clause.definition).replace('"', "'"), 69 | ) 70 | 71 | vname = name 72 | if "-" in vname: 73 | vname = vname.replace("-", "_") 74 | if ":" in vname: 75 | vname = vname.replace(":", "_") 76 | if "/" in vname: 77 | vname = vname.replace("/", "_") 78 | if "+" in vname: 79 | vname = vname.replace("+", "plus") 80 | if "!" 
in vname: 81 | vname = vname.replace("!", "_") 82 | 83 | vname = segment_pattern.sub(format_name, vname.replace(" ", "_")) 84 | vname = vname[0].upper() + vname[1:] 85 | 86 | if vname[0].isdigit(): 87 | vname = "_" + vname 88 | 89 | return f""" 90 | #[term(cv=MS, accession={term.id.local}, name="{name}", flags={{{0}}}, parents={{{json.dumps(parents)}}})] 91 | #[doc = "{name} - {descr}"] 92 | {vname},""" 93 | 94 | 95 | def generate_term_enum(terms: List[TermFrame], type_name: str): 96 | buffer = io.StringIO() 97 | buffer.write("pub enum $Term {".replace("$", type_name)) 98 | for term in terms: 99 | buffer.write(make_entry_for(term)) 100 | buffer.write("\n}") 101 | return buffer.getvalue() 102 | 103 | 104 | def main(): 105 | cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz")) 106 | term_ids, id_to_clause = collect_components(cv, ROOT_TERM) 107 | t = find_name(id_to_clause[ROOT_TERM]) 108 | type_name = t.title().replace(" ", "") 109 | 110 | term_specs = list(map(id_to_clause.get, sorted(term_ids))) 111 | text = generate_term_enum(term_specs, type_name) 112 | print(text) 113 | 114 | 115 | if __name__ == "__main__": 116 | main() -------------------------------------------------------------------------------- /cv/extract_component.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import json 4 | import io 5 | import itertools 6 | import re 7 | 8 | from enum import IntFlag 9 | from typing import Tuple, Dict, Set, List 10 | 11 | import fastobo 12 | from fastobo.term import TermFrame, IsAClause, NameClause, RelationshipClause, DefClause 13 | 14 | from fastobo.doc import OboDoc 15 | 16 | from fastobo.id import PrefixedIdent 17 | 18 | segment_pattern = re.compile(r"(_[a-zA-Z])") 19 | 20 | class ValueType(IntFlag): 21 | NoType = 0 22 | String = 0b00000001 23 | Integer = 0b00000010 24 | Float = 0b00000100 25 | Double = 0b00001000 26 | NonNegativeInteger = 0b00010000 27 | PositiveInteger = 0b00100000 28 | DateTime = 0b01000000 29 | Boolean = 0b10000000 30 | 31 | ListOf = 0b1000000000000000 32 | 33 | 34 | xsd_to_type = { 35 | "xsd:int": ValueType.Integer, 36 | "xsd:integer": ValueType.Integer, 37 | "xsd:string": ValueType.String, 38 | "xsd:float": ValueType.Float, 39 | "xsd:double": ValueType.Double, 40 | "xsd:nonNegativeInteger": ValueType.NonNegativeInteger, 41 | "xsd:positiveInteger": ValueType.PositiveInteger, 42 | "xsd:dateTime": ValueType.DateTime, 43 | "xsd:boolean": ValueType.Boolean, 44 | } 45 | 46 | 47 | COMPONENT_TO_ENUM = { 48 | "mass-analyzer": 'MassAnalyzer', 49 | "ionization-type": 'IonizationType', 50 | "inlet-type": "InletType", 51 | "detector-type": "DetectorType", 52 | "collision-energy": "CollisionEnergy" 53 | } 54 | 55 | 56 | COMPONENT_TO_TERM = { 57 | "mass-analyzer": PrefixedIdent("MS", "1000443"), 58 | "ionization-type": PrefixedIdent("MS", "1000008"), 59 | "inlet-type": PrefixedIdent("MS", "1000007"), 60 | "detector-type": PrefixedIdent("MS", "1000026"), 61 | "collision-energy": PrefixedIdent("MS", "1000045"), 62 | } 63 | 64 | 65 | def make_parser(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument( 68 | "component", 69 | choices=[ 70 | "mass-analyzer", 71 | "ionization-type", 72 | "inlet-type", 73 | "detector-type", 74 | "collision-energy", 75 | "-", 76 | ], 77 | ) 78 | parser.add_argument("-c", "--curie") 79 | parser.add_argument("-t", "--type-name") 80 | return parser 81 | 82 | 83 | def collect_components( 84 | cv: OboDoc, 85 | base_term: PrefixedIdent 86 | ) -> 
Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]: 87 | term: TermFrame 88 | id_to_clause = {} 89 | component_ids = {base_term} 90 | # Make multiple passes 91 | for term in itertools.chain(cv, cv): 92 | id_to_clause[term.id] = term 93 | for clause in term: 94 | if isinstance(clause, IsAClause): 95 | if clause.term in component_ids: 96 | component_ids.add(term.id) 97 | return component_ids, id_to_clause 98 | 99 | 100 | def format_name(match: re.Match) -> str: 101 | return match.group(1)[-1].upper() 102 | 103 | 104 | def find_name(term: TermFrame): 105 | for clause in term: 106 | if isinstance(clause, NameClause): 107 | name = str(clause.name) 108 | return name 109 | else: 110 | raise LookupError(f"Term name not found for {term.id!s}") 111 | 112 | 113 | def make_entry_for(term: TermFrame): 114 | name = None 115 | flags = ValueType.NoType 116 | descr = "" 117 | parents = [] 118 | for clause in term: 119 | if isinstance(clause, NameClause): 120 | name = str(clause.name) 121 | if isinstance(clause, IsAClause): 122 | parents.append(str(clause.term)) 123 | if isinstance(clause, RelationshipClause): 124 | if str(clause.typedef) == 'has_value_type': 125 | flags |= xsd_to_type[str(clause.term)] 126 | if isinstance(clause, DefClause): 127 | descr = re.sub( 128 | r"(\[|\])", 129 | lambda m: "\\\\" + m.group(1), 130 | str(clause.definition).replace('"', "'"), 131 | ) 132 | 133 | vname = name 134 | if "-" in vname: 135 | vname = vname.replace("-", "_") 136 | if ":" in vname: 137 | vname = vname.replace(":", "_") 138 | if "/" in vname: 139 | vname = vname.replace("/", "_") 140 | if "+" in vname: 141 | vname = vname.replace("+", "plus") 142 | if "!" in vname: 143 | vname = vname.replace("!", "_") 144 | 145 | vname = segment_pattern.sub( 146 | format_name, vname.replace(" ", "_") 147 | ) 148 | vname = vname[0].upper() + vname[1:] 149 | 150 | if vname[0].isdigit(): 151 | vname = "_" + vname 152 | 153 | return f""" 154 | #[term(cv=MS, accession={term.id.local}, name="{name}", flags={{{int(flags)}}}, parents={{{json.dumps(parents)}}})] 155 | #[doc="{name} - {descr}"] 156 | {vname},""" 157 | 158 | 159 | def generate_term_enum(terms: List[TermFrame], type_name: str): 160 | buffer = io.StringIO() 161 | buffer.write("pub enum $Term {".replace("$", type_name)) 162 | for term in terms: 163 | buffer.write(make_entry_for(term)) 164 | buffer.write("\n}") 165 | return buffer.getvalue() 166 | 167 | 168 | def main(): 169 | parser = make_parser() 170 | args = parser.parse_args() 171 | component = args.component 172 | 173 | if component == '-': 174 | term = PrefixedIdent(*args.curie.split(":")) 175 | type_name = args.type_name 176 | else: 177 | term = COMPONENT_TO_TERM[component] 178 | type_name = COMPONENT_TO_ENUM[component] 179 | 180 | cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz")) 181 | component_ids, id_to_clause = collect_components(cv, term) 182 | if type_name is None: 183 | t = find_name(id_to_clause[term]) 184 | type_name = t.title().replace(" ", "") 185 | 186 | term_specs = list(map(id_to_clause.get, sorted(component_ids))) 187 | text = generate_term_enum(term_specs, type_name) 188 | print(text) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() -------------------------------------------------------------------------------- /cv/extract_cv_metadata.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import argparse 3 | 4 | import fastobo 5 | 6 | from fastobo.doc import OboDoc 7 | from fastobo.header import DataVersionClause 8 | 9 | 10 
| 11 | def make_parser(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('clause', choices=["data-version"]) 14 | return parser.parse_args() 15 | 16 | 17 | def main(): 18 | args = make_parser() 19 | cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz")) 20 | 21 | clause = args.clause 22 | 23 | target_type = None.__class__ 24 | if clause == 'data-version': 25 | target_type = DataVersionClause 26 | 27 | for clause in cv.header: 28 | if isinstance(clause, target_type): 29 | print(clause.raw_value()) 30 | 31 | 32 | if __name__ == "__main__": 33 | main() -------------------------------------------------------------------------------- /cv/extract_energy.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import io 4 | import itertools 5 | import re 6 | 7 | from typing import Tuple, Dict, Set, List 8 | 9 | import fastobo 10 | from fastobo.term import ( 11 | TermFrame, 12 | IsAClause, 13 | NameClause, 14 | DefClause, 15 | ) 16 | 17 | from fastobo.doc import OboDoc 18 | 19 | from fastobo.id import PrefixedIdent 20 | 21 | ROOT_TERM = PrefixedIdent("MS", "1000045") 22 | EXTRA_ROOTS = [ 23 | PrefixedIdent("MS", "1000138"), 24 | PrefixedIdent("MS", "1002680"), 25 | PrefixedIdent("MS", "1003410") 26 | ] 27 | 28 | segment_pattern = re.compile(r"(_[a-zA-Z])") 29 | 30 | 31 | def collect_components( 32 | cv: OboDoc, base_term: PrefixedIdent 33 | ) -> Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]: 34 | term: TermFrame 35 | id_to_clause = {} 36 | component_ids = {base_term} 37 | # Make multiple passes 38 | for term in itertools.chain(cv, cv): 39 | id_to_clause[term.id] = term 40 | for clause in term: 41 | if isinstance(clause, IsAClause): 42 | if clause.term in component_ids: 43 | component_ids.add(term.id) 44 | return component_ids, id_to_clause 45 | 46 | 47 | def format_name(match: re.Match) -> str: 48 | return match.group(1)[-1].upper() 49 | 50 | 51 | def find_name(term: TermFrame): 52 | for clause in term: 53 | if isinstance(clause, NameClause): 54 | name = str(clause.name) 55 | return name 56 | else: 57 | raise LookupError(f"Term name not found for {term.id!s}") 58 | 59 | 60 | def make_entry_for(term: TermFrame): 61 | name = None 62 | parents = [] 63 | descr = "" 64 | for clause in term: 65 | if isinstance(clause, NameClause): 66 | name = str(clause.name) 67 | if isinstance(clause, IsAClause): 68 | parents.append(str(clause.term)) 69 | if isinstance(clause, DefClause): 70 | descr = re.sub( 71 | r"(\[|\])", 72 | lambda m: "\\\\" + m.group(1), 73 | str(clause.definition).replace('"', "'"), 74 | ) 75 | 76 | vname = name 77 | if "-" in vname: 78 | vname = vname.replace("-", "_") 79 | if ":" in vname: 80 | vname = vname.replace(":", "_") 81 | if "/" in vname: 82 | vname = vname.replace("/", "_") 83 | if "+" in vname: 84 | vname = vname.replace("+", "plus") 85 | if "!" 
in vname: 86 | vname = vname.replace("!", "_") 87 | 88 | vname = segment_pattern.sub(format_name, vname.replace(" ", "_")) 89 | vname = vname[0].upper() + vname[1:] 90 | 91 | if vname[0].isdigit(): 92 | vname = "_" + vname 93 | 94 | return f""" 95 | #[term(cv=MS, accession={term.id.local}, name="{name}", flags={{{0}}}, parents={{{json.dumps(parents)}}})] 96 | #[doc = "{name} - {descr}"] 97 | {vname}(f32),""" 98 | 99 | 100 | def generate_term_enum(terms: List[TermFrame], type_name: str): 101 | buffer = io.StringIO() 102 | buffer.write("pub enum $Term {".replace("$", type_name)) 103 | for term in terms: 104 | buffer.write(make_entry_for(term)) 105 | buffer.write("\n}") 106 | return buffer.getvalue() 107 | 108 | 109 | def merge_term_sets(term_sets: List[Tuple[Set, Dict]]) -> Tuple[Set, Dict]: 110 | base_term_ids, base_id_to_clause = map(lambda x: x.copy(), term_sets[0]) 111 | for (term_ids, id_to_clause) in term_sets[1:]: 112 | base_term_ids.update(term_ids) 113 | base_id_to_clause.update(id_to_clause) 114 | return (base_term_ids, base_id_to_clause) 115 | 116 | 117 | def main(): 118 | cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz")) 119 | term_ids, id_to_clause = merge_term_sets([collect_components(cv, root) for root in [ROOT_TERM] + EXTRA_ROOTS]) 120 | # t = find_name(id_to_clause[ROOT_TERM]) 121 | # type_name = t.title().replace(" ", "") 122 | type_name = "DissociationEnergy" 123 | 124 | term_specs = list(map(id_to_clause.get, sorted(term_ids))) 125 | text = generate_term_enum(term_specs, type_name) 126 | print(text) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() -------------------------------------------------------------------------------- /cv/extract_file_formats.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import io 4 | import itertools 5 | import re 6 | 7 | from typing import Tuple, Dict, Set, List 8 | 9 | import fastobo 10 | from fastobo.term import ( 11 | TermFrame, 12 | IsAClause, 13 | NameClause, 14 | DefClause, 15 | ) 16 | 17 | from fastobo.doc import OboDoc 18 | 19 | from fastobo.id import PrefixedIdent 20 | 21 | ROOT_TERM = PrefixedIdent("MS", "1000560") 22 | 23 | segment_pattern = re.compile(r"(_[a-zA-Z])") 24 | 25 | 26 | def collect_components( 27 | cv: OboDoc, base_term: PrefixedIdent 28 | ) -> Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]: 29 | term: TermFrame 30 | id_to_clause = {} 31 | component_ids = {base_term} 32 | # Make multiple passes 33 | for term in itertools.chain(cv, cv): 34 | id_to_clause[term.id] = term 35 | for clause in term: 36 | if isinstance(clause, IsAClause): 37 | if clause.term in component_ids: 38 | component_ids.add(term.id) 39 | return component_ids, id_to_clause 40 | 41 | 42 | def format_name(match: re.Match) -> str: 43 | return match.group(1)[-1].upper() 44 | 45 | 46 | def find_name(term: TermFrame): 47 | for clause in term: 48 | if isinstance(clause, NameClause): 49 | name = str(clause.name) 50 | return name 51 | else: 52 | raise LookupError(f"Term name not found for {term.id!s}") 53 | 54 | 55 | def make_entry_for(term: TermFrame): 56 | name = None 57 | parents = [] 58 | descr = "" 59 | for clause in term: 60 | if isinstance(clause, NameClause): 61 | name = str(clause.name) 62 | if isinstance(clause, IsAClause): 63 | parents.append(str(clause.term)) 64 | if isinstance(clause, DefClause): 65 | descr = re.sub(r"(\[|\])", lambda m: '\\\\' + m.group(1), str(clause.definition).replace('"', '\'')) 66 | 67 | vname = name 68 | if "-" in vname: 69 
| vname = vname.replace("-", "_") 70 | if ":" in vname: 71 | vname = vname.replace(":", "_") 72 | if "/" in vname: 73 | vname = vname.replace("/", "_") 74 | if "+" in vname: 75 | vname = vname.replace("+", "plus") 76 | if "!" in vname: 77 | vname = vname.replace("!", "_") 78 | 79 | vname = segment_pattern.sub(format_name, vname.replace(" ", "_")) 80 | vname = vname[0].upper() + vname[1:] 81 | 82 | if vname[0].isdigit(): 83 | vname = "_" + vname 84 | 85 | if vname.endswith("Format"): 86 | vname = vname[:-6] 87 | 88 | return f""" 89 | #[term(cv=MS, accession={term.id.local}, name="{name}", flags={{{0}}}, parents={{{json.dumps(parents)}}})] 90 | #[doc = "{name} - {descr}"] 91 | {vname},""" 92 | 93 | 94 | def generate_term_enum(terms: List[TermFrame], type_name: str): 95 | buffer = io.StringIO() 96 | buffer.write("pub enum $Term {".replace("$", type_name)) 97 | for term in terms: 98 | buffer.write(make_entry_for(term)) 99 | buffer.write("\n}") 100 | return buffer.getvalue() 101 | 102 | 103 | def main(): 104 | cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz")) 105 | term_ids, id_to_clause = collect_components(cv, ROOT_TERM) 106 | t = find_name(id_to_clause[ROOT_TERM]) 107 | type_name = t.title().replace(" ", "") 108 | 109 | term_specs = list(map(id_to_clause.get, sorted(term_ids))) 110 | text = generate_term_enum(term_specs, type_name) 111 | print(text) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() -------------------------------------------------------------------------------- /cv/extract_native_ids.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import io 4 | import itertools 5 | import re 6 | 7 | from typing import Tuple, Dict, Set, List 8 | 9 | import fastobo 10 | from fastobo.term import ( 11 | TermFrame, 12 | IsAClause, 13 | NameClause, 14 | DefClause, 15 | ) 16 | 17 | from fastobo.doc import OboDoc 18 | 19 | from fastobo.id import PrefixedIdent 20 | 21 | ROOT_TERM = PrefixedIdent("MS", "1000767") 22 | 23 | segment_pattern = re.compile(r"(_[a-zA-Z])") 24 | 25 | type_pat = re.compile( 26 | "([A-Za-z]+)=(xsd:(%s+))" 27 | % "|".join({"IDREF", "long", "nonNegativeInteger", "positiveInteger", "string"}) 28 | ) 29 | 30 | xsd_to_regex = { 31 | "IDREF": r"\S+", 32 | "long": r"-?\d+", 33 | "nonNegativeInteger": r"\d+", 34 | "positiveInteger": r"\d+", 35 | "string": r"\S+", 36 | } 37 | 38 | 39 | def collect_components( 40 | cv: OboDoc, base_term: PrefixedIdent 41 | ) -> Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]: 42 | term: TermFrame 43 | id_to_clause = {} 44 | component_ids = {base_term} 45 | # Make multiple passes 46 | for term in itertools.chain(cv, cv): 47 | id_to_clause[term.id] = term 48 | for clause in term: 49 | if isinstance(clause, IsAClause): 50 | if clause.term in component_ids: 51 | component_ids.add(term.id) 52 | return component_ids, id_to_clause 53 | 54 | 55 | def format_name(match: re.Match) -> str: 56 | return match.group(1)[-1].upper() 57 | 58 | 59 | def find_name(term: TermFrame): 60 | for clause in term: 61 | if isinstance(clause, NameClause): 62 | name = str(clause.name) 63 | return name 64 | else: 65 | raise LookupError(f"Term name not found for {term.id!s}") 66 | 67 | 68 | def make_entry_for(term: TermFrame): 69 | name = None 70 | parents = [] 71 | descr = "" 72 | for clause in term: 73 | if isinstance(clause, NameClause): 74 | name = str(clause.name) 75 | if isinstance(clause, IsAClause): 76 | parents.append(str(clause.term)) 77 | if isinstance(clause, DefClause): 78 
| descr = str(clause.definition) 79 | try: 80 | descr = descr.split("defined by ")[1].rstrip('.') 81 | descr = type_pat.sub( 82 | lambda x: f"{x.group(1)}=(?<{x.group(1)}>{xsd_to_regex[x.group(3)]})", descr 83 | ) 84 | except IndexError: 85 | descr = "(.+)" 86 | 87 | vname = name 88 | if "-" in vname: 89 | vname = vname.replace("-", "_") 90 | if ":" in vname: 91 | vname = vname.replace(":", "_") 92 | if "/" in vname: 93 | vname = vname.replace("/", "_") 94 | if "+" in vname: 95 | vname = vname.replace("+", "plus") 96 | if "!" in vname: 97 | vname = vname.replace("!", "_") 98 | 99 | vname = segment_pattern.sub(format_name, vname.replace(" ", "_")) 100 | vname = vname[0].upper() + vname[1:] 101 | 102 | if vname[0].isdigit(): 103 | vname = "_" + vname 104 | 105 | return f""" 106 | #[term(cv=MS, accession={term.id.local}, name="{name}", flags={{r"{descr}"}}, parents={{{json.dumps(parents)}}})] 107 | #[doc = r"{name} - `{descr}`"] 108 | {vname},""" 109 | 110 | 111 | def generate_term_enum(terms: List[TermFrame], type_name: str): 112 | buffer = io.StringIO() 113 | buffer.write("pub enum $Term {".replace("$", type_name)) 114 | for term in terms: 115 | buffer.write(make_entry_for(term)) 116 | buffer.write("\n}") 117 | return buffer.getvalue() 118 | 119 | 120 | def main(): 121 | cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz")) 122 | term_ids, id_to_clause = collect_components(cv, ROOT_TERM) 123 | t = find_name(id_to_clause[ROOT_TERM]) 124 | type_name = t.title().replace(" ", "") 125 | 126 | term_specs = list(map(id_to_clause.get, sorted(term_ids))) 127 | text = generate_term_enum(term_specs, type_name) 128 | print(text) 129 | 130 | 131 | if __name__ == "__main__": 132 | main() -------------------------------------------------------------------------------- /cv/extract_software.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import io 4 | import itertools 5 | import re 6 | 7 | from enum import IntFlag 8 | from typing import Tuple, Dict, Set, List 9 | 10 | import fastobo 11 | from fastobo.term import ( 12 | TermFrame, 13 | IsAClause, 14 | NameClause, 15 | DefClause, 16 | ) 17 | 18 | from fastobo.doc import OboDoc 19 | 20 | from fastobo.id import PrefixedIdent 21 | 22 | ACQUISITION_SW = PrefixedIdent("MS", "1001455") 23 | ANALYSIS_SW = PrefixedIdent("MS", "1001456") 24 | DP_SW = PrefixedIdent("MS", "1001457") 25 | 26 | segment_pattern = re.compile(r"(_[a-zA-Z])") 27 | 28 | class SoftwareType(IntFlag): 29 | NoType = 0 30 | Analysis = 0b00000001 31 | DataProcessing = 0b00000010 32 | Acquisition = 0b00000100 33 | 34 | 35 | def collect_software_types(cv: OboDoc) -> Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]: 36 | term: TermFrame 37 | id_to_clause = {} 38 | software_ids = { 39 | PrefixedIdent("MS", "1000531") 40 | } 41 | for term in itertools.chain(cv, cv): 42 | id_to_clause[term.id] = term 43 | for clause in term: 44 | if isinstance(clause, IsAClause): 45 | if clause.term in software_ids: 46 | software_ids.add(term.id) 47 | return software_ids, id_to_clause 48 | 49 | def format_name(match: re.Match) -> str: 50 | return match.group(1)[-1].upper() 51 | 52 | def make_entry_for(term: TermFrame): 53 | name = None 54 | flags = SoftwareType.NoType 55 | parents = [] 56 | descr = '' 57 | for clause in term: 58 | if isinstance(clause, NameClause): 59 | name = str(clause.name) 60 | if isinstance(clause, IsAClause): 61 | parents.append(str(clause.term)) 62 | if clause.term == DP_SW: 63 | flags |= 
SoftwareType.DataProcessing 64 | elif clause.term == ANALYSIS_SW: 65 | flags |= SoftwareType.Analysis 66 | elif clause.term == ACQUISITION_SW: 67 | flags |= SoftwareType.Acquisition 68 | if isinstance(clause, DefClause): 69 | descr = re.sub( 70 | r"(\[|\])", 71 | lambda m: "\\\\" + m.group(1), 72 | str(clause.definition).replace('"', "'"), 73 | ) 74 | 75 | vname: str = name 76 | if "-" in vname: 77 | vname = vname.replace("-", "_") 78 | if ":" in vname: 79 | vname = vname.replace(":", "_") 80 | if '/' in vname: 81 | vname = vname.replace('/', '_') 82 | if "+" in vname: 83 | vname = vname.replace("+", "plus") 84 | if "!" in vname: 85 | vname = vname.replace("!", "_") 86 | 87 | vname: str = segment_pattern.sub(format_name, vname.replace(" ", "_").replace("software", "Software")) 88 | vname: str = vname[0].upper() + vname[1:] 89 | 90 | if vname[0].isdigit(): 91 | vname = "_" + vname 92 | 93 | return f""" 94 | #[term(cv=MS, accession={term.id.local}, name="{name}", flags={{{int(flags)}}}, parents={{{json.dumps(parents)}}})] 95 | #[doc="{name} - {descr}"] 96 | {vname},""" 97 | 98 | 99 | def generate_term_enum(terms: List[TermFrame]): 100 | buffer = io.StringIO() 101 | buffer.write("pub enum SoftwareTerm {") 102 | for term in terms: 103 | buffer.write(make_entry_for(term)) 104 | buffer.write("\n}") 105 | return buffer.getvalue() 106 | 107 | 108 | def main(): 109 | cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz")) 110 | software_ids, id_to_clause = collect_software_types(cv) 111 | sw_terms = list(map(id_to_clause.get, sorted(software_ids))) 112 | text = generate_term_enum(sw_terms) 113 | print(text) 114 | 115 | 116 | if __name__ == "__main__": 117 | main() -------------------------------------------------------------------------------- /cv/psi-ms.obo.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobiusklein/mzdata/e7e11f83532617657352da0c87cba6e6591a85a3/cv/psi-ms.obo.gz -------------------------------------------------------------------------------- /cv/requirements.txt: -------------------------------------------------------------------------------- 1 | fastobo -------------------------------------------------------------------------------- /docs/img/denoised_spec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobiusklein/mzdata/e7e11f83532617657352da0c87cba6e6591a85a3/docs/img/denoised_spec.png -------------------------------------------------------------------------------- /docs/img/raw_spec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobiusklein/mzdata/e7e11f83532617657352da0c87cba6e6591a85a3/docs/img/raw_spec.png -------------------------------------------------------------------------------- /docs/img/to_uri.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import base64 3 | from urllib import parse as urlparse 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("img_path") 9 | parser.add_argument("tag_name") 10 | return parser.parse_args() 11 | 12 | 13 | def make_data_uri_from_path(path): 14 | content = open(path, "rb").read() 15 | content = base64.b64encode(content) 16 | content = urlparse.quote(content) 17 | return f"data:image/png;base64,{content}" 18 | 19 | 20 | def main(): 21 | args = parse_args() 22 | uri = make_data_uri_from_path(args.img_path) 23 | 
print(f" [{args.tag_name}]: {uri}") 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /docs/writer_tutorial.md: -------------------------------------------------------------------------------- 1 | # Writing mass spectrometry data files with `mzdata` 2 | 3 | ## Table of contents 4 | - [Writing mass spectrometry data files with `mzdata`](#writing-mass-spectrometry-data-files-with-mzdata) 5 | - [Table of contents](#table-of-contents) 6 | - [Creating a `SpectrumWriter`](#creating-a-spectrumwriter) 7 | - [Copying across metadata](#copying-across-metadata) 8 | - [Writing spectra](#writing-spectra) 9 | 10 | ## Creating a `SpectrumWriter` 11 | 12 | `mzdata` uses the [`SpectrumWriter`] trait to define shared writing behavior across different 13 | data file formats. The [`MzMLWriter`] type writes spectra to mzML files, and [`MGFWriter`] writes 14 | spectra to MGF files, though other formats may be available as well. [`SpectrumWriter`] is agnostic 15 | to the thing being written to, and [`MzMLWriter`] and [`MGFWriter`] can accept an arbitrary [`Write`] 16 | implementation, though not all implementations are. 17 | 18 | ```rust 19 | use std::{io, fs}; 20 | 21 | use mzdata::io::MzMLWriter; 22 | use mzdata::prelude::*; 23 | 24 | fn main() -> io::Result<()> { 25 | let fh = io::BufWriter::new(fs::File::create("tmp.mzML")?); 26 | let mut writer = MzMLWriter::new(fh); 27 | 28 | Ok(()) 29 | } 30 | ``` 31 | 32 | ## Copying across metadata 33 | 34 | If you have a source which implements [`MSDataFileMetadata`], you can easily copy metadata from your 35 | source file to your output file, preserving the trace of information about where your data came from. 36 | 37 | ```rust 38 | use std::{io, fs}; 39 | 40 | use mzdata::io::MzMLWriter; 41 | use mzdata::prelude::*; 42 | 43 | fn main() -> io::Result<()> { 44 | let reader = mzdata::MZReader::open_path("test/data/batching_test.mzML")?; 45 | 46 | let fh = io::BufWriter::new(fs::File::create("tmp.mzML")?); 47 | let mut writer = MzMLWriter::new(fh); 48 | 49 | writer.copy_metadata_from(&reader); 50 | 51 | // mzML files want to list how many spectra they contain 52 | if let Some(n_spectra) = reader.spectrum_count_hint() { 53 | writer.set_spectrum_count(n_spectra) 54 | } else { 55 | writer.set_spectrum_count(reader.len() as u64) 56 | } 57 | 58 | Ok(()) 59 | } 60 | ``` 61 | 62 | ## Writing spectra 63 | 64 | ```rust 65 | # use std::{io, fs}; 66 | # use mzdata::io::MzMLWriter; 67 | # use mzdata::prelude::*; 68 | # 69 | # 70 | # fn main() -> io::Result<()> { 71 | # let reader = mzdata::MZReader::open_path("test/data/batching_test.mzML")?; 72 | # 73 | # let fh = io::BufWriter::new(fs::File::create("tmp.mzML")?); 74 | # let mut writer = MzMLWriter::new(fh); 75 | # 76 | # writer.copy_metadata_from(&reader); 77 | # 78 | # // mzML files want to list how many spectra they contain 79 | # if let Some(n_spectra) = reader.spectrum_count_hint() { 80 | # writer.set_spectrum_count(n_spectra) 81 | # } else { 82 | # writer.set_spectrum_count(reader.len() as u64) 83 | # } 84 | # 85 | // Write spectra out one at a time, by reference 86 | for spec in reader { 87 | writer.write(&spec)?; 88 | } 89 | # Ok(()) 90 | # } 91 | ``` 92 | 93 | ```rust 94 | # use std::{io, fs}; 95 | # use mzdata::io::MzMLWriter; 96 | # use mzdata::prelude::*; 97 | # 98 | # 99 | # fn main() -> io::Result<()> { 100 | # let reader = mzdata::MZReader::open_path("test/data/batching_test.mzML")?; 101 | # 102 | # let fh = 
io::BufWriter::new(fs::File::create("tmp.mzML")?); 103 | # let mut writer = MzMLWriter::new(fh); 104 | # 105 | # writer.copy_metadata_from(&reader); 106 | # 107 | # // mzML files want to list how many spectra they contain 108 | # if let Some(n_spectra) = reader.spectrum_count_hint() { 109 | # writer.set_spectrum_count(n_spectra) 110 | # } else { 111 | # writer.set_spectrum_count(reader.len() as u64) 112 | # } 113 | # 114 | // Write out an iterator over spectra, in this case 115 | // using the owning variant for an iterator over owned 116 | // instances. 117 | writer.write_all_owned(reader)?; 118 | # Ok(()) 119 | # } 120 | ``` 121 | 122 | 123 | -------------------------------------------------------------------------------- /examples/async_mzcat.rs: -------------------------------------------------------------------------------- 1 | use std::time; 2 | use std::{env, io, path}; 3 | 4 | use futures::StreamExt; 5 | use tokio::fs; 6 | 7 | use mzdata::io::mzml; 8 | use mzdata::prelude::*; 9 | 10 | async fn load_file<P: Into<path::PathBuf> + Clone>( 11 | path: P, 12 | ) -> io::Result<mzml::AsyncMzMLReader<fs::File>> { 13 | let fh = fs::File::open(path.into()).await?; 14 | let mut reader = mzml::AsyncMzMLReader::new(fh).await; 15 | reader 16 | .read_index_from_end() 17 | .await 18 | .expect("Failed to read index from the file"); 19 | Ok(reader) 20 | } 21 | 22 | async fn scan_file(reader: &mut mzml::AsyncMzMLReader<fs::File>) { 23 | let start = time::Instant::now(); 24 | let mut i = 0; 25 | 26 | let mut stream = reader.as_stream(); 27 | 28 | while let Some(scan) = stream.next().await { 29 | if i % 10000 == 0 { 30 | println!( 31 | "\tScan {}: {}|{} ({} seconds)", 32 | i, 33 | scan.id(), 34 | scan.index(), 35 | (time::Instant::now() - start).as_secs_f64(), 36 | ); 37 | } 38 | i += 1; 39 | } 40 | let end = time::Instant::now(); 41 | println!( 42 | "Loaded {} spectra in {} seconds", 43 | i, 44 | (end - start).as_secs_f64() 45 | ); 46 | } 47 | 48 | #[tokio::main(flavor = "multi_thread", worker_threads = 10)] 49 | async fn main() -> io::Result<()> { 50 | let path = path::PathBuf::from( 51 | env::args().nth(1) 52 | .expect("Please pass an MS data file path"), 53 | ); 54 | if let Some(ext) = path.extension() { 55 | if ext.to_string_lossy().to_lowercase() == "mzml" { 56 | let mut reader = load_file(path).await?; 57 | scan_file(&mut reader).await; 58 | } else { 59 | panic!("Could not infer the file format") 60 | } 61 | } else { 62 | let mut reader = load_file(path).await?; 63 | scan_file(&mut reader).await; 64 | }; 65 | Ok(()) 66 | } 67 | -------------------------------------------------------------------------------- /examples/averaging_writer.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | * Demo how to use the deferred spectrum averaging iterator with `rayon` 3 | * to quickly average over an LC-MS run and write the averaged spectra 4 | * out as mzML on the standard output.
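 *
 * The pipeline below uses three threads: a reader that averages spectrum
 * groups in parallel with `rayon`, a `Collator` that restores the original
 * ordering, and a writer that serializes the re-ordered groups, so no stage
 * blocks the others.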
5 | */ 6 | use std::env; 7 | use std::io; 8 | use std::path; 9 | use std::thread; 10 | use std::time::Instant; 11 | 12 | use std::sync::mpsc::sync_channel; 13 | 14 | use mzdata::spectrum::SignalContinuity; 15 | use rayon::prelude::*; 16 | 17 | use mzdata::prelude::*; 18 | use mzdata::spectrum::utils::Collator; 19 | use mzdata::{MZReader, MzMLWriter}; 20 | 21 | fn main() -> io::Result<()> { 22 | let path = path::PathBuf::from( 23 | env::args() 24 | .nth(1) 25 | .expect("Please pass an MS data file path"), 26 | ); 27 | 28 | let mut reader = MZReader::open_path(path)?; 29 | let mut writer = MzMLWriter::new(io::BufWriter::new(io::stdout())); 30 | writer.copy_metadata_from(&reader); 31 | 32 | let (input_sender, input_receiver) = sync_channel(5000); 33 | let (output_sender, output_receiver) = sync_channel(5000); 34 | 35 | let start = Instant::now(); 36 | let reader_task = thread::spawn(move || { 37 | let (grouper, averager, _reprofiler) = 38 | reader.groups().averaging_deferred(1, 120.0, 2000.1, 0.002); 39 | grouper 40 | .enumerate() 41 | .par_bridge() 42 | .map_init( 43 | || averager.clone(), 44 | |averager, (i, g)| { 45 | let (mut g, arrays) = g.average_with(averager); 46 | if let Some(p) = g.precursor_mut() { 47 | p.arrays = Some(arrays.into()); 48 | p.description_mut().signal_continuity = SignalContinuity::Profile; 49 | } 50 | (i, g) 51 | }, 52 | ) 53 | .for_each(|(i, g)| { 54 | input_sender.send((i, g)).unwrap(); 55 | }); 56 | let end_read = Instant::now(); 57 | eprintln!( 58 | "Finished reading all spectra and averaging in {:0.3?}", 59 | end_read - start 60 | ); 61 | }); 62 | 63 | let collator_task = 64 | thread::spawn(move || Collator::collate_sync(input_receiver, output_sender)); 65 | 66 | let writer_task = thread::spawn(move || -> io::Result<()> { 67 | for (_, group) in output_receiver { 68 | writer.write_group(&group)?; 69 | } 70 | writer.close().unwrap(); 71 | let end_write = Instant::now(); 72 | eprintln!("Finished writing all spectra in {:0.3?}", end_write - start); 73 | Ok(()) 74 | }); 75 | 76 | if let Err(e) = reader_task.join() { 77 | eprintln!("An error occurred while joining processing spectra task: {e:?}") 78 | } 79 | 80 | if let Err(e) = collator_task.join() { 81 | eprintln!("An error occurred while joining collating spectra task: {e:?}") 82 | } 83 | 84 | match writer_task.join() { 85 | Ok(r) => { 86 | r?; 87 | } 88 | Err(e) => { 89 | eprintln!("An error occurred while joining writing spectra task: {e:?}") 90 | } 91 | } 92 | 93 | Ok(()) 94 | } 95 | -------------------------------------------------------------------------------- /examples/compressed_mzml.rs: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Demo how to read an mzML file that is compressed 3 | */ 4 | use std::env; 5 | use std::fs; 6 | use std::io; 7 | use std::process::exit; 8 | use std::time::Instant; 9 | 10 | use mzdata::io::PreBufferedStream; 11 | use mzdata::io::{MzMLReader, RestartableGzDecoder}; 12 | use mzdata::prelude::*; 13 | 14 | fn main() -> io::Result<()> { 15 | let input = env::args().nth(1).unwrap_or_else(|| { 16 | eprintln!("Please provide a file path or '-' for STDIN"); 17 | exit(1) 18 | }); 19 | let start = Instant::now(); 20 | let groups = if input == "-" { 21 | let stream = 22 | RestartableGzDecoder::new(io::BufReader::new(PreBufferedStream::new(io::stdin())?)); 23 | let reader = MzMLReader::new(stream); 24 | let groups: Vec<_> = reader.into_groups().collect(); 25 | groups 26 | } else { 27 | let stream = RestartableGzDecoder::new(io::BufReader::new(fs::File::open(input)?)); 28 | let reader = MzMLReader::new(stream); 29 | let groups: Vec<_> = reader.into_groups().collect(); 30 | groups 31 | }; 32 | let spectra: Vec<_> = groups 33 | .iter() 34 | .flat_map(|g| g.precursor.iter().chain(g.products.iter())) 35 | .collect(); 36 | let end = Instant::now(); 37 | eprintln!( 38 | "Read {} groups with {} spectra in {:0.3?}", 39 | groups.len(), 40 | spectra.len(), 41 | end - start 42 | ); 43 | assert!(!spectra.is_empty()); 44 | 45 | Ok(()) 46 | } 47 | -------------------------------------------------------------------------------- /examples/describe_instrument.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::io; 3 | 4 | use mzdata::mz_read; 5 | use mzdata::prelude::*; 6 | 7 | fn main() -> io::Result<()> { 8 | env_logger::init(); 9 | let inpath = env::args().nth(1).unwrap_or_else(|| { 10 | eprintln!("Please provide a path to read an MS data file from, or '-'"); 11 | std::process::exit(1) 12 | }); 13 | 14 | let configs = mz_read!(inpath, reader => { 15 | reader.instrument_configurations().clone() 16 | })?; 17 | 18 | for (k, config) in configs.iter() { 19 | println!("Configuration ID: {k}"); 20 | for component in config.components.iter() { 21 | println!("\t{:?} -> {}\n", component.component_type, component.order); 22 | for p in component.params() { 23 | println!("\t{p}"); 24 | } 25 | } 26 | for p in config.params() { 27 | println!("{p}"); 28 | } 29 | } 30 | 31 | Ok(()) 32 | } 33 | -------------------------------------------------------------------------------- /examples/from_stdin.rs: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Demo how to read a data file from STDIN, and then batch collect its spectra 3 | */ 4 | use std::io::{self, Seek}; 5 | use std::time::Instant; 6 | 7 | use mzdata::io::{ 8 | infer_from_stream, MassSpectrometryFormat, PreBufferedStream, RestartableGzDecoder, 9 | SpectrumSource, 10 | }; 11 | use mzdata::{MGFReader, MzMLReader}; 12 | 13 | fn main() -> io::Result<()> { 14 | env_logger::init(); 15 | let start = Instant::now(); 16 | let stream = io::stdin(); 17 | let mut stream = PreBufferedStream::new(stream)?; 18 | let (fmt, compressed) = infer_from_stream(&mut stream)?; 19 | stream.seek(io::SeekFrom::Start(0))?; 20 | let groups: Vec<_> = match fmt { 21 | MassSpectrometryFormat::MGF => { 22 | if compressed { 23 | MGFReader::new(RestartableGzDecoder::new(io::BufReader::new(stream))) 24 | .into_groups() 25 | .collect() 26 | } else { 27 | MGFReader::new(stream).into_groups().collect() 28 | } 29 | } 30 | MassSpectrometryFormat::MzML => { 31 | if compressed { 32 | MzMLReader::new(RestartableGzDecoder::new(io::BufReader::new(stream))) 33 | .into_groups() 34 | .collect() 35 | } else { 36 | MzMLReader::new(stream).into_groups().collect() 37 | } 38 | } 39 | x => { 40 | panic!("Cannot identify file format ({:?})", x) 41 | } 42 | }; 43 | let spectra: Vec<_> = groups 44 | .iter() 45 | .flat_map(|g| g.precursor.iter().chain(g.products.iter())) 46 | .collect(); 47 | let end = Instant::now(); 48 | eprintln!( 49 | "Read {} groups with {} spectra in {:0.3?}", 50 | groups.len(), 51 | spectra.len(), 52 | end - start 53 | ); 54 | assert!(!spectra.is_empty()); 55 | Ok(()) 56 | } 57 | -------------------------------------------------------------------------------- /examples/get_scan_by.rs: -------------------------------------------------------------------------------- 1 | use std::{env, io, path}; 2 | 3 | use log::info; 4 | use mzdata::io::MZReader; 5 | use mzdata::prelude::*; 6 | 7 | fn main() -> io::Result<()> { 8 | env_logger::init(); 9 | let mut args = env::args().skip(1); 10 | 11 | let path = path::PathBuf::from(args.next().expect("Please pass an MS data file path")); 12 | 13 | let key = args 14 | .next() 15 | .expect("Please provide a key type, \"id\", \"index\" or \"time\""); 16 | let key_value = args 17 | .next() 18 | .expect("Please provide a key value matching the key type"); 19 | 20 | info!("Opening {}", path.display()); 21 | let mut reader = MZReader::open_path(path)?; 22 | 23 | let spectrum = match key.as_str() { 24 | "id" => reader.get_spectrum_by_id(&key_value).unwrap(), 25 | "index" => reader 26 | .get_spectrum_by_index(key_value.parse().unwrap()) 27 | .unwrap(), 28 | "time" => reader 29 | .get_spectrum_by_time(key_value.parse().unwrap()) 30 | .unwrap(), 31 | _ => { 32 | panic!("Unknown key type {}", key); 33 | } 34 | }; 35 | 36 | // dbg!(spectrum); 37 | println!("ID: {}; Index: {}; Time: {}", spectrum.id(), spectrum.index(), spectrum.start_time()); 38 | println!("Num data points: {}", spectrum.raw_arrays().unwrap().mzs().unwrap().len()); 39 | 40 | Ok(()) 41 | } 42 | -------------------------------------------------------------------------------- /examples/infer_format.rs: -------------------------------------------------------------------------------- 1 | /*! 
 2 | * Demo the minimum code needed to infer the input file format from a path 3 | * or STDIN using `infer_format`, `infer_from_stream`, and `PreBufferedStream` 4 | */ 5 | use std::env; 6 | use std::io; 7 | use std::process::exit; 8 | 9 | use mzdata::io::{infer_format, infer_from_stream, PreBufferedStream}; 10 | 11 | fn main() -> io::Result<()> { 12 | let input = env::args().nth(1).unwrap_or_else(|| { 13 | eprintln!("Please provide a file path or '-' for STDIN"); 14 | exit(1) 15 | }); 16 | 17 | let (inferred, gzipped) = if input == "-" { 18 | let mut stream = PreBufferedStream::new(io::stdin())?; 19 | infer_from_stream(&mut stream)? 20 | } else { 21 | infer_format(input)? 22 | }; 23 | println!("{:?} (gzip: {})", inferred, gzipped); 24 | Ok(()) 25 | } 26 | -------------------------------------------------------------------------------- /examples/msn_target_mapping.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::VecDeque; 3 | use std::io; 4 | use std::ops::Range; 5 | use std::{env, fs, path}; 6 | 7 | use mzdata::prelude::*; 8 | use mzdata::spectrum::{MultiLayerSpectrum, SpectrumGroup, SpectrumGroupingIterator}; 9 | use mzpeaks::{CentroidPeak, DeconvolutedPeak, Tolerance}; 10 | 11 | #[derive(Default, Debug, Clone, PartialEq)] 12 | pub struct SelectionTargetSpecification { 13 | pub mz: f64, 14 | pub charge: Option<i32>, 15 | pub time_range: Range<f64>, 16 | } 17 | 18 | impl SelectionTargetSpecification { 19 | pub fn new(mz: f64, charge: Option<i32>, time_range: Range<f64>) -> Self { 20 | Self { 21 | mz, 22 | charge, 23 | time_range, 24 | } 25 | } 26 | 27 | pub fn spans(&self, time: f64) -> bool { 28 | self.time_range.start <= time && self.time_range.end > time 29 | } 30 | 31 | pub fn from_spectrum(spectrum: &MultiLayerSpectrum) -> Self { 32 | let prec = spectrum.precursor().unwrap(); 33 | Self { 34 | mz: prec.mz(), 35 | charge: prec.charge(), 36 | time_range: (spectrum.start_time()..spectrum.start_time()), 37 | } 38 | } 39 | 40 | pub fn observe(&mut self, spectrum: &MultiLayerSpectrum) { 41 | let t = spectrum.start_time(); 42 | if t < self.time_range.start { 43 | self.time_range.start = t; 44 | } 45 | if t > self.time_range.end { 46 | self.time_range.end = t; 47 | } 48 | } 49 | } 50 | 51 | impl PartialOrd for SelectionTargetSpecification { 52 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 53 | match self.mz.partial_cmp(&other.mz) { 54 | Some(Ordering::Equal) => {} 55 | ord => return ord, 56 | } 57 | match self.charge.partial_cmp(&other.charge) { 58 | Some(Ordering::Equal) => {} 59 | ord => return ord, 60 | } 61 | match self.time_range.start.partial_cmp(&other.time_range.start) { 62 | Some(Ordering::Equal) => {} 63 | ord => return ord, 64 | } 65 | self.time_range.end.partial_cmp(&other.time_range.end) 66 | } 67 | } 68 | 69 | #[derive(Default, Debug, Clone, Copy, PartialEq)] 70 | pub struct SelectedTarget { 71 | pub mz: f64, 72 | pub charge: Option<i32>, 73 | } 74 | 75 | impl SelectedTarget { 76 | pub fn new(mz: f64, charge: Option<i32>) -> Self { 77 | Self { mz, charge } 78 | } 79 | } 80 | 81 | pub struct MSnTargetTrackingIterator<R: SpectrumSource> { 82 | source: SpectrumGroupingIterator<R, CentroidPeak, DeconvolutedPeak, MultiLayerSpectrum>, 83 | time_width: f64, 84 | error_tolerance: Tolerance, 85 | buffer: VecDeque<(SpectrumGroup, f64)>, 86 | pushback_buffer: Option<(SpectrumGroup, f64)>, 87 | targets: VecDeque<SelectionTargetSpecification>, 88 | } 89 | 90 | impl<R: SpectrumSource> MSnTargetTrackingIterator<R> { 91 | pub fn new( 92 | source: SpectrumGroupingIterator<R, CentroidPeak, DeconvolutedPeak, MultiLayerSpectrum>, 93 | time_width: f64, 94 | error_tolerance: Tolerance, 95 | ) -> Self { 96 | let mut inst = Self { 97 | 
source, 98 | time_width, 99 | error_tolerance, 100 | buffer: Default::default(), 101 | pushback_buffer: Default::default(), 102 | targets: Default::default(), 103 | }; 104 | inst.initial_feed(); 105 | inst 106 | } 107 | 108 | fn observe(&mut self, group: &SpectrumGroup) { 109 | let error_tolerance = self.error_tolerance; 110 | let time_width_half = self.time_width / 2.0; 111 | let new_targets: usize = group 112 | .products 113 | .iter() 114 | .map(|s| { 115 | let prec = s.precursor().unwrap(); 116 | let mz = prec.mz(); 117 | let t = s.start_time(); 118 | let hits: usize = self 119 | .targets 120 | .iter_mut() 121 | .map(|p| { 122 | if error_tolerance.test(p.mz, mz) 123 | && (t > p.time_range.end) 124 | && (t - p.time_range.end) < time_width_half 125 | { 126 | p.time_range.end = t; 127 | 1 128 | } else { 129 | 0 130 | } 131 | }) 132 | .sum(); 133 | if hits == 0 { 134 | let p = SelectionTargetSpecification::new(mz, prec.charge(), t - 0.5..t + 0.5); 135 | // eprintln!("Added {p:?}"); 136 | self.targets.push_back(p); 137 | 1 138 | } else { 139 | 0 140 | } 141 | }) 142 | .sum(); 143 | if new_targets > 0 { 144 | log::debug!("Added {new_targets} new targets"); 145 | } 146 | } 147 | 148 | fn initial_feed(&mut self) { 149 | let group = self.source.next(); 150 | let start_time = group 151 | .as_ref() 152 | .and_then(|s| s.earliest_spectrum().map(|s| s.start_time())) 153 | .unwrap(); 154 | let end_time = start_time + self.time_width; 155 | eprintln!("Initial time window {start_time} to {end_time}"); 156 | if let Some(g) = group { 157 | self.observe(&g); 158 | self.buffer.push_back((g, start_time)); 159 | } 160 | while let Some(group) = self.source.next() { 161 | let t = group.earliest_spectrum().map(|s| s.start_time()).unwrap(); 162 | if t < end_time { 163 | self.observe(&group); 164 | self.buffer.push_back((group, t)); 165 | } else { 166 | self.pushback_buffer = Some((group, t)); 167 | break; 168 | } 169 | } 170 | eprintln!( 171 | "{} targets extracted from buffer size {}", 172 | self.targets.len(), 173 | self.buffer.len() 174 | ); 175 | } 176 | 177 | fn get_current_window_end(&self) -> f64 { 178 | if self.buffer.is_empty() { 179 | return f64::NEG_INFINITY; 180 | } 181 | let start = self.buffer.front().map(|(_, t)| *t).unwrap(); 182 | let end = self.buffer.back().map(|(_, t)| *t).unwrap(); 183 | let mid = start + (end - start) / 2.0; 184 | mid + self.time_width 185 | } 186 | 187 | fn feed_next(&mut self) { 188 | let threshold = self.get_current_window_end(); 189 | let (use_pushback, pushback_populated) = if let Some((_, t)) = self.pushback_buffer.as_ref() 190 | { 191 | (*t < threshold, true) 192 | } else { 193 | (false, false) 194 | }; 195 | if use_pushback { 196 | let (group, t) = self.pushback_buffer.take().unwrap(); 197 | self.observe(&group); 198 | self.buffer.push_back((group, t)); 199 | } 200 | if !pushback_populated { 201 | if let Some(group) = self.source.next() { 202 | let t = group.earliest_spectrum().map(|s| s.start_time()).unwrap(); 203 | if t < threshold { 204 | self.observe(&group); 205 | self.buffer.push_back((group, t)); 206 | } else { 207 | self.pushback_buffer = Some((group, t)); 208 | } 209 | } 210 | } 211 | } 212 | 213 | fn step(&mut self) -> Option<(SpectrumGroup, Vec<SelectedTarget>)> { 214 | if let Some((group, t)) = self.buffer.pop_front() { 215 | let targets: Vec<_> = self 216 | .targets 217 | .iter() 218 | .filter(|p| p.spans(t)) 219 | .map(|p| SelectedTarget::new(p.mz, p.charge)) 220 | .collect(); 221 | self.targets = self 222 | .targets 223 | .drain(..)
 224 | .filter(|target| { 225 | // Keep targets which have not ended by this time point 226 | let cond = target.time_range.end >= t; 227 | if !cond { 228 | log::debug!("Dropping {target:?} at {t}") 229 | } 230 | cond 231 | }) 232 | .collect(); 233 | Some((group, targets)) 234 | } else { 235 | None 236 | } 237 | } 238 | } 239 | 240 | impl<R: SpectrumSource> Iterator for MSnTargetTrackingIterator<R> { 241 | type Item = (SpectrumGroup, Vec<SelectedTarget>); 242 | 243 | fn next(&mut self) -> Option<Self::Item> { 244 | let value = self.step(); 245 | self.feed_next(); 246 | value 247 | } 248 | } 249 | 250 | fn main() -> io::Result<()> { 251 | let path = path::PathBuf::from( 252 | env::args() 253 | .nth(1) 254 | .expect("Please pass an MS data file path"), 255 | ); 256 | env_logger::init(); 257 | 258 | let reader = mzdata::MzMLReader::new(fs::File::open(path)?); 259 | let group_iter = reader.into_groups(); 260 | let tracking_iter = MSnTargetTrackingIterator::new(group_iter, 2.0, Tolerance::PPM(5.0)); 261 | for (group, targets) in tracking_iter { 262 | if group.precursor.is_some() { 263 | let prec = group.precursor.as_ref().unwrap(); 264 | eprintln!("{} {}: {}", prec.id(), prec.start_time(), targets.len()); 265 | } 266 | } 267 | Ok(()) 268 | } 269 | -------------------------------------------------------------------------------- /examples/mzcat.rs: -------------------------------------------------------------------------------- 1 | use std::time; 2 | use std::{env, io, path}; 3 | 4 | use mzdata::spectrum::MultiLayerSpectrum; 5 | use mzdata::{prelude::*, MZReader}; 6 | use rayon::prelude::*; 7 | 8 | fn scan_file<R: Iterator<Item = MultiLayerSpectrum> + Send>(reader: &mut R) { 9 | let start = time::Instant::now(); 10 | reader.enumerate().par_bridge().for_each(|(i, scan)| { 11 | if i % 10000 == 0 { 12 | println!( 13 | "\tScan {}: {}|{} ({} seconds)", 14 | i, 15 | scan.id(), 16 | scan.index(), 17 | (time::Instant::now() - start).as_secs_f64(), 18 | ); 19 | } 20 | }); 21 | let end = time::Instant::now(); 22 | println!("Loaded in {} seconds", (end - start).as_secs_f64()); 23 | } 24 | 25 | fn main() -> io::Result<()> { 26 | env_logger::init(); 27 | let path = path::PathBuf::from( 28 | env::args() 29 | .nth(1) 30 | .expect("Please pass an MS data file path"), 31 | ); 32 | 33 | let mut reader = MZReader::open_path(&path)?; 34 | scan_file(&mut reader); 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /examples/mzinfo.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::env; 3 | use std::io; 4 | use std::path; 5 | use std::process; 6 | use std::thread::spawn; 7 | use std::time; 8 | 9 | use std::sync::mpsc::sync_channel; 10 | 11 | use mzdata::io::Source; 12 | use mzdata::prelude::*; 13 | use mzdata::spectrum::{ 14 | DeconvolutedSpectrum, MultiLayerSpectrum, RefPeakDataLevel, SignalContinuity, SpectrumLike, 15 | }; 16 | use mzdata::MZReader; 17 | 18 | struct MSDataFileSummary { 19 | pub start_time: f64, 20 | pub end_time: f64, 21 | pub level_table: HashMap<u8, usize>, 22 | pub charge_table: HashMap<i32, usize>, 23 | pub peak_charge_table: HashMap<u8, HashMap<i32, usize>>, 24 | pub peak_mode_table: HashMap<SignalContinuity, usize>, 25 | pub has_ion_mobility: bool, 26 | } 27 | 28 | impl Default for MSDataFileSummary { 29 | fn default() -> Self { 30 | Self { 31 | start_time: f64::INFINITY, 32 | end_time: f64::NEG_INFINITY, 33 | level_table: Default::default(), 34 | charge_table: Default::default(), 35 | peak_charge_table: Default::default(), 36 | peak_mode_table: Default::default(), 37 | has_ion_mobility: false, 38 | } 39 | } 40 | } 41 | 
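// Each spectrum is folded into the summary tables by `handle_scan` below; the
// threaded scan decouples file reading from accumulation with a bounded channel.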
42 | impl MSDataFileSummary { 43 | pub fn handle_scan(&mut self, scan: MultiLayerSpectrum) { 44 | let time = scan.start_time(); 45 | self.start_time = self.start_time.min(time); 46 | self.end_time = self.end_time.max(time); 47 | let level = scan.ms_level(); 48 | *self.level_table.entry(level).or_default() += 1; 49 | if level > 1 { 50 | if let Some(charge) = scan.precursor().unwrap().ion().charge { 51 | *self.charge_table.entry(charge).or_default() += 1; 52 | } else { 53 | *self.charge_table.entry(0).or_default() += 1; 54 | } 55 | } 56 | *self 57 | .peak_mode_table 58 | .entry(scan.signal_continuity()) 59 | .or_default() += scan.peaks().len(); 60 | 61 | let has_charge = match scan.peaks() { 62 | RefPeakDataLevel::Missing => false, 63 | RefPeakDataLevel::RawData(arrays) => arrays.charges().is_ok(), 64 | RefPeakDataLevel::Centroid(_) => false, 65 | RefPeakDataLevel::Deconvoluted(_) => true, 66 | }; 67 | 68 | let has_ion_mobility = match scan.peaks() { 69 | RefPeakDataLevel::RawData(arrays) => arrays.has_ion_mobility(), 70 | _ => false, 71 | } || scan.has_ion_mobility(); 72 | self.has_ion_mobility |= has_ion_mobility; 73 | 74 | if has_charge { 75 | let deconv_scan: DeconvolutedSpectrum = scan.try_into().unwrap(); 76 | deconv_scan.deconvoluted_peaks.iter().for_each(|p| { 77 | *(*self 78 | .peak_charge_table 79 | .entry(deconv_scan.ms_level()) 80 | .or_default()) 81 | .entry(p.charge) 82 | .or_default() += 1; 83 | assert!((p.index as usize) < deconv_scan.deconvoluted_peaks.len()) 84 | }) 85 | } 86 | } 87 | 88 | pub fn _scan_file<R: SpectrumSource>(&mut self, reader: &mut R) { 89 | let start = time::Instant::now(); 90 | reader.enumerate().for_each(|(i, scan)| { 91 | if i % 10000 == 0 && i > 0 { 92 | println!( 93 | "\tScan {}: {} ({:0.3} seconds, {} peaks|points)", 94 | i, 95 | scan.id(), 96 | (time::Instant::now() - start).as_secs_f64(), 97 | self.peak_mode_table.values().sum::<usize>() 98 | ); 99 | } 100 | self.handle_scan(scan); 101 | }); 102 | let end = time::Instant::now(); 103 | let elapsed = end - start; 104 | println!("{:0.3} seconds elapsed", elapsed.as_secs_f64()); 105 | } 106 | 107 | pub fn scan_file<R: SpectrumSource + Send + 'static>(&mut self, reader: R) { 108 | self.scan_file_threaded(reader) 109 | } 110 | 111 | pub fn scan_file_threaded<R: SpectrumSource + Send + 'static>(&mut self, reader: R) { 112 | let start = time::Instant::now(); 113 | let (sender, receiver) = sync_channel(2usize.pow(12)); 114 | let read_handle = spawn(move || { 115 | reader.into_iter() 116 | .enumerate() 117 | .for_each(|(i, scan)| { 118 | sender.send((i, scan)).unwrap() 119 | }); 120 | }); 121 | let i = receiver.iter().fold(0, |_, (i, scan)| { 122 | if i % 10000 == 0 && i > 0 { 123 | println!( 124 | "\tScan {}: {} ({:0.3} seconds, {} peaks|points)", 125 | i, 126 | scan.id(), 127 | (time::Instant::now() - start).as_secs_f64(), 128 | self.peak_mode_table.values().sum::<usize>() 129 | ); 130 | } 131 | self.handle_scan(scan); 132 | i 133 | }); 134 | read_handle.join().unwrap(); 135 | let end = time::Instant::now(); 136 | let elapsed = end - start; 137 | println!("{:0.3} seconds elapsed, handled {i} spectra", elapsed.as_secs_f64()); 138 | } 139 | 140 | pub fn write_out(&self) { 141 | println!("Start Time: {:0.2}", self.start_time); 142 | println!("End Time: {:0.2}", self.end_time); 143 | println!("Has Ion Mobility: {}", self.has_ion_mobility); 144 | println!("MS Levels:"); 145 | let mut level_set: Vec<(&u8, &usize)> = self.level_table.iter().collect(); 146 | level_set.sort_by_key(|(a, _)| *a); 147 | for (level, count) in level_set.iter() { 148 | println!("\t{}: {}", level, count); 149 | } 150 | 151 | 
println!("Precursor Charge States:"); 152 | let mut charge_set: Vec<(&i32, &usize)> = self.charge_table.iter().collect(); 153 | charge_set.sort_by_key(|(a, _)| *a); 154 | for (charge, count) in charge_set.iter() { 155 | if **charge == 0 { 156 | println!("\tCharge Not Reported: {}", count); 157 | } else { 158 | println!("\t{}: {}", charge, count); 159 | } 160 | } 161 | 162 | let mut peak_charge_levels: Vec<_> = self.peak_charge_table.iter().collect(); 163 | 164 | peak_charge_levels.sort_by(|(level_a, _), (level_b, _)| level_a.cmp(level_b)); 165 | 166 | for (level, peak_charge_table) in peak_charge_levels { 167 | if !peak_charge_table.is_empty() { 168 | println!("Peak Charge States for MS level {}:", level); 169 | let mut peak_charge_set: Vec<(&i32, &usize)> = peak_charge_table.iter().collect(); 170 | peak_charge_set.sort_by_key(|(a, _)| *a); 171 | for (charge, count) in peak_charge_set.iter() { 172 | if **charge == 0 { 173 | println!("\tCharge Not Reported: {}", count); 174 | } else { 175 | println!("\t{}: {}", charge, count); 176 | } 177 | } 178 | } 179 | } 180 | self.peak_mode_table 181 | .iter() 182 | .for_each(|(mode, count)| match mode { 183 | SignalContinuity::Unknown => println!("Unknown continuity: {}", count), 184 | SignalContinuity::Centroid => println!("Peaks: {}", count), 185 | SignalContinuity::Profile => println!("Points: {}", count), 186 | }); 187 | } 188 | } 189 | 190 | fn main() -> io::Result<()> { 191 | env_logger::init(); 192 | let path = path::PathBuf::from(env::args().nth(1).unwrap_or_else(|| { 193 | eprintln!("Please provide a path to an MS data file"); 194 | process::exit(1) 195 | })); 196 | let mut summarizer = MSDataFileSummary::default(); 197 | 198 | if path.as_os_str() == "-" { 199 | mzdata::mz_read!(Source::Stdin, reader => { 200 | summarizer.scan_file(reader) 201 | })?; 202 | } else { 203 | let reader = MZReader::open_path(path)?; 204 | eprintln!("Format: {}", reader.as_format()); 205 | summarizer.scan_file(reader) 206 | }; 207 | 208 | summarizer.write_out(); 209 | Ok(()) 210 | } 211 | -------------------------------------------------------------------------------- /examples/random_access_iter.rs: -------------------------------------------------------------------------------- 1 | use std::{env, io, path}; 2 | 3 | use mzdata::io::mzml; 4 | use mzdata::prelude::*; 5 | 6 | fn main() -> io::Result<()> { 7 | let path = path::PathBuf::from( 8 | env::args() 9 | .nth(1) 10 | .expect("Please pass an MS data file path"), 11 | // "test/data/batching_test.mzML" 12 | ); 13 | 14 | let mut reader = mzml::MzMLReader::open_path(path)?; 15 | 16 | let n_spectra = reader.len(); 17 | 18 | // Find the spectrum at the midpoint of the run 19 | let spec = reader.get_spectrum_by_index(n_spectra / 2).unwrap(); 20 | eprintln!( 21 | "Midpoint spectrum {} (level {}) at time {}", 22 | spec.id(), 23 | spec.ms_level(), 24 | spec.start_time() 25 | ); 26 | 27 | // Jump the iterator to that point in time 28 | reader.start_from_time(spec.start_time())?; 29 | let s = reader.next().unwrap(); 30 | eprintln!( 31 | "Resuming at {} (level {}) at time {}", 32 | s.id(), 33 | s.ms_level(), 34 | s.start_time() 35 | ); 36 | 37 | // Convert the iterator into a group iterator 38 | let mut group_iter = reader.into_groups(); 39 | // Jump the group iterator to that point in time (If an MSn spectrum was found, the next MS1 may be shown instead) 40 | group_iter.start_from_time(spec.start_time())?; 41 | let g = group_iter.next().unwrap(); 42 | eprintln!( 43 | "Resuming at group having {:?} at time {:?}", 44 | 
g.earliest_spectrum().map(|s| s.id()), 45 | g.earliest_spectrum().map(|s| s.start_time()) 46 | ); 47 | 48 | // Convert the group iterator into an averaging group iterator 49 | let mut avg_iter = group_iter.averaging(1, 200.0, 2200.0, 0.005); 50 | // Jump the group iterator to that point in time (If an MSn spectrum was found, the next MS1 may be shown instead) 51 | avg_iter.start_from_time(spec.start_time())?; 52 | let g = avg_iter.next().unwrap(); 53 | eprintln!( 54 | "Resuming at group having {:?} at time {:?}", 55 | g.earliest_spectrum().map(|s| s.id()), 56 | g.earliest_spectrum().map(|s| s.start_time()) 57 | ); 58 | 59 | Ok(()) 60 | } 61 | -------------------------------------------------------------------------------- /examples/readme.rs: -------------------------------------------------------------------------------- 1 | use mzdata::prelude::*; 2 | use mzdata::spectrum::SignalContinuity; 3 | use mzpeaks::Tolerance; 4 | use std::io; 5 | 6 | fn main() -> io::Result<()> { 7 | mzdata::mz_read!("./test/data/small.mzML".as_ref(), reader => 8 | { 9 | let mut ms1_count = 0; 10 | let mut msn_count = 0; 11 | for spectrum in reader { 12 | if spectrum.ms_level() == 1 { 13 | ms1_count += 1; 14 | } else { 15 | msn_count += 1; 16 | } 17 | println!("Scan {} => BP {}", spectrum.id(), spectrum.peaks().base_peak().mz); 18 | if spectrum.signal_continuity() < SignalContinuity::Centroid { 19 | // (the peak-picking branch of this example was lost in extraction) 20 | } 21 | } 22 | })?; 23 | Ok(()) 24 | } -------------------------------------------------------------------------------- /pipeconvert.sh: -------------------------------------------------------------------------------- 1 | # (script contents lost in extraction; only a trailing "> log.txt" redirection survived) -------------------------------------------------------------------------------- /src/io/compression.rs: -------------------------------------------------------------------------------- 1 | use std::{io, path}; 2 | 3 | use flate2::bufread::MultiGzDecoder; 4 | use std::io::prelude::*; 5 | 6 | pub fn is_gzipped(header: &[u8]) -> bool { 7 | header.starts_with(b"\x1f\x8b") 8 | } 9 | 10 | pub fn is_gzipped_extension(path: path::PathBuf) -> (bool, path::PathBuf) { 11 | if let Some(ext) = path.extension() { 12 | if ext.eq_ignore_ascii_case("gz") { 13 | (true, path.with_extension("")) 14 | } else { 15 | (false, path) 16 | } 17 | } else { 18 | (false, path) 19 | } 20 | } 21 | 22 | /// A [`flate2::bufread::MultiGzDecoder`] that implements [`std::io::Seek`] by 23 | /// incrementally reading ahead, or rewinding to the beginning of the file and 24 | /// doing the same. 25 | /// 26 | /// Not intended to be efficient, but provides a workable interface.
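///
/// A minimal usage sketch (the path and offset are illustrative):
///
/// ```ignore
/// let handle = io::BufReader::new(fs::File::open("run.mzML.gz")?);
/// let mut reader = RestartableGzDecoder::new(handle);
/// // Seeking backwards rewinds the stream and decompresses forward again.
/// reader.seek(io::SeekFrom::Start(5145))?;
/// ```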
27 | pub struct RestartableGzDecoder<R: BufRead + Seek> { 28 | handle: Option<MultiGzDecoder<R>>, 29 | offset: u64, 30 | } 31 | 32 | impl<R: BufRead + Seek> RestartableGzDecoder<R> { 33 | pub fn new(handle: R) -> Self { 34 | Self { 35 | handle: Some(MultiGzDecoder::new(handle)), 36 | offset: 0, 37 | } 38 | } 39 | 40 | fn reset(&mut self) -> io::Result<u64> { 41 | let handle = self.handle.take().unwrap(); 42 | let mut inner = handle.into_inner(); 43 | let res = inner.seek(io::SeekFrom::Start(0)); 44 | self.handle = Some(MultiGzDecoder::new(inner)); 45 | self.offset = 0; 46 | res 47 | } 48 | } 49 | 50 | impl<R: BufRead + Seek> Read for RestartableGzDecoder<R> { 51 | fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { 52 | let handle = self.handle.as_mut().unwrap(); 53 | match handle.read(buf) { 54 | Ok(b) => { 55 | self.offset += b as u64; 56 | Ok(b) 57 | } 58 | Err(e) => Err(e), 59 | } 60 | } 61 | } 62 | 63 | impl<R: BufRead + Seek> Seek for RestartableGzDecoder<R> { 64 | fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> { 65 | match pos { 66 | io::SeekFrom::Start(o) => { 67 | self.reset()?; 68 | let mut buf = vec![0u8; o as usize]; 69 | self.read_exact(&mut buf)?; 70 | Ok(o) 71 | } 72 | io::SeekFrom::End(_) => Err(io::Error::new( 73 | io::ErrorKind::Unsupported, 74 | "Cannot seek relative to end of a gzip stream", 75 | )), 76 | io::SeekFrom::Current(o) => match o { 77 | 0 => Ok(self.offset), 78 | _ if o < 0 => { 79 | if o.unsigned_abs() > self.offset { 80 | Err(io::Error::new( 81 | io::ErrorKind::Unsupported, 82 | "Cannot seek earlier than the start of the stream", 83 | )) 84 | } else { 85 | self.seek(io::SeekFrom::Start((self.offset as i64 + o) as u64)) 86 | } 87 | } 88 | _ => { 89 | let mut buf = vec![0; o as usize]; 90 | self.read_exact(&mut buf)?; 91 | Ok(self.offset) 92 | } 93 | }, 94 | } 95 | } 96 | } 97 | 98 | #[cfg(test)] 99 | mod test { 100 | use std::fs; 101 | 102 | use super::*; 103 | 104 | #[test] 105 | fn exercise_restartable() -> io::Result<()> { 106 | let handle = io::BufReader::new(fs::File::open("test/data/small.mzML.gz")?); 107 | let mut reader = RestartableGzDecoder::new(handle); 108 | reader.seek(io::SeekFrom::Current(5113415))?; 109 | let mut buf = String::new(); 110 | reader.read_to_string(&mut buf)?; 111 | assert!(buf.starts_with("")); 112 | reader.seek(io::SeekFrom::Start(5145))?; 113 | buf.clear(); 114 | io::BufReader::new(&mut reader).read_line(&mut buf)?; 115 | assert!(buf.contains("controllerType=0 controllerNumber=1 scan=1")); 116 | Ok(()) 117 | } 118 | } -------------------------------------------------------------------------------- /src/io/infer_format/inference.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | convert::TryFrom, 3 | fmt::Display, 4 | fs, 5 | io::{self, prelude::*, BufReader}, 6 | path, 7 | }; 8 | 9 | use flate2::bufread::GzDecoder; 10 | 11 | use crate::{ 12 | io::compression::{is_gzipped, is_gzipped_extension}, 13 | meta::FormatConversion, 14 | params::ControlledVocabulary, 15 | Param, 16 | }; 17 | 18 | #[cfg(feature = "mgf")] 19 | use crate::io::mgf::is_mgf; 20 | 21 | #[cfg(feature = "mzml")] 22 | use crate::io::mzml::is_mzml; 23 | 24 | #[cfg(feature = "thermo")] 25 | use crate::io::thermo::is_thermo_raw_prefix; 26 | 27 | #[cfg(feature = "bruker_tdf")] 28 | use crate::io::tdf::is_tdf; 29 | 30 | /// Mass spectrometry file formats that [`mzdata`](crate) 31 | /// supports 32 | #[non_exhaustive] 33 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 34 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 35 | pub enum MassSpectrometryFormat { 36 | MGF, 37 | MzML, 38 | MzMLb,
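    // The vendor formats below are only readable when the optional
    // `thermo` and `bruker_tdf` cargo features are enabled.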
 39 | ThermoRaw, 40 | BrukerTDF, 41 | Unknown, 42 | } 43 | 44 | impl MassSpectrometryFormat { 45 | pub fn as_conversion(&self) -> Option<FormatConversion> { 46 | match self { 47 | MassSpectrometryFormat::MzML => Some(FormatConversion::ConversionToMzML), 48 | MassSpectrometryFormat::MzMLb => Some(FormatConversion::ConversionToMzMLb), 49 | _ => None, 50 | } 51 | } 52 | 53 | pub fn as_param(&self) -> Option<Param> { 54 | let p = match self { 55 | MassSpectrometryFormat::MGF => { 56 | ControlledVocabulary::MS.const_param_ident("Mascot MGF format", 1001062) 57 | } 58 | MassSpectrometryFormat::MzML => { 59 | ControlledVocabulary::MS.const_param_ident("mzML format", 1000584) 60 | } 61 | MassSpectrometryFormat::MzMLb => { 62 | ControlledVocabulary::MS.const_param_ident("mzMLb format", 1002838) 63 | } 64 | MassSpectrometryFormat::ThermoRaw => { 65 | ControlledVocabulary::MS.const_param_ident("Thermo RAW format", 1000563) 66 | } 67 | MassSpectrometryFormat::BrukerTDF => { 68 | ControlledVocabulary::MS.const_param_ident("Bruker TDF format", 1002817) 69 | } 70 | MassSpectrometryFormat::Unknown => return None, 71 | }; 72 | Some(p.into()) 73 | } 74 | } 75 | 76 | impl TryFrom<MassSpectrometryFormat> for Param { 77 | type Error = &'static str; 78 | 79 | fn try_from(value: MassSpectrometryFormat) -> Result<Self, Self::Error> { 80 | if let Some(p) = value.as_param() { 81 | Ok(p) 82 | } else { 83 | Err("No conversion") 84 | } 85 | } 86 | } 87 | 88 | impl TryFrom<MassSpectrometryFormat> for FormatConversion { 89 | type Error = &'static str; 90 | 91 | fn try_from(value: MassSpectrometryFormat) -> Result<Self, Self::Error> { 92 | if let Some(p) = value.as_conversion() { 93 | Ok(p) 94 | } else { 95 | Err("No conversion") 96 | } 97 | } 98 | } 99 | 100 | impl Display for MassSpectrometryFormat { 101 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 102 | write!(f, "{:?}", self) 103 | } 104 | } 105 | 106 | /// Given a path, infer the file format and whether or not the file at that path is 107 | /// GZIP compressed 108 | pub fn infer_from_path<P: Into<path::PathBuf>>(path: P) -> (MassSpectrometryFormat, bool) { 109 | let path: path::PathBuf = path.into(); 110 | if path.is_dir() { 111 | #[cfg(feature = "bruker_tdf")] 112 | if is_tdf(path) { 113 | return (MassSpectrometryFormat::BrukerTDF, false); 114 | } else { 115 | return (MassSpectrometryFormat::Unknown, false); 116 | } 117 | } 118 | let (is_gzipped, path) = is_gzipped_extension(path); 119 | if let Some(ext) = path.extension() { 120 | if let Some(ext) = ext.to_ascii_lowercase().to_str() { 121 | let form = match ext { 122 | "mzml" => MassSpectrometryFormat::MzML, 123 | "mgf" => MassSpectrometryFormat::MGF, 124 | #[cfg(feature = "mzmlb")] 125 | "mzmlb" => MassSpectrometryFormat::MzMLb, 126 | #[cfg(feature = "thermo")] 127 | "raw" => MassSpectrometryFormat::ThermoRaw, 128 | _ => MassSpectrometryFormat::Unknown, 129 | }; 130 | (form, is_gzipped) 131 | } else { 132 | (MassSpectrometryFormat::Unknown, is_gzipped) 133 | } 134 | } else { 135 | (MassSpectrometryFormat::Unknown, is_gzipped) 136 | } 137 | } 138 | 139 | /// Given a stream of bytes, infer the file format and whether or not the 140 | /// stream is GZIP compressed. This assumes the stream is seekable.
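///
/// A minimal sketch (the path is illustrative):
///
/// ```ignore
/// let mut handle = fs::File::open("run.mzML")?;
/// let (format, gzipped) = infer_from_stream(&mut handle)?;
/// ```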
 141 | pub fn infer_from_stream<R: Read + Seek>( 142 | stream: &mut R, 143 | ) -> io::Result<(MassSpectrometryFormat, bool)> { 144 | // We need to read in at least enough bytes to span a complete XML head plus the 145 | // end of an opening tag 146 | let mut buf = Vec::with_capacity(500); 147 | buf.resize(500, b'\0'); 148 | let current_pos = stream.stream_position()?; 149 | // record how many bytes were actually read so we know the upper bound 150 | let bytes_read = stream.read(buf.as_mut_slice())?; 151 | buf.truncate(bytes_read); 152 | let is_stream_gzipped = is_gzipped(buf.as_slice()); 153 | if is_stream_gzipped { 154 | let mut decompressed_buf = Vec::new(); 155 | // In the worst case, we can't have fewer bytes than those that were read in (minus the size of the gzip header) 156 | // and we assume the compression ratio means we have recouped that. We read in only that many bytes 157 | // decompressed because the decompressor treats an incomplete segment as an error and thus using 158 | // io::Read::read_to_end is not an option. 159 | decompressed_buf.resize(bytes_read, b'\0'); 160 | let mut decoder = GzDecoder::new(io::Cursor::new(buf)); 161 | decoder.read_exact(&mut decompressed_buf)?; 162 | buf = decompressed_buf; 163 | } 164 | stream.seek(io::SeekFrom::Start(current_pos))?; 165 | 166 | match &buf { 167 | #[cfg(feature = "mzml")] 168 | _ if is_mzml(&buf) => Ok((MassSpectrometryFormat::MzML, is_stream_gzipped)), 169 | #[cfg(feature = "mgf")] 170 | _ if is_mgf(&buf) => Ok((MassSpectrometryFormat::MGF, is_stream_gzipped)), 171 | #[cfg(feature = "thermo")] 172 | _ if is_thermo_raw_prefix(&buf) => { 173 | Ok((MassSpectrometryFormat::ThermoRaw, is_stream_gzipped)) 174 | } 175 | _ => Ok((MassSpectrometryFormat::Unknown, is_stream_gzipped)), 176 | } 177 | } 178 | 179 | /// Given a path, infer the file format and whether or not the file at that path is 180 | /// GZIP compressed, using both the file name and by trying to open and read the file 181 | /// header 182 | pub fn infer_format<P: Into<path::PathBuf>>(path: P) -> io::Result<(MassSpectrometryFormat, bool)> { 183 | let path: path::PathBuf = path.into(); 184 | 185 | let (format, is_gzipped) = infer_from_path(&path); 186 | match format { 187 | MassSpectrometryFormat::Unknown => { 188 | let handle = fs::File::open(path.clone())?; 189 | let mut stream = BufReader::new(handle); 190 | let (format, is_gzipped) = infer_from_stream(&mut stream)?; 191 | Ok((format, is_gzipped)) 192 | } 193 | _ => Ok((format, is_gzipped)), 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/io/infer_format/mod.rs: -------------------------------------------------------------------------------- 1 | mod inference; 2 | mod dispatch; 3 | mod pipeline; 4 | 5 | pub use dispatch::{MZReader, MZReaderType, MZReaderBuilder, IMMZReaderType}; 6 | #[cfg(feature = "async_partial")] 7 | pub use dispatch::{AsyncMZReaderType, AsyncMZReader, AsyncMZReaderBuilder}; 8 | 9 | 10 | 11 | pub use inference::{infer_from_path, infer_from_stream, infer_format, MassSpectrometryFormat}; 12 | 13 | pub use pipeline::{MassSpectrometryReadWriteProcess, Source, Sink}; 14 | 15 | #[cfg(test)] 16 | mod test { 17 | use std::{fs, io, path}; 18 | 19 | use mzpeaks::{CentroidPeak, DeconvolutedPeak}; 20 | 21 | use crate::{ 22 | prelude::*, 23 | spectrum::{ArrayType, Spectrum}, 24 | io::DetailLevel 25 | }; 26 | 27 | use super::*; 28 | 29 | #[cfg(feature = "mzml")] 30 | #[test] 31 | fn infer_mzml() { 32 | let path = path::Path::new("./test/data/small.mzML"); 33 | assert!(path.exists()); 34 | let
(fmt, zipped) = infer_from_path(path); 35 | assert_eq!(fmt, MassSpectrometryFormat::MzML); 36 | assert!(!zipped); 37 | } 38 | 39 | #[test] 40 | fn infer_mgf() { 41 | let path = path::Path::new("./test/data/small.mgf"); 42 | assert!(path.exists()); 43 | let (fmt, zipped) = infer_from_path(path); 44 | assert_eq!(fmt, MassSpectrometryFormat::MGF); 45 | assert!(!zipped); 46 | } 47 | 48 | #[cfg(feature = "thermo")] 49 | #[test] 50 | fn infer_thermo() { 51 | let path = path::Path::new("./test/data/small.RAW"); 52 | let (fmt, zipped) = infer_from_path(path); 53 | assert_eq!(fmt, MassSpectrometryFormat::ThermoRaw); 54 | assert!(!zipped); 55 | } 56 | 57 | #[cfg(feature = "mzml")] 58 | #[test] 59 | fn infer_open() { 60 | let path = path::Path::new("./test/data/small.mzML"); 61 | assert!(path.exists()); 62 | if let Ok(mut reader) = MZReader::open_path(path) { 63 | assert_eq!(reader.len(), 48); 64 | assert_eq!(*reader.detail_level(), DetailLevel::Full); 65 | if let Some(spec) = reader.get_spectrum_by_index(10) { 66 | let spec: Spectrum = spec; 67 | assert!(spec.index() == 10); 68 | assert!(spec.id() == "controllerType=0 controllerNumber=1 scan=11"); 69 | if let Some(data_arrays) = &spec.arrays { 70 | assert!(data_arrays.has_array(&ArrayType::MZArray)); 71 | assert!(data_arrays.has_array(&ArrayType::IntensityArray)); 72 | let mzs = data_arrays.mzs().unwrap(); 73 | assert!(mzs.len() == 941); 74 | } 75 | } else { 76 | panic!("Failed to retrieve spectrum by index") 77 | } 78 | 79 | assert_eq!(reader.get_spectrum_by_id("controllerType=0 controllerNumber=1 scan=11").unwrap().index(), 10); 80 | 81 | if let Some(spec) = reader.get_spectrum_by_time(0.358558333333) { 82 | assert_eq!(spec.index(), 34); 83 | } else { 84 | panic!("Failed to retrieve spectrum by time") 85 | } 86 | 87 | } else { 88 | panic!("Failed to open file") 89 | } 90 | } 91 | 92 | #[cfg(feature = "thermo")] 93 | #[test] 94 | fn infer_open_thermo() { 95 | let path = path::Path::new("./test/data/small.RAW"); 96 | assert!(path.exists()); 97 | if let Ok(mut reader) = MZReader::open_path(path) { 98 | assert_eq!(reader.len(), 48); 99 | assert_eq!(*reader.detail_level(), DetailLevel::Full); 100 | if let Some(spec) = reader.get_spectrum_by_index(10) { 101 | let spec: Spectrum = spec; 102 | assert_eq!(spec.index(), 10); 103 | assert_eq!(spec.id(), "controllerType=0 controllerNumber=1 scan=11"); 104 | assert_eq!(spec.peaks().len(), 941); 105 | } else { 106 | panic!("Failed to retrieve spectrum by index") 107 | } 108 | 109 | assert_eq!(reader.get_spectrum_by_id("controllerType=0 controllerNumber=1 scan=11").unwrap().index(), 10); 110 | 111 | if let Some(spec) = reader.get_spectrum_by_time(0.358558333333) { 112 | assert_eq!(spec.index(), 34); 113 | } else { 114 | panic!("Failed to retrieve spectrum by time") 115 | } 116 | 117 | } else { 118 | panic!("Failed to open file") 119 | } 120 | } 121 | 122 | #[test] 123 | fn test_source_conv() -> io::Result<()> { 124 | let s = Source::::from("text/path".as_ref()); 125 | assert!(matches!(s, Source::PathLike(_))); 126 | 127 | let fh = Box::new(io::BufReader::new(fs::File::open("./test/data/small.mgf")?)) as Box; 128 | let rs: Source = (fh, MassSpectrometryFormat::MGF).into(); 129 | assert!(matches!(rs, Source::Reader(_, _))); 130 | 131 | Ok(()) 132 | } 133 | 134 | #[cfg(feature = "mzml")] 135 | #[test] 136 | fn test_dispatch_mzreader() -> io::Result<()> { 137 | let mut reader = MZReader::open_path("./test/data/small.mzML")?; 138 | 139 | let n = reader.len(); 140 | let n_ms1 = reader.iter().filter(|s| s.ms_level() == 
1).count(); 141 | let n_msn = reader.iter().filter(|s| s.ms_level() == 2).count(); 142 | 143 | assert_eq!(n, 48); 144 | assert_eq!(n, n_ms1 + n_msn); 145 | Ok(()) 146 | } 147 | 148 | #[cfg(feature = "mzml")] 149 | #[test] 150 | fn test_infer_stream() -> io::Result<()> { 151 | let mut mzml_file = fs::File::open("./test/data/small.mzML")?; 152 | let (form, gzip) = infer_from_stream(&mut mzml_file)?; 153 | assert_eq!(form, MassSpectrometryFormat::MzML); 154 | assert!(!gzip); 155 | 156 | mzml_file = fs::File::open("./test/data/20200204_BU_8B8egg_1ug_uL_7charges_60_min_Slot2-11_1_244.mzML.gz")?; 157 | let (form, gzip) = infer_from_stream(&mut mzml_file)?; 158 | assert_eq!(form, MassSpectrometryFormat::MzML); 159 | assert!(gzip); 160 | 161 | let mut mgf_file = fs::File::open("./test/data/small.mgf")?; 162 | let (form, gzip) = infer_from_stream(&mut mgf_file)?; 163 | assert_eq!(form, MassSpectrometryFormat::MGF); 164 | assert!(!gzip); 165 | Ok(()) 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/io/mgf.rs: -------------------------------------------------------------------------------- 1 | //! Read and write [MGF](https://www.matrixscience.com/help/data_file_help.html#GEN) files. 2 | //! Supports random access when reading from a source that supports [`io::Seek`](std::io::Seek). 3 | //! 4 | //! **Requires the `mgf` feature, enabled by default** 5 | #![cfg(feature = "mgf")] 6 | mod reader; 7 | mod writer; 8 | 9 | pub use reader::{MGFError, MGFParserState, MGFReader, MGFReaderType}; 10 | pub use writer::{ 11 | MGFHeaderStyle, MGFWriter, MGFWriterType, MZDataMGFStyle, SimpleMGFStyle, SimpleMGFWriter, 12 | SimpleMGFWriterType, 13 | }; 14 | 15 | #[cfg(feature = "async")] 16 | mod async_reader; 17 | 18 | #[cfg(feature = "async")] 19 | pub use crate::io::mgf::async_reader::{ 20 | MGFReader as AsyncMGFReader, MGFReaderType as AsyncMGFReaderType, 21 | }; 22 | 23 | 24 | pub fn is_mgf(buf: &[u8]) -> bool { 25 | let needle = b"BEGIN IONS"; 26 | buf.windows(needle.len()).any(|window| window == needle) 27 | } 28 | 29 | 30 | #[cfg(test)] 31 | mod test { 32 | use crate::io::DetailLevel; 33 | use crate::spectrum::RefPeakDataLevel; 34 | use crate::CentroidSpectrum; 35 | use crate::{io::RestartableGzDecoder, prelude::*}; 36 | use mzpeaks::{CentroidPeak, DeconvolutedPeak, IndexedCoordinate}; 37 | 38 | use super::*; 39 | use std::{fs, io, path}; 40 | 41 | #[test] 42 | fn test_reader() { 43 | let path = path::Path::new("./test/data/small.mgf"); 44 | let file = fs::File::open(path).expect("Test file doesn't exist"); 45 | let reader = MGFReaderType::<_>::new(file); 46 | let mut ms1_count = 0; 47 | let mut msn_count = 0; 48 | for scan in reader { 49 | let level = scan.ms_level(); 50 | if level == 1 { 51 | ms1_count += 1; 52 | } else { 53 | msn_count += 1; 54 | } 55 | } 56 | assert_eq!(ms1_count, 0); 57 | assert_eq!(msn_count, 35); 58 | } 59 | 60 | #[test] 61 | fn test_reader_indexed() { 62 | let path = path::Path::new("./test/data/small.mgf"); 63 | let file = fs::File::open(path).expect("Test file doesn't exist"); 64 | let mut reader = MGFReaderType::<_, CentroidPeak, DeconvolutedPeak>::new_indexed(file); 65 | 66 | let n = reader.len(); 67 | let mut ms1_count = 0; 68 | let mut msn_count = 0; 69 | 70 | for i in (0..n).rev() { 71 | let scan = reader.get_spectrum_by_index(i).expect("Missing spectrum"); 72 | let level = scan.ms_level(); 73 | if level == 1 { 74 | ms1_count += 1; 75 | } else { 76 | msn_count += 1; 77 | } 78 | let centroided: CentroidSpectrum = 
scan.try_into().unwrap(); 79 | centroided.peaks.iter().for_each(|p| { 80 | (centroided.peaks[p.get_index() as usize]).mz(); 81 | }) 82 | } 83 | assert_eq!(ms1_count, 0); 84 | assert_eq!(msn_count, 35); 85 | } 86 | 87 | #[test] 88 | fn test_writer() -> io::Result<()> { 89 | let buff: Vec = Vec::new(); 90 | let inner_writer = io::Cursor::new(buff); 91 | let mut writer = MGFWriter::new(inner_writer); 92 | 93 | let path = path::Path::new("./test/data/small.mgf"); 94 | let file = fs::File::open(path).expect("Test file doesn't exist"); 95 | let mut reader = MGFReader::new(file); 96 | 97 | for scan in reader.iter() { 98 | writer.write(&scan)?; 99 | } 100 | writer.flush()?; 101 | let inner_writer = writer.handle.into_inner()?; 102 | let buffer = inner_writer.into_inner(); 103 | let reader2 = MGFReader::new(io::Cursor::new(buffer)); 104 | assert_eq!(reader2.len(), reader.len()); 105 | 106 | // Not including platform-specific line endings 107 | Ok(()) 108 | } 109 | 110 | #[test] 111 | fn test_read_unsupported() -> io::Result<()> { 112 | let path = path::Path::new("./test/data/small.mgf"); 113 | let file = fs::File::open(path).expect("Test file doesn't exist"); 114 | let mut reader = MGFReaderType::<_>::new(file); 115 | 116 | assert!(reader.get_chromatogram_by_id("not real").is_none()); 117 | assert!(reader.get_chromatogram_by_index(0).is_none()); 118 | assert!(reader.spectrum_count_hint().is_none()); 119 | 120 | reader = MGFReaderType::<_>::open_path(path)?; 121 | assert_eq!(reader.spectrum_count_hint().unwrap() as usize, reader.len()); 122 | 123 | assert!(reader.run_description().is_some()); 124 | Ok(()) 125 | } 126 | 127 | #[test] 128 | fn test_read_charged_complex() -> io::Result<()> { 129 | let fh = io::BufReader::new(fs::File::open("./test/data/processed_batch.mgf.gz")?); 130 | let fh = RestartableGzDecoder::new(fh); 131 | let mut reader = MGFReader::new_indexed(fh); 132 | 133 | let mut scan = reader.next().unwrap(); 134 | scan.try_build_deconvoluted_centroids().unwrap(); 135 | assert!(matches!(scan.peaks(), RefPeakDataLevel::Deconvoluted(_))); 136 | scan.try_build_centroids() 137 | .expect_err("Expected not to find"); 138 | assert!(matches!(scan.peaks(), RefPeakDataLevel::Deconvoluted(_))); 139 | scan.update_summaries(); 140 | 141 | assert_eq!(scan.index(), 0); 142 | let summaries = scan.peaks().fetch_summaries(); 143 | assert!( 144 | (summaries.tic - 3758148.3).abs() < 1e-3, 145 | "TIC error {}", 146 | summaries.tic - 3758148.3 147 | ); 148 | assert!( 149 | (summaries.base_peak.mz - 443.2600402832031).abs() < 1e-3, 150 | "BP m/z {}", 151 | (summaries.base_peak.mz - 443.2600402832031) 152 | ); 153 | assert!( 154 | (summaries.mz_range.0 > 120.0 && summaries.mz_range.0 < 120.1), 155 | "{:?}", 156 | summaries.mz_range 157 | ); 158 | assert_eq!(summaries.count, 492); 159 | 160 | reader.read_into(&mut scan).unwrap(); 161 | assert_eq!(scan.index(), 1); 162 | 163 | let sid = "MouseBrain-Z-T-1.25740.25740.2 File:\"MouseBrain-Z-T-1.raw\", NativeID:\"controllerType=0 controllerNumber=1 scan=25740\""; 164 | scan = reader.get_spectrum_by_id(sid).unwrap(); 165 | assert_eq!(scan.id(), sid); 166 | assert_eq!(scan.index(), 0); 167 | reader.reset(); 168 | 169 | assert_eq!(*reader.detail_level(), DetailLevel::Full); 170 | scan = reader.start_from_index(30).unwrap().next().unwrap(); 171 | assert!(!scan.peaks().is_empty()); 172 | assert_eq!(scan.index(), 30); 173 | let time = scan.start_time(); 174 | 175 | reader.set_detail_level(DetailLevel::MetadataOnly); 176 | scan = 
reader.start_from_id(sid).unwrap().next().unwrap(); 177 | assert_eq!(scan.index(), 0); 178 | assert!(scan.peaks().is_empty()); 179 | 180 | scan = reader.start_from_time(time).unwrap().next().unwrap(); 181 | assert_eq!(scan.index(), 30); 182 | assert!(scan.peaks().is_empty()); 183 | Ok(()) 184 | } 185 | 186 | #[cfg(feature = "async")] 187 | mod async_tests { 188 | use super::*; 189 | use futures::StreamExt; 190 | use tokio::fs; 191 | 192 | #[tokio::test] 193 | async fn test_reader() { 194 | let path = path::Path::new("./test/data/small.mgf"); 195 | let file = fs::File::open(path).await.expect("Test file doesn't exist"); 196 | let mut reader = AsyncMGFReaderType::<_>::new(file).await; 197 | let mut ms1_count = 0; 198 | let mut msn_count = 0; 199 | while let Some(scan) = reader.read_next().await { 200 | let level = scan.ms_level(); 201 | if level == 1 { 202 | ms1_count += 1; 203 | } else { 204 | msn_count += 1; 205 | } 206 | } 207 | assert_eq!(ms1_count, 0); 208 | assert_eq!(msn_count, 35); 209 | } 210 | 211 | #[tokio::test] 212 | async fn test_reader_indexed() { 213 | let path = path::Path::new("./test/data/small.mgf"); 214 | let file = fs::File::open(path).await.expect("Test file doesn't exist"); 215 | let mut reader = 216 | AsyncMGFReaderType::<_, CentroidPeak, DeconvolutedPeak>::new_indexed(file).await; 217 | 218 | let n = reader.len(); 219 | let mut ms1_count = 0; 220 | let mut msn_count = 0; 221 | 222 | for i in (0..n).rev() { 223 | let scan = reader.get_spectrum_by_index(i).await.unwrap(); 224 | let level = scan.ms_level(); 225 | if level == 1 { 226 | ms1_count += 1; 227 | } else { 228 | msn_count += 1; 229 | } 230 | let centroided: CentroidSpectrum = scan.try_into().unwrap(); 231 | centroided.peaks.iter().for_each(|p| { 232 | (centroided.peaks[p.get_index() as usize]).mz(); 233 | }) 234 | } 235 | assert_eq!(ms1_count, 0); 236 | assert_eq!(msn_count, 35); 237 | 238 | ms1_count = 0; 239 | msn_count = 0; 240 | reader.reset().await; 241 | 242 | let mut stream = reader.as_stream(); 243 | while let Some(scan) = stream.next().await { 244 | let level = scan.ms_level(); 245 | if level == 1 { 246 | ms1_count += 1; 247 | } else { 248 | msn_count += 1; 249 | } 250 | let centroided: CentroidSpectrum = scan.try_into().unwrap(); 251 | centroided.peaks.iter().for_each(|p| { 252 | (centroided.peaks[p.get_index() as usize]).mz(); 253 | }) 254 | } 255 | 256 | assert_eq!(ms1_count, 0); 257 | assert_eq!(msn_count, 35); 258 | } 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /src/io/mod.rs: -------------------------------------------------------------------------------- 1 | //! Reading and writing mass spectrometry data file formats and abstractions over them. 2 | //! 3 | //! There are many data file formats for recording mass spectrometry data. 4 | //! 
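//! The submodules below cover the MGF, mzML, mzMLb, Thermo RAW, and Bruker TDF
//! formats, alongside helpers for format inference, compression, and the shared
//! reader and writer traits.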
5 | 6 | mod infer_format; 7 | pub mod mgf; 8 | pub mod mzml; 9 | #[cfg(feature = "mzmlb")] 10 | pub mod mzmlb; 11 | mod offset_index; 12 | #[cfg(feature = "proxi")] 13 | pub mod proxi; 14 | mod shorthand; 15 | pub(crate) mod traits; 16 | mod utils; 17 | 18 | pub(crate) mod compression; 19 | 20 | pub use crate::io::infer_format::{ 21 | infer_format, infer_from_path, infer_from_stream, IMMZReaderType, MZReader, MZReaderBuilder, 22 | MZReaderType, MassSpectrometryFormat, MassSpectrometryReadWriteProcess, Sink, Source, 23 | }; 24 | 25 | #[cfg(feature = "mgf")] 26 | pub use crate::io::mgf::{MGFError, MGFReader, MGFWriter}; 27 | 28 | #[cfg(all(feature = "async", feature = "mzml"))] 29 | pub use crate::io::mzml::AsyncMzMLReader; 30 | 31 | #[cfg(feature = "mzml")] 32 | pub use crate::io::mzml::{MzMLParserError, MzMLReader, MzMLWriter}; 33 | 34 | #[cfg(feature = "mzmlb")] 35 | pub use crate::io::mzmlb::{MzMLbError, MzMLbReader}; 36 | 37 | pub use crate::io::offset_index::OffsetIndex; 38 | pub use crate::io::traits::{ 39 | BorrowedGeneric3DIonMobilityFrameSource, ChromatogramIterator, ChromatogramSource, 40 | Generic3DIonMobilityFrameSource, IntoIonMobilityFrameSource, IntoIonMobilityFrameSourceError, 41 | IonMobilityFrameAccessError, IonMobilityFrameGrouping, IonMobilityFrameIterator, 42 | IonMobilityFrameSource, MZFileReader, MemorySpectrumSource, 43 | RandomAccessIonMobilityFrameGroupingIterator, RandomAccessIonMobilityFrameIterator, 44 | RandomAccessSpectrumGroupingIterator, RandomAccessSpectrumIterator, RandomAccessSpectrumSource, 45 | SpectrumAccessError, SpectrumGrouping, SpectrumIterator, SpectrumReceiver, SpectrumSource, 46 | SpectrumSourceWithMetadata, SpectrumWriter, StreamingSpectrumIterator, 47 | }; 48 | 49 | #[cfg(feature = "async_partial")] 50 | pub use crate::io::traits::AsyncSpectrumSource; 51 | 52 | #[cfg(feature = "async")] 53 | pub use crate::io::traits::AsyncMZFileReader; 54 | 55 | pub use crate::io::utils::{DetailLevel, PreBufferedStream}; 56 | 57 | #[cfg(feature = "checksum")] 58 | pub use crate::io::utils::checksum_file; 59 | 60 | pub use compression::RestartableGzDecoder; 61 | 62 | #[cfg(any(feature = "thermo", feature = "doc-only"))] 63 | pub mod thermo; 64 | 65 | #[cfg(any(feature = "thermo", feature = "doc-only"))] 66 | pub use thermo::ThermoRawReader; 67 | 68 | #[cfg(all( 69 | feature = "async_partial", 70 | any(feature = "thermo", feature = "doc-only") 71 | ))] 72 | pub use thermo::AsyncThermoRawReader; 73 | 74 | #[cfg(feature = "async_partial")] 75 | pub use infer_format::{AsyncMZReader, AsyncMZReaderBuilder, AsyncMZReaderType}; 76 | 77 | #[cfg(feature = "bruker_tdf")] 78 | pub mod tdf; 79 | 80 | pub mod usi; 81 | 82 | 83 | // A location to re-export the symbols needed to make mz_read and 84 | // mz_write macros behave properly in other crates. 85 | #[doc(hidden)] 86 | pub mod _impl { 87 | pub use super::shorthand::*; 88 | 89 | } -------------------------------------------------------------------------------- /src/io/mzml.rs: -------------------------------------------------------------------------------- 1 | //! Implements a parser for the PSI-MS mzML and indexedmzML XML file formats 2 | //! for representing raw and processed mass spectra, providing a 3 | //! [`RandomAccessSpectrumIterator`](crate::io::traits::RandomAccessSpectrumIterator) 4 | //! interface for reading, and [`SpectrumWriter`](crate::io::traits::SpectrumWriter) 5 | //! interface for writing. 6 | //! 7 | //! **Requires the `mzml` feature, enabled by default** 8 | //! 9 | //! 
The mzML format is standardized by the Proteomics Standards Initiative (PSI), which 10 | //! defines and maintains its formal schema and controlled vocabulary. 11 | //! 12 | //! This crate supports both reading and writing (indexed) mzML documents with spectra 13 | //! of varying degrees of complexity (raw profiles, centroids, processed centroids), though 14 | //! extensive customization of the coercion process relies on the [`BuildFromArrayMap`](crate::spectrum::bindata::BuildFromArrayMap) and 15 | //! [`BuildArrayMapFrom`](crate::spectrum::bindata::BuildArrayMapFrom) traits 16 | //! for reading and writing conversion to [`BinaryArrayMap`](crate::spectrum::bindata::BinaryArrayMap). 17 | #![cfg(feature = "mzml")] 18 | mod reader; 19 | mod reading_shared; 20 | mod writer; 21 | 22 | #[cfg(feature = "async")] 23 | mod async_reader; 24 | 25 | pub use reading_shared::{ 26 | CVParamParse, MzMLParserError, MzMLParserState, MzMLSAX, XMLParseBase, 27 | FileMetadataBuilder, EntryType 28 | }; 29 | 30 | #[allow(unused)] 31 | pub(crate) use reading_shared::{IncrementingIdMap, ParserResult}; 32 | 33 | pub use crate::io::mzml::reader::{ 34 | MzMLReader, MzMLReaderType, MzMLSpectrumBuilder, 35 | SpectrumBuilding, 36 | }; 37 | 38 | pub(crate) use crate::io::mzml::reader::is_mzml; 39 | 40 | pub use crate::io::mzml::writer::{MzMLWriter, MzMLWriterState, MzMLWriterType, MzMLWriterError}; 41 | 42 | #[cfg(feature = "async")] 43 | pub use crate::io::mzml::async_reader::{ 44 | MzMLReader as AsyncMzMLReader, MzMLReaderType as AsyncMzMLReaderType, 45 | }; 46 | -------------------------------------------------------------------------------- /src/io/mzmlb.rs: -------------------------------------------------------------------------------- 1 | //! Implements a parser for the mzMLb file format 2 | //! for representing raw and processed mass spectra, providing a 3 | //! [`RandomAccessSpectrumIterator`](crate::io::traits::RandomAccessSpectrumIterator) 4 | //! interface for reading, and [`SpectrumWriter`](crate::io::traits::SpectrumWriter) 5 | //! interface for writing. 6 | //! 7 | //! **Requires the `mzmlb` feature** 8 | //! 9 | //! The mzMLb format embeds a variant of the mzML format within an HDF5 file, storing 10 | //! the spectrum metadata in XML and data arrays in separate datasets in the same file. 11 | //! 12 | //! This crate supports both reading and writing (indexed) mzML documents with spectra 13 | //! of varying degrees of complexity (raw profiles, centroids, processed centroids), though 14 | //! extensive customization of the coercion process relies on the [`BuildFromArrayMap`](crate::spectrum::bindata::BuildFromArrayMap) and 15 | //! [`BuildArrayMapFrom`](crate::spectrum::bindata::BuildArrayMapFrom) traits 16 | //! for reading and writing conversion to [`BinaryArrayMap`](crate::spectrum::bindata::BinaryArrayMap).
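//!
//! Reading mirrors the mzML interface (a minimal sketch; it assumes the [`MZFileReader`](crate::io::traits::MZFileReader)-style `open_path` constructor applies to this type as well, and uses this repository's `small.mzMLb` test file):
//!
//! ```no_run
//! use mzdata::prelude::*;
//! use mzdata::io::mzmlb::MzMLbReader;
//!
//! # fn main() -> std::io::Result<()> {
//! let mut reader = MzMLbReader::open_path("./test/data/small.mzMLb")?;
//! let scan = reader.get_spectrum_by_index(0).unwrap();
//! println!("Scan {} with {} data points", scan.id(), scan.peaks().len());
//! # Ok(())
//! # }
//! ```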
17 | 18 | mod reader; 19 | mod common; 20 | mod writer; 21 | 22 | pub use reader::{MzMLbReader, MzMLbError, MzMLbReaderType, MzMLbSpectrumBuilder}; 23 | pub use writer::{MzMLbWriterType, MzMLbWriterError, MzMLbWriterBuilder, MzMLbWriter}; -------------------------------------------------------------------------------- /src/io/mzmlb/common.rs: -------------------------------------------------------------------------------- 1 | use hdf5; 2 | use hdf5::types::{TypeDescriptor, IntSize, FloatSize}; 3 | 4 | use crate::spectrum::BinaryDataArrayType; 5 | 6 | impl From<&TypeDescriptor> for BinaryDataArrayType { 7 | fn from(value: &TypeDescriptor) -> Self { 8 | match value { 9 | TypeDescriptor::Integer(z) => { 10 | match z { 11 | IntSize::U1 => BinaryDataArrayType::Unknown, 12 | IntSize::U2 => BinaryDataArrayType::Unknown, 13 | IntSize::U4 => BinaryDataArrayType::Int32, 14 | IntSize::U8 => BinaryDataArrayType::Int64, 15 | } 16 | }, 17 | TypeDescriptor::Unsigned(z) => { 18 | match z { 19 | IntSize::U1 => BinaryDataArrayType::Unknown, 20 | IntSize::U2 => BinaryDataArrayType::Unknown, 21 | IntSize::U4 => BinaryDataArrayType::Int32, 22 | IntSize::U8 => BinaryDataArrayType::Int64, 23 | } 24 | }, 25 | TypeDescriptor::Float(z) => { 26 | match z { 27 | FloatSize::U4 => BinaryDataArrayType::Float32, 28 | FloatSize::U8 => BinaryDataArrayType::Float64, 29 | } 30 | }, 31 | TypeDescriptor::Boolean => BinaryDataArrayType::Unknown, 32 | TypeDescriptor::Enum(_) => BinaryDataArrayType::Unknown, 33 | TypeDescriptor::Compound(_) => BinaryDataArrayType::Unknown, 34 | TypeDescriptor::FixedArray(_, _) => BinaryDataArrayType::Unknown, 35 | TypeDescriptor::FixedAscii(_) => { 36 | BinaryDataArrayType::ASCII 37 | }, 38 | TypeDescriptor::FixedUnicode(_) => { 39 | BinaryDataArrayType::ASCII 40 | }, 41 | TypeDescriptor::VarLenArray(_) => todo!(), 42 | TypeDescriptor::VarLenAscii => BinaryDataArrayType::Unknown, 43 | TypeDescriptor::VarLenUnicode => BinaryDataArrayType::Unknown, 44 | } 45 | } 46 | } 47 | 48 | 49 | impl From<&BinaryDataArrayType> for TypeDescriptor { 50 | fn from(value: &BinaryDataArrayType) -> Self { 51 | match value { 52 | BinaryDataArrayType::Unknown => TypeDescriptor::Unsigned(IntSize::U1), 53 | BinaryDataArrayType::Float64 => TypeDescriptor::Float(FloatSize::U8), 54 | BinaryDataArrayType::Float32 => TypeDescriptor::Float(FloatSize::U4), 55 | BinaryDataArrayType::Int64 => TypeDescriptor::Integer(IntSize::U8), 56 | BinaryDataArrayType::Int32 => TypeDescriptor::Integer(IntSize::U4), 57 | BinaryDataArrayType::ASCII => TypeDescriptor::Unsigned(IntSize::U1), 58 | } 59 | } 60 | } 61 | 62 | 63 | impl From<hdf5::Datatype> for BinaryDataArrayType { 64 | fn from(value: hdf5::Datatype) -> Self { 65 | match value.size() { 66 | 1 => Self::ASCII, 67 | 4 => { 68 | if value.is::<i32>() { 69 | Self::Int32 70 | } else if value.is::<f32>() { 71 | Self::Float32 72 | } else { 73 | Self::Unknown 74 | } 75 | } 76 | 8 => { 77 | if value.is::<i64>() { 78 | Self::Int64 79 | } else if value.is::<f64>() { 80 | Self::Float64 81 | } else { 82 | Self::Unknown 83 | } 84 | } 85 | _ => Self::Unknown, 86 | } 87 | } 88 | } -------------------------------------------------------------------------------- /src/io/offset_index.rs: -------------------------------------------------------------------------------- 1 | #[allow(unused)] 2 | use std::io::prelude::*; 3 | 4 | use indexmap::map::{Iter, Keys}; 5 | use indexmap::IndexMap; 6 | 7 | 8 | /** 9 | An ordered mapping from entity ID to byte offset into the source 10 | file it resides in.
11 | 12 | A wrapper around [`indexmap::IndexMap`]. 13 | */ 14 | #[derive(Default, Debug, Clone)] 15 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 16 | pub struct OffsetIndex { 17 | /// The name of the index. There may potentially be more than one 18 | /// index per file 19 | pub name: String, 20 | 21 | /// The mapping from ID to byte offset, ordered by occurrence 22 | // If using serde_json to save this, use the `indexmap::map::serde_seq` adapter below 23 | #[cfg_attr(feature = "serde", serde(with = "indexmap::map::serde_seq"))] 24 | pub offsets: IndexMap<Box<str>, u64>, 25 | 26 | /// Whether the index has been initialized explicitly or not, as 27 | /// it may be initially empty or read as empty. 28 | pub init: bool, 29 | } 30 | 31 | impl OffsetIndex { 32 | pub fn new(name: String) -> OffsetIndex { 33 | OffsetIndex { 34 | name, 35 | ..Default::default() 36 | } 37 | } 38 | 39 | /// Get the offset of the specified key 40 | #[inline] 41 | pub fn get(&self, key: &str) -> Option<u64> { 42 | self.offsets.get(key).copied() 43 | } 44 | 45 | /// Get the associated key and offset for the specified index position 46 | #[inline] 47 | pub fn get_index(&self, index: usize) -> Option<(&str, u64)> { 48 | if let Some((key, offset)) = self.offsets.get_index(index) { 49 | Some((key, *offset)) 50 | } else { 51 | None 52 | } 53 | } 54 | 55 | /// Get the position in the index for a specific key 56 | #[inline] 57 | pub fn index_of(&self, key: &str) -> Option<usize> { 58 | self.offsets.get_index_of(key) 59 | } 60 | 61 | /// Insert `key` into the index with an offset value 62 | #[inline] 63 | pub fn insert<T: Into<Box<str>>>(&mut self, key: T, offset: u64) -> Option<u64> { 64 | self.offsets.insert(key.into(), offset) 65 | } 66 | 67 | #[inline] 68 | pub fn len(&self) -> usize { 69 | self.offsets.len() 70 | } 71 | 72 | pub fn is_empty(&self) -> bool { 73 | self.offsets.is_empty() 74 | } 75 | 76 | pub fn keys(&self) -> Keys<Box<str>, u64> { 77 | self.offsets.keys() 78 | } 79 | 80 | /// Iterate over the keys and indices 81 | pub fn iter(&self) -> Iter<Box<str>, u64> { 82 | self.offsets.iter() 83 | } 84 | 85 | /// Check if the key is in the index 86 | #[inline] 87 | pub fn contains_key(&self, key: &str) -> bool { 88 | self.offsets.contains_key(key) 89 | } 90 | 91 | #[cfg(feature = "serde")] 92 | /// Write the index out in JSON format to `writer` 93 | pub fn to_writer<W: Write>(&self, writer: W) -> serde_json::Result<()> { 94 | serde_json::to_writer(writer, self) 95 | } 96 | 97 | #[cfg(feature = "serde")] 98 | /// Read an index in JSON format from `reader` 99 | pub fn from_reader<R: Read>(reader: R) -> serde_json::Result<Self> { 100 | serde_json::from_reader(reader) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/io/tdf/arrays.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | iter::FromIterator, 3 | ops::{Range, RangeBounds}, 4 | }; 5 | 6 | use mzpeaks::{feature::Feature, IonMobility, MZPeakSetType, MZ}; 7 | use timsrust::{converters::ConvertableDomain, Metadata}; 8 | 9 | use crate::{ 10 | mzpeaks::{CentroidPeak, PeakSet}, 11 | params::Unit, 12 | prelude::*, 13 | spectrum::{ 14 | bindata::{ArrayRetrievalError, BinaryArrayMap3D}, 15 | ArrayType, BinaryArrayMap, BinaryDataArrayType, DataArray, 16 | }, 17 | }; 18 | 19 | use mzsignal::feature_mapping::{FeatureGraphBuilder, IMMSMapExtracter}; 20 | 21 | pub struct FrameToArraysMapper<'a> { 22 | frame: &'a timsrust::Frame, 23 | metadata: &'a Metadata, 24 | } 25 | 26 | impl<'a> FrameToArraysMapper<'a> { 27 | pub fn new(frame: &'a timsrust::Frame, metadata: &'a
Metadata) -> Self { 28 | Self { frame, metadata } 29 | } 30 | 31 | pub fn process_3d_slice(&self, iv: impl RangeBounds<usize>) -> BinaryArrayMap3D { 32 | let n_scans = self.frame.scan_offsets.len(); 33 | 34 | let first_scan = match iv.start_bound() { 35 | std::ops::Bound::Included(i) => *i, 36 | std::ops::Bound::Excluded(i) => *i, 37 | std::ops::Bound::Unbounded => 0, 38 | }; 39 | 40 | let final_scan = match iv.end_bound() { 41 | std::ops::Bound::Included(i) => *i, 42 | std::ops::Bound::Excluded(i) => *i + 1, 43 | std::ops::Bound::Unbounded => n_scans, 44 | } 45 | .min(n_scans); 46 | 47 | let mut im_dimension = Vec::with_capacity(final_scan - first_scan + 1); 48 | let mut arrays = Vec::with_capacity(final_scan - first_scan + 1); 49 | 50 | let mut scan_begin = first_scan; 51 | for (i, mut scan_end) in self.frame.scan_offsets[first_scan..final_scan] 52 | .iter() 53 | .copied() 54 | .enumerate() 55 | { 56 | if scan_begin > self.frame.tof_indices.len() { 57 | break; 58 | } 59 | if scan_end > self.frame.tof_indices.len() { 60 | log::warn!( 61 | "Limiting scan_end {scan_end} for index {i} ({}, {})", 62 | self.frame.tof_indices.len(), 63 | self.frame.intensities.len() 64 | ); 65 | scan_end = self.frame.tof_indices.len(); 66 | } 67 | let width = scan_end.saturating_sub(scan_begin); 68 | 69 | let mut mz_array_bytes: Vec<u8> = 70 | Vec::with_capacity(width * BinaryDataArrayType::Float64.size_of()); 71 | let mut intensity_array_bytes: Vec<u8> = 72 | Vec::with_capacity(width * BinaryDataArrayType::Float32.size_of()); 73 | 74 | self.frame.tof_indices[scan_begin..scan_begin + width] 75 | .iter() 76 | .for_each(|tof_idx| { 77 | mz_array_bytes.extend_from_slice( 78 | &self.metadata.mz_converter.convert(*tof_idx).to_le_bytes(), 79 | ) 80 | }); 81 | (scan_begin..(scan_begin + width)).for_each(|idx| { 82 | intensity_array_bytes.extend_from_slice( 83 | &((self.frame.intensities[idx] as u64) as f32).to_le_bytes(), 84 | ); 85 | }); 86 | let drift = self.metadata.im_converter.convert((i + first_scan) as u32); 87 | im_dimension.push(drift); 88 | 89 | let mz_array = DataArray::wrap( 90 | &ArrayType::MZArray, 91 | BinaryDataArrayType::Float64, 92 | mz_array_bytes, 93 | ); 94 | let intensity_array = DataArray::wrap( 95 | &ArrayType::IntensityArray, 96 | BinaryDataArrayType::Float32, 97 | intensity_array_bytes, 98 | ); 99 | 100 | let mut arrays_at = BinaryArrayMap::new(); 101 | arrays_at.add(mz_array); 102 | arrays_at.add(intensity_array); 103 | arrays.push(arrays_at); 104 | scan_begin = scan_end; 105 | } 106 | 107 | // We read out the IM dimension in descending order, here we reverse it to be in ascending 108 | // IM order 109 | im_dimension.reverse(); 110 | arrays.reverse(); 111 | 112 | BinaryArrayMap3D::from_ion_mobility_dimension_and_arrays( 113 | im_dimension, 114 | ArrayType::MeanInverseReducedIonMobilityArray, 115 | Unit::VoltSecondPerSquareCentimeter, 116 | arrays, 117 | ) 118 | } 119 | } 120 | 121 | pub fn consolidate_peaks<C: CentroidLike + From<CentroidPeak>>( 122 | arrays: &BinaryArrayMap3D, 123 | scan_range: &Range<u32>, 124 | metadata: &Metadata, 125 | error_tolerance: Tolerance, 126 | ) -> Result<MZPeakSetType<C>, ArrayRetrievalError> { 127 | let peaks: Result<Vec<(f64, PeakSet)>, ArrayRetrievalError> = scan_range 128 | .clone() 129 | .rev() 130 | .map(|i| -> Result<(f64, PeakSet), ArrayRetrievalError> { 131 | let im = metadata.im_converter.convert(i); 132 | if let Some(arrays_point) = arrays.get_ion_mobility(im) { 133 | let mzs = arrays_point.mzs()?; 134 | let intens = arrays_point.intensities()?; 135 | let peaks: PeakSet = mzs 136 | .iter() 137 | .copied() 138 | .zip(intens.iter().copied())
139 | .map(|(mz, i)| CentroidPeak::new(mz, i, 0)) 140 | .collect(); 141 | Ok((im, peaks)) 142 | } else { 143 | Ok((im, PeakSet::empty())) 144 | } 145 | }) 146 | .collect(); 147 | 148 | let peaks = peaks?; 149 | if peaks.is_empty() { 150 | return Ok(MZPeakSetType::empty()); 151 | } 152 | 153 | if peaks.len() == 1 { 154 | return Ok(peaks 155 | .into_iter() 156 | .next() 157 | .unwrap() 158 | .1 159 | .into_iter() 160 | .map(|p| p.into()) 161 | .collect()); 162 | } 163 | 164 | let mut extracter = IMMSMapExtracter::from_iter(peaks); 165 | let features = extracter.extract_features(error_tolerance, 2, 0.01); 166 | let merger = mzsignal::feature_mapping::FeatureMerger::< 167 | MZ, 168 | IonMobility, 169 | Feature<MZ, IonMobility>, 170 | >::default(); 171 | let features = merger 172 | .bridge_feature_gaps(features, error_tolerance, f64::INFINITY) 173 | .features; 174 | 175 | let peaks: MZPeakSetType<C> = features 176 | .iter() 177 | .map(|f| CentroidPeak::new(f.mz(), f.intensity(), 0).into()) 178 | .collect(); 179 | 180 | Ok(peaks) 181 | } 182 | -------------------------------------------------------------------------------- /src/io/tdf/constants.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | meta::{Component, ComponentType, InletTypeTerm, IonizationTypeTerm}, 3 | params::ParamDescribed, 4 | }; 5 | 6 | #[allow(non_camel_case_types, clippy::upper_case_acronyms)] 7 | #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] 8 | pub enum InstrumentSource { 9 | AlsoUnknown = 0, 10 | ESI = 1, 11 | APCI = 2, 12 | NanoESIOffline = 3, 13 | NanoESIOnline = 4, 14 | APPI = 5, 15 | AP_MALDI = 6, 16 | MALDI = 7, 17 | MultiMode = 8, 18 | NanoFlowESI = 9, 19 | Ultraspray = 10, 20 | CaptiveSpray = 11, 21 | EI = 16, 22 | GC_APCI = 17, 23 | VIP_HESI = 18, 24 | VIP_APCI = 19, 25 | #[default] 26 | Unknown = 255, 27 | } 28 | 29 | impl InstrumentSource { 30 | pub fn to_component(self) -> Component { 31 | let mut comp = Component { 32 | order: 1, 33 | component_type: ComponentType::IonSource, 34 | ..Default::default() 35 | }; 36 | 37 | match self { 38 | Self::ESI | Self::MultiMode | Self::Ultraspray | Self::VIP_HESI => { 39 | comp.add_param(IonizationTypeTerm::ElectrosprayIonization.into()); 40 | comp.add_param(InletTypeTerm::ElectrosprayInlet.into()); 41 | } 42 | Self::NanoESIOffline | Self::NanoESIOnline | Self::NanoFlowESI | Self::CaptiveSpray => { 43 | comp.add_param(IonizationTypeTerm::Nanoelectrospray.into()); 44 | comp.add_param(InletTypeTerm::NanosprayInlet.into()); 45 | } 46 | Self::APCI | Self::GC_APCI | Self::VIP_APCI => { 47 | comp.add_param(IonizationTypeTerm::AtmosphericPressureChemicalIonization.into()); 48 | } 49 | Self::APPI => { 50 | comp.add_param(IonizationTypeTerm::AtmosphericPressurePhotoionization.into()); 51 | } 52 | Self::EI => { 53 | comp.add_param(IonizationTypeTerm::ElectronIonization.into()); 54 | } 55 | Self::MALDI => { 56 | comp.add_param(IonizationTypeTerm::MatrixAssistedLaserDesorptionIonization.into()); 57 | } 58 | Self::AP_MALDI => { 59 | comp.add_param( 60 | IonizationTypeTerm::AtmosphericPressureMatrixAssistedLaserDesorptionIonization 61 | .into(), 62 | ); 63 | } 64 | Self::AlsoUnknown | Self::Unknown => {} 65 | } 66 | 67 | comp 68 | } 69 | } 70 | 71 | impl From<u8> for InstrumentSource { 72 | fn from(value: u8) -> Self { 73 | match value { 74 | 0 => Self::AlsoUnknown, 75 | 1 => Self::ESI, 76 | 2 => Self::APCI, 77 | 3 => Self::NanoESIOffline, 78 | 4 => Self::NanoESIOnline, 79 | 5 => Self::APPI, 80 | 6 => Self::AP_MALDI, 81 | 7 => Self::MALDI, 82 |
8 => Self::MultiMode, 83 | 9 => Self::NanoFlowESI, 84 | 10 => Self::Ultraspray, 85 | 11 => Self::CaptiveSpray, 86 | 16 => Self::EI, 87 | 17 => Self::GC_APCI, 88 | 18 => Self::VIP_HESI, 89 | 19 => Self::VIP_APCI, 90 | 255 => Self::Unknown, 91 | _ => Self::Unknown, 92 | } 93 | } 94 | } 95 | 96 | 97 | #[allow(clippy::upper_case_acronyms)] 98 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 99 | pub enum MsMsType { 100 | MS1 = 0, 101 | MRM = 2, 102 | DDAPASEF = 8, 103 | DIAPASEF = 9, 104 | PRMPASEF = 10, 105 | 106 | Unknown = -1, 107 | } 108 | 109 | impl MsMsType { 110 | pub const fn ms_level(&self) -> u8 { 111 | match self { 112 | MsMsType::MS1 => 1, 113 | MsMsType::MRM => 2, 114 | MsMsType::DDAPASEF => 2, 115 | MsMsType::DIAPASEF => 2, 116 | MsMsType::PRMPASEF => 2, 117 | MsMsType::Unknown => 0, 118 | } 119 | } 120 | } 121 | 122 | impl From<u8> for MsMsType { 123 | fn from(value: u8) -> Self { 124 | match value { 125 | 0 => Self::MS1, 126 | 2 => Self::MRM, 127 | 8 => Self::DDAPASEF, 128 | 9 => Self::DIAPASEF, 129 | 10 => Self::PRMPASEF, 130 | _ => Self::Unknown, 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/io/tdf/mod.rs: -------------------------------------------------------------------------------- 1 | //! Reader implementation for Bruker's TDF data files, [`TDFFrameReaderType`] for ion mobility frames 2 | //! and [`TDFSpectrumReaderType`] for summed or sliced spectra. 3 | //! 4 | //! **Requires the `bruker_tdf` feature** 5 | //! 6 | //! Depends upon the [`timsrust`] library, a cross-platform, pure Rust implementation of the Bruker-specific 7 | //! file reading behaviors, and [`rusqlite`] for reading the SQLite3 .tdf files. 8 | mod constants; 9 | mod arrays; 10 | mod sql; 11 | mod reader; 12 | 13 | pub use reader::{TDFFrameReader, TDFFrameReaderType, TDFSpectrumReader, TDFSpectrumReaderType, is_tdf}; 14 | pub use sql::{ChromatographyData, SQLTrace}; -------------------------------------------------------------------------------- /src/io/thermo.rs: -------------------------------------------------------------------------------- 1 | //! Reader implementation for Thermo RAW files, [`ThermoRawReaderType`]. 2 | //! 3 | //! **Requires the `thermo` feature** 4 | //! 5 | //! Depends upon the [`thermorawfilereader`] crate which manages the self-hosted `.NET` 6 | //! runtime. You must still have a working [`.NET 8`](https://dotnet.microsoft.com/en-us/download/dotnet/8.0) runtime installed on the machine you 7 | //! wish to run this on until Thermo's library supports .NET ahead-of-time compilation. For scripted installation of the .NET runtime, 8 | //! see Microsoft's `dotnet-install` scripts documentation. 9 | //! 10 | //! ```no_run 11 | //! use std::io; 12 | //! 13 | //! use mzdata::prelude::*; 14 | //! use mzdata::io::ThermoRawReader; 15 | //! 16 | //! # fn main() -> io::Result<()> { 17 | //! let mut reader = ThermoRawReader::open_path("./test/data/small.RAW")?; 18 | //! let scan = reader.get_spectrum_by_index(0).unwrap(); 19 | //! assert_eq!(scan.index(), 0); 20 | //! assert_eq!(reader.len(), 48); 21 | //! # Ok(()) 22 | //! # } 23 | //! ``` 24 | //! # Licensing 25 | //! By using this library, you agree to the [RawFileReader License](https://github.com/thermofisherlsms/RawFileReader/blob/main/License.doc) 26 | //!
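//! With the `async` feature, the same operations are exposed through [`AsyncThermoRawReader`](crate::io::thermo::AsyncThermoRawReader) (a minimal sketch under a tokio runtime, mirroring the synchronous example above):
//!
//! ```no_run
//! use mzdata::prelude::*;
//! # #[cfg(feature = "async")]
//! # async fn demo() -> std::io::Result<()> {
//! use mzdata::io::AsyncMZFileReader;
//! use mzdata::io::thermo::AsyncThermoRawReader;
//!
//! let mut reader = AsyncThermoRawReader::open_path("./test/data/small.RAW").await?;
//! while let Some(scan) = reader.read_next().await {
//!     println!("{} (MS{})", scan.id(), scan.ms_level());
//! }
//! # Ok(())
//! # }
//! ```
//!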
27 | mod instruments; 28 | mod reader; 29 | 30 | pub use reader::{is_thermo_raw_prefix, ThermoRawReader, ThermoRawReaderType}; 31 | 32 | #[cfg(feature = "async")] 33 | mod async_reader; 34 | #[cfg(feature = "async")] 35 | pub use async_reader::{ 36 | ThermoRawReader as AsyncThermoRawReader, ThermoRawReaderType as AsyncThermoRawReaderType, 37 | }; 38 | -------------------------------------------------------------------------------- /src/io/thermo/async_reader.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::path::PathBuf; 3 | 4 | use futures::stream; 5 | use mzpeaks::{CentroidPeak, DeconvolutedPeak}; 6 | use tokio; 7 | 8 | use super::ThermoRawReaderType as SyncThermoRawReaderType; 9 | use crate::{ 10 | io::{ 11 | traits::{AsyncMZFileReader, AsyncRandomAccessSpectrumIterator, SpectrumStream}, 12 | DetailLevel, 13 | }, 14 | prelude::*, 15 | spectrum::MultiLayerSpectrum, 16 | }; 17 | 18 | pub struct ThermoRawReaderType< 19 | C: CentroidLike + From<CentroidPeak> + Send = CentroidPeak, 20 | D: DeconvolutedCentroidLike + Send = DeconvolutedPeak, 21 | > { 22 | inner: Option<SyncThermoRawReaderType<C, D>>, 23 | } 24 | 25 | #[cfg(feature = "async")] 26 | impl<C: CentroidLike + From<CentroidPeak> + Send + 'static, D: DeconvolutedCentroidLike + Send + 'static> 27 | AsyncMZFileReader<MultiLayerSpectrum<C, D>> for ThermoRawReaderType<C, D> 28 | { 29 | async fn construct_index_from_stream(&mut self) -> u64 { 30 | self.len() as u64 31 | } 32 | 33 | /// The underlying Thermo library requires an explicit file system path to open 34 | /// the file, and as such this method always fails. 35 | /// 36 | /// [`open_path`](Self::open_path) works as normal. 37 | #[allow(unused)] 38 | async fn open_file(source: tokio::fs::File) -> io::Result<Self> { 39 | Err(io::Error::new( 40 | io::ErrorKind::Unsupported, 41 | "Cannot read a Thermo RAW file from an open file handle, only directly from a path", 42 | )) 43 | } 44 | 45 | async fn open_path<P>

(path: P) -> io::Result<Self> 46 | where 47 | P: Into<PathBuf>, 48 | { 49 | Self::new(path.into()).await 50 | } 51 | } 52 | 53 | impl<C: CentroidLike + From<CentroidPeak> + Send, D: DeconvolutedCentroidLike + Send> MSDataFileMetadata 54 | for ThermoRawReaderType<C, D> 55 | { 56 | crate::delegate_impl_metadata_trait!(expr, this => { this.inner.as_ref().unwrap() }, &mut => { this.inner.as_mut().unwrap() }); 57 | } 58 | 59 | impl<C: CentroidLike + From<CentroidPeak> + Send + 'static, D: DeconvolutedCentroidLike + Send + 'static> 60 | AsyncSpectrumSource<MultiLayerSpectrum<C, D>> for ThermoRawReaderType<C, D> 61 | { 62 | fn reset(&mut self) -> impl std::future::Future<Output = ()> { 63 | self.inner.as_mut().unwrap().reset(); 64 | futures::future::ready(()) 65 | } 66 | 67 | fn detail_level(&self) -> &DetailLevel { 68 | &self.inner.as_ref().unwrap().detail_level 69 | } 70 | 71 | fn set_detail_level(&mut self, detail_level: DetailLevel) { 72 | self.inner.as_mut().unwrap().set_detail_level(detail_level) 73 | } 74 | 75 | fn get_spectrum_by_id( 76 | &mut self, 77 | id: &str, 78 | ) -> impl std::future::Future<Output = Option<MultiLayerSpectrum<C, D>>> { 79 | self.get_spectrum_by_id(id) 80 | } 81 | 82 | fn get_spectrum_by_index( 83 | &mut self, 84 | index: usize, 85 | ) -> impl std::future::Future<Output = Option<MultiLayerSpectrum<C, D>>> { 86 | self.get_spectrum_by_index(index) 87 | } 88 | 89 | fn get_index(&self) -> &crate::io::OffsetIndex { 90 | self.get_index() 91 | } 92 | 93 | fn set_index(&mut self, index: crate::io::OffsetIndex) { 94 | self.inner.as_mut().unwrap().set_index(index); 95 | } 96 | 97 | fn read_next(&mut self) -> impl std::future::Future<Output = Option<MultiLayerSpectrum<C, D>>> { 98 | self.read_next() 99 | } 100 | 101 | async fn get_spectrum_by_time(&mut self, time: f64) -> Option<MultiLayerSpectrum<C, D>> { 102 | self.get_spectrum_by_time(time).await 103 | } 104 | } 105 | 106 | impl<C: CentroidLike + From<CentroidPeak> + Send + 'static, D: DeconvolutedCentroidLike + Send + 'static> 107 | ThermoRawReaderType<C, D> 108 | { 109 | /// Create a new [`ThermoRawReaderType`] from a path. 110 | /// This may trigger an expensive I/O operation to checksum the file. 111 | pub async fn new<P: Into<PathBuf> + 'static + Send>(path: P) -> io::Result<Self> { 112 | Self::new_with_detail_level_and_centroiding(path, DetailLevel::Full, false).await 113 | } 114 | 115 | /// Create a new [`ThermoRawReaderType`] from a path. 116 | /// This may trigger an expensive I/O operation to checksum the file. 117 | pub async fn new_with_detail_level_and_centroiding<P: Into<PathBuf> + Send + 'static>( 118 | path: P, 119 | detail_level: DetailLevel, 120 | centroiding: bool, 121 | ) -> io::Result<Self> { 122 | tokio::task::spawn_blocking(move || { 123 | let inner = SyncThermoRawReaderType::new_with_detail_level_and_centroiding( 124 | path, 125 | detail_level, 126 | centroiding, 127 | )?; 128 | let this = Self { inner: Some(inner) }; 129 | Ok(this) 130 | }) 131 | .await 132 | .unwrap() 133 | } 134 | 135 | pub fn len(&self) -> usize { 136 | self.inner.as_ref().unwrap().len() 137 | } 138 | 139 | pub fn is_empty(&self) -> bool { 140 | self.inner.as_ref().unwrap().is_empty() 141 | } 142 | 143 | pub fn get_centroiding(&self) -> bool { 144 | self.inner.as_ref().unwrap().get_centroiding() 145 | } 146 | 147 | pub fn set_centroiding(&mut self, value: bool) { 148 | self.inner.as_mut().unwrap().set_centroiding(value) 149 | } 150 | 151 | 152 | /// Get whether or not to load extended spectrum signal information for the spectrum. 153 | /// 154 | /// The loaded data isn't incorporated into a peak list; instead, access it under 155 | /// the binary data arrays. 156 | pub fn get_load_extended_spectrum_data(&self) -> bool { 157 | self.inner.as_ref().unwrap().get_load_extended_spectrum_data() 158 | } 159 | 160 | /// Set whether or not to load extended spectrum signal information for the spectrum.
161 | /// 162 | /// The loaded data isn't incorporated into a peak list; instead, access it under 163 | /// the binary data arrays. 164 | pub fn set_load_extended_spectrum_data(&mut self, load_extended_spectrum_data: bool) { 165 | self.inner.as_mut().unwrap().set_load_extended_spectrum_data(load_extended_spectrum_data) 166 | } 167 | 168 | 169 | pub fn get_index(&self) -> &crate::io::OffsetIndex { 170 | self.inner.as_ref().unwrap().get_index() 171 | } 172 | 173 | pub fn as_stream(&mut self) -> impl SpectrumStream<MultiLayerSpectrum<C, D>> + '_ { 174 | Box::pin(stream::unfold(self, |reader| async { 175 | let spec = reader.read_next(); 176 | spec.await.map(|val| (val, reader)) 177 | })) 178 | } 179 | 180 | pub async fn read_next(&mut self) -> Option<MultiLayerSpectrum<C, D>> { 181 | let mut inner = self.inner.take().unwrap(); 182 | let (inner, spec) = tokio::task::spawn_blocking(move || { 183 | let spec = inner.read_next_spectrum(); 184 | (inner, spec) 185 | }) 186 | .await 187 | .unwrap(); 188 | self.inner = Some(inner); 189 | spec 190 | } 191 | 192 | pub async fn get_spectrum_by_id(&mut self, id: &str) -> Option<MultiLayerSpectrum<C, D>> { 193 | let mut inner = self.inner.take().unwrap(); 194 | let id = id.to_string(); 195 | let (inner, spec) = tokio::task::spawn_blocking(move || { 196 | let spec = inner.get_spectrum_by_id(&id); 197 | (inner, spec) 198 | }) 199 | .await 200 | .unwrap(); 201 | self.inner = Some(inner); 202 | spec 203 | } 204 | 205 | pub async fn get_spectrum_by_index( 206 | &mut self, 207 | index: usize, 208 | ) -> Option<MultiLayerSpectrum<C, D>> { 209 | let mut inner = self.inner.take().unwrap(); 210 | let (inner, spec) = tokio::task::spawn_blocking(move || { 211 | let spec = inner.get_spectrum_by_index(index); 212 | (inner, spec) 213 | }) 214 | .await 215 | .unwrap(); 216 | self.inner = Some(inner); 217 | spec 218 | } 219 | 220 | pub async fn get_spectrum_by_time(&mut self, time: f64) -> Option<MultiLayerSpectrum<C, D>> { 221 | let mut inner = self.inner.take().unwrap(); 222 | let (inner, spec) = tokio::task::spawn_blocking(move || { 223 | let spec = inner.get_spectrum_by_time(time); 224 | (inner, spec) 225 | }) 226 | .await 227 | .unwrap(); 228 | self.inner = Some(inner); 229 | spec 230 | } 231 | } 232 | 233 | pub type ThermoRawReader = ThermoRawReaderType; 234 | 235 | impl< 236 | C: CentroidLike + From<CentroidPeak> + Send + Sync + 'static, 237 | D: DeconvolutedCentroidLike + Send + Sync + 'static, 238 | > AsyncRandomAccessSpectrumIterator<MultiLayerSpectrum<C, D>> 239 | for ThermoRawReaderType<C, D> 240 | { 241 | async fn start_from_id(&mut self, id: &str) -> Result<&mut Self, SpectrumAccessError> { 242 | let mut inner = self.inner.take().unwrap(); 243 | let id = id.to_string(); 244 | let (inner, spec) = tokio::task::spawn_blocking(move || { 245 | let spec = inner.start_from_id(&id); 246 | let res = spec.err(); 247 | (inner, res) 248 | }) 249 | .await 250 | .unwrap(); 251 | if let Some(e) = spec { 252 | return Err(e); 253 | } 254 | self.inner = Some(inner); 255 | Ok(self) 256 | } 257 | 258 | async fn start_from_index(&mut self, index: usize) -> Result<&mut Self, SpectrumAccessError> { 259 | let mut inner = self.inner.take().unwrap(); 260 | let (inner, spec) = tokio::task::spawn_blocking(move || { 261 | let spec = inner.start_from_index(index); 262 | let res = spec.err(); 263 | (inner, res) 264 | }) 265 | .await 266 | .unwrap(); 267 | if let Some(e) = spec { 268 | return Err(e); 269 | } 270 | self.inner = Some(inner); 271 | Ok(self) 272 | } 273 | 274 | async fn start_from_time(&mut self, time: f64) -> Result<&mut Self, SpectrumAccessError> { 275 | let mut inner = self.inner.take().unwrap(); 276 | let (inner, spec) =
tokio::task::spawn_blocking(move || { 277 | let spec = inner.start_from_time(time); 278 | let res = spec.err(); 279 | (inner, res) 280 | }) 281 | .await 282 | .unwrap(); 283 | if let Some(e) = spec { 284 | return Err(e); 285 | } 286 | self.inner = Some(inner); 287 | Ok(self) 288 | } 289 | } 290 | 291 | #[cfg(test)] 292 | mod test { 293 | use super::*; 294 | 295 | #[tokio::test(flavor = "multi_thread", worker_threads = 4)] 296 | async fn test_read() -> io::Result<()> { 297 | let mut reader: ThermoRawReaderType = 298 | ThermoRawReaderType::open_path("./test/data/small.RAW").await?; 299 | 300 | let n = reader.len(); 301 | let mut ms1_counter = 0; 302 | let mut msn_counter = 0; 303 | while let Some(spec) = reader.read_next().await { 304 | if spec.ms_level() > 1 { 305 | msn_counter += 1; 306 | } else { 307 | ms1_counter += 1; 308 | } 309 | } 310 | 311 | assert_eq!(n, ms1_counter + msn_counter); 312 | Ok(()) 313 | } 314 | } 315 | -------------------------------------------------------------------------------- /src/io/traits.rs: -------------------------------------------------------------------------------- 1 | mod chromatogram; 2 | mod frame; 3 | mod spectrum; 4 | mod util; 5 | 6 | pub use spectrum::{ 7 | MZFileReader, MemorySpectrumSource, RandomAccessSpectrumGroupingIterator, 8 | RandomAccessSpectrumIterator, RandomAccessSpectrumSource, SpectrumAccessError, 9 | SpectrumIterator, SpectrumReceiver, SpectrumSource, 10 | SpectrumSourceWithMetadata, SpectrumWriter, StreamingSpectrumIterator, 11 | }; 12 | pub use util::SeekRead; 13 | 14 | pub use frame::{ 15 | BorrowedGeneric3DIonMobilityFrameSource, Generic3DIonMobilityFrameSource, 16 | IonMobilityFrameAccessError, IonMobilityFrameIterator, 17 | IonMobilityFrameSource, IonMobilityFrameWriter, RandomAccessIonMobilityFrameIterator, 18 | RandomAccessIonMobilityFrameGroupingIterator, 19 | IntoIonMobilityFrameSourceError, 20 | IntoIonMobilityFrameSource 21 | }; 22 | 23 | pub use chromatogram::{ChromatogramIterator, ChromatogramSource}; 24 | 25 | pub use crate::spectrum::group::{SpectrumGrouping, IonMobilityFrameGrouping}; 26 | 27 | #[cfg(feature = "async_partial")] 28 | pub use spectrum::{AsyncSpectrumSource, AsyncRandomAccessSpectrumIterator, SpectrumStream}; 29 | 30 | #[cfg(feature = "async")] 31 | pub use spectrum::AsyncMZFileReader; 32 | 33 | #[cfg(test)] 34 | mod test { 35 | use super::*; 36 | 37 | #[test] 38 | fn test_object_safe() { 39 | // If `SpectrumSource` were not object safe, this code 40 | // couldn't compile. 41 | let _f = |_x: &dyn SpectrumSource| {}; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/io/traits/chromatogram.rs: -------------------------------------------------------------------------------- 1 | use std::iter::FusedIterator; 2 | 3 | use crate::spectrum::Chromatogram; 4 | 5 | 6 | /// A trait for retrieving [`Chromatogram`]s from a source. 7 | pub trait ChromatogramSource { 8 | /// Get a [`Chromatogram`] by its identifier, if it exists. 9 | fn get_chromatogram_by_id(&mut self, id: &str) -> Option<Chromatogram>; 10 | 11 | /// Get a [`Chromatogram`] by its index, if it exists.
12 | fn get_chromatogram_by_index(&mut self, index: usize) -> Option<Chromatogram>; 13 | 14 | /// Iterate over [`Chromatogram`]s with a [`ChromatogramIterator`] 15 | fn iter_chromatograms(&mut self) -> ChromatogramIterator<'_, Self> 16 | where 17 | Self: Sized, 18 | { 19 | ChromatogramIterator::new(self) 20 | } 21 | } 22 | 23 | #[derive(Debug)] 24 | pub struct ChromatogramIterator<'a, R: ChromatogramSource> { 25 | source: &'a mut R, 26 | index: usize, 27 | } 28 | 29 | impl<'a, R: ChromatogramSource> ChromatogramIterator<'a, R> { 30 | pub fn new(source: &'a mut R) -> Self { 31 | Self { source, index: 0 } 32 | } 33 | } 34 | 35 | impl<R: ChromatogramSource> Iterator for ChromatogramIterator<'_, R> { 36 | type Item = Chromatogram; 37 | 38 | fn next(&mut self) -> Option<Self::Item> { 39 | if let Some(chrom) = self.source.get_chromatogram_by_index(self.index) { 40 | self.index += 1; 41 | Some(chrom) 42 | } else { 43 | None 44 | } 45 | } 46 | } 47 | 48 | impl<R: ChromatogramSource> FusedIterator for ChromatogramIterator<'_, R> {} -------------------------------------------------------------------------------- /src/io/traits/util.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | pub trait SeekRead: io::Read + io::Seek {} 4 | impl<T: io::Read + io::Seek> SeekRead for T {} -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! `mzdata` provides basic access to raw and processed mass spectrometry data formats in 2 | //! Rust. 3 | //! 4 | //! For a guide, see the [tutorial] section. 5 | //! 6 | //! The library currently supports reading: 7 | //! 1. MGF files using [`MGFReader`] in [`mzdata::io::mgf`](crate::io::mgf) 8 | //! 2. mzML & indexedmzML files using [`MzMLReader`] in [`mzdata::io::mzml`](crate::io::mzml) 9 | //! 3. mzMLb files using [`MzMLbReader`] in [`mzdata::io::mzmlb`](crate::io::mzmlb), if the `mzmlb` feature is enabled 10 | //! 4. Thermo RAW files using [`ThermoRawReader`](crate::io::thermo::ThermoRawReader) in [`mzdata::io::thermo`](crate::io::thermo), if the `thermo` feature is enabled 11 | //! 12 | //! and writing: 13 | //! 1. MGF files using [`MGFWriter`] in [`mzdata::io::mgf`](crate::io::mgf) 14 | //! 2. mzML & indexedmzML files using [`MzMLWriter`] in [`mzdata::io::mzml`](crate::io::mzml) 15 | //! 3. mzMLb files using [`MzMLbWriter`] in [`mzdata::io::mzmlb`](crate::io::mzmlb), if the `mzmlb` feature is enabled 16 | //! 17 | //! Which of these formats a file uses, and whether it is gzip-compressed, can be inferred from a path or [`io::Read`](std::io::Read) using [`io::infer_format`] and [`io::infer_from_stream`]. 18 | //! Conventional dispatch is possible through [`MZReader`]. The [`mz_read`] macro provides a convenient means of working with 19 | //! a value with zero added overhead, but with a limited scope. The [`mz_write`] macro is the equivalent for opening a writer. 20 | //! There are additional tools for dealing with file format dispatch in [`MassSpectrometryReadWriteProcess`](crate::io::MassSpectrometryReadWriteProcess). 21 | //! 22 | //! It also includes a set of representation layers for spectra in [`mzdata::spectrum`](crate::spectrum). 23 | //! 24 | //! # Example 25 | //! ```rust 26 | //! use std::fs; 27 | //! use mzdata::prelude::*; 28 | //! use mzpeaks::Tolerance; 29 | //! use mzdata::MZReader; 30 | //! use mzdata::spectrum::SignalContinuity; 31 | //! 32 | //! let reader = MZReader::open_path("./test/data/small.mzML").unwrap(); 33 | //! for spectrum in reader { 34 | //!
println!("Scan {} => BP {}", spectrum.id(), spectrum.peaks().base_peak().mz); 35 | //! 36 | //! if spectrum.signal_continuity() == SignalContinuity::Centroid { 37 | //! let peak_picked = spectrum.into_centroid().unwrap(); 38 | //! println!("Matches for 579.155: {:?}", 39 | //! peak_picked.peaks.all_peaks_for( 40 | //! 579.155, Tolerance::Da(0.02) 41 | //! ) 42 | //! ); 43 | //! } 44 | //! } 45 | //! ``` 46 | //! 47 | //! It uses [`mzpeaks`] to represent peaks and peak lists, and re-exports the basic types. While the high-level 48 | //! types are templated on simple peak types, more complex, application-specific peak types can be substituted. 49 | //! See [`mzdata::spectrum::bindata`](crate::spectrum::bindata) for more information about how to directly convert 50 | //! data arrays to peak lists. 51 | //! 52 | //! 53 | //! ## Traits 54 | //! The library makes heavy use of traits to abstract over the implementation details of different file formats. 55 | //! These traits are included in [`mzdata::prelude`](crate::prelude). It also imports [`mzpeaks::prelude`]. 56 | //! 57 | //! 58 | pub mod io; 59 | pub mod meta; 60 | #[macro_use] 61 | pub mod params; 62 | pub mod prelude; 63 | pub mod spectrum; 64 | pub mod utils; 65 | 66 | pub use crate::io::{MZReader, MZReaderBuilder}; 67 | #[cfg(feature = "mgf")] 68 | pub use crate::io::mgf::{MGFReader, MGFWriter}; 69 | #[cfg(feature = "mzml")] 70 | pub use crate::io::mzml::{MzMLReader, MzMLWriter}; 71 | 72 | #[cfg(feature = "mzmlb")] 73 | pub use crate::io::mzmlb::{ 74 | MzMLbReader, MzMLbWriter, MzMLbWriterBuilder, 75 | }; 76 | 77 | #[cfg(feature = "thermo")] 78 | pub use crate::io::thermo::ThermoRawReader; 79 | 80 | pub use crate::params::{Param, ParamList}; 81 | 82 | pub use crate::spectrum::{CentroidSpectrum, RawSpectrum, Spectrum}; 83 | 84 | #[cfg(doc)] 85 | pub mod tutorial; 86 | 87 | pub use mzpeaks; 88 | 89 | #[cfg(feature = "mzsignal")] 90 | pub use mzsignal; -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::env; 3 | use std::io; 4 | use std::path; 5 | use std::process; 6 | use std::thread::spawn; 7 | use std::time; 8 | 9 | use std::sync::mpsc::sync_channel; 10 | 11 | use mzdata::io::Source; 12 | use mzdata::prelude::*; 13 | use mzdata::spectrum::{ 14 | DeconvolutedSpectrum, MultiLayerSpectrum, RefPeakDataLevel, SignalContinuity, SpectrumLike, 15 | }; 16 | use mzdata::MZReader; 17 | 18 | struct MSDataFileSummary { 19 | pub start_time: f64, 20 | pub end_time: f64, 21 | pub level_table: HashMap, 22 | pub charge_table: HashMap, 23 | pub peak_charge_table: HashMap>, 24 | pub peak_mode_table: HashMap, 25 | pub has_ion_mobility: bool 26 | } 27 | 28 | impl Default for MSDataFileSummary { 29 | fn default() -> Self { 30 | Self { 31 | start_time: f64::INFINITY, 32 | end_time: f64::NEG_INFINITY, 33 | level_table: Default::default(), 34 | charge_table: Default::default(), 35 | peak_charge_table: Default::default(), 36 | peak_mode_table: Default::default(), 37 | has_ion_mobility: false, 38 | } 39 | } 40 | } 41 | 42 | impl MSDataFileSummary { 43 | pub fn handle_scan(&mut self, scan: MultiLayerSpectrum) { 44 | let time = scan.start_time(); 45 | self.start_time = self.start_time.min(time); 46 | self.end_time = self.end_time.max(time); 47 | let level = scan.ms_level(); 48 | *self.level_table.entry(level).or_default() += 1; 49 | if level > 1 { 50 | if let Some(charge) = 
scan.precursor().unwrap().ion().charge { 51 | *self.charge_table.entry(charge).or_default() += 1; 52 | } else { 53 | *self.charge_table.entry(0).or_default() += 1; 54 | } 55 | } 56 | *self 57 | .peak_mode_table 58 | .entry(scan.signal_continuity()) 59 | .or_default() += scan.peaks().len(); 60 | 61 | let has_charge = match scan.peaks() { 62 | RefPeakDataLevel::Missing => false, 63 | RefPeakDataLevel::RawData(arrays) => arrays.charges().is_ok(), 64 | RefPeakDataLevel::Centroid(_) => false, 65 | RefPeakDataLevel::Deconvoluted(_) => true, 66 | }; 67 | 68 | let has_ion_mobility = match scan.peaks() { 69 | RefPeakDataLevel::RawData(arrays) => arrays.has_ion_mobility(), 70 | _ => false, 71 | } || scan.has_ion_mobility(); 72 | self.has_ion_mobility |= has_ion_mobility; 73 | 74 | if has_charge { 75 | let deconv_scan: DeconvolutedSpectrum = scan.try_into().unwrap(); 76 | deconv_scan.deconvoluted_peaks.iter().for_each(|p| { 77 | *(*self 78 | .peak_charge_table 79 | .entry(deconv_scan.ms_level()) 80 | .or_default()) 81 | .entry(p.charge) 82 | .or_default() += 1; 83 | assert!((p.index as usize) < deconv_scan.deconvoluted_peaks.len()) 84 | }) 85 | } 86 | } 87 | 88 | pub fn _scan_file<R: SpectrumSource>(&mut self, reader: &mut R) { 89 | let start = time::Instant::now(); 90 | reader.enumerate().for_each(|(i, scan)| { 91 | if i % 10000 == 0 && i > 0 { 92 | println!( 93 | "\tScan {}: {} ({:0.3} seconds, {} peaks|points)", 94 | i, 95 | scan.id(), 96 | (time::Instant::now() - start).as_secs_f64(), 97 | self.peak_mode_table.values().sum::<usize>() 98 | ); 99 | } 100 | self.handle_scan(scan); 101 | }); 102 | let end = time::Instant::now(); 103 | let elapsed = end - start; 104 | println!("{:0.3} seconds elapsed", elapsed.as_secs_f64()); 105 | } 106 | 107 | pub fn scan_file<R: SpectrumSource + Send + 'static>(&mut self, reader: R) { 108 | self.scan_file_threaded(reader) 109 | } 110 | 111 | pub fn scan_file_threaded<R: SpectrumSource + Send + 'static>(&mut self, reader: R) { 112 | let start = time::Instant::now(); 113 | let (sender, receiver) = sync_channel(2usize.pow(12)); 114 | let read_handle = spawn(move || { 115 | reader.into_iter() 116 | .enumerate() 117 | .for_each(|(i, scan)| { 118 | sender.send((i, scan)).unwrap() 119 | }); 120 | }); 121 | let i = receiver.iter().fold(0, |_, (i, scan)| { 122 | if i % 10000 == 0 && i > 0 { 123 | println!( 124 | "\tScan {}: {} ({:0.3} seconds, {} peaks|points)", 125 | i, 126 | scan.id(), 127 | (time::Instant::now() - start).as_secs_f64(), 128 | self.peak_mode_table.values().sum::<usize>() 129 | ); 130 | } 131 | self.handle_scan(scan); 132 | i 133 | }); 134 | read_handle.join().unwrap(); 135 | let end = time::Instant::now(); 136 | let elapsed = end - start; 137 | println!("{:0.3} seconds elapsed, handled {i} spectra", elapsed.as_secs_f64()); 138 | } 139 | 140 | pub fn write_out(&self) { 141 | println!("Start Time: {:0.2}", self.start_time); 142 | println!("End Time: {:0.2}", self.end_time); 143 | println!("Has Ion Mobility: {}", self.has_ion_mobility); 144 | println!("MS Levels:"); 145 | let mut level_set: Vec<(&u8, &usize)> = self.level_table.iter().collect(); 146 | level_set.sort_by_key(|(a, _)| *a); 147 | for (level, count) in level_set.iter() { 148 | println!("\t{}: {}", level, count); 149 | } 150 | 151 | println!("Precursor Charge States:"); 152 | let mut charge_set: Vec<(&i32, &usize)> = self.charge_table.iter().collect(); 153 | charge_set.sort_by_key(|(a, _)| *a); 154 | for (charge, count) in charge_set.iter() { 155 | if **charge == 0 { 156 | println!("\tCharge Not Reported: {}", count); 157 | } else { 158 | println!("\t{}: {}", charge, count); 159 | } 160 | } 161 | 162 |
let mut peak_charge_levels: Vec<_> = self.peak_charge_table.iter().collect(); 163 | 164 | peak_charge_levels.sort_by(|(level_a, _), (level_b, _)| level_a.cmp(level_b)); 165 | 166 | for (level, peak_charge_table) in peak_charge_levels { 167 | if !peak_charge_table.is_empty() { 168 | println!("Peak Charge States for MS level {}:", level); 169 | let mut peak_charge_set: Vec<(&i32, &usize)> = peak_charge_table.iter().collect(); 170 | peak_charge_set.sort_by_key(|(a, _)| *a); 171 | for (charge, count) in peak_charge_set.iter() { 172 | if **charge == 0 { 173 | println!("\tCharge Not Reported: {}", count); 174 | } else { 175 | println!("\t{}: {}", charge, count); 176 | } 177 | } 178 | } 179 | } 180 | self.peak_mode_table 181 | .iter() 182 | .for_each(|(mode, count)| match mode { 183 | SignalContinuity::Unknown => println!("Unknown continuity: {}", count), 184 | SignalContinuity::Centroid => println!("Peaks: {}", count), 185 | SignalContinuity::Profile => println!("Points: {}", count), 186 | }); 187 | } 188 | } 189 | 190 | fn main() -> io::Result<()> { 191 | let path = path::PathBuf::from(env::args().nth(1).unwrap_or_else(|| { 192 | eprintln!("Please provide a path to an MS data file"); 193 | process::exit(1) 194 | })); 195 | let mut summarizer = MSDataFileSummary::default(); 196 | 197 | if path.as_os_str() == "-" { 198 | mzdata::mz_read!(Source::Stdin, reader => { 199 | summarizer.scan_file(reader) 200 | })?; 201 | } else { 202 | let reader = MZReader::open_path(path)?; 203 | eprintln!("Format: {}", reader.as_format()); 204 | summarizer.scan_file(reader) 205 | }; 206 | 207 | summarizer.write_out(); 208 | Ok(()) 209 | } 210 | -------------------------------------------------------------------------------- /src/meta/data_processing.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | use std::fmt::Display; 3 | 4 | use crate::impl_param_described; 5 | use crate::params::{ControlledVocabulary, Param, ParamCow, ParamList}; 6 | 7 | use super::Software; 8 | 9 | /// Describe a data processing method stage tied to a specific piece of [`Software`] 10 | /// 11 | /// See the mzML `<processingMethod>` element 12 | #[derive(Debug, Clone, Default, PartialEq, Eq)] 13 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 14 | pub struct ProcessingMethod { 15 | pub order: i8, 16 | pub software_reference: String, 17 | pub params: ParamList, 18 | } 19 | 20 | /// Describe a complete data processing method, a series of [`ProcessingMethod`] transformations 21 | /// through a pipeline of [`Software`].
22 | /// 23 | /// See the mzML `<dataProcessing>` element 24 | #[derive(Debug, Clone, Default, PartialEq, Eq)] 25 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 26 | pub struct DataProcessing { 27 | pub id: String, 28 | pub methods: Vec<ProcessingMethod>, 29 | } 30 | 31 | impl_param_described!(ProcessingMethod); 32 | 33 | impl DataProcessing { 34 | pub fn push(&mut self, method: ProcessingMethod) { 35 | self.methods.push(method) 36 | } 37 | 38 | pub fn iter(&self) -> std::slice::Iter<'_, ProcessingMethod> { 39 | self.methods.iter() 40 | } 41 | 42 | pub fn len(&self) -> usize { 43 | self.methods.len() 44 | } 45 | 46 | pub fn is_empty(&self) -> bool { 47 | self.methods.is_empty() 48 | } 49 | 50 | pub fn highest_order(&self) -> i8 { 51 | self.iter().map(|p| p.order).max().unwrap_or_default() 52 | } 53 | } 54 | 55 | #[derive(Debug, Clone, PartialEq)] 56 | pub enum DataTransformationAction { 57 | FormatConversion(FormatConversion), 58 | DataProcessingAction(DataProcessingAction), 59 | Other(Param), 60 | } 61 | 62 | impl Display for DataTransformationAction { 63 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 64 | write!(f, "{:?}", self) 65 | } 66 | } 67 | 68 | #[derive(Debug, Clone, Copy, PartialEq, Hash)] 69 | pub enum DataProcessingAction { 70 | Deisotoping, 71 | ChargeDeconvolution, 72 | PeakPicking, 73 | Smoothing, 74 | BaselineReduction, 75 | ChargeStateCalculation, 76 | PrecursorRecalculation, 77 | IntensityNormalization, 78 | MZCalibration, 79 | DataFiltering, 80 | AdductDeconvolution, 81 | IonMobilityDeconvolution, 82 | } 83 | 84 | impl From<DataProcessingAction> for Param { 85 | fn from(value: DataProcessingAction) -> Self { 86 | value.as_param_const().into() 87 | } 88 | } 89 | 90 | impl From<DataProcessingAction> for ParamCow<'static> { 91 | fn from(value: DataProcessingAction) -> Self { 92 | value.as_param_const() 93 | } 94 | } 95 | 96 | impl Display for DataProcessingAction { 97 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 98 | write!(f, "{:?}", self) 99 | } 100 | } 101 | 102 | impl DataProcessingAction { 103 | pub const fn as_param_const(&self) -> ParamCow<'static> { 104 | const CV: ControlledVocabulary = ControlledVocabulary::MS; 105 | 106 | match self { 107 | DataProcessingAction::Deisotoping => CV.const_param_ident("deisotoping", 1000033), 108 | DataProcessingAction::ChargeDeconvolution => { 109 | CV.const_param_ident("charge deconvolution", 1000034) 110 | } 111 | DataProcessingAction::PeakPicking => CV.const_param_ident("peak picking", 1000035), 112 | DataProcessingAction::Smoothing => CV.const_param_ident("smoothing", 1000592), 113 | DataProcessingAction::BaselineReduction => { 114 | CV.const_param_ident("baseline reduction", 1000593) 115 | } 116 | DataProcessingAction::ChargeStateCalculation => { 117 | CV.const_param_ident("charge state calculation", 1000778) 118 | } 119 | DataProcessingAction::PrecursorRecalculation => { 120 | CV.const_param_ident("precursor recalculation", 1000780) 121 | } 122 | DataProcessingAction::IntensityNormalization => { 123 | CV.const_param_ident("intensity normalization", 1001484) 124 | } 125 | DataProcessingAction::MZCalibration => CV.const_param_ident("m/z calibration", 1001485), 126 | DataProcessingAction::DataFiltering => CV.const_param_ident("data filtering", 1001486), 127 | DataProcessingAction::AdductDeconvolution => { 128 | CV.const_param_ident("adduct deconvolution", 1003220) 129 | } 130 | DataProcessingAction::IonMobilityDeconvolution => { 131 | CV.const_param_ident("ion mobility deconvolution", 1003222) 132 | } 133 | } 134 | } 135 | 136 | pub const fn
from_accession(accession: u32) -> Option<Self> { 137 | match accession { 138 | 1000033 => Some(DataProcessingAction::Deisotoping), 139 | 1000034 => Some(DataProcessingAction::ChargeDeconvolution), 140 | 1000035 => Some(DataProcessingAction::PeakPicking), 141 | 1000592 => Some(DataProcessingAction::Smoothing), 142 | 1000593 => Some(DataProcessingAction::BaselineReduction), 143 | 1000778 => Some(DataProcessingAction::ChargeStateCalculation), 144 | 1000780 => Some(DataProcessingAction::PrecursorRecalculation), 145 | 1001484 => Some(DataProcessingAction::IntensityNormalization), 146 | 1001485 => Some(DataProcessingAction::MZCalibration), 147 | 1001486 => Some(DataProcessingAction::DataFiltering), 148 | 1003220 => Some(DataProcessingAction::AdductDeconvolution), 149 | 1003222 => Some(DataProcessingAction::IonMobilityDeconvolution), 150 | _ => None, 151 | } 152 | } 153 | } 154 | 155 | #[derive(Debug, Clone, Copy, PartialEq, Hash)] 156 | pub enum FormatConversion { 157 | ConversionToMzML, 158 | ConversionToMzMLb, 159 | } 160 | 161 | impl Display for FormatConversion { 162 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 163 | write!(f, "{:?}", self) 164 | } 165 | } 166 | 167 | impl From<FormatConversion> for Param { 168 | fn from(value: FormatConversion) -> Self { 169 | value.to_param_const().into() 170 | } 171 | } 172 | 173 | impl From<FormatConversion> for ParamCow<'static> { 174 | fn from(value: FormatConversion) -> Self { 175 | value.to_param_const() 176 | } 177 | } 178 | 179 | impl FormatConversion { 180 | pub const fn to_param_const(&self) -> ParamCow<'static> { 181 | const CV: ControlledVocabulary = ControlledVocabulary::MS; 182 | 183 | match self { 184 | FormatConversion::ConversionToMzML => { 185 | CV.const_param_ident("Conversion to mzML", 1000544) 186 | } 187 | FormatConversion::ConversionToMzMLb => { 188 | CV.const_param_ident("Conversion to mzMLb", 1002839) 189 | } 190 | } 191 | } 192 | 193 | pub const fn from_accession(accession: u32) -> Option<Self> { 194 | match accession { 195 | 1000544 => Some(Self::ConversionToMzML), 196 | 1002839 => Some(Self::ConversionToMzMLb), 197 | _ => None, 198 | } 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/meta/run.rs: -------------------------------------------------------------------------------- 1 | use chrono::{DateTime, FixedOffset}; 2 | 3 | /// Metadata describing the experiment that does not belong in any other section, 4 | /// along with some default options for the run.
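///
/// For example (a minimal sketch; it assumes this type is re-exported from [`crate::meta`] like its siblings in this module):
///
/// ```rust
/// use mzdata::meta::MassSpectrometryRun;
///
/// // All fields default to `None` until populated from a file's metadata
/// let mut run = MassSpectrometryRun::default();
/// run.id = Some("small_run".to_string());
/// assert!(run.start_time.is_none());
/// ```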
5 | #[derive(Debug, Default, PartialEq, Hash, Eq, Clone)] 6 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 7 | pub struct MassSpectrometryRun { 8 | pub id: Option<String>, 9 | pub default_data_processing_id: Option<String>, 10 | pub default_instrument_id: Option<u32>, 11 | pub default_source_file_id: Option<String>, 12 | pub start_time: Option<DateTime<FixedOffset>>, 13 | } 14 | 15 | impl MassSpectrometryRun { 16 | pub fn new( 17 | id: Option<String>, 18 | default_data_processing_id: Option<String>, 19 | default_instrument_id: Option<u32>, 20 | default_source_file_id: Option<String>, 21 | start_time: Option<DateTime<FixedOffset>>, 22 | ) -> Self { 23 | Self { 24 | id, 25 | default_data_processing_id, 26 | default_instrument_id, 27 | default_source_file_id, 28 | start_time, 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/meta/sample.rs: -------------------------------------------------------------------------------- 1 | use crate::impl_param_described; 2 | use crate::params::{ParamDescribed, ParamList}; 3 | 4 | 5 | #[derive(Debug, Default, Clone, PartialEq)] 6 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 7 | pub struct Sample { 8 | pub id: String, 9 | pub name: Option<String>, 10 | pub params: ParamList 11 | } 12 | 13 | impl Sample { 14 | pub fn new(id: String, name: Option<String>, params: ParamList) -> Self { 15 | Self { id, name, params } 16 | } 17 | 18 | crate::find_param_method!(number, &crate::curie!(MS:1000001), "Find the sample number, if it is present"); 19 | crate::find_param_method!(batch, &crate::curie!(MS:1000053), "Find the sample batch, if it is present"); 20 | } 21 | 22 | 23 | impl_param_described!(Sample); -------------------------------------------------------------------------------- /src/prelude.rs: -------------------------------------------------------------------------------- 1 | //! A set of foundational traits used throughout the library. 2 | pub use crate::io::traits::{ 3 | MZFileReader, RandomAccessSpectrumGroupingIterator, RandomAccessSpectrumIterator, 4 | RandomAccessSpectrumSource as _, SpectrumSourceWithMetadata as _, SpectrumSource, 5 | SpectrumWriter, SeekRead, SpectrumAccessError, IonMobilityFrameSource, 6 | RandomAccessIonMobilityFrameIterator, ChromatogramSource, 7 | IonMobilityFrameWriter, RandomAccessIonMobilityFrameGroupingIterator, 8 | IntoIonMobilityFrameSource, 9 | }; 10 | 11 | #[cfg(feature = "async_partial")] 12 | pub use crate::io::traits::AsyncSpectrumSource; 13 | 14 | pub use crate::meta::MSDataFileMetadata; 15 | pub use crate::params::{ParamDescribed, ParamLike, ParamValue, ParamDescribedRead}; 16 | pub use crate::spectrum::bindata::{ 17 | BuildArrayMapFrom, BuildFromArrayMap, ByteArrayView, ByteArrayViewMut, 18 | BuildArrayMap3DFrom, BuildFromArrayMap3D 19 | }; 20 | pub use crate::spectrum::{ 21 | IonProperties, PrecursorSelection, SpectrumLike, IonMobilityMeasure, IonMobilityFrameLike, 22 | SpectrumGrouping, IonMobilityFrameGrouping, 23 | }; 24 | 25 | #[cfg(feature = "mzsignal")] 26 | pub use crate::spectrum::group::SpectrumGroupAveraging; 27 | 28 | #[doc(hidden)] 29 | pub use std::convert::TryInto; 30 | #[doc(hidden)] 31 | pub use std::io::prelude::*; 32 | #[doc(hidden)] 33 | pub use mzpeaks::prelude::*; -------------------------------------------------------------------------------- /src/spectrum.rs: -------------------------------------------------------------------------------- 1 | //! The data structures and components that represent a mass spectrum and how 2 | //! to access their data. 3 | //! 4 | //!
A mass spectrum is made up of multiple components, the spectrum's signal data 5 | //! itself, plus all the metadata that describes how that data was acquired by 6 | //! the instrument. 7 | //! 8 | //! # Components 9 | //! - [`bindata`] includes structures for dealing with raw binary data arrays that may or may not 10 | //! be byte-encoded but not strongly typed, though it does not include signal processing as that 11 | //! is outside the scope of this crate. 12 | //! 13 | //! # Spectra 14 | //! 15 | //! Represent the collection of attributes and data that compose a single mass spectrum. 16 | //! 17 | //! Because a mass spectrum may be obtained from sources with varying levels of detail, 18 | //! several alternative structures are provided with a common set of trait-based methods 19 | //! to unify access: 20 | //! 21 | //! 1. [`RawSpectrum`] for representing a spectrum that has not been decoded into distinct 22 | //! peaks yet, but whose data may be continuous or discrete. 23 | //! 2. [`CentroidSpectrum`] for representing spectra from sources which are guaranteed to 24 | //! be pre-centroided, like those from MGF files or other simple text representations. 25 | //! 3. [`MultiLayerSpectrum`] for representing a multi-layer representation of a spectrum where both 26 | //! raw data and a distinct peak list are available. 27 | //! 4. [`DeconvolutedSpectrum`] for representing spectra from sources which are guaranteed to be 28 | //! pre-centroided, deisotoped and charge state deconvoluted. 29 | //! 30 | //! These structures all implement the [`SpectrumLike`] trait 31 | //! 32 | //! The [`SpectrumLike`] trait is included in the crate prelude, and gives the caller 33 | //! read-only access to components that describe a spectrum's metadata. 34 | //! 35 | //! ```rust 36 | //! use mzpeaks::Tolerance; 37 | //! use mzdata::MzMLReader; 38 | //! use mzdata::prelude::*; 39 | //! use mzdata::spectrum::SignalContinuity; 40 | //! 41 | //! let reader = MzMLReader::open_path("./test/data/small.mzML").unwrap(); 42 | //! for spectrum in reader { 43 | //! println!("Scan {} => BP {}", spectrum.id(), spectrum.peaks().base_peak().mz); 44 | //! 45 | //! if spectrum.signal_continuity() < SignalContinuity::Profile { 46 | //! let peak_picked = spectrum.into_centroid().unwrap(); 47 | //! println!("Matches for 579.155: {:?}", 48 | //! peak_picked.peaks.all_peaks_for( 49 | //! 579.155, Tolerance::Da(0.02))); 50 | //! } 51 | //! } 52 | //! ``` 53 | //! 54 | //! More examples can be found in the [spectrum tutorial](crate::tutorial::spectrum). 
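//!
//! As a quick illustration of moving between these layers (a minimal sketch; it assumes [`SpectrumLike::raw_arrays`] is available to reach the binary data, and error handling is elided):
//!
//! ```rust
//! use mzdata::prelude::*;
//! use mzdata::MzMLReader;
//!
//! let mut reader = MzMLReader::open_path("./test/data/small.mzML").unwrap();
//! let spectrum = reader.next().unwrap();
//! // Shared, read-only metadata accessors from `SpectrumLike`
//! println!("{} (MS{})", spectrum.id(), spectrum.ms_level());
//! // Drop down to the raw binary arrays when the source provided them
//! if let Some(arrays) = spectrum.raw_arrays() {
//!     println!("{} m/z values", arrays.mzs().unwrap().len());
//! }
//! ```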
55 | 
56 | pub mod bindata;
57 | pub(crate) mod chromatogram;
58 | pub(crate) mod frame;
59 | pub(crate) mod group;
60 | pub(crate) mod peaks;
61 | pub(crate) mod scan_properties;
62 | pub(crate) mod spectrum_types;
63 | pub mod utils;
64 | 
65 | pub use crate::spectrum::bindata::{ArrayType, BinaryArrayMap, BinaryDataArrayType, DataArray};
66 | pub use crate::spectrum::chromatogram::{Chromatogram, ChromatogramLike};
67 | pub use crate::spectrum::scan_properties::*;
68 | pub use crate::spectrum::spectrum_types::{
69 |     CentroidPeakAdapting, CentroidSpectrum, CentroidSpectrumType, DeconvolutedPeakAdapting,
70 |     DeconvolutedSpectrum, DeconvolutedSpectrumType, MultiLayerSpectrum, RawSpectrum, Spectrum,
71 |     SpectrumConversionError, SpectrumLike, SpectrumProcessingError,
72 | };
73 | 
74 | pub use crate::spectrum::peaks::{
75 |     PeakDataIter, PeakDataIterDispatch, PeakDataLevel, RawIter, RefPeakDataIter, RefPeakDataLevel,
76 |     SpectrumSummary,
77 | };
78 | 
79 | pub use utils::HasIonMobility;
80 | 
81 | pub use frame::{
82 |     FeatureDataLevel, IonMobilityFrameDescription, IonMobilityFrameLike,
83 |     MultiLayerIonMobilityFrame, RefFeatureDataLevel
84 | };
85 | 
86 | pub use group::{
87 |     IonMobilityFrameGroup, IonMobilityFrameGroupIntoIter, IonMobilityFrameGroupIter,
88 |     IonMobilityFrameGroupingIterator, SpectrumGroup, SpectrumGroupIntoIter, SpectrumGroupIter,
89 |     SpectrumGroupingIterator, SpectrumGrouping, IonMobilityFrameGrouping,
90 | };
91 | 
92 | #[cfg(feature = "mzsignal")]
93 | pub use group::{
94 |     average_spectra, DeferredSpectrumAveragingIterator, SpectrumAveragingIterator,
95 |     SpectrumGroupAveraging,
96 | };
97 | 
--------------------------------------------------------------------------------
/src/spectrum/bindata.rs:
--------------------------------------------------------------------------------
1 | mod array;
2 | mod conversion;
3 | mod encodings;
4 | mod map;
5 | mod traits;
6 | 
7 | pub use array::{DataArray, DataArraySlice};
8 | pub use conversion::{
9 |     ArraysAvailable, BuildArrayMap3DFrom, BuildArrayMapFrom, BuildFromArrayMap, BuildFromArrayMap3D,
10 | };
11 | pub use encodings::{
12 |     as_bytes, delta_decoding, delta_encoding, linear_prediction_decoding,
13 |     linear_prediction_encoding, to_bytes, vec_as_bytes, ArrayRetrievalError, ArrayType,
14 |     BinaryCompressionType, BinaryDataArrayType, Bytes,
15 | };
16 | pub use map::{BinaryArrayMap, BinaryArrayMap3D};
17 | pub use traits::{ByteArrayView, ByteArrayViewMut};
--------------------------------------------------------------------------------
/src/spectrum/bindata/traits.rs:
--------------------------------------------------------------------------------
1 | use std::marker::PhantomData;
2 | use std::slice;
3 | use std::mem;
4 | use std::borrow::Cow;
5 | 
6 | use bytemuck::Pod;
7 | use num_traits::{AsPrimitive, Num};
8 | use crate::params::Unit;
9 | 
10 | use super::encodings::{ArrayRetrievalError, BinaryDataArrayType, Bytes};
11 | use super::ArrayType;
12 | 
13 | 
14 | pub trait ByteArrayView<'transient, 'lifespan: 'transient> {
15 |     fn view(&'lifespan self) -> Result<Cow<'lifespan, [u8]>, ArrayRetrievalError>;
16 | 
17 |     fn coerce_from<T: Pod>(
18 |         buffer: Cow<'transient, [u8]>,
19 |     ) -> Result<Cow<'transient, [T]>, ArrayRetrievalError> {
20 |         let n = buffer.len();
21 |         if n == 0 {
22 |             return Ok(Cow::Owned(Vec::new()))
23 |         }
24 |         let z = mem::size_of::<T>();
25 |         if n % z != 0 {
26 |             return Err(ArrayRetrievalError::DataTypeSizeMismatch);
27 |         }
28 |         match buffer {
29 |             Cow::Borrowed(c) => {
30 |                 Ok(Cow::Borrowed(bytemuck::try_cast_slice(c)?))
31 |             },
32 |             Cow::Owned(v) => {
33 |                 let size_type = n / z;
34 |                 let mut buf = Vec::with_capacity(size_type);
35 |                 v.chunks_exact(z).try_for_each(|c| {
36 |                     buf.extend(bytemuck::try_cast_slice(c)?);
37 |                     Ok::<(), bytemuck::PodCastError>(())
38 |                 })?;
39 |                 Ok(Cow::Owned(buf))
40 |             },
41 |         }
42 |     }
43 | 
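    // Illustrative sketch (editorial addition): the byte-level contract of
    // `coerce_from`, shown with `DataArray`, a type that implements this
    // trait. Eight little-endian bytes reinterpret as exactly one `f64`:
    //
    //     let bytes = Cow::Owned(1.5f64.to_le_bytes().to_vec());
    //     let view = DataArray::coerce_from::<f64>(bytes)?;
    //     assert_eq!(view.as_ref(), &[1.5f64]);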
44 |     fn coerce<T: Pod>(
45 |         &'lifespan self,
46 |     ) -> Result<Cow<'transient, [T]>, ArrayRetrievalError> {
47 |         match self.view() {
48 |             Ok(data) => Self::coerce_from(data),
49 |             Err(err) => Err(err),
50 |         }
51 |     }
52 | 
53 |     /// Decode the array, then copy it to a new array, converting each element from type `S` to type `D`
54 |     fn convert<S: Num + Clone + AsPrimitive<D> + Pod, D: Num + Clone + Copy + 'static>(
55 |         &'lifespan self,
56 |     ) -> Result<Cow<'transient, [D]>, ArrayRetrievalError> {
57 |         match self.coerce::<S>() {
58 |             Ok(view) => {
59 |                 match view {
60 |                     Cow::Borrowed(view) => {
61 |                         Ok(Cow::Owned(view.iter().map(|a| a.as_()).collect()))
62 |                     }
63 |                     Cow::Owned(owned) => {
64 |                         let res = owned.iter().map(|a| a.as_()).collect();
65 |                         Ok(Cow::Owned(res))
66 |                     }
67 |                 }
68 |             }
69 |             Err(err) => Err(err),
70 |         }
71 |     }
72 | 
73 |     /// The kind of array this is
74 |     fn name(&self) -> &ArrayType;
75 | 
76 |     /// The real data type encoded in bytes
77 |     fn dtype(&self) -> BinaryDataArrayType;
78 | 
79 |     /// The unit of measurement each data point is in
80 |     fn unit(&self) -> Unit;
81 | 
82 |     fn to_f32(&'lifespan self) -> Result<Cow<'lifespan, [f32]>, ArrayRetrievalError> {
83 |         type D = f32;
84 |         match self.dtype() {
85 |             BinaryDataArrayType::Float32 | BinaryDataArrayType::ASCII => self.coerce::<D>(),
86 |             BinaryDataArrayType::Float64 => {
87 |                 type S = f64;
88 |                 self.convert::<S, D>()
89 |             }
90 |             BinaryDataArrayType::Int32 => {
91 |                 type S = i32;
92 |                 self.convert::<S, D>()
93 |             }
94 |             BinaryDataArrayType::Int64 => {
95 |                 type S = i64;
96 |                 self.convert::<S, D>()
97 |             }
98 |             _ => Err(ArrayRetrievalError::DataTypeSizeMismatch),
99 |         }
100 |     }
101 | 
102 |     fn to_f64(&'lifespan self) -> Result<Cow<'lifespan, [f64]>, ArrayRetrievalError> {
103 |         type D = f64;
104 |         match self.dtype() {
105 |             BinaryDataArrayType::Float32 => {
106 |                 type S = f32;
107 |                 self.convert::<S, D>()
108 |             }
109 |             BinaryDataArrayType::Float64 | BinaryDataArrayType::ASCII => self.coerce(),
110 |             BinaryDataArrayType::Int32 => {
111 |                 type S = i32;
112 |                 self.convert::<S, D>()
113 |             }
114 |             BinaryDataArrayType::Int64 => {
115 |                 type S = i64;
116 |                 self.convert::<S, D>()
117 |             }
118 |             _ => Err(ArrayRetrievalError::DataTypeSizeMismatch),
119 |         }
120 |     }
121 | 
122 |     fn to_i32(&'lifespan self) -> Result<Cow<'lifespan, [i32]>, ArrayRetrievalError> {
123 |         type D = i32;
124 |         match self.dtype() {
125 |             BinaryDataArrayType::Float32 => {
126 |                 type S = f32;
127 |                 self.convert::<S, D>()
128 |             }
129 |             BinaryDataArrayType::Float64 => {
130 |                 type S = f64;
131 |                 self.convert::<S, D>()
132 |             }
133 |             BinaryDataArrayType::Int32 | BinaryDataArrayType::ASCII => self.coerce::<D>(),
134 |             BinaryDataArrayType::Int64 => {
135 |                 type S = i64;
136 |                 self.convert::<S, D>()
137 |             }
138 |             _ => Err(ArrayRetrievalError::DataTypeSizeMismatch),
139 |         }
140 |     }
141 | 
142 |     fn to_i64(&'lifespan self) -> Result<Cow<'lifespan, [i64]>, ArrayRetrievalError> {
143 |         type D = i64;
144 |         match self.dtype() {
145 |             BinaryDataArrayType::Float32 => {
146 |                 type S = f32;
147 |                 self.convert::<S, D>()
148 |             }
149 |             BinaryDataArrayType::Float64 => {
150 |                 type S = f64;
151 |                 self.convert::<S, D>()
152 |             }
153 |             BinaryDataArrayType::Int64 | BinaryDataArrayType::ASCII => self.coerce::<D>(),
154 |             BinaryDataArrayType::Int32 => {
155 |                 type S = i32;
156 |                 self.convert::<S, D>()
157 |             }
158 |             _ => Err(ArrayRetrievalError::DataTypeSizeMismatch),
159 |         }
160 |     }
161 | 
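    // Usage sketch (editorial addition): the `to_*` methods dispatch on the
    // stored dtype, so the caller gets a typed view regardless of the on-disk
    // encoding. This assumes a `DataArray::wrap` constructor for a
    // Float32-backed array (the constructor name is an assumption, not
    // verified here):
    //
    //     let arr = DataArray::wrap(&ArrayType::MZArray,
    //                               BinaryDataArrayType::Float32,
    //                               1.5f32.to_le_bytes().to_vec());
    //     assert_eq!(arr.to_f64()?.as_ref(), &[1.5f64]);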
162 |     /// The size of the encoded array, in terms of the number of elements of the [`BinaryDataArrayType`] given by [`ByteArrayView::dtype`]
163 |     fn data_len(&'lifespan self) -> Result<usize, ArrayRetrievalError> {
164 |         let view = self.view()?;
165 |         let n = view.len();
166 |         Ok(n / self.dtype().size_of())
167 |     }
168 | 
169 |     fn iter_type<T: Pod>(&'lifespan self) -> Result<DataSliceIter<'lifespan, T>, ArrayRetrievalError> {
170 |         Ok(DataSliceIter::new(self.view()?))
171 |     }
172 | 
173 |     fn iter_u8(&'lifespan self) -> Result<DataSliceIter<'lifespan, u8>, ArrayRetrievalError> {
174 |         Ok(DataSliceIter::new(self.view()?))
175 |     }
176 | 
177 |     fn iter_f32(&'lifespan self) -> Result<DataSliceIter<'lifespan, f32>, ArrayRetrievalError> {
178 |         Ok(DataSliceIter::new(self.view()?))
179 |     }
180 | 
181 |     fn iter_f64(&'lifespan self) -> Result<DataSliceIter<'lifespan, f64>, ArrayRetrievalError> {
182 |         Ok(DataSliceIter::new(self.view()?))
183 |     }
184 | 
185 |     fn iter_i32(&'lifespan self) -> Result<DataSliceIter<'lifespan, i32>, ArrayRetrievalError> {
186 |         Ok(DataSliceIter::new(self.view()?))
187 |     }
188 | 
189 |     fn iter_i64(&'lifespan self) -> Result<DataSliceIter<'lifespan, i64>, ArrayRetrievalError> {
190 |         Ok(DataSliceIter::new(self.view()?))
191 |     }
192 | }
193 | 
194 | pub trait ByteArrayViewMut<'transient, 'lifespan: 'transient>:
195 |     ByteArrayView<'transient, 'lifespan>
196 | {
197 | 
198 |     /// Specify the unit of the data array
199 |     fn unit_mut(&mut self) -> &mut Unit;
200 | 
201 |     /// Get a mutable view of the bytes backing this data array.
202 |     ///
203 |     /// This is in turn used by [`ByteArrayViewMut::coerce_mut`] to produce a typed array
204 |     fn view_mut(&'transient mut self) -> Result<&'transient mut Bytes, ArrayRetrievalError>;
205 | 
206 |     fn coerce_from_mut<T: Pod>(
207 |         buffer: &mut [u8],
208 |     ) -> Result<&'transient mut [T], ArrayRetrievalError> {
209 |         let n = buffer.len();
210 |         if n == 0 {
211 |             return Ok(&mut [])
212 |         }
213 |         let z = mem::size_of::<T>();
214 |         if n % z != 0 {
215 |             return Err(ArrayRetrievalError::DataTypeSizeMismatch);
216 |         }
217 |         let m = n / z;
218 |         unsafe { Ok(slice::from_raw_parts_mut(buffer.as_mut_ptr() as *mut T, m)) }
219 |     }
220 | 
221 |     fn coerce_mut<T: Pod>(
222 |         &'lifespan mut self,
223 |     ) -> Result<&'transient mut [T], ArrayRetrievalError> {
224 |         let view = self.view_mut()?;
225 |         Self::coerce_from_mut(view)
226 |     }
227 | }
228 | 
229 | #[derive(Debug)]
230 | pub struct DataSliceIter<'a, T: Pod> {
231 |     buffer: Cow<'a, [u8]>,
232 |     i: usize,
233 |     _t: PhantomData<T>
234 | }
235 | 
236 | impl<T: Pod> ExactSizeIterator for DataSliceIter<'_, T> {
237 |     fn len(&self) -> usize {
238 |         let z = mem::size_of::<T>();
239 |         self.buffer.len() / z
240 |     }
241 | }
242 | 
243 | impl<'a, T: Pod> DataSliceIter<'a, T> {
244 |     pub fn new(buffer: Cow<'a, [u8]>) -> Self {
245 |         Self { buffer, i: 0, _t: PhantomData }
246 |     }
247 | 
248 |     pub fn next_value(&mut self) -> Option<T> {
249 |         let z = mem::size_of::<T>();
250 |         let offset = z * self.i;
251 |         if (offset + z) > self.buffer.len() {
252 |             None
253 |         } else {
254 |             let val = bytemuck::from_bytes(&self.buffer[offset..offset + z]);
255 |             self.i += 1;
256 |             Some(*val)
257 |         }
258 |     }
259 | }
260 | 
261 | impl<T: Pod> Iterator for DataSliceIter<'_, T> {
262 |     type Item = T;
263 | 
264 |     fn next(&mut self) -> Option<Self::Item> {
265 |         self.next_value()
266 |     }
267 | }
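
// Usage sketch (editorial addition): `DataSliceIter` decodes fixed-width
// values lazily from a raw byte buffer, without allocating a typed copy.
//
//     let bytes: Vec<u8> = [1.0f64, 2.0]
//         .iter()
//         .flat_map(|v| v.to_le_bytes())
//         .collect();
//     let mut it = DataSliceIter::<'_, f64>::new(Cow::Owned(bytes));
//     assert_eq!((it.len(), it.next()), (2, Some(1.0)));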
--------------------------------------------------------------------------------
/src/spectrum/chromatogram.rs:
--------------------------------------------------------------------------------
1 | use std::borrow::{Borrow, Cow};
2 | 
3 | use super::bindata::{ArrayRetrievalError, ArrayType, BinaryArrayMap, ByteArrayView};
4 | use crate::params::{Param, ParamDescribed};
5 | use crate::spectrum::scan_properties::{
6 |     ChromatogramDescription, ChromatogramType, Precursor, ScanPolarity,
7 | };
8 | use mzpeaks::coordinate::{Time, MZ};
9 | use mzpeaks::feature::{FeatureView, SimpleFeature, TimeInterval};
10 | 
11 | #[derive(Debug, Default, Clone)]
12 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
13 | pub struct Chromatogram {
14 |     description: ChromatogramDescription,
15 |     pub arrays: BinaryArrayMap,
16 | }
17 | 
18 | macro_rules! as_feature_view {
19 |     ($chromatogram:ident, $view:ident => $then:tt) => {
20 |         if let Ok(t) = $chromatogram.time() {
21 |             if let Ok(i) = $chromatogram.intensity() {
22 |                 let $view = FeatureView::<MZ, Time>::new(t.borrow(), t.borrow(), i.borrow());
23 |                 Some($then)
24 |             } else {
25 |                 None
26 |             }
27 |         } else {
28 |             None
29 |         }
30 |     };
31 | }
32 | 
33 | #[allow(unused)]
34 | pub(crate) fn as_simple_feature(chromatogram: &Chromatogram) -> Option<SimpleFeature<MZ, Time>> {
35 |     if let Ok(t) = chromatogram.time() {
36 |         if let Ok(i) = chromatogram.intensity() {
37 |             let mut f = SimpleFeature::<MZ, Time>::empty(0.0);
38 |             f.extend(t.iter().zip(i.iter()).map(|(y, z)| (0.0f64, *y, *z)));
39 |             return Some(f);
40 |         }
41 |     }
42 |     None
43 | }
44 | 
45 | impl TimeInterval