├── .github ├── dependabot.yml └── workflows │ ├── conventional-prs.yml │ ├── release-please.yml │ └── rust.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE.md ├── README.md └── src ├── alignment.rs ├── annot ├── contig.rs ├── loc.rs ├── mod.rs ├── pos.rs ├── refids.rs └── spliced.rs ├── genome.rs ├── lib.rs ├── phylogeny.rs ├── sequence.rs ├── strand.rs └── variant.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "cargo" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | commit-message: 8 | # Prefix all commit messages with "deps: ", which should be 9 | # accepted as a conventional commit and trigger release-please 10 | prefix: "deps" 11 | -------------------------------------------------------------------------------- /.github/workflows/conventional-prs.yml: -------------------------------------------------------------------------------- 1 | name: PR 2 | on: 3 | pull_request_target: 4 | types: 5 | - opened 6 | - reopened 7 | - edited 8 | - synchronize 9 | 10 | jobs: 11 | title-format: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: amannn/action-semantic-pull-request@v5 15 | env: 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | with: 18 | types: | 19 | fix 20 | feat 21 | docs 22 | style 23 | refactor 24 | perf 25 | test 26 | build 27 | ci 28 | chore 29 | revert 30 | deps 31 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | 10 | name: release-please 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: googleapis/release-please-action@v4 17 | id: release 18 | with: 19 | release-type: rust 20 | package-name: 
bio-types 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | changelog-types: '[{"type":"feat","section":"Features","hidden":false},{"type":"fix","section":"Bug Fixes","hidden":false},{"type":"chore","section":"Miscellaneous","hidden":false},{"type":"deps","section":"Dependencies","hidden":false}]' 23 | 24 | - uses: actions/checkout@v2 25 | if: ${{ steps.release.outputs.release_created }} 26 | 27 | - name: Install stable toolchain 28 | uses: actions-rs/toolchain@v1 29 | if: ${{ steps.release.outputs.release_created }} 30 | with: 31 | toolchain: stable 32 | override: true 33 | 34 | - uses: Swatinem/rust-cache@v1.3.0 35 | if: ${{ steps.release.outputs.release_created }} 36 | 37 | - name: Publish crate 38 | if: ${{ steps.release.outputs.release_created }} 39 | uses: actions-rs/cargo@v1 40 | with: 41 | command: publish 42 | args: --token ${{ secrets.CRATES_IO_TOKEN }} 43 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | Formatting: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout repository 14 | uses: actions/checkout@v2 15 | 16 | - name: Install stable toolchain 17 | uses: actions-rs/toolchain@v1 18 | with: 19 | toolchain: stable 20 | override: true 21 | components: rustfmt 22 | 23 | - uses: Swatinem/rust-cache@v1.3.0 24 | 25 | - name: Check format 26 | run: cargo fmt -- --check 27 | 28 | Linting: 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout repository 32 | uses: actions/checkout@v2 33 | 34 | - name: Install stable toolchain 35 | uses: actions-rs/toolchain@v1 36 | with: 37 | toolchain: stable 38 | override: true 39 | components: clippy 40 | 41 | - uses: Swatinem/rust-cache@v1.3.0 42 | 43 | - name: Lint with clippy 44 | uses: actions-rs/clippy-check@v1 45 | with: 46 | token: ${{ 
secrets.GITHUB_TOKEN }} 47 | 48 | Testing: 49 | needs: Formatting 50 | runs-on: ${{ matrix.os }} 51 | strategy: 52 | fail-fast: false 53 | matrix: 54 | build: [linux, windows, macos] 55 | include: 56 | - build: macos 57 | os: macos-latest 58 | rust: stable 59 | - build: windows 60 | os: windows-latest 61 | rust: stable 62 | - build: linux 63 | os: ubuntu-latest 64 | rust: stable 65 | steps: 66 | - name: Checkout repository 67 | uses: actions/checkout@v2 68 | 69 | - name: Install ${{ matrix.rust }} toolchain 70 | uses: actions-rs/toolchain@v1 71 | with: 72 | toolchain: ${{ matrix.rust }} 73 | override: true 74 | 75 | - uses: Swatinem/rust-cache@v1.3.0 76 | 77 | - name: Run tests 78 | uses: actions-rs/cargo@v1 79 | with: 80 | command: test 81 | args: --all --no-fail-fast 82 | 83 | Coverage: 84 | needs: Formatting 85 | runs-on: ubuntu-latest 86 | steps: 87 | - name: Checkout repository 88 | uses: actions/checkout@v2 89 | 90 | - name: Install stable toolchain 91 | uses: actions-rs/toolchain@v1 92 | with: 93 | toolchain: stable 94 | override: true 95 | 96 | - uses: Swatinem/rust-cache@v1.3.0 97 | 98 | - name: Install cargo-tarpaulin 99 | uses: actions-rs/install@v0.1 100 | with: 101 | crate: cargo-tarpaulin 102 | version: latest 103 | use-tool-cache: true 104 | 105 | - name: Coverage with tarpaulin 106 | run: cargo tarpaulin --all --all-features --timeout 600 --out Lcov -- --test-threads 1 107 | 108 | - name: Upload coverage 109 | uses: coverallsapp/github-action@master 110 | with: 111 | github-token: ${{ secrets.GITHUB_TOKEN }} 112 | path-to-lcov: ./lcov.info 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /target/ 3 | **/*.rs.bk 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 
Changelog 2 | 3 | ## [1.0.4](https://github.com/rust-bio/rust-bio-types/compare/v1.0.3...v1.0.4) (2024-08-05) 4 | 5 | 6 | ### Bug Fixes 7 | 8 | * update release-please.yml to use v4 ([22dc35c](https://github.com/rust-bio/rust-bio-types/commit/22dc35c78a936ece182836c70cc13b45fff55d95)) 9 | 10 | ### Dependencies 11 | 12 | * deps: bump regex from 1.10.5 to 1.10.6 ([#108](https://github.com/rust-bio/rust-bio-types/commit/3c625c5cfdf67ee04143d426d64490dd0f989202)) 13 | * deps: bump clap from 4.5.9 to 4.5.13 ([#107](https://github.com/rust-bio/rust-bio-types/commit/1182b3379cce87e47f06bc67d3c9be010f77e5c8)) 14 | * deps: bump thiserror from 1.0.62 to 1.0.63 ([#105](https://github.com/rust-bio/rust-bio-types/commit/d394e73fa002883a3031f3a5899d28e2c9c0541d)) 15 | 16 | 17 | ## [1.0.3](https://github.com/rust-bio/rust-bio-types/compare/v1.0.2...v1.0.3) (2024-07-15) 18 | 19 | 20 | ### Dependencies 21 | 22 | * bump clap from 4.4.18 to 4.5.9 ([c2380d9](https://github.com/rust-bio/rust-bio-types/commit/c2380d966faeb63127ff08b319afa4359db2a8ff)) 23 | * bump derive-new from 0.5.9 to 0.6.0 ([174b264](https://github.com/rust-bio/rust-bio-types/commit/174b2645649d4b69a2e66da0ff2da1547cd8a0a3)) 24 | * bump lazy_static from 1.4.0 to 1.5.0 ([b84ef21](https://github.com/rust-bio/rust-bio-types/commit/b84ef21ed2f36189c0dc3bb5a958fa726515a96b)) 25 | * bump serde from 1.0.188 to 1.0.204 ([0dfda26](https://github.com/rust-bio/rust-bio-types/commit/0dfda265893062fd41344b308df7f7ce83e4bd64)) 26 | 27 | ## [1.0.2](https://github.com/rust-bio/rust-bio-types/compare/v1.0.1...v1.0.2) (2024-07-12) 28 | 29 | 30 | ### Dependencies 31 | 32 | * bump clap from 4.4.0 to 4.4.18 ([ba733ba](https://github.com/rust-bio/rust-bio-types/commit/ba733bae293292c42e6ec651ca98a53c5152623b)) 33 | * bump petgraph from 0.6.3 to 0.6.5 ([231cd4f](https://github.com/rust-bio/rust-bio-types/commit/231cd4f30d97e42dae1029b9d57f762ea229b695)) 34 | * bump regex from 1.9.4 to 1.10.5 
([6307a18](https://github.com/rust-bio/rust-bio-types/commit/6307a1800c496e6294f35b808a9fd43fc15ceda8)) 35 | * bump strum_macros from 0.25.2 to 0.26.4 ([c9de708](https://github.com/rust-bio/rust-bio-types/commit/c9de708e3b9c9f831f2c43458251843359f419d0)) 36 | * bump thiserror from 1.0.47 to 1.0.62 ([4702ad2](https://github.com/rust-bio/rust-bio-types/commit/4702ad227671aa951e796b2820cf574629c92888)) 37 | 38 | ## [1.0.1](https://github.com/rust-bio/rust-bio-types/compare/v1.0.0...v1.0.1) (2023-08-29) 39 | 40 | 41 | ### Bug Fixes 42 | 43 | * make dependabot commit message follow conventional commits ([#55](https://github.com/rust-bio/rust-bio-types/issues/55)) ([f727826](https://github.com/rust-bio/rust-bio-types/commit/f72782674f28d152d0775605ee9c50bd45355b79)) 44 | 45 | 46 | ### Dependencies 47 | 48 | * bump clap from 3.2.25 to 4.3.19 ([2e1b414](https://github.com/rust-bio/rust-bio-types/commit/2e1b4143fc0012b7c21914d3c5f0ca851d2ae42d)) 49 | * bump clap from 4.3.19 to 4.3.21 ([ffae0ba](https://github.com/rust-bio/rust-bio-types/commit/ffae0ba47e5cf5e160ccf911b5095a7cbe64738b)) 50 | * bump clap from 4.3.21 to 4.4.0 ([c9840ad](https://github.com/rust-bio/rust-bio-types/commit/c9840ad7c342d21a202bac1cd3a9f614d2d327e1)) 51 | * bump regex from 1.9.1 to 1.9.4 ([5ad9921](https://github.com/rust-bio/rust-bio-types/commit/5ad99215d040df0f3d40b6c20757460be2c1a248)) 52 | * bump serde from 1.0.136 to 1.0.174 ([cb9f249](https://github.com/rust-bio/rust-bio-types/commit/cb9f249343859a7ec7a050643d09199941a913ee)) 53 | * bump serde from 1.0.174 to 1.0.179 ([e07a44c](https://github.com/rust-bio/rust-bio-types/commit/e07a44c687edbdd2e7d91524805e0da8140f4040)) 54 | * bump serde from 1.0.179 to 1.0.183 ([43c00bd](https://github.com/rust-bio/rust-bio-types/commit/43c00bd4248248a2ca4a68a06b747e5eab0c34f1)) 55 | * bump serde from 1.0.183 to 1.0.188 ([7c41547](https://github.com/rust-bio/rust-bio-types/commit/7c41547380e34d8827082696d62c868447110f1e)) 56 | * bump strum_macros from 0.24.3 
to 0.25.1 ([#56](https://github.com/rust-bio/rust-bio-types/issues/56)) ([0487bf4](https://github.com/rust-bio/rust-bio-types/commit/0487bf468b59731d36f59fdd1dad796cca0068d2)) 57 | * bump strum_macros from 0.25.1 to 0.25.2 ([4d12e6f](https://github.com/rust-bio/rust-bio-types/commit/4d12e6fb86277b8b361ca30ae7092dd7135be969)) 58 | * bump thiserror from 1.0.29 to 1.0.44 ([cb9a4d0](https://github.com/rust-bio/rust-bio-types/commit/cb9a4d04ff79225da72a28812428c088d5b4376b)) 59 | * bump thiserror from 1.0.44 to 1.0.47 ([d3e34e0](https://github.com/rust-bio/rust-bio-types/commit/d3e34e0497a106da974e079c4927dd9fc8246106)) 60 | 61 | ## [1.0.0](https://www.github.com/rust-bio/rust-bio-types/compare/v0.13.0...v1.0.0) (2023-06-06) 62 | 63 | 64 | ### ⚠ BREAKING CHANGES 65 | 66 | * custom number of columns in Alignment::pretty() (#19) 67 | 68 | ### Features 69 | 70 | * allow AlignmentMode to be used on the command line ([#44](https://www.github.com/rust-bio/rust-bio-types/issues/44)) ([40846fb](https://www.github.com/rust-bio/rust-bio-types/commit/40846fb38f5abe471bdebdd40193ed2b5e545826)) 71 | * custom number of columns in Alignment::pretty() ([#19](https://www.github.com/rust-bio/rust-bio-types/issues/19)) ([2528736](https://www.github.com/rust-bio/rust-bio-types/commit/252873617a6366520ee37dbd1f126f433e50c32e)) 72 | * strand: Use +/- rather than (+)/(-) for str ([#39](https://www.github.com/rust-bio/rust-bio-types/issues/39)) ([45f518e](https://www.github.com/rust-bio/rust-bio-types/commit/45f518e97148f3cac27b53f4a6272267236c2912)) 73 | 74 | 75 | ### Bug Fixes 76 | 77 | * link to repository and docs ([#41](https://www.github.com/rust-bio/rust-bio-types/issues/41)) ([5e078d4](https://www.github.com/rust-bio/rust-bio-types/commit/5e078d4a097ad3daad82af60430e5d8e39e4850e)) 78 | 79 | ## [0.13.0](https://www.github.com/rust-bio/rust-bio-types/compare/v0.12.1...v0.13.0) (2022-07-23) 80 | 81 | 82 | ### Features 83 | 84 | * strand: Use serde 
([#38](https://www.github.com/rust-bio/rust-bio-types/issues/38)) ([3fa1820](https://www.github.com/rust-bio/rust-bio-types/commit/3fa1820ec03165367b8c5025a6cf110fd968a81b)) 85 | 86 | ### [0.12.1](https://www.github.com/rust-bio/rust-bio-types/compare/v0.12.0...v0.12.1) (2021-11-17) 87 | 88 | 89 | ### Bug Fixes 90 | 91 | * Fix `'\'` being rendered as ’' on docs.rs ([#31](https://www.github.com/rust-bio/rust-bio-types/issues/31)) ([ebbb67a](https://www.github.com/rust-bio/rust-bio-types/commit/ebbb67a3b4683a0584f869828223d622c1a502c6)) 92 | 93 | ## [0.12.0](https://www.github.com/rust-bio/rust-bio-types/compare/v0.11.0...v0.12.0) (2021-07-09) 94 | 95 | 96 | ### Features 97 | 98 | * make AlignmentOperation hashable ([f1c4df0](https://www.github.com/rust-bio/rust-bio-types/commit/f1c4df09f0247ef76235f5ed6c17156535586b47)) 99 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anstream" 16 | version = "0.6.14" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" 19 | dependencies = [ 20 | "anstyle", 21 | "anstyle-parse", 22 | "anstyle-query", 23 | "anstyle-wincon", 24 | "colorchoice", 25 | "is_terminal_polyfill", 26 | "utf8parse", 27 | ] 28 | 29 | [[package]] 30 | name = "anstyle" 31 | version = "1.0.8" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" 34 | 35 | [[package]] 36 | name = "anstyle-parse" 37 | version = "0.2.1" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" 40 | dependencies = [ 41 | "utf8parse", 42 | ] 43 | 44 | [[package]] 45 | name = "anstyle-query" 46 | version = "1.0.0" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" 49 | dependencies = [ 50 | "windows-sys 0.48.0", 51 | ] 52 | 53 | [[package]] 54 | name = "anstyle-wincon" 55 | version = "3.0.3" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" 58 | dependencies = [ 59 | "anstyle", 60 | "windows-sys 0.52.0", 61 | ] 62 | 63 | [[package]] 64 | name = "bio-types" 65 | version = "1.0.4" 66 | dependencies = [ 67 | "clap", 68 | "derive-new", 69 | "lazy_static", 70 | "petgraph", 71 | "regex", 72 | "serde", 73 | "strum_macros", 74 | "thiserror", 75 | ] 76 | 77 | 
[[package]] 78 | name = "clap" 79 | version = "4.5.15" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" 82 | dependencies = [ 83 | "clap_builder", 84 | "clap_derive", 85 | ] 86 | 87 | [[package]] 88 | name = "clap_builder" 89 | version = "4.5.15" 90 | source = "registry+https://github.com/rust-lang/crates.io-index" 91 | checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" 92 | dependencies = [ 93 | "anstream", 94 | "anstyle", 95 | "clap_lex", 96 | "strsim", 97 | ] 98 | 99 | [[package]] 100 | name = "clap_derive" 101 | version = "4.5.13" 102 | source = "registry+https://github.com/rust-lang/crates.io-index" 103 | checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0" 104 | dependencies = [ 105 | "heck", 106 | "proc-macro2", 107 | "quote", 108 | "syn", 109 | ] 110 | 111 | [[package]] 112 | name = "clap_lex" 113 | version = "0.7.1" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" 116 | 117 | [[package]] 118 | name = "colorchoice" 119 | version = "1.0.0" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" 122 | 123 | [[package]] 124 | name = "derive-new" 125 | version = "0.6.0" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" 128 | dependencies = [ 129 | "proc-macro2", 130 | "quote", 131 | "syn", 132 | ] 133 | 134 | [[package]] 135 | name = "equivalent" 136 | version = "1.0.1" 137 | source = "registry+https://github.com/rust-lang/crates.io-index" 138 | checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" 139 | 140 | [[package]] 141 | name = "fixedbitset" 142 | version = "0.4.0" 143 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 144 | checksum = "398ea4fabe40b9b0d885340a2a991a44c8a645624075ad966d21f88688e2b69e" 145 | 146 | [[package]] 147 | name = "hashbrown" 148 | version = "0.14.5" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" 151 | 152 | [[package]] 153 | name = "heck" 154 | version = "0.5.0" 155 | source = "registry+https://github.com/rust-lang/crates.io-index" 156 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 157 | 158 | [[package]] 159 | name = "indexmap" 160 | version = "2.2.6" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" 163 | dependencies = [ 164 | "equivalent", 165 | "hashbrown", 166 | ] 167 | 168 | [[package]] 169 | name = "is_terminal_polyfill" 170 | version = "1.70.0" 171 | source = "registry+https://github.com/rust-lang/crates.io-index" 172 | checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" 173 | 174 | [[package]] 175 | name = "lazy_static" 176 | version = "1.5.0" 177 | source = "registry+https://github.com/rust-lang/crates.io-index" 178 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 179 | 180 | [[package]] 181 | name = "memchr" 182 | version = "2.7.4" 183 | source = "registry+https://github.com/rust-lang/crates.io-index" 184 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 185 | 186 | [[package]] 187 | name = "petgraph" 188 | version = "0.6.5" 189 | source = "registry+https://github.com/rust-lang/crates.io-index" 190 | checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" 191 | dependencies = [ 192 | "fixedbitset", 193 | "indexmap", 194 | ] 195 | 196 | [[package]] 197 | name = "proc-macro2" 198 | version = "1.0.86" 199 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 200 | checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" 201 | dependencies = [ 202 | "unicode-ident", 203 | ] 204 | 205 | [[package]] 206 | name = "quote" 207 | version = "1.0.36" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" 210 | dependencies = [ 211 | "proc-macro2", 212 | ] 213 | 214 | [[package]] 215 | name = "regex" 216 | version = "1.10.6" 217 | source = "registry+https://github.com/rust-lang/crates.io-index" 218 | checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" 219 | dependencies = [ 220 | "aho-corasick", 221 | "memchr", 222 | "regex-automata", 223 | "regex-syntax", 224 | ] 225 | 226 | [[package]] 227 | name = "regex-automata" 228 | version = "0.4.7" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" 231 | dependencies = [ 232 | "aho-corasick", 233 | "memchr", 234 | "regex-syntax", 235 | ] 236 | 237 | [[package]] 238 | name = "regex-syntax" 239 | version = "0.8.4" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" 242 | 243 | [[package]] 244 | name = "rustversion" 245 | version = "1.0.5" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" 248 | 249 | [[package]] 250 | name = "serde" 251 | version = "1.0.206" 252 | source = "registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" 254 | dependencies = [ 255 | "serde_derive", 256 | ] 257 | 258 | [[package]] 259 | name = "serde_derive" 260 | version = "1.0.206" 261 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" 263 | dependencies = [ 264 | "proc-macro2", 265 | "quote", 266 | "syn", 267 | ] 268 | 269 | [[package]] 270 | name = "strsim" 271 | version = "0.11.1" 272 | source = "registry+https://github.com/rust-lang/crates.io-index" 273 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 274 | 275 | [[package]] 276 | name = "strum_macros" 277 | version = "0.26.4" 278 | source = "registry+https://github.com/rust-lang/crates.io-index" 279 | checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" 280 | dependencies = [ 281 | "heck", 282 | "proc-macro2", 283 | "quote", 284 | "rustversion", 285 | "syn", 286 | ] 287 | 288 | [[package]] 289 | name = "syn" 290 | version = "2.0.70" 291 | source = "registry+https://github.com/rust-lang/crates.io-index" 292 | checksum = "2f0209b68b3613b093e0ec905354eccaedcfe83b8cb37cbdeae64026c3064c16" 293 | dependencies = [ 294 | "proc-macro2", 295 | "quote", 296 | "unicode-ident", 297 | ] 298 | 299 | [[package]] 300 | name = "thiserror" 301 | version = "1.0.63" 302 | source = "registry+https://github.com/rust-lang/crates.io-index" 303 | checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" 304 | dependencies = [ 305 | "thiserror-impl", 306 | ] 307 | 308 | [[package]] 309 | name = "thiserror-impl" 310 | version = "1.0.63" 311 | source = "registry+https://github.com/rust-lang/crates.io-index" 312 | checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" 313 | dependencies = [ 314 | "proc-macro2", 315 | "quote", 316 | "syn", 317 | ] 318 | 319 | [[package]] 320 | name = "unicode-ident" 321 | version = "1.0.10" 322 | source = "registry+https://github.com/rust-lang/crates.io-index" 323 | checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" 324 | 325 | [[package]] 326 | name = "utf8parse" 327 | version 
= "0.2.1" 328 | source = "registry+https://github.com/rust-lang/crates.io-index" 329 | checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" 330 | 331 | [[package]] 332 | name = "windows-sys" 333 | version = "0.48.0" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" 336 | dependencies = [ 337 | "windows-targets 0.48.1", 338 | ] 339 | 340 | [[package]] 341 | name = "windows-sys" 342 | version = "0.52.0" 343 | source = "registry+https://github.com/rust-lang/crates.io-index" 344 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 345 | dependencies = [ 346 | "windows-targets 0.52.6", 347 | ] 348 | 349 | [[package]] 350 | name = "windows-targets" 351 | version = "0.48.1" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" 354 | dependencies = [ 355 | "windows_aarch64_gnullvm 0.48.0", 356 | "windows_aarch64_msvc 0.48.0", 357 | "windows_i686_gnu 0.48.0", 358 | "windows_i686_msvc 0.48.0", 359 | "windows_x86_64_gnu 0.48.0", 360 | "windows_x86_64_gnullvm 0.48.0", 361 | "windows_x86_64_msvc 0.48.0", 362 | ] 363 | 364 | [[package]] 365 | name = "windows-targets" 366 | version = "0.52.6" 367 | source = "registry+https://github.com/rust-lang/crates.io-index" 368 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 369 | dependencies = [ 370 | "windows_aarch64_gnullvm 0.52.6", 371 | "windows_aarch64_msvc 0.52.6", 372 | "windows_i686_gnu 0.52.6", 373 | "windows_i686_gnullvm", 374 | "windows_i686_msvc 0.52.6", 375 | "windows_x86_64_gnu 0.52.6", 376 | "windows_x86_64_gnullvm 0.52.6", 377 | "windows_x86_64_msvc 0.52.6", 378 | ] 379 | 380 | [[package]] 381 | name = "windows_aarch64_gnullvm" 382 | version = "0.48.0" 383 | source = "registry+https://github.com/rust-lang/crates.io-index" 384 | checksum = 
"91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" 385 | 386 | [[package]] 387 | name = "windows_aarch64_gnullvm" 388 | version = "0.52.6" 389 | source = "registry+https://github.com/rust-lang/crates.io-index" 390 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 391 | 392 | [[package]] 393 | name = "windows_aarch64_msvc" 394 | version = "0.48.0" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" 397 | 398 | [[package]] 399 | name = "windows_aarch64_msvc" 400 | version = "0.52.6" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 403 | 404 | [[package]] 405 | name = "windows_i686_gnu" 406 | version = "0.48.0" 407 | source = "registry+https://github.com/rust-lang/crates.io-index" 408 | checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" 409 | 410 | [[package]] 411 | name = "windows_i686_gnu" 412 | version = "0.52.6" 413 | source = "registry+https://github.com/rust-lang/crates.io-index" 414 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 415 | 416 | [[package]] 417 | name = "windows_i686_gnullvm" 418 | version = "0.52.6" 419 | source = "registry+https://github.com/rust-lang/crates.io-index" 420 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 421 | 422 | [[package]] 423 | name = "windows_i686_msvc" 424 | version = "0.48.0" 425 | source = "registry+https://github.com/rust-lang/crates.io-index" 426 | checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" 427 | 428 | [[package]] 429 | name = "windows_i686_msvc" 430 | version = "0.52.6" 431 | source = "registry+https://github.com/rust-lang/crates.io-index" 432 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 433 | 434 | [[package]] 
435 | name = "windows_x86_64_gnu" 436 | version = "0.48.0" 437 | source = "registry+https://github.com/rust-lang/crates.io-index" 438 | checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" 439 | 440 | [[package]] 441 | name = "windows_x86_64_gnu" 442 | version = "0.52.6" 443 | source = "registry+https://github.com/rust-lang/crates.io-index" 444 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 445 | 446 | [[package]] 447 | name = "windows_x86_64_gnullvm" 448 | version = "0.48.0" 449 | source = "registry+https://github.com/rust-lang/crates.io-index" 450 | checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" 451 | 452 | [[package]] 453 | name = "windows_x86_64_gnullvm" 454 | version = "0.52.6" 455 | source = "registry+https://github.com/rust-lang/crates.io-index" 456 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 457 | 458 | [[package]] 459 | name = "windows_x86_64_msvc" 460 | version = "0.48.0" 461 | source = "registry+https://github.com/rust-lang/crates.io-index" 462 | checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" 463 | 464 | [[package]] 465 | name = "windows_x86_64_msvc" 466 | version = "0.52.6" 467 | source = "registry+https://github.com/rust-lang/crates.io-index" 468 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 469 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bio-types" 3 | version = "1.0.4" 4 | authors = ["Johannes Köster "] 5 | description = "A collection of common biomedical types for use in rust-bio and rust-htslib." 
6 | homepage = "https://rust-bio.github.io" 7 | repository = "https://github.com/rust-bio/rust-bio-types" 8 | documentation = "https://docs.rs/bio-types" 9 | readme = "README.md" 10 | license = "MIT" 11 | license-file = "LICENSE.md" 12 | edition = "2018" 13 | exclude = [".gitignore", ".github"] 14 | 15 | [features] 16 | phylogeny = ["petgraph"] 17 | 18 | [dependencies] 19 | serde = { version = "^1", optional = true, features=["derive"] } 20 | clap = { version = ">=3.2.0", optional = true, features = ["derive"] } 21 | thiserror = ">=1, <3" 22 | regex = "1.10" 23 | lazy_static = "1.5" 24 | derive-new = ">=0.6, <0.8" 25 | petgraph = { version = ">=0.5, <0.7", optional = true } 26 | strum_macros = ">=0.20, <0.27" 27 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Johannes Köster, the Rust-Bio team. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Crates.io](https://img.shields.io/crates/d/bio-types.svg)](https://crates.io/crates/bio-types) 2 | [![Crates.io](https://img.shields.io/crates/v/bio-types.svg)](https://crates.io/crates/bio-types) 3 | [![Crates.io](https://img.shields.io/crates/l/bio-types.svg)](https://crates.io/crates/bio-types) 4 | [![GitHub Workflow Status](https://github.com/rust-bio/rust-bio-types/actions/workflows/rust.yml/badge.svg?branch=master)](https://github.com/rust-bio/rust-bio-types/actions/workflows/rust.yml) 5 | [![Coverage Status](https://coveralls.io/repos/github/rust-bio/rust-bio-types/badge.svg?branch=master)](https://coveralls.io/github/rust-bio/rust-bio-types?branch=master) 6 | 7 | # Rust-Bio-Types 8 | 9 | This crate provides common biomedical types that will be used by rust-bio and rust-htslib. Feel free to provide additional types as needed. 10 | -------------------------------------------------------------------------------- /src/alignment.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014-2015 Johannes Köster, Vadim Nazarov, Patrick Marks 2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | //! 
Types for representing pairwise sequence alignments 7 | 8 | #[cfg(feature = "clap")] 9 | use clap::ValueEnum; 10 | #[cfg(feature = "serde")] 11 | use serde::{Deserialize, Serialize}; 12 | 13 | pub type TextSlice<'a> = &'a [u8]; 14 | 15 | /// Alignment operations supported are match, substitution, insertion, deletion 16 | /// and clipping. Clipping is a special boundary condition where you are allowed 17 | /// to clip off the beginning/end of the sequence for a fixed clip penalty. The 18 | /// clip penalty could be different for the two sequences x and y, and the 19 | /// clipping operations on both are distinguishable (Xclip and Yclip). The usize 20 | /// value associated with the clipping operations are the lengths clipped. In case 21 | /// of standard modes like Global, Semi-Global and Local alignment, the clip operations 22 | /// are filtered out 23 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 24 | #[derive(Eq, PartialEq, Debug, Copy, Clone, Hash)] 25 | pub enum AlignmentOperation { 26 | Match, 27 | Subst, 28 | Del, 29 | Ins, 30 | Xclip(usize), 31 | Yclip(usize), 32 | } 33 | 34 | /// The modes of alignment supported by the aligner include standard modes such as 35 | /// Global, Semi-Global and Local alignment. In addition to this, user can also invoke 36 | /// the custom mode. In the custom mode, users can explicitly specify the clipping penalties 37 | /// for prefix and suffix of strings 'x' and 'y' independently. Under the hood the standard 38 | /// modes are implemented as special cases of the custom mode with the clipping penalties 39 | /// appropriately set. 40 | /// 41 | /// The default alignment mode is Global. 
42 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 43 | #[cfg_attr(feature = "clap", derive(ValueEnum))] 44 | #[derive(Debug, PartialEq, Eq, Copy, Clone)] 45 | pub enum AlignmentMode { 46 | Local, 47 | Semiglobal, 48 | Global, 49 | Custom, 50 | } 51 | 52 | impl Default for AlignmentMode { 53 | fn default() -> Self { 54 | AlignmentMode::Global 55 | } 56 | } 57 | 58 | /// We consider alignment between two sequences x and y. x is the query or read sequence 59 | /// and y is the reference or template sequence. An alignment, consisting of a score, 60 | /// the start and end position of the alignment on sequence x and sequence y, the 61 | /// lengths of sequences x and y, and the alignment edit operations. The start position 62 | /// and end position of the alignment does not include the clipped regions. The length 63 | /// of clipped regions are already encapsulated in the Alignment Operation. 64 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 65 | #[derive(Debug, Eq, PartialEq, Clone, Default)] 66 | pub struct Alignment { 67 | /// Smith-Waterman alignment score 68 | pub score: i32, 69 | 70 | /// Start position of alignment in reference 71 | pub ystart: usize, 72 | 73 | /// Start position of alignment in query 74 | pub xstart: usize, 75 | 76 | /// End position of alignment in reference 77 | pub yend: usize, 78 | 79 | /// End position of alignment in query 80 | pub xend: usize, 81 | 82 | /// Length of the reference sequence 83 | pub ylen: usize, 84 | 85 | /// Length of the query sequence 86 | pub xlen: usize, 87 | 88 | /// Vector of alignment operations 89 | pub operations: Vec, 90 | pub mode: AlignmentMode, 91 | } 92 | 93 | impl Alignment { 94 | /// Calculate the cigar string from the alignment struct. 
x is the target string 95 | /// (NOTE(review): the `Alignment` docs call x the query/read sequence - confirm the wording.) Leading and trailing clips of `xstart` and `xlen - xend` symbols are written as `H` when `hard_clip` is true and as `S` otherwise; `Xclip`/`Yclip` operations themselves are skipped, and an empty operation list yields an empty string. Panics if `mode` is `Global` or `Local`. 96 | /// # Example 97 | /// 98 | /// ``` 99 | /// use bio_types::alignment::{Alignment,AlignmentMode}; 100 | /// use bio_types::alignment::AlignmentOperation::{Match, Subst, Ins, Del}; 101 | /// let alignment = Alignment { 102 | /// score: 5, 103 | /// xstart: 3, 104 | /// ystart: 0, 105 | /// xend: 9, 106 | /// yend: 10, 107 | /// ylen: 10, 108 | /// xlen: 10, 109 | /// operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del], 110 | /// mode: AlignmentMode::Semiglobal 111 | /// }; 112 | /// assert_eq!(alignment.cigar(false), "3S3=1X2I2D1S"); 113 | /// ``` 114 | pub fn cigar(&self, hard_clip: bool) -> String { 115 | match self.mode { 116 | AlignmentMode::Global => panic!(" Cigar fn not supported for Global Alignment mode"), 117 | AlignmentMode::Local => panic!(" Cigar fn not supported for Local Alignment mode"), 118 | _ => {} 119 | } 120 | 121 | let clip_str = if hard_clip { "H" } else { "S" }; 122 | 123 | let add_op = |op: AlignmentOperation, k, cigar: &mut String| match op { 124 | AlignmentOperation::Match => cigar.push_str(&format!("{}{}", k, "=")), 125 | AlignmentOperation::Subst => cigar.push_str(&format!("{}{}", k, "X")), 126 | AlignmentOperation::Del => cigar.push_str(&format!("{}{}", k, "D")), 127 | AlignmentOperation::Ins => cigar.push_str(&format!("{}{}", k, "I")), 128 | _ => {} 129 | }; 130 | 131 | let mut cigar = "".to_owned(); 132 | if self.operations.is_empty() { 133 | return cigar; 134 | } 135 | 136 | let mut last = self.operations[0]; 137 | if self.xstart > 0 { 138 | cigar.push_str(&format!("{}{}", self.xstart, clip_str)) 139 | } 140 | let mut k = 1; 141 | for &op in self.operations[1..].iter() { 142 | if op == last { 143 | k += 1; 144 | } else { 145 | add_op(last, k, &mut cigar); 146 | k = 1; 147 | } 148 | last = op; 149 | } 150 | add_op(last, k, &mut cigar); 151 | if self.xlen > self.xend { 152 | cigar.push_str(&format!("{}{}", self.xlen - self.xend, clip_str)) 153 | } 154 | 155 | cigar 156 | } 157 | 158 | 
/// Return the pretty formatted alignment as a String. The string 159 | /// contains sets of 3 lines of at most `ncol` characters each. First line is for the 160 | /// sequence x, second line is for the alignment operation and 161 | /// the third line is for the sequence y; each set is followed by two empty lines. A '-' in the sequence 162 | /// indicates a blank (insertion/deletion). The operations follow 163 | /// the following convention: '|' for a match, '\\' (a single backslash) for a mismatch, 164 | /// '+' for an insertion, 'x' for a deletion and ' ' for clipping. `ncol` must be non-zero: with `ncol == 0` and a non-empty alignment the chunking loop never advances. NOTE(review): `Xclip`/`Yclip` copy the first `len` symbols of x/y instead of starting at the current offset - confirm this is intended for trailing clips. 165 | /// 166 | /// # Example 167 | /// 168 | /// If we align the strings "CCGTCCGGCAAGGG" and "AAAAACCGTTGACGGCCAA" 169 | /// in various modes, we will get the following output: 170 | /// 171 | /// Semiglobal: 172 | /// ```c 173 | /// CCGTCCGGCAAGGG 174 | /// ||||++++\\|\|| 175 | /// AAAAACCGT----TGACGGCCAA 176 | /// ``` 177 | /// 178 | /// Local: 179 | /// ```c 180 | /// CCGTCCGGCAAGGG 181 | /// |||| 182 | /// AAAAACCGT TGACGGCCAA 183 | /// ``` 184 | /// 185 | /// Global: 186 | /// ```c 187 | /// -----CCGT--CCGGCAAGGG 188 | /// xxxxx||||xx\||||\|++\ 189 | /// AAAAACCGTTGACGGCCA--A 190 | /// ``` 191 | /// 192 | pub fn pretty(&self, x: TextSlice, y: TextSlice, ncol: usize) -> String { 193 | let mut x_pretty = String::new(); 194 | let mut y_pretty = String::new(); 195 | let mut inb_pretty = String::new(); 196 | 197 | if !self.operations.is_empty() { 198 | let mut x_i: usize; 199 | let mut y_i: usize; 200 | 201 | // If the alignment mode is one of the standard ones, the prefix clipping is 202 | // implicit so we need to process it here 203 | match self.mode { 204 | AlignmentMode::Custom => { 205 | x_i = 0; 206 | y_i = 0; 207 | } 208 | _ => { 209 | x_i = self.xstart; 210 | y_i = self.ystart; 211 | for k in x.iter().take(self.xstart) { 212 | x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k]))); 213 | inb_pretty.push(' '); 214 | y_pretty.push(' ') 215 | } 216 | for k in y.iter().take(self.ystart) { 217 | y_pretty.push_str(&format!("{}", 
String::from_utf8_lossy(&[*k]))); 218 | inb_pretty.push(' '); 219 | x_pretty.push(' ') 220 | } 221 | } 222 | } 223 | 224 | // Process the alignment. 225 | for i in 0..self.operations.len() { 226 | match self.operations[i] { 227 | AlignmentOperation::Match => { 228 | x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]]))); 229 | x_i += 1; 230 | 231 | inb_pretty.push('|'); 232 | 233 | y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]]))); 234 | y_i += 1; 235 | } 236 | AlignmentOperation::Subst => { 237 | x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]]))); 238 | x_i += 1; 239 | 240 | inb_pretty.push('\\'); 241 | 242 | y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]]))); 243 | y_i += 1; 244 | } 245 | AlignmentOperation::Del => { 246 | x_pretty.push('-'); 247 | 248 | inb_pretty.push('x'); 249 | 250 | y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]]))); 251 | y_i += 1; 252 | } 253 | AlignmentOperation::Ins => { 254 | x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]]))); 255 | x_i += 1; 256 | 257 | inb_pretty.push('+'); 258 | 259 | y_pretty.push('-'); 260 | } 261 | AlignmentOperation::Xclip(len) => { 262 | for k in x.iter().take(len) { 263 | x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k]))); 264 | x_i += 1; 265 | 266 | inb_pretty.push(' '); 267 | 268 | y_pretty.push(' ') 269 | } 270 | } 271 | AlignmentOperation::Yclip(len) => { 272 | for k in y.iter().take(len) { 273 | y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k]))); 274 | y_i += 1; 275 | 276 | inb_pretty.push(' '); 277 | 278 | x_pretty.push(' ') 279 | } 280 | } 281 | } 282 | } 283 | 284 | // If the alignment mode is one of the standard ones, the suffix clipping is 285 | // implicit so we need to process it here 286 | match self.mode { 287 | AlignmentMode::Custom => {} 288 | _ => { 289 | for k in x.iter().take(self.xlen).skip(x_i) { 290 | x_pretty.push_str(&format!("{}", 
String::from_utf8_lossy(&[*k]))); 291 | inb_pretty.push(' '); 292 | y_pretty.push(' ') 293 | } 294 | for k in y.iter().take(self.ylen).skip(y_i) { 295 | y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k]))); 296 | inb_pretty.push(' '); 297 | x_pretty.push(' ') 298 | } 299 | } 300 | } 301 | } 302 | 303 | let mut s = String::new(); 304 | let mut idx = 0; 305 | use std::cmp::min; 306 | 307 | assert_eq!(x_pretty.len(), inb_pretty.len()); 308 | assert_eq!(y_pretty.len(), inb_pretty.len()); 309 | 310 | let ml = x_pretty.len(); 311 | 312 | while idx < ml { 313 | let rng = idx..min(idx + ncol, ml); 314 | s.push_str(&x_pretty[rng.clone()]); 315 | s.push('\n'); 316 | 317 | s.push_str(&inb_pretty[rng.clone()]); 318 | s.push('\n'); 319 | 320 | s.push_str(&y_pretty[rng]); 321 | s.push('\n'); 322 | 323 | s.push_str("\n\n"); 324 | idx += ncol; 325 | } 326 | 327 | s 328 | } 329 | 330 | /// Returns the optimal path in the alignment matrix 331 | /// 332 | /// # Example 333 | /// 334 | /// ``` 335 | /// use bio_types::alignment::{Alignment,AlignmentMode}; 336 | /// use bio_types::alignment::AlignmentOperation::*; 337 | /// let alignment = Alignment { 338 | /// score: 5, 339 | /// xstart: 3, 340 | /// ystart: 0, 341 | /// xend: 9, 342 | /// yend: 10, 343 | /// ylen: 10, 344 | /// xlen: 10, 345 | /// operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del], 346 | /// mode: AlignmentMode::Semiglobal, 347 | /// }; 348 | /// assert_eq!(alignment.path(),[ 349 | /// (4, 5, Match), 350 | /// (5, 6, Match), 351 | /// (6, 7, Match), 352 | /// (7, 8, Subst), 353 | /// (8, 8, Ins), 354 | /// (9, 8, Ins), 355 | /// (9, 9, Del), 356 | /// (9, 10, Del)]) 357 | /// ``` 358 | pub fn path(&self) -> Vec<(usize, usize, AlignmentOperation)> { 359 | let mut path = Vec::new(); 360 | 361 | if !self.operations.is_empty() { 362 | let last = match self.mode { 363 | AlignmentMode::Custom => (self.xlen, self.ylen), 364 | _ => (self.xend, self.yend), 365 | }; 366 | let mut x_i = last.0; 367 | 
let mut y_i = last.1; 368 | 369 | let mut ops = self.operations.clone(); 370 | ops.reverse(); 371 | 372 | // Process the alignment in reverse, recording each position before stepping back over its operation. 373 | for i in ops { 374 | path.push((x_i, y_i, i)); 375 | match i { 376 | AlignmentOperation::Match => { 377 | x_i -= 1; 378 | y_i -= 1; 379 | } 380 | AlignmentOperation::Subst => { 381 | x_i -= 1; 382 | y_i -= 1; 383 | } 384 | AlignmentOperation::Del => { 385 | y_i -= 1; 386 | } 387 | AlignmentOperation::Ins => { 388 | x_i -= 1; 389 | } 390 | AlignmentOperation::Xclip(len) => { 391 | x_i -= len; 392 | } 393 | AlignmentOperation::Yclip(len) => { 394 | y_i -= len; 395 | } 396 | } 397 | } 398 | } 399 | path.reverse(); 400 | path 401 | } 402 | 403 | /// Filter out Xclip and Yclip operations from the list of operations. Useful 404 | /// when invoking the standard modes. 405 | pub fn filter_clip_operations(&mut self) { 406 | use self::AlignmentOperation::{Del, Ins, Match, Subst}; 407 | self.operations 408 | .retain(|x| (*x == Match || *x == Subst || *x == Ins || *x == Del)); 409 | } 410 | 411 | /// Number of bases in reference sequence that are aligned (`yend - ystart`) 412 | pub fn y_aln_len(&self) -> usize { 413 | self.yend - self.ystart 414 | } 415 | 416 | /// Number of bases in query sequence that are aligned (`xend - xstart`) 417 | pub fn x_aln_len(&self) -> usize { 418 | self.xend - self.xstart 419 | } 420 | } 421 | 422 | #[cfg(test)] 423 | mod tests { 424 | use super::AlignmentOperation::*; 425 | use super::*; 426 | 427 | #[test] 428 | fn test_cigar() { 429 | let alignment = Alignment { 430 | score: 5, 431 | xstart: 3, 432 | ystart: 0, 433 | xend: 9, 434 | yend: 10, 435 | ylen: 10, 436 | xlen: 10, 437 | operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del], 438 | mode: AlignmentMode::Semiglobal, 439 | }; 440 | assert_eq!(alignment.cigar(false), "3S3=1X2I2D1S"); 441 | 442 | let alignment = Alignment { 443 | score: 5, 444 | xstart: 0, 445 | ystart: 5, 446 | xend: 4, 447 | yend: 10, 448 | ylen: 10, 449 | xlen: 5, 450 | operations: vec![Yclip(5), Match, Subst, 
Subst, Ins, Del, Del, Xclip(1)], 451 | mode: AlignmentMode::Custom, 452 | }; 453 | assert_eq!(alignment.cigar(false), "1=2X1I2D1S"); 454 | assert_eq!(alignment.cigar(true), "1=2X1I2D1H"); 455 | 456 | let alignment = Alignment { 457 | score: 5, 458 | xstart: 0, 459 | ystart: 5, 460 | xend: 3, 461 | yend: 8, 462 | ylen: 10, 463 | xlen: 3, 464 | operations: vec![Yclip(5), Subst, Match, Subst, Yclip(2)], 465 | mode: AlignmentMode::Custom, 466 | }; 467 | assert_eq!(alignment.cigar(false), "1X1=1X"); 468 | 469 | let alignment = Alignment { 470 | score: 5, 471 | xstart: 0, 472 | ystart: 5, 473 | xend: 3, 474 | yend: 8, 475 | ylen: 10, 476 | xlen: 3, 477 | operations: vec![Subst, Match, Subst], 478 | mode: AlignmentMode::Semiglobal, 479 | }; 480 | assert_eq!(alignment.cigar(false), "1X1=1X"); 481 | } 482 | 483 | #[test] 484 | fn test_pretty() { 485 | let alignment = Alignment { 486 | score: 1, 487 | xstart: 0, 488 | ystart: 2, 489 | xend: 3, 490 | yend: 5, 491 | ylen: 7, 492 | xlen: 2, 493 | operations: vec![Subst, Match, Match], 494 | mode: AlignmentMode::Semiglobal, 495 | }; 496 | let pretty = concat!(" GAT \n", " \\|| \n", "CTAATCC\n", "\n\n"); 497 | assert_eq!(alignment.pretty(b"GAT", b"CTAATCC", 100), pretty); 498 | let alignment = Alignment { 499 | score: 5, 500 | xstart: 0, 501 | ystart: 5, 502 | xend: 4, 503 | yend: 10, 504 | ylen: 10, 505 | xlen: 5, 506 | operations: vec![Yclip(5), Match, Subst, Subst, Ins, Del, Del, Xclip(1)], 507 | mode: AlignmentMode::Custom, 508 | }; 509 | let pretty = concat!(" AAAA--A\n |\\\\+xx \nTTTTTTTT-TT \n\n\n"); 510 | assert_eq!(alignment.pretty(b"AAAAA", b"TTTTTTTTTT", 100), pretty); 511 | } 512 | 513 | #[test] 514 | fn test_path() { 515 | let alignment = Alignment { 516 | score: 5, 517 | xstart: 3, 518 | ystart: 0, 519 | xend: 9, 520 | yend: 10, 521 | ylen: 10, 522 | xlen: 10, 523 | operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del], 524 | mode: AlignmentMode::Semiglobal, 525 | }; 526 | assert_eq!( 527 | 
alignment.path(), 528 | [ 529 | (4, 5, Match), 530 | (5, 6, Match), 531 | (6, 7, Match), 532 | (7, 8, Subst), 533 | (8, 8, Ins), 534 | (9, 8, Ins), 535 | (9, 9, Del), 536 | (9, 10, Del) 537 | ] 538 | ) 539 | } 540 | } 541 | -------------------------------------------------------------------------------- /src/annot/contig.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Nicholas Ingolia 2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | //! Contiguous region on a named sequence, e.g., chromosome XI 7 | //! 334,915-334,412. 8 | 9 | use std::cmp::{max, min}; 10 | use std::convert::Into; 11 | use std::fmt::{self, Display, Formatter}; 12 | use std::ops::Neg; 13 | use std::str::FromStr; 14 | 15 | use regex::Regex; 16 | 17 | use crate::annot::loc::Loc; 18 | use crate::annot::pos::Pos; 19 | use crate::annot::*; 20 | use crate::strand::*; 21 | 22 | /// Contiguous sequence region on a particular, named sequence (e.g. a 23 | /// chromosome) 24 | /// 25 | /// Parameterized over the type of the reference sequence identifier 26 | /// and over the strandedness of the position. 27 | /// 28 | /// The display format for a `Contig` is _chr:start-end(+/-/.)_. The 29 | /// boundaries are given as a half-open 0-based interval, like the 30 | /// Rust `Range` and BED format. 
31 | /// 32 | /// ``` 33 | /// # use bio_types::annot::ParseAnnotError; 34 | /// # fn try_main() -> Result<(), Box> { 35 | /// use bio_types::annot::contig::Contig; 36 | /// use bio_types::strand::ReqStrand; 37 | /// let tma19 = Contig::new("chrXI".to_owned(), 334412, (334916 - 334412), ReqStrand::Reverse); 38 | /// let tma19_str = tma19.to_string(); 39 | /// assert_eq!(tma19_str, "chrXI:334412-334916(-)"); 40 | /// let tma19_str_loc = tma19_str.parse()?; 41 | /// assert_eq!(tma19, tma19_str_loc); 42 | /// # Ok(()) 43 | /// # } 44 | /// # fn main() { try_main().unwrap(); } 45 | /// ``` 46 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] 47 | pub struct Contig { 48 | refid: R, 49 | start: isize, 50 | length: usize, 51 | strand: S, 52 | } 53 | 54 | impl Contig { 55 | /// Construct a new sequence contig location 56 | /// 57 | /// ``` 58 | /// use std::rc::Rc; 59 | /// use bio_types::annot::contig::Contig; 60 | /// use bio_types::strand::ReqStrand; 61 | /// let chr = Rc::new("chrX".to_owned()); 62 | /// let tma22 = Contig::new(chr, 461829, 462426 - 461829, ReqStrand::Forward); 63 | /// ``` 64 | pub fn new(refid: R, start: isize, length: usize, strand: S) -> Self { 65 | Contig { 66 | refid, 67 | start, 68 | length, 69 | strand, 70 | } 71 | } 72 | 73 | /// Construct a new sequence contig location from a starting 74 | /// position and length. 75 | /// 76 | /// In general, the starting position must have a "strandedness", 77 | /// and reverse-strand starting positions will extend towards 78 | /// lower coordinates from the starting position. 
79 | /// 80 | /// 81 | /// 82 | /// ``` 83 | /// # use bio_types::annot::AnnotError; 84 | /// # fn try_main() -> Result<(), Box> { 85 | /// use bio_types::annot::contig::Contig; 86 | /// use bio_types::annot::pos::Pos; 87 | /// use bio_types::strand::ReqStrand; 88 | /// 89 | /// let tma22_first = Pos::new("chrX".to_string(), 461829, ReqStrand::Forward); 90 | /// let tma22 = Contig::with_first_length(&tma22_first, 462426 - 461829)?; 91 | /// assert_eq!(tma22.to_string(), "chrX:461829-462426(+)"); 92 | /// 93 | /// let tma19_first = Pos::new("chrXI".to_string(), 335015, ReqStrand::Reverse); 94 | /// let tma19 = Contig::with_first_length(&tma19_first, 335016 - 334412)?; 95 | /// assert_eq!(tma19.to_string(), "chrXI:334412-335016(-)"); 96 | /// # Ok(()) 97 | /// # } 98 | /// # fn main() { try_main().unwrap(); } 99 | /// ``` 100 | pub fn with_first_length(pos: &Pos, length: usize) -> Result 101 | where 102 | R: Clone, 103 | S: Into> + Copy, 104 | { 105 | if length < 2 { 106 | Ok(Contig { 107 | refid: pos.refid().clone(), 108 | start: pos.start(), 109 | length, 110 | strand: pos.strand(), 111 | }) 112 | } else { 113 | let start = match pos.strand().into() { 114 | None => Err(AnnotError::NoStrand), 115 | Some(ReqStrand::Forward) => Ok(pos.start()), 116 | Some(ReqStrand::Reverse) => Ok(1 + pos.start() - length as isize), 117 | }?; 118 | 119 | Ok(Contig { 120 | refid: pos.refid().clone(), 121 | start, 122 | length, 123 | strand: pos.strand(), 124 | }) 125 | } 126 | } 127 | 128 | /// Convert into a stranded sequence location on the specified strand 129 | pub fn into_stranded(self, strand: ReqStrand) -> Contig { 130 | Contig { 131 | refid: self.refid, 132 | start: self.start, 133 | length: self.length, 134 | strand, 135 | } 136 | } 137 | } 138 | 139 | impl Contig { 140 | /// Extend the annotation by `dist` in the upstream direction on the 141 | /// annotated strand. 142 | /// 143 | /// # Arguments 144 | /// 145 | /// * `dist` specifies the offset for sliding the position. 
The 146 | /// left, 5'-most end of the contig will expand for forward-strand 147 | /// annotations and the right, 3'-most end will expand for 148 | /// reverse-strand annotations. 149 | /// 150 | /// ``` 151 | /// use bio_types::annot::contig::Contig; 152 | /// use bio_types::strand::ReqStrand; 153 | /// let mut tma22 = Contig::new("chrX".to_owned(), 461829, 462426 - 461829, ReqStrand::Forward); 154 | /// tma22.extend_upstream(100); 155 | /// assert_eq!(tma22.to_string(), "chrX:461729-462426(+)"); 156 | /// let mut tma19 = Contig::new("chrXI".to_owned(), 334412, 334916 - 334412, ReqStrand::Reverse); 157 | /// tma19.extend_upstream(100); 158 | /// assert_eq!(tma19.to_string(), "chrXI:334412-335016(-)"); 159 | /// ``` 160 | pub fn extend_upstream(&mut self, dist: usize) { 161 | self.length += dist; 162 | if self.strand == ReqStrand::Forward { 163 | self.start -= dist as isize; 164 | } 165 | } 166 | 167 | /// Extend the annotation by `dist` in the downstream direction on the 168 | /// annotated strand. 169 | /// 170 | /// # Arguments 171 | /// 172 | /// * `dist` specifies the offset for sliding the position. The 173 | /// right, 3'-most end of the contig will expand for 174 | /// forward-strand annotations and the left, 5'-most end will 175 | /// expand for reverse-strand annotations. 
176 | /// 177 | /// ``` 178 | /// use bio_types::annot::contig::Contig; 179 | /// use bio_types::strand::ReqStrand; 180 | /// let mut tma22 = Contig::new("chrX".to_owned(), 461829, 462426 - 461829, ReqStrand::Forward); 181 | /// tma22.extend_downstream(100); 182 | /// assert_eq!(tma22.to_string(), "chrX:461829-462526(+)"); 183 | /// let mut tma19 = Contig::new("chrXI".to_owned(), 334412, 334916 - 334412, ReqStrand::Reverse); 184 | /// tma19.extend_downstream(100); 185 | /// assert_eq!(tma19.to_string(), "chrXI:334312-334916(-)"); 186 | /// ``` 187 | pub fn extend_downstream(&mut self, dist: usize) { 188 | self.length += dist; 189 | if self.strand == ReqStrand::Reverse { 190 | self.start -= dist as isize; 191 | } 192 | } 193 | } 194 | 195 | impl Loc for Contig { 196 | type RefID = R; 197 | type Strand = S; 198 | fn refid(&self) -> &R { 199 | &self.refid 200 | } 201 | fn start(&self) -> isize { 202 | self.start 203 | } 204 | fn length(&self) -> usize { 205 | self.length 206 | } 207 | fn strand(&self) -> S 208 | where 209 | S: Copy, 210 | { 211 | self.strand 212 | } 213 | 214 | fn pos_into(&self, pos: &Pos) -> Option> 215 | where 216 | Self::RefID: Eq, 217 | Self::Strand: Into + Copy, 218 | T: Neg + Copy, 219 | { 220 | if self.refid != *pos.refid() { 221 | None 222 | } else { 223 | let offset = pos.pos() - self.start; 224 | if offset < 0 || offset >= self.length as isize { 225 | None 226 | } else { 227 | Some(match self.strand().into() { 228 | ReqStrand::Forward => Pos::new((), offset, pos.strand()), 229 | ReqStrand::Reverse => { 230 | Pos::new((), self.length as isize - (offset + 1), -pos.strand()) 231 | } 232 | }) 233 | } 234 | } 235 | } 236 | 237 | fn pos_outof(&self, pos: &Pos) -> Option> 238 | where 239 | Self::RefID: Clone, 240 | Self::Strand: Into + Copy, 241 | T: Neg + Copy, 242 | { 243 | let offset = match self.strand().into() { 244 | ReqStrand::Forward => pos.pos(), 245 | ReqStrand::Reverse => self.length as isize - (pos.pos() + 1), 246 | }; 247 | 248 | if 
offset >= 0 && offset < self.length as isize { 249 | Some(Pos::new( 250 | self.refid.clone(), 251 | self.start + offset, 252 | self.strand().into().on_strand(pos.strand()), 253 | )) 254 | } else { 255 | None 256 | } 257 | } 258 | 259 | fn contig_intersection(&self, contig: &Contig) -> Option 260 | where 261 | Self::RefID: PartialEq + Clone, 262 | Self::Strand: Copy, 263 | { 264 | if self.refid() != contig.refid() { 265 | return None; 266 | } 267 | 268 | let start = max(self.start, contig.start); 269 | let end = min( 270 | self.start + self.length as isize, 271 | contig.start + contig.length as isize, 272 | ); 273 | 274 | if start <= end { 275 | Some(Self::new( 276 | self.refid.clone(), 277 | start, 278 | (end - start) as usize, 279 | self.strand, 280 | )) 281 | } else { 282 | None 283 | } 284 | } 285 | } 286 | 287 | impl Display for Contig 288 | where 289 | R: Display, 290 | S: Display + Clone + Into, 291 | { 292 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 293 | write!( 294 | f, 295 | "{}:{}-{}", 296 | self.refid, 297 | self.start, 298 | self.start + self.length as isize 299 | )?; 300 | let strand: Strand = self.strand.clone().into(); 301 | if !strand.is_unknown() { 302 | write!(f, "({})", strand)?; 303 | } 304 | Ok(()) 305 | } 306 | } 307 | 308 | impl FromStr for Contig 309 | where 310 | R: From, 311 | S: FromStr, 312 | { 313 | type Err = ParseAnnotError; 314 | 315 | fn from_str(s: &str) -> Result { 316 | lazy_static! 
{ 317 | static ref CONTIG_RE: Regex = Regex::new(r"^(.*):(\d+)-(\d+)(\([+-]\))?$").unwrap(); 318 | } 319 | 320 | let cap = CONTIG_RE.captures(s).ok_or(ParseAnnotError::BadAnnot)?; 321 | 322 | let start = cap[2].parse::().map_err(ParseAnnotError::ParseInt)?; 323 | let end = cap[3].parse::().map_err(ParseAnnotError::ParseInt)?; 324 | let strand = cap 325 | .get(4) 326 | .map_or("", |m| m.as_str()) 327 | .parse::() 328 | .map_err(ParseAnnotError::ParseStrand)?; 329 | 330 | if start <= end { 331 | Ok(Contig::new( 332 | R::from(cap[1].to_owned()), 333 | start, 334 | (end - start) as usize, 335 | strand, 336 | )) 337 | } else { 338 | Err(ParseAnnotError::EndBeforeStart) 339 | } 340 | } 341 | } 342 | 343 | impl From> for Contig { 344 | fn from(x: Contig) -> Self { 345 | Contig { 346 | refid: x.refid, 347 | start: x.start, 348 | length: x.length, 349 | strand: match x.strand { 350 | ReqStrand::Forward => Strand::Forward, 351 | ReqStrand::Reverse => Strand::Reverse, 352 | }, 353 | } 354 | } 355 | } 356 | 357 | impl From> for Contig { 358 | fn from(x: Contig) -> Self { 359 | Contig { 360 | refid: x.refid, 361 | start: x.start, 362 | length: x.length, 363 | strand: Strand::Unknown, 364 | } 365 | } 366 | } 367 | 368 | impl From> for Contig { 369 | fn from(x: Contig) -> Self { 370 | Contig { 371 | refid: x.refid, 372 | start: x.start, 373 | length: x.length, 374 | strand: NoStrand::Unknown, 375 | } 376 | } 377 | } 378 | 379 | impl From> for Contig { 380 | fn from(x: Contig) -> Self { 381 | Contig { 382 | refid: x.refid, 383 | start: x.start, 384 | length: x.length, 385 | strand: NoStrand::Unknown, 386 | } 387 | } 388 | } 389 | 390 | /// Default stranded sequence position on a reference sequence named 391 | /// by a `String`. 
392 | pub type SeqContigStranded = Contig; 393 | 394 | /// Default unstranded sequence position on a reference sequence named 395 | /// by a `String` 396 | pub type SeqContigUnstranded = Contig; 397 | 398 | #[cfg(test)] 399 | mod tests { 400 | use super::*; 401 | 402 | #[test] 403 | fn first_and_last() { 404 | let tma22 = "chrX:461829-462426(+)" 405 | .parse::() 406 | .unwrap(); 407 | let first = tma22.first_pos(); 408 | assert_eq!(first.to_string(), "chrX:461829(+)"); 409 | let last = tma22.last_pos(); 410 | assert_eq!(last.to_string(), "chrX:462425(+)"); 411 | 412 | let tma19 = "chrXI:334412-334916(-)" 413 | .parse::() 414 | .unwrap(); 415 | let first = tma19.first_pos(); 416 | assert_eq!(first.to_string(), "chrXI:334915(-)"); 417 | let last = tma19.last_pos(); 418 | assert_eq!(last.to_string(), "chrXI:334412(-)"); 419 | 420 | let tma22_first = Pos::new("chrX".to_string(), 461829, ReqStrand::Forward); 421 | let tma22 = Contig::with_first_length(&tma22_first, 462426 - 461829).unwrap(); 422 | assert_eq!(tma22.to_string(), "chrX:461829-462426(+)"); 423 | 424 | let tma19_first = Pos::new("chrXI".to_string(), 335015, ReqStrand::Reverse); 425 | let tma19 = Contig::with_first_length(&tma19_first, 335016 - 334412).unwrap(); 426 | assert_eq!(tma19.to_string(), "chrXI:334412-335016(-)"); 427 | } 428 | 429 | #[test] 430 | fn into_outof() { 431 | let tma22 = "chrX:461829-462426(+)" 432 | .parse::() 433 | .unwrap(); 434 | let p0 = "chrX:461829(+)".parse::>().unwrap(); 435 | let p0_into = tma22.pos_into(&p0); 436 | assert!(Some(Pos::new((), 0, ReqStrand::Forward)).same(&p0_into)); 437 | let p0_outof = tma22.pos_outof(&p0_into.unwrap()); 438 | assert!(Some(p0).same(&p0_outof)); 439 | 440 | let p0 = "chrX:461839(-)".parse::>().unwrap(); 441 | let p0_into = tma22.pos_into(&p0); 442 | assert!(Some(Pos::new((), 10, ReqStrand::Reverse)).same(&p0_into)); 443 | let p0_outof = tma22.pos_outof(&p0_into.unwrap()); 444 | assert!(Some(p0).same(&p0_outof)); 445 | 446 | let p0 = 
"chrX:462425(+)".parse::>().unwrap(); 447 | let p0_into = tma22.pos_into(&p0); 448 | assert!(Some(Pos::new((), 596, ReqStrand::Forward)).same(&p0_into)); 449 | let p0_outof = tma22.pos_outof(&p0_into.unwrap()); 450 | assert!(Some(p0).same(&p0_outof)); 451 | 452 | let p0 = "chrX:461828(+)".parse::>().unwrap(); 453 | let p0_into = tma22.pos_into(&p0); 454 | assert!(None.same(&p0_into)); 455 | 456 | let p0 = "chrV:461829(+)".parse::>().unwrap(); 457 | let p0_into = tma22.pos_into(&p0); 458 | assert!(None.same(&p0_into)); 459 | 460 | let p0 = "chrV:462426(+)".parse::>().unwrap(); 461 | let p0_into = tma22.pos_into(&p0); 462 | assert!(None.same(&p0_into)); 463 | } 464 | 465 | fn test_contig_ixn(ca_str: &str, cb_str: &str, cab_str: Option) -> () { 466 | let ca = ca_str.parse::().unwrap(); 467 | let cb = cb_str.parse::().unwrap(); 468 | match ca.contig_intersection(&cb) { 469 | None => assert_eq!(None, cab_str), 470 | Some(cab) => assert_eq!(Some(cab.to_string()), cab_str), 471 | }; 472 | } 473 | 474 | #[test] 475 | fn test_display_fmt() { 476 | let tma19 = Contig::new( 477 | "chrXI".to_owned(), 478 | 334412, 479 | 334916 - 334412, 480 | ReqStrand::Reverse, 481 | ); 482 | assert_eq!(format!("{}", tma19), "chrXI:334412-334916(-)"); 483 | } 484 | 485 | #[test] 486 | fn intersection() { 487 | test_contig_ixn( 488 | "chrX:461829-462426(+)", 489 | "chrX:461800-461900(+)", 490 | Some("chrX:461829-461900(+)".to_owned()), 491 | ); 492 | test_contig_ixn( 493 | "chrX:461829-462426(-)", 494 | "chrX:461800-461900(+)", 495 | Some("chrX:461829-461900(-)".to_owned()), 496 | ); 497 | test_contig_ixn( 498 | "chrX:461829-462426(+)", 499 | "chrX:461800-461900(-)", 500 | Some("chrX:461829-461900(+)".to_owned()), 501 | ); 502 | 503 | test_contig_ixn( 504 | "chrX:461829-462426(+)", 505 | "chrX:462000-463000(+)", 506 | Some("chrX:462000-462426(+)".to_owned()), 507 | ); 508 | test_contig_ixn( 509 | "chrX:461829-462426(+)", 510 | "chrX:461000-463000(+)", 511 | 
Some("chrX:461829-462426(+)".to_owned()), 512 | ); 513 | test_contig_ixn( 514 | "chrX:461829-462426(+)", 515 | "chrX:462000-462100(+)", 516 | Some("chrX:462000-462100(+)".to_owned()), 517 | ); 518 | 519 | test_contig_ixn("chrX:461829-462426(+)", "chrX:461000-461500(+)", None); 520 | test_contig_ixn("chrX:461829-462426(+)", "chrX:463000-463500(+)", None); 521 | test_contig_ixn("chrX:461829-462426(+)", "chrV:461000-463000(+)", None); 522 | } 523 | } 524 | -------------------------------------------------------------------------------- /src/annot/loc.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Nicholas Ingolia 2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | //! Trait shared across sequence locations -- spliced, contiguous, or 7 | //! single-position. 8 | 9 | use std::ops::Neg; 10 | 11 | use crate::annot::contig::Contig; 12 | use crate::annot::pos::Pos; 13 | 14 | use crate::strand::*; 15 | 16 | /// A trait for a sequence location -- a defined region on a named 17 | /// chromosome (or other reference sequence), which may also have 18 | /// defined strand information. The trait is generic over the type of 19 | /// identifier for the reference sequence (allowing owned strings, 20 | /// sequence IDs, and other options) and the strand information 21 | /// (allowing type-level distinction between stranded and unstranded 22 | /// annotations). 23 | pub trait Loc { 24 | type RefID; 25 | type Strand; 26 | 27 | /// Name of the reference sequence (chromosome name, etc.) 28 | fn refid(&self) -> &Self::RefID; 29 | /// Starting (lowest, left-most, 5'-most) position on the 30 | /// reference sequence (0-based). 
31 | fn start(&self) -> isize; 32 | /// Length of the region 33 | fn length(&self) -> usize; 34 | /// `Strand` of the position 35 | fn strand(&self) -> Self::Strand 36 | where 37 | Self::Strand: Copy; 38 | 39 | /// Map a sequence position on a reference sequence _into_ a 40 | /// relative position within an annotated location on the 41 | /// reference sequence. 42 | /// 43 | /// The first position of the annotated location is mapped to a 44 | /// position at 0, the next position of the annotated location is 45 | /// mapped to a position at 1, and so forth. The annotated 46 | /// location must have a known strandedness, which is taken into 47 | /// account. For reverse-strand annotations, the 3'-most position 48 | /// on the reference sequence is mapped to 0, and the strandedness 49 | /// of the position is reversed. When the sequence position lies 50 | /// on a different named reference sequence than the annotated 51 | /// location, or doesn't fall within the annotated location, then 52 | /// `None` is returned. 53 | /// 54 | /// This function serves as an inverse of @pos_outof. 55 | fn pos_into(&self, pos: &Pos) -> Option> 56 | where 57 | Self::RefID: Eq, 58 | Self::Strand: Into + Copy, 59 | T: Neg + Copy; 60 | 61 | /// Map a relative position within an annotated location _out of_ 62 | /// that location onto the enclosing reference sequence. 63 | /// 64 | /// Position 0 within the annotated location is mapped to the 65 | /// first position of the annotated location, position 1 is mapped 66 | /// to the subsequent position, and so forth. The annotated 67 | /// location must have a known strandedness, which is taken into 68 | /// account. For reverse-strand annotations, position 0 is mapped 69 | /// to the 3'-most position of the reference sequence. When the 70 | /// sequence position is either negative, or greater than the 71 | /// length of the annotated location, then `None` is returned. 
The 72 | /// reference name for the sequence position is discarded; the 73 | /// mapped position receives a clone of the annotation's reference 74 | /// sequence name. 75 | /// 76 | /// This function serves as an inverse of @pos_into. 77 | fn pos_outof(&self, pos: &Pos) -> Option> 78 | where 79 | Self::RefID: Clone, 80 | Self::Strand: Into + Copy, 81 | T: Neg + Copy; 82 | 83 | fn contig_intersection(&self, other: &Contig) -> Option 84 | where 85 | Self: ::std::marker::Sized, 86 | Self::RefID: PartialEq + Clone, 87 | Self::Strand: Copy; 88 | 89 | /// Contiguous sequence location that fully covers the location. 90 | fn contig(&self) -> Contig 91 | where 92 | Self::RefID: Clone, 93 | Self::Strand: Copy, 94 | { 95 | Contig::new( 96 | self.refid().clone(), 97 | self.start(), 98 | self.length(), 99 | self.strand(), 100 | ) 101 | } 102 | 103 | /// The first `Pos` in a location, on the annotated strand. 104 | /// 105 | /// The first position in a zero-length annotation will be the 106 | /// starting position. This is the same as the first position in a 107 | /// length-1 annotation, on either strand. 108 | fn first_pos(&self) -> Pos 109 | where 110 | Self::RefID: Clone, 111 | Self::Strand: Into + Copy, 112 | { 113 | match self.strand().into() { 114 | ReqStrand::Forward => Pos::new(self.refid().clone(), self.start(), self.strand()), 115 | ReqStrand::Reverse => { 116 | if self.length() == 0 { 117 | Pos::new(self.refid().clone(), self.start(), self.strand()) 118 | } else { 119 | Pos::new( 120 | self.refid().clone(), 121 | self.start() + (self.length() as isize) - 1, 122 | self.strand(), 123 | ) 124 | } 125 | } 126 | } 127 | } 128 | 129 | /// The last `Pos` in a location, on the annotated strand. 130 | /// 131 | /// The last position in a zero-length annotation will be the 132 | /// starting position. This is the same as the last position in a 133 | /// length-1 annotation, on either strand.
134 | fn last_pos(&self) -> Pos 135 | where 136 | Self::RefID: Clone, 137 | Self::Strand: Into + Copy, 138 | { 139 | match self.strand().into() { 140 | ReqStrand::Forward => { 141 | if self.length() == 0 { 142 | Pos::new(self.refid().clone(), self.start(), self.strand()) 143 | } else { 144 | Pos::new( 145 | self.refid().clone(), 146 | self.start() + (self.length() as isize) - 1, 147 | self.strand(), 148 | ) 149 | } 150 | } 151 | ReqStrand::Reverse => Pos::new(self.refid().clone(), self.start(), self.strand()), 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /src/annot/mod.rs: -------------------------------------------------------------------------------- 1 | //! Data types for positions and regions on named sequences 2 | //! (e.g. chromosomes), useful for annotating features in a genome. 3 | //! For example, these data types let you represent that _TMA22_ is on 4 | //! chromosome X, positions 461,829-462,426, on the forward strand. They 5 | //! also allow coordinate math on these annotations, e.g., that 6 | //! position chrX:461,839 is +10 within _TMA22_ and vice versa. 7 | //! 8 | //! This module provides three concrete data types to represent a 9 | //! single position ([`Pos`](pos/Pos.t.html)), a contiguous region 10 | //! ([`Contig`](contig/Contig.t.html)), or a "spliced" region 11 | //! ([`Spliced`](spliced/Spliced.t.html)) consisting of one or more 12 | //! exons separated by introns. All three data types implement a 13 | //! location trait [`Loc`](loc/Loc.t.html). 14 | //! 15 | //! These data types are generic over the data type used to "name" the 16 | //! annotated reference sequence (e.g., the chromosome name). It's 17 | //! possible to use an owned `String`, an interned `Rc`, or an 18 | //! integer sequence identifier like the "target id" field in a BAM 19 | //! file. 20 | //! 21 | //! These data types are also generic over the kind of strand 22 | //! information in the annotation. 
This allows annotations with 23 | //! _required_ strand annotation 24 | //! ([`ReqStrand`](../strand/enum.ReqStrand.html)), _optional_ strand 25 | //! annotation ([`Strand`](../strand/enum.Strand.html)), or _no_ 26 | //! strand annotation ([`NoStrand`](../strand/enum.NoStrand.html)). 27 | //! 28 | //! The example below shows how to create the _TMA22_ annotation and 29 | //! find where chrX:461,839 falls within this gene. 30 | //! ``` 31 | //! # use std::str::FromStr; 32 | //! # use bio_types::annot::ParseAnnotError; 33 | //! # fn try_main() -> Result<(), ParseAnnotError> { 34 | //! use bio_types::annot::contig::Contig; 35 | //! use bio_types::annot::loc::Loc; 36 | //! use bio_types::annot::pos::Pos; 37 | //! use bio_types::strand::{ReqStrand,NoStrand}; 38 | //! let tma22: Contig = Contig::from_str("chrX:461829-462426(+)")?; 39 | //! let p0: Pos = Pos::from_str("chrX:461839")?; 40 | //! let p0_into = tma22.pos_into(&p0).unwrap_or_else(|| panic!("p0 not within TMA22")); 41 | //! assert!(p0_into.pos() == 10); 42 | //! # Ok(()) 43 | //! # } 44 | //! # fn main() { try_main().unwrap(); } 45 | //! ``` 46 | 47 | use crate::strand; 48 | use thiserror::Error; 49 | 50 | pub mod contig; 51 | pub mod loc; 52 | pub mod pos; 53 | pub mod refids; 54 | pub mod spliced; 55 | 56 | // Errors that arise in parsing annotations. 
57 | #[derive(Error, Debug)] 58 | pub enum ParseAnnotError { 59 | #[error("Annotation string does not match regex")] 60 | BadAnnot, 61 | #[error("Integer parsing error")] 62 | ParseInt(#[from] ::std::num::ParseIntError), 63 | #[error("Strand parsing error")] 64 | ParseStrand(#[from] strand::StrandError), 65 | #[error("Bad splicing structure")] 66 | Splicing(#[from] spliced::SplicingError), 67 | #[error("Ending position < starting position")] 68 | EndBeforeStart, 69 | } 70 | 71 | // Errors that arise in manipulating annotations 72 | #[derive(Error, Debug)] 73 | pub enum AnnotError { 74 | #[error("No strand information")] 75 | NoStrand, 76 | #[error("Invalid splicing structure")] 77 | BadSplicing, 78 | } 79 | -------------------------------------------------------------------------------- /src/annot/pos.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Nicholas Ingolia 2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | //! Positions on a named sequence, e.g., 683,946 on chromosome IV. 7 | 8 | use std::convert::Into; 9 | use std::fmt::{self, Display, Formatter}; 10 | use std::ops::AddAssign; 11 | use std::ops::Neg; 12 | use std::ops::SubAssign; 13 | use std::str::FromStr; 14 | 15 | use regex::Regex; 16 | 17 | use crate::annot::contig::Contig; 18 | use crate::annot::loc::Loc; 19 | use crate::annot::*; 20 | use crate::strand::*; 21 | 22 | /// Position on a particular, named sequence (e.g. a chromosome). 23 | /// 24 | /// Parameterized over the type of the reference sequence identifier 25 | /// and over the strandedness of the position. 26 | /// 27 | /// The display format for a `Pos` is _chr:pos(+/-)_. A stranded 28 | /// position must have a _(+)_ or a _(-)_, while an unstranded 29 | /// position does not.
30 | /// 31 | /// ``` 32 | /// # use bio_types::annot::ParseAnnotError; 33 | /// # fn try_main() -> Result<(), Box> { 34 | /// use bio_types::annot::pos::Pos; 35 | /// use bio_types::strand::ReqStrand; 36 | /// let start = Pos::new("chrIV".to_owned(), 683946, ReqStrand::Reverse); 37 | /// let start_str = start.to_string(); 38 | /// assert_eq!(start_str, "chrIV:683946(-)"); 39 | /// let start_str_pos = start_str.parse()?; 40 | /// assert_eq!(start, start_str_pos); 41 | /// # Ok(()) 42 | /// # } 43 | /// # fn main() { try_main().unwrap(); } 44 | /// ``` 45 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] 46 | pub struct Pos { 47 | refid: R, 48 | pos: isize, 49 | strand: S, 50 | } 51 | 52 | impl Pos { 53 | /// Construct a new sequence position 54 | /// 55 | /// ``` 56 | /// use std::rc::Rc; 57 | /// use bio_types::annot::pos::Pos; 58 | /// use bio_types::strand::ReqStrand; 59 | /// let chr = Rc::new("chrIV".to_owned()); 60 | /// let start = Pos::new(chr, 683946, ReqStrand::Reverse); 61 | /// ``` 62 | pub fn new(refid: R, pos: isize, strand: S) -> Self { 63 | Pos { refid, pos, strand } 64 | } 65 | 66 | /// Position on the reference sequence (0-based). 67 | pub fn pos(&self) -> isize { 68 | self.pos 69 | } 70 | 71 | /// Convert into a stranded sequence position on the specified strand 72 | pub fn into_stranded(self, strand: ReqStrand) -> Pos { 73 | Pos { 74 | refid: self.refid, 75 | pos: self.pos, 76 | strand, 77 | } 78 | } 79 | } 80 | 81 | impl AddAssign for Pos 82 | where 83 | isize: AddAssign, 84 | isize: SubAssign, 85 | { 86 | /// Slide the reference position by an offset on the strand of the 87 | /// annotation. 88 | /// 89 | /// # Arguments 90 | /// 91 | /// * `dist` specifies the offset for sliding the position. A 92 | /// positive `dist` will numerically increase the position for 93 | /// forward-strand features and decrease it for reverse-strand 94 | /// features. 
95 | /// 96 | /// ``` 97 | /// use bio_types::annot::pos::Pos; 98 | /// use bio_types::strand::ReqStrand; 99 | /// let mut start = Pos::new("chrIV".to_owned(), 683946, ReqStrand::Reverse); 100 | /// assert_eq!(start.to_string(), "chrIV:683946(-)"); 101 | /// start += 100; 102 | /// assert_eq!(start.to_string(), "chrIV:683846(-)"); 103 | /// ``` 104 | fn add_assign(&mut self, dist: T) { 105 | match self.strand { 106 | ReqStrand::Forward => self.pos += dist, 107 | ReqStrand::Reverse => self.pos -= dist, 108 | } 109 | } 110 | } 111 | 112 | impl SubAssign for Pos 113 | where 114 | isize: AddAssign, 115 | isize: SubAssign, 116 | { 117 | /// Slide the reference position by an offset on the strand of the 118 | /// annotation. 119 | /// 120 | /// # Arguments 121 | /// 122 | /// * `dist` specifies the offset for sliding the position. A 123 | /// positive `dist` will numerically decrease the position for 124 | /// forward-strand features and increase it for reverse-strand 125 | /// features. 126 | /// 127 | /// ``` 128 | /// use bio_types::annot::pos::Pos; 129 | /// use bio_types::strand::ReqStrand; 130 | /// let mut start = Pos::new("chrIV".to_owned(), 683946, ReqStrand::Reverse); 131 | /// assert_eq!(start.to_string(), "chrIV:683946(-)"); 132 | /// start -= 100; 133 | /// assert_eq!(start.to_string(), "chrIV:684046(-)"); 134 | /// ``` 135 | fn sub_assign(&mut self, dist: T) { 136 | match self.strand { 137 | ReqStrand::Forward => self.pos -= dist, 138 | ReqStrand::Reverse => self.pos += dist, 139 | } 140 | } 141 | } 142 | 143 | impl Loc for Pos { 144 | type RefID = R; 145 | type Strand = S; 146 | fn refid(&self) -> &R { 147 | &self.refid 148 | } 149 | fn start(&self) -> isize { 150 | self.pos 151 | } 152 | fn length(&self) -> usize { 153 | 1 154 | } 155 | fn strand(&self) -> S 156 | where 157 | S: Copy, 158 | { 159 | self.strand 160 | } 161 | 162 | fn pos_into(&self, pos: &Pos) -> Option> 163 | where 164 | Self::RefID: Eq, 165 | Self::Strand: Into + Copy, 166 | T: Neg + 
Copy, 167 | { 168 | if (self.refid != pos.refid) || (self.pos != pos.pos) { 169 | None 170 | } else { 171 | Some(Pos::new( 172 | (), 173 | 0, 174 | self.strand().into().on_strand(pos.strand()), 175 | )) 176 | } 177 | } 178 | 179 | fn pos_outof(&self, pos: &Pos) -> Option> 180 | where 181 | Self::RefID: Clone, 182 | Self::Strand: Into + Copy, 183 | T: Neg + Copy, 184 | { 185 | if pos.pos == 0 { 186 | Some(Pos::new( 187 | self.refid.clone(), 188 | self.pos, 189 | self.strand().into().on_strand(pos.strand()), 190 | )) 191 | } else { 192 | None 193 | } 194 | } 195 | 196 | fn contig_intersection(&self, contig: &Contig) -> Option 197 | where 198 | Self::RefID: PartialEq + Clone, 199 | Self::Strand: Copy, 200 | { 201 | if self.refid() != contig.refid() { 202 | return None; 203 | } 204 | 205 | if (self.pos >= contig.start()) && (self.pos < (contig.start() + contig.length() as isize)) 206 | { 207 | Some(self.clone()) 208 | } else { 209 | None 210 | } 211 | } 212 | } 213 | 214 | impl Same for Pos 215 | where 216 | R: Eq, 217 | S: Same, 218 | { 219 | /// Indicate when two positions are the "same" -- when positions 220 | /// have unknown/unspecified strands they can be the "same" but 221 | /// not equal. 222 | fn same(&self, p: &Self) -> bool { 223 | self.pos == p.pos && self.refid == p.refid && self.strand.same(&p.strand) 224 | } 225 | } 226 | 227 | impl Display for Pos 228 | where 229 | R: Display, 230 | S: Display + Clone + Into, 231 | { 232 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 233 | let strand: Strand = self.strand.clone().into(); 234 | if strand.is_unknown() { 235 | write!(f, "{}:{}", self.refid, self.pos) 236 | } else { 237 | write!(f, "{}:{}({})", self.refid, self.pos, strand) 238 | } 239 | } 240 | } 241 | 242 | impl FromStr for Pos 243 | where 244 | R: From, 245 | S: FromStr, 246 | { 247 | type Err = ParseAnnotError; 248 | 249 | fn from_str(s: &str) -> Result { 250 | lazy_static! 
{ 251 | static ref POS_RE: Regex = Regex::new(r"^(.*):(\d+)(\([+-]\))?$").unwrap(); 252 | } 253 | 254 | let cap = POS_RE.captures(s).ok_or(ParseAnnotError::BadAnnot)?; 255 | 256 | let strand = cap 257 | .get(3) 258 | .map_or("", |m| m.as_str()) 259 | .parse::() 260 | .map_err(ParseAnnotError::ParseStrand)?; 261 | 262 | Ok(Pos::new( 263 | R::from(cap[1].to_owned()), 264 | cap[2].parse::().map_err(ParseAnnotError::ParseInt)?, 265 | strand, 266 | )) 267 | } 268 | } 269 | 270 | impl From> for Pos { 271 | fn from(x: Pos) -> Self { 272 | Pos { 273 | refid: x.refid, 274 | pos: x.pos, 275 | strand: match x.strand { 276 | ReqStrand::Forward => Strand::Forward, 277 | ReqStrand::Reverse => Strand::Reverse, 278 | }, 279 | } 280 | } 281 | } 282 | 283 | impl From> for Pos { 284 | fn from(x: Pos) -> Self { 285 | Pos { 286 | refid: x.refid, 287 | pos: x.pos, 288 | strand: Strand::Unknown, 289 | } 290 | } 291 | } 292 | 293 | impl From> for Pos { 294 | fn from(x: Pos) -> Self { 295 | Pos { 296 | refid: x.refid, 297 | pos: x.pos, 298 | strand: NoStrand::Unknown, 299 | } 300 | } 301 | } 302 | 303 | impl From> for Pos { 304 | fn from(x: Pos) -> Self { 305 | Pos { 306 | refid: x.refid, 307 | pos: x.pos, 308 | strand: NoStrand::Unknown, 309 | } 310 | } 311 | } 312 | 313 | /// Default stranded sequence position on a reference sequence named 314 | /// by a `String`. 
315 | pub type SeqPosStranded = Pos; 316 | 317 | /// Default unstranded sequence position on a reference sequence named 318 | /// by a `String` 319 | pub type SeqPosUnstranded = Pos; 320 | 321 | #[cfg(test)] 322 | mod tests { 323 | use super::*; 324 | 325 | #[test] 326 | fn pos_accessors() { 327 | let start = Pos::new("chrIV".to_owned(), 683946, Strand::Unknown); 328 | assert_eq!(start.refid(), "chrIV"); 329 | assert_eq!(start.pos(), 683946); 330 | assert!(start.strand().same(&Strand::Unknown)); 331 | 332 | let start = Pos::new("chrIV".to_owned(), 683946, Strand::Reverse); 333 | assert_eq!(start.refid(), "chrIV"); 334 | assert_eq!(start.pos(), 683946); 335 | assert!(start.strand().same(&Strand::Reverse)); 336 | 337 | let start = Pos::new("chrXV".to_owned(), 493433, Strand::Forward); 338 | assert_eq!(start.refid(), "chrXV"); 339 | assert_eq!(start.pos(), 493433); 340 | assert!(start.strand().same(&Strand::Forward)); 341 | } 342 | 343 | #[test] 344 | fn strand_conversion() { 345 | let start = "chrIV:683946(-)".parse::>().unwrap(); 346 | let start_un: Pos = start.into(); 347 | assert!(start_un.same(&"chrIV:683946".parse::>().unwrap())); 348 | let start_re = start_un.into_stranded(ReqStrand::Reverse); 349 | assert!(start_re.same(&"chrIV:683946(-)".parse::>().unwrap())); 350 | 351 | let start = "chrXV:493433(+)".parse::>().unwrap(); 352 | let start_un: Pos = start.into(); 353 | assert!(start_un.same(&"chrXV:493433".parse::>().unwrap())); 354 | let start_re = start_un.into_stranded(ReqStrand::Forward); 355 | assert!(start_re.same(&"chrXV:493433(+)".parse::>().unwrap())); 356 | } 357 | 358 | #[test] 359 | fn string_representation() { 360 | let start = Pos::new("chrIV".to_owned(), 683946, NoStrand::Unknown); 361 | assert_eq!(start.to_string(), "chrIV:683946"); 362 | assert!(start.same(&"chrIV:683946".parse::>().unwrap())); 363 | 364 | let start = Pos::new("chrIV".to_owned(), 683946, Strand::Unknown); 365 | assert_eq!(start.to_string(), "chrIV:683946"); 366 | 
assert!(start.same(&"chrIV:683946".parse::>().unwrap())); 367 | 368 | let start = Pos::new("chrIV".to_owned(), 683946, Strand::Reverse); 369 | assert_eq!(start.to_string(), "chrIV:683946(-)"); 370 | assert!(start.same(&"chrIV:683946(-)".parse::>().unwrap())); 371 | 372 | let start = Pos::new("chrXV".to_owned(), 493433, Strand::Forward); 373 | assert_eq!(start.to_string(), "chrXV:493433(+)"); 374 | assert!(start.same(&"chrXV:493433(+)".parse::>().unwrap())); 375 | 376 | let start = Pos::new("chrIV".to_owned(), 683946, ReqStrand::Reverse); 377 | assert_eq!(start.to_string(), "chrIV:683946(-)"); 378 | assert!(start.same(&"chrIV:683946(-)".parse::>().unwrap())); 379 | 380 | let start = Pos::new("chrXV".to_owned(), 493433, ReqStrand::Forward); 381 | assert_eq!(start.to_string(), "chrXV:493433(+)"); 382 | assert!(start.same(&"chrXV:493433(+)".parse::>().unwrap())); 383 | } 384 | 385 | #[test] 386 | fn loc_impl() { 387 | let start = Pos::new("chrIV".to_owned(), 683946, ReqStrand::Forward); 388 | 389 | assert_eq!( 390 | None, 391 | start.contig_intersection(&Contig::new( 392 | "chrIV".to_owned(), 393 | 683900, 394 | 40, 395 | ReqStrand::Forward 396 | )) 397 | ); 398 | assert_eq!( 399 | None, 400 | start.contig_intersection(&Contig::new( 401 | "chrV".to_owned(), 402 | 683900, 403 | 100, 404 | ReqStrand::Forward 405 | )) 406 | ); 407 | assert_eq!( 408 | None, 409 | start.contig_intersection(&Contig::new( 410 | "chrIV".to_owned(), 411 | 683950, 412 | 40, 413 | ReqStrand::Forward 414 | )) 415 | ); 416 | 417 | assert_eq!( 418 | Some(start.clone()), 419 | start.contig_intersection(&Contig::new( 420 | "chrIV".to_owned(), 421 | 683900, 422 | 100, 423 | ReqStrand::Forward 424 | )) 425 | ); 426 | assert_eq!( 427 | Some(start.clone()), 428 | start.contig_intersection(&Contig::new( 429 | "chrIV".to_owned(), 430 | 683900, 431 | 100, 432 | ReqStrand::Reverse 433 | )) 434 | ); 435 | 436 | let rstart = Pos::new("chrIV".to_owned(), 683946, ReqStrand::Reverse); 437 | assert_eq!( 438 | 
Some(rstart.clone()), 439 | rstart.contig_intersection(&Contig::new( 440 | "chrIV".to_owned(), 441 | 683900, 442 | 100, 443 | ReqStrand::Forward 444 | )) 445 | ); 446 | assert_eq!( 447 | Some(rstart.clone()), 448 | rstart.contig_intersection(&Contig::new( 449 | "chrIV".to_owned(), 450 | 683900, 451 | 100, 452 | ReqStrand::Reverse 453 | )) 454 | ); 455 | } 456 | } 457 | // chrXV:493433..494470 458 | -------------------------------------------------------------------------------- /src/annot/refids.rs: -------------------------------------------------------------------------------- 1 | //! Intern reference sequence (e.g., chromosome) names 2 | use std::collections::HashMap; 3 | use std::ops::Deref; 4 | 5 | /// Data structure for interning sequence names efficiently. 6 | /// 7 | /// The structure is parameterized over the reference type `R` used to 8 | /// intern strings. Typically, this would be `Rc` for single-threaded 9 | /// access or `Arc` for multi-threaded access. These reference types 10 | /// provide fast, reference-counted cloning with no new allocation, 11 | /// which can make sequence location calculations faster as well as 12 | /// reducing the memory footprint required. 
13 | /// 14 | /// ``` 15 | /// use std::rc::Rc; 16 | /// use bio_types::strand::ReqStrand; 17 | /// use bio_types::annot::contig::Contig; 18 | /// use bio_types::annot::loc::Loc; 19 | /// use bio_types::annot::refids::RefIDSet; 20 | /// let mut refids: RefIDSet> = RefIDSet::new(); 21 | /// let pau8 = Contig::new(refids.intern("chrI"), 1807, 2170 - 1807, ReqStrand::Reverse); 22 | /// { 23 | /// let chr_i = refids.intern("chrI"); 24 | /// // One reference for the RefIDSet itself, one for the pau8 Contig, one for chr_i 25 | /// assert_eq!(Rc::strong_count(&chr_i), 3); 26 | /// } 27 | /// let seo1 = Contig::new(refids.intern("chrI"), 7235, 9017 - 7235, ReqStrand::Reverse); 28 | /// let tda8 = Contig::new(refids.intern("chrI"), 13363, 13744 - 13363, ReqStrand::Reverse); 29 | /// { 30 | /// let chr_i = refids.intern("chrI"); 31 | /// assert_eq!(Rc::strong_count(&chr_i), 5); 32 | /// } 33 | /// let seo1_beginning = seo1.first_pos(); 34 | /// let seo1_ending = seo1.last_pos(); 35 | /// { 36 | /// let chr_i = refids.intern("chrI"); 37 | /// assert_eq!(Rc::strong_count(&chr_i), 7); 38 | /// } 39 | /// ``` 40 | pub struct RefIDSet { 41 | refids: HashMap, 42 | } 43 | 44 | impl Default for RefIDSet { 45 | fn default() -> Self { 46 | Self::new() 47 | } 48 | } 49 | 50 | impl RefIDSet { 51 | /// Create a new, empty table of interned reference names 52 | pub fn new() -> Self { 53 | RefIDSet { 54 | refids: HashMap::new(), 55 | } 56 | } 57 | 58 | /// Intern a reference name. 59 | /// 60 | /// This returns a shared reference of type `R` for the name. This 61 | /// reference will be shared with any other intern calls for the 62 | /// same name. The name is given originally as a reference, and it 63 | /// will be cloned into an owned `String` only when the name is 64 | /// new for the data type. 
65 | pub fn intern(&mut self, id: &str) -> R 66 | where 67 | R: Deref + From + Clone, 68 | { 69 | if self.refids.contains_key(id) { 70 | if let Some(ref r) = self.refids.get(id) { 71 | (*r).clone() 72 | } else { 73 | panic!("RefIDSet::ensure failed to get() after contains()"); 74 | } 75 | } else { 76 | let r = R::from(id.to_owned()); 77 | self.refids.insert(id.to_owned(), r.clone()); 78 | r 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/annot/spliced.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Nicholas Ingolia 2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | //! Spliced region on a named sequence, e.g., the reverse strand of 7 | //! chromosome V, exon #1 at 166,885 through 166,875 and exon #2 at 8 | //! 166,771 through 166,237. 9 | 10 | use std::cmp::{max, min}; 11 | use std::convert::Into; 12 | use std::fmt::{self, Display, Formatter}; 13 | use std::ops::Neg; 14 | use std::str::FromStr; 15 | 16 | use regex::Regex; 17 | 18 | use crate::annot::contig::Contig; 19 | use crate::annot::loc::Loc; 20 | use crate::annot::pos::Pos; 21 | use crate::annot::*; 22 | use crate::strand::*; 23 | 24 | // The spliced location representation inherently cannot represent 25 | // "bad" splicing structures. Locations comprise a first exon length, 26 | // with a vector of intron-length / exon-length pairs, all type usize 27 | // and hence non-negative. 28 | // 29 | // The InEx data type used to represent intron-exon pairs after the 30 | // first exon enforces an additional strict positivity constraint 31 | // (i.e., >0) on the lengths. 
This further eliminates degeneracy in 32 | // representations: only one unique set of positive-length exons with 33 | // interleaved positive-length introns can represent a splicing 34 | // structure. 35 | mod inex { 36 | use std::slice::Iter; 37 | 38 | use super::SplicingError; 39 | 40 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] 41 | pub struct InEx { 42 | intron_length: usize, 43 | exon_length: usize, 44 | } 45 | 46 | impl InEx { 47 | pub fn new(intron_length: usize, exon_length: usize) -> Result { 48 | if intron_length < 1 { 49 | Err(SplicingError::IntronLength) 50 | } else if exon_length < 1 { 51 | Err(SplicingError::ExonLength) 52 | } else { 53 | Ok(InEx { 54 | intron_length, 55 | exon_length, 56 | }) 57 | } 58 | } 59 | pub fn intron_length(&self) -> usize { 60 | self.intron_length 61 | } 62 | pub fn exon_length(&self) -> usize { 63 | self.exon_length 64 | } 65 | pub fn length(&self) -> usize { 66 | self.intron_length + self.exon_length 67 | } 68 | } 69 | 70 | // Represent just the start (relative to the start of the location) and length of exons 71 | // Useful for internal coordinate math 72 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] 73 | pub struct Ex { 74 | start: usize, 75 | length: usize, 76 | } 77 | 78 | impl Ex { 79 | pub fn start(&self) -> usize { 80 | self.start 81 | } 82 | pub fn length(&self) -> usize { 83 | self.length 84 | } 85 | pub fn end(&self) -> usize { 86 | self.start + self.length 87 | } 88 | } 89 | 90 | // Iterator over the Ex exons from lowest to highest coordinate 91 | pub struct Exes<'a> { 92 | state: ExesState, 93 | curr_start: usize, 94 | rest: Iter<'a, InEx>, 95 | } 96 | 97 | enum ExesState { 98 | FirstExon(usize), 99 | LaterExon, 100 | } 101 | 102 | impl<'a> Exes<'a> { 103 | pub fn new(exon_0_length: usize, inexes: &'a [InEx]) -> Self { 104 | Exes { 105 | state: ExesState::FirstExon(exon_0_length), 106 | curr_start: 0, 107 | rest: inexes.iter(), 108 | } 109 | } 110 | } 111 | 112 | impl<'a> Iterator for Exes<'a> { 113 | type 
Item = Ex; 114 | 115 | fn next(&mut self) -> Option { 116 | match self.state { 117 | ExesState::FirstExon(len) => { 118 | let ex = Ex { 119 | start: self.curr_start, 120 | length: len, 121 | }; 122 | self.curr_start += len; 123 | self.state = ExesState::LaterExon; 124 | Some(ex) 125 | } 126 | ExesState::LaterExon => match self.rest.next() { 127 | Some(inex) => { 128 | let ex = Ex { 129 | start: self.curr_start + inex.intron_length(), 130 | length: inex.exon_length(), 131 | }; 132 | self.curr_start += inex.length(); 133 | Some(ex) 134 | } 135 | None => None, 136 | }, 137 | } 138 | } 139 | } 140 | 141 | // Represent just the start (relative to the start of the location) and length of introns 142 | // Useful for internal coordinate math 143 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] 144 | pub struct In { 145 | start: usize, 146 | length: usize, 147 | } 148 | 149 | #[allow(dead_code)] 150 | impl In { 151 | pub fn start(&self) -> usize { 152 | self.start 153 | } 154 | pub fn length(&self) -> usize { 155 | self.length 156 | } 157 | pub fn end(&self) -> usize { 158 | self.start + self.length 159 | } 160 | } 161 | 162 | // Iterator over the In introns from lowest to highest coordinate 163 | pub struct Ins<'a> { 164 | curr_start: usize, 165 | rest: Iter<'a, InEx>, 166 | } 167 | 168 | impl<'a> Ins<'a> { 169 | #[allow(dead_code)] 170 | pub fn new(exon_0_length: usize, inexes: &'a [InEx]) -> Self { 171 | Ins { 172 | curr_start: exon_0_length, 173 | rest: inexes.iter(), 174 | } 175 | } 176 | } 177 | 178 | impl<'a> Iterator for Ins<'a> { 179 | type Item = In; 180 | 181 | fn next(&mut self) -> Option { 182 | match self.rest.next() { 183 | Some(inex) => { 184 | let intr = In { 185 | start: self.curr_start, 186 | length: inex.intron_length(), 187 | }; 188 | self.curr_start += inex.length(); 189 | Some(intr) 190 | } 191 | None => None, 192 | } 193 | } 194 | } 195 | } 196 | 197 | /// Spliced sequence annotation on a particular, named sequence 198 | /// (e.g. a chromosome).
199 | /// 200 | /// Parameterized over the type of the reference sequence identifier 201 | /// and over the strandedness of the position. 202 | /// 203 | /// The display format for a `Spliced` is 204 | /// _chr:start_0-end_0;start_1-end_1;...;start_N-end_N(+/-/.)_. The 205 | /// boundaries for each individual exon are given as a half-open 206 | /// 0-based interval, like the Rust `Range` and BED format. 207 | /// 208 | /// ``` 209 | /// # use bio_types::strand::ReqStrand; 210 | /// # use bio_types::annot::AnnotError; 211 | /// # use bio_types::annot::spliced::{Spliced,SplicingError}; 212 | /// # fn try_main() -> Result<(), Box> { 213 | /// let tad3 = Spliced::with_lengths_starts("chrXII".to_owned(), 765265, 214 | /// &vec![808,52,109], &vec![0,864,984], 215 | /// ReqStrand::Reverse)?; 216 | /// assert_eq!(tad3.to_string(), "chrXII:765265-766073;766129-766181;766249-766358(-)"); 217 | /// let tad3_exons = tad3.exon_contigs(); 218 | /// assert_eq!(tad3_exons.len(), 3); 219 | /// assert_eq!(tad3_exons[0].to_string(), "chrXII:766249-766358(-)"); 220 | /// assert_eq!(tad3_exons[1].to_string(), "chrXII:766129-766181(-)"); 221 | /// assert_eq!(tad3_exons[2].to_string(), "chrXII:765265-766073(-)"); 222 | /// # Ok(()) 223 | /// # } 224 | /// # fn main() { try_main().unwrap(); } 225 | /// ``` 226 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] 227 | pub struct Spliced { 228 | refid: R, 229 | start: isize, 230 | exon_0_length: usize, 231 | inexes: Vec, 232 | strand: S, 233 | } 234 | 235 | impl Spliced { 236 | /// Construct a new, single-exon "spliced" location 237 | /// 238 | /// ``` 239 | /// use std::rc::Rc; 240 | /// use bio_types::annot::spliced::Spliced; 241 | /// use bio_types::strand::ReqStrand; 242 | /// let chr = Rc::new("chrX".to_owned()); 243 | /// let tma22 = Spliced::new(chr, 461829, 462426 - 461829, ReqStrand::Forward); 244 | /// ``` 245 | pub fn new(refid: R, start: isize, exon_0_length: usize, strand: S) -> Self { 246 | Spliced { 247 | refid, 248 | start, 249 | 
exon_0_length, 250 | inexes: Vec::new(), 251 | strand, 252 | } 253 | } 254 | 255 | /// Construct a multi-exon "spliced" location using BED-style exon 256 | /// starts and lengths. 257 | pub fn with_lengths_starts( 258 | refid: R, 259 | start: isize, 260 | exon_lengths: &[usize], 261 | exon_starts: &[usize], 262 | strand: S, 263 | ) -> Result { 264 | if exon_starts.is_empty() { 265 | return Err(SplicingError::NoExons); 266 | } else if exon_starts[0] != 0 { 267 | return Err(SplicingError::BlockStart); 268 | } else if exon_starts.len() != exon_lengths.len() { 269 | return Err(SplicingError::BlockMismatch); 270 | } 271 | 272 | let exon_0_length = exon_lengths[0]; 273 | let mut intron_start = exon_0_length; 274 | let mut inexes = Vec::new(); 275 | for exno in 1..exon_starts.len() { 276 | let exon_start = exon_starts[exno]; 277 | if intron_start >= exon_start { 278 | return Err(SplicingError::BlockOverlap); 279 | } 280 | let intron_length = exon_start - intron_start; 281 | let exon_length = exon_lengths[exno]; 282 | if exon_length == 0 { 283 | return Err(SplicingError::ExonLength); 284 | } 285 | inexes.push(inex::InEx::new(intron_length, exon_length)?); 286 | intron_start = exon_start + exon_length; 287 | } 288 | 289 | Ok(Spliced { 290 | refid, 291 | start, 292 | exon_0_length, 293 | inexes, 294 | strand, 295 | }) 296 | } 297 | 298 | /// Number of exons 299 | pub fn exon_count(&self) -> usize { 300 | self.inexes.len() + 1 301 | } 302 | 303 | // This is the unique representation for zero-length Spliced 304 | // locations, because InEx pairs have positive lengths for both 305 | // introns and exons. 306 | #[allow(dead_code)] 307 | fn is_zero_length(&self) -> bool { 308 | self.exon_0_length == 0 && self.inexes.is_empty() 309 | } 310 | 311 | fn exes(&self) -> inex::Exes { 312 | inex::Exes::new(self.exon_0_length, &self.inexes) 313 | } 314 | 315 | /// Vector of exon starting positions, relative to the start of 316 | /// the location overall. 
317 | /// 318 | /// These positions run from left to right on the reference 319 | /// sequence, regardless of the location's strand. 320 | pub fn exon_starts(&self) -> Vec { 321 | let mut starts = vec![0]; 322 | let mut intron_start = self.exon_0_length; 323 | for inex in self.inexes.iter() { 324 | starts.push(intron_start + inex.intron_length()); 325 | intron_start += inex.length(); 326 | } 327 | starts 328 | } 329 | 330 | /// Vector of exon lengths. 331 | /// 332 | /// Exon lengths are given from left to right on the reference 333 | /// sequence, regardless of the location's strand. 334 | pub fn exon_lengths(&self) -> Vec { 335 | let mut lengths = vec![self.exon_0_length]; 336 | for inex in self.inexes.iter() { 337 | lengths.push(inex.exon_length()); 338 | } 339 | lengths 340 | } 341 | 342 | /// Total length of exons only. 343 | /// 344 | /// The `length` method from the `Loc` trait returns the total 345 | /// length spanned by the annotation, including both introns and 346 | /// exons. 347 | pub fn exon_total_length(&self) -> usize { 348 | self.exes().map(|e| e.length()).sum() 349 | } 350 | 351 | /// Convert into a stranded sequence location on the specified strand 352 | pub fn into_stranded(self, strand: ReqStrand) -> Spliced { 353 | Spliced { 354 | refid: self.refid, 355 | start: self.start, 356 | exon_0_length: self.exon_0_length, 357 | inexes: self.inexes, 358 | strand, 359 | } 360 | } 361 | 362 | pub fn contig_cover(self) -> Contig { 363 | let length = self.length(); 364 | Contig::new(self.refid, self.start, length, self.strand) 365 | } 366 | 367 | // Get exons in reference sequence order. 
368 | fn exon_contigs_vec(&self) -> Vec> 369 | where 370 | R: Clone, 371 | S: Copy, 372 | { 373 | let mut exons = Vec::new(); 374 | 375 | for ex in self.exes() { 376 | exons.push(Contig::new( 377 | self.refid().clone(), 378 | self.start + ex.start() as isize, 379 | ex.length(), 380 | self.strand, 381 | )); 382 | } 383 | 384 | exons 385 | } 386 | } 387 | 388 | impl Spliced { 389 | pub fn exon_contigs(&self) -> Vec> 390 | where 391 | R: Clone, 392 | { 393 | let mut exons = self.exon_contigs_vec(); 394 | if self.strand == ReqStrand::Reverse { 395 | exons.reverse() 396 | } 397 | exons 398 | } 399 | } 400 | 401 | impl Loc for Spliced { 402 | type RefID = R; 403 | type Strand = S; 404 | fn refid(&self) -> &R { 405 | &self.refid 406 | } 407 | fn start(&self) -> isize { 408 | self.start 409 | } 410 | fn length(&self) -> usize { 411 | let mut len = self.exon_0_length; 412 | for inex in self.inexes.iter() { 413 | len += inex.length() 414 | } 415 | len 416 | } 417 | fn strand(&self) -> S 418 | where 419 | S: Copy, 420 | { 421 | self.strand 422 | } 423 | 424 | fn pos_into(&self, pos: &Pos) -> Option> 425 | where 426 | Self::RefID: Eq, 427 | Self::Strand: Into + Copy, 428 | T: Neg + Copy, 429 | { 430 | if (self.refid != *pos.refid()) || pos.pos() < self.start { 431 | return None; 432 | } 433 | 434 | let pos_offset = (pos.pos() - self.start) as usize; 435 | 436 | let mut offset_before = 0; 437 | for ex in self.exes() { 438 | if pos_offset >= ex.start() && pos_offset < ex.end() { 439 | let offset = (offset_before + pos_offset - ex.start()) as isize; 440 | let into = match self.strand().into() { 441 | ReqStrand::Forward => Pos::new((), offset, pos.strand()), 442 | ReqStrand::Reverse => Pos::new( 443 | (), 444 | self.exon_total_length() as isize - (offset + 1), 445 | -pos.strand(), 446 | ), 447 | }; 448 | return Some(into); 449 | } 450 | offset_before += ex.length(); 451 | } 452 | 453 | None 454 | } 455 | 456 | fn pos_outof(&self, pos: &Pos) -> Option> 457 | where 458 | 
Self::RefID: Clone, 459 | Self::Strand: Into + Copy, 460 | T: Neg + Copy, 461 | { 462 | let mut offset = match self.strand().into() { 463 | ReqStrand::Forward => pos.pos(), 464 | ReqStrand::Reverse => self.exon_total_length() as isize - (pos.pos() + 1), 465 | }; 466 | 467 | if offset < 0 { 468 | return None; 469 | } 470 | 471 | for ex in self.exes() { 472 | if offset < ex.length() as isize { 473 | return Some(Pos::new( 474 | self.refid.clone(), 475 | self.start + ex.start() as isize + offset, 476 | self.strand.into().on_strand(pos.strand()), 477 | )); 478 | } 479 | offset -= ex.length() as isize; 480 | } 481 | 482 | None 483 | } 484 | 485 | fn contig_intersection(&self, contig: &Contig) -> Option 486 | where 487 | Self::RefID: PartialEq + Clone, 488 | Self::Strand: Copy, 489 | { 490 | if self.refid() != contig.refid() { 491 | return None; 492 | } 493 | 494 | let contig_rel_start = if contig.start() < self.start { 495 | 0 496 | } else { 497 | (contig.start() - self.start) as usize 498 | }; 499 | let contig_end = contig.start() + contig.length() as isize; 500 | let contig_rel_end = if contig_end < self.start { 501 | 0 502 | } else { 503 | (contig_end - self.start) as usize 504 | }; 505 | 506 | let mut exon_lengths = Vec::new(); 507 | let mut exon_starts = Vec::new(); 508 | 509 | for ex in self.exes() { 510 | let start = max(contig_rel_start, ex.start()); 511 | let end = min(contig_rel_end, ex.end()); 512 | 513 | if start < end { 514 | exon_starts.push(start - contig_rel_start); 515 | exon_lengths.push(end - start); 516 | } 517 | } 518 | 519 | if !exon_starts.is_empty() { 520 | let first_start = exon_starts[0]; 521 | for start in exon_starts.iter_mut() { 522 | *start -= first_start; 523 | } 524 | let ixn = Self::with_lengths_starts( 525 | self.refid.clone(), 526 | max(self.start, contig.start()) + first_start as isize, 527 | &exon_lengths, 528 | &exon_starts, 529 | self.strand, 530 | ) 531 | .unwrap_or_else(|e| { 532 | panic!( 533 | "Creating intersection spliced: 
{:?} for {:?} {:?}", 534 | e, exon_lengths, exon_starts 535 | ) 536 | }); 537 | 538 | Some(ixn) 539 | } else { 540 | None 541 | } 542 | } 543 | } 544 | 545 | impl Display for Spliced 546 | where 547 | R: Display, 548 | S: Display + Clone + Into, 549 | { 550 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 551 | write!(f, "{}:", self.refid)?; 552 | 553 | let mut sep = false; 554 | for ex in self.exes() { 555 | let ex_start = self.start + ex.start() as isize; 556 | write!( 557 | f, 558 | "{}{}-{}", 559 | if sep { ";" } else { "" }, 560 | ex_start, 561 | ex_start + ex.length() as isize 562 | )?; 563 | sep = true; 564 | } 565 | let strand: Strand = self.strand.clone().into(); 566 | if !strand.is_unknown() { 567 | write!(f, "({})", self.strand)?; 568 | } 569 | Ok(()) 570 | } 571 | } 572 | 573 | impl FromStr for Spliced 574 | where 575 | R: From, 576 | S: FromStr, 577 | { 578 | type Err = ParseAnnotError; 579 | 580 | fn from_str(s: &str) -> Result { 581 | lazy_static! { 582 | static ref SPLICED_RE: Regex = 583 | Regex::new(r"^(.*):(\d+)-(\d+)((?:;\d+-\d+)*)(\([+-]\))?$").unwrap(); 584 | static ref EXON_RE: Regex = Regex::new(r";(\d+)-(\d+)").unwrap(); 585 | } 586 | 587 | let cap = SPLICED_RE.captures(s).ok_or(ParseAnnotError::BadAnnot)?; 588 | 589 | let mut starts = Vec::new(); 590 | let mut lengths = Vec::new(); 591 | 592 | let first_start = cap[2].parse::().map_err(ParseAnnotError::ParseInt)?; 593 | let first_end = cap[3].parse::().map_err(ParseAnnotError::ParseInt)?; 594 | let strand = cap[5].parse::().map_err(ParseAnnotError::ParseStrand)?; 595 | 596 | starts.push(0); 597 | lengths.push((first_end - first_start) as usize); 598 | 599 | let exon_caps = EXON_RE.captures_iter(&cap[4]); 600 | 601 | for exon_cap in exon_caps { 602 | let next_start = exon_cap[1] 603 | .parse::() 604 | .map_err(ParseAnnotError::ParseInt)?; 605 | let next_end = exon_cap[2] 606 | .parse::() 607 | .map_err(ParseAnnotError::ParseInt)?; 608 | starts.push((next_start - first_start) as usize); 
609 | lengths.push((next_end - next_start) as usize); 610 | } 611 | 612 | let spliced = Spliced::with_lengths_starts( 613 | R::from(cap[1].to_owned()), 614 | first_start, 615 | lengths.as_slice(), 616 | starts.as_slice(), 617 | strand, 618 | ) 619 | .map_err(ParseAnnotError::Splicing)?; 620 | Ok(spliced) 621 | } 622 | } 623 | 624 | impl From> for Spliced { 625 | fn from(x: Spliced) -> Self { 626 | Spliced { 627 | refid: x.refid, 628 | start: x.start, 629 | exon_0_length: x.exon_0_length, 630 | inexes: x.inexes, 631 | strand: match x.strand { 632 | ReqStrand::Forward => Strand::Forward, 633 | ReqStrand::Reverse => Strand::Reverse, 634 | }, 635 | } 636 | } 637 | } 638 | 639 | impl From> for Spliced { 640 | fn from(x: Spliced) -> Self { 641 | Spliced { 642 | refid: x.refid, 643 | start: x.start, 644 | exon_0_length: x.exon_0_length, 645 | inexes: x.inexes, 646 | strand: Strand::Unknown, 647 | } 648 | } 649 | } 650 | 651 | impl From> for Spliced { 652 | fn from(x: Spliced) -> Self { 653 | Spliced { 654 | refid: x.refid, 655 | start: x.start, 656 | exon_0_length: x.exon_0_length, 657 | inexes: x.inexes, 658 | strand: NoStrand::Unknown, 659 | } 660 | } 661 | } 662 | 663 | impl From> for Spliced { 664 | fn from(x: Spliced) -> Self { 665 | Spliced { 666 | refid: x.refid, 667 | start: x.start, 668 | exon_0_length: x.exon_0_length, 669 | inexes: x.inexes, 670 | strand: NoStrand::Unknown, 671 | } 672 | } 673 | } 674 | 675 | /// Default stranded sequence position on a reference sequence named 676 | /// by a `String`. 
677 | pub type SeqSplicedStranded = Spliced; 678 | 679 | /// Default unstranded sequence position on a reference sequence named 680 | /// by a `String` 681 | pub type SeqSplicedUnstranded = Spliced; 682 | 683 | #[derive(Error, Debug)] 684 | pub enum SplicingError { 685 | #[error("Invalid (non-positive) intron length")] 686 | IntronLength, 687 | #[error("Invalid (non-positive) exon length")] 688 | ExonLength, 689 | #[error("No exons")] 690 | NoExons, 691 | #[error("Exons do not start at position 0")] 692 | BlockStart, 693 | #[error("Number of exon starts != number of exon lengths")] 694 | BlockMismatch, 695 | #[error("Exon blocks overlap")] 696 | BlockOverlap, 697 | } 698 | 699 | #[cfg(test)] 700 | mod tests { 701 | use super::*; 702 | 703 | #[test] 704 | fn length_start_to_contig() { 705 | //chrV 166236 166885 YER007C-A 0 - 166236 166885 0 2 535,11, 0,638, 706 | let tma20 = Spliced::with_lengths_starts( 707 | "chrV".to_owned(), 708 | 166236, 709 | &vec![535, 11], 710 | &vec![0, 638], 711 | ReqStrand::Reverse, 712 | ) 713 | .unwrap(); 714 | assert_eq!(tma20.exon_starts(), vec![0, 638]); 715 | assert_eq!(tma20.exon_lengths(), vec![535, 11]); 716 | assert_eq!(tma20.to_string(), "chrV:166236-166771;166874-166885(-)"); 717 | assert_eq!( 718 | tma20, 719 | tma20 720 | .to_string() 721 | .parse::>() 722 | .unwrap() 723 | ); 724 | let tma20_exons = tma20.exon_contigs(); 725 | assert_eq!(tma20_exons.len(), 2); 726 | assert_eq!(tma20_exons[0].to_string(), "chrV:166874-166885(-)"); 727 | assert_eq!(tma20_exons[1].to_string(), "chrV:166236-166771(-)"); 728 | 729 | //chrXVI 173151 174702 YPL198W 0 + 173151 174702 0 3 11,94,630, 0,420,921, 730 | let rpl7b = Spliced::with_lengths_starts( 731 | "chrXVI".to_owned(), 732 | 173151, 733 | &vec![11, 94, 630], 734 | &vec![0, 420, 921], 735 | ReqStrand::Forward, 736 | ) 737 | .unwrap(); 738 | assert_eq!( 739 | rpl7b.to_string(), 740 | "chrXVI:173151-173162;173571-173665;174072-174702(+)" 741 | ); 742 | assert_eq!( 743 | rpl7b, 744 | 
rpl7b 745 | .to_string() 746 | .parse::>() 747 | .unwrap() 748 | ); 749 | let rpl7b_exons = rpl7b.exon_contigs(); 750 | assert_eq!(rpl7b_exons.len(), 3); 751 | assert_eq!(rpl7b_exons[0].to_string(), "chrXVI:173151-173162(+)"); 752 | assert_eq!(rpl7b_exons[1].to_string(), "chrXVI:173571-173665(+)"); 753 | assert_eq!(rpl7b_exons[2].to_string(), "chrXVI:174072-174702(+)"); 754 | 755 | //chrXII 765265 766358 YLR316C 0 - 765265 766358 0 3 808,52,109, 0,864,984, 756 | let tad3 = Spliced::with_lengths_starts( 757 | "chrXII".to_owned(), 758 | 765265, 759 | &vec![808, 52, 109], 760 | &vec![0, 864, 984], 761 | ReqStrand::Reverse, 762 | ) 763 | .unwrap(); 764 | assert_eq!( 765 | tad3.to_string(), 766 | "chrXII:765265-766073;766129-766181;766249-766358(-)" 767 | ); 768 | assert_eq!( 769 | tad3, 770 | tad3.to_string() 771 | .parse::>() 772 | .unwrap() 773 | ); 774 | let tad3_exons = tad3.exon_contigs(); 775 | assert_eq!(tad3_exons.len(), 3); 776 | assert_eq!(tad3_exons[0].to_string(), "chrXII:766249-766358(-)"); 777 | assert_eq!(tad3_exons[1].to_string(), "chrXII:766129-766181(-)"); 778 | assert_eq!(tad3_exons[2].to_string(), "chrXII:765265-766073(-)"); 779 | } 780 | 781 | fn test_into_outof( 782 | loc: &Spliced, 783 | outstr: &str, 784 | in_offset: isize, 785 | in_strand: ReqStrand, 786 | ) -> () { 787 | let p0 = outstr.parse::>().unwrap(); 788 | let p0_into_expected = Pos::new((), in_offset, in_strand); 789 | let p0_into_actual = loc.pos_into(&p0); 790 | let p0_back_out_actual = loc.pos_outof(&p0_into_expected); 791 | println!( 792 | "{}\t{}\t{:?}\t{:?}\t{:?}", 793 | outstr, p0, p0_into_expected, p0_into_actual, p0_back_out_actual 794 | ); 795 | assert!(Some(p0_into_expected).same(&p0_into_actual)); 796 | assert!(Some(p0).same(&p0_back_out_actual)); 797 | } 798 | 799 | fn test_no_into(loc: &Spliced, outstr: &str) -> () { 800 | let p0 = outstr.parse::>().unwrap(); 801 | assert!(None.same(&loc.pos_into(&p0))); 802 | } 803 | 804 | #[test] 805 | fn into_outof() { 806 | //chrXVI 
173151 174702 YPL198W 0 + 173151 174702 0 3 11,94,630, 0,420,921, 807 | let rpl7b = Spliced::with_lengths_starts( 808 | "chrXVI".to_owned(), 809 | 173151, 810 | &vec![11, 94, 630], 811 | &vec![0, 420, 921], 812 | ReqStrand::Forward, 813 | ) 814 | .unwrap(); 815 | let p0_into = Pos::new((), -1, ReqStrand::Forward); 816 | assert!(None.same(&rpl7b.pos_outof(&p0_into))); 817 | 818 | test_no_into(&rpl7b, "chrXVI:173150(+)"); 819 | test_into_outof(&rpl7b, "chrXVI:173151(+)", 0, ReqStrand::Forward); 820 | test_into_outof(&rpl7b, "chrXVI:173152(-)", 1, ReqStrand::Reverse); 821 | test_into_outof(&rpl7b, "chrXVI:173161(+)", 10, ReqStrand::Forward); 822 | test_no_into(&rpl7b, "chrXVI:173162(+)"); 823 | test_no_into(&rpl7b, "chrXVI:173570(+)"); 824 | test_into_outof(&rpl7b, "chrXVI:173571(+)", 11, ReqStrand::Forward); 825 | test_into_outof(&rpl7b, "chrXVI:173664(+)", 104, ReqStrand::Forward); 826 | test_no_into(&rpl7b, "chrXVI:173665(+)"); 827 | test_no_into(&rpl7b, "chrXVI:174071(+)"); 828 | test_into_outof(&rpl7b, "chrXVI:174072(+)", 105, ReqStrand::Forward); 829 | test_into_outof(&rpl7b, "chrXVI:174701(+)", 734, ReqStrand::Forward); 830 | test_no_into(&rpl7b, "chrXVI:174702(+)"); 831 | 832 | let p0_into = Pos::new((), 735, ReqStrand::Forward); 833 | assert!(None.same(&rpl7b.pos_outof(&p0_into))); 834 | 835 | //chrXII 765265 766358 YLR316C 0 - 765265 766358 0 3 808,52,109, 0,864,984, 836 | let tad3 = Spliced::with_lengths_starts( 837 | "chrXII".to_owned(), 838 | 765265, 839 | &vec![808, 52, 109], 840 | &vec![0, 864, 984], 841 | ReqStrand::Reverse, 842 | ) 843 | .unwrap(); 844 | 845 | let p0_into = Pos::new((), -1, ReqStrand::Forward); 846 | assert!(None.same(&tad3.pos_outof(&p0_into))); 847 | 848 | test_no_into(&tad3, "chrXII:765264(-)"); 849 | test_into_outof(&tad3, "chrXII:765265(-)", 968, ReqStrand::Forward); 850 | test_into_outof(&tad3, "chrXII:765266(+)", 967, ReqStrand::Reverse); 851 | test_into_outof(&tad3, "chrXII:766072(-)", 161, ReqStrand::Forward); 852 | 
test_no_into(&tad3, "chrXII:766073(-)"); 853 | 854 | test_no_into(&tad3, "chrXII:766128(-)"); 855 | test_into_outof(&tad3, "chrXII:766129(-)", 160, ReqStrand::Forward); 856 | test_into_outof(&tad3, "chrXII:766180(-)", 109, ReqStrand::Forward); 857 | test_no_into(&tad3, "chrXII:766181(-)"); 858 | 859 | test_no_into(&tad3, "chrXII:766248(-)"); 860 | test_into_outof(&tad3, "chrXII:766249(-)", 108, ReqStrand::Forward); 861 | test_into_outof(&tad3, "chrXII:766357(-)", 0, ReqStrand::Forward); 862 | test_no_into(&tad3, "chrXII:766358(-)"); 863 | 864 | let p0_into = Pos::new((), 969, ReqStrand::Forward); 865 | assert!(None.same(&tad3.pos_outof(&p0_into))); 866 | } 867 | 868 | fn test_contig_ixn( 869 | spl: &Spliced, 870 | cb_str: &str, 871 | cab_str: Option, 872 | ) -> () { 873 | let cb = cb_str.parse::>().unwrap(); 874 | match spl.contig_intersection(&cb) { 875 | None => assert_eq!(None, cab_str), 876 | Some(cab) => assert_eq!(Some(cab.to_string()), cab_str), 877 | }; 878 | } 879 | 880 | #[test] 881 | fn intersection() { 882 | //chrXVI 173151 174702 YPL198W 0 + 173151 174702 0 3 11,94,630, 0,420,921, 883 | let rpl7b = Spliced::with_lengths_starts( 884 | "chrXVI".to_owned(), 885 | 173151, 886 | &vec![11, 94, 630], 887 | &vec![0, 420, 921], 888 | ReqStrand::Forward, 889 | ) 890 | .unwrap(); 891 | 892 | test_contig_ixn( 893 | &rpl7b, 894 | "chrXVI:173000-175000(+)", 895 | Some("chrXVI:173151-173162;173571-173665;174072-174702(+)".to_owned()), 896 | ); 897 | 898 | test_contig_ixn( 899 | &rpl7b, 900 | "chrXVI:173150-175000(+)", 901 | Some("chrXVI:173151-173162;173571-173665;174072-174702(+)".to_owned()), 902 | ); 903 | test_contig_ixn( 904 | &rpl7b, 905 | "chrXVI:173151-175000(+)", 906 | Some("chrXVI:173151-173162;173571-173665;174072-174702(+)".to_owned()), 907 | ); 908 | test_contig_ixn( 909 | &rpl7b, 910 | "chrXVI:173152-175000(+)", 911 | Some("chrXVI:173152-173162;173571-173665;174072-174702(+)".to_owned()), 912 | ); 913 | test_contig_ixn( 914 | &rpl7b, 915 | 
"chrXVI:173155-175000(+)", 916 | Some("chrXVI:173155-173162;173571-173665;174072-174702(+)".to_owned()), 917 | ); 918 | test_contig_ixn( 919 | &rpl7b, 920 | "chrXVI:173161-175000(+)", 921 | Some("chrXVI:173161-173162;173571-173665;174072-174702(+)".to_owned()), 922 | ); 923 | test_contig_ixn( 924 | &rpl7b, 925 | "chrXVI:173162-175000(+)", 926 | Some("chrXVI:173571-173665;174072-174702(+)".to_owned()), 927 | ); 928 | test_contig_ixn( 929 | &rpl7b, 930 | "chrXVI:173500-175000(+)", 931 | Some("chrXVI:173571-173665;174072-174702(+)".to_owned()), 932 | ); 933 | test_contig_ixn( 934 | &rpl7b, 935 | "chrXVI:173570-175000(+)", 936 | Some("chrXVI:173571-173665;174072-174702(+)".to_owned()), 937 | ); 938 | test_contig_ixn( 939 | &rpl7b, 940 | "chrXVI:173571-175000(+)", 941 | Some("chrXVI:173571-173665;174072-174702(+)".to_owned()), 942 | ); 943 | test_contig_ixn( 944 | &rpl7b, 945 | "chrXVI:173572-175000(+)", 946 | Some("chrXVI:173572-173665;174072-174702(+)".to_owned()), 947 | ); 948 | test_contig_ixn( 949 | &rpl7b, 950 | "chrXVI:173600-175000(+)", 951 | Some("chrXVI:173600-173665;174072-174702(+)".to_owned()), 952 | ); 953 | test_contig_ixn( 954 | &rpl7b, 955 | "chrXVI:173664-175000(+)", 956 | Some("chrXVI:173664-173665;174072-174702(+)".to_owned()), 957 | ); 958 | test_contig_ixn( 959 | &rpl7b, 960 | "chrXVI:173665-175000(+)", 961 | Some("chrXVI:174072-174702(+)".to_owned()), 962 | ); 963 | test_contig_ixn( 964 | &rpl7b, 965 | "chrXVI:174100-175000(+)", 966 | Some("chrXVI:174100-174702(+)".to_owned()), 967 | ); 968 | test_contig_ixn(&rpl7b, "chrXVI:174800-175000(+)", None); 969 | 970 | test_contig_ixn( 971 | &rpl7b, 972 | "chrXVI:173150-174703(+)", 973 | Some("chrXVI:173151-173162;173571-173665;174072-174702(+)".to_owned()), 974 | ); 975 | test_contig_ixn( 976 | &rpl7b, 977 | "chrXVI:173150-174702(+)", 978 | Some("chrXVI:173151-173162;173571-173665;174072-174702(+)".to_owned()), 979 | ); 980 | test_contig_ixn( 981 | &rpl7b, 982 | "chrXVI:173150-174701(+)", 983 | 
Some("chrXVI:173151-173162;173571-173665;174072-174701(+)".to_owned()), 984 | ); 985 | test_contig_ixn( 986 | &rpl7b, 987 | "chrXVI:173000-174500(+)", 988 | Some("chrXVI:173151-173162;173571-173665;174072-174500(+)".to_owned()), 989 | ); 990 | test_contig_ixn( 991 | &rpl7b, 992 | "chrXVI:173000-174072(+)", 993 | Some("chrXVI:173151-173162;173571-173665(+)".to_owned()), 994 | ); 995 | test_contig_ixn( 996 | &rpl7b, 997 | "chrXVI:173000-173800(+)", 998 | Some("chrXVI:173151-173162;173571-173665(+)".to_owned()), 999 | ); 1000 | test_contig_ixn( 1001 | &rpl7b, 1002 | "chrXVI:173000-173666(+)", 1003 | Some("chrXVI:173151-173162;173571-173665(+)".to_owned()), 1004 | ); 1005 | test_contig_ixn( 1006 | &rpl7b, 1007 | "chrXVI:173000-173665(+)", 1008 | Some("chrXVI:173151-173162;173571-173665(+)".to_owned()), 1009 | ); 1010 | test_contig_ixn( 1011 | &rpl7b, 1012 | "chrXVI:173000-173664(+)", 1013 | Some("chrXVI:173151-173162;173571-173664(+)".to_owned()), 1014 | ); 1015 | test_contig_ixn( 1016 | &rpl7b, 1017 | "chrXVI:173000-173600(+)", 1018 | Some("chrXVI:173151-173162;173571-173600(+)".to_owned()), 1019 | ); 1020 | test_contig_ixn( 1021 | &rpl7b, 1022 | "chrXVI:173000-173571(+)", 1023 | Some("chrXVI:173151-173162(+)".to_owned()), 1024 | ); 1025 | test_contig_ixn( 1026 | &rpl7b, 1027 | "chrXVI:173000-173300(+)", 1028 | Some("chrXVI:173151-173162(+)".to_owned()), 1029 | ); 1030 | test_contig_ixn( 1031 | &rpl7b, 1032 | "chrXVI:173000-173155(+)", 1033 | Some("chrXVI:173151-173155(+)".to_owned()), 1034 | ); 1035 | test_contig_ixn(&rpl7b, "chrXVI:173000-173100(+)", None); 1036 | 1037 | test_contig_ixn( 1038 | &rpl7b, 1039 | "chrXVI:173155-174500(+)", 1040 | Some("chrXVI:173155-173162;173571-173665;174072-174500(+)".to_owned()), 1041 | ); 1042 | test_contig_ixn( 1043 | &rpl7b, 1044 | "chrXVI:173600-174500(+)", 1045 | Some("chrXVI:173600-173665;174072-174500(+)".to_owned()), 1046 | ); 1047 | test_contig_ixn( 1048 | &rpl7b, 1049 | "chrXVI:173155-173600(+)", 1050 | 
Some("chrXVI:173155-173162;173571-173600(+)".to_owned()), 1051 | ); 1052 | test_contig_ixn( 1053 | &rpl7b, 1054 | "chrXVI:173590-173610(+)", 1055 | Some("chrXVI:173590-173610(+)".to_owned()), 1056 | ); 1057 | 1058 | test_contig_ixn( 1059 | &rpl7b, 1060 | "chrXVI:173155-173160(+)", 1061 | Some("chrXVI:173155-173160(+)".to_owned()), 1062 | ); 1063 | test_contig_ixn( 1064 | &rpl7b, 1065 | "chrXVI:174400-174500(+)", 1066 | Some("chrXVI:174400-174500(+)".to_owned()), 1067 | ); 1068 | 1069 | test_contig_ixn(&rpl7b, "chrXVI:173200-173300(+)", None); 1070 | test_contig_ixn(&rpl7b, "chrXVI:173800-174000(+)", None); 1071 | } 1072 | } 1073 | -------------------------------------------------------------------------------- /src/genome.rs: -------------------------------------------------------------------------------- 1 | use std::cmp; 2 | use std::ops::Range; 3 | 4 | #[cfg(feature = "serde")] 5 | use serde::{Deserialize, Serialize}; 6 | 7 | pub type Position = u64; 8 | pub type Length = u64; 9 | 10 | pub trait AbstractInterval { 11 | /// Identifier for a genomic contig, e.g., a chromosome 12 | fn contig(&self) -> &str; 13 | /// Interval on the contig 14 | fn range(&self) -> Range; 15 | /// Return true if interval contains given locus. 
16 | fn contains(&self, locus: L) -> bool 17 | where 18 | L: AbstractLocus, 19 | { 20 | self.contig() == locus.contig() 21 | && locus.pos() >= self.range().start 22 | && locus.pos() < self.range().end 23 | } 24 | } 25 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 26 | #[derive(new, Debug, PartialEq, Eq, Clone, Hash)] 27 | pub struct Interval { 28 | contig: String, 29 | range: Range, 30 | } 31 | 32 | impl PartialOrd for Interval { 33 | fn partial_cmp(&self, other: &Self) -> Option { 34 | Some(self.contig.cmp(&other.contig).then_with(|| { 35 | self.range 36 | .start 37 | .cmp(&other.range.start) 38 | .then_with(|| self.range.end.cmp(&other.range.end)) 39 | })) 40 | } 41 | } 42 | 43 | impl Ord for Interval { 44 | fn cmp(&self, other: &Self) -> cmp::Ordering { 45 | self.partial_cmp(other).unwrap() 46 | } 47 | } 48 | 49 | impl Interval { 50 | /// Mutable reference to interval on the contig 51 | pub fn range_mut(&mut self) -> &mut Range { 52 | &mut self.range 53 | } 54 | } 55 | 56 | impl AbstractInterval for Interval { 57 | fn contig(&self) -> &str { 58 | &self.contig 59 | } 60 | 61 | fn range(&self) -> Range { 62 | self.range.clone() 63 | } 64 | } 65 | 66 | pub trait AbstractLocus { 67 | /// Identifier for a genomic contig, e.g., a chromosome 68 | fn contig(&self) -> &str; 69 | /// Position on the contig 70 | fn pos(&self) -> Position; 71 | } 72 | 73 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 74 | #[derive(new, Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] 75 | pub struct Locus { 76 | contig: String, 77 | pos: Position, 78 | } 79 | 80 | impl Locus { 81 | /// Mutable reference to position. 
82 | pub fn pos_mut(&mut self) -> &mut Position { 83 | &mut self.pos 84 | } 85 | } 86 | 87 | impl AbstractLocus for Locus { 88 | fn contig(&self) -> &str { 89 | &self.contig 90 | } 91 | 92 | fn pos(&self) -> Position { 93 | self.pos 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "serde")] 2 | extern crate serde; 3 | #[macro_use] 4 | extern crate lazy_static; 5 | #[macro_use] 6 | extern crate derive_new; 7 | 8 | extern crate regex; 9 | 10 | #[cfg(feature = "phylogeny")] 11 | extern crate petgraph; 12 | 13 | pub mod alignment; 14 | pub mod annot; 15 | pub mod genome; 16 | #[cfg(feature = "phylogeny")] 17 | pub mod phylogeny; 18 | pub mod sequence; 19 | pub mod strand; 20 | pub mod variant; 21 | -------------------------------------------------------------------------------- /src/phylogeny.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Franklin Delehelle 2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | //! A phylogenetic tree is represented as a directed graph. 7 | //! Each node is a taxon, identified as a string. 8 | //! The edges are weighted by the phylogenetic distance if it was defined, or f32::NAN otherwise. 9 | 10 | use petgraph::{graph::Graph, Directed}; 11 | 12 | pub type Taxon = String; 13 | pub type Proximity = f32; 14 | 15 | pub type TreeGraph = Graph; 16 | pub struct Tree { 17 | pub g: TreeGraph, 18 | } 19 | -------------------------------------------------------------------------------- /src/sequence.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Johannes Köster. 
2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | #[cfg(feature = "serde")] 7 | use serde::{Deserialize, Serialize}; 8 | use strum_macros::{AsRefStr, Display}; 9 | use SequenceReadPairOrientation::None; 10 | 11 | /// A DNA base 12 | pub type Base = u8; 13 | /// An amino acid 14 | pub type AminoAcid = u8; 15 | /// A biological sequence 16 | pub type Sequence = Vec; 17 | 18 | pub trait SequenceRead { 19 | /// Read name. 20 | fn name(&self) -> &[u8]; 21 | /// Base at position `i` in the read. 22 | fn base(&self, i: usize) -> u8; 23 | /// Base quality at position `i` in the read. 24 | fn base_qual(&self, i: usize) -> u8; 25 | /// Read length. 26 | fn len(&self) -> usize; 27 | /// Return `true` if read is empty. 28 | fn is_empty(&self) -> bool { 29 | self.len() == 0 30 | } 31 | } 32 | 33 | /// Representation of sequence read pair orientation 34 | /// (e.g. F1R2 means that the forward read comes first on the reference contig, 35 | /// followed by the reverse read, on the same contig). 
36 | /// 37 | /// This enum can be pretty-printed into a readable string repesentation: 38 | /// 39 | /// ```rust 40 | /// use bio_types::sequence::SequenceReadPairOrientation; 41 | /// 42 | /// // format into string 43 | /// println!("{}", SequenceReadPairOrientation::F1R2); 44 | /// // obtain string via `AsRef<&'static str>` 45 | /// assert_eq!(SequenceReadPairOrientation::R1F2.as_ref(), "R1F2"); 46 | /// ``` 47 | #[derive(Debug, Clone, Copy, PartialEq, Eq, AsRefStr, Display)] 48 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 49 | pub enum SequenceReadPairOrientation { 50 | F1R2, 51 | F2R1, 52 | R1F2, 53 | R2F1, 54 | F1F2, 55 | R1R2, 56 | F2F1, 57 | R2R1, 58 | None, 59 | } 60 | 61 | impl Default for SequenceReadPairOrientation { 62 | fn default() -> Self { 63 | None 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/strand.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014-2016 Johannes Köster. 2 | // Licensed under the MIT license (http://opensource.org/licenses/MIT) 3 | // This file may not be copied, modified, or distributed 4 | // except according to those terms. 5 | 6 | //! Data types for strand information on annotations. 7 | 8 | #[cfg(feature = "serde")] 9 | use serde::{Deserialize, Serialize}; 10 | use std::fmt::{self, Display, Formatter}; 11 | use std::ops::Neg; 12 | use std::str::FromStr; 13 | use thiserror::Error; 14 | 15 | /// Strand information. 16 | #[derive(Debug, Clone, Copy)] 17 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 18 | pub enum Strand { 19 | Forward, 20 | Reverse, 21 | Unknown, 22 | } 23 | 24 | impl Strand { 25 | /// Returns a `Strand` enum representing the given char. 26 | /// 27 | /// The mapping is as follows: 28 | /// * '+', 'f', or 'F' becomes `Strand::Forward` 29 | /// * '-', 'r', or 'R' becomes `Strand::Reverse` 30 | /// * '.', '?' 
becomes `Strand::Unknown` 31 | /// * Any other inputs will return an `Err(StrandError::InvalidChar)` 32 | pub fn from_char(strand_char: &char) -> Result { 33 | match *strand_char { 34 | '+' | 'f' | 'F' => Ok(Strand::Forward), 35 | '-' | 'r' | 'R' => Ok(Strand::Reverse), 36 | '.' | '?' => Ok(Strand::Unknown), 37 | invalid => Err(StrandError::InvalidChar(invalid)), 38 | } 39 | } 40 | 41 | /// Symbol denoting the strand. By convention, in BED and GFF 42 | /// files, the forward strand is `+`, the reverse strand is `-`, 43 | /// and unknown or unspecified strands are `.`. 44 | pub fn strand_symbol(&self) -> &str { 45 | match *self { 46 | Strand::Forward => "+", 47 | Strand::Reverse => "-", 48 | Strand::Unknown => ".", 49 | } 50 | } 51 | 52 | pub fn is_unknown(&self) -> bool { 53 | matches!(*self, Strand::Unknown) 54 | } 55 | } 56 | 57 | #[allow(clippy::match_like_matches_macro)] 58 | impl PartialEq for Strand { 59 | /// Returns true if both are `Forward` or both are `Reverse`, otherwise returns false. 
60 | fn eq(&self, other: &Strand) -> bool { 61 | match (self, other) { 62 | (&Strand::Forward, &Strand::Forward) => true, 63 | (&Strand::Reverse, &Strand::Reverse) => true, 64 | _ => false, 65 | } 66 | } 67 | } 68 | 69 | impl Neg for Strand { 70 | type Output = Strand; 71 | fn neg(self) -> Strand { 72 | match self { 73 | Strand::Forward => Strand::Reverse, 74 | Strand::Reverse => Strand::Forward, 75 | Strand::Unknown => Strand::Unknown, 76 | } 77 | } 78 | } 79 | 80 | #[allow(clippy::match_like_matches_macro)] 81 | impl Same for Strand { 82 | fn same(&self, s1: &Self) -> bool { 83 | match (*self, *s1) { 84 | (Strand::Forward, Strand::Forward) => true, 85 | (Strand::Reverse, Strand::Reverse) => true, 86 | (Strand::Unknown, Strand::Unknown) => true, 87 | _ => false, 88 | } 89 | } 90 | } 91 | 92 | impl Display for Strand { 93 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 94 | f.write_str(self.strand_symbol()) 95 | } 96 | } 97 | 98 | impl FromStr for Strand { 99 | type Err = StrandError; 100 | fn from_str(s: &str) -> Result { 101 | match s { 102 | "+" | "(+)" => Ok(Strand::Forward), 103 | "-" | "(-)" => Ok(Strand::Reverse), 104 | "." | "" => Ok(Strand::Unknown), 105 | _ => Err(StrandError::ParseError), 106 | } 107 | } 108 | } 109 | 110 | impl From for Strand { 111 | fn from(rstr: ReqStrand) -> Self { 112 | match rstr { 113 | ReqStrand::Forward => Strand::Forward, 114 | ReqStrand::Reverse => Strand::Reverse, 115 | } 116 | } 117 | } 118 | 119 | impl From> for Strand { 120 | fn from(orstr: Option) -> Self { 121 | match orstr { 122 | Some(ReqStrand::Forward) => Strand::Forward, 123 | Some(ReqStrand::Reverse) => Strand::Reverse, 124 | None => Strand::Unknown, 125 | } 126 | } 127 | } 128 | 129 | impl From for Strand { 130 | fn from(_: NoStrand) -> Self { 131 | Strand::Unknown 132 | } 133 | } 134 | 135 | /// Strand information for annotations that require a strand. 
136 | #[derive(Debug, Clone, Hash, PartialEq, Eq, Ord, PartialOrd, Copy)] 137 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 138 | pub enum ReqStrand { 139 | Forward, 140 | Reverse, 141 | } 142 | 143 | impl ReqStrand { 144 | /// Returns a `ReqStrand` enum representing the given char. 145 | /// 146 | /// The mapping is as follows: 147 | /// * '+', 'f', or 'F' becomes `Strand::Forward` 148 | /// * '-', 'r', or 'R' becomes `Strand::Reverse` 149 | /// * Any other inputs will return an `Err(StrandError::InvalidChar)` 150 | pub fn from_char(strand_char: &char) -> Result { 151 | match *strand_char { 152 | '+' | 'f' | 'F' => Ok(ReqStrand::Forward), 153 | '-' | 'r' | 'R' => Ok(ReqStrand::Reverse), 154 | invalid => Err(StrandError::InvalidChar(invalid)), 155 | } 156 | } 157 | 158 | /// Symbol denoting the strand. By convention, in BED and GFF 159 | /// files, the forward strand is `+` and the reverse strand is `-`. 160 | pub fn strand_symbol(&self) -> &str { 161 | match *self { 162 | ReqStrand::Forward => "+", 163 | ReqStrand::Reverse => "-", 164 | } 165 | } 166 | 167 | /// Convert the (optional) strand of some other annotation 168 | /// according to this strand. That is, reverse the strand of the 169 | /// other annotation for `ReqStrand::Reverse` and leave it 170 | /// unchanged for `ReqStrand::Forward`. 171 | /// 172 | /// # Arguments 173 | /// 174 | /// * `x` is the strand information from some other annotation. 
175 | /// 176 | /// ``` 177 | /// use bio_types::strand::{ReqStrand,Strand}; 178 | /// assert_eq!(ReqStrand::Forward.on_strand(Strand::Reverse), 179 | /// ReqStrand::Reverse.on_strand(Strand::Forward)); 180 | /// ``` 181 | pub fn on_strand(&self, x: T) -> T 182 | where 183 | T: Neg, 184 | { 185 | match self { 186 | ReqStrand::Forward => x, 187 | ReqStrand::Reverse => -x, 188 | } 189 | } 190 | } 191 | 192 | impl Same for ReqStrand { 193 | fn same(&self, s1: &Self) -> bool { 194 | self == s1 195 | } 196 | } 197 | 198 | impl Display for ReqStrand { 199 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 200 | f.write_str(self.strand_symbol()) 201 | } 202 | } 203 | 204 | impl FromStr for ReqStrand { 205 | type Err = StrandError; 206 | fn from_str(s: &str) -> Result { 207 | match s { 208 | "+" | "(+)" => Ok(ReqStrand::Forward), 209 | "-" | "(-)" => Ok(ReqStrand::Reverse), 210 | _ => Err(StrandError::ParseError), 211 | } 212 | } 213 | } 214 | 215 | impl From for Option { 216 | fn from(strand: Strand) -> Option { 217 | match strand { 218 | Strand::Forward => Some(ReqStrand::Forward), 219 | Strand::Reverse => Some(ReqStrand::Reverse), 220 | Strand::Unknown => None, 221 | } 222 | } 223 | } 224 | 225 | impl From for Option { 226 | fn from(_: NoStrand) -> Option { 227 | None 228 | } 229 | } 230 | 231 | impl Neg for ReqStrand { 232 | type Output = ReqStrand; 233 | fn neg(self) -> ReqStrand { 234 | match self { 235 | ReqStrand::Forward => ReqStrand::Reverse, 236 | ReqStrand::Reverse => ReqStrand::Forward, 237 | } 238 | } 239 | } 240 | 241 | /// Strand information for annotations that definitively have no 242 | /// strand information. 
243 | #[derive(Debug, Clone, Hash, PartialEq, Eq, Ord, PartialOrd, Copy)] 244 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 245 | pub enum NoStrand { 246 | Unknown, 247 | } 248 | 249 | impl Neg for NoStrand { 250 | type Output = NoStrand; 251 | fn neg(self) -> NoStrand { 252 | match self { 253 | NoStrand::Unknown => NoStrand::Unknown, 254 | } 255 | } 256 | } 257 | 258 | impl Same for NoStrand { 259 | fn same(&self, _s1: &Self) -> bool { 260 | true 261 | } 262 | } 263 | 264 | impl FromStr for NoStrand { 265 | type Err = StrandError; 266 | fn from_str(s: &str) -> Result { 267 | match s { 268 | "" => Ok(NoStrand::Unknown), 269 | _ => Err(StrandError::ParseError), 270 | } 271 | } 272 | } 273 | 274 | impl Display for NoStrand { 275 | fn fmt(&self, _f: &mut Formatter) -> fmt::Result { 276 | Ok(()) 277 | } 278 | } 279 | 280 | /// Equality-like operator for comparing strand information. Unknown 281 | /// strands are not equal, but they are the "same" as other unknown 282 | /// strands. 283 | pub trait Same { 284 | /// Indicate when two strands are the "same" -- two 285 | /// unknown/unspecified strands are the "same" but are not equal. 
286 | fn same(&self, other: &Self) -> bool; 287 | } 288 | 289 | impl Same for Option 290 | where 291 | T: Same, 292 | { 293 | fn same(&self, s1: &Self) -> bool { 294 | match (self, s1) { 295 | (&Option::None, &Option::None) => true, 296 | (&Option::Some(ref x), &Option::Some(ref x1)) => x.same(x1), 297 | (_, _) => false, 298 | } 299 | } 300 | } 301 | 302 | #[derive(Error, Debug)] 303 | pub enum StrandError { 304 | #[error("invalid character for strand conversion: {0:?}: can not be converted to a Strand")] 305 | InvalidChar(char), 306 | #[error("error parsing strand")] 307 | ParseError, 308 | } 309 | 310 | #[cfg(test)] 311 | mod tests { 312 | use super::*; 313 | 314 | #[test] 315 | fn test_strand() { 316 | assert_eq!(Strand::from_char(&'+').unwrap(), Strand::Forward); 317 | assert_eq!(Strand::from_char(&'-').unwrap(), Strand::Reverse); 318 | assert!(Strand::from_char(&'.').unwrap().is_unknown()); 319 | assert!(Strand::from_char(&'o').is_err()); 320 | assert_eq!(Strand::Forward.strand_symbol(), "+"); 321 | assert_eq!(Strand::Reverse.strand_symbol(), "-"); 322 | assert_eq!(Strand::Unknown.strand_symbol(), "."); 323 | } 324 | 325 | #[test] 326 | fn test_req_strand() { 327 | assert_eq!(ReqStrand::from_char(&'+').unwrap(), ReqStrand::Forward); 328 | assert_eq!(ReqStrand::from_char(&'-').unwrap(), ReqStrand::Reverse); 329 | assert!(ReqStrand::from_char(&'o').is_err()); 330 | assert_eq!(ReqStrand::Forward.strand_symbol(), "+"); 331 | assert_eq!(ReqStrand::Reverse.strand_symbol(), "-"); 332 | } 333 | } 334 | -------------------------------------------------------------------------------- /src/variant.rs: -------------------------------------------------------------------------------- 1 | use crate::genome; 2 | use crate::sequence::{Base, Sequence}; 3 | 4 | #[cfg(feature = "serde")] 5 | use serde::{Deserialize, Serialize}; 6 | 7 | /// A trait for providing variant information. This can e.g. be implemented by file readers. 
8 | pub trait AbstractVariant: genome::AbstractLocus { 9 | fn kind(&self) -> &Kind; 10 | } 11 | 12 | /// Possible genomic variants. 13 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 14 | #[derive(Debug, PartialEq, Eq, Clone, Hash)] 15 | pub enum Kind { 16 | SNV(Base), 17 | MNV(Sequence), 18 | Insertion(Sequence), 19 | Deletion(genome::Length), 20 | Duplication(genome::Length), 21 | Inversion(genome::Length), 22 | None, 23 | } 24 | 25 | impl Kind { 26 | /// Return variant length. 27 | pub fn len(&self) -> genome::Length { 28 | match *self { 29 | Kind::SNV(_) => 1, 30 | Kind::MNV(ref s) => s.len() as u64, 31 | Kind::Insertion(ref s) => s.len() as u64, 32 | Kind::Deletion(l) => l, 33 | Kind::Duplication(l) => l, 34 | Kind::Inversion(l) => l, 35 | Kind::None => 1, 36 | } 37 | } 38 | /// Check if length is zero 39 | pub fn is_empty(&self) -> bool { 40 | self.len() == 0 41 | } 42 | } 43 | --------------------------------------------------------------------------------