├── .github
    ├── CODEOWNERS
    ├── labels.yml
    ├── workflows
    │   ├── stale.yml
    │   ├── validate-codecov-config.yml
    │   ├── python-ci-cd.yml
    │   └── labels.yml
    ├── stale.yml
    └── actions
    │   └── setup-python-env
    │       └── action.yml
├── docs
    ├── modules
    │   ├── .nav.yml
    │   ├── bioutils.digest.md
    │   ├── bioutils.digests.md
    │   ├── bioutils.cytobands.md
    │   ├── bioutils.normalize.md
    │   ├── bioutils.sequences.md
    │   ├── bioutils.accessions.md
    │   ├── bioutils.assemblies.md
    │   ├── bioutils.coordinates.md
    │   ├── bioutils.exceptions.md
    │   ├── bioutils.seqfetcher.md
    │   └── bioutils.vmc_digest.md
    ├── getting-started.md
    ├── changelog
    │   ├── index.rst
    │   ├── 0.4
    │   │   ├── index.rst
    │   │   ├── 0.4.3.clog
    │   │   ├── 0.4.2.clog
    │   │   ├── 0.4.4.clog
    │   │   ├── 0.4.2.rst
    │   │   ├── 0.4.3.rst
    │   │   ├── 0.4.4.rst
    │   │   ├── 0.4.1.clog
    │   │   ├── 0.4.0.clog
    │   │   ├── 0.4.1.rst
    │   │   ├── Makefile
    │   │   └── 0.4.0.rst
    │   └── 0.5
    │   │   ├── index.rst
    │   │   ├── 0.5.5.clog
    │   │   ├── 0.5.4.clog
    │   │   ├── 0.5.7.clog
    │   │   ├── 0.5.1.clog
    │   │   ├── 0.5.5.rst
    │   │   ├── 0.5.7.rst
    │   │   ├── 0.5.4.rst
    │   │   ├── 0.5.1.rst
    │   │   ├── 0.5.3.clog
    │   │   ├── 0.5.2.clog
    │   │   ├── 0.5.6.clog
    │   │   ├── 0.5.2.rst
    │   │   ├── Makefile
    │   │   ├── 0.5.3.rst
    │   │   ├── 0.5.8.clog
    │   │   ├── 0.5.6.rst
    │   │   ├── 0.5.0.clog
    │   │   ├── 0.5.8.rst
    │   │   └── 0.5.0.rst
    └── index.md
├── src
    └── bioutils
    │   ├── _data
    │       ├── assemblies
    │       │   ├── .gitignore
    │       │   ├── GRCh37.json.gz
    │       │   ├── GRCh38.json.gz
    │       │   ├── NCBI33.json.gz
    │       │   ├── NCBI34.json.gz
    │       │   ├── NCBI35.json.gz
    │       │   ├── NCBI36.json.gz
    │       │   ├── CHM1_1.0.json.gz
    │       │   ├── CHM1_1.1.json.gz
    │       │   ├── GRCh37.p10.json.gz
    │       │   ├── GRCh37.p11.json.gz
    │       │   ├── GRCh37.p12.json.gz
    │       │   ├── GRCh37.p13.json.gz
    │       │   ├── GRCh37.p2.json.gz
    │       │   ├── GRCh37.p5.json.gz
    │       │   ├── GRCh37.p9.json.gz
    │       │   ├── GRCh38.p1.json.gz
    │       │   ├── GRCh38.p10.json.gz
    │       │   ├── GRCh38.p11.json.gz
    │       │   ├── GRCh38.p12.json.gz
    │       │   ├── GRCh38.p13.json.gz
    │       │   ├── GRCh38.p14.json.gz
    │       │   ├── GRCh38.p2.json.gz
    │       │   ├── GRCh38.p3.json.gz
    │       │   ├── GRCh38.p4.json.gz
    │       │   ├── GRCh38.p5.json.gz
    │       │   ├── GRCh38.p6.json.gz
    │       │   ├── GRCh38.p7.json.gz
    │       │   ├── GRCh38.p8.json.gz
    │       │   ├── GRCh38.p9.json.gz
    │       │   ├── T2T-CHM13v2.0.json.gz
    │       │   └── Makefile
    │       └── cytobands
    │       │   ├── ucsc-hg19.json.gz
    │       │   └── ucsc-hg38.json.gz
    │   ├── exceptions.py
    │   ├── __init__.py
    │   ├── _versionwarning.py
    │   ├── cytobands.py
    │   ├── coordinates.py
    │   ├── digest.py
    │   ├── vmc_digest.py
    │   ├── assemblies.py
    │   ├── digests.py
    │   ├── accessions.py
    │   ├── seqfetcher.py
    │   ├── normalize.py
    │   └── sequences.py
├── tests
    ├── data
    │   ├── seqs.fa.gz
    │   └── cassettes
    │   │   ├── test_fetch_seq_errors
    │   │   └── test_fetch_seq_ncbi_invalid_positions
    ├── conftest.py
    ├── test_sequences.py
    ├── test_seqfetcher.py
    └── test_normalize.py
├── .git-blame-ignore-revs
├── .vscode
    └── settings.json
├── codecov.yaml
├── .deepsource.toml
├── CONTRIBUTING.md
├── tox.ini
├── CONTRIBUTORS.txt
├── .readthedocs.yml
├── .coveragerc
├── sbin
    ├── ucsc-cytoband-to-json
    ├── makefile-extract-documentation
    ├── generate-assembly-sql
    └── assembly-to-json
├── .mailmap
├── .pre-commit-config.yaml
├── mkdocs.yml
├── .gitignore
├── bin
    └── fasta-ga4gh-identifier
├── README.rst
├── README.md
├── Makefile
├── pyproject.toml
└── LICENSE.txt


/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @biocommons/maintainers
2 | 


--------------------------------------------------------------------------------
/docs/modules/.nav.yml:
--------------------------------------------------------------------------------
1 | title: "Reference"
2 | 


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/.gitignore:
--------------------------------------------------------------------------------
1 | pull
2 | 


--------------------------------------------------------------------------------
/.github/labels.yml:
--------------------------------------------------------------------------------
1 | # file must contain an array, which may be empty
2 | 
3 | []
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.digest.md:
--------------------------------------------------------------------------------
1 | # bioutils.digest
2 | 
3 | ::: bioutils.digest
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.digests.md:
--------------------------------------------------------------------------------
1 | # bioutils.digests
2 | 
3 | ::: bioutils.digests
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.cytobands.md:
--------------------------------------------------------------------------------
1 | # bioutils.cytobands
2 | 
3 | ::: bioutils.cytobands
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.normalize.md:
--------------------------------------------------------------------------------
1 | # bioutils.normalize
2 | 
3 | ::: bioutils.normalize
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.sequences.md:
--------------------------------------------------------------------------------
1 | # bioutils.sequences
2 | 
3 | ::: bioutils.sequences
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.accessions.md:
--------------------------------------------------------------------------------
1 | # bioutils.accessions
2 | 
3 | ::: bioutils.accessions
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.assemblies.md:
--------------------------------------------------------------------------------
1 | # bioutils.assemblies
2 | 
3 | ::: bioutils.assemblies
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.coordinates.md:
--------------------------------------------------------------------------------
1 | # bioutils.coordinates
2 | 
3 | ::: bioutils.coordinates
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.exceptions.md:
--------------------------------------------------------------------------------
1 | # bioutils.exceptions
2 | 
3 | ::: bioutils.exceptions
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.seqfetcher.md:
--------------------------------------------------------------------------------
1 | # bioutils.seqfetcher
2 | 
3 | ::: bioutils.seqfetcher
4 | 


--------------------------------------------------------------------------------
/docs/modules/bioutils.vmc_digest.md:
--------------------------------------------------------------------------------
1 | # bioutils.vmc_digest
2 | 
3 | ::: bioutils.vmc_digest
4 | 


--------------------------------------------------------------------------------
/tests/data/seqs.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/tests/data/seqs.fa.gz


--------------------------------------------------------------------------------
/docs/getting-started.md:
--------------------------------------------------------------------------------
1 | # Getting Started
2 | 
3 | ## Installation
4 | 
5 |     pip install bioutils
6 | 


--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # .git-blame-ignore-revs
2 | # Initial Ruff formatting
3 | f82e37ddd260eac32b0396ab4baad78a82527b29
4 | 


--------------------------------------------------------------------------------
/src/bioutils/exceptions.py:
--------------------------------------------------------------------------------
1 | class BioutilsError(Exception):
2 |     """Root exception for all bioutils exceptions"""
3 | 
4 |     pass
5 | 


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/NCBI33.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI33.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/NCBI34.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI34.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/NCBI35.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI35.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/NCBI36.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI36.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/CHM1_1.0.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/CHM1_1.0.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/CHM1_1.1.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/CHM1_1.1.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/cytobands/ucsc-hg19.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/cytobands/ucsc-hg19.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/cytobands/ucsc-hg38.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/cytobands/ucsc-hg38.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.p10.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p10.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.p11.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p11.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.p12.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p12.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.p13.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p13.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.p2.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p2.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.p5.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p5.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh37.p9.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p9.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p1.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p1.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p10.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p10.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p11.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p11.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p12.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p12.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p13.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p13.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p14.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p14.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p2.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p2.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p3.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p3.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p4.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p4.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p5.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p5.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p6.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p6.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p7.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p7.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p8.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p8.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/GRCh38.p9.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p9.json.gz


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/T2T-CHM13v2.0.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/T2T-CHM13v2.0.json.gz


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "python.formatting.provider": "yapf",
3 |     "editor.formatOnSave": true,
4 |     "python.venvPath": "${workspaceFolder}/venv/",
5 | }


--------------------------------------------------------------------------------
/docs/changelog/index.rst:
--------------------------------------------------------------------------------
 1 | .. _changelog:
 2 | 
 3 | Change Log
 4 | !!!!!!!!!!
 5 | 
 6 | .. toctree::
 7 |    :maxdepth: 2
 8 | 
 9 |    0.4/index
10 |    0.5/index
11 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/index.rst:
--------------------------------------------------------------------------------
 1 | 0.4 Series
 2 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 |    :glob:
 7 |    :reversed:
 8 | 
 9 |    0.*
10 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/index.rst:
--------------------------------------------------------------------------------
 1 | 0.5 Series
 2 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 |    :glob:
 7 |    :reversed:
 8 | 
 9 |    0.*
10 | 


--------------------------------------------------------------------------------
/codecov.yaml:
--------------------------------------------------------------------------------
 1 | coverage:
 2 |   range: 70..100
 3 |   round: down
 4 |   precision: 1
 5 |   status:
 6 |     project:
 7 |       default:
 8 |         target: 90%
 9 |         threshold: 0.5%
10 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.3.clog:
--------------------------------------------------------------------------------
1 | clog format: 1; -*-outline-*-
2 | * 0.4.3 (2019-04-05)
3 | Changes since 0.4.2 (2019-02-21).
4 | ** New Features
5 | *** Fixes #16: Retry seqfetcher when rate limit exceeded [92d7210]
6 | 


--------------------------------------------------------------------------------
/.deepsource.toml:
--------------------------------------------------------------------------------
 1 | version = 1
 2 | 
 3 | test_patterns = [
 4 |   
 5 | ]
 6 | 
 7 | exclude_patterns = [
 8 |   
 9 | ]
10 | 
11 | [[analyzers]]
12 | name = 'python'
13 | enabled = true
14 | runtime_version = '3.x.x'
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.5.clog:
--------------------------------------------------------------------------------
1 | clog format: 1; -*-outline-*-
2 | * 0.5.5 (2021-05-03)
3 | Changes since 0.5.4 (2021-05-02).
4 | ** Bug Fixes
5 | *** Don't retry sequence fetch with invalid coordinates [94e80cd] (pjcoenen)
6 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.4.clog:
--------------------------------------------------------------------------------
1 | clog format: 1; -*-outline-*-
2 | * 0.5.4 (2021-05-02)
3 | Changes since 0.5.3 (2021-04-14).
4 | ** Internal and Developer Changes
5 | *** #31: improve support for degenerate codons [ebcec67] (kayleeyuhas)
6 | 


--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
 1 | name: 'Close stale issues and PRs'
 2 | on:
 3 |   workflow_dispatch:
 4 |   schedule:
 5 |     - cron: '1 1 * * *'
 6 | 
 7 | jobs:
 8 |   stale:
 9 |     uses: biocommons/.github/.github/workflows/stale.yml@main
10 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.2.clog:
--------------------------------------------------------------------------------
1 | clog format: 1; -*-outline-*-
2 | * 0.4.2 (2019-02-21)
3 | Changes since 0.4.1 (2019-02-21).
4 | ** Internal and Developer Changes
5 | *** reraise all requests exceptions (not just HTTPError) as RuntimeError [daece64]
6 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.7.clog:
--------------------------------------------------------------------------------
1 | clog format: 1; -*-outline-*-
2 | * 0.5.7 (2022-06-13)
3 | Changes since 0.5.6 (2022-06-09).
4 | ** New Features
5 | *** Enable independent control of trimming and shuffling during normalization [203ef4e] (Ryan Gomoto)
6 | 


--------------------------------------------------------------------------------
/src/bioutils/__init__.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import PackageNotFoundError, version
2 | 
3 | try:
4 |     __version__ = version(__package__)
5 | except PackageNotFoundError:  # pragma: no cover
6 |     # package is not installed
7 |     __version__ = None
8 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.1.clog:
--------------------------------------------------------------------------------
1 | clog format: 1; -*-outline-*-
2 | * 0.5.1 (2019-07-31)
3 | Changes since 0.5.0 (2019-07-22).
4 | ** Internal and Developer Changes
5 | *** Closes #26: Fix LICENSE filename typo that prevented wheel builds :-( [df2fe4a] (Reece Hart)
6 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.5.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.5 (2021-05-03)
 3 | ###################
 4 | 
 5 | Changes since 0.5.4 (2021-05-02).
 6 | 
 7 | Bug Fixes
 8 | $$$$$$$$$$
 9 | 
10 | * Don't retry sequence fetch with invalid coordinates [`94e80cd <https://github.com/biocommons/bioutils/commit/94e80cd>`_] (pjcoenen)
11 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Thank you for your interest in contributing to the biocommons community!
2 | Contributions are welcome and greatly appreciated. There are many types of
3 | contributions and you don't need to be a developer!
4 | 
5 | To get started, see https://biocommons.org/contributing/.  We look forward to
6 | hearing from you!
7 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.7.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.7 (2022-06-13)
 3 | ###################
 4 | 
 5 | Changes since 0.5.6 (2022-06-09).
 6 | 
 7 | New Features
 8 | $$$$$$$$$$$$$
 9 | 
10 | * Enable independent control of trimming and shuffling during normalization [`203ef4e <https://github.com/biocommons/bioutils/commit/203ef4e>`_] (Ryan Gomoto)
11 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.4.clog:
--------------------------------------------------------------------------------
1 | clog format: 1; -*-outline-*-
2 | * 0.4.4 (2019-05-13)
3 | Changes since 0.4.3 (2019-04-05).
4 | ** Special Attention
5 | *** This is the last release in the 0.4 series.
6 | Future biocommons packages will be tested and supported only on Python
7 | >= 3.6 (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6)
8 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.2.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.4.2 (2019-02-21)
 3 | ###################
 4 | 
 5 | Changes since 0.4.1 (2019-02-21).
 6 | 
 7 | Internal and Developer Changes
 8 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 9 | 
10 | * reraise all requests exceptions (not just HTTPError) as RuntimeError [`daece64 <https://github.com/biocommons/bioutils/commit/daece64>`_]
11 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.3.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.4.3 (2019-04-05)
 3 | ###################
 4 | 
 5 | Changes since 0.4.2 (2019-02-21).
 6 | 
 7 | New Features
 8 | $$$$$$$$$$$$$
 9 | 
10 | * Fixes `#16 <https://github.com/biocommons/bioutils/issues/16/>`_: Retry seqfetcher when rate limit exceeded [`92d7210 <https://github.com/biocommons/bioutils/commit/92d7210>`_]
11 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.4.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.4.4 (2019-05-13)
 3 | ###################
 4 | 
 5 | Changes since 0.4.3 (2019-04-05).
 6 | 
 7 | Special Attention
 8 | $$$$$$$$$$$$$$$$$$
 9 | 
10 | * This is the last release in the 0.4 series.
11 | Future biocommons packages will be tested and supported only on Python
12 | >= 3.6 (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6)
13 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.4.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.4 (2021-05-02)
 3 | ###################
 4 | 
 5 | Changes since 0.5.3 (2021-04-14).
 6 | 
 7 | Internal and Developer Changes
 8 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 9 | 
10 | * `#31 <https://github.com/biocommons/bioutils/issues/31/>`_: improve support for degenerate codons [`ebcec67 <https://github.com/biocommons/bioutils/commit/ebcec67>`_] (kayleeyuhas)
11 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.1.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.1 (2019-07-31)
 3 | ###################
 4 | 
 5 | Changes since 0.5.0 (2019-07-22).
 6 | 
 7 | Internal and Developer Changes
 8 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 9 | 
10 | * Closes `#26 <https://github.com/biocommons/bioutils/issues/26/>`_: Fix LICENSE filename typo that prevented wheel builds :-( [`df2fe4a <https://github.com/biocommons/bioutils/commit/df2fe4a>`_] (Reece Hart)
11 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | skipsdist = true
 3 | envlist = py39, py310, py311, py312, py313
 4 | 
 5 | [gh-actions]
 6 | python =
 7 |     3.11: py311
 8 |     3.12: py312
 9 |     3.13: py313
10 | 
11 | [testenv]
12 | passenv = PYTHON_VERSION
13 | allowlist_externals = uv
14 | commands =
15 |     uv sync --python {envpython}
16 |     uv run python -m pytest --doctest-modules tests --cov --cov-config=pyproject.toml --cov-report=xml
17 |     ty check
18 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.1.clog:
--------------------------------------------------------------------------------
 1 | clog format: 1; -*-outline-*-
 2 | * 0.4.1 (2019-02-21)
 3 | Changes since 0.4.0 (2018-11-11).
 4 | ** Other Changes
 5 | *** expose underlying exception on http failure [9e56110]
 6 | ** Internal and Developer Changes
 7 | *** updated badges [8f91ed1]
 8 | *** added LICENSE [b3d6d64]
 9 | *** added missing contributors definition [97f78b3]
10 | *** updated badge list [de2bf15]
11 | *** sync'd project files with eutils [3102695]
12 | 


--------------------------------------------------------------------------------
/.github/workflows/validate-codecov-config.yml:
--------------------------------------------------------------------------------
 1 | name: Validate Codecov Config
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     paths: [codecov.yaml]
 6 |   push:
 7 |     branches: [main]
 8 | 
 9 | jobs:
10 |   validate-codecov-config:
11 |     runs-on: ubuntu-latest
12 |     permissions:
13 |       contents: read
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - name: Validate codecov configuration
17 |         run: curl -sSL --fail-with-body --data-binary @codecov.yaml https://codecov.io/validate
18 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.3.clog:
--------------------------------------------------------------------------------
 1 | clog format: 1; -*-outline-*-
 2 | * 0.5.3 (2021-04-14)
 3 | Changes since 0.5.2 (2019-11-06).
 4 | ** New Features
 5 | *** #29: Support ambiguity codes in translation [669a653] (kayleeyuhas)
 6 | *** added bin/fasta-ga4gh-identifier [63d1078] (Reece Hart)
 7 | ** Internal and Developer Changes
 8 | *** updated Makefile for Python 3.8 [29eecf5] (Reece Hart)
 9 | *** fix failing test and reformat [7cc5ebb] (kayleeyuhas)
10 | *** improve variable names and use string instead of list [5d7484b] (kayleeyuhas)
11 | 


--------------------------------------------------------------------------------
/src/bioutils/_versionwarning.py:
--------------------------------------------------------------------------------
 1 | """emits a warning when imported under Python < 3.6
 2 | 
 3 | This module may be used by other biocommons packages
 4 | 
 5 | """
 6 | 
 7 | import logging
 8 | import sys
 9 | 
10 | __all__ = []
11 | 
12 | version_warning = (
13 |     "biocommons packages are tested and supported only on Python >= 3.6"
14 |     " (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6)"
15 | )
16 | 
17 | _logger = logging.getLogger(__package__)
18 | 
19 | if sys.version_info < (3, 6):
20 |     _logger.warning(version_warning)
21 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.2.clog:
--------------------------------------------------------------------------------
 1 | clog format: 1; -*-outline-*-
 2 | * 0.5.2 (2019-11-06)
 3 | Changes since 0.5.1 (2019-07-31).
 4 | ** Special Attention
 5 | *** Thanks to @trentwatt for significant documentation contributions!  See
 6 | https://bioutils.readthedocs.io/en/master/ for his handiwork.
 7 | ** Other Changes
 8 | *** Added changelogs for 0.5.0 and 0.5.1, which @reece forgot to include :-(
 9 | *** #22 added function docs for all modules [c0090ed] (trentwatt)
10 | *** #23: fix setup.cfg description tags (`description` → `long-description`) [8945c04] (Reece Hart)
11 | 


--------------------------------------------------------------------------------
/.github/workflows/python-ci-cd.yml:
--------------------------------------------------------------------------------
 1 | name: Python CI/CD
 2 | permissions:
 3 |   contents: write
 4 |   id-token: write
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: ["*"]
 9 |     tags: ["*"]
10 | 
11 | jobs:
12 |   python-ci-cd:
13 |     name: Python CI/CD
14 |     permissions:
15 |       contents: write
16 |       id-token: write
17 |     uses: biocommons/.github/.github/workflows/python-ci-cd.yml@main
18 |     with:
19 |       publish: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') }}
20 |     secrets:
21 |       pypi-token: ${{ secrets.UV_PUBLISH_TOKEN }}
22 | 


--------------------------------------------------------------------------------
/CONTRIBUTORS.txt:
--------------------------------------------------------------------------------
 1 | Reece Hart <reecehart@gmail.com>
 2 | Ryan Gomoto <7393416+gomoto@users.noreply.github.com>
 3 | trentwatt <trentwatson1@gmail.com>
 4 | Alan Rubin <alan.rubin@wehi.edu.au>
 5 | kayleeyuhas <kaylee@genomoncology.com>
 6 | Andreas Prlic <andreas.prlic@invitae.com>
 7 | Dave Lawrence <davmlaw@gmail.com>
 8 | Kyle Ferriter <kferrite@broadinstitute.org>
 9 | Timothy Laurent <timothyjlaurent@gmail.com>
10 | Ben Robinson <ben.robinson@invitae.com>
11 | Lucas Wiman <lucas.wiman@gmail.com>
12 | Trent Watson <trentwatson1@gmail.com>
13 | pjcoenen <64436780+pjcoenen@users.noreply.github.com>
14 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.0.clog:
--------------------------------------------------------------------------------
 1 | clog format: 1; -*-outline-*-
 2 | * 0.4.0 (2018-10-22)
 3 | Changes since 0.3.3 (2017-09-03).
 4 | ** Important Notice
 5 | Support for Python <3.6 will be dropped on 2019-03-31. See
 6 | https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6
 7 | ** New Features
 8 | *** Closes #10: Support NCBI API keys (and NCBI_API_KEY env variable) [8739c98] (@timothyjlaurent)
 9 | *** Closes #12: add infer_namespaces and infer_namespace functions [2a53c7f]
10 | *** Dropped biopython dependency [0382b86] (@afrubin)
11 | *** Added bioutils.sequences.py:elide_sequence() function [018a762]
12 | *** Added GRCh38.p12 [3876f36]
13 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | 
 4 | import vcr
 5 | 
 6 | # set vcr logging level
 7 | logging.basicConfig()
 8 | logger = logging.getLogger("vcr")
 9 | 
10 | # set default location for vcr cassettes
11 | test_dir = os.path.dirname(__file__)
12 | test_data_dir = os.path.join(test_dir, "data", "cassettes")
13 | 
14 | # initialize vcr
15 | vcr.default_vcr = vcr.VCR(
16 |     cassette_library_dir=test_data_dir,
17 |     filter_headers=["Authorization"],
18 |     filter_post_data_parameters=["Authorization"],
19 |     record_mode=os.environ.get("VCR_RECORD_MODE", "once"),
20 | )
21 | vcr.use_cassette = vcr.default_vcr.use_cassette
22 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.6.clog:
--------------------------------------------------------------------------------
 1 | clog format: 1; -*-outline-*-
 2 | * 0.5.6 (2022-06-09)
 3 | Changes since 0.5.5 (2021-05-05).
 4 | ** Bug Fixes
 5 | *** fix #36 by adding a new translation table ... (#37) [b5d4d0f] (Andreas Prlic)
 6 | *** Fix test warnings and a new failure from #36 [22b5556] (Reece Hart)
 7 | ** New Features
 8 | *** Handle Ensembl transcript versions [b3eaf83] (Dave Lawrence)
 9 | ** Internal and Developer Changes
10 | *** Update Makefile to support newer bioutils conventions [ed6eaf6] (Reece Hart)
11 | *** Adopt GitHub Actions for testing and deployment [35c6a7f] (Reece Hart)
12 | *** Switch to Python 3.10 by default [5895087] (Reece Hart)
13 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | [![Release](https://img.shields.io/github/v/release/biocommons/bioutils)](https://img.shields.io/github/v/release/biocommons/bioutils)
4 | [![Build status](https://img.shields.io/github/actions/workflow/status/biocommons/bioutils/main.yml?branch=main)](https://github.com/biocommons/bioutils/actions/workflows/main.yml?query=branch%3Amain)
5 | [![Commit activity](https://img.shields.io/github/commit-activity/m/biocommons/bioutils)](https://img.shields.io/github/commit-activity/m/biocommons/bioutils)
6 | [![License](https://img.shields.io/github/license/biocommons/bioutils)](https://img.shields.io/github/license/biocommons/bioutils)
7 | 
8 | Package Description
9 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/source/conf.py
11 | 
12 | # Optionally build your docs in additional formats such as PDF and ePub
13 | formats: all
14 | 
15 | # Optionally set the version of Python and requirements required to build your docs
16 | python:
17 |   version: 3.7
18 |   install:
19 |     - method: pip
20 |       path: .
21 |       extra_requirements:
22 |         - docs
23 |     - method: setuptools
24 |       path: .
25 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | # .coveragerc to control coverage.py
 2 | [run]
 3 | branch = True
 4 | source = bioutils
 5 | # omit = bad_file.py
 6 | 
 7 | [paths]
 8 | source =
 9 |     src/
10 | 
11 | [report]
12 | # Regexes for lines to exclude from consideration
13 | exclude_lines =
14 |     # Have to re-enable the standard pragma
15 |     pragma: no cover
16 | 
17 |     # Don't complain about missing debug-only code:
18 |     def __repr__
19 |     if self\.debug
20 | 
21 |     # Don't complain if tests don't hit defensive assertion code:
22 |     raise AssertionError
23 |     raise NotImplementedError
24 | 
25 |     # Don't complain if non-runnable code isn't run:
26 |     if 0:
27 |     if __name__ == .__main__.:
28 | 


--------------------------------------------------------------------------------
/.github/workflows/labels.yml:
--------------------------------------------------------------------------------
 1 | name: Sync labels
 2 | on:
 3 |   workflow_dispatch:
 4 |   push:
 5 |     branches:
 6 |       - 'main'
 7 |     paths:
 8 |       - '.github/labels.yml'
 9 |       - '.github/workflows/labels.yml'
10 | 
11 | permissions:
12 |   issues: write
13 | 
14 | jobs:
15 |   labels:
16 |     runs-on: ubuntu-latest
17 | 
18 |     steps:
19 |       - uses: actions/checkout@v4
20 |         with:
21 |           sparse-checkout: .github/labels.yml
22 | 
23 |       - uses: EndBug/label-sync@v2
24 |         with:
25 |           config-file: |
26 |             https://raw.githubusercontent.com/biocommons/.github/main/etc/labels.yml
27 |             .github/labels.yml
28 | 
29 |           delete-other-labels: false


--------------------------------------------------------------------------------
/sbin/ucsc-cytoband-to-json:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import collections
 4 | import csv
 5 | import gzip
 6 | import json
 7 | import sys
 8 | 
 9 | 
10 | def open_any(fn):
11 |     if fn == "-":
12 |         return sys.stdin
13 |     if fn.endswith(".gz"):
14 |         return gzip.open(fn, mode="rt", encoding="utf-8")
15 |     return open(fn, mode="rt")
16 | 
17 | 
18 | chr_band_map = collections.defaultdict(dict)
19 | 
20 | rdr = csv.reader(open_any(sys.argv[1]), delimiter="\t")
21 | for row in rdr:
22 |     chr, start, end, band, stain = row
23 |     if chr.startswith("chr"):
24 |         chr = chr[3:] 
25 |     chr_band_map[chr][band] = (int(start),int(end),stain)
26 | 
27 | 
28 | json.dump(chr_band_map, sys.stdout, indent=None, sort_keys=True)
29 | 


--------------------------------------------------------------------------------
/src/bioutils/_data/assemblies/Makefile:
--------------------------------------------------------------------------------
 1 | # Download assembly info from NCBI and convert to json
 2 | # Use:
 3 | # $ make update
 4 | 
 5 | .PHONY: FORCE
 6 | .SUFFIXES:
 7 | .DELETE_ON_ERROR:
 8 | 
 9 | SHELL:=/bin/bash -o pipefail
10 | PATH:=../../../../sbin:${PATH}
11 | 
12 | update:
13 | 	make pull
14 | 	make json
15 | 
16 | # rsyncs all assembly records into pull/ directory
17 | pull: FORCE
18 | 	mkdir -p $@
19 | 	rsync -L -v --no-motd ftp.ncbi.nlm.nih.gov::genomes/all/GCF/000/001/405/*/*assembly_report.txt $@
20 | 	rsync -L -v --no-motd ftp.ncbi.nlm.nih.gov::genomes/all/GCF/000/306/695/*/*assembly_report.txt $@
21 | 	rsync -L -v --no-motd ftp.ncbi.nlm.nih.gov::genomes/all/GCF/009/914/755/*/*assembly_report.txt $@
22 | 
23 | json:
24 | 	for f in pull/*.txt; do assembly-to-json -p. "$$f"; done
25 | 	gzip -f *.json
26 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.2.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.2 (2019-11-06)
 3 | ###################
 4 | 
 5 | Changes since 0.5.1 (2019-07-31).
 6 | 
 7 | Special Attention
 8 | $$$$$$$$$$$$$$$$$$
 9 | 
10 | * Thanks to @trentwatt for significant documentation contributions!  See
11 | https://bioutils.readthedocs.io/en/master/ for his handiwork.
12 | 
13 | Other Changes
14 | $$$$$$$$$$$$$$
15 | 
16 | * Added changelogs for 0.5.0 and 0.5.1, which @reece forgot to include :-(
17 | * `#22 <https://github.com/biocommons/bioutils/issues/22/>`_ added function docs for all modules [`c0090ed <https://github.com/biocommons/bioutils/commit/c0090ed>`_] (trentwatt)
18 | * `#23 <https://github.com/biocommons/bioutils/issues/23/>`_: fix setup.cfg description tags (`description` → `long-description`) [`8945c04 <https://github.com/biocommons/bioutils/commit/8945c04>`_] (Reece Hart)
19 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Config for https://probot.github.io/apps/stale/
 2 | 
 3 | # Number of days of inactivity before an issue becomes stale
 4 | daysUntilStale: 60
 5 | 
 6 | # Number of days of inactivity before a stale issue is closed
 7 | daysUntilClose: 7
 8 | 
 9 | # Issues with these labels will never be considered stale
10 | exemptLabels:
11 |   - pinned
12 |   - security
13 | 
14 | # Label to use when marking an issue as stale
15 | staleLabel: wontfix
16 | 
17 | # Comment to post when marking an issue as stale. Set to `false` to disable
18 | markComment: >-
19 |   This issue has not had recent activity and is now marked as stale.
20 |   It will be closed if no further activity occurs.  Please comment if
21 |   you believe the issue is still relevant.
22 | 
23 | # Comment to post when closing a stale issue. Set to `false` to disable
24 | closeComment: false
25 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.1.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.4.1 (2019-02-21)
 3 | ###################
 4 | 
 5 | Changes since 0.4.0 (2018-11-11).
 6 | 
 7 | Other Changes
 8 | $$$$$$$$$$$$$$
 9 | 
10 | * expose underlying exception on http failure [`9e56110 <https://github.com/biocommons/bioutils/commit/9e56110>`_]
11 | 
12 | Internal and Developer Changes
13 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
14 | 
15 | * updated badges [`8f91ed1 <https://github.com/biocommons/bioutils/commit/8f91ed1>`_]
16 | * added LICENSE [`b3d6d64 <https://github.com/biocommons/bioutils/commit/b3d6d64>`_]
17 | * added missing contributors definition [`97f78b3 <https://github.com/biocommons/bioutils/commit/97f78b3>`_]
18 | * updated badge list [`de2bf15 <https://github.com/biocommons/bioutils/commit/de2bf15>`_]
19 | * sync'd project files with eutils [`3102695 <https://github.com/biocommons/bioutils/commit/3102695>`_]
20 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: FORCE
 2 | .DELETE_ON_ERROR:
 3 | 
 4 | # N.B. this scripts requires tools that are not publicly available
 5 | # yet. Eventually, clogger will be released.
 6 | # Sorry.
 7 | PATH:=/home/reece/projects/reece/clogger/bin:${PATH}
 8 | SHELL:=/bin/bash -o pipefail
 9 | 
10 | default:
11 | 	@echo "no $@ target"; exit 1
12 | 
13 | next.clog::
14 | 	biocommons-changelog . >$@
15 | 
16 | # TODO: use git-mapfile to map commits
17 | %.rst: %.clog
18 | 	clogger-fmt \
19 | 		-I '`#{issue_id} <https://github.com/biocommons/bioutils/issues/{issue_id}/>`_' \
20 | 		-C '`{cset} <https://github.com/biocommons/bioutils/commit/{cset}>`_' \
21 | 		<$< >$@.tmp
22 | 	mv $@.tmp $@
23 | 
24 | 
25 | 
26 | .PHONY: clean cleaner cleanest
27 | 
28 | clean:
29 | 	/bin/rm -f *~
30 | 
31 | cleaner: clean
32 | 	#/bin/rm -f *.rst
33 | 
34 | cleanest: cleaner
35 | 	/bin/rm -f *.clog
36 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: FORCE
 2 | .DELETE_ON_ERROR:
 3 | 
 4 | # N.B. this scripts requires tools that are not publicly available
 5 | # yet. Eventually, clogger will be released.
 6 | # Sorry.
 7 | PATH:=/home/reece/projects/reece/clogger/bin:${PATH}
 8 | SHELL:=/bin/bash -o pipefail
 9 | 
10 | default:
11 | 	@echo "no $@ target"; exit 1
12 | 
13 | next.clog::
14 | 	biocommons-changelog . >$@
15 | 
16 | # TODO: use git-mapfile to map commits
17 | %.rst: %.clog
18 | 	clogger-fmt \
19 | 		-I '`#{issue_id} <https://github.com/biocommons/bioutils/issues/{issue_id}/>`_' \
20 | 		-C '`{cset} <https://github.com/biocommons/bioutils/commit/{cset}>`_' \
21 | 		<$< >$@.tmp
22 | 	mv $@.tmp $@
23 | 
24 | 
25 | 
26 | .PHONY: clean cleaner cleanest
27 | 
28 | clean:
29 | 	/bin/rm -f *~
30 | 
31 | cleaner: clean
32 | 	#/bin/rm -f *.rst
33 | 
34 | cleanest: cleaner
35 | 	/bin/rm -f *.clog
36 | 


--------------------------------------------------------------------------------
/.github/actions/setup-python-env/action.yml:
--------------------------------------------------------------------------------
 1 | name: "Setup Python Environment"
 2 | description: "Set up Python environment for the given Python version"
 3 | 
 4 | inputs:
 5 |   python-version:
 6 |     description: "Python version to use"
 7 |     required: true
 8 |     default: "3.13"
 9 |   uv-version:
10 |     description: "uv version to use"
11 |     required: true
12 |     default: "0.7.14"
13 | 
14 | runs:
15 |   using: "composite"
16 |   steps:
17 |     - uses: actions/setup-python@v5
18 |       with:
19 |         python-version: ${{ inputs.python-version }}
20 | 
21 |     - name: Install uv
22 |       uses: astral-sh/setup-uv@v6
23 |       with:
24 |         version: ${{ inputs.uv-version }}
25 |         enable-cache: 'true'
26 |         cache-suffix: ${{ matrix.python-version }}
27 | 
28 |     - name: Install Python dependencies
29 |       run: uv sync --frozen
30 |       shell: bash
31 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.3.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.3 (2021-04-14)
 3 | ###################
 4 | 
 5 | Changes since 0.5.2 (2019-11-06).
 6 | 
 7 | New Features
 8 | $$$$$$$$$$$$$
 9 | 
10 | * `#29 <https://github.com/biocommons/bioutils/issues/29/>`_: Support ambiguity codes in translation [`669a653 <https://github.com/biocommons/bioutils/commit/669a653>`_] (kayleeyuhas)
11 | * added bin/fasta-ga4gh-identifier [`63d1078 <https://github.com/biocommons/bioutils/commit/63d1078>`_] (Reece Hart)
12 | 
13 | Internal and Developer Changes
14 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
15 | 
16 | * updated Makefile for Python 3.8 [`29eecf5 <https://github.com/biocommons/bioutils/commit/29eecf5>`_] (Reece Hart)
17 | * fix failing test and reformat [`7cc5ebb <https://github.com/biocommons/bioutils/commit/7cc5ebb>`_] (kayleeyuhas)
18 | * improve variable names and use string instead of list [`5d7484b <https://github.com/biocommons/bioutils/commit/5d7484b>`_] (kayleeyuhas)
19 | 


--------------------------------------------------------------------------------
/sbin/makefile-extract-documentation:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """extract doc from a makefile"""
 3 | 
 4 | # ############################################################################
 5 | # #= BASIC USAGE
 6 | #
 7 | # .PHONY: help
 8 | # help: ## Display help message
 9 | 
10 | import fileinput
11 | import re
12 | 
13 | BOLD = "\033[1m"
14 | COMMAND_COLOR = "\033[36m"
15 | HEADER_COLOR = "\033[32m"
16 | RESET = "\033[0m"
17 | SECTION_COLOR = "\033[93m"
18 | UNDERLINE = "\033[4m"
19 | 
20 | print(f"""🌟🌟 {BOLD}{HEADER_COLOR}{UNDERLINE}biocommons conventional make targets{RESET} 🌟🌟
21 | 
22 | Using these targets promots consistency between local development and ci/cd commands.
23 | 
24 | usage: make [target ...]""")
25 | 
26 | for line in fileinput.input():  # noqa: SIM115
27 |     if m := re.match(r"#= (.+)", line):
28 |         print(f"\n{BOLD}{UNDERLINE}{SECTION_COLOR}{m.group(1)}{RESET}")
29 |     elif m := re.match(r"([-\s\w]+):.+?##\s+(.+)", line):
30 |         print(f"{BOLD}{COMMAND_COLOR}{m.group(1):<20}{RESET}{m.group(2)}")
31 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.8.clog:
--------------------------------------------------------------------------------
 1 | clog format: 1; -*-outline-*-
 2 | * HEAD (2023-07-09)
 3 | Changes since 0.5.7 (2022-07-08).
 4 | ** New Features
 5 | *** update assemblies and add T2T-CHM13v2.0 [19ebeff] (Reece Hart)
 6 | ** Other Changes
 7 | *** #47 Rewrite trim_left and trim_right for linear performance. [7817f6a] (Kyle Ferriter)
 8 | *** Handle case when no alleles are passed to trim functions [eb8607a] (Kyle Ferriter)
 9 | *** Merge pull request #48 from theferrit32/47-large-seq-normalization [5b3a80c] (Reece Hart)
10 | *** Merge pull request #49 from theferrit32/46-test-vcr-compilation [271ad81] (Reece Hart)
11 | ** Internal and Developer Changes
12 | *** Pin urllib3 version that vcrpy dependency depends on [9ddda27] (Kyle Ferriter)
13 | *** pin urllib3 to 1.26.* (rather than to specific patch version) [b60b890] (Reece Hart)
14 | *** add CODEOWNERS file [eef120d] (Reece Hart)
15 | *** updated CONTRIBUTORS.txt [8741a5f] (Reece Hart)
16 | *** reformatted with black and isort [3305c0e] (Reece Hart)
17 | *** Close #50: synchronize biocommonsexample with bioutils [5a1788c] (Reece Hart)
18 | 


--------------------------------------------------------------------------------
/docs/changelog/0.4/0.4.0.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.4.0 (2018-10-22)
 3 | ###################
 4 | 
 5 | Changes since 0.3.3 (2017-09-03).
 6 | 
 7 | Important Notice
 8 | $$$$$$$$$$$$$$$$$
 9 | 
10 | Support for Python <3.6 will be dropped on 2019-03-31. See
11 | https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6
12 | 
13 | New Features
14 | $$$$$$$$$$$$$
15 | 
16 | * Closes `#10 <https://github.com/biocommons/bioutils/issues/10/>`_: Support NCBI API keys (and NCBI_API_KEY env variable) [`8739c98 <https://github.com/biocommons/bioutils/commit/8739c98>`_] (@timothyjlaurent)
17 | * Closes `#12 <https://github.com/biocommons/bioutils/issues/12/>`_: add infer_namespaces and infer_namespace functions [`2a53c7f <https://github.com/biocommons/bioutils/commit/2a53c7f>`_]
18 | * Dropped biopython dependency [`0382b86 <https://github.com/biocommons/bioutils/commit/0382b86>`_] (@afrubin)
19 | * Added bioutils.sequences.py:elide_sequence() function [`018a762 <https://github.com/biocommons/bioutils/commit/018a762>`_]
20 | * Added GRCh38.p12 [`3876f36 <https://github.com/biocommons/bioutils/commit/3876f36>`_]
21 | 


--------------------------------------------------------------------------------
/.mailmap:
--------------------------------------------------------------------------------
 1 | # I pick the account with the most modified files in git shortlog -sne
 2 | # This is used by Git to consolidate the users who used multiple accounts
 3 | Andreas Prlic <andreas.prlic@invitae.com> Andreas Prlic <36012160+andreas-invitae@users.noreply.github.com>
 4 | Andreas Prlic <andreas.prlic@invitae.com> Andreas Prlic <andreas.prlic@gmail.com>
 5 | Caitlin Gong <ccaitlingo@gmail.com> Caitlin Gong <clgong@vassar.edu>
 6 | Katie Stahl <kathryn.stahl@nationwidechildrens.org> katie stahl <kathryn.stahl@nationwidechildrens.org>
 7 | Manuel Holtgrewe <manuel.holtgrewe@bih-charite.de> Manuel Holtgrewe <manuel.holtgrewe@bihealth.de>
 8 | Meng <wangm0855@gmail.com> Meng Wang <wangm0855@gmail.com>
 9 | Reece Hart <reecehart@gmail.com> Reece Hart <reece@biocmmons.org>
10 | Reece Hart <reecehart@gmail.com> Reece Hart <reece@biocommons.org>
11 | Reece Hart <reecehart@gmail.com> Reece Hart <reece@ip-192-168-0-13.us-west-2.compute.internal>
12 | Reece Hart <reecehart@gmail.com> Reece Hart <reecehart@Reeces-MacBook-Pro.local>
13 | Rudy Rico <rudy.rico@invitae.com> Rudolph Rico <rudy.rico@invitae.com>
14 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: "v6.0.0"
 4 |     hooks:
 5 |       - id: check-case-conflict
 6 |       - id: check-merge-conflict
 7 |       - id: check-json
 8 |         exclude: ^.devcontainer/.*devcontainer.json
 9 |       - id: check-shebang-scripts-are-executable
10 |       - id: check-symlinks
11 |       - id: check-toml
12 |       - id: check-yaml
13 |       - id: detect-private-key
14 |       - id: end-of-file-fixer
15 |       - id: mixed-line-ending
16 |         args: [--fix=lf]
17 |       - id: pretty-format-json
18 |         exclude: ^.devcontainer/.*devcontainer.json
19 |         args: [--autofix, --no-sort-keys]
20 |       - id: trailing-whitespace
21 | 
22 |   - repo: https://github.com/astral-sh/ruff-pre-commit
23 |     rev: "v0.12.7"
24 |     hooks:
25 |       - id: ruff-check
26 |         args: [--fix, --exit-non-zero-on-fix]
27 |       - id: ruff-format
28 | 
29 |   - repo: local
30 |     hooks:
31 |       - id: canonicalize-gitignore
32 |         name: Sort unique .gitignore
33 |         entry: sh -c 'LC_ALL=C sort -u -o .gitignore .gitignore'
34 |         language: system
35 |         files: ^\.gitignore$
36 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.6.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.6 (2022-06-09)
 3 | ###################
 4 | 
 5 | Changes since 0.5.5 (2021-05-05).
 6 | 
 7 | Bug Fixes
 8 | $$$$$$$$$$
 9 | 
10 | * fix `#36 <https://github.com/biocommons/bioutils/issues/36/>`_ by adding a new translation table ... (`#37 <https://github.com/biocommons/bioutils/issues/37/>`_) [`b5d4d0f <https://github.com/biocommons/bioutils/commit/b5d4d0f>`_] (Andreas Prlic)
11 | * Fix test warnings and a new failure from `#36 <https://github.com/biocommons/bioutils/issues/36/>`_ [`22b5556 <https://github.com/biocommons/bioutils/commit/22b5556>`_] (Reece Hart)
12 | 
13 | New Features
14 | $$$$$$$$$$$$$
15 | 
16 | * Handle Ensembl transcript versions [`b3eaf83 <https://github.com/biocommons/bioutils/commit/b3eaf83>`_] (Dave Lawrence)
17 | 
18 | Internal and Developer Changes
19 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
20 | 
21 | * Update Makefile to support newer bioutils conventions [`ed6eaf6 <https://github.com/biocommons/bioutils/commit/ed6eaf6>`_] (Reece Hart)
22 | * Adopt GitHub Actions for testing and deployment [`35c6a7f <https://github.com/biocommons/bioutils/commit/35c6a7f>`_] (Reece Hart)
23 | * Switch to Python 3.10 by default [`5895087 <https://github.com/biocommons/bioutils/commit/5895087>`_] (Reece Hart)
24 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.0.clog:
--------------------------------------------------------------------------------
 1 | clog format: 1; -*-outline-*-
 2 | * 0.5.0 (2019-07-22)
 3 | Changes since 0.4.4 (2019-05-13).
 4 | ** Special Attention
 5 | *** All biocommons packages now require Python >= 3.6. See https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6
 6 | ** New Features
 7 | *** #18: Implemented comprehensive sequence normalization (trim, left, right, expand/voca, vcf) [36785fa] (Reece Hart)
 8 | *** #20: implement hex-based digests à la refget [140a20e] (Reece Hart)
 9 | *** Add support for cytobands, incl data files from UCSC [0ba4361] (Reece Hart)
10 | *** Added accessions.py:coerce_namespace() [e31e592] (Reece Hart)
11 | ** Internal and Developer Changes
12 | *** Added pytest-optional-tests; use test alias in Makefile [ba9b993] (Reece Hart)
13 | *** Added trinuc normalization tests [cfe3a68] (Reece Hart)
14 | *** Added vcrpy to test requirements [95893f1] (Reece Hart)
15 | *** Moved source to src/; updated setup.cfg [ff45fb0] (Reece Hart)
16 | *** Removed pip install from tox in favor of deps [8c8f91a] (Reece Hart)
17 | *** Renamed doc → docs [1612e5c] (Reece Hart)
18 | *** Store assemblies as compressed json [ea79e71] (Reece Hart)
19 | *** Update tests to use new vcr cassettes on optional tests (much faster!) [2001745] (Reece Hart)
20 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: bioutils
 2 | repo_url: https://github.com/biocommons/bioutils
 3 | site_url: https://biocommons.github.io/bioutils
 4 | copyright: Maintained by <a href="https://biocommons.com">biocommons</a>.
 5 | 
 6 | plugins:
 7 |   - search
 8 |   - mkdocstrings:
 9 |       handlers:
10 |         python:
11 |           options:
12 |             show_source: false
13 |   - awesome-nav
14 | theme:
15 |   name: material
16 |   feature:
17 |     tabs: true
18 |   palette:
19 |     - media: "(prefers-color-scheme: light)"
20 |       scheme: default
21 |       primary: white
22 |       accent: deep orange
23 |       toggle:
24 |         icon: material/brightness-7
25 |         name: Switch to dark mode
26 |     - media: "(prefers-color-scheme: dark)"
27 |       scheme: slate
28 |       primary: black
29 |       accent: deep orange
30 |       toggle:
31 |         icon: material/brightness-4
32 |         name: Switch to light mode
33 |   icon:
34 |     repo: fontawesome/brands/github
35 | 
36 | extra:
37 |   social:
38 |     - icon: fontawesome/brands/github
39 |       link: https://github.com/biocommons/bioutils
40 |     - icon: fontawesome/brands/python
41 |       link: https://pypi.org/project/bioutils
42 | 
43 | markdown_extensions:
44 |   - toc:
45 |       permalink: true
46 |   - pymdownx.arithmatex:
47 |       generic: true
48 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | *$py.class
  2 | *.bak
  3 | *.cover
  4 | *.egg
  5 | *.egg-info/
  6 | *.log
  7 | *.manifest
  8 | *.mo
  9 | *.orig
 10 | *.pot
 11 | *.py.cover
 12 | *.py[cod]
 13 | *.py[codz]
 14 | *.sage.py
 15 | *.so
 16 | *.spec
 17 | *.sqlite3
 18 | *.sqlite3-journal
 19 | *~
 20 | .DS_Store
 21 | .Python
 22 | .abstra/
 23 | .cache
 24 | .coverage
 25 | .coverage.*
 26 | .cursorignore
 27 | .cursorindexingignore
 28 | .dmypy.json
 29 | .eggs/
 30 | .env
 31 | .envrc
 32 | .hypothesis/
 33 | .idea
 34 | .installed.cfg
 35 | .ipynb_checkpoints
 36 | .mypy_cache/
 37 | .nox/
 38 | .pdm-build/
 39 | .pdm-python
 40 | .pixi
 41 | .pybuilder/
 42 | .pypirc
 43 | .pyre/
 44 | .pytest_cache
 45 | .pytest_cache/
 46 | .python-version
 47 | .pytype/
 48 | .ropeproject
 49 | .ruff_cache/
 50 | .scrapy
 51 | .spyderproject
 52 | .spyproject
 53 | .tox/
 54 | .venv
 55 | .vscode
 56 | .webassets-cache
 57 | /site
 58 | ENV/
 59 | MANIFEST
 60 | __marimo__/
 61 | __pycache__/
 62 | __pypackages__/
 63 | archive
 64 | bioutils/_data/assemblies/pull
 65 | build/
 66 | celerybeat-schedule
 67 | celerybeat.pid
 68 | cover/
 69 | coverage.xml
 70 | cython_debug/
 71 | develop-eggs/
 72 | dist/
 73 | dmypy.json
 74 | doc/_build
 75 | doc/changelog/*/.tags
 76 | doc/changelog/*/.tags.mk
 77 | doc/changelog/*/hg-git-remap.pl
 78 | docs/_build/
 79 | docs/source
 80 | downloads/
 81 | eggs/
 82 | env.bak/
 83 | env/
 84 | htmlcov/
 85 | instance/
 86 | ipython_config.py
 87 | lib/
 88 | lib64/
 89 | local_settings.py
 90 | marimo/_lsp/
 91 | marimo/_static/
 92 | misc
 93 | nosetests.xml
 94 | parts/
 95 | pip-delete-this-directory.txt
 96 | pip-log.txt
 97 | profile_default/
 98 | sdist/
 99 | share/python-wheels/
100 | target/
101 | var/
102 | venv.bak/
103 | venv/
104 | wheels/
105 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.8.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | HEAD (2023-07-09)
 3 | ##################
 4 | 
 5 | Changes since 0.5.7 (2022-07-08).
 6 | 
 7 | New Features
 8 | $$$$$$$$$$$$$
 9 | 
10 | * update assemblies and add T2T-CHM13v2.0 [`19ebeff <https://github.com/biocommons/bioutils/commit/19ebeff>`_] (Reece Hart)
11 | 
12 | Other Changes
13 | $$$$$$$$$$$$$$
14 | 
15 | * `#47 <https://github.com/biocommons/bioutils/issues/47/>`_ Rewrite trim_left and trim_right for linear performance. [`7817f6a <https://github.com/biocommons/bioutils/commit/7817f6a>`_] (Kyle Ferriter)
16 | * Handle case when no alleles are passed to trim functions [`eb8607a <https://github.com/biocommons/bioutils/commit/eb8607a>`_] (Kyle Ferriter)
17 | * Merge pull request #48 from theferrit32/47-large-seq-normalization [`5b3a80c <https://github.com/biocommons/bioutils/commit/5b3a80c>`_] (Reece Hart)
18 | * Merge pull request #49 from theferrit32/46-test-vcr-compilation [`271ad81 <https://github.com/biocommons/bioutils/commit/271ad81>`_] (Reece Hart)
19 | 
20 | Internal and Developer Changes
21 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
22 | 
23 | * Pin urllib3 version that vcrpy dependency depends on [`9ddda27 <https://github.com/biocommons/bioutils/commit/9ddda27>`_] (Kyle Ferriter)
24 | * pin urllib3 to 1.26.* (rather than to specific patch version) [`b60b890 <https://github.com/biocommons/bioutils/commit/b60b890>`_] (Reece Hart)
25 | * add CODEOWNERS file [`eef120d <https://github.com/biocommons/bioutils/commit/eef120d>`_] (Reece Hart)
26 | * updated CONTRIBUTORS.txt [`8741a5f <https://github.com/biocommons/bioutils/commit/8741a5f>`_] (Reece Hart)
27 | * reformatted with black and isort [`3305c0e <https://github.com/biocommons/bioutils/commit/3305c0e>`_] (Reece Hart)
28 | * Close `#50 <https://github.com/biocommons/bioutils/issues/50/>`_: synchronize biocommonsexample with bioutils [`5a1788c <https://github.com/biocommons/bioutils/commit/5a1788c>`_] (Reece Hart)
29 | 


--------------------------------------------------------------------------------
/bin/fasta-ga4gh-identifier:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """compute and display ga4gh sequence identifiers for sequences in a fasta file
 4 | 
 5 | snafu$ ./bin/fasta-ga4gh-identifier ~/Downloads/GCA_000001405.28_GRCh38.p13_genomic.fna.gz 
 6 | ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO	CM000663.2	CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly
 7 | ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g	CM000664.2	CM000664.2 Homo sapiens chromosome 2, GRCh38 reference primary assembly
 8 | ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX	CM000665.2	CM000665.2 Homo sapiens chromosome 3, GRCh38 reference primary assembly
 9 | 
10 | snafu$ ./bin/fasta-ga4gh-identifier ~/Downloads/Homo_sapiens.GRCh38.dna.toplevel.fa.gz 
11 | ga4gh:SQ.2YnepKM7OkBoOrKmvHbGqguVfF9amCST	1	1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
12 | ga4gh:SQ.lwDyBi432Py-7xnAISyQlnlhWDEaBPv2	2	2 dna:chromosome chromosome:GRCh38:2:1:242193529:1 REF
13 | ga4gh:SQ.Eqk6_SvMMDCc6C-uEfickOUWTatLMDQZ	3	3 dna:chromosome chromosome:GRCh38:3:1:198295559:1 REF
14 | 
15 | 
16 | Sigh. Ensembl modifies GRCh38 sequences.
17 | 
18 | """
19 | 
20 | 
21 | import gzip
22 | import sys
23 | 
24 | from Bio import SeqIO
25 | from bioutils.digests import seq_seqhash
26 | 
27 | 
28 | def anyopen(path, encoding=None):
29 |     if path == "-":
30 |         # decoding is automatic in Python 3 based on locale
31 |         # https://docs.python.org/3/library/sys.html#sys.stdin
32 |         return sys.stdin
33 |     elif path.endswith(".gz"):
34 |         return gzip.open(path, mode="rt", encoding=encoding)
35 |     else:
36 |         return open(path, mode="r", encoding=encoding)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     for path in sys.argv[1:]:
41 |         with anyopen(path) as fp:
42 |             for rec in SeqIO.parse(fp, "fasta"):
43 |                 digest = seq_seqhash(str(rec.seq))
44 |                 ga4gh_ir = "ga4gh:SQ." + digest
45 |                 print(ga4gh_ir + "\t" + rec.id + "\t" + rec.description)
46 | 


--------------------------------------------------------------------------------
/tests/data/cassettes/test_fetch_seq_errors:
--------------------------------------------------------------------------------
 1 | interactions:
 2 | - request:
 3 |     body: null
 4 |     headers:
 5 |       Accept:
 6 |       - '*/*'
 7 |       Accept-Encoding:
 8 |       - gzip, deflate
 9 |       Connection:
10 |       - keep-alive
11 |       User-Agent:
12 |       - python-requests/2.32.3
13 |     method: GET
14 |     uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NM_9.9&rettype=fasta&tool=bioutils&email=biocommons-dev@googlegroups.com
15 |   response:
16 |     body:
17 |       string: '+Error%3A+CEFetchPApplication%3A%3Aproxy_stream()%3A+Error%3A+F+a+i+l+e+d++t+o++u+n+d+e+r+s+t+a+n+d++i+d+%3A++N+M+_+9+.+9+%0A%0A
18 | 
19 |         '
20 |     headers:
21 |       Access-Control-Allow-Origin:
22 |       - '*'
23 |       Access-Control-Expose-Headers:
24 |       - X-RateLimit-Limit,X-RateLimit-Remaining
25 |       Cache-Control:
26 |       - private
27 |       Connection:
28 |       - close
29 |       Content-Disposition:
30 |       - attachment; filename="sequence.fasta"
31 |       Content-Security-Policy:
32 |       - upgrade-insecure-requests
33 |       Content-Type:
34 |       - text/plain; charset=UTF-8
35 |       Date:
36 |       - Tue, 29 Oct 2024 01:38:17 GMT
37 |       NCBI-PHID:
38 |       - 939BBD6305AE024500005DC43656677F.1.1.m_5
39 |       NCBI-SID:
40 |       - 3B6A5B3951E601B0_4FADSID
41 |       Referrer-Policy:
42 |       - origin-when-cross-origin
43 |       Server:
44 |       - Finatra
45 |       Set-Cookie:
46 |       - ncbi_sid=3B6A5B3951E601B0_4FADSID; domain=.nih.gov; path=/; expires=Wed, 29
47 |         Oct 2025 01:38:18 GMT
48 |       Strict-Transport-Security:
49 |       - max-age=31536000; includeSubDomains; preload
50 |       Transfer-Encoding:
51 |       - chunked
52 |       X-RateLimit-Limit:
53 |       - '3'
54 |       X-RateLimit-Remaining:
55 |       - '1'
56 |       X-UA-Compatible:
57 |       - IE=Edge
58 |       X-XSS-Protection:
59 |       - 1; mode=block
60 |       content-encoding:
61 |       - gzip
62 |     status:
63 |       code: 400
64 |       message: Bad Request
65 | version: 1
66 | 


--------------------------------------------------------------------------------
/src/bioutils/cytobands.py:
--------------------------------------------------------------------------------
 1 | """
 2 | ``./sbin/ucsc-cytoband-to-json cytoband-hg38.txt.gz | gzip -c >bioutils/_data/cytobands/ucsc-hg38.json.gz``
 3 | """
 4 | 
 5 | import gzip
 6 | import json
 7 | from importlib import resources
 8 | from pathlib import Path
 9 | 
10 | _data_dir = Path(str(resources.files(__package__) / "_data" / "cytobands"))
11 | 
12 | 
13 | def get_cytoband_names():
14 |     """Retrieves available cytobands from the ``_data/cytobands`` directory.
15 | 
16 |     Returns:
17 |         list of str: The names of the available cytobands.
18 | 
19 |     Examples:
20 |         >>> sorted(get_cytoband_names())
21 |         ['ucsc-hg19', 'ucsc-hg38']
22 |     """
23 | 
24 |     return [n.name.replace(".json.gz", "") for n in _data_dir.glob("*.json.gz")]
25 | 
26 | 
27 | def get_cytoband_map(name):
28 |     """Retrives a cytoband by name.
29 | 
30 |     Args:
31 |         name (str): The name of the cytoband to retrieve.
32 | 
33 |     Returns:
34 |         dict: A dictionary of the cytoband data.
35 | 
36 | 
37 |     Examples:
38 |         >>> map = get_cytoband_map("ucsc-hg38")
39 |         >>> map["1"]["p32.2"]
40 |         [55600000, 58500000, 'gpos50']
41 |     """
42 | 
43 |     fn = _data_dir / f"{name}.json.gz"
44 |     return json.load(gzip.open(fn, mode="rt", encoding="utf-8"))
45 | 
46 | 
47 | def get_cytoband_maps(names=[]):
48 |     """Retrieves data from multiple cytobands.
49 | 
50 |     If cytobands are not specified, retrieves data from all available ones.
51 | 
52 |     Args:
53 |         names (list of str, optional): The names of cytobands to retrieve data for.
54 | 
55 |     Returns:
56 |         dict: A dictionary of the form ``{cytoband_name, cytoband_data}``.
57 | 
58 |     Examples:
59 |         >>> maps = get_cytoband_maps()
60 |         >>> maps["ucsc-hg38"]["1"]["p32.2"]
61 |         [55600000, 58500000, 'gpos50']
62 |         >>> maps["ucsc-hg19"]["1"]["p32.2"]
63 |         [56100000, 59000000, 'gpos50']
64 |     """
65 | 
66 |     if names == []:
67 |         names = get_cytoband_names()
68 |     return {name: get_cytoband_map(name) for name in names}
69 | 


--------------------------------------------------------------------------------
/docs/changelog/0.5/0.5.0.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | 0.5.0 (2019-07-22)
 3 | ###################
 4 | 
 5 | Changes since 0.4.4 (2019-05-13).
 6 | 
 7 | Special Attention
 8 | $$$$$$$$$$$$$$$$$$
 9 | 
10 | * All biocommons packages now require Python >= 3.6. See https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6
11 | 
12 | New Features
13 | $$$$$$$$$$$$$
14 | 
15 | * `#18 <https://github.com/biocommons/bioutils/issues/18/>`_: Implemented comprehensive sequence normalization (trim, left, right, expand/voca, vcf) [`36785fa <https://github.com/biocommons/bioutils/commit/36785fa>`_] (Reece Hart)
16 | * `#20 <https://github.com/biocommons/bioutils/issues/20/>`_: implement hex-based digests à la refget [`140a20e <https://github.com/biocommons/bioutils/commit/140a20e>`_] (Reece Hart)
17 | * Add support for cytobands, incl data files from UCSC [`0ba4361 <https://github.com/biocommons/bioutils/commit/0ba4361>`_] (Reece Hart)
18 | * Added accessions.py:coerce_namespace() [`e31e592 <https://github.com/biocommons/bioutils/commit/e31e592>`_] (Reece Hart)
19 | 
20 | Internal and Developer Changes
21 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
22 | 
23 | * Added pytest-optional-tests; use test alias in Makefile [`ba9b993 <https://github.com/biocommons/bioutils/commit/ba9b993>`_] (Reece Hart)
24 | * Added trinuc normalization tests [`cfe3a68 <https://github.com/biocommons/bioutils/commit/cfe3a68>`_] (Reece Hart)
25 | * Added vcrpy to test requirements [`95893f1 <https://github.com/biocommons/bioutils/commit/95893f1>`_] (Reece Hart)
26 | * Moved source to src/; updated setup.cfg [`ff45fb0 <https://github.com/biocommons/bioutils/commit/ff45fb0>`_] (Reece Hart)
27 | * Removed pip install from tox in favor of deps [`8c8f91a <https://github.com/biocommons/bioutils/commit/8c8f91a>`_] (Reece Hart)
28 | * Renamed doc → docs [`1612e5c <https://github.com/biocommons/bioutils/commit/1612e5c>`_] (Reece Hart)
29 | * Store assemblies as compressed json [`ea79e71 <https://github.com/biocommons/bioutils/commit/ea79e71>`_] (Reece Hart)
30 | * Update tests to use new vcr cassettes on optional tests (much faster!) [`2001745 <https://github.com/biocommons/bioutils/commit/2001745>`_] (Reece Hart)
31 | 


--------------------------------------------------------------------------------
/tests/data/cassettes/test_fetch_seq_ncbi_invalid_positions:
--------------------------------------------------------------------------------
 1 | interactions:
 2 | - request:
 3 |     body: null
 4 |     headers:
 5 |       Accept:
 6 |       - '*/*'
 7 |       Accept-Encoding:
 8 |       - gzip, deflate
 9 |       Connection:
10 |       - keep-alive
11 |       User-Agent:
12 |       - python-requests/2.32.3
13 |     method: GET
14 |     uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=NP_001230161.1&rettype=fasta&seq_start=3191&seq_stop=3190&tool=bioutils&email=biocommons-dev@googlegroups.com
15 |   response:
16 |     body:
17 |       string: '+Error%3A+CEFetchPApplication%3A%3Aproxy_stream()%3A+Error%3A+S+e+q+u+e+n+c+e++s+t+a+r+t++i+s++o+u+t+s+i+d+e++o+f++s+e+q+u+e+n+c+e++r+a+n+g+e+%3A++f+r+o+m++%3D++3+1+9+1+,++l+e+n+g+t+h++%3D++6+0+0+%0A%0A
18 | 
19 |         '
20 |     headers:
21 |       Access-Control-Allow-Origin:
22 |       - '*'
23 |       Access-Control-Expose-Headers:
24 |       - X-RateLimit-Limit,X-RateLimit-Remaining
25 |       Cache-Control:
26 |       - private
27 |       Connection:
28 |       - close
29 |       Content-Disposition:
30 |       - attachment; filename="sequence.fasta"
31 |       Content-Security-Policy:
32 |       - upgrade-insecure-requests
33 |       Content-Type:
34 |       - text/plain; charset=UTF-8
35 |       Date:
36 |       - Tue, 29 Oct 2024 01:38:17 GMT
37 |       NCBI-PHID:
38 |       - 322C747DCD3729C500005ACD6CF2CE3F.1.1.m_5
39 |       NCBI-SID:
40 |       - 7EE76A74F4915BDA_0FEDSID
41 |       Referrer-Policy:
42 |       - origin-when-cross-origin
43 |       Server:
44 |       - Finatra
45 |       Set-Cookie:
46 |       - ncbi_sid=7EE76A74F4915BDA_0FEDSID; domain=.nih.gov; path=/; expires=Wed, 29
47 |         Oct 2025 01:38:17 GMT
48 |       Strict-Transport-Security:
49 |       - max-age=31536000; includeSubDomains; preload
50 |       Transfer-Encoding:
51 |       - chunked
52 |       X-RateLimit-Limit:
53 |       - '3'
54 |       X-RateLimit-Remaining:
55 |       - '2'
56 |       X-UA-Compatible:
57 |       - IE=Edge
58 |       X-XSS-Protection:
59 |       - 1; mode=block
60 |       content-encoding:
61 |       - gzip
62 |     status:
63 |       code: 400
64 |       message: Bad Request
65 | version: 1
66 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | bioutils -- bioinformatics utilities and lookup tables
 2 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 3 | 
 4 | |pypi_badge| |build_status| |cov_badge| |cc_badge| |issues_badge| |contributors| |license| |changelog|
 5 | 
 6 | 
 7 | bioutils provides some common utilities and lookup tables for bioinformatics.
 8 | 
 9 | * bioutils.accessions -- parse accessions, infer namespaces 
10 | * bioutils.assemblies -- Human assembly information (from NCBI/GRCh)
11 | * bioutils.cytobands -- map cytobands to coordinates (from UCSC cytoband tables)
12 | * bioutils.digests -- implementations of various digests
13 | * bioutils.normalize -- allele normalization (left shuffle, right shuffle, expanded, vcf)
14 |   
15 | 
16 | To use an E-Utilities API key run add it to an environment variable
17 | called `ncbi_api_key` and it will be used in the E-Utilities request.
18 | 
19 | 
20 | .. |build_status| image:: https://travis-ci.org/biocommons/bioutils.svg?branch=master
21 |   :target: https://travis-ci.org/biocommons/bioutils
22 | 
23 | .. |changelog| image:: https://img.shields.io/badge/docs-changelog-green.svg
24 |    :target: https://bioutils.readthedocs.io
25 | 
26 | .. |contributors| image:: https://img.shields.io/github/contributors/biocommons/bioutils.svg
27 |   :target: https://github.com/biocommons/bioutils
28 | 
29 | .. |docs| image:: https://img.shields.io/badge/docs-readthedocs-green.svg
30 |    :target: http://bioutils.readthedocs.io/
31 | 
32 | .. |issues_badge| image:: https://img.shields.io/github/issues/biocommons/bioutils.png
33 |   :target: https://github.com/biocommons/bioutils/issues
34 | 
35 | .. |license| image:: https://img.shields.io/github/license/biocommons/bioutils.svg
36 |   :target: https://github.com/biocommons/bioutils/blob/master/LICENSE
37 | 
38 | .. |pypi_badge| image:: https://img.shields.io/pypi/v/bioutils.svg
39 |   :target: https://pypi.org/project/bioutils/
40 | 
41 | 	   
42 | .. |cc_badge| image:: https://api.codeclimate.com/v1/badges/3a99e06ad0a842174b0a/maintainability
43 |    :target: https://codeclimate.com/github/biocommons/bioutils/maintainability
44 |    :alt: Maintainability
45 | 
46 | .. |cov_badge| image:: https://coveralls.io/repos/github/biocommons/bioutils/badge.svg?branch=master
47 |    :target: https://coveralls.io/github/biocommons/bioutils?branch=master
48 | 
49 | 


--------------------------------------------------------------------------------
/tests/test_sequences.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from bioutils.sequences import TranslationTable, translate_cds
 4 | 
 5 | 
 6 | def test_translate_examples():
 7 |     """test for standard translation table"""
 8 | 
 9 |     assert translate_cds("ATGCGA") == "MR"
10 |     assert translate_cds("AUGCGA") == "MR"
11 |     assert translate_cds(None) is None
12 |     assert translate_cds("") == ""
13 |     with pytest.raises(ValueError):
14 |         translate_cds("AUGCG")
15 | 
16 |     assert translate_cds("AUGCG", full_codons=False) == "M*"
17 |     assert translate_cds("ATGTAN") == "MX"
18 |     assert translate_cds("CCN") == "P"
19 |     assert translate_cds("TRA") == "*"
20 |     assert translate_cds("TTNTA", full_codons=False) == "X*"
21 |     assert translate_cds("CTB") == "L"
22 |     assert translate_cds("AGM") == "X"
23 |     assert translate_cds("GAS") == "X"
24 |     assert translate_cds("CUN") == "L"
25 |     with pytest.raises(ValueError):
26 |         translate_cds("AUGCGQ")
27 | 
28 | 
29 | def test_translate_selenoproteins():
30 |     """unit test for sec codon"""
31 |     assert translate_cds("AUGTGATAA") == "M**"
32 |     assert translate_cds("AUGTGATAA", translation_table=TranslationTable.standard) == "M**"
33 |     assert translate_cds("AUGTGATAA", translation_table=TranslationTable.selenocysteine) == "MU*"
34 |     assert (
35 |         translate_cds(
36 |             "AUGTGATA",
37 |             translation_table=TranslationTable.selenocysteine,
38 |             full_codons=False,
39 |         )
40 |         == "MU*"
41 |     )
42 | 
43 |     with pytest.raises(ValueError):
44 |         translate_cds("AUGTGATA", translation_table=TranslationTable.selenocysteine)
45 | 
46 | 
47 | def test_translate_vertebrate_mitochondrial():
48 |     """unit test for vertebrate mitochondrial codons"""
49 |     assert translate_cds("AUGTGATAA") == "M**"
50 |     assert translate_cds("ATATGAAGGAGA", translation_table=TranslationTable.vertebrate_mitochondrial) == "MW**"
51 |     assert (
52 |         translate_cds(
53 |             "ATAAG",
54 |             translation_table=TranslationTable.vertebrate_mitochondrial,
55 |             full_codons=False,
56 |         )
57 |         == "M*"
58 |     )
59 | 
60 |     with pytest.raises(ValueError):
61 |         translate_cds("ATAAG", translation_table=TranslationTable.vertebrate_mitochondrial)
62 | 


--------------------------------------------------------------------------------
/sbin/generate-assembly-sql:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import csv
 4 | import io
 5 | import sys
 6 | 
 7 | import bioutils.assemblies
 8 | 
 9 | 
10 | 
11 | if __name__ == "__main__":
12 |     with io.open("assy-seqs.sql", "w") as sql_fh:
13 |         sql_fh.write("""
14 | drop table assembly_sequence;
15 | drop table assembly;
16 | 
17 | create table assembly (
18 |     assy text primary key,
19 |     genbank_ac text,
20 |     refseq_ac text,
21 |     description text
22 | );
23 | 
24 | create table assembly_sequence (
25 |         assy text not null references assembly(assy),
26 |         genbank_ac text,
27 |         refseq_ac text not null,
28 |         rel text not null,
29 |         name text not null,
30 |         length bigint not null,
31 |         unit text not null,
32 |         aliases text[]
33 | );
34 | 
35 | create unique index assy_name_unique on assembly_sequence(assy, name);
36 | 
37 | 
38 | \copy assembly from assy.csv with CSV HEADER DELIMITER '   '
39 | \copy assembly_sequence from seqs.csv with CSV HEADER DELIMITER '        ' 
40 | 
41 | """)
42 | 
43 | 
44 |     assy_fh = csv.DictWriter(
45 |         io.open("assy.csv", "w"),
46 |         fieldnames="assy genbank_ac refseq_ac description".split(),
47 |         delimiter="\t")
48 |     assy_fh.writeheader()
49 | 
50 |     seqs_fh = csv.DictWriter(
51 |         io.open("seqs.csv", "w"),
52 |         fieldnames="assy genbank_ac refseq_ac rel name length unit aliases".split(),
53 |         delimiter="\t")
54 |     seqs_fh.writeheader()
55 | 
56 | 
57 |     assys = bioutils.assemblies.get_assemblies()
58 | 
59 |     for an in assys.keys():
60 |         assy = assys[an]
61 |         
62 |         assy_fh.writerow({
63 |             "assy": an,
64 |             "genbank_ac": assy["genbank_ac"],
65 |             "refseq_ac": assy["refseq_ac"],
66 |             "description": assy["description"],
67 |             })
68 | 
69 |         for seq in assy["sequences"]:
70 |             seqs_fh.writerow({
71 |                 "assy": an,
72 |                 "genbank_ac": seq["genbank_ac"],
73 |                 "refseq_ac": seq["refseq_ac"],
74 |                 "rel": seq["relationship"],
75 |                 "name": seq["name"],
76 |                 "length": seq["length"],
77 |                 "unit": seq["assembly_unit"],
78 |                 "aliases": "{" + ",".join(seq["aliases"]) + "}",
79 |                 })
80 | 


--------------------------------------------------------------------------------
/src/bioutils/coordinates.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-, flake8: noqa
 2 | """Provides utilities for interconverting between coordinate systems
 3 | especially as used by the hgvs code.  The three systems are:
 4 | 
 5 | .. parsed-literal::
 6 |                   : A : C : G : T : A : C :
 7 |   human/hgvs  h   :-3 :-2 :-1 : 1 : 2 : 3 :
 8 |   continuous  c   :-2 :-1 : 0 : 1 : 2 : 3 :
 9 |   interbase   i  -3  -2  -1   0   1   2   3
10 | 
11 | Human/hgvs coordinates are the native coordinates used by the HGVS
12 | recommendations. The coordinates are 1-based, inclusive, and refer to
13 | the nucleotides; there is no 0.
14 | 
15 | Continuous coordinates are similar to hgvs coordinates, but adds 1 to
16 | all negative values so that there is no discontinuity between -1 and 1
17 | (as there is with HGVS).
18 | 
19 | Interbase coordinates refer to the zero-width junctions between
20 | nucleotides.  The main advantage of interbase coordinates is that
21 | there are no corner cases in the specification of intervals used for
22 | insertions and deletions as there is with numbering systems that refer
23 | to nucleotides themselves.  Numerically, interbase intervals are
24 | 0-based, left-closed, and right-open.  Beacuse referring to a single
25 | interbase coordinate is not particularly meaningful, interbase
26 | coordinates are always passed as start,end pairs.
27 | 
28 | Because it's easy to confuse these coordinates in code, ``_h``, ``_c``, and ``_i``
29 | suffixes are often used to clarify variables.
30 | 
31 | For code clarity, this module provides functions that interconvert
32 | *intervals* specified in each of the coordinate systems.
33 | """
34 | 
35 | PLUS_STRAND = 1
36 | MINUS_STRAND = -1
37 | 
38 | 
39 | def strand_pm_to_int(s):
40 |     """Converts '+' and '-' to 1 and -1, respectively.
41 | 
42 |     Args:
43 |         s (string)
44 | 
45 |     Returns:
46 |         int: 1 if s == '+', -1 if s == '-', otherwise None.
47 | 
48 |     Examples:
49 |         >>> strand_pm_to_int('+')
50 |         1
51 |         >>> strand_pm_to_int('-')
52 |         -1
53 |         >>> strand_pm_to_int('arglefargle')
54 |     """
55 |     return PLUS_STRAND if s == "+" else MINUS_STRAND if s == "-" else None
56 | 
57 | 
58 | def strand_int_to_pm(i):
59 |     """Converts 1 and -1 to '+' and '-' respectively.
60 | 
61 |     Args:
62 |         i (int)
63 | 
64 |     Returns:
65 |         str: '+' if i == 1, '-' if i == -1, otherwise None.
66 | 
67 |     Examples:
68 |         >>> strand_int_to_pm(1)
69 |         '+'
70 |         >>> strand_int_to_pm(-1)
71 |         '-'
72 |         >>> strand_int_to_pm(42)
73 |     """
74 | 
75 |     return "+" if i == PLUS_STRAND else "-" if i == MINUS_STRAND else None
76 | 
77 | 
78 | strand_pm = strand_int_to_pm
79 | 
80 | ## <LICENSE>
81 | ## Copyright 2014 Bioutils Contributors (https://bitbucket.org/biocommons/bioutils)
82 | ##
83 | ## Licensed under the Apache License, Version 2.0 (the "License");
84 | ## you may not use this file except in compliance with the License.
85 | ## You may obtain a copy of the License at
86 | ##
87 | ##     http://www.apache.org/licenses/LICENSE-2.0
88 | ##
89 | ## Unless required by applicable law or agreed to in writing, software
90 | ## distributed under the License is distributed on an "AS IS" BASIS,
91 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
92 | ## See the License for the specific language governing permissions and
93 | ## limitations under the License.
94 | ## </LICENSE>
95 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # bioutils
 2 | 
 3 | [![Release](https://img.shields.io/github/v/release/biocommons/bioutils)](https://img.shields.io/github/v/release/biocommons/bioutils)
 4 | [![Build status](https://img.shields.io/github/actions/workflow/status/biocommons/bioutils/main.yml?branch=main)](https://github.com/biocommons/bioutils/actions/workflows/main.yml?query=branch%3Amain)
 5 | [![codecov](https://codecov.io/gh/biocommons/bioutils/branch/main/graph/badge.svg)](https://codecov.io/gh/biocommons/bioutils)
 6 | [![Commit activity](https://img.shields.io/github/commit-activity/m/biocommons/bioutils)](https://img.shields.io/github/commit-activity/m/biocommons/bioutils)
 7 | [![License](https://img.shields.io/github/license/biocommons/bioutils)](https://img.shields.io/github/license/biocommons/bioutils)
 8 | 
 9 | Package Description
10 | 
11 | This project is a product of the [biocommons](https://biocommons.org/) community.
12 | 
13 | - **Github repository**: <https://github.com/biocommons/bioutils/>
14 | - **Documentation** <https://biocommons.github.io/bioutils/>
15 | 
16 | ## Python Package Installation
17 | 
18 | Install from PyPI with `pip install bioutils` or `uv pip install bioutils`, then try it:
19 | 
20 | ## Developer Setup
21 | 
22 | ### Install Prerequisites
23 | 
24 | These tools are required to get started:
25 | 
26 | - [git](https://git-scm.com/): Version control system
27 | - [GNU make](https://www.gnu.org/software/make/): Current mechanism for consistent invocation of developer tools.
28 | - [uv](https://docs.astral.sh/uv/): An extremely fast Python package and project manager, written in Rust.
29 | 
30 | #### MacOS or Linux Systems
31 | 
32 | - [Install brew](https://brew.sh/)
33 | - `brew install git make uv`
34 | 
35 | #### Linux (Debian-based systems)
36 | 
37 | You may also install using distribution packages:
38 | 
39 |     sudo apt install git make
40 | 
41 | Then install uv using the [uv installation instructions](https://docs.astral.sh/uv/getting-started/installation/).
42 | 
43 | ### One-time developer setup
44 | 
45 | Create a Python virtual environment, install dependencies, install pre-commit hooks, and install an editable package:
46 | 
47 |     make devready
48 | 
49 | ### Development
50 | 
51 | **N.B.** Developers are strongly encouraged to use `make` to invoke tools to
52 | ensure consistency with the CI/CD pipelines.  Type `make` to see a list of
53 | supported targets.  A subset are listed here:
54 | 
55 |     » make
56 |     🌟🌟 biocommons conventional make targets 🌟🌟
57 | 
58 |     Using these targets promots consistency between local development and ci/cd commands.
59 | 
60 |     usage: make [target ...]
61 | 
62 |     BASIC USAGE
63 |     help                Display help message
64 | 
65 |     SETUP, INSTALLATION, PACKAGING
66 |     devready            Prepare local dev env: Create virtual env, install the pre-commit hooks
67 |     build               Build package
68 |     publish             publish package to PyPI
69 | 
70 |     FORMATTING, TESTING, AND CODE QUALITY
71 |     cqa                 Run code quality assessments
72 |     test                Test the code with pytest
73 | 
74 |     DOCUMENTATION
75 |     docs-serve          Build and serve the documentation
76 |     docs-test           Test if documentation can be built without warnings or errors
77 | 
78 |     CLEANUP
79 |     clean               Remove temporary and backup files
80 |     cleaner             Remove files and directories that are easily rebuilt
81 |     cleanest            Remove all files that can be rebuilt
82 |     distclean           Remove untracked files and other detritus
83 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for Python project
 2 | 
 3 | .DELETE_ON_ERROR:
 4 | .PHONY: FORCE
 5 | .PRECIOUS:
 6 | .SUFFIXES:
 7 | 
 8 | .DEFAULT_GOAL := help
 9 | default: help
10 | 
11 | ############################################################################
12 | #= BASIC USAGE
13 | 
14 | .PHONY: help
15 | help: ## Display help message
16 | 	@./sbin/makefile-extract-documentation ${MAKEFILE_LIST}
17 | 
18 | ############################################################################
19 | #= SETUP, INSTALLATION, PACKAGING
20 | 
21 | install: devready
22 | .PHONY: devready
23 | devready: ## Prepare local dev env: Create virtual env, install the pre-commit hooks
24 | 	$(call INFO_MESSAGE, Prepare local dev env: Create virtual env and install the pre-commit hooks)
25 | 	uv sync --dev
26 | 	uv run pre-commit install
27 | 	@echo '⚠️ You must activate the virtual env with `source .venv/bin/activate`'
28 | 
29 | .PHONY: build
30 | build: ## Build package
31 | 	$(call INFO_MESSAGE, "Building package")
32 | 	rm -fr dist
33 | 	uv build
34 | 
35 | .PHONY: publish
36 | publish: build ## publish package to PyPI
37 | 	$(call INFO_MESSAGE, "Publishing package")
38 | 	uv publish  # Requires UV_PUBLISH_TOKEN or Trusted Publishing setup
39 | 
40 | ############################################################################
41 | #= FORMATTING, TESTING, AND CODE QUALITY
42 | 
43 | .PHONY: cqa
44 | cqa: ## Run code quality assessments
45 | 	$(call INFO_MESSAGE, "Checking lock file consistency")
46 | 	uv lock --locked
47 | 	$(call INFO_MESSAGE, "Linting and reformatting files")
48 | 	uv run pre-commit run
49 | 	$(call INFO_MESSAGE, "Checking for obsolete dependencies")
50 | 	uv run deptry src
51 | 
52 | .PHONY: test
53 | test: ## Test the code with pytest
54 | 	@echo "🚀 Testing code: Running pytest"
55 | 	uv run pytest --cov=. --cov-report=xml
56 | 
57 | # to be incorporated
58 | # test-learn:
59 | #	VCR_RECORD_MODE=new_episodes pytest -x
60 | 
61 | ############################################################################
62 | #= DOCUMENTATION
63 | 
64 | .PHONY: docs-serve
65 | docs-serve: ## Build and serve the documentation
66 | 	$(call INFO_MESSAGE, "Build and serve docs for local development")
67 | 	uv run mkdocs serve
68 | 
69 | .PHONY: docs-test
70 | docs-test: ## Test if documentation can be built without warnings or errors
71 | 	$(call INFO_MESSAGE, "Testing whether docs can be build")
72 | 	uv run mkdocs build -s
73 | 
74 | ############################################################################
75 | #= CLEANUP
76 | 
77 | .PHONY: clean
78 | clean:  ## Remove temporary and backup files
79 | 	$(call INFO_MESSAGE, "Remove temporary and backup files")
80 | 	find . \( -name "*~" -o -name "*.bak" \) -exec rm -frv {} +
81 | 
82 | .PHONY: cleaner
83 | cleaner: clean  ## Remove files and directories that are easily rebuilt
84 | 	$(call INFO_MESSAGE, "Remove files and directories that are easily rebuilt")
85 | 	rm -frv .cache .DS_Store .pytest_cache .ruff_cache build coverage.xml dist docs/_build site
86 | 	find . \( -name __pycache__ -type d \) -exec rm -frv {} +
87 | 	find . \( -name "*.pyc" -o -name "*.egg-info" \) -exec rm -frv {} +
88 | 	find . \( -name "*.orig" -o -name "*.rej" \) -exec rm -frv {} +
89 | 
90 | .PHONY: cleanest
91 | cleanest: cleaner  ## Remove all files that can be rebuilt
92 | 	$(call INFO_MESSAGE, "Remove files and directories that can be rebuilt")
93 | 	rm -frv .eggs .tox .venv venv
94 | 
95 | .PHONY: distclean
96 | distclean: cleanest  ## Remove untracked files and other detritus
97 | 	@echo "❌ Remove untracked files and other detritus -- Too dangerous... do this yourself"
98 | 	# git clean -df
99 | 


--------------------------------------------------------------------------------
/src/bioutils/digest.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import binascii
  3 | 
  4 | _enc = "ascii"
  5 | 
  6 | 
  7 | class Digest(bytes):
  8 |     """Represents a sliceable binary digest, with support for encoding and
  9 |     decoding using printable characters.
 10 | 
 11 |     Supported encoding and decodings are::
 12 |         * base64
 13 |         * base64url
 14 |         * hex (aka base16)
 15 | 
 16 |     The Base64 specification
 17 |     (https://tools.ietf.org/html/rfc4648#page-7) defines base64 and a
 18 |     URL-safe variant called base64url.
 19 | 
 20 |     "Stringified" Digest objects use URL-safe base64 encodings.
 21 | 
 22 | 
 23 |     >>> import hashlib
 24 | 
 25 |     >>> b = hashlib.sha512().digest()
 26 |     >>> len(b)
 27 |     64
 28 | 
 29 | 
 30 |     >>> d = Digest(b)           # creation
 31 |     >>> str(d)                  # returns base64url
 32 |     'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg_SpIdNs6c5H0NE8XYXysP-DGNKHfuwvY7kxvUdBeoGlODJ6-SfaPg=='
 33 | 
 34 |     >>> d24 = d[:24]            # slice binary digest at first 24 bytes
 35 |     >>> str(d24)
 36 |     'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'
 37 | 
 38 |     # encoding
 39 | 
 40 |     >>> d.as_base64url()
 41 |     'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg_SpIdNs6c5H0NE8XYXysP-DGNKHfuwvY7kxvUdBeoGlODJ6-SfaPg=='
 42 |     >>> d.as_hex()
 43 |     'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e'
 44 | 
 45 |     # decoding
 46 | 
 47 |     >>> d == Digest.from_base64(d.as_base64())
 48 |     True
 49 |     >>> d == Digest.from_base64url(d.as_base64url())
 50 |     True
 51 |     >>> d == Digest.from_hex(d.as_hex())
 52 |     True
 53 |     """
 54 | 
 55 |     def __str__(self):
 56 |         """returns digest as base64url string"""
 57 |         return self.as_base64url()
 58 | 
 59 |     # TODO: Consider requiring slice start == None or 0, and len % 3 == 0
 60 |     # Slicing %3 != 0 => strings will having suffix differences
 61 |     def __getitem__(self, key):
 62 |         return Digest(bytes.__getitem__(self, key))
 63 | 
 64 |     # base64
 65 |     def as_base64(self):
 66 |         """Returns Digest as a base64-encoded string.
 67 | 
 68 |         Returns:
 69 |             str: base64 encoding of Digest.
 70 |         """
 71 |         return base64.b64encode(self).decode(_enc)
 72 | 
 73 |     @staticmethod
 74 |     def from_base64(s):
 75 |         """Returns Digest object initialized from a base64-encoded string.
 76 | 
 77 |         Args:
 78 |             s (str): A base64-encoded digest string.
 79 | 
 80 |         Returns:
 81 |             Digest: A Digest object initialized from s.
 82 |         """
 83 | 
 84 |         return Digest(base64.b64decode(s))
 85 | 
 86 |     # base64url
 87 |     def as_base64url(self):
 88 |         """Returns Digest as URL-safe, base64-encoded string.
 89 | 
 90 |         Returns:
 91 |             str: URL-safe base64 encoding of Digest.
 92 |         """
 93 |         return base64.urlsafe_b64encode(self).decode(_enc)
 94 | 
 95 |     @staticmethod
 96 |     def from_base64url(s):
 97 |         """Returns Digest object initialized from a base64url string.
 98 | 
 99 |         Args:
100 |             s (str): A base64url-encoded digest string.
101 | 
102 |         Returns:
103 |             Digest: A Digest object initialized from s.
104 |         """
105 | 
106 |         return Digest(base64.urlsafe_b64decode(s))
107 | 
108 |     # for backward compatibility with earlier versions
109 |     # ("base64url" is the official name for the encoding)
110 |     as_base64us = as_base64url
111 |     from_base64us = from_base64url
112 | 
113 |     # hex
114 |     def as_hex(self):
115 |         """Returns Digest as hex string.
116 | 
117 |         Returns:
118 |             str: A hex-encoding of Digest.
119 |         """
120 | 
121 |         return binascii.hexlify(self).decode(_enc)
122 | 
123 |     @staticmethod
124 |     def from_hex(s):
125 |         """returns Digest object initialized from hex string.
126 | 
127 |         Args:
128 |             s (str): A hex-encoded digest string.
129 | 
130 |         Returns:
131 |             Digest: A Digest object initialized from s.
132 |         """
133 | 
134 |         return Digest(binascii.unhexlify(s))
135 | 
136 | 
137 | if __name__ == "__main__":  # pragma: nocover
138 |     import hashlib
139 | 
140 |     b = hashlib.sha512().digest()
141 |     d = Digest(b)
142 |     assert isinstance(d, Digest), "d isn't a Digest"
143 |     d24 = d[:24]
144 |     assert isinstance(d24, Digest), "d24 isn't a Digest"
145 |     e = Digest.from_base64url(d.as_base64url())
146 |     e24 = Digest.from_base64url(d24.as_base64url())
147 | 


--------------------------------------------------------------------------------
/src/bioutils/vmc_digest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import hashlib
  3 | 
  4 | from .digest import Digest
  5 | 
  6 | ENC = "UTF-8"
  7 | DEFAULT_DIGEST_SIZE = 24
  8 | 
  9 | 
 10 | def vmc_digest(data, digest_size=DEFAULT_DIGEST_SIZE):
 11 |     """Returns the VMC Digest as a Digest object, which has both bytes and
 12 |     string (URL-safe, Base 64) representations.
 13 | 
 14 |     >>> d = vmc_digest("")
 15 | 
 16 |     .. # I can't figure out how to make this test work on Py 2 and 3 :-(
 17 | 
 18 |     .. >>> d                       # doctest: +SKIP
 19 |     .. b'\xcf\x83\xe15~\xef\xb8\xbd\xf1T(P\xd6m\x80'
 20 | 
 21 |     >>> str(d)
 22 |     'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'
 23 | 
 24 |     >>> len(d), len(str(d))
 25 |     (24, 32)
 26 | 
 27 |     >>> str(vmc_digest("", 24))
 28 |     'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'
 29 | 
 30 |     >>> vmc_digest("", 17)
 31 |     Traceback (most recent call last):
 32 |     ...
 33 |     ValueError: digest_size must be a multiple of 3
 34 | 
 35 |     >>> vmc_digest("", 66)
 36 |     Traceback (most recent call last):
 37 |     ...
 38 |     ValueError: digest_size must be between 0 and 63 (bytes)
 39 | 
 40 | 
 41 |     SHA-512 is 2x faster than SHA1 on modern 64-bit platforms.
 42 |     However, few appliations require 512 bits (64 bytes) of keyspace.
 43 |     That larger size translates into proportionally larger key size
 44 |     requirements, with attendant performance implications.  By
 45 |     truncating the SHA-512 digest [1], users may obtain a tunable
 46 |     level of collision avoidance.
 47 | 
 48 |     The string returned by this function is Base 64 encoded with
 49 |     URL-safe characters [2], making it suitable for use with URLs or
 50 |     filesystem paths.  Base 64 encoding results in an output string
 51 |     that is 4/3 the size of the input.  If the length of the input
 52 |     string is not divisible by 3, the output is right-padded with
 53 |     equal signs (=), which have no information content. Therefore,
 54 |     this function requires that digest_size is evenly divisible by 3.
 55 |     (The resulting vmc_digest will be 4/3*digest_size bytes.)
 56 | 
 57 |     According to [3], the probability of a collision using b bits with
 58 |     m messages (sequences) is:
 59 | 
 60 |        ``P(b, m) = m^2 / 2^(b+1)``.
 61 | 
 62 |     Note that the collision probability depends on the number of
 63 |     messages, but not their size.  Solving for the number of messages:
 64 | 
 65 |        ``m(b, P) = sqrt(P * 2^(b+1))``
 66 | 
 67 |     Solving for the number of *bits*:
 68 | 
 69 |        ``b(m, P) = log2(m^2/P) - 1``
 70 | 
 71 |     For various values of ``m`` and ``P``, the number of *bytes* required
 72 |     according to ``b(m,P)`` rounded to next multiple of 3 is:
 73 | 
 74 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 75 |     |    #m   |``P<1e-24``|``P<1e-21``|``P<1e-18``|``P<1e-15``|``P<1e-12``|``P<1e-09``|
 76 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 77 |     |``1e+06``|     15    |     12    |     12    |     9     |     9     |     9     |
 78 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 79 |     |``1e+09``|     15    |     15    |     12    |     12    |     9     |     9     |
 80 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 81 |     |``1e+12``|     15    |     15    |     15    |     12    |     12    |     9     |
 82 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 83 |     |``1e+15``|     18    |     15    |     15    |     15    |     12    |     12    |
 84 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 85 |     |``1e+18``|     18    |     18    |     15    |     15    |     15    |     12    |
 86 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 87 |     |``1e+21``|     21    |     18    |     18    |     15    |     15    |     15    |
 88 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 89 |     |``1e+24``|     21    |     21    |     18    |     18    |     15    |     15    |
 90 |     +---------+-----------+-----------+-----------+-----------+-----------+-----------+
 91 | 
 92 |     For example, given ``1e+18`` expected messages and a desired collision
 93 |     probability ``< 1e-15``, we use ``digest_size = 15`` (bytes).
 94 | 
 95 |     References:
 96 |         - [1] http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
 97 |         - [2] https://tools.ietf.org/html/rfc3548#section-4
 98 |         - [3] http://stackoverflow.com/a/4014407/342839
 99 |         - [4] http://stackoverflow.com/a/22029380/342839
100 |         - [5] http://preshing.com/20110504/hash-collision-probabilities/
101 |         - [6] https://en.wikipedia.org/wiki/Birthday_problem
102 | 
103 |     """
104 | 
105 |     # TODO: Consider relaxing %3 constraint and stripping padding
106 |     if digest_size % 3 != 0:
107 |         raise ValueError("digest_size must be a multiple of 3")
108 |     if not 0 <= digest_size <= 63:
109 |         raise ValueError("digest_size must be between 0 and 63 (bytes)")
110 | 
111 |     sha512 = Digest(hashlib.sha512(data.encode(ENC)).digest())
112 |     return sha512[:digest_size]
113 | 


--------------------------------------------------------------------------------
/tests/test_seqfetcher.py:
--------------------------------------------------------------------------------
  1 | import multiprocessing
  2 | import os
  3 | 
  4 | import pytest
  5 | import vcr
  6 | 
  7 | from bioutils.seqfetcher import _add_eutils_api_key, _fetch_seq_ncbi, fetch_seq, enst_default_seq_type
  8 | 
  9 | 
 10 | @pytest.fixture(autouse=True)
 11 | def clear_env():
 12 |     """Some tests in this module assume that the default utils access configs are
 13 |     active. If you execute tests in an environment with an existing `NCBI_API_KEY` env
 14 |     var, those tests will fail unless we first remove that variable.
 15 |     """
 16 |     if "NCBI_API_KEY" in os.environ:
 17 |         del os.environ["NCBI_API_KEY"]
 18 | 
 19 | 
 20 | @vcr.use_cassette
 21 | def test_fetch_seq():
 22 |     assert 1596 == len(fetch_seq("NP_056374.2"))
 23 | 
 24 |     assert "MESRETLSSS" == fetch_seq("NP_056374.2", 0, 10)
 25 |     assert "MESRETLSSS" == fetch_seq("NP_056374.2")[0:10]  # NOT RECOMMENDED
 26 | 
 27 |     assert "ATCACACGTGCAGGAACCCTTTTCC" == fetch_seq("NC_000001.10", 2000000, 2000025)
 28 |     assert "AAAATTAAATTAAAATAAATAAAAA" == fetch_seq("NG_032072.1", 0, 25)
 29 |     assert "TTGTGTGTTAGGGTGCTCTAAGCAA" == fetch_seq("NW_003571030.1", 0, 25)
 30 |     assert "GAATTCCTCGTTCACACAGTTTCTT" == fetch_seq("NT_113901.1", 0, 25)
 31 |     assert "NNNNNNNNNNNNNNNNNNNNNNNNN" == fetch_seq("NC_000001.10", 0, 25)
 32 |     assert "MESRETLSSSRQRGGESDFLPVSSA" == fetch_seq("NP_056374.2", 0, 25)
 33 |     assert "GATCCACCTGCCTCAGCCTCCCAGA" == fetch_seq("GL000191.1", 0, 25)
 34 |     assert "TTTATTTATTTTAGATACTTATCTC" == fetch_seq("KB663603.1", 0, 25)
 35 |     assert "CCGCTCGGGCCCCGGCTCTCGGTTA" == fetch_seq("ENST00000288602.11", 0, 25)
 36 |     assert "MAALSGGGGGGAEPGQALFNGDMEP" == fetch_seq("ENSP00000288602", 0, 25)
 37 | 
 38 | 
 39 | ENST00000617537_470_480 = {
 40 |     # In [16]: s_gen[470:480], s_cdna[470:480], s_cds[470:480]
 41 |     # Out[16]: ("TAGGTATGCA", "TAGGGTGTGT", "TGACATTTGT")
 42 |     "genomic": "TAGGTATGCA",
 43 |     "cdna": "TAGGGTGTGT",
 44 |     "cds": "TGACATTTGT",
 45 | }
 46 | 
 47 | 
 48 | @vcr.use_cassette
 49 | def test_fetch_ENST00000617537_noenv(caplog, monkeypatch):
 50 |     """ensure expected lengths for ENST00000617537 with ENST_DEFAULT_SEQ_TYPE unset"""
 51 |     monkeypatch.delenv("ENST_DEFAULT_SEQ_TYPE", raising=False)
 52 |     ac = "ENST00000617537"
 53 |     assert ENST00000617537_470_480[enst_default_seq_type] == fetch_seq(ac, start_i=470, end_i=480)
 54 |     assert "Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE" in caplog.text
 55 |     assert ENST00000617537_470_480["genomic"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="genomic")
 56 |     assert ENST00000617537_470_480["cdna"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cdna")
 57 |     assert ENST00000617537_470_480["cds"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cds")
 58 | 
 59 | 
 60 | @vcr.use_cassette
 61 | def test_fetch_ENST00000617537_env(caplog, monkeypatch):
 62 |     """ensure expected lengths for ENST00000617537 with ENST_DEFAULT_SEQ_TYPE set"""
 63 |     user_enst_default_type = "cds"  # intentionally != enst_default_seq_type to ensure use
 64 |     monkeypatch.setenv("ENST_DEFAULT_SEQ_TYPE", user_enst_default_type)
 65 |     ac = "ENST00000617537"
 66 |     assert ENST00000617537_470_480[user_enst_default_type] == fetch_seq(ac, start_i=470, end_i=480)
 67 |     assert "Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE" not in caplog.text
 68 |     assert ENST00000617537_470_480["genomic"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="genomic")
 69 |     assert ENST00000617537_470_480["cdna"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cdna")
 70 |     assert ENST00000617537_470_480["cds"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cds")
 71 | 
 72 | 
 73 | @vcr.use_cassette
 74 | def test_fetch_seq_ncbi_invalid_positions():
 75 |     with pytest.raises(RuntimeError) as excinfo:
 76 |         _fetch_seq_ncbi("NP_001230161.1", 3190, 3190)
 77 |     assert "invalid sequence or start or end position" in str(excinfo.value)
 78 | 
 79 | 
 80 | @vcr.use_cassette
 81 | def test_add_eutils_api_key():
 82 |     try:
 83 |         url = "http://test.com?boo=bar"
 84 |         assert _add_eutils_api_key(url) == url
 85 |         os.environ["NCBI_API_KEY"] = "test-api-key"
 86 |         assert _add_eutils_api_key(url) == url + "&api_key=test-api-key"
 87 |     finally:
 88 |         try:
 89 |             os.environ.pop("NCBI_API_KEY")
 90 |         except KeyError:
 91 |             pass
 92 | 
 93 | 
 94 | @vcr.use_cassette
 95 | @pytest.mark.network
 96 | def test_fetch_seq_errors():
 97 |     # Traceback (most recent call last):
 98 |     #    ...
 99 |     # RuntimeError: No sequence available for NM_9.9
100 |     with pytest.raises(RuntimeError):
101 |         fetch_seq("NM_9.9")
102 | 
103 |     # Traceback (most recent call last):
104 |     #    ...
105 |     # RuntimeError: No sequence fetcher for QQ01234
106 |     with pytest.raises(RuntimeError):
107 |         fetch_seq("QQ01234")
108 | 
109 | 
110 | def _check1(_x):
111 |     # small, fast query
112 |     assert "MESRETLSSS" == fetch_seq("NP_056374.2", 0, 10)
113 | 
114 | 
115 | # no vcr!
116 | @pytest.mark.network
117 | def test_rate_limit():
118 |     num_requests = num_threads = 5
119 |     p = multiprocessing.Pool(num_threads)
120 |     p.map(_check1, range(num_requests))
121 |     p.close()
122 |     p.join()
123 | 


--------------------------------------------------------------------------------
/sbin/assembly-to-json:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """convert assembly text file to json
  4 | 
  5 | eg$ ./sbin/assembly-to-json -p bioutils/_data/assemblies -f pull/GCF_000001405.29.assembly.txt
  6 | 
  7 | """
  8 | 
  9 | 
 10 | from argparse import ArgumentParser
 11 | import csv
 12 | import io
 13 | import json
 14 | import logging
 15 | import os
 16 | import re
 17 | import sys
 18 | 
 19 | 
 20 | class AssemblyParser(object):
 21 |     """Parse an assembly file from NCBI, like this one:
 22 |       ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001405.25.assembly.txt
 23 |     """
 24 | 
 25 |     def __init__(self, body):
 26 |         self._body = body.replace("\r","")
 27 | 
 28 |     @property
 29 |     def name(self):
 30 |         return re.search("^# Assembly name:\s+(\S+)", self._body, flags=re.MULTILINE).group(1).strip()
 31 | 
 32 |     @property
 33 |     def description(self):
 34 |         try:
 35 |             return re.search("^# Description:\s+(.+)", self._body, flags=re.MULTILINE).group(1).strip()
 36 |         except AttributeError:
 37 |             return None
 38 | 
 39 |     @property
 40 |     def taxid(self):
 41 |         return re.search("^# Taxid:\s+(\S+)", self._body, flags=re.MULTILINE).group(1).strip()
 42 | 
 43 |     @property
 44 |     def genbank_accession(self):
 45 |         try:
 46 |             return re.search("^# GenBank assembly accession:\s+(\S+)", self._body, flags=re.MULTILINE).group(1).strip()
 47 |         except AttributeError:
 48 |             return None
 49 | 
 50 |     @property
 51 |     def refseq_accession(self):
 52 |         return re.search("^# RefSeq assembly accession:\s+(\S+)", self._body, flags=re.MULTILINE).group(1).strip()
 53 | 
 54 |     @property
 55 |     def submitter(self):
 56 |         return re.search("^# Submitter:\s+(.+)", self._body, flags=re.MULTILINE).group(1).strip()
 57 | 
 58 |     @property
 59 |     def date(self):
 60 |         dt = re.search("^# Date:\s+(\S+)", self._body, flags=re.MULTILINE).group(1).strip()
 61 |         ymd = map(int,dt.split("-"))
 62 |         return "{:4d}-{:02d}-{:02d}".format(*ymd)
 63 | 
 64 |     @property
 65 |     def assembly_units(self):
 66 |         block = re.search("## Assembly-Units:\n(.+?)\n#\n", self._body, flags=re.DOTALL).group(1).replace("## ","")
 67 |         dr = csv.DictReader(io.StringIO(block), delimiter=str("\t"))
 68 |         return list(dr)
 69 | 
 70 |     @property
 71 |     def sequences(self):
 72 |         block = re.search("# (Sequence-Name.+)", self._body, flags=re.DOTALL).group(1)
 73 |         dr = csv.DictReader(io.StringIO(block), delimiter=str("\t"))
 74 |         return list(dr)
 75 | 
 76 | 
 77 | 
 78 | def parse_options(argv):
 79 |     ap = ArgumentParser(
 80 |         description = __doc__,
 81 |         )
 82 |     ap.add_argument("ASSEMBLIES", nargs="+", help="accessions (GCF or GCA) or filenames of downloaded assemblies")
 83 |     ap.add_argument("--prefix", "-p", required=True, help="directory prefix for saving files -- must exist")
 84 |     opts = ap.parse_args(argv[1:])
 85 |     return opts
 86 | 
 87 | 
 88 | def build_seq_rec(seqrec):
 89 |     aliases = []
 90 |     if seqrec["UCSC-style-name"] != "na":
 91 |         aliases = [seqrec["UCSC-style-name"]]     # "chr17_ctg5_hap1"
 92 | 
 93 |     if seqrec["GenBank-Accn"] != "na":
 94 |         genbank_ac = seqrec["GenBank-Accn"]       # "GL000258.1",
 95 |     else:
 96 |         genbank_ac = None
 97 | 
 98 |     return {
 99 |         "aliases": aliases,
100 |         "assembly_unit": seqrec["Assembly-Unit"], # "ALT_REF_LOCI_9"
101 |         "genbank_ac": genbank_ac,                 # "GL000258.1",
102 |         "length": int(seqrec["Sequence-Length"]), # "1680828"
103 |         "name": seqrec["Sequence-Name"],          # "HSCHR17_1_CTG5",
104 |         "refseq_ac": seqrec["RefSeq-Accn"],       # "NT_167251.1"
105 |         "relationship": seqrec["Relationship"],   # "=",
106 |         "sequence_role": seqrec["Sequence-Role"],  # "alt-scaffold"
107 |         #"": seqrec["Assigned-Molecule"], # "17"
108 |         #"": seqrec["Assigned-Molecule-Location/Type"], # "Chromosome",
109 |     }
110 | 
111 | 
112 | def process1(opts, assy_id_or_name):
113 |     content = open(assy_id_or_name,"r").read()
114 | 
115 |     assy = AssemblyParser(content)
116 | 
117 |     obj = {
118 |         "name": assy.name,
119 |         "description": assy.description,
120 |         "date": assy.date,
121 |         "submitter": assy.submitter,
122 |         "genbank_ac": assy.genbank_accession,
123 |         "refseq_ac": assy.refseq_accession,
124 |         "sequences": [build_seq_rec(sr) for sr in assy.sequences],
125 |         }
126 | 
127 |     out_fn = os.path.join(opts.prefix, assy.name + ".json")
128 | 
129 |     with open(out_fn,"w") as out_fd:
130 |         json.dump(fp=out_fd, sort_keys=True, indent=2, obj=obj)
131 |     return out_fn
132 | 
133 | 
134 | if __name__ == "__main__":
135 |     logging.basicConfig(level=logging.INFO)
136 |     logger = logging.getLogger()
137 | 
138 |     opts = parse_options(sys.argv)
139 | 
140 |     for assy_id_or_name in opts.ASSEMBLIES:
141 |         try:
142 |             out_fn = process1(opts, assy_id_or_name)
143 |             if out_fn is not None:
144 |                 logger.info("wrote {}".format(out_fn))
145 |         except Exception as e:
146 |             logger.error("oopsie on " + assy_id_or_name)
147 |             logger.exception(e)
148 | 
149 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | build-backend = "setuptools.build_meta"
  3 | requires = ["setuptools >= 70.1", "setuptools_scm[toml] ~= 8.1"]
  4 | 
  5 | [dependency-groups]
  6 | dev = [
  7 |   "deptry>=0.23",
  8 |   "mkdocs-awesome-nav>=3.1.2",
  9 |   "mkdocs-material>=9.6",
 10 |   "mkdocs>=1.6",
 11 |   "mkdocstrings[python]>=0.30",
 12 |   "mypy>=1.17",
 13 |   "pre-commit>=3.8",
 14 |   "pytest-cov>=4.1",
 15 |   "pytest>=7.4",
 16 |   "ruff>=0.12",
 17 |   "tox-uv>=1.28",
 18 |   "vcrpy"
 19 | ]
 20 | 
 21 | [project]
 22 | authors = [
 23 |   {email = "biocommons-dev@googlegroups.com", name = "biocommons contributors"}
 24 | ]
 25 | classifiers = [
 26 |   "Intended Audience :: Developers",
 27 |   "Programming Language :: Python :: 3",
 28 |   "Programming Language :: Python :: 3.11",
 29 |   "Programming Language :: Python :: 3.12",
 30 |   "Programming Language :: Python :: 3.13",
 31 |   "Programming Language :: Python",
 32 |   "Topic :: Software Development :: Libraries :: Python Modules"
 33 | ]
 34 | dependencies = [
 35 |   "attrs",
 36 |   "requests"
 37 | ]
 38 | description = "miscellaneous simple bioinformatics utilities and lookup tables"
 39 | dynamic = ["version"]
 40 | keywords = [
 41 |   "HGVS",
 42 |   "biocommons",
 43 |   "bioinformatics",
 44 |   "genomics",
 45 |   "variation"
 46 | ]
 47 | license = "Apache-2.0"
 48 | name = "bioutils"
 49 | readme = "README.md"
 50 | requires-python = ">=3.11"
 51 | 
 52 | [project.urls]
 53 | Documentation = "https://biocommons.github.io/bioutils/"
 54 | Homepage = "https://github.com/biocommons/bioutils"
 55 | Issues = "https://github.com/biocommons/bioutils/issues"
 56 | Repository = "https://github.com/biocommons/bioutils"
 57 | 
 58 | [tool.coverage.report]
 59 | exclude_lines = [
 60 |     # Have to re-enable the standard pragma
 61 |     "pragma: no cover",
 62 | 
 63 |     # Don't complain about missing debug-only code:
 64 |     "def __repr__",
 65 |     "if self.debug",
 66 | 
 67 |     # Don't complain if tests don't hit defensive assertion code:
 68 |     "raise AssertionError",
 69 |     "raise NotImplementedError",
 70 | 
 71 |     # Don't complain if non-runnable code isn't run:
 72 |     "if __name__ == .__main__.:",
 73 | ]
 74 | show_missing = true
 75 | skip_empty = true
 76 | 
 77 | [tool.coverage.run]
 78 | branch = true
 79 | omit = ["*/test/*", "*/tests/*", "*_test.py"]
 80 | source = ["src"]
 81 | 
 82 | [tool.deptry]
 83 | 
 84 | [tool.deptry.package_module_name_map]
 85 | # map package name to import name
 86 | # Making this explicit suppresses deptry notices
 87 | coloredlogs = "coloredlogs"
 88 | mkdocs = "mkdocs"
 89 | mkdocs-material = "mkdocs_material"
 90 | mkdocstrings = "mkdocstrings"
 91 | mypy = "mypy"
 92 | pre-commit = "pre_commit"
 93 | pytest = "pytest"
 94 | pytest-cov = "pytest_cov"
 95 | pyyaml = "yaml"
 96 | ruff = "ruff"
 97 | tox-uv = "tox_uv"
 98 | ty = "ty"
 99 | 
100 | [tool.pytest.ini_options]
101 | addopts = "-s -v -x --strict-markers -m 'not extra' --doctest-modules --cov=src"
102 | doctest_optionflags = [
103 |   "ALLOW_BYTES",
104 |   "ALLOW_UNICODE",
105 |   "ELLIPSIS",
106 |   "IGNORE_EXCEPTION_DETAIL",
107 |   "NORMALIZE_WHITESPACE"
108 | ]
109 | markers = [
110 |   "network: tests that require network connectivity",
111 |   "slow: slow tests that should be run infrequently"
112 | ]
113 | testpaths = ["tests"]
114 | 
115 | [tool.ruff]
116 | fix = true
117 | line-length = 100
118 | src = ["src", "tests"]
119 | target-version = "py39"
120 | 
121 | [tool.ruff.format]
122 | docstring-code-format = true
123 | preview = true
124 | quote-style = "double"
125 | 
126 | [tool.ruff.lint]
127 | fixable = [
128 |   "B",
129 |   "C4",
130 |   "D",
131 |   "EM",
132 |   "F401",
133 |   "F541",
134 |   "I",
135 |   "PERF",
136 |   "PIE",
137 |   "PT",
138 |   "RET",
139 |   "RSE",
140 |   "RUF",
141 |   "SIM",
142 |   "UP"
143 | ]
144 | ignore = [
145 |   "E111",
146 |   "E114",
147 |   "E117",
148 |   "E501",
149 |   "E731",
150 |   "PLR0913",
151 |   "S321",
152 |   "W191"
153 | ]
154 | select = [
155 |   "A", # https://docs.astral.sh/ruff/rules/#flake8-builtins-a
156 |   "ARG", # https://docs.astral.sh/ruff/rules/#flake8-unused-arguments-arg
157 |   "B", # https://docs.astral.sh/ruff/rules/#flake8-bugbear-b
158 |   "C4", # https://docs.astral.sh/ruff/rules/#flake8-comprehensions-c4
159 |   "DTZ", # https://docs.astral.sh/ruff/rules/#flake8-datetimez-dtz
160 |   "E",
161 |   "EM", # https://docs.astral.sh/ruff/rules/#flake8-errmsg-em
162 |   "F", # https://docs.astral.sh/ruff/rules/#pyflakes-f
163 |   "G", # https://docs.astral.sh/ruff/rules/#flake8-logging-format-g
164 |   "I", # https://docs.astral.sh/ruff/rules/#isort-i
165 |   "LOG", # https://docs.astral.sh/ruff/rules/#flake8-logging-log
166 |   "N", # https://docs.astral.sh/ruff/rules/#pep8-naming-n
167 |   "PERF", # https://docs.astral.sh/ruff/rules/#perflint-perf
168 |   "PIE", # https://docs.astral.sh/ruff/rules/#flake8-pie-pie
169 |   "PL", # https://docs.astral.sh/ruff/rules/#pylint-pl
170 |   "PT", # https://docs.astral.sh/ruff/rules/#flake8-pytest-style-pt
171 |   "PTH", # https://docs.astral.sh/ruff/rules/#flake8-use-pathlib-pth
172 |   "RET", # https://docs.astral.sh/ruff/rules/#flake8-return-ret
173 |   "RSE", # https://docs.astral.sh/ruff/rules/#flake8-raise-rse
174 |   "RUF", # https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf
175 |   "S", # https://docs.astral.sh/ruff/rules/#flake8-bandit-s
176 |   "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim
177 |   "TRY", # https://docs.astral.sh/ruff/rules/#tryceratops-try
178 |   "UP", # https://docs.astral.sh/ruff/rules/#pyupgrade-up
179 |   "W", # https://docs.astral.sh/ruff/rules/#pycodestyle-e-w
180 |   "YTT" # https://docs.astral.sh/ruff/rules/#flake8-2020-ytt
181 | ]
182 | 
183 | [tool.ruff.lint.per-file-ignores]
184 | "tests/*" = ["S101"]
185 | 
186 | [tool.setuptools]
187 | include-package-data = true
188 | 
189 | [tool.setuptools.package-data]
190 | "*" = ["_data/*/*.json.gz"]
191 | 
192 | [tool.setuptools.packages.find]
193 | exclude = ["*.pyc", "__pycache__"]
194 | # namespaces = true
195 | where = ["src"]
196 | 
197 | [tool.setuptools_scm]
198 | 


--------------------------------------------------------------------------------
/tests/test_normalize.py:
--------------------------------------------------------------------------------
  1 | from functools import partial
  2 | 
  3 | import pytest
  4 | 
  5 | from bioutils.normalize import NormalizationMode, normalize
  6 | 
  7 | sequence = "CCCCCCCCACACACACACTAGCAGCAGCA"
  8 | 
  9 | normalize_seq = partial(normalize, sequence=sequence)
 10 | normalize_trim = partial(normalize_seq, mode=NormalizationMode.TRIMONLY)
 11 | normalize_trim_no_shuffle = partial(normalize_seq, mode=None, trim=True)
 12 | normalize_no_trim_no_shuffle = partial(normalize_seq, mode=None, trim=False)
 13 | normalize_left = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE)
 14 | normalize_right = partial(normalize_seq, mode=NormalizationMode.RIGHTSHUFFLE)
 15 | normalize_expand = partial(normalize_seq, mode=NormalizationMode.EXPAND)
 16 | normalize_vcf = partial(normalize_seq, mode=NormalizationMode.VCF)
 17 | normalize_left_no_trim = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE, trim=False)
 18 | normalize_right_no_trim = partial(normalize_seq, mode=NormalizationMode.RIGHTSHUFFLE, trim=False)
 19 | normalize_expand_no_trim = partial(normalize_seq, mode=NormalizationMode.EXPAND, trim=False)
 20 | normalize_vcf_no_trim = partial(normalize_seq, mode=NormalizationMode.VCF, trim=False)
 21 | 
 22 | 
 23 | @pytest.mark.parametrize("normalize_fn", [normalize_trim, normalize_trim_no_shuffle])
 24 | def test_trim(normalize_fn):
 25 |     """Should trim common prefix and suffix when trim=True."""
 26 |     assert ((25, 25), ("", "AC")) == normalize_fn(interval=(22, 25), alleles=(None, "AGCAC"))
 27 |     assert ((24, 25), ("C", "", "CAC")) == normalize_fn(interval=(22, 25), alleles=(None, "AG", "AGCAC"))
 28 |     assert ((23, 24), ("G", "", "GCA")) == normalize_fn(interval=(22, 25), alleles=(None, "AC", "AGCAC"))
 29 |     assert ((22, 24), ("AG", "G", "AGCA")) == normalize_fn(interval=(22, 25), alleles=(None, "GC", "AGCAC"))
 30 | 
 31 | 
 32 | @pytest.mark.parametrize("normalize_fn", [normalize_trim, normalize_trim_no_shuffle])
 33 | def test_anchor(normalize_fn):
 34 |     assert ((23, 25), ("GC", "")) == normalize_fn(interval=(22, 25), alleles=(None, "A"), anchor_length=0)
 35 |     assert ((22, 26), ("AGCA", "AA")) == normalize_fn(interval=(22, 25), alleles=(None, "A"), anchor_length=1)
 36 |     assert ((21, 27), ("CAGCAG", "CAAG")) == normalize_fn(interval=(22, 25), alleles=(None, "A"), anchor_length=2)
 37 | 
 38 |     # off the left
 39 |     assert ((1, 1), ("", "C")) == normalize_fn(interval=(1, 1), alleles=(None, "C"), anchor_length=0)
 40 |     assert ((0, 2), ("CC", "CCC")) == normalize_fn(interval=(1, 1), alleles=(None, "C"), anchor_length=1)
 41 |     assert ((0, 3), ("CCC", "CCCC")) == normalize_fn(interval=(1, 1), alleles=(None, "C"), anchor_length=2)
 42 | 
 43 |     # off the right
 44 |     assert ((28, 28), ("", "C")) == normalize_fn(interval=(28, 28), alleles=(None, "C"), anchor_length=0)
 45 |     assert ((27, 29), ("CA", "CCA")) == normalize_fn(interval=(28, 28), alleles=(None, "C"), anchor_length=1)
 46 |     assert ((26, 29), ("GCA", "GCCA")) == normalize_fn(interval=(28, 28), alleles=(None, "C"), anchor_length=2)
 47 | 
 48 | 
 49 | def test_trinuc():
 50 |     """LEFTSHUFFLE, RIGHTSHUFFLE, EXPAND normalization for trinucleotide"""
 51 |     # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9
 52 |     #  C C C C C C C C A C A C A C A C A C T A G C A G C A G C A T
 53 |     #                                             ^ [22,22): ['', 'AGC']   | Starting alleles
 54 |     #      LEFTSHUFFLE                      ^ [19,19): ['', 'AGC']
 55 |     #      RIGHTSHUFFLE                                         ^ [29,29): ['', 'GCA']
 56 |     #      EXPAND                           ^-------------------^ [19,29): ['AGCAGCAGCA', 'AGCAGCAGCAGCA']
 57 |     assert ((19, 19), ("", "AGC")) == normalize_left(interval=(22, 22), alleles=(None, "AGC"))
 58 |     assert ((29, 29), ("", "GCA")) == normalize_right(interval=(22, 22), alleles=(None, "AGC"))
 59 |     assert ((19, 29), ("AGCAGCAGCA", "AGCAGCAGCAGCA")) == normalize_expand(interval=(22, 22), alleles=(None, "AGC"))
 60 | 
 61 | 
 62 | def test_bounds():
 63 |     """ensure that bounds are honored"""
 64 |     assert ((20, 24), ("GCAG", "GCAGCAG")) == normalize_expand(
 65 |         interval=(22, 22), alleles=(None, "AGC"), bounds=(20, 24)
 66 |     )
 67 | 
 68 | 
 69 | def test_no_trim_no_shuffle():
 70 |     """Should not trim or shuffle when mode=None, trim=False."""
 71 |     assert ((22, 25), ("AGC", "AGC")) == normalize_no_trim_no_shuffle(interval=(22, 25), alleles=(None, "AGC"))
 72 |     assert ((22, 25), ("AGC", "AGCT")) == normalize_no_trim_no_shuffle(interval=(22, 25), alleles=(None, "AGCT"))
 73 | 
 74 | 
 75 | def test_shuffle_no_trim():
 76 |     """Should shuffle but not trim when mode!=None and trim=False."""
 77 |     assert ((19, 22), ("AGC", "AGC")) == normalize_left_no_trim(interval=(22, 25), alleles=(None, "AGC"))
 78 |     assert ((26, 29), ("GCA", "GCA")) == normalize_right_no_trim(interval=(22, 25), alleles=(None, "AGC"))
 79 |     assert ((19, 29), ("AGCAGCAGCA", "AGCAGCAGCA")) == normalize_expand_no_trim(
 80 |         interval=(22, 25), alleles=(None, "AGC")
 81 |     )
 82 | 
 83 | 
 84 | # TODO: def test_multiallele():
 85 | 
 86 | 
 87 | def test_mode_string():
 88 |     "test that mode as string is accepted"
 89 |     _normalize = partial(normalize_seq, interval=(28, 28), alleles=(None, "C"))
 90 |     vcf_out = ((26, 27), ("G", "GC"))
 91 |     assert vcf_out != _normalize(), "not VCF output by default"
 92 |     assert vcf_out == _normalize(mode="VCF"), "mode as string recognized"
 93 | 
 94 | 
 95 | def test_input_alleles_not_modified():
 96 |     """ensure that alleles list is not modified"""
 97 |     alleles = (None, "AGCAC")
 98 |     normalize_trim(interval=(22, 25), alleles=alleles)
 99 |     assert (None, "AGCAC") == alleles
100 | 
101 | 
102 | @pytest.mark.parametrize("normalize_fn", [normalize_trim, normalize_trim_no_shuffle])
103 | def test_error_distinct(normalize_fn):
104 |     """Must have at least two distinct allele sequences (incl. ref) to normalize"""
105 |     with pytest.raises(ValueError):
106 |         normalize_fn(interval=(22, 25), alleles=(None, "AGC"))
107 | 
108 | 
109 | def test_error_ref_allele():
110 |     "First allele is ref allele and must be None"
111 |     with pytest.raises(ValueError):
112 |         normalize_trim(interval=(22, 25), alleles=("foo", "AGC"))
113 | 
114 | 
115 | def test_error_vcf_mode_no_trim():
116 |     """Should raise error when mode=VCF, trim=False."""
117 |     with pytest.raises(ValueError) as exc_info:
118 |         normalize_vcf_no_trim(interval=(22, 25), alleles=(None, "AGC"))
119 |     assert str(exc_info.value) == "May not disable trimming with VCF normalization mode"
120 | 


--------------------------------------------------------------------------------
/src/bioutils/assemblies.py:
--------------------------------------------------------------------------------
  1 | """Creates dictionaries of genome assembly data as provided by
  2 | 
  3 | ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/*.assembly.txt
  4 | 
  5 | Assemblies are stored in json files with the package in
  6 | ``_data/assemblies/``. Those files are built with sbin/assembly-to-json,
  7 | also in this package.
  8 | 
  9 | Definitions:
 10 | 
 11 | * accession ``ac``: symbol used to refer to a sequence (e.g., NC_000001.10)
 12 | * name: human-label (e.g., '1', 'MT', 'HSCHR6_MHC_APD_CTG1') that
 13 |   refers to a sequence, unique within some domain (e.g., GRCh37.p10)
 14 | * chromosome (chr): subset of names that refer to chromosomes 1..22, X, Y, MT
 15 | * aliases: list of other names; uniqueness unknown
 16 | 
 17 | .. note:: Some users prefer using a 'chr' prefix for chromosomes and
 18 |     some don't.  Some prefer upper case and others prefer lower.  This
 19 |     rift is unfortunate and creates unnecessary friction in sharing data.
 20 |     You say TO-my-to and I say TO-mah-to doesn't apply here.  This code
 21 |     favors using the authoritative names exactly as defined in the
 22 |     assembly records.  Users are encouraged to use sequence names
 23 |     verbatim, without prefixes or case changes.
 24 | """
 25 | 
 26 | import gzip
 27 | import json
 28 | from importlib import resources
 29 | 
 30 | 
 31 | def get_assembly_names():
 32 |     """Retrieves available assemblies from the ``_data/assemblies`` directory.
 33 | 
 34 |     Returns:
 35 |         list of str: The names of the available assemblies.
 36 | 
 37 |     Examples:
 38 |         >>> assy_names = get_assembly_names()
 39 | 
 40 |         >>> 'GRCh37.p13' in assy_names
 41 |         True
 42 |     """
 43 |     assemblies_path = resources.files(__package__) / "_data" / "assemblies"
 44 | 
 45 |     return [n.name.replace(".json.gz", "") for n in assemblies_path.iterdir() if n.name.endswith(".json.gz")]
 46 | 
 47 | 
 48 | def get_assembly(name):
 49 |     """Retreives the assembly data for a given assembly.
 50 | 
 51 |     Args:
 52 |         name (str): The name of the assembly to retrieve data for.
 53 | 
 54 |     Returns:
 55 |         dict: A dictionary of the assembly data. See examples for details.
 56 | 
 57 | 
 58 |     Examples:
 59 |         >>> assy = get_assembly('GRCh37.p13')
 60 | 
 61 |         >>> assy['name']
 62 |         'GRCh37.p13'
 63 | 
 64 |         >>> assy['description']
 65 |         'Genome Reference Consortium Human Build 37 patch release 13 (GRCh37.p13)'
 66 | 
 67 |         >>> assy['refseq_ac']
 68 |         'GCF_000001405.25'
 69 | 
 70 |         >>> assy['genbank_ac']
 71 |         'GCA_000001405.14'
 72 | 
 73 |         >>> len(assy['sequences'])
 74 |         297
 75 | 
 76 |         >>> import pprint
 77 |         >>> pprint.pprint(assy['sequences'][0])
 78 |         {'aliases': ['chr1'],
 79 |         'assembly_unit': 'Primary Assembly',
 80 |         'genbank_ac': 'CM000663.1',
 81 |         'length': 249250621,
 82 |         'name': '1',
 83 |         'refseq_ac': 'NC_000001.10',
 84 |         'relationship': '=',
 85 |         'sequence_role': 'assembled-molecule'}
 86 |     """
 87 |     fn = resources.files(__package__) / "_data" / "assemblies" / f"{name}.json.gz"
 88 |     if not fn.exists():
 89 |         raise FileNotFoundError
 90 | 
 91 |     return json.load(gzip.open(fn, mode="rt", encoding="utf-8"))
 92 | 
 93 | 
 94 | def get_assemblies(names=[]):
 95 |     """Retrieves data from multiple assemblies.
 96 | 
 97 |     If assemblies are not specified, retrieves data from all available ones.
 98 | 
 99 |     Args:
100 |         names (list of str, optional): The names of the assemblies to retrieve data for.
101 | 
102 |     Returns:
103 |         dict: A dictionary of the form ``{assembly_name, : assembly_data}``, where the values
104 |             are the dictionaries of assembly data as described in ``get_assembly()``.
105 | 
106 |     Examples:
107 |         >>> assemblies = get_assemblies(names=['GRCh37.p13'])
108 |         >>> assy = assemblies['GRCh37.p13']
109 | 
110 |         >>> assemblies = get_assemblies()
111 |         >>> 'GRCh38.p2' in assemblies
112 |         True
113 |     """
114 | 
115 |     if names == []:
116 |         names = get_assembly_names()
117 |     return {a["name"]: a for a in (get_assembly(n) for n in names)}
118 | 
119 | 
120 | def make_name_ac_map(assy_name, primary_only=False):
121 |     """Creates a map from sequence names to accessions for a given assembly.
122 | 
123 |     Args:
124 |         assy_name (str): The name of the assembly to make a map for.
125 |         primary_only (bool, optional): Whether to include only primary sequences.
126 |             Defaults to False.
127 | 
128 |     Returns:
129 |         dict: A dictionary of the form ``{sequence_name : accession}`` for sequences in the given assembly,
130 |             Where sequence_name and accession are both strings.
131 | 
132 |     Examples:
133 |         >>> grch38p5_name_ac_map = make_name_ac_map('GRCh38.p5')
134 |         >>> grch38p5_name_ac_map['1']
135 |         'NC_000001.11'
136 |     """
137 | 
138 |     return {
139 |         s["name"]: s["refseq_ac"] for s in get_assembly(assy_name)["sequences"] if (not primary_only or _is_primary(s))
140 |     }
141 | 
142 | 
143 | def make_ac_name_map(assy_name, primary_only=False):
144 |     """Creates a map from accessions to sequence names for a given assembly.
145 | 
146 |     Args:
147 |         assy_name (str): The name of the assembly to make a map for.
148 |         primary_only (bool, optional): Whether to include only primary sequences.
149 |             Defaults to False.
150 | 
151 |     Returns:
152 |         dict: A dictionary of the form ``{accesssion : sequence_name}`` for accessions in the given assembly,
153 |             where accession and sequence_name are strings.
154 | 
155 | 
156 |     Examples:
157 |         >>> grch38p5_ac_name_map = make_ac_name_map('GRCh38.p5')
158 |         >>> grch38p5_ac_name_map['NC_000001.11']
159 |         '1'
160 |     """
161 | 
162 |     return {
163 |         s["refseq_ac"]: s["name"] for s in get_assembly(assy_name)["sequences"] if (not primary_only or _is_primary(s))
164 |     }
165 | 
166 | 
167 | ############################################################################
168 | # Internal functions
169 | 
170 | 
171 | def _is_primary(s):
172 |     """Indicates whether a sequence is a part of the primary assembly.
173 | 
174 |     Args:
175 |         s (dict): A dictionary of sequence data, e.g. those in assembly['sequences'].
176 | 
177 |     Returns:
178 |         bool: True if the sequence is part of the primary assembly, False otherwise.
179 | 
180 | 
181 |     Examples:
182 |         >>> _is_primary({'assembly_unit': 'Primary Assembly'})
183 |         True
184 | 
185 |         >>> _is_primary({'assembly_unit': 'Something else entirely'})
186 |         False
187 |     """
188 | 
189 |     return s["assembly_unit"] == "Primary Assembly"
190 | 


--------------------------------------------------------------------------------
/src/bioutils/digests.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import base64
  4 | import hashlib
  5 | 
  6 | from .sequences import normalize_sequence
  7 | from .vmc_digest import vmc_digest
  8 | 
  9 | 
 10 | def seq_seqhash(seq, normalize=True):
 11 |     """Converts sequence to 24-byte Truncated Digest.
 12 | 
 13 |     Args:
 14 |         seq (str): A sequence.
 15 |         normalize (bool, optional): Whether to normalize the sequence before conversion,
 16 |             i.e. to ensure representation as uppercase letters without whitespace or asterisks.
 17 |             Defaults to ``True``.
 18 | 
 19 |     Returns:
 20 |         str: 24-byte Truncated Digest representation of sequence.
 21 | 
 22 |     Examples:
 23 |         >>> seq_seqhash("")
 24 |         'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'
 25 | 
 26 |         >>> seq_seqhash("ACGT")
 27 |         'aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'
 28 | 
 29 |         >>> seq_seqhash("acgt")
 30 |         'aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'
 31 | 
 32 |         >>> seq_seqhash("acgt", normalize=False)
 33 |         'eFwawHHdibaZBDcs9kW3gm31h1NNJcQe'
 34 |     """
 35 | 
 36 |     seq = normalize_sequence(seq) if normalize else seq
 37 |     return str(vmc_digest(seq, digest_size=24))
 38 | 
 39 | 
 40 | def seq_seguid(seq, normalize=True):
 41 |     """Converts sequence to seguid.
 42 | 
 43 |     This seguid is compatible with BioPython's seguid.
 44 | 
 45 |     Args:
 46 |         seq (str): A sequence.
 47 |         normalize (bool, optional): Whether to normalize the sequence before conversion,
 48 |             i.e. to ensure representation as uppercase letters without whitespace or asterisks.
 49 |             Defaults to ``True``.
 50 | 
 51 |     Returns:
 52 |         str: seguid representation of sequence.
 53 | 
 54 |     Examples:
 55 |         >>> seq_seguid('')
 56 |         '2jmj7l5rSw0yVb/vlWAYkK/YBwk'
 57 | 
 58 |         >>> seq_seguid('ACGT')
 59 |         'IQiZThf2zKn/I1KtqStlEdsHYDQ'
 60 | 
 61 |         >>> seq_seguid('acgt')
 62 |         'IQiZThf2zKn/I1KtqStlEdsHYDQ'
 63 | 
 64 |         >>> seq_seguid('acgt', normalize=False)
 65 |         'lII0AoG1/I8qKY271rgv5CFZtsU'
 66 |     """
 67 | 
 68 |     seq = normalize_sequence(seq) if normalize else seq
 69 |     bseq = seq.encode("ascii")
 70 |     return base64.b64encode(hashlib.sha1(bseq).digest()).decode("ascii").rstrip("=")
 71 | 
 72 | 
 73 | def seq_md5(seq, normalize=True):
 74 |     """Converts sequence to unicode md5 hex digest.
 75 | 
 76 |     Args:
 77 |         seq (str): A sequence.
 78 |         normalize (bool, optional): Whether to normalize the sequence before conversion,
 79 |             i.e. to ensure representation as uppercase letters without whitespace or asterisks.
 80 |             Defaults to ``True``.
 81 | 
 82 |     Returns:
 83 |         str: Unicode md5 hex digest representation of sequence.
 84 | 
 85 |     Examples:
 86 |         >>> seq_md5('')
 87 |         'd41d8cd98f00b204e9800998ecf8427e'
 88 | 
 89 |         >>> seq_md5('ACGT')
 90 |         'f1f8f4bf413b16ad135722aa4591043e'
 91 | 
 92 |         >>> seq_md5('ACGT*')
 93 |         'f1f8f4bf413b16ad135722aa4591043e'
 94 | 
 95 |         >>> seq_md5(' A C G T ')
 96 |         'f1f8f4bf413b16ad135722aa4591043e'
 97 | 
 98 |         >>> seq_md5('acgt')
 99 |         'f1f8f4bf413b16ad135722aa4591043e'
100 | 
101 |         >>> seq_md5('acgt', normalize=False)
102 |         'db516c3913e179338b162b2476d1c23f'
103 |     """
104 | 
105 |     seq = normalize_sequence(seq) if normalize else seq
106 |     bseq = seq.encode("ascii")
107 |     return hashlib.md5(bseq).hexdigest()
108 | 
109 | 
110 | def seq_sha1(seq, normalize=True):
111 |     """Converts sequence to unicode sha1 hexdigest.
112 | 
113 |     Args:
114 |         seq (str): A sequence.
115 |         normalize (bool, optional): Whether to normalize the sequence before conversion,
116 |             i.e. to ensure representation as uppercase letters without whitespace or asterisks before encoding.
117 |             Defaults to ``True``.
118 | 
119 |     Returns:
120 |         str: Unicode sha1 hexdigest representation of sequence.
121 | 
122 |     Examples:
123 |         >>> seq_sha1('')
124 |         'da39a3ee5e6b4b0d3255bfef95601890afd80709'
125 | 
126 |         >>> seq_sha1('ACGT')
127 |         '2108994e17f6cca9ff2352ada92b6511db076034'
128 | 
129 |         >>> seq_sha1('acgt')
130 |         '2108994e17f6cca9ff2352ada92b6511db076034'
131 | 
132 |         >>> seq_sha1('acgt', normalize=False)
133 |         '9482340281b5fc8f2a298dbbd6b82fe42159b6c5'
134 |     """
135 | 
136 |     seq = normalize_sequence(seq) if normalize else seq
137 |     bseq = seq.encode("ascii")
138 |     return hashlib.sha1(bseq).hexdigest()
139 | 
140 | 
141 | def seq_sha512(seq, normalize=True):
142 |     """Converts sequence to unicode sha512 hexdigest.
143 | 
144 |     Args:
145 |         seq (str): A sequence.
146 |         normalize (bool, optional): Whether to normalize the sequence before conversion,
147 |             i.e. to ensure representation as uppercase letters without whitespace or asterisks.
148 |             Defaults to ``True``.
149 | 
150 |     Returns:
151 |         str: Unicode sha512 hexdigest representation of sequence.
152 | 
153 |     Examples:
154 |         >>> seq_sha512('')
155 |         'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e'
156 | 
157 |         >>> seq_sha512('ACGT')
158 |         '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf701d26672964efbd536d197f51ce634fc70634d1eefe575bec34c83247abc52010f6e2bbdb8253'
159 | 
160 |         >>> seq_sha512('acgt')
161 |         '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf701d26672964efbd536d197f51ce634fc70634d1eefe575bec34c83247abc52010f6e2bbdb8253'
162 | 
163 |         >>> seq_sha512('acgt', normalize=False)
164 |         '785c1ac071dd89b69904372cf645b7826df587534d25c41edb2862e54fb2940d697218f2883d2bf1a11cdaee658c7f7ab945a1cfd08eb26cbce57ee88790250a'
165 |     """
166 | 
167 |     seq = normalize_sequence(seq) if normalize else seq
168 |     bseq = seq.encode("ascii")
169 |     return hashlib.sha512(bseq).hexdigest()
170 | 
171 | 
172 | def seq_vmc_id(seq, normalize=True):
173 |     """Converts sequence to VMC id.
174 | 
175 |     See https://github.com/ga4gh/vmc
176 | 
177 |     Args:
178 |         seq (str): A sequence.
179 |         normalize (bool, optional): Whether to normalize the sequence before conversion,
180 |             i.e. to ensure representation as uppercase letters without whitespace or asterisks.
181 |             Defaults to ``True``.
182 | 
183 |     Returns:
184 |         str: VMC id representation of sequence.
185 | 
186 |     Examples:
187 |         >>> seq_vmc_id("")
188 |         'VMC:GS_z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'
189 | 
190 |         >>> seq_vmc_id("ACGT")
191 |         'VMC:GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'
192 | 
193 |         >>> seq_vmc_id("acgt")
194 |         'VMC:GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'
195 | 
196 |         >>> seq_vmc_id("acgt", normalize=False)
197 |         'VMC:GS_eFwawHHdibaZBDcs9kW3gm31h1NNJcQe'
198 |     """
199 | 
200 |     return "{ir[namespace]}:{ir[accession]}".format(ir=seq_vmc_identifier(seq, normalize))
201 | 
202 | 
203 | def seq_vmc_identifier(seq, normalize=True):
204 |     """Converts sequence to VMC identifier (record).
205 | 
206 |     See https://github.com/ga4gh/vmc
207 | 
208 |     Args:
209 |         seq (str): A sequence.
210 |         normalize (bool, optional): Whether to normalize the sequence before conversion,
211 |             i.e. to ensure representation as uppercase letters without whitespace or asterisks.
212 |             Defaults to ``True``.
213 | 
214 |     Returns:
215 |         str: VMC identifier (record) representation of sequnce.
216 | 
217 |     Examples:
218 |         >>> seq_vmc_identifier("") == {'namespace': 'VMC', 'accession': 'GS_z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'}
219 |         True
220 | 
221 |         >>> seq_vmc_identifier("ACGT") == {'namespace': 'VMC', 'accession': 'GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'}
222 |         True
223 | 
224 |         >>> seq_vmc_identifier("acgt") == {'namespace': 'VMC', 'accession': 'GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'}
225 |         True
226 | 
227 |         >>> seq_vmc_identifier("acgt", normalize=False) == {'namespace': 'VMC', 'accession': 'GS_eFwawHHdibaZBDcs9kW3gm31h1NNJcQe'}
228 |         True
229 |     """
230 | 
231 |     seq = normalize_sequence(seq) if normalize else seq
232 |     return {"namespace": "VMC", "accession": "GS_" + str(vmc_digest(seq))}
233 | 


--------------------------------------------------------------------------------
/src/bioutils/accessions.py:
--------------------------------------------------------------------------------
  1 | """Simple routines to deal with accessions, identifiers, etc.
  2 | 
  3 | Biocommons terminology: an identifier is composed of a *namespace* and
  4 | an *accession*. The namespace is a string, composed of any character
  5 | other than colon (:). The accession is a string without character set
  6 | restriction.  An accession is expected to be unique within the
  7 | namespace; there is no expectation of uniqueness of accessions across
  8 | namespaces.
  9 | 
 10 | ``Identifier := <Namespace, Accession>``
 11 | 
 12 | ``Namespace := [^:]+``
 13 | 
 14 | ``Accession := \\w+``
 15 | 
 16 | 
 17 | Some sample serializations of Identifiers:
 18 | 
 19 | ``json: {"namespace": "RefSeq", "accession": "NM_000551.3"}``
 20 | 
 21 | ``xml: <Identifier namespace="RefSeq" accession="NM_000551.3"/>``
 22 | 
 23 | ``string: "RefSeq:NM_000551.3"``
 24 | 
 25 | The string form may be used as a CURIE, in which case the document in
 26 | which the CURIE is used must contain a map of ``{namespace : uri}``.
 27 | """
 28 | 
 29 | import re
 30 | 
 31 | from .exceptions import BioutilsError
 32 | 
 33 | _ensembl_species_prefixes = "|".join(
 34 |     """ENS ENSACA ENSAME ENSAMX
 35 | ENSANA ENSAPL ENSBTA ENSCAF ENSCAN ENSCAP ENSCAT ENSCCA ENSCEL ENSCGR
 36 | ENSCGR ENSCHI ENSCHO ENSCIN ENSCJA ENSCLA ENSCPO ENSCSA ENSCSAV ENSDAR
 37 | ENSDNO ENSDOR ENSEBU ENSECA ENSEEU ENSETE ENSFAL ENSFCA ENSFDA ENSGAC
 38 | ENSGAL ENSGGO ENSGMO ENSHGLF ENSHGLM ENSJJA ENSLAC ENSLAF ENSLOC
 39 | ENSMAU ENSMEU ENSMFA ENSMGA ENSMIC ENSMLE ENSMLU ENSMMU ENSMNE ENSMOC
 40 | ENSMOD ENSMPU ENSMUS ENSNGA ENSNLE ENSOAN ENSOAR ENSOCU ENSODE ENSOGA
 41 | ENSONI ENSOPR ENSORL ENSPAN ENSPCA ENSPCO ENSPEM ENSPFO ENSPMA ENSPPA
 42 | ENSPPR ENSPPY ENSPSI ENSPTI ENSPTR ENSPVA ENSRBI ENSRNO ENSRRO ENSSAR
 43 | ENSSBO ENSSCE ENSSHA ENSSSC ENSSTO ENSTBE ENSTGU ENSTNI ENSTRU ENSTSY
 44 | ENSTTR ENSVPA ENSXET ENSXMA FB MGP_129S1SvImJ_ MGP_AJ_ MGP_AKRJ_
 45 | MGP_BALBcJ_ MGP_C3HHeJ_ MGP_C57BL6NJ_ MGP_CAROLIEiJ_ MGP_CASTEiJ_
 46 | MGP_CBAJ_ MGP_DBA2J_ MGP_FVBNJ_ MGP_LPJ_ MGP_NODShiLtJ_ MGP_NZOHlLtJ_
 47 | MGP_PWKPhJ_ MGP_PahariEiJ_ MGP_SPRETEiJ_ MGP_WSBEiJ_""".split()
 48 | )
 49 | _ensembl_feature_types_re = r"E|FM|G|GT|P|R|T"
 50 | _ensembl_re = r"^(?:{})(?:{}){}$".format(_ensembl_species_prefixes, _ensembl_feature_types_re, r"\d{11}(?:\.\d+)?")
 51 | 
 52 | # map of regexp => namespace
 53 | # TODO: make this namespace => [regexps] for clarity
 54 | # namespaces follow convention of identifiers.org
 55 | ac_namespace_regexps = {
 56 |     # https://uswest.ensembl.org/info/genome/stable_ids/prefixes.html
 57 |     # [species prefix][feature type prefix][a unique eleven digit number]
 58 |     # N.B. The regexp at http://identifiers.org/ensembl appears broken:
 59 |     # 1) Human only; 2) escaped backslashes (\\d rather than \d).
 60 |     _ensembl_re: "ensembl",
 61 |     # http://identifiers.org/insdc/
 62 |     # P12345, a UniProtKB accession matches the miriam regexp but shouldn't (I think)
 63 |     r"^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$": "insdc",
 64 |     # http://identifiers.org/refseq/
 65 |     # https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/
 66 |     r"^((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+|(NZ\_[A-Z]{4}\d+))(\.\d+)?$": "refseq",
 67 |     # Uniprot
 68 |     # http://identifiers.org/uniprot/
 69 |     # https://www.uniprot.org/help/accession_numbers
 70 |     r"^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$": "uniprot",
 71 | }
 72 | 
 73 | ac_namespace_regexps = {re.compile(k): v for k, v in ac_namespace_regexps.items()}
 74 | 
 75 | 
 76 | def chr22XY(c):
 77 |     """Reformats chromosome to be of the form Chr1, ..., Chr22, ChrX, ChrY, etc.
 78 | 
 79 |     Args:
 80 |         c (str or int): A chromosome.
 81 | 
 82 |     Returns:
 83 |         str: The reformatted chromosome.
 84 | 
 85 |     Examples:
 86 |         >>> chr22XY('1')
 87 |         'chr1'
 88 | 
 89 |         >>> chr22XY(1)
 90 |         'chr1'
 91 | 
 92 |         >>> chr22XY('chr1')
 93 |         'chr1'
 94 | 
 95 |         >>> chr22XY(23)
 96 |         'chrX'
 97 | 
 98 |         >>> chr22XY(24)
 99 |         'chrY'
100 | 
101 |         >>> chr22XY("X")
102 |         'chrX'
103 | 
104 |         >>> chr22XY("23")
105 |         'chrX'
106 | 
107 |         >>> chr22XY("M")
108 |         'chrM'
109 |     """
110 |     c = str(c)
111 |     if c[0:3] == "chr":
112 |         c = c[3:]
113 |     if c == "23":
114 |         c = "X"
115 |     if c == "24":
116 |         c = "Y"
117 |     return "chr" + c
118 | 
119 | 
120 | def coerce_namespace(ac):
121 |     """Prefixes accession with inferred namespace if not present.
122 | 
123 |     Intended to be used to promote consistent and unambiguous accession identifiers.
124 | 
125 |     Args:
126 |         ac (str): The accession, with or without namespace prefixed.
127 | 
128 |     Returns:
129 |         str: An identifier of the form "{namespace}:{acession}"
130 | 
131 |     Raises:
132 |         ValueError: if accession sytax does not match the syntax of any namespace.
133 | 
134 |     Examples:
135 |         >>> coerce_namespace("refseq:NM_01234.5")
136 |         'refseq:NM_01234.5'
137 | 
138 |         >>> coerce_namespace("NM_01234.5")
139 |         'refseq:NM_01234.5'
140 | 
141 |         >>> coerce_namespace("bogus:QQ_01234.5")
142 |         'bogus:QQ_01234.5'
143 | 
144 |         >>> coerce_namespace("QQ_01234.5")
145 |         Traceback (most recent call last):
146 |         ...
147 |         ValueError: Could not infer namespace for QQ_01234.5
148 |     """
149 |     if ":" not in ac:
150 |         ns = infer_namespace(ac)
151 |         if ns is None:
152 |             raise ValueError(f"Could not infer namespace for {ac}")
153 |         ac = ns + ":" + ac
154 |     return ac
155 | 
156 | 
157 | def infer_namespace(ac):
158 |     """Infers a unique namespace from an accession, if one exists.
159 | 
160 |     Args:
161 |         ac (str): An accession, without the namespace prefix.
162 | 
163 |     Returns:
164 |         str or None: The unique namespace corresponding to accession syntax, if only one is inferred.
165 |             None if the accesssion sytax does not match any namespace.
166 | 
167 |     Raises:
168 |         BioutilsError: If multiple namespaces match the syntax of the accession.
169 | 
170 |     Examples:
171 |         >>> infer_namespace("ENST00000530893.6")
172 |         'ensembl'
173 | 
174 |         >>> infer_namespace("NM_01234.5")
175 |         'refseq'
176 | 
177 |         >>> infer_namespace("A2BC19")
178 |         'uniprot'
179 | 
180 |         Disbled because Python 2 and 3 handles exceptions differently.
181 | 
182 |         >>> infer_namespace("P12345")  # doctest: +SKIP
183 |         Traceback (most recent call last):
184 |         ...
185 |         bioutils.exceptions.BioutilsError: Multiple namespaces possible for P12345
186 | 
187 |         >>> infer_namespace("BOGUS99") is None
188 |         True
189 |     """
190 | 
191 |     namespaces = infer_namespaces(ac)
192 |     if not namespaces:
193 |         return None
194 |     if len(namespaces) > 1:
195 |         raise BioutilsError("Multiple namespaces possible for {}".format(ac))
196 |     return namespaces[0]
197 | 
198 | 
199 | def infer_namespaces(ac):
200 |     """Infers namespaces possible for a given accession, based on syntax.
201 | 
202 |     Args:
203 |         ac (str): An accession, without the namespace prefix.
204 | 
205 |     Returns:
206 |         list of str: A list of namespaces matching the accession, possibly empty.
207 | 
208 |     Examples:
209 |         >>> infer_namespaces("ENST00000530893.6")
210 |         ['ensembl']
211 | 
212 |         >>> infer_namespaces("ENST00000530893")
213 |         ['ensembl']
214 | 
215 |         >>> infer_namespaces("ENSQ00000530893")
216 |         []
217 | 
218 |         >>> infer_namespaces("NM_01234")
219 |         ['refseq']
220 | 
221 |         >>> infer_namespaces("NM_01234.5")
222 |         ['refseq']
223 | 
224 |         >>> infer_namespaces("NQ_01234.5")
225 |         []
226 | 
227 |         >>> infer_namespaces("A2BC19")
228 |         ['uniprot']
229 | 
230 |         >>> sorted(infer_namespaces("P12345"))
231 |         ['insdc', 'uniprot']
232 | 
233 |         >>> infer_namespaces("A0A022YWF9")
234 |         ['uniprot']
235 |     """
236 |     return [v for k, v in ac_namespace_regexps.items() if k.match(ac)]
237 | 
238 | 
239 | def prepend_chr(chr):
240 |     """Prepends chromosome with 'chr' if not present.
241 | 
242 |     Users are strongly discouraged from using this function. Adding a
243 |     'chr' prefix results in a name that is not consistent
244 |     with authoritative assembly records.
245 | 
246 |     Args:
247 |         chr (str): The chromosome.
248 | 
249 |     Returns:
250 |         str: The chromosome with 'chr' prepended.
251 | 
252 |     Examples:
253 |         >>> prepend_chr('22')
254 |         'chr22'
255 | 
256 |         >>> prepend_chr('chr22')
257 |         'chr22'
258 |     """
259 |     return chr if chr[0:3] == "chr" else "chr" + chr
260 | 
261 | 
262 | def strip_chr(chr):
263 |     """Removes the 'chr' prefix if present.
264 | 
265 |     Args:
266 |         chr (str): The chromosome.
267 | 
268 |     Returns:
269 |         str: The chromosome without a 'chr' prefix.
270 | 
271 |     Examples:
272 |         >>> strip_chr('22')
273 |         '22'
274 | 
275 |         >>> strip_chr('chr22')
276 |         '22'
277 |     """
278 |     return chr[3:] if chr[0:3] == "chr" else chr
279 | 
280 | 
281 | ## <LICENSE>
282 | ## Copyright 2014 Bioutils Contributors (https://bitbucket.org/biocommons/bioutils)
283 | ##
284 | ## Licensed under the Apache License, Version 2.0 (the "License");
285 | ## you may not use this file except in compliance with the License.
286 | ## You may obtain a copy of the License at
287 | ##
288 | ##     http://www.apache.org/licenses/LICENSE-2.0
289 | ##
290 | ## Unless required by applicable law or agreed to in writing, software
291 | ## distributed under the License is distributed on an "AS IS" BASIS,
292 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
293 | ## See the License for the specific language governing permissions and
294 | ## limitations under the License.
295 | ## </LICENSE>
296 | 


--------------------------------------------------------------------------------
/src/bioutils/seqfetcher.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Provides sequence fetching from NCBI and Ensembl."""
  3 | 
  4 | import logging
  5 | import os
  6 | import random
  7 | import re
  8 | import time
  9 | 
 10 | import requests
 11 | 
 12 | _logger = logging.getLogger(__name__)
 13 | 
 14 | # Reece requested registration on 2017-09-03
 15 | ncbi_tool = "bioutils"
 16 | ncbi_email = "biocommons-dev@googlegroups.com"
 17 | retry_limit = 3
 18 | enst_default_seq_type = "cdna"
 19 | 
 20 | 
 21 | def fetch_seq(ac, start_i=None, end_i=None, **rest):
 22 |     """Fetches sequences and subsequences from NCBI eutils and Ensembl REST interfaces.
 23 | 
 24 |     Args:
 25 |         ac (str): The accession of the sequence to fetch.
 26 |         start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``.
 27 |             It is recommended to retrieve a subsequence by providing an index here, rather than by
 28 |             Python slicing the whole sequence.
 29 |         end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``.
 30 |             It is recommended to retrieve a subsequence by providing an index here, rather than by
 31 |             Python slicing the whole sequence.
 32 | 
 33 |     Returns:
 34 |         str: The requested sequence.
 35 | 
 36 |     Raises:
 37 |         RuntimeError: If the syntax doesn't match that of any of the databases.
 38 |         RuntimeError: If the request to the database fails.
 39 | 
 40 |     Examples:
 41 |         >>> len(fetch_seq('NP_056374.2'))
 42 |         1596
 43 | 
 44 |         >>> fetch_seq('NP_056374.2',0,10)   # This!
 45 |         'MESRETLSSS'
 46 | 
 47 |         >>> fetch_seq('NP_056374.2')[0:10]  # Not this!
 48 |         'MESRETLSSS'
 49 | 
 50 |         # Providing intervals is especially important for large sequences:
 51 | 
 52 |         >>> fetch_seq('NC_000001.10',2000000,2000030)
 53 |         'ATCACACGTGCAGGAACCCTTTTCCAAAGG'
 54 | 
 55 |         # This call will pull back 30 bases plus overhead; without the
 56 |         # interval, one would receive 250MB of chr1 plus overhead!
 57 | 
 58 |         # Essentially any RefSeq, Genbank, BIC, or Ensembl sequence may be
 59 |         # fetched.
 60 | 
 61 |         >>> fetch_seq('NM_9.9')
 62 |         Traceback (most recent call last):
 63 |         ...
 64 |         RuntimeError: No sequence available for NM_9.9
 65 | 
 66 |         >>> fetch_seq('QQ01234')
 67 |         Traceback (most recent call last):
 68 |         ...
 69 |         RuntimeError: No sequence fetcher for QQ01234
 70 |     """
 71 | 
 72 |     ac_dispatch = [
 73 |         {
 74 |             "re": re.compile(r"^(?:AC|N[CGMPRTW])_|^[A-L]\w\d|^U\d"),
 75 |             "fetcher": _fetch_seq_ncbi,
 76 |         },
 77 |         {"re": re.compile(r"^ENS[TP]\d+"), "fetcher": _fetch_seq_ensembl},
 78 |     ]
 79 | 
 80 |     eligible_fetchers = [dr["fetcher"] for dr in ac_dispatch if dr["re"].match(ac)]
 81 | 
 82 |     if len(eligible_fetchers) == 0:
 83 |         raise RuntimeError("No sequence fetcher for {ac}".format(ac=ac))
 84 | 
 85 |     if len(eligible_fetchers) >= 1:  # pragma: nocover (no way to test)
 86 |         _logger.debug("Multiple sequence fetchers found for {ac}; using first".format(ac=ac))
 87 | 
 88 |     fetcher = eligible_fetchers[0]
 89 |     _logger.debug("fetching {ac} with {f}".format(ac=ac, f=fetcher))
 90 | 
 91 |     try:
 92 |         return fetcher(ac, start_i, end_i, **rest)
 93 |     except requests.RequestException as ex:
 94 |         raise RuntimeError("Failed to fetch {ac} ({ex})".format(ac=ac, ex=ex))
 95 | 
 96 | 
 97 | # ###########################################################################
 98 | # Internal functions
 99 | 
100 | 
101 | def _fetch_seq_ensembl(ac, start_i=None, end_i=None, seq_type=None):
102 |     """Fetch sequence slice from Ensembl public REST interface.
103 | 
104 |     Args:
105 |         ac (str): The accession of the sequence to fetch.
106 |         start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch.
107 |             Defaults to None.
108 |         end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch.
109 |             Defaults to None.
110 |         seq_type (str, optional): The type of Ensembl sequence to fetch
111 | 
112 |     Returns:
113 |         str: The requested (sub)sequence
114 | 
115 |     Raises:
116 |         RequestException: if request is unsuccessful.
117 |         KeyError: if Ensembl API returns a different version than requested
118 | 
119 |     Note:
120 |         The Ensembl REST interface does not currently accept intervals, so this method
121 |         slices the sequence locally.
122 | 
123 |     Examples:
124 |         >> len(_fetch_seq_ensembl('ENSP00000288602'))
125 |         766
126 | 
127 |         >> _fetch_seq_ensembl('ENSP00000288602',0,10)
128 |         u'MAALSGGGGG'
129 | 
130 |         >> _fetch_seq_ensembl('ENSP00000288602')[0:10]
131 |         u'MAALSGGGGG'
132 | 
133 |         >> ac = 'ENSP00000288602'
134 |         >> _fetch_seq_ensembl(ac ,0, 10) == _fetch_seq_ensembl(ac)[0:10]
135 |         True
136 |     """
137 | 
138 |     # Ensembl API only takes transcript IDs (without version) and returns the latest one
139 |     # So we need to strip the transcript version, then check if what was returned was the one we requested
140 |     version = None
141 |     m = re.match(r"^(ENST\d+)\.(\d+)$", ac)
142 |     if m:
143 |         ac, version = m.groups()
144 |         version = int(version)
145 | 
146 |     if ac.startswith("ENST") and seq_type is None:
147 |         try:
148 |             seq_type = os.environ["ENST_DEFAULT_SEQ_TYPE"]
149 |         except KeyError:
150 |             seq_type = enst_default_seq_type
151 |             _logger.warning(f"{ac}: Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE; assuming {seq_type}")
152 | 
153 |     url = f"http://rest.ensembl.org/sequence/id/{ac}"
154 |     if seq_type:
155 |         url += f"?type={seq_type}"
156 |     r = requests.get(url, headers={"Content-Type": "application/json"})
157 |     r.raise_for_status()
158 |     data = r.json()
159 |     if version is not None:
160 |         latest_version = data["version"]
161 |         if version != latest_version:
162 |             msg = f"Ensembl API only provides {ac} version ({latest_version}), requested: {version}"
163 |             raise KeyError(msg)
164 | 
165 |     seq = data["seq"]
166 |     return seq if (start_i is None or end_i is None) else seq[start_i:end_i]
167 | 
168 | 
169 | def _fetch_seq_ncbi(ac, start_i=None, end_i=None):
170 |     """Fetches sequences from NCBI using the eutils interface.
171 | 
172 |     Args:
173 |         ac (str): The accession of the sequence to fetch.
174 |         start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch.
175 |             Defaults to None.
176 |         end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch.
177 |             Defaults to None.
178 | 
179 |     Returns:
180 |         str: The requested (sub)sequence
181 | 
182 |     Raises:
183 |         RequestException: if request is unsuccessful.
184 | 
185 |     Notes:
186 |         An interbase interval may be optionally provided with start_i and
187 |         end_i. NCBI eutils will return just the requested subsequence,
188 |         which might greatly reduce payload sizes (especially with
189 |         chromosome-scale sequences).
190 | 
191 |         The request includes `tool` and `email` arguments to identify the
192 |         caller as the bioutils package.  According to
193 |         https://www.ncbi.nlm.nih.gov/books/NBK25497/, these values should
194 |         correspond to the library, not the library client.  Using the
195 |         defaults is recommended.  Nonetheless, callers may set
196 |         ``bioutils.seqfetcher.ncbi_tool`` and
197 |         ``bioutils.seqfetcher.ncbi_email`` to custom values if that is
198 |         desired.
199 | 
200 |     Examples:
201 |         >> len(_fetch_seq_ncbi('NP_056374.2'))
202 |         1596
203 | 
204 |         Pass the desired interval rather than using Python's [] slice
205 |         operator.
206 | 
207 |         >> _fetch_seq_ncbi('NP_056374.2',0,10)
208 |         'MESRETLSSS'
209 | 
210 |         >> _fetch_seq_ncbi('NP_056374.2')[0:10]
211 |         'MESRETLSSS'
212 | 
213 |         >> _fetch_seq_ncbi('NP_056374.2',0,10) == _fetch_seq_ncbi('NP_056374.2')[0:10]
214 |         True
215 |     """
216 | 
217 |     db = "protein" if ac[1] == "P" else "nucleotide"
218 |     url_fmt = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db}&id={ac}&rettype=fasta"
219 | 
220 |     if start_i is None or end_i is None:
221 |         url = url_fmt.format(db=db, ac=ac)
222 |     else:
223 |         url_fmt += "&seq_start={start}&seq_stop={stop}"
224 |         url = url_fmt.format(db=db, ac=ac, start=start_i + 1, stop=end_i)
225 | 
226 |     url += "&tool={tool}&email={email}".format(tool=ncbi_tool, email=ncbi_email)
227 | 
228 |     url = _add_eutils_api_key(url)
229 | 
230 |     n_retries = 0
231 |     while True:
232 |         resp = requests.get(url)
233 |         if resp.ok:
234 |             seq = "".join(resp.text.splitlines()[1:])
235 |             return seq
236 |         elif resp.status_code == 400:
237 |             # Invalid sequence or start/stop position for that sequence
238 |             raise RuntimeError(
239 |                 "Fetching sequence {} with start index {} and end index {} failed, invalid sequence "
240 |                 "or start or end position".format(ac, start_i, end_i)
241 |             )
242 |         if n_retries >= retry_limit:
243 |             break
244 |         if n_retries == 0:
245 |             _logger.warning("Failed to fetch {}".format(url))
246 |         sleeptime = random.randint(n_retries, 3) ** n_retries
247 |         _logger.warning("Failure {}/{}; retry in {} seconds".format(n_retries, retry_limit, sleeptime))
248 |         time.sleep(sleeptime)
249 |         n_retries += 1
250 |     # Falls through only on failure
251 |     resp.raise_for_status()
252 | 
253 | 
254 | def _add_eutils_api_key(url):
255 |     """Adds an eutils api key to the query.
256 | 
257 |     Args:
258 |         url (str): The query url without the api key.
259 | 
260 |     Returns:
261 |         str: The query url with the api key, if one is stored in the environment variable
262 |             ``NCBI_API_KEY``, otherwise it is unaltered.
263 |     """
264 | 
265 |     apikey = os.environ.get("NCBI_API_KEY")
266 |     if apikey:
267 |         url += "&api_key={apikey}".format(apikey=apikey)
268 |     return url
269 | 
270 | 
271 | # So that I don't forget why I didn't use ebi too:
272 | # $ curl 'http://www.ebi.ac.uk/ena/data/view/AM407889.1&display=fasta'
273 | # >ENA|AM407889|AM407889.2 Medicago sativa partial mRNA ...
274 | # AACGTATCACACTTCTTCTCCATTTCTTTTTCTTACATCTTCTCTCTACAAATTCATTTC
275 | # Note that we requested .1, got .2.  Implicit behavior bites again.
276 | 
277 | if __name__ == "__main__":  # pragma: nocover
278 |     import doctest
279 | 
280 |     doctest.testmod()
281 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2019 bioutils Contributors
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/src/bioutils/normalize.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Provides functionality for normalizing alleles, ensuring comparable representations."""
  3 | 
  4 | import enum
  5 | import logging
  6 | import math
  7 | from typing import Optional
  8 | 
  9 | import attr
 10 | 
 11 | _logger = logging.getLogger(__name__)
 12 | debug = False
 13 | 
 14 | NormalizationMode = enum.Enum("NormalizationMode", "EXPAND LEFTSHUFFLE RIGHTSHUFFLE TRIMONLY VCF")
 15 | """Enum passed to normalize to select the normalization mode.
 16 | 
 17 | Attributes:
 18 |     EXPAND: Normalize alleles to maximal extent both left and right.
 19 |     LEFTSHUFFLE: Normalize alleles to maximal extent left.
 20 |     RIGHTSHUFFLE: Normalize alleles to maximal extent right.
 21 |     TRIMONLY: Only trim the common prefix and suffix of alleles. Deprecated -- use `mode=None` with `trim=True` instead.
 22 |     VCF: Normalize with VCF.
 23 | """
 24 | 
 25 | 
 26 | def normalize(
 27 |     sequence,
 28 |     interval,
 29 |     alleles,
 30 |     mode: Optional[NormalizationMode] = NormalizationMode.EXPAND,
 31 |     bounds=None,
 32 |     anchor_length=0,
 33 |     trim: bool = True,
 34 | ):
 35 |     """Normalizes the alleles that co-occur on sequence at interval, ensuring comparable representations.
 36 | 
 37 |     Normalization performs three operations:
 38 |     - trimming
 39 |     - shuffling
 40 |     - anchoring
 41 | 
 42 |     Args:
 43 |         sequence (str or iterable): The reference sequence; must support indexing and ``__getitem__``.
 44 |         interval (2-tuple of int): The location of alleles in the reference sequence as ``(start, end)``.
 45 |             Interbase coordinates.
 46 |         alleles (iterable of str): The sequences to be normalized. The first element
 47 |             corresponds to the reference sequence being unchanged and must be None.
 48 |         bounds (2-tuple of int, optional): Maximal extent of normalization left and right.
 49 |             Must be provided if sequence doesn't support ``__len__``. Defaults to ``(0, len(sequence))``.
 50 |         mode (NormalizationMode Enum or string, optional): A NormalizationMode Enum or the corresponding string.
 51 |             Defaults to ``EXPAND``. Set to None to skip shuffling. Does not affect trimming or anchoring.
 52 |         anchor_length (int, optional): number of flanking residues left and right. Defaults to ``0``.
 53 |         trim (bool): indicates whether to trim the common prefix and suffix of alleles. Defaults to True.
 54 |             Set to False to skip trimming. Does not affect shuffling or anchoring.
 55 | 
 56 |     Returns:
 57 |         tuple: ``(new_interval, [new_alleles])``
 58 | 
 59 |     Raises:
 60 |         ValueError: If normalization mode is VCF and `anchor_length` is nonzero.
 61 |         ValueError: If the interval start is greater than the end.
 62 |         ValueError: If the first (reference) allele is not `None`.
 63 |         ValueError: If there are not at least two distinct alleles.
 64 | 
 65 |     Examples:
 66 |         >>> sequence = "CCCCCCCCACACACACACTAGCAGCAGCA"
 67 |         >>> normalize(sequence, interval=(22,25), alleles=(None, "GC", "AGCAC"), mode='TRIMONLY')
 68 |         ((22, 24), ('AG', 'G', 'AGCA'))
 69 | 
 70 |         >>> normalize(sequence, interval=(22, 22), alleles=(None, 'AGC'), mode='RIGHTSHUFFLE')
 71 |         ((29, 29), ('', 'GCA'))
 72 | 
 73 |         >>> normalize(sequence, interval=(22, 22), alleles=(None, 'AGC'), mode='EXPAND')
 74 |         ((19, 29), ('AGCAGCAGCA', 'AGCAGCAGCAGCA'))
 75 |     """
 76 | 
 77 |     interval = _Interval(*interval)
 78 |     if interval.start > interval.end:
 79 |         raise ValueError("Interval start > end; must be start <= end")
 80 | 
 81 |     if bounds is None:
 82 |         bounds = _Interval(0, len(sequence))
 83 |     else:
 84 |         bounds = _Interval(*bounds)
 85 | 
 86 |     left_anchor = right_anchor = anchor_length
 87 | 
 88 |     if mode is not None and not isinstance(mode, NormalizationMode):
 89 |         mode = NormalizationMode[mode]  # e.g., mode="LEFTSHUFFLE" OK
 90 | 
 91 |     if mode == NormalizationMode.VCF:
 92 |         if anchor_length:
 93 |             raise ValueError("May not provide non-zero anchor size with VCF normalization mode")
 94 |         if not trim:
 95 |             raise ValueError("May not disable trimming with VCF normalization mode")
 96 |         mode = NormalizationMode.LEFTSHUFFLE
 97 |         left_anchor = 1
 98 |         right_anchor = 0
 99 | 
100 |     if alleles[0] is not None:
101 |         raise ValueError("First allele, the reference allele, must be None")
102 |     alleles = list(alleles)  # in case tuple
103 |     alleles[0] = sequence[interval.start : interval.end]
104 | 
105 |     if debug:
106 |         _print_state(
107 |             interval,
108 |             bounds,
109 |             sequence=sequence,
110 |             alleles=alleles,
111 |             comment="Starting state",
112 |         )
113 | 
114 |     if trim:
115 |         if len(set(alleles)) < 2:
116 |             raise ValueError("Must have at least two distinct alleles to trim")
117 | 
118 |         # Trim: remove common suffix, prefix, and adjust interval to match
119 |         l_trimmed, alleles = trim_left(alleles)
120 |         interval.start += l_trimmed
121 |         r_trimmed, alleles = trim_right(alleles)
122 |         interval.end -= r_trimmed
123 |         if debug:
124 |             _print_state(
125 |                 interval,
126 |                 bounds,
127 |                 sequence=sequence,
128 |                 alleles=alleles,
129 |                 comment="After trimming",
130 |             )
131 | 
132 |     lens = [len(a) for a in alleles]
133 | 
134 |     if mode == NormalizationMode.LEFTSHUFFLE:
135 |         dist = roll_left(sequence, alleles, interval.start - 1, bounds.start)
136 |         for i, a in enumerate(alleles):
137 |             if lens[i]:
138 |                 adist = -dist % lens[i]
139 |                 alleles[i] = a[adist:] + a[:adist]
140 |         interval.start -= dist
141 |         interval.end -= dist
142 | 
143 |     elif mode == NormalizationMode.RIGHTSHUFFLE:
144 |         dist = roll_right(sequence, alleles, interval.end, bounds.end - 1)
145 |         for i, a in enumerate(alleles):
146 |             if lens[i]:
147 |                 adist = dist % lens[i]
148 |                 alleles[i] = a[adist:] + a[:adist]
149 |         interval.start += dist
150 |         interval.end += dist
151 | 
152 |     elif mode == NormalizationMode.EXPAND:
153 |         ldist = roll_left(sequence, alleles, interval.start - 1, bounds.start)
154 |         rdist = roll_right(sequence, alleles, interval.end, bounds.end - 1)
155 | 
156 |         lseq = sequence[interval.start - ldist : interval.start]
157 |         rseq = sequence[interval.end : interval.end + rdist]
158 |         alleles = [lseq + a + rseq for a in alleles]
159 | 
160 |         interval.start -= ldist
161 |         interval.end += rdist
162 | 
163 |     if debug:
164 |         _print_state(
165 |             interval,
166 |             bounds,
167 |             sequence=sequence,
168 |             alleles=alleles,
169 |             comment=f"After mode: {mode}",
170 |         )
171 | 
172 |     # Add left and/or right flanking sequence
173 |     if left_anchor or right_anchor:
174 |         anchor_left = max(bounds.start, interval.start - left_anchor)
175 |         anchor_right = min(bounds.end, interval.end + right_anchor)
176 |         left_anchor_seq = sequence[anchor_left : interval.start]
177 |         right_anchor_seq = sequence[interval.end : anchor_right]
178 |         interval.start = anchor_left
179 |         interval.end = anchor_right
180 |         alleles = [left_anchor_seq + a + right_anchor_seq for a in alleles]
181 |         if debug:
182 |             _print_state(
183 |                 interval,
184 |                 bounds,
185 |                 sequence=sequence,
186 |                 alleles=alleles,
187 |                 comment="After anchoring",
188 |             )
189 | 
190 |     return (interval.start, interval.end), tuple(alleles)
191 | 
192 | 
193 | ############################################################################
194 | # INTERNAL
195 | 
196 | 
197 | @attr.s
198 | class _Interval:
199 |     start = attr.ib(int)
200 |     end = attr.ib(int)
201 | 
202 | 
203 | def trim_left(alleles):
204 |     """
205 |     Removes common prefix of given alleles.
206 | 
207 |     Args:
208 |         alleles (list of str): A list of alleles.
209 | 
210 |     Returns:
211 |         tuple: ``(number_trimmed, [new_alleles])``.
212 | 
213 |     Examples:
214 |         >>> trim_left(["","AA"])
215 |         (0, ['', 'AA'])
216 | 
217 |         >>> trim_left(["A","AA"])
218 |         (1, ['', 'A'])
219 | 
220 |         >>> trim_left(["AT","AA"])
221 |         (1, ['T', 'A'])
222 | 
223 |         >>> trim_left(["AA","AA"])
224 |         (2, ['', ''])
225 | 
226 |         >>> trim_left(["CAG","CG"])
227 |         (1, ['AG', 'G'])
228 |     """
229 |     if len(alleles) == 0:
230 |         return (0, [])
231 |     trimmed = 0
232 |     lens = [len(x) for x in alleles]
233 |     while trimmed < min(lens):
234 |         nexts = [x[trimmed] for x in alleles]
235 |         if nexts.count(nexts[0]) == len(nexts):
236 |             trimmed += 1
237 |         else:
238 |             break
239 |     return (trimmed, [x[trimmed:] for x in alleles])
240 | 
241 | 
242 | def trim_right(alleles):
243 |     """
244 |     Removes common suffix of given alleles.
245 | 
246 |     Args:
247 |         alleles (list of str): A list of alleles.
248 | 
249 |     Returns:
250 |         tuple: ``(number_trimmed, [new_alleles])``.
251 | 
252 |     Examples:
253 |         >>> trim_right(["","AA"])
254 |         (0, ['', 'AA'])
255 | 
256 |         >>> trim_right(["A","AA"])
257 |         (1, ['', 'A'])
258 | 
259 |         >>> trim_right(["AT","AA"])
260 |         (0, ['AT', 'AA'])
261 | 
262 |         >>> trim_right(["AA","AA"])
263 |         (2, ['', ''])
264 | 
265 |         >>> trim_right(["CAG","CG"])
266 |         (1, ['CA', 'C'])
267 |     """
268 |     if len(alleles) == 0:
269 |         return (0, [])
270 |     trimmed = 0
271 |     lens = [len(x) for x in alleles]
272 |     while trimmed < min(lens):
273 |         nexts = [x[len(x) - trimmed - 1] for x in alleles]
274 |         if nexts.count(nexts[0]) == len(nexts):
275 |             trimmed += 1
276 |         else:
277 |             break
278 |     return (trimmed, [x[: (len(x) - trimmed)] for x in alleles])
279 | 
280 | 
281 | def roll_left(sequence, alleles, ref_pos, bound):
282 |     """Determines common distance all alleles can be rolled (circularly permuted) left
283 |     within the reference sequence without altering it.
284 | 
285 |     Args:
286 |         sequence (str): The reference sequence.
287 |         alleles (list of str): The sequences to be normalized.
288 |         ref_pos (int): The beginning index for rolling.
289 |         bound (int): The lower bound index in the reference sequence for normalization, hence also for rolling.
290 | 
291 |     Returns:
292 |         int: The distance that the alleles can be rolled.
293 |     """
294 | 
295 |     # circularly permute sequence d steps, using modulo arithmetic
296 |     lens = [len(a) for a in alleles]
297 |     d = 0
298 |     max_d = ref_pos - bound
299 |     while d <= max_d and not any(a and a[-(d + 1) % lens[i]] != sequence[ref_pos - d] for i, a in enumerate(alleles)):
300 |         d += 1
301 |     return d
302 | 
303 | 
304 | def roll_right(sequence, alleles, ref_pos, bound):
305 |     """Determines common distance all alleles can be rolled (circularly permuted) right
306 |     within the reference sequence without altering it.
307 | 
308 |     Args:
309 |         sequence (str): The reference sequence.
310 |         alleles (list of str): The sequences to be normalized.
311 |         ref_pos (int): The start index for rolling.
312 |         bound (int): The upper bound index in the reference sequence for normalization, hence also for rolling.
313 |     Returns:
314 |         int: The distance that the alleles can be rolled
315 |     """
316 | 
317 |     # circularly permute sequence d steps, using modulo arithmetic
318 |     lens = [len(a) for a in alleles]
319 |     d = 0
320 |     max_d = bound - ref_pos
321 |     while d <= max_d and not any(a and a[d % lens[i]] != sequence[ref_pos + d] for i, a in enumerate(alleles)):
322 |         d += 1
323 |     return d
324 | 
325 | 
326 | ############################################################################
327 | # Debugging
328 | 
329 | pfx = "        "
330 | 
331 | 
332 | def _print_state(interval, bounds, sequence, alleles, comment):
333 |     """ """
334 |     line = pfx + "  " * interval.start + "^"
335 |     if interval.end > interval.start:
336 |         line += "-" * ((interval.end - interval.start - 1) * 2 + 1) + "^"
337 |     print(line + f" [{interval.start},{interval.end}): {alleles}   | {comment}")
338 |     return
339 | 
340 |     margin = 15
341 |     leftseq = sequence[max(0, interval.start - margin) : interval.start]
342 |     rightseq = sequence[interval.end : interval.end + margin]
343 | 
344 |     row_fmt = "{:>20.20s}{:>20.20s}{:^20.20s}{:<20.20s}{:<20.20s}"
345 |     rows = [
346 |         row_fmt.format(
347 |             str(bounds.start),
348 |             "",
349 |             f"[{interval.start},{interval.end})",
350 |             "",
351 |             str(bounds.end),
352 |         ),
353 |         row_fmt.format("//", "|", "", "|", "//"),
354 |         row_fmt.format("", leftseq, alleles[0], rightseq, ""),
355 |     ] + [row_fmt.format("", "", a, "", "") for a in alleles[1:]]
356 |     print("\n".join(rows))
357 | 
358 | 
359 | def _print_seq_row(sequence):
360 |     print(pfx + " ".join("0123456789" * math.ceil(len(sequence) / 10)))
361 |     print(pfx + " " + " ".join(sequence))
362 | 
363 | 
364 | if __name__ == "__main__":  # pragma: no cover
365 |     from functools import partial
366 | 
367 |     if debug:
368 |         _logger.setLevel("DEBUG")
369 | 
370 |     sequence = "CCCCCCCCACACACACACTAGCAGCAGCAT"
371 |     #                        1                   2                   3
372 |     #    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0
373 |     #     C C C C C C C C A C A C A C A C A C T A G C A G C A G C A T
374 | 
375 |     tests = [
376 |         # {"interval": (5,5), "alleles": [None, "C"]},
377 |         # {"interval": (5,6), "alleles": [None, "CC"]},
378 |         # {"interval": (5,6), "alleles": [None, ""]},
379 |         # {"interval": (13,13), "alleles": [None, "CA"]},
380 |         # {"interval": (14,14), "alleles": [None, "AC"]},
381 |         # {"interval": (13,15), "alleles": [None, ""]},
382 |         {"interval": (22, 22), "alleles": [None, "AGC"]},
383 |         {"interval": (22, 22), "alleles": [None, "AGCT"]},
384 |         {"interval": (22, 22), "alleles": [None, "AGC", "AGCT"]},
385 |         # {"interval": (22,25), "alleles": [None, ""]},
386 |         # {"interval": (22,25), "alleles": [None, "", "AGC"]},
387 |         # {"interval": (22,25), "alleles": [None, "", "AGCAGC"]},
388 |     ]
389 | 
390 |     normalize_seq = partial(normalize, sequence=sequence)
391 |     normalize_trim = partial(normalize_seq, mode=NormalizationMode.TRIMONLY)
392 |     normalize_left = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE)
393 |     normalize_right = partial(normalize_seq, mode=NormalizationMode.RIGHTSHUFFLE)
394 |     normalize_expand = partial(normalize_seq, mode=NormalizationMode.EXPAND)
395 |     normalize_vcf = partial(normalize_seq, mode=NormalizationMode.VCF)
396 | 
397 |     debug = True
398 | 
399 |     def test1(**kwargs):
400 |         print(f"* {kwargs}")
401 |         _print_seq_row(sequence)
402 |         result = normalize_seq(**kwargs)
403 |         kwargs["mode"] = str(kwargs["mode"])
404 |         print(f"assert {result} == normalize_seq({kwargs})")
405 | 
406 |     for test in tests:
407 |         print("############################################################################")
408 |         for mode in ("EXPAND",):  # "LEFTSHUFFLE", "RIGHTSHUFFLE", "EXPAND"):
409 |             for bm in (None,):
410 |                 if bm is None:
411 |                     bounds = None
412 |                 else:
413 |                     bounds = (test["interval"][0] - bm, test["interval"][1] + bm)
414 |                 test["bounds"] = bounds
415 |                 test1(mode=mode, **test)
416 | 


--------------------------------------------------------------------------------
/src/bioutils/sequences.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Simple functions and lookup tables for nucleic acid and amino acid sequences."""
  3 | 
  4 | import logging
  5 | import re
  6 | from enum import Enum
  7 | from string import ascii_lowercase
  8 | 
  9 | _logger = logging.getLogger(__name__)
 10 | 
 11 | aa3_to_aa1_lut = {
 12 |     "Ala": "A",
 13 |     "Arg": "R",
 14 |     "Asn": "N",
 15 |     "Asp": "D",
 16 |     "Cys": "C",
 17 |     "Gln": "Q",
 18 |     "Glu": "E",
 19 |     "Gly": "G",
 20 |     "His": "H",
 21 |     "Ile": "I",
 22 |     "Leu": "L",
 23 |     "Lys": "K",
 24 |     "Met": "M",
 25 |     "Phe": "F",
 26 |     "Pro": "P",
 27 |     "Ser": "S",
 28 |     "Thr": "T",
 29 |     "Trp": "W",
 30 |     "Tyr": "Y",
 31 |     "Val": "V",
 32 |     "Xaa": "X",
 33 |     "Ter": "*",
 34 |     "Sec": "U",
 35 | }
 36 | 
 37 | aa1_to_aa3_lut = {v: k for k, v in aa3_to_aa1_lut.items()}
 38 | 
 39 | dna_to_aa1_lut = {  # NCBI standard translation table
 40 |     "AAA": "K",
 41 |     "AAC": "N",
 42 |     "AAG": "K",
 43 |     "AAT": "N",
 44 |     "ACA": "T",
 45 |     "ACC": "T",
 46 |     "ACG": "T",
 47 |     "ACT": "T",
 48 |     "AGA": "R",
 49 |     "AGC": "S",
 50 |     "AGG": "R",
 51 |     "AGT": "S",
 52 |     "ATA": "I",
 53 |     "ATC": "I",
 54 |     "ATG": "M",
 55 |     "ATT": "I",
 56 |     "CAA": "Q",
 57 |     "CAC": "H",
 58 |     "CAG": "Q",
 59 |     "CAT": "H",
 60 |     "CCA": "P",
 61 |     "CCC": "P",
 62 |     "CCG": "P",
 63 |     "CCT": "P",
 64 |     "CGA": "R",
 65 |     "CGC": "R",
 66 |     "CGG": "R",
 67 |     "CGT": "R",
 68 |     "CTA": "L",
 69 |     "CTC": "L",
 70 |     "CTG": "L",
 71 |     "CTT": "L",
 72 |     "GAA": "E",
 73 |     "GAC": "D",
 74 |     "GAG": "E",
 75 |     "GAT": "D",
 76 |     "GCA": "A",
 77 |     "GCC": "A",
 78 |     "GCG": "A",
 79 |     "GCT": "A",
 80 |     "GGA": "G",
 81 |     "GGC": "G",
 82 |     "GGG": "G",
 83 |     "GGT": "G",
 84 |     "GTA": "V",
 85 |     "GTC": "V",
 86 |     "GTG": "V",
 87 |     "GTT": "V",
 88 |     "TAA": "*",
 89 |     "TAC": "Y",
 90 |     "TAG": "*",
 91 |     "TAT": "Y",
 92 |     "TCA": "S",
 93 |     "TCC": "S",
 94 |     "TCG": "S",
 95 |     "TCT": "S",
 96 |     "TGA": "*",
 97 |     "TGC": "C",
 98 |     "TGG": "W",
 99 |     "TGT": "C",
100 |     "TTA": "L",
101 |     "TTC": "F",
102 |     "TTG": "L",
103 |     "TTT": "F",
104 |     # degenerate codons
105 |     "AAR": "K",
106 |     "AAY": "N",
107 |     "ACB": "T",
108 |     "ACD": "T",
109 |     "ACH": "T",
110 |     "ACK": "T",
111 |     "ACM": "T",
112 |     "ACN": "T",
113 |     "ACR": "T",
114 |     "ACS": "T",
115 |     "ACV": "T",
116 |     "ACW": "T",
117 |     "ACY": "T",
118 |     "AGR": "R",
119 |     "AGY": "S",
120 |     "ATH": "I",
121 |     "ATM": "I",
122 |     "ATW": "I",
123 |     "ATY": "I",
124 |     "CAR": "Q",
125 |     "CAY": "H",
126 |     "CCB": "P",
127 |     "CCD": "P",
128 |     "CCH": "P",
129 |     "CCK": "P",
130 |     "CCM": "P",
131 |     "CCN": "P",
132 |     "CCR": "P",
133 |     "CCS": "P",
134 |     "CCV": "P",
135 |     "CCW": "P",
136 |     "CCY": "P",
137 |     "CGB": "R",
138 |     "CGD": "R",
139 |     "CGH": "R",
140 |     "CGK": "R",
141 |     "CGM": "R",
142 |     "CGN": "R",
143 |     "CGR": "R",
144 |     "CGS": "R",
145 |     "CGV": "R",
146 |     "CGW": "R",
147 |     "CGY": "R",
148 |     "CTB": "L",
149 |     "CTD": "L",
150 |     "CTH": "L",
151 |     "CTK": "L",
152 |     "CTM": "L",
153 |     "CTN": "L",
154 |     "CTR": "L",
155 |     "CTS": "L",
156 |     "CTV": "L",
157 |     "CTW": "L",
158 |     "CTY": "L",
159 |     "GAR": "E",
160 |     "GAY": "D",
161 |     "GCB": "A",
162 |     "GCD": "A",
163 |     "GCH": "A",
164 |     "GCK": "A",
165 |     "GCM": "A",
166 |     "GCN": "A",
167 |     "GCR": "A",
168 |     "GCS": "A",
169 |     "GCV": "A",
170 |     "GCW": "A",
171 |     "GCY": "A",
172 |     "GGB": "G",
173 |     "GGD": "G",
174 |     "GGH": "G",
175 |     "GGK": "G",
176 |     "GGM": "G",
177 |     "GGN": "G",
178 |     "GGR": "G",
179 |     "GGS": "G",
180 |     "GGV": "G",
181 |     "GGW": "G",
182 |     "GGY": "G",
183 |     "GTB": "V",
184 |     "GTD": "V",
185 |     "GTH": "V",
186 |     "GTK": "V",
187 |     "GTM": "V",
188 |     "GTN": "V",
189 |     "GTR": "V",
190 |     "GTS": "V",
191 |     "GTV": "V",
192 |     "GTW": "V",
193 |     "GTY": "V",
194 |     "MGA": "R",
195 |     "MGG": "R",
196 |     "MGR": "R",
197 |     "TAR": "*",
198 |     "TAY": "Y",
199 |     "TCB": "S",
200 |     "TCD": "S",
201 |     "TCH": "S",
202 |     "TCK": "S",
203 |     "TCM": "S",
204 |     "TCN": "S",
205 |     "TCR": "S",
206 |     "TCS": "S",
207 |     "TCV": "S",
208 |     "TCW": "S",
209 |     "TCY": "S",
210 |     "TGY": "C",
211 |     "TRA": "*",
212 |     "TTR": "L",
213 |     "TTY": "F",
214 |     "YTA": "L",
215 |     "YTG": "L",
216 |     "YTR": "L",
217 | }
218 | 
219 | # translation table for selenocysteine
220 | dna_to_aa1_sec = dna_to_aa1_lut.copy()
221 | dna_to_aa1_sec["TGA"] = "U"
222 | 
223 | # Vertebrate micochondrial translation table
224 | # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?chapter=tgencodes#SG2
225 | 
226 | dna_to_aa1_vmito = dna_to_aa1_lut.copy()
227 | dna_to_aa1_vmito["AGA"] = "*"
228 | dna_to_aa1_vmito["AGG"] = "*"
229 | dna_to_aa1_vmito["ATA"] = "M"
230 | dna_to_aa1_vmito["TGA"] = "W"
231 | 
232 | 
233 | complement_transtable = bytes.maketrans(b"ACGT", b"TGCA")
234 | 
235 | 
236 | def aa_to_aa1(seq):
237 |     """Coerces string of 1- or 3-letter amino acids to 1-letter representation.
238 | 
239 |     Args:
240 |         seq (str): An amino acid sequence.
241 | 
242 |     Returns:
243 |         str: The sequence as one of 1-letter amino acids.
244 | 
245 |     Examples:
246 |         >>> aa_to_aa1("CATSARELAME")
247 |         'CATSARELAME'
248 | 
249 |         >>> aa_to_aa1("CysAlaThrSerAlaArgGluLeuAlaMetGlu")
250 |         'CATSARELAME'
251 | 
252 |         >>> aa_to_aa1(None)
253 |     """
254 | 
255 |     if seq is None:
256 |         return None
257 |     return aa3_to_aa1(seq) if looks_like_aa3_p(seq) else seq
258 | 
259 | 
260 | def aa_to_aa3(seq):
261 |     """Coerces string of 1- or 3-letter amino acids to 3-letter representation.
262 | 
263 |     Args:
264 |         seq (str): An amino acid sequence.
265 | 
266 |     Returns:
267 |         str: The sequence as one of 3-letter amino acids.
268 | 
269 |     Examples:
270 |         >>> aa_to_aa3("CATSARELAME")
271 |         'CysAlaThrSerAlaArgGluLeuAlaMetGlu'
272 | 
273 |         >>> aa_to_aa3("CysAlaThrSerAlaArgGluLeuAlaMetGlu")
274 |         'CysAlaThrSerAlaArgGluLeuAlaMetGlu'
275 | 
276 |         >>> aa_to_aa3(None)
277 |     """
278 | 
279 |     if seq is None:
280 |         return None
281 |     return aa1_to_aa3(seq) if not looks_like_aa3_p(seq) else seq
282 | 
283 | 
284 | def aa1_to_aa3(seq):
285 |     """Converts string of 1-letter amino acids to 3-letter amino acids.
286 | 
287 |     Should only be used if the format of the sequence is known; otherwise use ``aa_to_aa3()``.
288 | 
289 |     Args:
290 |         seq (str): An amino acid sequence as 1-letter amino acids.
291 | 
292 |     Returns:
293 |         str: The sequence as 3-letter amino acids.
294 | 
295 |     Raises:
296 |         KeyError: If the sequence is not of 1-letter amino acids.
297 | 
298 |     Examples:
299 |         >>> aa1_to_aa3("CATSARELAME")
300 |         'CysAlaThrSerAlaArgGluLeuAlaMetGlu'
301 | 
302 |         >>> aa1_to_aa3(None)
303 |     """
304 | 
305 |     if seq is None:
306 |         return None
307 |     return "".join(aa1_to_aa3_lut[aa1] for aa1 in seq)
308 | 
309 | 
310 | def aa3_to_aa1(seq):
311 |     """Converts string of 3-letter amino acids to 1-letter amino acids.
312 | 
313 |     Should only be used if the format of the sequence is known; otherwise use ``aa_to_aa1()``.
314 | 
315 |     Args:
316 |         seq (str): An amino acid sequence as 3-letter amino acids.
317 | 
318 |     Returns:
319 |         str: The sequence as 1-letter amino acids.
320 | 
321 |     Raises:
322 |         KeyError: If the sequence is not of 3-letter amino acids.
323 | 
324 |     Examples:
325 |         >>> aa3_to_aa1("CysAlaThrSerAlaArgGluLeuAlaMetGlu")
326 |         'CATSARELAME'
327 | 
328 |         >>> aa3_to_aa1(None)
329 |     """
330 | 
331 |     if seq is None:
332 |         return None
333 |     return "".join(aa3_to_aa1_lut[aa3] for aa3 in [seq[i : i + 3] for i in range(0, len(seq), 3)])
334 | 
335 | 
336 | def complement(seq):
337 |     """Retrieves the complement of a sequence.
338 | 
339 |     Args:
340 |         seq (str): A nucleotide sequence.
341 | 
342 |     Returns:
343 |         str: The complement of the sequence.
344 | 
345 |     Examples:
346 |         >>> complement("ATCG")
347 |         'TAGC'
348 | 
349 |         >>> complement(None)
350 |     """
351 | 
352 |     if seq is None:
353 |         return None
354 |     return seq.translate(complement_transtable)
355 | 
356 | 
357 | def elide_sequence(s, flank=5, elision="..."):
358 |     """Trims the middle of the sequence, leaving the right and left flanks.
359 | 
360 |     Args:
361 |         s (str): A sequence.
362 |         flank (int, optional): The length of each flank. Defaults to five.
363 |         elision (str, optional): The symbol used to represent the part trimmed. Defaults to '...'.
364 | 
365 |         Returns:
366 |             str: The sequence with the middle replaced by ``elision``.
367 | 
368 |     Examples:
369 |         >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
370 |         'ABCDE...VWXYZ'
371 | 
372 |         >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=3)
373 |         'ABC...XYZ'
374 | 
375 |         >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", elision="..")
376 |         'ABCDE..VWXYZ'
377 | 
378 |         >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=12)
379 |         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
380 | 
381 |         >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=12, elision=".")
382 |         'ABCDEFGHIJKL.OPQRSTUVWXYZ'
383 |     """
384 | 
385 |     elided_sequence_len = flank + flank + len(elision)
386 |     if len(s) <= elided_sequence_len:
387 |         return s
388 |     return s[:flank] + elision + s[-flank:]
389 | 
390 | 
391 | def looks_like_aa3_p(seq):
392 |     """Indicates whether a string looks like a 3-letter AA string.
393 | 
394 |     Args:
395 |         seq (str): A sequence.
396 | 
397 |     Returns:
398 |         bool: Whether the string is of the format of a 3-letter AA string.
399 |     """
400 |     return seq is not None and (len(seq) % 3 == 0) and (len(seq) == 0 or seq[1] in ascii_lowercase)
401 | 
402 | 
403 | def normalize_sequence(seq):
404 |     """Converts sequence to normalized representation for hashing.
405 | 
406 |     Essentially, removes whitespace and asterisks, and uppercases the string.
407 | 
408 |     Args:
409 |         seq (str): The sequence to be normalized.
410 | 
411 |     Returns:
412 |         str: The sequence as a string of uppercase letters.
413 | 
414 |     Raises:
415 |         RuntimeError: If the sequence contains non-alphabetic characters (besides '*').
416 | 
417 |     Examples:
418 |         >>> normalize_sequence("ACGT")
419 |         'ACGT'
420 | 
421 |         >>> normalize_sequence("  A C G T * ")
422 |         'ACGT'
423 | 
424 |         >>> normalize_sequence("ACGT1")
425 |         Traceback (most recent call last):
426 |         ...
427 |         RuntimeError: Normalized sequence contains non-alphabetic characters
428 |     """
429 | 
430 |     nseq = re.sub(r"[\s\*]", "", seq).upper()
431 |     m = re.search("[^A-Z]", nseq)
432 |     if m:
433 |         _logger.debug("Original sequence: " + seq)
434 |         _logger.debug("Normalized sequence: " + nseq)
435 |         _logger.debug("First non-[A-Z] at {}".format(m.start()))
436 |         raise RuntimeError("Normalized sequence contains non-alphabetic characters")
437 |     return nseq
438 | 
439 | 
440 | def reverse_complement(seq):
441 |     """Converts a sequence to its reverse complement.
442 | 
443 |     Args:
444 |         seq (str): A nucleotide sequence.
445 | 
446 |     Returns:
447 |         str: The reverse complement of the sequence.
448 | 
449 |     Examples:
450 |         >>> reverse_complement("ATCG")
451 |         'CGAT'
452 | 
453 |         >>> reverse_complement(None)
454 |     """
455 | 
456 |     if seq is None:
457 |         return None
458 |     return "".join(reversed(complement(seq)))
459 | 
460 | 
461 | def replace_t_to_u(seq):
462 |     """Replaces the T's in a sequence with U's.
463 | 
464 |     Args:
465 |         seq (str): A nucleotide sequence.
466 | 
467 |     Returns:
468 |         str: The sequence with the T's replaced by U's.
469 | 
470 |     Examples:
471 |         >>> replace_t_to_u("ACGT")
472 |         'ACGU'
473 | 
474 |         >>> replace_t_to_u(None)
475 |     """
476 | 
477 |     if seq is None:
478 |         return None
479 |     return seq.replace("T", "U").replace("t", "u")
480 | 
481 | 
482 | def replace_u_to_t(seq):
483 |     """Replaces the U's in a sequence with T's.
484 | 
485 |     Args:
486 |         seq (str): A nucleotide sequence.
487 | 
488 |     Returns:
489 |         str: The sequence with the U's replaced by T's.
490 | 
491 |     Examples:
492 |         >>> replace_u_to_t("ACGU")
493 |         'ACGT'
494 | 
495 |         >>> replace_u_to_t(None)
496 |     """
497 | 
498 |     if seq is None:
499 |         return None
500 |     return seq.replace("U", "T").replace("u", "t")
501 | 
502 | 
503 | class StrEnum(str, Enum):
504 |     """utility class"""
505 | 
506 |     def __str__(self) -> str:
507 |         return str.__str__(self)
508 | 
509 |     def __repr__(self) -> str:
510 |         return str.__repr__(self)
511 | 
512 | 
513 | class TranslationTable(StrEnum):
514 |     """An enum that controls switching between standard and selenocysteine translation tables."""
515 | 
516 |     standard = "standard"
517 |     selenocysteine = "sec"
518 |     vertebrate_mitochondrial = "vmito"
519 | 
520 | 
521 | def translate_cds(seq, full_codons=True, ter_symbol="*", translation_table=TranslationTable.standard):
522 |     """Translates a DNA or RNA sequence into a single-letter amino acid sequence.
523 | 
524 |     Args:
525 |         seq (str): A nucleotide sequence.
526 |         full_codons (bool, optional): If ``True``, forces sequence to have length
527 |             that is a multiple of 3 and raises an error otherwise.
528 |             If False, ``ter_symbol`` will be added as the last amino acid.
529 |             This corresponds to biopython's behavior of padding the last codon with ``N``s.
530 |             Defaults to ``True``.
531 |         ter_symbol (str, optional): Placeholder for the last amino acid if
532 |             sequence length is not divisible by three and ``full_codons`` is False.
533 |             Defaults to ``'*'``
534 |         translation_table (TranslationTable, optional): One of the options from the TranslationTable. It indicates
535 |             which codon to amino acid translation table to use.
536 |             By default we will use the standard translation table for humans. To enable translation for selenoproteins,
537 |             the TranslationTable.selenocysteine table can get used
538 | 
539 |     Returns:
540 |         str: The corresponding single letter amino acid sequence.
541 | 
542 |     Raises:
543 |         ValueError: If ``full_codons`` and the sequence is not a multiple of three.
544 |         ValueError: If a codon is undefined in the table.
545 | 
546 |     Examples:
547 |         >>> translate_cds("ATGCGA")
548 |         'MR'
549 | 
550 |         >>> translate_cds("AUGCGA")
551 |         'MR'
552 | 
553 |         >>> translate_cds(None)
554 | 
555 |         >>> translate_cds("")
556 |         ''
557 | 
558 |         >>> translate_cds("AUGCG")
559 |         Traceback (most recent call last):
560 |         ...
561 |         ValueError: Sequence length must be a multiple of three
562 | 
563 |         >>> translate_cds("AUGCG", full_codons=False)
564 |         'M*'
565 | 
566 |         >>> translate_cds("ATGTAN")
567 |         'MX'
568 | 
569 |         >>> translate_cds("CCN")
570 |         'P'
571 | 
572 |         >>> translate_cds("TRA")
573 |         '*'
574 | 
575 |         >>> translate_cds("TTNTA", full_codons=False)
576 |         'X*'
577 | 
578 |         >>> translate_cds("CTB")
579 |         'L'
580 | 
581 |         >>> translate_cds("AGM")
582 |         'X'
583 | 
584 |         >>> translate_cds("GAS")
585 |         'X'
586 | 
587 |         >>> translate_cds("CUN")
588 |         'L'
589 | 
590 |         >>> translate_cds("AUGCGQ")
591 |         Traceback (most recent call last):
592 |         ...
593 |         ValueError: Codon CGQ at position 4..6 is undefined in codon table
594 |     """
595 | 
596 |     if seq is None:
597 |         return None
598 | 
599 |     if len(seq) == 0:
600 |         return ""
601 | 
602 |     if full_codons and len(seq) % 3 != 0:
603 |         raise ValueError("Sequence length must be a multiple of three")
604 | 
605 |     if translation_table == TranslationTable.standard:
606 |         trans_table = dna_to_aa1_lut
607 |     elif translation_table == TranslationTable.selenocysteine:
608 |         trans_table = dna_to_aa1_sec
609 |     elif translation_table == TranslationTable.vertebrate_mitochondrial:
610 |         trans_table = dna_to_aa1_vmito
611 |     else:
612 |         raise ValueError("Unsupported translation table {}".format(translation_table))
613 |     seq = replace_u_to_t(seq)
614 |     seq = seq.upper()
615 | 
616 |     protein_seq = []
617 |     for i in range(0, len(seq) - len(seq) % 3, 3):
618 |         codon = seq[i : i + 3]
619 |         try:
620 |             aa = trans_table[codon]
621 |         except KeyError:
622 |             # if this contains an ambiguous code, set aa to X, otherwise, throw error
623 |             iupac_ambiguity_codes = "BDHVNUWSMKRYZ"
624 |             if any(iupac_ambiguity_code in codon for iupac_ambiguity_code in iupac_ambiguity_codes):
625 |                 aa = "X"
626 |             else:
627 |                 raise ValueError("Codon {} at position {}..{} is undefined in codon table".format(codon, i + 1, i + 3))
628 |         protein_seq.append(aa)
629 | 
630 |     # check for trailing bases and add the ter symbol if required
631 |     if not full_codons and len(seq) % 3 != 0:
632 |         protein_seq.append(ter_symbol)
633 | 
634 |     return "".join(protein_seq)
635 | 
636 | 
637 | ## <LICENSE>
638 | ## Copyright 2014 Bioutils Contributors
639 | ##
640 | ## Licensed under the Apache License, Version 2.0 (the "License");
641 | ## you may not use this file except in compliance with the License.
642 | ## You may obtain a copy of the License at
643 | ##
644 | ##     http://www.apache.org/licenses/LICENSE-2.0
645 | ##
646 | ## Unless required by applicable law or agreed to in writing, software
647 | ## distributed under the License is distributed on an "AS IS" BASIS,
648 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
649 | ## See the License for the specific language governing permissions and
650 | ## limitations under the License.
651 | ## </LICENSE>
652 | 


--------------------------------------------------------------------------------