├── .github ├── CODEOWNERS ├── labels.yml ├── workflows │ ├── stale.yml │ ├── validate-codecov-config.yml │ ├── python-ci-cd.yml │ └── labels.yml ├── stale.yml └── actions │ └── setup-python-env │ └── action.yml ├── docs ├── modules │ ├── .nav.yml │ ├── bioutils.digest.md │ ├── bioutils.digests.md │ ├── bioutils.cytobands.md │ ├── bioutils.normalize.md │ ├── bioutils.sequences.md │ ├── bioutils.accessions.md │ ├── bioutils.assemblies.md │ ├── bioutils.coordinates.md │ ├── bioutils.exceptions.md │ ├── bioutils.seqfetcher.md │ └── bioutils.vmc_digest.md ├── getting-started.md ├── changelog │ ├── index.rst │ ├── 0.4 │ │ ├── index.rst │ │ ├── 0.4.3.clog │ │ ├── 0.4.2.clog │ │ ├── 0.4.4.clog │ │ ├── 0.4.2.rst │ │ ├── 0.4.3.rst │ │ ├── 0.4.4.rst │ │ ├── 0.4.1.clog │ │ ├── 0.4.0.clog │ │ ├── 0.4.1.rst │ │ ├── Makefile │ │ └── 0.4.0.rst │ └── 0.5 │ │ ├── index.rst │ │ ├── 0.5.5.clog │ │ ├── 0.5.4.clog │ │ ├── 0.5.7.clog │ │ ├── 0.5.1.clog │ │ ├── 0.5.5.rst │ │ ├── 0.5.7.rst │ │ ├── 0.5.4.rst │ │ ├── 0.5.1.rst │ │ ├── 0.5.3.clog │ │ ├── 0.5.2.clog │ │ ├── 0.5.6.clog │ │ ├── 0.5.2.rst │ │ ├── Makefile │ │ ├── 0.5.3.rst │ │ ├── 0.5.8.clog │ │ ├── 0.5.6.rst │ │ ├── 0.5.0.clog │ │ ├── 0.5.8.rst │ │ └── 0.5.0.rst └── index.md ├── src └── bioutils │ ├── _data │ ├── assemblies │ │ ├── .gitignore │ │ ├── GRCh37.json.gz │ │ ├── GRCh38.json.gz │ │ ├── NCBI33.json.gz │ │ ├── NCBI34.json.gz │ │ ├── NCBI35.json.gz │ │ ├── NCBI36.json.gz │ │ ├── CHM1_1.0.json.gz │ │ ├── CHM1_1.1.json.gz │ │ ├── GRCh37.p10.json.gz │ │ ├── GRCh37.p11.json.gz │ │ ├── GRCh37.p12.json.gz │ │ ├── GRCh37.p13.json.gz │ │ ├── GRCh37.p2.json.gz │ │ ├── GRCh37.p5.json.gz │ │ ├── GRCh37.p9.json.gz │ │ ├── GRCh38.p1.json.gz │ │ ├── GRCh38.p10.json.gz │ │ ├── GRCh38.p11.json.gz │ │ ├── GRCh38.p12.json.gz │ │ ├── GRCh38.p13.json.gz │ │ ├── GRCh38.p14.json.gz │ │ ├── GRCh38.p2.json.gz │ │ ├── GRCh38.p3.json.gz │ │ ├── GRCh38.p4.json.gz │ │ ├── GRCh38.p5.json.gz │ │ ├── GRCh38.p6.json.gz │ │ ├── GRCh38.p7.json.gz │ │ ├── 
GRCh38.p8.json.gz │ │ ├── GRCh38.p9.json.gz │ │ ├── T2T-CHM13v2.0.json.gz │ │ └── Makefile │ └── cytobands │ │ ├── ucsc-hg19.json.gz │ │ └── ucsc-hg38.json.gz │ ├── exceptions.py │ ├── __init__.py │ ├── _versionwarning.py │ ├── cytobands.py │ ├── coordinates.py │ ├── digest.py │ ├── vmc_digest.py │ ├── assemblies.py │ ├── digests.py │ ├── accessions.py │ ├── seqfetcher.py │ ├── normalize.py │ └── sequences.py ├── tests ├── data │ ├── seqs.fa.gz │ └── cassettes │ │ ├── test_fetch_seq_errors │ │ └── test_fetch_seq_ncbi_invalid_positions ├── conftest.py ├── test_sequences.py ├── test_seqfetcher.py └── test_normalize.py ├── .git-blame-ignore-revs ├── .vscode └── settings.json ├── codecov.yaml ├── .deepsource.toml ├── CONTRIBUTING.md ├── tox.ini ├── CONTRIBUTORS.txt ├── .readthedocs.yml ├── .coveragerc ├── sbin ├── ucsc-cytoband-to-json ├── makefile-extract-documentation ├── generate-assembly-sql └── assembly-to-json ├── .mailmap ├── .pre-commit-config.yaml ├── mkdocs.yml ├── .gitignore ├── bin └── fasta-ga4gh-identifier ├── README.rst ├── README.md ├── Makefile ├── pyproject.toml └── LICENSE.txt /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @biocommons/maintainers 2 | -------------------------------------------------------------------------------- /docs/modules/.nav.yml: -------------------------------------------------------------------------------- 1 | title: "Reference" 2 | -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/.gitignore: -------------------------------------------------------------------------------- 1 | pull 2 | -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | # file must contain an array, which may be empty 2 | 3 | [] 4 | 
-------------------------------------------------------------------------------- /docs/modules/bioutils.digest.md: -------------------------------------------------------------------------------- 1 | # bioutils.digest 2 | 3 | ::: bioutils.digest 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.digests.md: -------------------------------------------------------------------------------- 1 | # bioutils.digests 2 | 3 | ::: bioutils.digests 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.cytobands.md: -------------------------------------------------------------------------------- 1 | # bioutils.cytobands 2 | 3 | ::: bioutils.cytobands 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.normalize.md: -------------------------------------------------------------------------------- 1 | # bioutils.normalize 2 | 3 | ::: bioutils.normalize 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.sequences.md: -------------------------------------------------------------------------------- 1 | # bioutils.sequences 2 | 3 | ::: bioutils.sequences 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.accessions.md: -------------------------------------------------------------------------------- 1 | # bioutils.accessions 2 | 3 | ::: bioutils.accessions 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.assemblies.md: -------------------------------------------------------------------------------- 1 | # bioutils.assemblies 2 | 3 | ::: bioutils.assemblies 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.coordinates.md: 
-------------------------------------------------------------------------------- 1 | # bioutils.coordinates 2 | 3 | ::: bioutils.coordinates 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.exceptions.md: -------------------------------------------------------------------------------- 1 | # bioutils.exceptions 2 | 3 | ::: bioutils.exceptions 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.seqfetcher.md: -------------------------------------------------------------------------------- 1 | # bioutils.seqfetcher 2 | 3 | ::: bioutils.seqfetcher 4 | -------------------------------------------------------------------------------- /docs/modules/bioutils.vmc_digest.md: -------------------------------------------------------------------------------- 1 | # bioutils.vmc_digest 2 | 3 | ::: bioutils.vmc_digest 4 | -------------------------------------------------------------------------------- /tests/data/seqs.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/tests/data/seqs.fa.gz -------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | ## Installation 4 | 5 | pip install bioutils 6 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # .git-blame-ignore-revs 2 | # Initial Ruff formatting 3 | f82e37ddd260eac32b0396ab4baad78a82527b29 4 | -------------------------------------------------------------------------------- /src/bioutils/exceptions.py: -------------------------------------------------------------------------------- 1 | class BioutilsError(Exception): 2 | """Root 
exception for all bioutils exceptions""" 3 | 4 | pass 5 | -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh37.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/NCBI33.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI33.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/NCBI34.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI34.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/NCBI35.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI35.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/NCBI36.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/NCBI36.json.gz 
-------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/CHM1_1.0.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/CHM1_1.0.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/CHM1_1.1.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/CHM1_1.1.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/cytobands/ucsc-hg19.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/cytobands/ucsc-hg19.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/cytobands/ucsc-hg38.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/cytobands/ucsc-hg38.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh37.p10.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p10.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh37.p11.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p11.json.gz -------------------------------------------------------------------------------- 
/src/bioutils/_data/assemblies/GRCh37.p12.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p12.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh37.p13.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p13.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh37.p2.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p2.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh37.p5.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p5.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh37.p9.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh37.p9.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p1.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p1.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p10.json.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p10.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p11.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p11.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p12.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p12.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p13.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p13.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p14.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p14.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p2.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p2.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p3.json.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p3.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p4.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p4.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p5.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p5.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p6.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p6.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p7.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p7.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p8.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p8.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/GRCh38.p9.json.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/GRCh38.p9.json.gz -------------------------------------------------------------------------------- /src/bioutils/_data/assemblies/T2T-CHM13v2.0.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocommons/bioutils/HEAD/src/bioutils/_data/assemblies/T2T-CHM13v2.0.json.gz -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "yapf", 3 | "editor.formatOnSave": true, 4 | "python.venvPath": "${workspaceFolder}/venv/", 5 | } -------------------------------------------------------------------------------- /docs/changelog/index.rst: -------------------------------------------------------------------------------- 1 | .. _changelog: 2 | 3 | Change Log 4 | !!!!!!!!!! 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | 0.4/index 10 | 0.5/index 11 | -------------------------------------------------------------------------------- /docs/changelog/0.4/index.rst: -------------------------------------------------------------------------------- 1 | 0.4 Series 2 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@ 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :glob: 7 | :reversed: 8 | 9 | 0.* 10 | -------------------------------------------------------------------------------- /docs/changelog/0.5/index.rst: -------------------------------------------------------------------------------- 1 | 0.5 Series 2 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@ 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | :glob: 7 | :reversed: 8 | 9 | 0.* 10 | -------------------------------------------------------------------------------- /codecov.yaml: -------------------------------------------------------------------------------- 1 | coverage: 2 | range: 70..100 3 | round: down 4 | precision: 1 5 | status: 6 | project: 7 | default: 8 | target: 90% 9 | threshold: 0.5% 10 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.3.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.4.3 (2019-04-05) 3 | Changes since 0.4.2 (2019-02-21). 4 | ** New Features 5 | *** Fixes #16: Retry seqfetcher when rate limit exceeded [92d7210] 6 | -------------------------------------------------------------------------------- /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | test_patterns = [ 4 | 5 | ] 6 | 7 | exclude_patterns = [ 8 | 9 | ] 10 | 11 | [[analyzers]] 12 | name = 'python' 13 | enabled = true 14 | runtime_version = '3.x.x' 15 | 16 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.5.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.5.5 (2021-05-03) 3 | Changes since 0.5.4 (2021-05-02). 4 | ** Bug Fixes 5 | *** Don't retry sequence fetch with invalid coordinates [94e80cd] (pjcoenen) 6 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.4.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.5.4 (2021-05-02) 3 | Changes since 0.5.3 (2021-04-14). 
4 | ** Internal and Developer Changes 5 | *** #31: improve support for degenerate codons [ebcec67] (kayleeyuhas) 6 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '1 1 * * *' 6 | 7 | jobs: 8 | stale: 9 | uses: biocommons/.github/.github/workflows/stale.yml@main 10 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.2.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.4.2 (2019-02-21) 3 | Changes since 0.4.1 (2019-02-21). 4 | ** Internal and Developer Changes 5 | *** reraise all requests exceptions (not just HTTPError) as RuntimeError [daece64] 6 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.7.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.5.7 (2022-06-13) 3 | Changes since 0.5.6 (2022-06-09). 
4 | ** New Features 5 | *** Enable independent control of trimming and shuffling during normalization [203ef4e] (Ryan Gomoto) 6 | -------------------------------------------------------------------------------- /src/bioutils/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import PackageNotFoundError, version 2 | 3 | try: 4 | __version__ = version(__package__) 5 | except PackageNotFoundError: # pragma: no cover 6 | # package is not installed 7 | __version__ = None 8 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.1.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.5.1 (2019-07-31) 3 | Changes since 0.5.0 (2019-07-22). 4 | ** Internal and Developer Changes 5 | *** Closes #26: Fix LICENSE filename typo that prevented wheel builds :-( [df2fe4a] (Reece Hart) 6 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.5.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.5.5 (2021-05-03) 3 | ################### 4 | 5 | Changes since 0.5.4 (2021-05-02). 6 | 7 | Bug Fixes 8 | $$$$$$$$$$ 9 | 10 | * Don't retry sequence fetch with invalid coordinates [`94e80cd `_] (pjcoenen) 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Thank you for your interest in contributing to the biocommons community! 2 | Contributions are welcome and greatly appreciated. There are many types of 3 | contributions and you don't need to be a developer! 4 | 5 | To get started, see https://biocommons.org/contributing/. We look forward to 6 | hearing from you! 
7 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.7.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.5.7 (2022-06-13) 3 | ################### 4 | 5 | Changes since 0.5.6 (2022-06-09). 6 | 7 | New Features 8 | $$$$$$$$$$$$$ 9 | 10 | * Enable independent control of trimming and shuffling during normalization [`203ef4e `_] (Ryan Gomoto) 11 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.4.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.4.4 (2019-05-13) 3 | Changes since 0.4.3 (2019-04-05). 4 | ** Special Attention 5 | *** This is the last release in the 0.4 series. 6 | Future biocommons packages will be tested and supported only on Python 7 | >= 3.6 (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6) 8 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.2.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.4.2 (2019-02-21) 3 | ################### 4 | 5 | Changes since 0.4.1 (2019-02-21). 6 | 7 | Internal and Developer Changes 8 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 9 | 10 | * reraise all requests exceptions (not just HTTPError) as RuntimeError [`daece64 `_] 11 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.3.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.4.3 (2019-04-05) 3 | ################### 4 | 5 | Changes since 0.4.2 (2019-02-21). 
6 | 7 | New Features 8 | $$$$$$$$$$$$$ 9 | 10 | * Fixes `#16 `_: Retry seqfetcher when rate limit exceeded [`92d7210 `_] 11 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.4.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.4.4 (2019-05-13) 3 | ################### 4 | 5 | Changes since 0.4.3 (2019-04-05). 6 | 7 | Special Attention 8 | $$$$$$$$$$$$$$$$$$ 9 | 10 | * This is the last release in the 0.4 series. 11 | Future biocommons packages will be tested and supported only on Python 12 | >= 3.6 (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6) 13 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.4.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.5.4 (2021-05-02) 3 | ################### 4 | 5 | Changes since 0.5.3 (2021-04-14). 6 | 7 | Internal and Developer Changes 8 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 9 | 10 | * `#31 `_: improve support for degenerate codons [`ebcec67 `_] (kayleeyuhas) 11 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.1.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.5.1 (2019-07-31) 3 | ################### 4 | 5 | Changes since 0.5.0 (2019-07-22). 
6 | 7 | Internal and Developer Changes 8 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 9 | 10 | * Closes `#26 `_: Fix LICENSE filename typo that prevented wheel builds :-( [`df2fe4a `_] (Reece Hart) 11 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | skipsdist = true 3 | envlist = py39, py310, py311, py312, py313 4 | 5 | [gh-actions] 6 | python = 7 | 3.11: py311 8 | 3.12: py312 9 | 3.13: py313 10 | 11 | [testenv] 12 | passenv = PYTHON_VERSION 13 | allowlist_externals = uv 14 | commands = 15 | uv sync --python {envpython} 16 | uv run python -m pytest --doctest-modules tests --cov --cov-config=pyproject.toml --cov-report=xml 17 | ty check 18 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.1.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.4.1 (2019-02-21) 3 | Changes since 0.4.0 (2018-11-11). 
4 | ** Other Changes 5 | *** expose underlying exception on http failure [9e56110] 6 | ** Internal and Developer Changes 7 | *** updated badges [8f91ed1] 8 | *** added LICENSE [b3d6d64] 9 | *** added missing contributors definition [97f78b3] 10 | *** updated badge list [de2bf15] 11 | *** sync'd project files with eutils [3102695] 12 | -------------------------------------------------------------------------------- /.github/workflows/validate-codecov-config.yml: -------------------------------------------------------------------------------- 1 | name: Validate Codecov Config 2 | 3 | on: 4 | pull_request: 5 | paths: [codecov.yaml] 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | validate-codecov-config: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | contents: read 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Validate codecov configuration 17 | run: curl -sSL --fail-with-body --data-binary @codecov.yaml https://codecov.io/validate 18 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.3.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.5.3 (2021-04-14) 3 | Changes since 0.5.2 (2019-11-06). 
4 | ** New Features 5 | *** #29: Support ambiguity codes in translation [669a653] (kayleeyuhas) 6 | *** added bin/fasta-ga4gh-identifier [63d1078] (Reece Hart) 7 | ** Internal and Developer Changes 8 | *** updated Makefile for Python 3.8 [29eecf5] (Reece Hart) 9 | *** fix failing test and reformat [7cc5ebb] (kayleeyuhas) 10 | *** improve variable names and use string instead of list [5d7484b] (kayleeyuhas) 11 | -------------------------------------------------------------------------------- /src/bioutils/_versionwarning.py: -------------------------------------------------------------------------------- 1 | """emits a warning when imported under Python < 3.6 2 | 3 | This module may be used by other biocommons packages 4 | 5 | """ 6 | 7 | import logging 8 | import sys 9 | 10 | __all__ = [] 11 | 12 | version_warning = ( 13 | "biocommons packages are tested and supported only on Python >= 3.6" 14 | " (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6)" 15 | ) 16 | 17 | _logger = logging.getLogger(__package__) 18 | 19 | if sys.version_info < (3, 6): 20 | _logger.warning(version_warning) 21 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.2.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.5.2 (2019-11-06) 3 | Changes since 0.5.1 (2019-07-31). 4 | ** Special Attention 5 | *** Thanks to @trentwatt for significant documentation contributions! See 6 | https://bioutils.readthedocs.io/en/master/ for his handiwork. 
7 | ** Other Changes 8 | *** Added changelogs for 0.5.0 and 0.5.1, which @reece forgot to include :-( 9 | *** #22 added function docs for all modules [c0090ed] (trentwatt) 10 | *** #23: fix setup.cfg description tags (`description` → `long-description`) [8945c04] (Reece Hart) 11 | -------------------------------------------------------------------------------- /.github/workflows/python-ci-cd.yml: -------------------------------------------------------------------------------- 1 | name: Python CI/CD 2 | permissions: 3 | contents: write 4 | id-token: write 5 | 6 | on: 7 | push: 8 | branches: ["*"] 9 | tags: ["*"] 10 | 11 | jobs: 12 | python-ci-cd: 13 | name: Python CI/CD 14 | permissions: 15 | contents: write 16 | id-token: write 17 | uses: biocommons/.github/.github/workflows/python-ci-cd.yml@main 18 | with: 19 | publish: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') }} 20 | secrets: 21 | pypi-token: ${{ secrets.UV_PUBLISH_TOKEN }} 22 | -------------------------------------------------------------------------------- /CONTRIBUTORS.txt: -------------------------------------------------------------------------------- 1 | Reece Hart 2 | Ryan Gomoto <7393416+gomoto@users.noreply.github.com> 3 | trentwatt 4 | Alan Rubin 5 | kayleeyuhas 6 | Andreas Prlic 7 | Dave Lawrence 8 | Kyle Ferriter 9 | Timothy Laurent 10 | Ben Robinson 11 | Lucas Wiman 12 | Trent Watson 13 | pjcoenen <64436780+pjcoenen@users.noreply.github.com> 14 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.0.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.4.0 (2018-10-22) 3 | Changes since 0.3.3 (2017-09-03). 4 | ** Important Notice 5 | Support for Python <3.6 will be dropped on 2019-03-31. 
"""pytest configuration: build a shared VCR instance used to record and
replay HTTP interactions for network-dependent tests."""

import logging
import os

import vcr

# set vcr logging level
logging.basicConfig()
logger = logging.getLogger("vcr")

# set default location for vcr cassettes (tests/data/cassettes, relative to this file)
test_dir = os.path.dirname(__file__)
test_data_dir = os.path.join(test_dir, "data", "cassettes")

# initialize vcr; Authorization headers and POST parameters are filtered so
# recorded cassettes never contain credentials.  VCR_RECORD_MODE (default
# "once") controls whether new interactions are recorded.
vcr.default_vcr = vcr.VCR(
    cassette_library_dir=test_data_dir,
    filter_headers=["Authorization"],
    filter_post_data_parameters=["Authorization"],
    record_mode=os.environ.get("VCR_RECORD_MODE", "once"),
)
vcr.use_cassette = vcr.default_vcr.use_cassette
(#37) [b5d4d0f] (Andreas Prlic) 6 | *** Fix test warnings and a new failure from #36 [22b5556] (Reece Hart) 7 | ** New Features 8 | *** Handle Ensembl transcript versions [b3eaf83] (Dave Lawrence) 9 | ** Internal and Developer Changes 10 | *** Update Makefile to support newer bioutils conventions [ed6eaf6] (Reece Hart) 11 | *** Adopt GitHub Actions for testing and deployment [35c6a7f] (Reece Hart) 12 | *** Switch to Python 3.10 by default [5895087] (Reece Hart) 13 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | [![Release](https://img.shields.io/github/v/release/biocommons/bioutils)](https://img.shields.io/github/v/release/biocommons/bioutils) 4 | [![Build status](https://img.shields.io/github/actions/workflow/status/biocommons/bioutils/main.yml?branch=main)](https://github.com/biocommons/bioutils/actions/workflows/main.yml?query=branch%3Amain) 5 | [![Commit activity](https://img.shields.io/github/commit-activity/m/biocommons/bioutils)](https://img.shields.io/github/commit-activity/m/biocommons/bioutils) 6 | [![License](https://img.shields.io/github/license/biocommons/bioutils)](https://img.shields.io/github/license/biocommons/bioutils) 7 | 8 | Package Description 9 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF and ePub 13 | formats: all 14 | 15 | # Optionally set the version of Python and requirements required to 
build your docs 16 | python: 17 | version: 3.7 18 | install: 19 | - method: pip 20 | path: . 21 | extra_requirements: 22 | - docs 23 | - method: setuptools 24 | path: . 25 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = bioutils 5 | # omit = bad_file.py 6 | 7 | [paths] 8 | source = 9 | src/ 10 | 11 | [report] 12 | # Regexes for lines to exclude from consideration 13 | exclude_lines = 14 | # Have to re-enable the standard pragma 15 | pragma: no cover 16 | 17 | # Don't complain about missing debug-only code: 18 | def __repr__ 19 | if self\.debug 20 | 21 | # Don't complain if tests don't hit defensive assertion code: 22 | raise AssertionError 23 | raise NotImplementedError 24 | 25 | # Don't complain if non-runnable code isn't run: 26 | if 0: 27 | if __name__ == .__main__.: 28 | -------------------------------------------------------------------------------- /.github/workflows/labels.yml: -------------------------------------------------------------------------------- 1 | name: Sync labels 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - 'main' 7 | paths: 8 | - '.github/labels.yml' 9 | - '.github/workflows/labels.yml' 10 | 11 | permissions: 12 | issues: write 13 | 14 | jobs: 15 | labels: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | sparse-checkout: .github/labels.yml 22 | 23 | - uses: EndBug/label-sync@v2 24 | with: 25 | config-file: | 26 | https://raw.githubusercontent.com/biocommons/.github/main/etc/labels.yml 27 | .github/labels.yml 28 | 29 | delete-other-labels: false -------------------------------------------------------------------------------- /sbin/ucsc-cytoband-to-json: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import 
#!/usr/bin/env python
"""Convert a UCSC cytoBand TSV file (optionally gzipped, or "-" for stdin)
to a JSON mapping of chromosome -> band -> (start, end, stain) on stdout."""

import collections
import csv
import gzip
import json
import sys


def open_any(fn):
    """Return a readable text handle: stdin for "-", gzip for *.gz, else plain."""
    if fn == "-":
        return sys.stdin
    if fn.endswith(".gz"):
        return gzip.open(fn, mode="rt", encoding="utf-8")
    return open(fn, mode="rt")


chr_band_map = collections.defaultdict(dict)

rdr = csv.reader(open_any(sys.argv[1]), delimiter="\t")
for chrom, start, end, band, stain in rdr:
    # UCSC names chromosomes "chr1", "chrX", ...; strip the "chr" prefix
    if chrom.startswith("chr"):
        chrom = chrom[3:]
    chr_band_map[chrom][band] = (int(start), int(end), stain)

json.dump(chr_band_map, sys.stdout, indent=None, sort_keys=True)
6 | 7 | Special Attention 8 | $$$$$$$$$$$$$$$$$$ 9 | 10 | * Thanks to @trentwatt for significant documentation contributions! See 11 | https://bioutils.readthedocs.io/en/master/ for his handiwork. 12 | 13 | Other Changes 14 | $$$$$$$$$$$$$$ 15 | 16 | * Added changelogs for 0.5.0 and 0.5.1, which @reece forgot to include :-( 17 | * `#22 `_ added function docs for all modules [`c0090ed `_] (trentwatt) 18 | * `#23 `_: fix setup.cfg description tags (`description` → `long-description`) [`8945c04 `_] (Reece Hart) 19 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Config for https://probot.github.io/apps/stale/ 2 | 3 | # Number of days of inactivity before an issue becomes stale 4 | daysUntilStale: 60 5 | 6 | # Number of days of inactivity before a stale issue is closed 7 | daysUntilClose: 7 8 | 9 | # Issues with these labels will never be considered stale 10 | exemptLabels: 11 | - pinned 12 | - security 13 | 14 | # Label to use when marking an issue as stale 15 | staleLabel: wontfix 16 | 17 | # Comment to post when marking an issue as stale. Set to `false` to disable 18 | markComment: >- 19 | This issue has not had recent activity and is now marked as stale. 20 | It will be closed if no further activity occurs. Please comment if 21 | you believe the issue is still relevant. 22 | 23 | # Comment to post when closing a stale issue. Set to `false` to disable 24 | closeComment: false 25 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.1.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.4.1 (2019-02-21) 3 | ################### 4 | 5 | Changes since 0.4.0 (2018-11-11). 
6 | 7 | Other Changes 8 | $$$$$$$$$$$$$$ 9 | 10 | * expose underlying exception on http failure [`9e56110 `_] 11 | 12 | Internal and Developer Changes 13 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 14 | 15 | * updated badges [`8f91ed1 `_] 16 | * added LICENSE [`b3d6d64 `_] 17 | * added missing contributors definition [`97f78b3 `_] 18 | * updated badge list [`de2bf15 `_] 19 | * sync'd project files with eutils [`3102695 `_] 20 | -------------------------------------------------------------------------------- /docs/changelog/0.4/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: FORCE 2 | .DELETE_ON_ERROR: 3 | 4 | # N.B. this scripts requires tools that are not publicly available 5 | # yet. Eventually, clogger will be released. 6 | # Sorry. 7 | PATH:=/home/reece/projects/reece/clogger/bin:${PATH} 8 | SHELL:=/bin/bash -o pipefail 9 | 10 | default: 11 | @echo "no $@ target"; exit 1 12 | 13 | next.clog:: 14 | biocommons-changelog . >$@ 15 | 16 | # TODO: use git-mapfile to map commits 17 | %.rst: %.clog 18 | clogger-fmt \ 19 | -I '`#{issue_id} `_' \ 20 | -C '`{cset} `_' \ 21 | <$< >$@.tmp 22 | mv $@.tmp $@ 23 | 24 | 25 | 26 | .PHONY: clean cleaner cleanest 27 | 28 | clean: 29 | /bin/rm -f *~ 30 | 31 | cleaner: clean 32 | #/bin/rm -f *.rst 33 | 34 | cleanest: cleaner 35 | /bin/rm -f *.clog 36 | -------------------------------------------------------------------------------- /docs/changelog/0.5/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: FORCE 2 | .DELETE_ON_ERROR: 3 | 4 | # N.B. this scripts requires tools that are not publicly available 5 | # yet. Eventually, clogger will be released. 6 | # Sorry. 7 | PATH:=/home/reece/projects/reece/clogger/bin:${PATH} 8 | SHELL:=/bin/bash -o pipefail 9 | 10 | default: 11 | @echo "no $@ target"; exit 1 12 | 13 | next.clog:: 14 | biocommons-changelog . 
>$@ 15 | 16 | # TODO: use git-mapfile to map commits 17 | %.rst: %.clog 18 | clogger-fmt \ 19 | -I '`#{issue_id} `_' \ 20 | -C '`{cset} `_' \ 21 | <$< >$@.tmp 22 | mv $@.tmp $@ 23 | 24 | 25 | 26 | .PHONY: clean cleaner cleanest 27 | 28 | clean: 29 | /bin/rm -f *~ 30 | 31 | cleaner: clean 32 | #/bin/rm -f *.rst 33 | 34 | cleanest: cleaner 35 | /bin/rm -f *.clog 36 | -------------------------------------------------------------------------------- /.github/actions/setup-python-env/action.yml: -------------------------------------------------------------------------------- 1 | name: "Setup Python Environment" 2 | description: "Set up Python environment for the given Python version" 3 | 4 | inputs: 5 | python-version: 6 | description: "Python version to use" 7 | required: true 8 | default: "3.13" 9 | uv-version: 10 | description: "uv version to use" 11 | required: true 12 | default: "0.7.14" 13 | 14 | runs: 15 | using: "composite" 16 | steps: 17 | - uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ inputs.python-version }} 20 | 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v6 23 | with: 24 | version: ${{ inputs.uv-version }} 25 | enable-cache: 'true' 26 | cache-suffix: ${{ matrix.python-version }} 27 | 28 | - name: Install Python dependencies 29 | run: uv sync --frozen 30 | shell: bash 31 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.3.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.5.3 (2021-04-14) 3 | ################### 4 | 5 | Changes since 0.5.2 (2019-11-06). 
#!/usr/bin/env python3
"""extract doc from a makefile"""

# ############################################################################
# #= BASIC USAGE
#
# .PHONY: help
# help: ## Display help message

import fileinput
import re

# ANSI escape sequences used to colorize the help output
BOLD = "\033[1m"
COMMAND_COLOR = "\033[36m"
HEADER_COLOR = "\033[32m"
RESET = "\033[0m"
SECTION_COLOR = "\033[93m"
UNDERLINE = "\033[4m"

# Fix: "promots" -> "promotes" (typo in the user-facing help banner)
print(f"""🌟🌟 {BOLD}{HEADER_COLOR}{UNDERLINE}biocommons conventional make targets{RESET} 🌟🌟

Using these targets promotes consistency between local development and ci/cd commands.

usage: make [target ...]""")

# "#= Section title" lines become colored section headers;
# "target: ... ## description" lines become aligned "target  description" rows.
for line in fileinput.input():  # noqa: SIM115
    if m := re.match(r"#= (.+)", line):
        print(f"\n{BOLD}{UNDERLINE}{SECTION_COLOR}{m.group(1)}{RESET}")
    elif m := re.match(r"([-\s\w]+):.+?##\s+(.+)", line):
        print(f"{BOLD}{COMMAND_COLOR}{m.group(1):<20}{RESET}{m.group(2)}")
4 | ** New Features 5 | *** update assemblies and add T2T-CHM13v2.0 [19ebeff] (Reece Hart) 6 | ** Other Changes 7 | *** #47 Rewrite trim_left and trim_right for linear performance. [7817f6a] (Kyle Ferriter) 8 | *** Handle case when no alleles are passed to trim functions [eb8607a] (Kyle Ferriter) 9 | *** Merge pull request #48 from theferrit32/47-large-seq-normalization [5b3a80c] (Reece Hart) 10 | *** Merge pull request #49 from theferrit32/46-test-vcr-compilation [271ad81] (Reece Hart) 11 | ** Internal and Developer Changes 12 | *** Pin urllib3 version that vcrpy dependency depends on [9ddda27] (Kyle Ferriter) 13 | *** pin urllib3 to 1.26.* (rather than to specific patch version) [b60b890] (Reece Hart) 14 | *** add CODEOWNERS file [eef120d] (Reece Hart) 15 | *** updated CONTRIBUTORS.txt [8741a5f] (Reece Hart) 16 | *** reformatted with black and isort [3305c0e] (Reece Hart) 17 | *** Close #50: synchronize biocommonsexample with bioutils [5a1788c] (Reece Hart) 18 | -------------------------------------------------------------------------------- /docs/changelog/0.4/0.4.0.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.4.0 (2018-10-22) 3 | ################### 4 | 5 | Changes since 0.3.3 (2017-09-03). 6 | 7 | Important Notice 8 | $$$$$$$$$$$$$$$$$ 9 | 10 | Support for Python <3.6 will be dropped on 2019-03-31. 
See 11 | https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6 12 | 13 | New Features 14 | $$$$$$$$$$$$$ 15 | 16 | * Closes `#10 `_: Support NCBI API keys (and NCBI_API_KEY env variable) [`8739c98 `_] (@timothyjlaurent) 17 | * Closes `#12 `_: add infer_namespaces and infer_namespace functions [`2a53c7f `_] 18 | * Dropped biopython dependency [`0382b86 `_] (@afrubin) 19 | * Added bioutils.sequences.py:elide_sequence() function [`018a762 `_] 20 | * Added GRCh38.p12 [`3876f36 `_] 21 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | # I pick the account with the most modified files in git shortlog -sne 2 | # This is used by Git to consolidate the users who used multiple accounts 3 | Andreas Prlic Andreas Prlic <36012160+andreas-invitae@users.noreply.github.com> 4 | Andreas Prlic Andreas Prlic 5 | Caitlin Gong Caitlin Gong 6 | Katie Stahl katie stahl 7 | Manuel Holtgrewe Manuel Holtgrewe 8 | Meng Meng Wang 9 | Reece Hart Reece Hart 10 | Reece Hart Reece Hart 11 | Reece Hart Reece Hart 12 | Reece Hart Reece Hart 13 | Rudy Rico Rudolph Rico 14 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: "v6.0.0" 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-merge-conflict 7 | - id: check-json 8 | exclude: ^.devcontainer/.*devcontainer.json 9 | - id: check-shebang-scripts-are-executable 10 | - id: check-symlinks 11 | - id: check-toml 12 | - id: check-yaml 13 | - id: detect-private-key 14 | - id: end-of-file-fixer 15 | - id: mixed-line-ending 16 | args: [--fix=lf] 17 | - id: pretty-format-json 18 | exclude: ^.devcontainer/.*devcontainer.json 19 | args: [--autofix, --no-sort-keys] 20 | - id: trailing-whitespace 21 | 22 | - 
repo: https://github.com/astral-sh/ruff-pre-commit 23 | rev: "v0.12.7" 24 | hooks: 25 | - id: ruff-check 26 | args: [--fix, --exit-non-zero-on-fix] 27 | - id: ruff-format 28 | 29 | - repo: local 30 | hooks: 31 | - id: canonicalize-gitignore 32 | name: Sort unique .gitignore 33 | entry: sh -c 'LC_ALL=C sort -u -o .gitignore .gitignore' 34 | language: system 35 | files: ^\.gitignore$ 36 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.6.rst: -------------------------------------------------------------------------------- 1 | 2 | 0.5.6 (2022-06-09) 3 | ################### 4 | 5 | Changes since 0.5.5 (2021-05-05). 6 | 7 | Bug Fixes 8 | $$$$$$$$$$ 9 | 10 | * fix `#36 `_ by adding a new translation table ... (`#37 `_) [`b5d4d0f `_] (Andreas Prlic) 11 | * Fix test warnings and a new failure from `#36 `_ [`22b5556 `_] (Reece Hart) 12 | 13 | New Features 14 | $$$$$$$$$$$$$ 15 | 16 | * Handle Ensembl transcript versions [`b3eaf83 `_] (Dave Lawrence) 17 | 18 | Internal and Developer Changes 19 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 20 | 21 | * Update Makefile to support newer bioutils conventions [`ed6eaf6 `_] (Reece Hart) 22 | * Adopt GitHub Actions for testing and deployment [`35c6a7f `_] (Reece Hart) 23 | * Switch to Python 3.10 by default [`5895087 `_] (Reece Hart) 24 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.0.clog: -------------------------------------------------------------------------------- 1 | clog format: 1; -*-outline-*- 2 | * 0.5.0 (2019-07-22) 3 | Changes since 0.4.4 (2019-05-13). 4 | ** Special Attention 5 | *** All biocommons packages now require Python >= 3.6. 
See https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6 6 | ** New Features 7 | *** #18: Implemented comprehensive sequence normalization (trim, left, right, expand/voca, vcf) [36785fa] (Reece Hart) 8 | *** #20: implement hex-based digests à la refget [140a20e] (Reece Hart) 9 | *** Add support for cytobands, incl data files from UCSC [0ba4361] (Reece Hart) 10 | *** Added accessions.py:coerce_namespace() [e31e592] (Reece Hart) 11 | ** Internal and Developer Changes 12 | *** Added pytest-optional-tests; use test alias in Makefile [ba9b993] (Reece Hart) 13 | *** Added trinuc normalization tests [cfe3a68] (Reece Hart) 14 | *** Added vcrpy to test requirements [95893f1] (Reece Hart) 15 | *** Moved source to src/; updated setup.cfg [ff45fb0] (Reece Hart) 16 | *** Removed pip install from tox in favor of deps [8c8f91a] (Reece Hart) 17 | *** Renamed doc → docs [1612e5c] (Reece Hart) 18 | *** Store assemblies as compressed json [ea79e71] (Reece Hart) 19 | *** Update tests to use new vcr cassettes on optional tests (much faster!) [2001745] (Reece Hart) 20 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: bioutils 2 | repo_url: https://github.com/biocommons/bioutils 3 | site_url: https://biocommons.github.io/bioutils 4 | copyright: Maintained by biocommons. 
5 | 6 | plugins: 7 | - search 8 | - mkdocstrings: 9 | handlers: 10 | python: 11 | options: 12 | show_source: false 13 | - awesome-nav 14 | theme: 15 | name: material 16 | feature: 17 | tabs: true 18 | palette: 19 | - media: "(prefers-color-scheme: light)" 20 | scheme: default 21 | primary: white 22 | accent: deep orange 23 | toggle: 24 | icon: material/brightness-7 25 | name: Switch to dark mode 26 | - media: "(prefers-color-scheme: dark)" 27 | scheme: slate 28 | primary: black 29 | accent: deep orange 30 | toggle: 31 | icon: material/brightness-4 32 | name: Switch to light mode 33 | icon: 34 | repo: fontawesome/brands/github 35 | 36 | extra: 37 | social: 38 | - icon: fontawesome/brands/github 39 | link: https://github.com/biocommons/bioutils 40 | - icon: fontawesome/brands/python 41 | link: https://pypi.org/project/bioutils 42 | 43 | markdown_extensions: 44 | - toc: 45 | permalink: true 46 | - pymdownx.arithmatex: 47 | generic: true 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *$py.class 2 | *.bak 3 | *.cover 4 | *.egg 5 | *.egg-info/ 6 | *.log 7 | *.manifest 8 | *.mo 9 | *.orig 10 | *.pot 11 | *.py.cover 12 | *.py[cod] 13 | *.py[codz] 14 | *.sage.py 15 | *.so 16 | *.spec 17 | *.sqlite3 18 | *.sqlite3-journal 19 | *~ 20 | .DS_Store 21 | .Python 22 | .abstra/ 23 | .cache 24 | .coverage 25 | .coverage.* 26 | .cursorignore 27 | .cursorindexingignore 28 | .dmypy.json 29 | .eggs/ 30 | .env 31 | .envrc 32 | .hypothesis/ 33 | .idea 34 | .installed.cfg 35 | .ipynb_checkpoints 36 | .mypy_cache/ 37 | .nox/ 38 | .pdm-build/ 39 | .pdm-python 40 | .pixi 41 | .pybuilder/ 42 | .pypirc 43 | .pyre/ 44 | .pytest_cache 45 | .pytest_cache/ 46 | .python-version 47 | .pytype/ 48 | .ropeproject 49 | .ruff_cache/ 50 | .scrapy 51 | .spyderproject 52 | .spyproject 53 | .tox/ 54 | .venv 55 | .vscode 56 | .webassets-cache 57 | /site 58 | ENV/ 59 | 
MANIFEST 60 | __marimo__/ 61 | __pycache__/ 62 | __pypackages__/ 63 | archive 64 | bioutils/_data/assemblies/pull 65 | build/ 66 | celerybeat-schedule 67 | celerybeat.pid 68 | cover/ 69 | coverage.xml 70 | cython_debug/ 71 | develop-eggs/ 72 | dist/ 73 | dmypy.json 74 | doc/_build 75 | doc/changelog/*/.tags 76 | doc/changelog/*/.tags.mk 77 | doc/changelog/*/hg-git-remap.pl 78 | docs/_build/ 79 | docs/source 80 | downloads/ 81 | eggs/ 82 | env.bak/ 83 | env/ 84 | htmlcov/ 85 | instance/ 86 | ipython_config.py 87 | lib/ 88 | lib64/ 89 | local_settings.py 90 | marimo/_lsp/ 91 | marimo/_static/ 92 | misc 93 | nosetests.xml 94 | parts/ 95 | pip-delete-this-directory.txt 96 | pip-log.txt 97 | profile_default/ 98 | sdist/ 99 | share/python-wheels/ 100 | target/ 101 | var/ 102 | venv.bak/ 103 | venv/ 104 | wheels/ 105 | -------------------------------------------------------------------------------- /docs/changelog/0.5/0.5.8.rst: -------------------------------------------------------------------------------- 1 | 2 | HEAD (2023-07-09) 3 | ################## 4 | 5 | Changes since 0.5.7 (2022-07-08). 6 | 7 | New Features 8 | $$$$$$$$$$$$$ 9 | 10 | * update assemblies and add T2T-CHM13v2.0 [`19ebeff `_] (Reece Hart) 11 | 12 | Other Changes 13 | $$$$$$$$$$$$$$ 14 | 15 | * `#47 `_ Rewrite trim_left and trim_right for linear performance. 
[`7817f6a `_] (Kyle Ferriter) 16 | * Handle case when no alleles are passed to trim functions [`eb8607a `_] (Kyle Ferriter) 17 | * Merge pull request #48 from theferrit32/47-large-seq-normalization [`5b3a80c `_] (Reece Hart) 18 | * Merge pull request #49 from theferrit32/46-test-vcr-compilation [`271ad81 `_] (Reece Hart) 19 | 20 | Internal and Developer Changes 21 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 22 | 23 | * Pin urllib3 version that vcrpy dependency depends on [`9ddda27 `_] (Kyle Ferriter) 24 | * pin urllib3 to 1.26.* (rather than to specific patch version) [`b60b890 `_] (Reece Hart) 25 | * add CODEOWNERS file [`eef120d `_] (Reece Hart) 26 | * updated CONTRIBUTORS.txt [`8741a5f `_] (Reece Hart) 27 | * reformatted with black and isort [`3305c0e `_] (Reece Hart) 28 | * Close `#50 `_: synchronize biocommonsexample with bioutils [`5a1788c `_] (Reece Hart) 29 | -------------------------------------------------------------------------------- /bin/fasta-ga4gh-identifier: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """compute and display ga4gh sequence identifiers for sequences in a fasta file 4 | 5 | snafu$ ./bin/fasta-ga4gh-identifier ~/Downloads/GCA_000001405.28_GRCh38.p13_genomic.fna.gz 6 | ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO CM000663.2 CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly 7 | ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g CM000664.2 CM000664.2 Homo sapiens chromosome 2, GRCh38 reference primary assembly 8 | ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX CM000665.2 CM000665.2 Homo sapiens chromosome 3, GRCh38 reference primary assembly 9 | 10 | snafu$ ./bin/fasta-ga4gh-identifier ~/Downloads/Homo_sapiens.GRCh38.dna.toplevel.fa.gz 11 | ga4gh:SQ.2YnepKM7OkBoOrKmvHbGqguVfF9amCST 1 1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF 12 | ga4gh:SQ.lwDyBi432Py-7xnAISyQlnlhWDEaBPv2 2 2 dna:chromosome chromosome:GRCh38:2:1:242193529:1 REF 13 | 
def anyopen(path, encoding=None):
    """Return a readable text-mode handle for *path*.

    "-" yields sys.stdin (already text-decoded in Python 3 based on locale,
    per https://docs.python.org/3/library/sys.html#sys.stdin); *.gz files
    are opened through gzip in text mode; anything else is a plain file.
    """
    if path == "-":
        return sys.stdin
    if path.endswith(".gz"):
        return gzip.open(path, mode="rt", encoding=encoding)
    return open(path, mode="r", encoding=encoding)
def get_cytoband_names():
    """List the cytoband data sets bundled in the ``_data/cytobands`` directory.

    Returns:
        list of str: The names of the available cytobands.

    Examples:
        >>> sorted(get_cytoband_names())
        ['ucsc-hg19', 'ucsc-hg38']
    """

    names = []
    for path in _data_dir.glob("*.json.gz"):
        names.append(path.name.replace(".json.gz", ""))
    return names
def get_cytoband_maps(names=None):
    """Retrieves data from multiple cytobands.

    If cytobands are not specified, retrieves data from all available ones.

    Args:
        names (list of str, optional): The names of cytobands to retrieve
            data for.  Defaults to all names from ``get_cytoband_names()``.

    Returns:
        dict: A dictionary of the form ``{cytoband_name, cytoband_data}``.

    Examples:
        >>> maps = get_cytoband_maps()
        >>> maps["ucsc-hg38"]["1"]["p32.2"]
        [55600000, 58500000, 'gpos50']
        >>> maps["ucsc-hg19"]["1"]["p32.2"]
        [56100000, 59000000, 'gpos50']
    """

    # Fix: the original signature used a mutable default argument
    # (``names=[]``), which is a single list object shared across all calls.
    # ``None`` is the conventional sentinel; an explicit empty list is still
    # treated as "all", preserving behavior for callers that passed [].
    if names is None or names == []:
        names = get_cytoband_names()
    return {name: get_cytoband_map(name) for name in names}
See https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6 11 | 12 | New Features 13 | $$$$$$$$$$$$$ 14 | 15 | * `#18 `_: Implemented comprehensive sequence normalization (trim, left, right, expand/voca, vcf) [`36785fa `_] (Reece Hart) 16 | * `#20 `_: implement hex-based digests à la refget [`140a20e `_] (Reece Hart) 17 | * Add support for cytobands, incl data files from UCSC [`0ba4361 `_] (Reece Hart) 18 | * Added accessions.py:coerce_namespace() [`e31e592 `_] (Reece Hart) 19 | 20 | Internal and Developer Changes 21 | $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 22 | 23 | * Added pytest-optional-tests; use test alias in Makefile [`ba9b993 `_] (Reece Hart) 24 | * Added trinuc normalization tests [`cfe3a68 `_] (Reece Hart) 25 | * Added vcrpy to test requirements [`95893f1 `_] (Reece Hart) 26 | * Moved source to src/; updated setup.cfg [`ff45fb0 `_] (Reece Hart) 27 | * Removed pip install from tox in favor of deps [`8c8f91a `_] (Reece Hart) 28 | * Renamed doc → docs [`1612e5c `_] (Reece Hart) 29 | * Store assemblies as compressed json [`ea79e71 `_] (Reece Hart) 30 | * Update tests to use new vcr cassettes on optional tests (much faster!) 
[`2001745 `_] (Reece Hart) 31 | -------------------------------------------------------------------------------- /tests/data/cassettes/test_fetch_seq_ncbi_invalid_positions: -------------------------------------------------------------------------------- 1 | interactions: 2 | - request: 3 | body: null 4 | headers: 5 | Accept: 6 | - '*/*' 7 | Accept-Encoding: 8 | - gzip, deflate 9 | Connection: 10 | - keep-alive 11 | User-Agent: 12 | - python-requests/2.32.3 13 | method: GET 14 | uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=NP_001230161.1&rettype=fasta&seq_start=3191&seq_stop=3190&tool=bioutils&email=biocommons-dev@googlegroups.com 15 | response: 16 | body: 17 | string: '+Error%3A+CEFetchPApplication%3A%3Aproxy_stream()%3A+Error%3A+S+e+q+u+e+n+c+e++s+t+a+r+t++i+s++o+u+t+s+i+d+e++o+f++s+e+q+u+e+n+c+e++r+a+n+g+e+%3A++f+r+o+m++%3D++3+1+9+1+,++l+e+n+g+t+h++%3D++6+0+0+%0A%0A 18 | 19 | ' 20 | headers: 21 | Access-Control-Allow-Origin: 22 | - '*' 23 | Access-Control-Expose-Headers: 24 | - X-RateLimit-Limit,X-RateLimit-Remaining 25 | Cache-Control: 26 | - private 27 | Connection: 28 | - close 29 | Content-Disposition: 30 | - attachment; filename="sequence.fasta" 31 | Content-Security-Policy: 32 | - upgrade-insecure-requests 33 | Content-Type: 34 | - text/plain; charset=UTF-8 35 | Date: 36 | - Tue, 29 Oct 2024 01:38:17 GMT 37 | NCBI-PHID: 38 | - 322C747DCD3729C500005ACD6CF2CE3F.1.1.m_5 39 | NCBI-SID: 40 | - 7EE76A74F4915BDA_0FEDSID 41 | Referrer-Policy: 42 | - origin-when-cross-origin 43 | Server: 44 | - Finatra 45 | Set-Cookie: 46 | - ncbi_sid=7EE76A74F4915BDA_0FEDSID; domain=.nih.gov; path=/; expires=Wed, 29 47 | Oct 2025 01:38:17 GMT 48 | Strict-Transport-Security: 49 | - max-age=31536000; includeSubDomains; preload 50 | Transfer-Encoding: 51 | - chunked 52 | X-RateLimit-Limit: 53 | - '3' 54 | X-RateLimit-Remaining: 55 | - '2' 56 | X-UA-Compatible: 57 | - IE=Edge 58 | X-XSS-Protection: 59 | - 1; mode=block 60 | content-encoding: 61 | - gzip 
62 | status: 63 | code: 400 64 | message: Bad Request 65 | version: 1 66 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | bioutils -- bioinformatics utilities and lookup tables 2 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 3 | 4 | |pypi_badge| |build_status| |cov_badge| |cc_badge| |issues_badge| |contributors| |license| |changelog| 5 | 6 | 7 | bioutils provides some common utilities and lookup tables for bioinformatics. 8 | 9 | * bioutils.accessions -- parse accessions, infer namespaces 10 | * bioutils.assemblies -- Human assembly information (from NCBI/GRCh) 11 | * bioutils.cytobands -- map cytobands to coordinates (from UCSC cytoband tables) 12 | * bioutils.digests -- implementations of various digests 13 | * bioutils.normalize -- allele normalization (left shuffle, right shuffle, expanded, vcf) 14 | 15 | 16 | To use an E-Utilities API key run add it to an environment variable 17 | called `ncbi_api_key` and it will be used in the E-Utilities request. 18 | 19 | 20 | .. |build_status| image:: https://travis-ci.org/biocommons/bioutils.svg?branch=master 21 | :target: https://travis-ci.org/biocommons/bioutils 22 | 23 | .. |changelog| image:: https://img.shields.io/badge/docs-changelog-green.svg 24 | :target: https://bioutils.readthedocs.io 25 | 26 | .. |contributors| image:: https://img.shields.io/github/contributors/biocommons/bioutils.svg 27 | :target: https://github.com/biocommons/bioutils 28 | 29 | .. |docs| image:: https://img.shields.io/badge/docs-readthedocs-green.svg 30 | :target: http://bioutils.readthedocs.io/ 31 | 32 | .. |issues_badge| image:: https://img.shields.io/github/issues/biocommons/bioutils.png 33 | :target: https://github.com/biocommons/bioutils/issues 34 | 35 | .. 
import pytest

from bioutils.sequences import TranslationTable, translate_cds


def test_translate_examples():
    """test for standard translation table"""

    # DNA ("T") and RNA ("U") spellings of the same codons translate identically
    assert translate_cds("ATGCGA") == "MR"
    assert translate_cds("AUGCGA") == "MR"
    # degenerate inputs: None passes through; empty CDS yields empty protein
    assert translate_cds(None) is None
    assert translate_cds("") == ""
    # a trailing partial codon is an error by default ...
    with pytest.raises(ValueError):
        translate_cds("AUGCG")

    # ... but with full_codons=False the trailing partial codon is rendered as "*"
    assert translate_cds("AUGCG", full_codons=False) == "M*"
    # IUPAC ambiguity codes translate to the shared residue when all expansions
    # agree (CCN -> P, CTB/CUN -> L, TRA -> *) and to "X" otherwise
    assert translate_cds("ATGTAN") == "MX"
    assert translate_cds("CCN") == "P"
    assert translate_cds("TRA") == "*"
    assert translate_cds("TTNTA", full_codons=False) == "X*"
    assert translate_cds("CTB") == "L"
    assert translate_cds("AGM") == "X"
    assert translate_cds("GAS") == "X"
    assert translate_cds("CUN") == "L"
    # characters outside the IUPAC alphabet are rejected
    with pytest.raises(ValueError):
        translate_cds("AUGCGQ")


def test_translate_selenoproteins():
    """unit test for sec codon"""
    # standard table: UGA is a stop codon
    assert translate_cds("AUGTGATAA") == "M**"
    assert translate_cds("AUGTGATAA", translation_table=TranslationTable.standard) == "M**"
    # selenocysteine table: UGA encodes selenocysteine ("U")
    assert translate_cds("AUGTGATAA", translation_table=TranslationTable.selenocysteine) == "MU*"
    assert (
        translate_cds(
            "AUGTGATA",
            translation_table=TranslationTable.selenocysteine,
            full_codons=False,
        )
        == "MU*"
    )

    # a trailing partial codon still raises when full codons are required
    with pytest.raises(ValueError):
        translate_cds("AUGTGATA", translation_table=TranslationTable.selenocysteine)


def test_translate_vertebrate_mitochondrial():
    """unit test for vertebrate mitochondrial codons"""
    # standard table for comparison: UGA is a stop codon
    assert translate_cds("AUGTGATAA") == "M**"
    # vertebrate mitochondrial table: AUA -> M, UGA -> W, AGG/AGA -> stop
    assert translate_cds("ATATGAAGGAGA", translation_table=TranslationTable.vertebrate_mitochondrial) == "MW**"
    assert (
        translate_cds(
            "ATAAG",
            translation_table=TranslationTable.vertebrate_mitochondrial,
            full_codons=False,
        )
        == "M*"
    )

    # a trailing partial codon still raises when full codons are required
    with pytest.raises(ValueError):
        translate_cds("ATAAG", translation_table=TranslationTable.vertebrate_mitochondrial)
HEADER DELIMITER ' ' 39 | \copy assembly_sequence from seqs.csv with CSV HEADER DELIMITER ' ' 40 | 41 | """) 42 | 43 | 44 | assy_fh = csv.DictWriter( 45 | io.open("assy.csv", "w"), 46 | fieldnames="assy genbank_ac refseq_ac description".split(), 47 | delimiter="\t") 48 | assy_fh.writeheader() 49 | 50 | seqs_fh = csv.DictWriter( 51 | io.open("seqs.csv", "w"), 52 | fieldnames="assy genbank_ac refseq_ac rel name length unit aliases".split(), 53 | delimiter="\t") 54 | seqs_fh.writeheader() 55 | 56 | 57 | assys = bioutils.assemblies.get_assemblies() 58 | 59 | for an in assys.keys(): 60 | assy = assys[an] 61 | 62 | assy_fh.writerow({ 63 | "assy": an, 64 | "genbank_ac": assy["genbank_ac"], 65 | "refseq_ac": assy["refseq_ac"], 66 | "description": assy["description"], 67 | }) 68 | 69 | for seq in assy["sequences"]: 70 | seqs_fh.writerow({ 71 | "assy": an, 72 | "genbank_ac": seq["genbank_ac"], 73 | "refseq_ac": seq["refseq_ac"], 74 | "rel": seq["relationship"], 75 | "name": seq["name"], 76 | "length": seq["length"], 77 | "unit": seq["assembly_unit"], 78 | "aliases": "{" + ",".join(seq["aliases"]) + "}", 79 | }) 80 | -------------------------------------------------------------------------------- /src/bioutils/coordinates.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-, flake8: noqa 2 | """Provides utilities for interconverting between coordinate systems 3 | especially as used by the hgvs code. The three systems are: 4 | 5 | .. parsed-literal:: 6 | : A : C : G : T : A : C : 7 | human/hgvs h :-3 :-2 :-1 : 1 : 2 : 3 : 8 | continuous c :-2 :-1 : 0 : 1 : 2 : 3 : 9 | interbase i -3 -2 -1 0 1 2 3 10 | 11 | Human/hgvs coordinates are the native coordinates used by the HGVS 12 | recommendations. The coordinates are 1-based, inclusive, and refer to 13 | the nucleotides; there is no 0. 
PLUS_STRAND = 1
MINUS_STRAND = -1


def strand_pm_to_int(s):
    """Convert a '+'/'-' strand symbol to its integer representation.

    Args:
        s (str): Strand symbol.

    Returns:
        int: ``PLUS_STRAND`` (1) for '+', ``MINUS_STRAND`` (-1) for '-',
        and None for any other value.

    Examples:
        >>> strand_pm_to_int('+')
        1
        >>> strand_pm_to_int('-')
        -1
        >>> strand_pm_to_int('arglefargle')
    """
    if s == "+":
        return PLUS_STRAND
    if s == "-":
        return MINUS_STRAND
    return None


def strand_int_to_pm(i):
    """Convert an integer strand value to its '+'/'-' symbol.

    Args:
        i (int): Strand as an integer.

    Returns:
        str: '+' for ``PLUS_STRAND`` (1), '-' for ``MINUS_STRAND`` (-1),
        and None for any other value.

    Examples:
        >>> strand_int_to_pm(1)
        '+'
        >>> strand_int_to_pm(-1)
        '-'
        >>> strand_int_to_pm(42)
    """
    if i == PLUS_STRAND:
        return "+"
    if i == MINUS_STRAND:
        return "-"
    return None


# Backward-compatible alias retained from earlier releases.
strand_pm = strand_int_to_pm
project is a product of the [biocommons](https://biocommons.org/) community. 12 | 13 | - **Github repository**: 14 | - **Documentation** 15 | 16 | ## Python Package Installation 17 | 18 | Install from PyPI with `pip install bioutils` or `uv pip install bioutils`, then try it: 19 | 20 | ## Developer Setup 21 | 22 | ### Install Prerequisites 23 | 24 | These tools are required to get started: 25 | 26 | - [git](https://git-scm.com/): Version control system 27 | - [GNU make](https://www.gnu.org/software/make/): Current mechanism for consistent invocation of developer tools. 28 | - [uv](https://docs.astral.sh/uv/): An extremely fast Python package and project manager, written in Rust. 29 | 30 | #### MacOS or Linux Systems 31 | 32 | - [Install brew](https://brew.sh/) 33 | - `brew install git make uv` 34 | 35 | #### Linux (Debian-based systems) 36 | 37 | You may also install using distribution packages: 38 | 39 | sudo apt install git make 40 | 41 | Then install uv using the [uv installation instructions](https://docs.astral.sh/uv/getting-started/installation/). 42 | 43 | ### One-time developer setup 44 | 45 | Create a Python virtual environment, install dependencies, install pre-commit hooks, and install an editable package: 46 | 47 | make devready 48 | 49 | ### Development 50 | 51 | **N.B.** Developers are strongly encouraged to use `make` to invoke tools to 52 | ensure consistency with the CI/CD pipelines. Type `make` to see a list of 53 | supported targets. A subset are listed here: 54 | 55 | » make 56 | 🌟🌟 biocommons conventional make targets 🌟🌟 57 | 58 | Using these targets promots consistency between local development and ci/cd commands. 59 | 60 | usage: make [target ...] 
61 | 62 | BASIC USAGE 63 | help Display help message 64 | 65 | SETUP, INSTALLATION, PACKAGING 66 | devready Prepare local dev env: Create virtual env, install the pre-commit hooks 67 | build Build package 68 | publish publish package to PyPI 69 | 70 | FORMATTING, TESTING, AND CODE QUALITY 71 | cqa Run code quality assessments 72 | test Test the code with pytest 73 | 74 | DOCUMENTATION 75 | docs-serve Build and serve the documentation 76 | docs-test Test if documentation can be built without warnings or errors 77 | 78 | CLEANUP 79 | clean Remove temporary and backup files 80 | cleaner Remove files and directories that are easily rebuilt 81 | cleanest Remove all files that can be rebuilt 82 | distclean Remove untracked files and other detritus 83 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Python project 2 | 3 | .DELETE_ON_ERROR: 4 | .PHONY: FORCE 5 | .PRECIOUS: 6 | .SUFFIXES: 7 | 8 | .DEFAULT_GOAL := help 9 | default: help 10 | 11 | ############################################################################ 12 | #= BASIC USAGE 13 | 14 | .PHONY: help 15 | help: ## Display help message 16 | @./sbin/makefile-extract-documentation ${MAKEFILE_LIST} 17 | 18 | ############################################################################ 19 | #= SETUP, INSTALLATION, PACKAGING 20 | 21 | install: devready 22 | .PHONY: devready 23 | devready: ## Prepare local dev env: Create virtual env, install the pre-commit hooks 24 | $(call INFO_MESSAGE, Prepare local dev env: Create virtual env and install the pre-commit hooks) 25 | uv sync --dev 26 | uv run pre-commit install 27 | @echo '⚠️ You must activate the virtual env with `source .venv/bin/activate`' 28 | 29 | .PHONY: build 30 | build: ## Build package 31 | $(call INFO_MESSAGE, "Building package") 32 | rm -fr dist 33 | uv build 34 | 35 | .PHONY: publish 36 | publish: build ## publish 
package to PyPI 37 | $(call INFO_MESSAGE, "Publishing package") 38 | uv publish # Requires UV_PUBLISH_TOKEN or Trusted Publishing setup 39 | 40 | ############################################################################ 41 | #= FORMATTING, TESTING, AND CODE QUALITY 42 | 43 | .PHONY: cqa 44 | cqa: ## Run code quality assessments 45 | $(call INFO_MESSAGE, "Checking lock file consistency") 46 | uv lock --locked 47 | $(call INFO_MESSAGE, "Linting and reformatting files") 48 | uv run pre-commit run 49 | $(call INFO_MESSAGE, "Checking for obsolete dependencies") 50 | uv run deptry src 51 | 52 | .PHONY: test 53 | test: ## Test the code with pytest 54 | @echo "🚀 Testing code: Running pytest" 55 | uv run pytest --cov=. --cov-report=xml 56 | 57 | # to be incorporated 58 | # test-learn: 59 | # VCR_RECORD_MODE=new_episodes pytest -x 60 | 61 | ############################################################################ 62 | #= DOCUMENTATION 63 | 64 | .PHONY: docs-serve 65 | docs-serve: ## Build and serve the documentation 66 | $(call INFO_MESSAGE, "Build and serve docs for local development") 67 | uv run mkdocs serve 68 | 69 | .PHONY: docs-test 70 | docs-test: ## Test if documentation can be built without warnings or errors 71 | $(call INFO_MESSAGE, "Testing whether docs can be build") 72 | uv run mkdocs build -s 73 | 74 | ############################################################################ 75 | #= CLEANUP 76 | 77 | .PHONY: clean 78 | clean: ## Remove temporary and backup files 79 | $(call INFO_MESSAGE, "Remove temporary and backup files") 80 | find . \( -name "*~" -o -name "*.bak" \) -exec rm -frv {} + 81 | 82 | .PHONY: cleaner 83 | cleaner: clean ## Remove files and directories that are easily rebuilt 84 | $(call INFO_MESSAGE, "Remove files and directories that are easily rebuilt") 85 | rm -frv .cache .DS_Store .pytest_cache .ruff_cache build coverage.xml dist docs/_build site 86 | find . \( -name __pycache__ -type d \) -exec rm -frv {} + 87 | find . 
import base64
import binascii

_enc = "ascii"


class Digest(bytes):
    """A sliceable binary digest with printable encodings.

    Supported encodings/decodings (per RFC 4648,
    https://tools.ietf.org/html/rfc4648#page-7):

    * base64
    * base64url (the URL-safe base64 variant)
    * hex (aka base16)

    ``str()`` of a Digest yields the URL-safe base64 form.

    >>> import hashlib
    >>> b = hashlib.sha512().digest()
    >>> len(b)
    64

    >>> d = Digest(b)                 # creation
    >>> str(d)                        # base64url form
    'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg_SpIdNs6c5H0NE8XYXysP-DGNKHfuwvY7kxvUdBeoGlODJ6-SfaPg=='

    >>> d24 = d[:24]                  # slice binary digest at first 24 bytes
    >>> str(d24)
    'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'

    Round-trips:

    >>> d == Digest.from_base64(d.as_base64())
    True
    >>> d == Digest.from_base64url(d.as_base64url())
    True
    >>> d == Digest.from_hex(d.as_hex())
    True
    """

    def __str__(self):
        """Return the URL-safe base64 form of this digest."""
        return self.as_base64url()

    # TODO: Consider requiring slice start == None or 0, and len % 3 == 0;
    # slicing at a non-multiple of 3 gives encoded strings with suffix differences.
    def __getitem__(self, key):
        """Slice or index the digest; the result remains a Digest."""
        return Digest(super().__getitem__(key))

    # ---- base64 ----

    def as_base64(self):
        """Return this digest as a base64-encoded string."""
        encoded = base64.b64encode(self)
        return encoded.decode(_enc)

    @staticmethod
    def from_base64(s):
        """Return a Digest initialized from the base64-encoded string ``s``."""
        raw = base64.b64decode(s)
        return Digest(raw)

    # ---- base64url ----

    def as_base64url(self):
        """Return this digest as a URL-safe, base64-encoded string."""
        encoded = base64.urlsafe_b64encode(self)
        return encoded.decode(_enc)

    @staticmethod
    def from_base64url(s):
        """Return a Digest initialized from the base64url-encoded string ``s``."""
        raw = base64.urlsafe_b64decode(s)
        return Digest(raw)

    # Aliases kept for backward compatibility with earlier versions
    # ("base64url" is the official name for the encoding).
    as_base64us = as_base64url
    from_base64us = from_base64url

    # ---- hex ----

    def as_hex(self):
        """Return this digest as a hex-encoded string."""
        return binascii.hexlify(self).decode(_enc)

    @staticmethod
    def from_hex(s):
        """Return a Digest initialized from the hex-encoded string ``s``."""
        return Digest(binascii.unhexlify(s))


if __name__ == "__main__":  # pragma: nocover
    import hashlib

    d = Digest(hashlib.sha512().digest())
    assert isinstance(d, Digest), "d isn't a Digest"
    d24 = d[:24]
    assert isinstance(d24, Digest), "d24 isn't a Digest"
    e = Digest.from_base64url(d.as_base64url())
    e24 = Digest.from_base64url(d24.as_base64url())
def vmc_digest(data, digest_size=DEFAULT_DIGEST_SIZE):
    """Returns the VMC Digest: a SHA-512 digest of ``data`` truncated to
    ``digest_size`` bytes, as a Digest object with both bytes and string
    (URL-safe, Base 64) representations.

    Args:
        data (str): The string to digest; it is UTF-8 encoded before hashing.
        digest_size (int, optional): Number of leading bytes of the SHA-512
            digest to keep.  Must be a multiple of 3 (so the Base 64 form has
            no ``=`` padding) and between 0 and 63 inclusive.  Defaults to 24.

    Returns:
        Digest: The truncated digest.  Its string form is Base 64 encoded
        with URL-safe characters [2] (suitable for URLs and filesystem
        paths) and is ``4/3 * digest_size`` characters long.

    Raises:
        ValueError: If ``digest_size`` is not a multiple of 3, or lies
            outside the range 0..63.

    >>> d = vmc_digest("")
    >>> str(d)
    'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'
    >>> len(d), len(str(d))
    (24, 32)

    >>> vmc_digest("", 17)
    Traceback (most recent call last):
    ...
    ValueError: digest_size must be a multiple of 3

    >>> vmc_digest("", 66)
    Traceback (most recent call last):
    ...
    ValueError: digest_size must be between 0 and 63 (bytes)

    Rationale: SHA-512 is roughly 2x faster than SHA-1 on modern 64-bit
    platforms, and truncating its output [1] gives a tunable level of
    collision avoidance.  Per [3], the collision probability for ``b`` bits
    and ``m`` messages is ``P(b, m) = m^2 / 2^(b+1)`` — it depends on the
    number of messages, not their size.  For example, with ``1e+18``
    expected messages and a target of ``P < 1e-15``, ``digest_size = 15``
    bytes suffices.

    References:
      - [1] http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
      - [2] https://tools.ietf.org/html/rfc3548#section-4
      - [3] http://stackoverflow.com/a/4014407/342839
      - [4] http://stackoverflow.com/a/22029380/342839
      - [5] http://preshing.com/20110504/hash-collision-probabilities/
      - [6] https://en.wikipedia.org/wiki/Birthday_problem
    """

    # TODO: Consider relaxing the %3 constraint and stripping padding instead.
    if digest_size % 3:
        raise ValueError("digest_size must be a multiple of 3")
    if digest_size < 0 or digest_size > 63:
        raise ValueError("digest_size must be between 0 and 63 (bytes)")

    full_digest = Digest(hashlib.sha512(data.encode(ENC)).digest())
    return full_digest[:digest_size]
15 | """ 16 | if "NCBI_API_KEY" in os.environ: 17 | del os.environ["NCBI_API_KEY"] 18 | 19 | 20 | @vcr.use_cassette 21 | def test_fetch_seq(): 22 | assert 1596 == len(fetch_seq("NP_056374.2")) 23 | 24 | assert "MESRETLSSS" == fetch_seq("NP_056374.2", 0, 10) 25 | assert "MESRETLSSS" == fetch_seq("NP_056374.2")[0:10] # NOT RECOMMENDED 26 | 27 | assert "ATCACACGTGCAGGAACCCTTTTCC" == fetch_seq("NC_000001.10", 2000000, 2000025) 28 | assert "AAAATTAAATTAAAATAAATAAAAA" == fetch_seq("NG_032072.1", 0, 25) 29 | assert "TTGTGTGTTAGGGTGCTCTAAGCAA" == fetch_seq("NW_003571030.1", 0, 25) 30 | assert "GAATTCCTCGTTCACACAGTTTCTT" == fetch_seq("NT_113901.1", 0, 25) 31 | assert "NNNNNNNNNNNNNNNNNNNNNNNNN" == fetch_seq("NC_000001.10", 0, 25) 32 | assert "MESRETLSSSRQRGGESDFLPVSSA" == fetch_seq("NP_056374.2", 0, 25) 33 | assert "GATCCACCTGCCTCAGCCTCCCAGA" == fetch_seq("GL000191.1", 0, 25) 34 | assert "TTTATTTATTTTAGATACTTATCTC" == fetch_seq("KB663603.1", 0, 25) 35 | assert "CCGCTCGGGCCCCGGCTCTCGGTTA" == fetch_seq("ENST00000288602.11", 0, 25) 36 | assert "MAALSGGGGGGAEPGQALFNGDMEP" == fetch_seq("ENSP00000288602", 0, 25) 37 | 38 | 39 | ENST00000617537_470_480 = { 40 | # In [16]: s_gen[470:480], s_cdna[470:480], s_cds[470:480] 41 | # Out[16]: ("TAGGTATGCA", "TAGGGTGTGT", "TGACATTTGT") 42 | "genomic": "TAGGTATGCA", 43 | "cdna": "TAGGGTGTGT", 44 | "cds": "TGACATTTGT", 45 | } 46 | 47 | 48 | @vcr.use_cassette 49 | def test_fetch_ENST00000617537_noenv(caplog, monkeypatch): 50 | """ensure expected lengths for ENST00000617537 with ENST_DEFAULT_SEQ_TYPE unset""" 51 | monkeypatch.delenv("ENST_DEFAULT_SEQ_TYPE", raising=False) 52 | ac = "ENST00000617537" 53 | assert ENST00000617537_470_480[enst_default_seq_type] == fetch_seq(ac, start_i=470, end_i=480) 54 | assert "Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE" in caplog.text 55 | assert ENST00000617537_470_480["genomic"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="genomic") 56 | assert ENST00000617537_470_480["cdna"] == 
@vcr.use_cassette
def test_add_eutils_api_key():
    """_add_eutils_api_key should append the api_key query parameter only
    when NCBI_API_KEY is present in the environment."""
    url = "http://test.com?boo=bar"
    try:
        # without the env var, the URL passes through unchanged
        assert _add_eutils_api_key(url) == url
        os.environ["NCBI_API_KEY"] = "test-api-key"
        assert _add_eutils_api_key(url) == url + "&api_key=test-api-key"
    finally:
        # remove the var if present (equivalent to try/del/except KeyError)
        os.environ.pop("NCBI_API_KEY", None)
# no vcr!
@pytest.mark.network
def test_rate_limit():
    """Issue several concurrent requests to verify that eutils rate limiting
    does not cause failures.

    Fix: the original created a Pool without a context manager, so worker
    processes leaked if ``map()`` raised; ``with`` guarantees cleanup.
    """
    num_requests = num_threads = 5
    with multiprocessing.Pool(num_threads) as pool:
        pool.map(_check1, range(num_requests))
        # close/join preserve the original graceful-shutdown behavior;
        # the context manager's terminate() on exit is then a no-op.
        pool.close()
        pool.join()
def parse_options(argv):
    """Parse command-line arguments for the assembly-to-json script.

    Args:
        argv (list of str): full argument vector; ``argv[0]`` is the program name.

    Returns:
        argparse.Namespace: with ``ASSEMBLIES`` (list of accessions or
        filenames) and ``prefix`` (output directory) attributes.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("ASSEMBLIES", nargs="+", help="accessions (GCF or GCA) or filenames of downloaded assemblies")
    parser.add_argument("--prefix", "-p", required=True, help="directory prefix for saving files -- must exist")
    return parser.parse_args(argv[1:])
def process1(opts, assy_id_or_name):
    """Convert a single downloaded NCBI assembly report to a JSON file.

    Args:
        opts: parsed options; ``opts.prefix`` is the output directory.
        assy_id_or_name (str): path of a downloaded assembly report file.

    Returns:
        str: path of the JSON file that was written.
    """
    # Fix: the original `open(...).read()` leaked the input file handle
    # until GC; a context manager closes it deterministically.
    with open(assy_id_or_name, "r") as in_fd:
        content = in_fd.read()

    assy = AssemblyParser(content)

    obj = {
        "name": assy.name,
        "description": assy.description,
        "date": assy.date,
        "submitter": assy.submitter,
        "genbank_ac": assy.genbank_accession,
        "refseq_ac": assy.refseq_accession,
        "sequences": [build_seq_rec(sr) for sr in assy.sequences],
    }

    out_fn = os.path.join(opts.prefix, assy.name + ".json")

    with open(out_fn, "w") as out_fd:
        # conventional positional (obj, fp) argument order for json.dump
        json.dump(obj, out_fd, sort_keys=True, indent=2)
    return out_fn
"ruff>=0.12", 17 | "tox-uv>=1.28", 18 | "vcrpy" 19 | ] 20 | 21 | [project] 22 | authors = [ 23 | {email = "biocommons-dev@googlegroups.com", name = "biocommons contributors"} 24 | ] 25 | classifiers = [ 26 | "Intended Audience :: Developers", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | "Programming Language :: Python :: 3.13", 31 | "Programming Language :: Python", 32 | "Topic :: Software Development :: Libraries :: Python Modules" 33 | ] 34 | dependencies = [ 35 | "attrs", 36 | "requests" 37 | ] 38 | description = "miscellaneous simple bioinformatics utilities and lookup tables" 39 | dynamic = ["version"] 40 | keywords = [ 41 | "HGVS", 42 | "biocommons", 43 | "bioinformatics", 44 | "genomics", 45 | "variation" 46 | ] 47 | license = "Apache-2.0" 48 | name = "bioutils" 49 | readme = "README.md" 50 | requires-python = ">=3.11" 51 | 52 | [project.urls] 53 | Documentation = "https://biocommons.github.io/bioutils/" 54 | Homepage = "https://github.com/biocommons/bioutils" 55 | Issues = "https://github.com/biocommons/bioutils/issues" 56 | Repository = "https://github.com/biocommons/bioutils" 57 | 58 | [tool.coverage.report] 59 | exclude_lines = [ 60 | # Have to re-enable the standard pragma 61 | "pragma: no cover", 62 | 63 | # Don't complain about missing debug-only code: 64 | "def __repr__", 65 | "if self.debug", 66 | 67 | # Don't complain if tests don't hit defensive assertion code: 68 | "raise AssertionError", 69 | "raise NotImplementedError", 70 | 71 | # Don't complain if non-runnable code isn't run: 72 | "if __name__ == .__main__.:", 73 | ] 74 | show_missing = true 75 | skip_empty = true 76 | 77 | [tool.coverage.run] 78 | branch = true 79 | omit = ["*/test/*", "*/tests/*", "*_test.py"] 80 | source = ["src"] 81 | 82 | [tool.deptry] 83 | 84 | [tool.deptry.package_module_name_map] 85 | # map package name to import name 86 | # Making this explicit suppresses deptry notices 
87 | coloredlogs = "coloredlogs" 88 | mkdocs = "mkdocs" 89 | mkdocs-material = "mkdocs_material" 90 | mkdocstrings = "mkdocstrings" 91 | mypy = "mypy" 92 | pre-commit = "pre_commit" 93 | pytest = "pytest" 94 | pytest-cov = "pytest_cov" 95 | pyyaml = "yaml" 96 | ruff = "ruff" 97 | tox-uv = "tox_uv" 98 | ty = "ty" 99 | 100 | [tool.pytest.ini_options] 101 | addopts = "-s -v -x --strict-markers -m 'not extra' --doctest-modules --cov=src" 102 | doctest_optionflags = [ 103 | "ALLOW_BYTES", 104 | "ALLOW_UNICODE", 105 | "ELLIPSIS", 106 | "IGNORE_EXCEPTION_DETAIL", 107 | "NORMALIZE_WHITESPACE" 108 | ] 109 | markers = [ 110 | "network: tests that require network connectivity", 111 | "slow: slow tests that should be run infrequently" 112 | ] 113 | testpaths = ["tests"] 114 | 115 | [tool.ruff] 116 | fix = true 117 | line-length = 100 118 | src = ["src", "tests"] 119 | target-version = "py39" 120 | 121 | [tool.ruff.format] 122 | docstring-code-format = true 123 | preview = true 124 | quote-style = "double" 125 | 126 | [tool.ruff.lint] 127 | fixable = [ 128 | "B", 129 | "C4", 130 | "D", 131 | "EM", 132 | "F401", 133 | "F541", 134 | "I", 135 | "PERF", 136 | "PIE", 137 | "PT", 138 | "RET", 139 | "RSE", 140 | "RUF", 141 | "SIM", 142 | "UP" 143 | ] 144 | ignore = [ 145 | "E111", 146 | "E114", 147 | "E117", 148 | "E501", 149 | "E731", 150 | "PLR0913", 151 | "S321", 152 | "W191" 153 | ] 154 | select = [ 155 | "A", # https://docs.astral.sh/ruff/rules/#flake8-builtins-a 156 | "ARG", # https://docs.astral.sh/ruff/rules/#flake8-unused-arguments-arg 157 | "B", # https://docs.astral.sh/ruff/rules/#flake8-bugbear-b 158 | "C4", # https://docs.astral.sh/ruff/rules/#flake8-comprehensions-c4 159 | "DTZ", # https://docs.astral.sh/ruff/rules/#flake8-datetimez-dtz 160 | "E", 161 | "EM", # https://docs.astral.sh/ruff/rules/#flake8-errmsg-em 162 | "F", # https://docs.astral.sh/ruff/rules/#pyflakes-f 163 | "G", # https://docs.astral.sh/ruff/rules/#flake8-logging-format-g 164 | "I", # 
https://docs.astral.sh/ruff/rules/#isort-i 165 | "LOG", # https://docs.astral.sh/ruff/rules/#flake8-logging-log 166 | "N", # https://docs.astral.sh/ruff/rules/#pep8-naming-n 167 | "PERF", # https://docs.astral.sh/ruff/rules/#perflint-perf 168 | "PIE", # https://docs.astral.sh/ruff/rules/#flake8-pie-pie 169 | "PL", # https://docs.astral.sh/ruff/rules/#pylint-pl 170 | "PT", # https://docs.astral.sh/ruff/rules/#flake8-pytest-style-pt 171 | "PTH", # https://docs.astral.sh/ruff/rules/#flake8-use-pathlib-pth 172 | "RET", # https://docs.astral.sh/ruff/rules/#flake8-return-ret 173 | "RSE", # https://docs.astral.sh/ruff/rules/#flake8-raise-rse 174 | "RUF", # https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf 175 | "S", # https://docs.astral.sh/ruff/rules/#flake8-bandit-s 176 | "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim 177 | "TRY", # https://docs.astral.sh/ruff/rules/#tryceratops-try 178 | "UP", # https://docs.astral.sh/ruff/rules/#pyupgrade-up 179 | "W", # https://docs.astral.sh/ruff/rules/#pycodestyle-e-w 180 | "YTT" # https://docs.astral.sh/ruff/rules/#flake8-2020-ytt 181 | ] 182 | 183 | [tool.ruff.lint.per-file-ignores] 184 | "tests/*" = ["S101"] 185 | 186 | [tool.setuptools] 187 | include-package-data = true 188 | 189 | [tool.setuptools.package-data] 190 | "*" = ["_data/*/*.json.gz"] 191 | 192 | [tool.setuptools.packages.find] 193 | exclude = ["*.pyc", "__pycache__"] 194 | # namespaces = true 195 | where = ["src"] 196 | 197 | [tool.setuptools_scm] 198 | -------------------------------------------------------------------------------- /tests/test_normalize.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import pytest 4 | 5 | from bioutils.normalize import NormalizationMode, normalize 6 | 7 | sequence = "CCCCCCCCACACACACACTAGCAGCAGCA" 8 | 9 | normalize_seq = partial(normalize, sequence=sequence) 10 | normalize_trim = partial(normalize_seq, 
mode=NormalizationMode.TRIMONLY) 11 | normalize_trim_no_shuffle = partial(normalize_seq, mode=None, trim=True) 12 | normalize_no_trim_no_shuffle = partial(normalize_seq, mode=None, trim=False) 13 | normalize_left = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE) 14 | normalize_right = partial(normalize_seq, mode=NormalizationMode.RIGHTSHUFFLE) 15 | normalize_expand = partial(normalize_seq, mode=NormalizationMode.EXPAND) 16 | normalize_vcf = partial(normalize_seq, mode=NormalizationMode.VCF) 17 | normalize_left_no_trim = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE, trim=False) 18 | normalize_right_no_trim = partial(normalize_seq, mode=NormalizationMode.RIGHTSHUFFLE, trim=False) 19 | normalize_expand_no_trim = partial(normalize_seq, mode=NormalizationMode.EXPAND, trim=False) 20 | normalize_vcf_no_trim = partial(normalize_seq, mode=NormalizationMode.VCF, trim=False) 21 | 22 | 23 | @pytest.mark.parametrize("normalize_fn", [normalize_trim, normalize_trim_no_shuffle]) 24 | def test_trim(normalize_fn): 25 | """Should trim common prefix and suffix when trim=True.""" 26 | assert ((25, 25), ("", "AC")) == normalize_fn(interval=(22, 25), alleles=(None, "AGCAC")) 27 | assert ((24, 25), ("C", "", "CAC")) == normalize_fn(interval=(22, 25), alleles=(None, "AG", "AGCAC")) 28 | assert ((23, 24), ("G", "", "GCA")) == normalize_fn(interval=(22, 25), alleles=(None, "AC", "AGCAC")) 29 | assert ((22, 24), ("AG", "G", "AGCA")) == normalize_fn(interval=(22, 25), alleles=(None, "GC", "AGCAC")) 30 | 31 | 32 | @pytest.mark.parametrize("normalize_fn", [normalize_trim, normalize_trim_no_shuffle]) 33 | def test_anchor(normalize_fn): 34 | assert ((23, 25), ("GC", "")) == normalize_fn(interval=(22, 25), alleles=(None, "A"), anchor_length=0) 35 | assert ((22, 26), ("AGCA", "AA")) == normalize_fn(interval=(22, 25), alleles=(None, "A"), anchor_length=1) 36 | assert ((21, 27), ("CAGCAG", "CAAG")) == normalize_fn(interval=(22, 25), alleles=(None, "A"), anchor_length=2) 37 | 38 
def test_bounds():
    """ensure that bounds are honored"""
    observed = normalize_expand(interval=(22, 22), alleles=(None, "AGC"), bounds=(20, 24))
    assert observed == ((20, 24), ("GCAG", "GCAGCAG"))
def test_input_alleles_not_modified():
    """The caller's alleles tuple must be left untouched by normalization."""
    original = (None, "AGCAC")
    normalize_trim(interval=(22, 25), alleles=original)
    assert original == (None, "AGCAC")
def test_error_vcf_mode_no_trim():
    """VCF normalization mode requires trimming; trim=False must be rejected."""
    with pytest.raises(ValueError) as err:
        normalize_vcf_no_trim(interval=(22, 25), alleles=(None, "AGC"))
    assert str(err.value) == "May not disable trimming with VCF normalization mode"
def get_assembly(name):
    """Retrieves the assembly data for a given assembly.

    Args:
        name (str): The name of the assembly to retrieve data for.

    Returns:
        dict: A dictionary of the assembly data. See examples for details.

    Raises:
        FileNotFoundError: If no bundled data file exists for ``name``.

    Examples:
        >>> assy = get_assembly('GRCh37.p13')

        >>> assy['name']
        'GRCh37.p13'

        >>> assy['description']
        'Genome Reference Consortium Human Build 37 patch release 13 (GRCh37.p13)'

        >>> assy['refseq_ac']
        'GCF_000001405.25'

        >>> assy['genbank_ac']
        'GCA_000001405.14'

        >>> len(assy['sequences'])
        297

        >>> import pprint
        >>> pprint.pprint(assy['sequences'][0])
        {'aliases': ['chr1'],
         'assembly_unit': 'Primary Assembly',
         'genbank_ac': 'CM000663.1',
         'length': 249250621,
         'name': '1',
         'refseq_ac': 'NC_000001.10',
         'relationship': '=',
         'sequence_role': 'assembled-molecule'}
    """
    fn = resources.files(__package__) / "_data" / "assemblies" / f"{name}.json.gz"
    if not fn.exists():
        # include the assembly name so callers can tell what was missing
        # (the original raised a bare FileNotFoundError with no message)
        raise FileNotFoundError(f"no bundled assembly data for {name!r}")

    # context manager closes the gzip handle deterministically;
    # the original left it to garbage collection
    with gzip.open(fn, mode="rt", encoding="utf-8") as fh:
        return json.load(fh)
def make_name_ac_map(assy_name, primary_only=False):
    """Creates a map from sequence names to accessions for a given assembly.

    Args:
        assy_name (str): The name of the assembly to make a map for.
        primary_only (bool, optional): Whether to include only primary sequences.
            Defaults to False.

    Returns:
        dict: A dictionary of the form ``{sequence_name : accession}`` for sequences
        in the given assembly, where sequence_name and accession are both strings.

    Examples:
        >>> grch38p5_name_ac_map = make_name_ac_map('GRCh38.p5')
        >>> grch38p5_name_ac_map['1']
        'NC_000001.11'
    """

    sequences = get_assembly(assy_name)["sequences"]
    if primary_only:
        # restrict to sequences belonging to the primary assembly unit
        sequences = (s for s in sequences if _is_primary(s))
    return {s["name"]: s["refseq_ac"] for s in sequences}
150 | 151 | Returns: 152 | dict: A dictionary of the form ``{accesssion : sequence_name}`` for accessions in the given assembly, 153 | where accession and sequence_name are strings. 154 | 155 | 156 | Examples: 157 | >>> grch38p5_ac_name_map = make_ac_name_map('GRCh38.p5') 158 | >>> grch38p5_ac_name_map['NC_000001.11'] 159 | '1' 160 | """ 161 | 162 | return { 163 | s["refseq_ac"]: s["name"] for s in get_assembly(assy_name)["sequences"] if (not primary_only or _is_primary(s)) 164 | } 165 | 166 | 167 | ############################################################################ 168 | # Internal functions 169 | 170 | 171 | def _is_primary(s): 172 | """Indicates whether a sequence is a part of the primary assembly. 173 | 174 | Args: 175 | s (dict): A dictionary of sequence data, e.g. those in assembly['sequences']. 176 | 177 | Returns: 178 | bool: True if the sequence is part of the primary assembly, False otherwise. 179 | 180 | 181 | Examples: 182 | >>> _is_primary({'assembly_unit': 'Primary Assembly'}) 183 | True 184 | 185 | >>> _is_primary({'assembly_unit': 'Something else entirely'}) 186 | False 187 | """ 188 | 189 | return s["assembly_unit"] == "Primary Assembly" 190 | -------------------------------------------------------------------------------- /src/bioutils/digests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import base64 4 | import hashlib 5 | 6 | from .sequences import normalize_sequence 7 | from .vmc_digest import vmc_digest 8 | 9 | 10 | def seq_seqhash(seq, normalize=True): 11 | """Converts sequence to 24-byte Truncated Digest. 12 | 13 | Args: 14 | seq (str): A sequence. 15 | normalize (bool, optional): Whether to normalize the sequence before conversion, 16 | i.e. to ensure representation as uppercase letters without whitespace or asterisks. 17 | Defaults to ``True``. 18 | 19 | Returns: 20 | str: 24-byte Truncated Digest representation of sequence. 
def seq_seguid(seq, normalize=True):
    """Converts sequence to seguid.

    This seguid is compatible with BioPython's seguid.

    Args:
        seq (str): A sequence.
        normalize (bool, optional): Whether to normalize the sequence before conversion,
            i.e. to ensure representation as uppercase letters without whitespace or asterisks.
            Defaults to ``True``.

    Returns:
        str: seguid representation of sequence.

    Examples:
        >>> seq_seguid('')
        '2jmj7l5rSw0yVb/vlWAYkK/YBwk'

        >>> seq_seguid('ACGT')
        'IQiZThf2zKn/I1KtqStlEdsHYDQ'

        >>> seq_seguid('acgt')
        'IQiZThf2zKn/I1KtqStlEdsHYDQ'

        >>> seq_seguid('acgt', normalize=False)
        'lII0AoG1/I8qKY271rgv5CFZtsU'
    """

    if normalize:
        seq = normalize_sequence(seq)
    # seguid = base64-encoded SHA-1 of the ASCII sequence, sans '=' padding
    digest = hashlib.sha1(seq.encode("ascii")).digest()
    return base64.b64encode(digest).decode("ascii").rstrip("=")
def seq_sha1(seq, normalize=True):
    """Converts sequence to unicode sha1 hexdigest.

    Args:
        seq (str): A sequence.
        normalize (bool, optional): Whether to normalize the sequence before conversion,
            i.e. to ensure representation as uppercase letters without whitespace or asterisks before encoding.
            Defaults to ``True``.

    Returns:
        str: Unicode sha1 hexdigest representation of sequence.

    Examples:
        >>> seq_sha1('')
        'da39a3ee5e6b4b0d3255bfef95601890afd80709'

        >>> seq_sha1('ACGT')
        '2108994e17f6cca9ff2352ada92b6511db076034'

        >>> seq_sha1('acgt')
        '2108994e17f6cca9ff2352ada92b6511db076034'

        >>> seq_sha1('acgt', normalize=False)
        '9482340281b5fc8f2a298dbbd6b82fe42159b6c5'
    """

    if normalize:
        seq = normalize_sequence(seq)
    return hashlib.sha1(seq.encode("ascii")).hexdigest()
def seq_vmc_id(seq, normalize=True):
    """Converts sequence to VMC id.

    See https://github.com/ga4gh/vmc

    Args:
        seq (str): A sequence.
        normalize (bool, optional): Whether to normalize the sequence before conversion,
            i.e. to ensure representation as uppercase letters without whitespace or asterisks.
            Defaults to ``True``.

    Returns:
        str: VMC id representation of sequence.

    Examples:
        >>> seq_vmc_id("")
        'VMC:GS_z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'

        >>> seq_vmc_id("ACGT")
        'VMC:GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'

        >>> seq_vmc_id("acgt")
        'VMC:GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'

        >>> seq_vmc_id("acgt", normalize=False)
        'VMC:GS_eFwawHHdibaZBDcs9kW3gm31h1NNJcQe'
    """

    # the id is the namespace and accession of the identifier record,
    # joined with a colon
    ident = seq_vmc_identifier(seq, normalize)
    return f"{ident['namespace']}:{ident['accession']}"
205 | 206 | See https://github.com/ga4gh/vmc 207 | 208 | Args: 209 | seq (str): A sequence. 210 | normalize (bool, optional): Whether to normalize the sequence before conversion, 211 | i.e. to ensure representation as uppercase letters without whitespace or asterisks. 212 | Defaults to ``True``. 213 | 214 | Returns: 215 | str: VMC identifier (record) representation of sequnce. 216 | 217 | Examples: 218 | >>> seq_vmc_identifier("") == {'namespace': 'VMC', 'accession': 'GS_z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'} 219 | True 220 | 221 | >>> seq_vmc_identifier("ACGT") == {'namespace': 'VMC', 'accession': 'GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'} 222 | True 223 | 224 | >>> seq_vmc_identifier("acgt") == {'namespace': 'VMC', 'accession': 'GS_aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'} 225 | True 226 | 227 | >>> seq_vmc_identifier("acgt", normalize=False) == {'namespace': 'VMC', 'accession': 'GS_eFwawHHdibaZBDcs9kW3gm31h1NNJcQe'} 228 | True 229 | """ 230 | 231 | seq = normalize_sequence(seq) if normalize else seq 232 | return {"namespace": "VMC", "accession": "GS_" + str(vmc_digest(seq))} 233 | -------------------------------------------------------------------------------- /src/bioutils/accessions.py: -------------------------------------------------------------------------------- 1 | """Simple routines to deal with accessions, identifiers, etc. 2 | 3 | Biocommons terminology: an identifier is composed of a *namespace* and 4 | an *accession*. The namespace is a string, composed of any character 5 | other than colon (:). The accession is a string without character set 6 | restriction. An accession is expected to be unique within the 7 | namespace; there is no expectation of uniqueness of accessions across 8 | namespaces. 
``Identifier := <Namespace>:<Accession>``

``Namespace := [^:]+``

``Accession := \\w+``


Some sample serializations of Identifiers:

``json: {"namespace": "RefSeq", "accession": "NM_000551.3"}``

``xml: <Identifier namespace="RefSeq" accession="NM_000551.3"/>``

``string: "RefSeq:NM_000551.3"``

The string form may be used as a CURIE, in which case the document in
which the CURIE is used must contain a map of ``{namespace : uri}``.
"""

import re

from .exceptions import BioutilsError

# Species prefixes used by Ensembl stable ids (e.g. ENSMUS for mouse),
# joined into a regexp alternation used by _ensembl_re below.
_ensembl_species_prefixes = "|".join(
    """ENS ENSACA ENSAME ENSAMX
ENSANA ENSAPL ENSBTA ENSCAF ENSCAN ENSCAP ENSCAT ENSCCA ENSCEL ENSCGR
ENSCGR ENSCHI ENSCHO ENSCIN ENSCJA ENSCLA ENSCPO ENSCSA ENSCSAV ENSDAR
ENSDNO ENSDOR ENSEBU ENSECA ENSEEU ENSETE ENSFAL ENSFCA ENSFDA ENSGAC
ENSGAL ENSGGO ENSGMO ENSHGLF ENSHGLM ENSJJA ENSLAC ENSLAF ENSLOC
ENSMAU ENSMEU ENSMFA ENSMGA ENSMIC ENSMLE ENSMLU ENSMMU ENSMNE ENSMOC
ENSMOD ENSMPU ENSMUS ENSNGA ENSNLE ENSOAN ENSOAR ENSOCU ENSODE ENSOGA
ENSONI ENSOPR ENSORL ENSPAN ENSPCA ENSPCO ENSPEM ENSPFO ENSPMA ENSPPA
ENSPPR ENSPPY ENSPSI ENSPTI ENSPTR ENSPVA ENSRBI ENSRNO ENSRRO ENSSAR
ENSSBO ENSSCE ENSSHA ENSSSC ENSSTO ENSTBE ENSTGU ENSTNI ENSTRU ENSTSY
ENSTTR ENSVPA ENSXET ENSXMA FB MGP_129S1SvImJ_ MGP_AJ_ MGP_AKRJ_
MGP_BALBcJ_ MGP_C3HHeJ_ MGP_C57BL6NJ_ MGP_CAROLIEiJ_ MGP_CASTEiJ_
MGP_CBAJ_ MGP_DBA2J_ MGP_FVBNJ_ MGP_LPJ_ MGP_NODShiLtJ_ MGP_NZOHlLtJ_
MGP_PWKPhJ_ MGP_PahariEiJ_ MGP_SPRETEiJ_ MGP_WSBEiJ_""".split()
)
# Single-letter (or short) Ensembl feature-type codes (E=exon, G=gene,
# T=transcript, P=protein, etc.).
_ensembl_feature_types_re = r"E|FM|G|GT|P|R|T"
# species prefix + feature type + 11 digits + optional ".version"
_ensembl_re = r"^(?:{})(?:{}){}$".format(_ensembl_species_prefixes, _ensembl_feature_types_re, r"\d{11}(?:\.\d+)?")

# map of regexp => namespace
# TODO: make this namespace => [regexps] for clarity
# namespaces follow convention of identifiers.org
ac_namespace_regexps = {
    # https://uswest.ensembl.org/info/genome/stable_ids/prefixes.html
    # [species prefix][feature type prefix][a unique eleven digit number]
    # N.B. The regexp at http://identifiers.org/ensembl appears broken:
    # 1) Human only; 2) escaped backslashes (\\d rather than \d).
    _ensembl_re: "ensembl",
    # http://identifiers.org/insdc/
    # P12345, a UniProtKB accession matches the miriam regexp but shouldn't (I think)
    r"^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$": "insdc",
    # http://identifiers.org/refseq/
    # https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/
    r"^((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+|(NZ\_[A-Z]{4}\d+))(\.\d+)?$": "refseq",
    # Uniprot
    # http://identifiers.org/uniprot/
    # https://www.uniprot.org/help/accession_numbers
    r"^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$": "uniprot",
}

# Pre-compile the patterns once at import time; keys become compiled regexps.
ac_namespace_regexps = {re.compile(k): v for k, v in ac_namespace_regexps.items()}


def chr22XY(c):
    """Reformats chromosome to be of the form Chr1, ..., Chr22, ChrX, ChrY, etc.

    Numeric chromosomes 23 and 24 are mapped to X and Y respectively.

    Args:
        c (str or int): A chromosome.

    Returns:
        str: The reformatted chromosome, always with a "chr" prefix.

    Examples:
        >>> chr22XY('1')
        'chr1'

        >>> chr22XY(1)
        'chr1'

        >>> chr22XY('chr1')
        'chr1'

        >>> chr22XY(23)
        'chrX'

        >>> chr22XY(24)
        'chrY'

        >>> chr22XY("X")
        'chrX'

        >>> chr22XY("23")
        'chrX'

        >>> chr22XY("M")
        'chrM'
    """
    c = str(c)
    # strip any existing "chr" prefix so the remapping below sees the bare name
    if c[0:3] == "chr":
        c = c[3:]
    if c == "23":
        c = "X"
    if c == "24":
        c = "Y"
    return "chr" + c
def coerce_namespace(ac):
    """Prefixes accession with inferred namespace if not present.

    Intended to be used to promote consistent and unambiguous accession identifiers.

    Args:
        ac (str): The accession, with or without namespace prefixed.

    Returns:
        str: An identifier of the form "{namespace}:{accession}".

    Raises:
        ValueError: If accession syntax does not match the syntax of any namespace.

    Examples:
        >>> coerce_namespace("refseq:NM_01234.5")
        'refseq:NM_01234.5'

        >>> coerce_namespace("NM_01234.5")
        'refseq:NM_01234.5'

        >>> coerce_namespace("bogus:QQ_01234.5")
        'bogus:QQ_01234.5'

        >>> coerce_namespace("QQ_01234.5")
        Traceback (most recent call last):
        ...
        ValueError: Could not infer namespace for QQ_01234.5
    """
    # Anything already containing a colon is assumed to be namespaced;
    # it is returned unchanged, even if the namespace is unknown.
    if ":" not in ac:
        ns = infer_namespace(ac)
        if ns is None:
            raise ValueError(f"Could not infer namespace for {ac}")
        ac = ns + ":" + ac
    return ac


def infer_namespace(ac):
    """Infers a unique namespace from an accession, if one exists.

    Args:
        ac (str): An accession, without the namespace prefix.

    Returns:
        str or None: The unique namespace corresponding to accession syntax, if only one is inferred.
        None if the accession syntax does not match any namespace.

    Raises:
        BioutilsError: If multiple namespaces match the syntax of the accession.

    Examples:
        >>> infer_namespace("ENST00000530893.6")
        'ensembl'

        >>> infer_namespace("NM_01234.5")
        'refseq'

        >>> infer_namespace("A2BC19")
        'uniprot'

        Disabled because Python 2 and 3 format exception tracebacks differently.

        >>> infer_namespace("P12345") # doctest: +SKIP
        Traceback (most recent call last):
        ...
        bioutils.exceptions.BioutilsError: Multiple namespaces possible for P12345

        >>> infer_namespace("BOGUS99") is None
        True
    """

    namespaces = infer_namespaces(ac)
    if not namespaces:
        return None
    if len(namespaces) > 1:
        # e.g. P12345 is syntactically valid in both insdc and uniprot
        raise BioutilsError(f"Multiple namespaces possible for {ac}")
    return namespaces[0]


def infer_namespaces(ac):
    """Infers namespaces possible for a given accession, based on syntax.

    Args:
        ac (str): An accession, without the namespace prefix.

    Returns:
        list of str: A list of namespaces matching the accession, possibly empty.

    Examples:
        >>> infer_namespaces("ENST00000530893.6")
        ['ensembl']

        >>> infer_namespaces("ENST00000530893")
        ['ensembl']

        >>> infer_namespaces("ENSQ00000530893")
        []

        >>> infer_namespaces("NM_01234")
        ['refseq']

        >>> infer_namespaces("NM_01234.5")
        ['refseq']

        >>> infer_namespaces("NQ_01234.5")
        []

        >>> infer_namespaces("A2BC19")
        ['uniprot']

        >>> sorted(infer_namespaces("P12345"))
        ['insdc', 'uniprot']

        >>> infer_namespaces("A0A022YWF9")
        ['uniprot']
    """
    # ac_namespace_regexps maps compiled regexp -> namespace name (module level)
    return [v for k, v in ac_namespace_regexps.items() if k.match(ac)]
251 | 252 | Examples: 253 | >>> prepend_chr('22') 254 | 'chr22' 255 | 256 | >>> prepend_chr('chr22') 257 | 'chr22' 258 | """ 259 | return chr if chr[0:3] == "chr" else "chr" + chr 260 | 261 | 262 | def strip_chr(chr): 263 | """Removes the 'chr' prefix if present. 264 | 265 | Args: 266 | chr (str): The chromosome. 267 | 268 | Returns: 269 | str: The chromosome without a 'chr' prefix. 270 | 271 | Examples: 272 | >>> strip_chr('22') 273 | '22' 274 | 275 | >>> strip_chr('chr22') 276 | '22' 277 | """ 278 | return chr[3:] if chr[0:3] == "chr" else chr 279 | 280 | 281 | ## 282 | ## Copyright 2014 Bioutils Contributors (https://bitbucket.org/biocommons/bioutils) 283 | ## 284 | ## Licensed under the Apache License, Version 2.0 (the "License"); 285 | ## you may not use this file except in compliance with the License. 286 | ## You may obtain a copy of the License at 287 | ## 288 | ## http://www.apache.org/licenses/LICENSE-2.0 289 | ## 290 | ## Unless required by applicable law or agreed to in writing, software 291 | ## distributed under the License is distributed on an "AS IS" BASIS, 292 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 293 | ## See the License for the specific language governing permissions and 294 | ## limitations under the License. 
# -*- coding: utf-8 -*-
"""Provides sequence fetching from NCBI and Ensembl."""

import logging
import os
import random
import re
import time

import requests

_logger = logging.getLogger(__name__)

# Reece requested registration on 2017-09-03
ncbi_tool = "bioutils"
ncbi_email = "biocommons-dev@googlegroups.com"
retry_limit = 3
enst_default_seq_type = "cdna"


def fetch_seq(ac, start_i=None, end_i=None, **rest):
    """Fetches sequences and subsequences from NCBI eutils and Ensembl REST interfaces.

    Args:
        ac (str): The accession of the sequence to fetch.
        start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``.
            It is recommended to retrieve a subsequence by providing an index here, rather than by
            Python slicing the whole sequence.
        end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch. Defaults to ``None``.
            It is recommended to retrieve a subsequence by providing an index here, rather than by
            Python slicing the whole sequence.
        **rest: Additional keyword arguments passed through to the selected fetcher
            (e.g. ``seq_type`` for Ensembl transcripts).

    Returns:
        str: The requested sequence.

    Raises:
        RuntimeError: If the syntax doesn't match that of any of the databases.
        RuntimeError: If the request to the database fails.

    Examples:
        >>> len(fetch_seq('NP_056374.2'))
        1596

        >>> fetch_seq('NP_056374.2',0,10) # This!
        'MESRETLSSS'

        >>> fetch_seq('NP_056374.2')[0:10] # Not this!
        'MESRETLSSS'

        # Providing intervals is especially important for large sequences:

        >>> fetch_seq('NC_000001.10',2000000,2000030)
        'ATCACACGTGCAGGAACCCTTTTCCAAAGG'

        # This call will pull back 30 bases plus overhead; without the
        # interval, one would receive 250MB of chr1 plus overhead!

        # Essentially any RefSeq, Genbank, BIC, or Ensembl sequence may be
        # fetched.

        >>> fetch_seq('NM_9.9')
        Traceback (most recent call last):
        ...
        RuntimeError: No sequence available for NM_9.9

        >>> fetch_seq('QQ01234')
        Traceback (most recent call last):
        ...
        RuntimeError: No sequence fetcher for QQ01234
    """

    ac_dispatch = [
        {
            # RefSeq (AC_, NC_, NG_, NM_, NP_, NR_, NT_, NW_) and
            # GenBank/INSDC-style accessions
            "re": re.compile(r"^(?:AC|N[CGMPRTW])_|^[A-L]\w\d|^U\d"),
            "fetcher": _fetch_seq_ncbi,
        },
        # Ensembl transcript (ENST) and protein (ENSP) accessions
        {"re": re.compile(r"^ENS[TP]\d+"), "fetcher": _fetch_seq_ensembl},
    ]

    eligible_fetchers = [dr["fetcher"] for dr in ac_dispatch if dr["re"].match(ac)]

    if not eligible_fetchers:
        raise RuntimeError(f"No sequence fetcher for {ac}")

    # Fixed: the original tested `>= 1`, which logged the "multiple
    # fetchers" message on every successful single-pattern match; only
    # an ambiguous match (more than one pattern) warrants the message.
    if len(eligible_fetchers) > 1:  # pragma: nocover (no way to test)
        _logger.debug(f"Multiple sequence fetchers found for {ac}; using first")

    fetcher = eligible_fetchers[0]
    _logger.debug(f"fetching {ac} with {fetcher}")

    try:
        return fetcher(ac, start_i, end_i, **rest)
    except requests.RequestException as ex:
        # Chain the originating exception for more useful tracebacks.
        raise RuntimeError(f"Failed to fetch {ac} ({ex})") from ex
def _fetch_seq_ensembl(ac, start_i=None, end_i=None, seq_type=None):
    """Fetch sequence slice from Ensembl public REST interface.

    Args:
        ac (str): The accession of the sequence to fetch.
        start_i (int, optional): The start index (interbase coordinates) of the subsequence to fetch.
            Defaults to None.
        end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch.
            Defaults to None.
        seq_type (str, optional): The type of Ensembl sequence to fetch

    Returns:
        str: The requested (sub)sequence

    Raises:
        RequestException: if request is unsuccessful.
        KeyError: if Ensembl API returns a different version than requested

    Note:
        The Ensembl REST interface does not currently accept intervals, so this method
        slices the sequence locally.

    Examples:
        >> len(_fetch_seq_ensembl('ENSP00000288602'))
        766

        >> _fetch_seq_ensembl('ENSP00000288602',0,10)
        u'MAALSGGGGG'

        >> _fetch_seq_ensembl('ENSP00000288602')[0:10]
        u'MAALSGGGGG'

        >> ac = 'ENSP00000288602'
        >> _fetch_seq_ensembl(ac ,0, 10) == _fetch_seq_ensembl(ac)[0:10]
        True
    """

    # Ensembl API only takes transcript IDs (without version) and returns the latest one
    # So we need to strip the transcript version, then check if what was returned was the one we requested
    version = None
    m = re.match(r"^(ENST\d+)\.(\d+)$", ac)
    if m:
        ac, version = m.groups()
        version = int(version)

    # For transcripts with no explicit seq_type, fall back to the
    # ENST_DEFAULT_SEQ_TYPE environment variable, then to the module default.
    if ac.startswith("ENST") and seq_type is None:
        try:
            seq_type = os.environ["ENST_DEFAULT_SEQ_TYPE"]
        except KeyError:
            seq_type = enst_default_seq_type
            _logger.warning(f"{ac}: Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE; assuming {seq_type}")

    # NOTE(review): endpoint uses http, not https — confirm whether this is intentional
    url = f"http://rest.ensembl.org/sequence/id/{ac}"
    if seq_type:
        url += f"?type={seq_type}"
    r = requests.get(url, headers={"Content-Type": "application/json"})
    r.raise_for_status()
    data = r.json()
    if version is not None:
        # Ensembl returned whatever the latest version is; fail loudly if it
        # differs from the version the caller asked for.
        latest_version = data["version"]
        if version != latest_version:
            msg = f"Ensembl API only provides {ac} version ({latest_version}), requested: {version}"
            raise KeyError(msg)

    seq = data["seq"]
    # Slice locally: the REST endpoint has no interval parameters.
    # Note that a subsequence is returned only when BOTH start_i and end_i are given.
    return seq if (start_i is None or end_i is None) else seq[start_i:end_i]
206 | 207 | >> _fetch_seq_ncbi('NP_056374.2',0,10) 208 | 'MESRETLSSS' 209 | 210 | >> _fetch_seq_ncbi('NP_056374.2')[0:10] 211 | 'MESRETLSSS' 212 | 213 | >> _fetch_seq_ncbi('NP_056374.2',0,10) == _fetch_seq_ncbi('NP_056374.2')[0:10] 214 | True 215 | """ 216 | 217 | db = "protein" if ac[1] == "P" else "nucleotide" 218 | url_fmt = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db}&id={ac}&rettype=fasta" 219 | 220 | if start_i is None or end_i is None: 221 | url = url_fmt.format(db=db, ac=ac) 222 | else: 223 | url_fmt += "&seq_start={start}&seq_stop={stop}" 224 | url = url_fmt.format(db=db, ac=ac, start=start_i + 1, stop=end_i) 225 | 226 | url += "&tool={tool}&email={email}".format(tool=ncbi_tool, email=ncbi_email) 227 | 228 | url = _add_eutils_api_key(url) 229 | 230 | n_retries = 0 231 | while True: 232 | resp = requests.get(url) 233 | if resp.ok: 234 | seq = "".join(resp.text.splitlines()[1:]) 235 | return seq 236 | elif resp.status_code == 400: 237 | # Invalid sequence or start/stop position for that sequence 238 | raise RuntimeError( 239 | "Fetching sequence {} with start index {} and end index {} failed, invalid sequence " 240 | "or start or end position".format(ac, start_i, end_i) 241 | ) 242 | if n_retries >= retry_limit: 243 | break 244 | if n_retries == 0: 245 | _logger.warning("Failed to fetch {}".format(url)) 246 | sleeptime = random.randint(n_retries, 3) ** n_retries 247 | _logger.warning("Failure {}/{}; retry in {} seconds".format(n_retries, retry_limit, sleeptime)) 248 | time.sleep(sleeptime) 249 | n_retries += 1 250 | # Falls through only on failure 251 | resp.raise_for_status() 252 | 253 | 254 | def _add_eutils_api_key(url): 255 | """Adds an eutils api key to the query. 256 | 257 | Args: 258 | url (str): The query url without the api key. 259 | 260 | Returns: 261 | str: The query url with the api key, if one is stored in the environment variable 262 | ``NCBI_API_KEY``, otherwise it is unaltered. 
263 | """ 264 | 265 | apikey = os.environ.get("NCBI_API_KEY") 266 | if apikey: 267 | url += "&api_key={apikey}".format(apikey=apikey) 268 | return url 269 | 270 | 271 | # So that I don't forget why I didn't use ebi too: 272 | # $ curl 'http://www.ebi.ac.uk/ena/data/view/AM407889.1&display=fasta' 273 | # >ENA|AM407889|AM407889.2 Medicago sativa partial mRNA ... 274 | # AACGTATCACACTTCTTCTCCATTTCTTTTTCTTACATCTTCTCTCTACAAATTCATTTC 275 | # Note that we requested .1, got .2. Implicit behavior bites again. 276 | 277 | if __name__ == "__main__": # pragma: nocover 278 | import doctest 279 | 280 | doctest.testmod() 281 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 bioutils Contributors 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/bioutils/normalize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Provides functionality for normalizing alleles, ensuring comparable representations.""" 3 | 4 | import enum 5 | import logging 6 | import math 7 | from typing import Optional 8 | 9 | import attr 10 | 11 | _logger = logging.getLogger(__name__) 12 | debug = False 13 | 14 | NormalizationMode = enum.Enum("NormalizationMode", "EXPAND LEFTSHUFFLE RIGHTSHUFFLE TRIMONLY VCF") 15 | """Enum passed to normalize to select the normalization mode. 16 | 17 | Attributes: 18 | EXPAND: Normalize alleles to maximal extent both left and right. 19 | LEFTSHUFFLE: Normalize alleles to maximal extent left. 20 | RIGHTSHUFFLE: Normalize alleles to maximal extent right. 21 | TRIMONLY: Only trim the common prefix and suffix of alleles. Deprecated -- use `mode=None` with `trim=True` instead. 
def normalize(
    sequence,
    interval,
    alleles,
    mode: Optional[NormalizationMode] = NormalizationMode.EXPAND,
    bounds=None,
    anchor_length=0,
    trim: bool = True,
):
    """Normalizes the alleles that co-occur on sequence at interval, ensuring comparable representations.

    Normalization performs three operations, in order:
    - trimming: remove the common prefix/suffix shared by all alleles
    - shuffling: move (or expand) the trimmed interval per ``mode``
    - anchoring: re-attach ``anchor_length`` flanking reference residues

    Args:
        sequence (str or iterable): The reference sequence; must support indexing and ``__getitem__``.
        interval (2-tuple of int): The location of alleles in the reference sequence as ``(start, end)``.
            Interbase coordinates.
        alleles (iterable of str): The sequences to be normalized. The first element
            corresponds to the reference sequence being unchanged and must be None.
        mode (NormalizationMode Enum or string, optional): A NormalizationMode Enum or the corresponding string.
            Defaults to ``EXPAND``. Set to None to skip shuffling. Does not affect trimming or anchoring.
        bounds (2-tuple of int, optional): Maximal extent of normalization left and right.
            Must be provided if sequence doesn't support ``__len__``. Defaults to ``(0, len(sequence))``.
        anchor_length (int, optional): number of flanking residues left and right. Defaults to ``0``.
        trim (bool): indicates whether to trim the common prefix and suffix of alleles. Defaults to True.
            Set to False to skip trimming. Does not affect shuffling or anchoring.

    Returns:
        tuple: ``(new_interval, [new_alleles])``

    Raises:
        ValueError: If normalization mode is VCF and `anchor_length` is nonzero, or `trim` is False.
        ValueError: If the interval start is greater than the end.
        ValueError: If the first (reference) allele is not `None`.
        ValueError: If trimming is requested and there are not at least two distinct alleles.

    Examples:
        >>> sequence = "CCCCCCCCACACACACACTAGCAGCAGCA"
        >>> normalize(sequence, interval=(22,25), alleles=(None, "GC", "AGCAC"), mode='TRIMONLY')
        ((22, 24), ('AG', 'G', 'AGCA'))

        >>> normalize(sequence, interval=(22, 22), alleles=(None, 'AGC'), mode='RIGHTSHUFFLE')
        ((29, 29), ('', 'GCA'))

        >>> normalize(sequence, interval=(22, 22), alleles=(None, 'AGC'), mode='EXPAND')
        ((19, 29), ('AGCAGCAGCA', 'AGCAGCAGCAGCA'))
    """

    interval = _Interval(*interval)
    if interval.start > interval.end:
        raise ValueError("Interval start > end; must be start <= end")

    # Default bounds: the whole reference sequence.
    if bounds is None:
        bounds = _Interval(0, len(sequence))
    else:
        bounds = _Interval(*bounds)

    left_anchor = right_anchor = anchor_length

    # Accept a mode name string (e.g. "LEFTSHUFFLE") and coerce it to the enum.
    if mode is not None and not isinstance(mode, NormalizationMode):
        mode = NormalizationMode[mode]  # e.g., mode="LEFTSHUFFLE" OK

    # VCF normalization is left-shuffling with a single left reference anchor;
    # the user may not additionally anchor or disable trimming.
    if mode == NormalizationMode.VCF:
        if anchor_length:
            raise ValueError("May not provide non-zero anchor size with VCF normalization mode")
        if not trim:
            raise ValueError("May not disable trimming with VCF normalization mode")
        mode = NormalizationMode.LEFTSHUFFLE
        left_anchor = 1
        right_anchor = 0

    if alleles[0] is not None:
        raise ValueError("First allele, the reference allele, must be None")
    alleles = list(alleles)  # in case tuple
    # Materialize the reference allele from the sequence at the interval.
    alleles[0] = sequence[interval.start : interval.end]

    if debug:
        _print_state(
            interval,
            bounds,
            sequence=sequence,
            alleles=alleles,
            comment="Starting state",
        )

    if trim:
        # Trimming identical alleles would consume everything; require variation.
        if len(set(alleles)) < 2:
            raise ValueError("Must have at least two distinct alleles to trim")

        # Trim: remove common suffix, prefix, and adjust interval to match
        l_trimmed, alleles = trim_left(alleles)
        interval.start += l_trimmed
        r_trimmed, alleles = trim_right(alleles)
        interval.end -= r_trimmed
        if debug:
            _print_state(
                interval,
                bounds,
                sequence=sequence,
                alleles=alleles,
                comment="After trimming",
            )

    # Allele lengths before shuffling; the circular permutations below preserve length.
    lens = [len(a) for a in alleles]

    if mode == NormalizationMode.LEFTSHUFFLE:
        # Roll all alleles left as far as the reference (and bounds) allow,
        # then circularly permute each allele by the same distance (mod its length).
        dist = roll_left(sequence, alleles, interval.start - 1, bounds.start)
        for i, a in enumerate(alleles):
            if lens[i]:
                adist = -dist % lens[i]
                alleles[i] = a[adist:] + a[:adist]
        interval.start -= dist
        interval.end -= dist

    elif mode == NormalizationMode.RIGHTSHUFFLE:
        # Mirror image of LEFTSHUFFLE.
        dist = roll_right(sequence, alleles, interval.end, bounds.end - 1)
        for i, a in enumerate(alleles):
            if lens[i]:
                adist = dist % lens[i]
                alleles[i] = a[adist:] + a[:adist]
        interval.start += dist
        interval.end += dist

    elif mode == NormalizationMode.EXPAND:
        # Grow the interval maximally in both directions, prepending/appending
        # the traversed reference sequence to every allele.
        ldist = roll_left(sequence, alleles, interval.start - 1, bounds.start)
        rdist = roll_right(sequence, alleles, interval.end, bounds.end - 1)

        lseq = sequence[interval.start - ldist : interval.start]
        rseq = sequence[interval.end : interval.end + rdist]
        alleles = [lseq + a + rseq for a in alleles]

        interval.start -= ldist
        interval.end += rdist

    if debug:
        _print_state(
            interval,
            bounds,
            sequence=sequence,
            alleles=alleles,
            comment=f"After mode: {mode}",
        )

    # Add left and/or right flanking sequence, clamped to bounds.
    if left_anchor or right_anchor:
        anchor_left = max(bounds.start, interval.start - left_anchor)
        anchor_right = min(bounds.end, interval.end + right_anchor)
        left_anchor_seq = sequence[anchor_left : interval.start]
        right_anchor_seq = sequence[interval.end : anchor_right]
        interval.start = anchor_left
        interval.end = anchor_right
        alleles = [left_anchor_seq + a + right_anchor_seq for a in alleles]
        if debug:
            _print_state(
                interval,
                bounds,
                sequence=sequence,
                alleles=alleles,
                comment="After anchoring",
            )

    return (interval.start, interval.end), tuple(alleles)
251 | 252 | Examples: 253 | >>> trim_right(["","AA"]) 254 | (0, ['', 'AA']) 255 | 256 | >>> trim_right(["A","AA"]) 257 | (1, ['', 'A']) 258 | 259 | >>> trim_right(["AT","AA"]) 260 | (0, ['AT', 'AA']) 261 | 262 | >>> trim_right(["AA","AA"]) 263 | (2, ['', '']) 264 | 265 | >>> trim_right(["CAG","CG"]) 266 | (1, ['CA', 'C']) 267 | """ 268 | if len(alleles) == 0: 269 | return (0, []) 270 | trimmed = 0 271 | lens = [len(x) for x in alleles] 272 | while trimmed < min(lens): 273 | nexts = [x[len(x) - trimmed - 1] for x in alleles] 274 | if nexts.count(nexts[0]) == len(nexts): 275 | trimmed += 1 276 | else: 277 | break 278 | return (trimmed, [x[: (len(x) - trimmed)] for x in alleles]) 279 | 280 | 281 | def roll_left(sequence, alleles, ref_pos, bound): 282 | """Determines common distance all alleles can be rolled (circularly permuted) left 283 | within the reference sequence without altering it. 284 | 285 | Args: 286 | sequence (str): The reference sequence. 287 | alleles (list of str): The sequences to be normalized. 288 | ref_pos (int): The beginning index for rolling. 289 | bound (int): The lower bound index in the reference sequence for normalization, hence also for rolling. 290 | 291 | Returns: 292 | int: The distance that the alleles can be rolled. 293 | """ 294 | 295 | # circularly permute sequence d steps, using modulo arithmetic 296 | lens = [len(a) for a in alleles] 297 | d = 0 298 | max_d = ref_pos - bound 299 | while d <= max_d and not any(a and a[-(d + 1) % lens[i]] != sequence[ref_pos - d] for i, a in enumerate(alleles)): 300 | d += 1 301 | return d 302 | 303 | 304 | def roll_right(sequence, alleles, ref_pos, bound): 305 | """Determines common distance all alleles can be rolled (circularly permuted) right 306 | within the reference sequence without altering it. 307 | 308 | Args: 309 | sequence (str): The reference sequence. 310 | alleles (list of str): The sequences to be normalized. 311 | ref_pos (int): The start index for rolling. 
def _print_state(interval, bounds, sequence, alleles, comment, verbose=False):
    """Prints a debugging view of the current normalization state.

    By default prints a single caret line marking the interval, followed by the
    interval, alleles, and comment. With ``verbose=True``, additionally prints a
    tabular view showing bounds, flanking reference sequence, and each allele.

    Note: in the original code everything after the first ``print`` was
    unreachable dead code (an unconditional ``return`` preceded it); it is now
    reachable via the backward-compatible ``verbose`` keyword.

    Args:
        interval (_Interval): Current allele interval (``start``/``end`` attributes).
        bounds (_Interval): Normalization bounds; only read when ``verbose`` is True.
        sequence (str): The reference sequence; only read when ``verbose`` is True.
        alleles (list of str): Current alleles, reference allele first.
        comment (str): Free-text label for this state.
        verbose (bool, optional): Also print the tabular view. Defaults to False.
    """
    line = pfx + " " * interval.start + "^"
    if interval.end > interval.start:
        # Two columns per base in the spaced-out sequence display, hence *2.
        line += "-" * ((interval.end - interval.start - 1) * 2 + 1) + "^"
    print(line + f" [{interval.start},{interval.end}): {alleles} | {comment}")
    if not verbose:
        return

    margin = 15  # reference context shown on each side
    leftseq = sequence[max(0, interval.start - margin) : interval.start]
    rightseq = sequence[interval.end : interval.end + margin]

    row_fmt = "{:>20.20s}{:>20.20s}{:^20.20s}{:<20.20s}{:<20.20s}"
    rows = [
        row_fmt.format(
            str(bounds.start),
            "",
            f"[{interval.start},{interval.end})",
            "",
            str(bounds.end),
        ),
        row_fmt.format("//", "|", "", "|", "//"),
        row_fmt.format("", leftseq, alleles[0], rightseq, ""),
    ] + [row_fmt.format("", "", a, "", "") for a in alleles[1:]]
    print("\n".join(rows))
5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 373 | # C C C C C C C C A C A C A C A C A C T A G C A G C A G C A T 374 | 375 | tests = [ 376 | # {"interval": (5,5), "alleles": [None, "C"]}, 377 | # {"interval": (5,6), "alleles": [None, "CC"]}, 378 | # {"interval": (5,6), "alleles": [None, ""]}, 379 | # {"interval": (13,13), "alleles": [None, "CA"]}, 380 | # {"interval": (14,14), "alleles": [None, "AC"]}, 381 | # {"interval": (13,15), "alleles": [None, ""]}, 382 | {"interval": (22, 22), "alleles": [None, "AGC"]}, 383 | {"interval": (22, 22), "alleles": [None, "AGCT"]}, 384 | {"interval": (22, 22), "alleles": [None, "AGC", "AGCT"]}, 385 | # {"interval": (22,25), "alleles": [None, ""]}, 386 | # {"interval": (22,25), "alleles": [None, "", "AGC"]}, 387 | # {"interval": (22,25), "alleles": [None, "", "AGCAGC"]}, 388 | ] 389 | 390 | normalize_seq = partial(normalize, sequence=sequence) 391 | normalize_trim = partial(normalize_seq, mode=NormalizationMode.TRIMONLY) 392 | normalize_left = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE) 393 | normalize_right = partial(normalize_seq, mode=NormalizationMode.RIGHTSHUFFLE) 394 | normalize_expand = partial(normalize_seq, mode=NormalizationMode.EXPAND) 395 | normalize_vcf = partial(normalize_seq, mode=NormalizationMode.VCF) 396 | 397 | debug = True 398 | 399 | def test1(**kwargs): 400 | print(f"* {kwargs}") 401 | _print_seq_row(sequence) 402 | result = normalize_seq(**kwargs) 403 | kwargs["mode"] = str(kwargs["mode"]) 404 | print(f"assert {result} == normalize_seq({kwargs})") 405 | 406 | for test in tests: 407 | print("############################################################################") 408 | for mode in ("EXPAND",): # "LEFTSHUFFLE", "RIGHTSHUFFLE", "EXPAND"): 409 | for bm in (None,): 410 | if bm is None: 411 | bounds = None 412 | else: 413 | bounds = (test["interval"][0] - bm, test["interval"][1] + bm) 414 | test["bounds"] = bounds 415 | test1(mode=mode, **test) 416 | 
-------------------------------------------------------------------------------- /src/bioutils/sequences.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Simple functions and lookup tables for nucleic acid and amino acid sequences.""" 3 | 4 | import logging 5 | import re 6 | from enum import Enum 7 | from string import ascii_lowercase 8 | 9 | _logger = logging.getLogger(__name__) 10 | 11 | aa3_to_aa1_lut = { 12 | "Ala": "A", 13 | "Arg": "R", 14 | "Asn": "N", 15 | "Asp": "D", 16 | "Cys": "C", 17 | "Gln": "Q", 18 | "Glu": "E", 19 | "Gly": "G", 20 | "His": "H", 21 | "Ile": "I", 22 | "Leu": "L", 23 | "Lys": "K", 24 | "Met": "M", 25 | "Phe": "F", 26 | "Pro": "P", 27 | "Ser": "S", 28 | "Thr": "T", 29 | "Trp": "W", 30 | "Tyr": "Y", 31 | "Val": "V", 32 | "Xaa": "X", 33 | "Ter": "*", 34 | "Sec": "U", 35 | } 36 | 37 | aa1_to_aa3_lut = {v: k for k, v in aa3_to_aa1_lut.items()} 38 | 39 | dna_to_aa1_lut = { # NCBI standard translation table 40 | "AAA": "K", 41 | "AAC": "N", 42 | "AAG": "K", 43 | "AAT": "N", 44 | "ACA": "T", 45 | "ACC": "T", 46 | "ACG": "T", 47 | "ACT": "T", 48 | "AGA": "R", 49 | "AGC": "S", 50 | "AGG": "R", 51 | "AGT": "S", 52 | "ATA": "I", 53 | "ATC": "I", 54 | "ATG": "M", 55 | "ATT": "I", 56 | "CAA": "Q", 57 | "CAC": "H", 58 | "CAG": "Q", 59 | "CAT": "H", 60 | "CCA": "P", 61 | "CCC": "P", 62 | "CCG": "P", 63 | "CCT": "P", 64 | "CGA": "R", 65 | "CGC": "R", 66 | "CGG": "R", 67 | "CGT": "R", 68 | "CTA": "L", 69 | "CTC": "L", 70 | "CTG": "L", 71 | "CTT": "L", 72 | "GAA": "E", 73 | "GAC": "D", 74 | "GAG": "E", 75 | "GAT": "D", 76 | "GCA": "A", 77 | "GCC": "A", 78 | "GCG": "A", 79 | "GCT": "A", 80 | "GGA": "G", 81 | "GGC": "G", 82 | "GGG": "G", 83 | "GGT": "G", 84 | "GTA": "V", 85 | "GTC": "V", 86 | "GTG": "V", 87 | "GTT": "V", 88 | "TAA": "*", 89 | "TAC": "Y", 90 | "TAG": "*", 91 | "TAT": "Y", 92 | "TCA": "S", 93 | "TCC": "S", 94 | "TCG": "S", 95 | "TCT": "S", 96 | "TGA": "*", 97 | "TGC": "C", 
98 | "TGG": "W", 99 | "TGT": "C", 100 | "TTA": "L", 101 | "TTC": "F", 102 | "TTG": "L", 103 | "TTT": "F", 104 | # degenerate codons 105 | "AAR": "K", 106 | "AAY": "N", 107 | "ACB": "T", 108 | "ACD": "T", 109 | "ACH": "T", 110 | "ACK": "T", 111 | "ACM": "T", 112 | "ACN": "T", 113 | "ACR": "T", 114 | "ACS": "T", 115 | "ACV": "T", 116 | "ACW": "T", 117 | "ACY": "T", 118 | "AGR": "R", 119 | "AGY": "S", 120 | "ATH": "I", 121 | "ATM": "I", 122 | "ATW": "I", 123 | "ATY": "I", 124 | "CAR": "Q", 125 | "CAY": "H", 126 | "CCB": "P", 127 | "CCD": "P", 128 | "CCH": "P", 129 | "CCK": "P", 130 | "CCM": "P", 131 | "CCN": "P", 132 | "CCR": "P", 133 | "CCS": "P", 134 | "CCV": "P", 135 | "CCW": "P", 136 | "CCY": "P", 137 | "CGB": "R", 138 | "CGD": "R", 139 | "CGH": "R", 140 | "CGK": "R", 141 | "CGM": "R", 142 | "CGN": "R", 143 | "CGR": "R", 144 | "CGS": "R", 145 | "CGV": "R", 146 | "CGW": "R", 147 | "CGY": "R", 148 | "CTB": "L", 149 | "CTD": "L", 150 | "CTH": "L", 151 | "CTK": "L", 152 | "CTM": "L", 153 | "CTN": "L", 154 | "CTR": "L", 155 | "CTS": "L", 156 | "CTV": "L", 157 | "CTW": "L", 158 | "CTY": "L", 159 | "GAR": "E", 160 | "GAY": "D", 161 | "GCB": "A", 162 | "GCD": "A", 163 | "GCH": "A", 164 | "GCK": "A", 165 | "GCM": "A", 166 | "GCN": "A", 167 | "GCR": "A", 168 | "GCS": "A", 169 | "GCV": "A", 170 | "GCW": "A", 171 | "GCY": "A", 172 | "GGB": "G", 173 | "GGD": "G", 174 | "GGH": "G", 175 | "GGK": "G", 176 | "GGM": "G", 177 | "GGN": "G", 178 | "GGR": "G", 179 | "GGS": "G", 180 | "GGV": "G", 181 | "GGW": "G", 182 | "GGY": "G", 183 | "GTB": "V", 184 | "GTD": "V", 185 | "GTH": "V", 186 | "GTK": "V", 187 | "GTM": "V", 188 | "GTN": "V", 189 | "GTR": "V", 190 | "GTS": "V", 191 | "GTV": "V", 192 | "GTW": "V", 193 | "GTY": "V", 194 | "MGA": "R", 195 | "MGG": "R", 196 | "MGR": "R", 197 | "TAR": "*", 198 | "TAY": "Y", 199 | "TCB": "S", 200 | "TCD": "S", 201 | "TCH": "S", 202 | "TCK": "S", 203 | "TCM": "S", 204 | "TCN": "S", 205 | "TCR": "S", 206 | "TCS": "S", 207 | "TCV": "S", 208 | "TCW": 
"S", 209 | "TCY": "S", 210 | "TGY": "C", 211 | "TRA": "*", 212 | "TTR": "L", 213 | "TTY": "F", 214 | "YTA": "L", 215 | "YTG": "L", 216 | "YTR": "L", 217 | } 218 | 219 | # translation table for selenocysteine 220 | dna_to_aa1_sec = dna_to_aa1_lut.copy() 221 | dna_to_aa1_sec["TGA"] = "U" 222 | 223 | # Vertebrate micochondrial translation table 224 | # https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?chapter=tgencodes#SG2 225 | 226 | dna_to_aa1_vmito = dna_to_aa1_lut.copy() 227 | dna_to_aa1_vmito["AGA"] = "*" 228 | dna_to_aa1_vmito["AGG"] = "*" 229 | dna_to_aa1_vmito["ATA"] = "M" 230 | dna_to_aa1_vmito["TGA"] = "W" 231 | 232 | 233 | complement_transtable = bytes.maketrans(b"ACGT", b"TGCA") 234 | 235 | 236 | def aa_to_aa1(seq): 237 | """Coerces string of 1- or 3-letter amino acids to 1-letter representation. 238 | 239 | Args: 240 | seq (str): An amino acid sequence. 241 | 242 | Returns: 243 | str: The sequence as one of 1-letter amino acids. 244 | 245 | Examples: 246 | >>> aa_to_aa1("CATSARELAME") 247 | 'CATSARELAME' 248 | 249 | >>> aa_to_aa1("CysAlaThrSerAlaArgGluLeuAlaMetGlu") 250 | 'CATSARELAME' 251 | 252 | >>> aa_to_aa1(None) 253 | """ 254 | 255 | if seq is None: 256 | return None 257 | return aa3_to_aa1(seq) if looks_like_aa3_p(seq) else seq 258 | 259 | 260 | def aa_to_aa3(seq): 261 | """Coerces string of 1- or 3-letter amino acids to 3-letter representation. 262 | 263 | Args: 264 | seq (str): An amino acid sequence. 265 | 266 | Returns: 267 | str: The sequence as one of 3-letter amino acids. 268 | 269 | Examples: 270 | >>> aa_to_aa3("CATSARELAME") 271 | 'CysAlaThrSerAlaArgGluLeuAlaMetGlu' 272 | 273 | >>> aa_to_aa3("CysAlaThrSerAlaArgGluLeuAlaMetGlu") 274 | 'CysAlaThrSerAlaArgGluLeuAlaMetGlu' 275 | 276 | >>> aa_to_aa3(None) 277 | """ 278 | 279 | if seq is None: 280 | return None 281 | return aa1_to_aa3(seq) if not looks_like_aa3_p(seq) else seq 282 | 283 | 284 | def aa1_to_aa3(seq): 285 | """Converts string of 1-letter amino acids to 3-letter amino acids. 
def elide_sequence(s, flank=5, elision="..."):
    """Trims the middle of the sequence, leaving the right and left flanks.

    Args:
        s (str): A sequence.
        flank (int, optional): The length of each flank. Defaults to five.
        elision (str, optional): The symbol used to represent the part trimmed. Defaults to '...'.

    Returns:
        str: The sequence with the middle replaced by ``elision``. If the sequence
        is no longer than the two flanks plus the elision marker, it is returned
        unchanged (eliding would not shorten it).

    Examples:
        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        'ABCDE...VWXYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=3)
        'ABC...XYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", elision="..")
        'ABCDE..VWXYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=12)
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

        >>> elide_sequence("ABCDEFGHIJKLMNOPQRSTUVWXYZ", flank=12, elision=".")
        'ABCDEFGHIJKL.OPQRSTUVWXYZ'
    """

    shortest_useful = 2 * flank + len(elision)
    if len(s) <= shortest_useful:
        return s
    return "".join((s[:flank], elision, s[-flank:]))
def replace_t_to_u(seq):
    """Replaces the T's in a sequence with U's (DNA -> RNA), preserving case.

    Args:
        seq (str): A nucleotide sequence.

    Returns:
        str: The sequence with the T's replaced by U's.

    Examples:
        >>> replace_t_to_u("ACGT")
        'ACGU'

        >>> replace_t_to_u(None)
    """

    if seq is None:
        return None
    # Single-pass character mapping; equivalent to chained replace("T","U")/replace("t","u").
    return seq.translate(str.maketrans("Tt", "Uu"))
def translate_cds(seq, full_codons=True, ter_symbol="*", translation_table=TranslationTable.standard):
    """Translates a DNA or RNA sequence into a single-letter amino acid sequence.

    Args:
        seq (str): A nucleotide sequence.
        full_codons (bool, optional): If ``True``, forces sequence to have length
            that is a multiple of 3 and raises an error otherwise.
            If False, ``ter_symbol`` will be added as the last amino acid.
            This corresponds to biopython's behavior of padding the last codon with ``N``s.
            Defaults to ``True``.
        ter_symbol (str, optional): Placeholder for the last amino acid if
            sequence length is not divisible by three and ``full_codons`` is False.
            Defaults to ``'*'``
        translation_table (TranslationTable, optional): Selects the codon-to-amino-acid
            table: standard (default), selenocysteine, or vertebrate mitochondrial.

    Returns:
        str: The corresponding single letter amino acid sequence.

    Raises:
        ValueError: If ``full_codons`` and the sequence is not a multiple of three.
        ValueError: If a codon is undefined in the table.

    Examples:
        >>> translate_cds("ATGCGA")
        'MR'

        >>> translate_cds("AUGCGA")
        'MR'

        >>> translate_cds(None)

        >>> translate_cds("")
        ''

        >>> translate_cds("AUGCG")
        Traceback (most recent call last):
        ...
        ValueError: Sequence length must be a multiple of three

        >>> translate_cds("AUGCG", full_codons=False)
        'M*'

        >>> translate_cds("ATGTAN")
        'MX'

        >>> translate_cds("CCN")
        'P'

        >>> translate_cds("AUGCGQ")
        Traceback (most recent call last):
        ...
        ValueError: Codon CGQ at position 4..6 is undefined in codon table
    """

    if seq is None:
        return None
    if len(seq) == 0:
        return ""

    if full_codons and len(seq) % 3 != 0:
        raise ValueError("Sequence length must be a multiple of three")

    # Note: equality (not identity/hash) comparisons are deliberate so that a
    # plain string value ("standard", "sec", "vmito") also selects a table.
    if translation_table == TranslationTable.standard:
        codon_map = dna_to_aa1_lut
    elif translation_table == TranslationTable.selenocysteine:
        codon_map = dna_to_aa1_sec
    elif translation_table == TranslationTable.vertebrate_mitochondrial:
        codon_map = dna_to_aa1_vmito
    else:
        raise ValueError("Unsupported translation table {}".format(translation_table))

    # Work in uppercase DNA alphabet regardless of input form.
    cds = replace_u_to_t(seq)
    cds = cds.upper()

    residues = []
    for pos in range(0, len(cds) - len(cds) % 3, 3):
        codon = cds[pos : pos + 3]
        if codon in codon_map:
            residues.append(codon_map[codon])
        elif any(base in "BDHVNUWSMKRYZ" for base in codon):
            # Codon contains an ambiguity code not resolved by the degenerate
            # entries of the table: translate as unknown residue.
            residues.append("X")
        else:
            raise ValueError("Codon {} at position {}..{} is undefined in codon table".format(codon, pos + 1, pos + 3))

    # Trailing partial codon (only possible when full_codons is False).
    if not full_codons and len(cds) % 3 != 0:
        residues.append(ter_symbol)

    return "".join(residues)