├── .github
├── dependabot.yml
└── workflows
│ ├── ci.yml
│ └── publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CHANGES.md
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bioframe
├── __init__.py
├── core
│ ├── __init__.py
│ ├── arrops.py
│ ├── checks.py
│ ├── construction.py
│ ├── specs.py
│ └── stringops.py
├── extras.py
├── io
│ ├── __init__.py
│ ├── assembly.py
│ ├── bed.py
│ ├── data
│ │ ├── _assemblies.yml
│ │ ├── ce10.seqinfo.tsv
│ │ ├── ce11.seqinfo.tsv
│ │ ├── danRer10.seqinfo.tsv
│ │ ├── danRer11.seqinfo.tsv
│ │ ├── dm3.seqinfo.tsv
│ │ ├── dm6.seqinfo.tsv
│ │ ├── hg19.cytoband.tsv
│ │ ├── hg19.seqinfo.tsv
│ │ ├── hg38.cytoband.tsv
│ │ ├── hg38.seqinfo.tsv
│ │ ├── hs1.cytoband.tsv
│ │ ├── hs1.seqinfo.tsv
│ │ ├── mm10.seqinfo.tsv
│ │ ├── mm39.seqinfo.tsv
│ │ ├── mm9.seqinfo.tsv
│ │ ├── sacCer3.seqinfo.tsv
│ │ └── wuhCor1.seqinfo.tsv
│ ├── fileops.py
│ ├── resources.py
│ └── schemas.py
├── ops.py
├── sandbox
│ ├── clients.py
│ ├── gtf_io.py
│ └── parquet_io.py
└── vis.py
├── docs
├── Makefile
├── api-construction.rst
├── api-extras.rst
├── api-fileops.rst
├── api-intervalops.rst
├── api-lowlevel.md
├── api-resources.rst
├── api-validation.rst
├── api-vis.rst
├── conf.py
├── figs
│ ├── ._bioframe-logo.png
│ ├── bioframe-logo.png
│ ├── bioframe_closest.pdf
│ ├── closest0.png
│ ├── closest1.png
│ ├── closest2.png
│ ├── closest3.png
│ ├── df1.png
│ ├── df2.png
│ ├── df@.png
│ ├── merge_df1.png
│ ├── overlap_inner_0.png
│ └── overlap_inner_1.png
├── guide-bedtools.md
├── guide-definitions.rst
├── guide-intervalops.md
├── guide-io.ipynb
├── guide-performance.ipynb
├── guide-quickstart.rst
├── guide-recipes.md
├── guide-specifications.rst
├── index.rst
├── lowlevel
│ ├── arrops.rst
│ ├── specs.rst
│ └── stringops.rst
├── make.bat
├── times100.bw
└── tutorials
│ ├── tutorial_assign_motifs_to_peaks.ipynb
│ └── tutorial_assign_peaks_to_genes.ipynb
├── pyproject.toml
└── tests
├── test_assembly_info.py
├── test_bed.py
├── test_core_checks.py
├── test_core_construction.py
├── test_core_specs.py
├── test_core_stringops.py
├── test_data
├── bed12.bed
├── bed9.bed
├── jaspar.bed
├── narrowPeak.bed
├── test.chrom.sizes
├── test.fa
├── test.fa.fai
├── toy.bam
├── toy.bam.bai
└── toy.sam
├── test_extras.py
├── test_fileops.py
├── test_ops.py
├── test_ops_select.py
├── test_resources.py
└── test_vis.py
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "github-actions"
4 | directory: "/"
5 | schedule:
6 | interval: "weekly"
7 | groups:
8 | actions:
9 | patterns:
10 | - "*"
11 | - package-ecosystem: "pip"
12 | directory: "/"
13 | schedule:
14 | interval: "weekly"
15 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 |
7 | pull_request:
8 | branches: [ main ]
9 |
10 | concurrency:
11 | group: ${{ github.workflow }}-${{ github.ref }}
12 | cancel-in-progress: true
13 |
14 | jobs:
15 |
16 | Test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | matrix:
20 | python-version: [ "3.9", "3.10", "3.11", "3.12" ]
21 | steps:
22 | - uses: actions/checkout@v4
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - run: |
28 | python -m pip install --upgrade pip hatch
29 | pip install -e .[dev]
30 | hatch run test
31 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python Package to PyPI
2 |
3 | on:
4 | release:
5 | types: [published]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | Publish:
10 | # prevents this action from running on forks
11 | if: github.repository == 'open2c/bioframe'
12 |
13 | runs-on: ubuntu-latest
14 | permissions:
15 | id-token: write
16 |
17 | steps:
18 | - name: Checkout
19 | uses: actions/checkout@v4
20 |
21 | - name: Setup Python
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: "3.x"
25 |
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install build
30 |
31 | - name: Build
32 | run: python -m build
33 |
34 | - name: Publish distribution 📦 to PyPI
35 | uses: pypa/gh-action-pypi-publish@release/v1
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | *~
4 |
5 | *.py[cod]
6 | __pycache__
7 | .cache
8 | .pytest_cache
9 | .ipynb_checkpoints/
10 | .venv/*
11 |
12 | # setup and build
13 | docs/_*
14 | *.egg-info/
15 | dist/
16 | build/
17 | MANIFEST
18 |
19 | # OS-generated files
20 | .DS_Store
21 | .Spotlight-V100
22 | .Trashes
23 | ehthumbs.db
24 | Thumbs.db
25 |
26 | _scratch/
27 | tmp/
28 | docs/notebooks/.ipynb_checkpoints
29 | .vscode
30 | .spyproject
31 | docs/notebooks/cgranges-test/*
32 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v5.0.0
4 | hooks:
5 | - id: check-ast
6 | - id: end-of-file-fixer
7 | - id: mixed-line-ending
8 | - id: trailing-whitespace
9 | - id: check-case-conflict
10 |
11 | - repo: https://github.com/astral-sh/ruff-pre-commit
12 | rev: v0.7.0
13 | hooks:
14 | - id: ruff
15 | types_or: [python, pyi, jupyter]
16 | args: [--fix, --show-fixes, --exit-non-zero-on-fix]
17 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 | build:
8 | os: ubuntu-22.04
9 | tools:
10 | python: "3.10"
11 | # Build documentation in the docs/ directory with Sphinx
12 | sphinx:
13 | configuration: docs/conf.py
14 |
15 | # Build documentation with MkDocs
16 | #mkdocs:
17 | # configuration: mkdocs.yml
18 |
19 | # Optionally build your docs in additional formats such as PDF and ePub
20 | formats: all
21 |
22 | # Optionally set the version of Python and requirements required to build your docs
23 | # setup_py_install: true
24 | python:
25 | install:
26 | - method: pip
27 | path: .
28 | extra_requirements:
29 | - dev
30 | - docs
31 |
--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | # Release notes
2 |
3 | ## [Upcoming release](https://github.com/open2c/bioframe/compare/v0.8.0...HEAD)
4 |
5 | ## v0.8.0
6 |
7 | Date: 2025-04-08
8 |
9 | API changes:
10 | * bigtools engine for bigwig and bigbed.
11 | * run length functions `mark_runs` and `compress_runs`.
12 |
13 | Maintenance:
14 | * Numpy 2.x support.
15 |
16 | ## v0.7.2
17 |
18 | Date: 2024-06-19
19 |
20 | API changes:
21 | * `read_alignment` function introduced in v0.7.0 has been pluralized to `read_alignments`
22 |
23 | Maintenance:
24 | * Skip `read_alignments` tests on big-endian architectures by @nvictus in https://github.com/open2c/bioframe/pull/216
25 |
26 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.7.1...v0.7.2
27 |
28 | ## v0.7.1
29 |
30 | Date: 2024-06-17
31 |
32 | Maintenance:
33 | * Refactor join arrayops and intidx internals by @nvictus in https://github.com/open2c/bioframe/pull/204
34 | * NumPy 2.0 was released. Pin `numpy < 2` until we migrate.
35 |
36 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.7.0...v0.7.1
37 |
38 | ## v0.7.0
39 |
40 | Date: 2024-05-20
41 |
42 | API changes:
43 | * Add `to_bed` function to validate and write standard BED files by @gamazeps in https://github.com/open2c/bioframe/pull/203
44 | * `read_bam` deprecated in favor of `read_alignments` by @gamazeps in https://github.com/open2c/bioframe/pull/206
45 |
46 | Documentation:
47 | * Add "bioframe for bedtools users" guide to docs by @gamazeps in https://github.com/open2c/bioframe/pull/198
48 |
49 | Bug fixes:
50 | * Fix contig name and JSON issues in read_bam implementation by @gamazeps in https://github.com/open2c/bioframe/pull/206
51 |
52 | New Contributors:
53 | * @gamazeps made their first contribution in https://github.com/open2c/bioframe/pull/203
54 |
55 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.4...v0.7.0
56 |
57 | ## v0.6.4
58 |
59 | Date: 2024-04-06
60 |
61 | Maintenance:
62 | * Migrate from setuptools `pkg_resources` to `importlib.resources` by @nvictus in https://github.com/open2c/bioframe/pull/194
63 | * Use `importlib.metadata` for versioning by @nvictus in https://github.com/open2c/bioframe/pull/195
64 |
65 | Bug fixes:
66 | * Overlap point segment patch #183 by @smitkadvani in https://github.com/open2c/bioframe/pull/184
67 | * #167: Replaced np.int with int as the attribute is deprecated by numpy by @harshit148 in https://github.com/open2c/bioframe/pull/192
68 |
69 | New Contributors:
70 | * @harshit148 made their first contribution in https://github.com/open2c/bioframe/pull/192
71 |
72 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.3...v0.6.4
73 |
74 | ## v0.6.3
75 |
76 | Date: 2024-03-11
77 |
78 | Fixes:
79 | * Prevent dropout from `closest` in some cases of left intervals with no neighbors by @agalitsyna in https://github.com/open2c/bioframe/pull/185
80 | * Fix overlap returning float indexes causing failing tests (numpy v1.22.4, pandas v1.5.2) by @agalitsyna in https://github.com/open2c/bioframe/pull/185
81 |
82 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.2...v0.6.3
83 |
84 | ## v0.6.2
85 |
86 | Date: 2024-02-08
87 |
88 | Changes:
89 | * cols and df_view_col passed to downstream functions by @smitkadvani in https://github.com/open2c/bioframe/pull/182
90 |
91 | Fixes:
92 | * Update to new UCSC hgdownload url by @golobor and @nvictus in https://github.com/open2c/bioframe/pull/187
93 |
94 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.1...v0.6.2
95 |
96 | ## v0.6.1
97 |
98 | Date: 2024-01-08
99 |
100 | API changes:
101 |
102 | Default behavior of `ensure_nullable` option in `overlap` was modified to minimize the possibility of regressions in libraries that depend on legacy behavior.
103 |
104 | * The new option was renamed `ensure_int` and is `True` by default. It ensures that output coordinate columns are always returned with an integer dtype, as was the case in prior versions. This is achieved by converting columns having non-nullable NumPy dtypes to Pandas nullable ones in the specific case where the result of an **outer join** generates missing values; otherwise, column dtypes are preserved unchanged in the output.
105 | * Unlike previous minor versions of bioframe, the nullable dtype chosen will have the **same underlying type** as the corresponding column from the input (i.e., an input dataframe using `np.uint32` start coordinates may yield a `pd.UInt32` start column in the output).
106 | * This behavior can be turned off by setting `ensure_int` to `False`, in which case outer joins on dataframes using NumPy dtypes may produce floating point output columns when missing values are introduced (stored as `NaN`), following the native casting behavior of such columns.
107 |
108 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.0...v0.6.1
109 |
110 | ## v0.6.0
111 |
112 | Date: 2024-01-04
113 |
114 | API changes:
115 | * `overlap`: In previous versions, output coordinate columns were always converted to Pandas "nullable" `Int64` dtype before returning outer join results. In the interest of flexibility, memory efficiency, and least surprise, the coordinate columns returned in the output dataframe now preserve dtype from the input dataframes, following native type casting rules if missing data are introduced. We introduce the `ensure_nullable` argument to force Pandas nullable dtypes in the output coordinates. See the [docs](https://bioframe.readthedocs.io/en/latest/api-intervalops.html#bioframe.ops.overlap) for more details. (#178)
116 |
117 | Bug fixes:
118 | * Fixed `coverage` with custom `cols1` (#170)
119 |
120 | Documentation:
121 | * Added contributing guidelines and NumFOCUS affiliation.
122 | * Updated README and added CITATION.cff file.
123 | * Updated performance benchmarks.
124 |
125 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.5.1...v0.6.0
126 |
127 | ## v0.5.1
128 |
129 | Date: 2023-11-08
130 |
131 | Bug fixes:
132 | * Series are treated like dict in `make_chromarms`
133 |
134 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.5.0...v0.5.1
135 |
136 | ## v0.5.0
137 |
138 | Date: 2023-10-05
139 |
140 | API changes:
141 | * New builtin curated genome assembly database (metadata, chromsizes, cytobands):
142 | * `bioframe.list_assemblies()`
143 | * `bioframe.assembly_info()`
144 | * New UCSC RGB color converter utility #158
145 | * Options added to `pair_by_distance`
146 |
147 | Bug fixes:
148 | * Make expand throw an error if both pad and scale are passed (#148)
149 | * Fixes to bioframe.select query interval semantics (#147)
150 |
151 | Maintenance:
152 | * Migrate to hatch build system and pyproject.toml
153 | * Various refactorings
154 |
155 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.4.1...v0.5.0
156 |
157 | ## v0.4.1
158 |
159 | Date: 2023-04-22
160 |
161 | Bug fixes:
162 | * Fix bug introduced in the last release in `select` and `select_*` query interval semantics. Results of select are now consistent with the query interval being interpreted as half-open, closed on the left.
163 |
164 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.4.0...v0.4.1
165 |
166 | ## v0.4.0
167 |
168 | Date: 2023-03-23
169 |
170 | API changes:
171 | * New strand-aware directionality options for `closest()` via `direction_col` #129.
172 | * New index-based range query selectors on single bioframes to complement `select()` #128:
173 | * `select_mask()` returns boolean indices corresponding to intervals that overlap the query region
174 | * `select_indices()` returns integer indices corresponding to intervals that overlap the query region
175 | * `select_labels()` returns pandas label indices corresponding to intervals that overlap the query region
176 |
177 | Bug fixes:
178 | * Import fixes in sandbox
179 | * Relax bioframe validator to permit using same column as start and end (e.g. point variants).
180 |
181 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.3...v0.4.0
182 |
183 | ## v0.3.3
184 |
185 | Date: 2022-02-28
186 |
187 | Bug fixes:
188 | * fixed a couple functions returning an error instance instead of raising
189 | * fetch_mrna link fixed
190 |
191 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.2...v0.3.3
192 |
193 | ## v0.3.2
194 |
195 | Date: 2022-02-01
196 |
197 | Bug fixes:
198 | * fixed error in is_contained
199 | * tutorial updates
200 |
201 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.1...v0.3.2
202 |
203 | ## v0.3.1
204 |
205 | Date: 2021-11-15
206 |
207 | API changes:
208 |
209 | * `bioframe.sort_bedframe` does not append columns or modify their dtypes.
210 |
211 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.0...v0.3.1
212 |
213 | ## v0.3.0
214 |
215 | Date: 2021-08-31
216 |
217 | Conceptual changes:
218 | * we formulated strict definitions for genomic intervals, dataframes, and
219 | their various properties. All bioframe functions are expected to follow
220 | these definitions tightly.
221 |
222 | API changes:
223 | * reorganize modules:
224 | * ops - operations on genomic interval dataframes
225 | * extras - miscellaneous operations, most involving
226 | genomic sequences and gene annotations
227 | * vis - visualizations of genomic interval dataframes
228 | * core.arrops - operations on genomic interval arrays
229 | * core.checks - tests for definitions of genomic interval dataframes
230 | * core.construction - construction and sanitation of genomic interval dataframes
231 | * core.specs - specifications for the implementation of genomic intervals in pandas.dataframes
232 | (i.e. column names, datatypes, etc)
233 | * core.stringops - operations on genomic interval strings
234 | * io.fileops - I/O on common file formats for genomic data
235 | * io.schemas - schemas for standard tabular formats for genomic data storage
236 | * io.resources - interfaces to popular online genomic data resources
237 |
238 | * new functions: extras.pair_by_distance, ops.sort_bedframe, ops.assign_view,
239 | dataframe constructors
240 |
241 | * existing functions:
242 | * expand: take negative values and fractional values
243 | * overlap: change default suffixes, keep_order=True
244 | * subtract: add return_index and keep_order
245 |
246 | * enable pd.NA for missing values, typecasting
247 |
248 | New data:
249 | * add schemas for bedpe, gap, UCSCmRNA, pgsnp
250 | * add tables with curated detailed genome assembly information
251 |
252 | Bugfixes:
253 | * None?..
254 |
255 | Miscellaneous:
256 | * `frac_gc` is faster now
257 | * drop support for Python 3.6, add support for 3.9
258 |
259 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.2.0...v0.3.0
260 |
261 | ## v0.2.0
262 |
263 | Date: 2020-12-02
264 |
265 | API changes:
266 | * `read_chromsizes` and `fetch_chromsizes`: add new `as_bed` parameter.
267 | * `read_chromsizes` and `fetch_chromsizes`: revert to filtering chromosome names by default, but clearly expose `filter_chroms` kwarg.
268 |
269 | Bug fixes:
270 | * Fixed `bioframe.split`
271 | * Restored `frac_genome_coverage`
272 |
273 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.1.0...v0.2.0
274 |
275 | ## v0.1.0
276 |
277 | Date: 2020-09-23
278 |
279 | First beta release.
280 |
281 | What's new:
282 | * New extensive dataframe genomic interval arithmetic toolsuite.
283 | * Improved region handling and region querying functions.
284 | * [Documentation!](https://bioframe.readthedocs.io/)
285 |
286 | Maintenance:
287 | * Dropped Python 2 support
288 | * Refactoring of various genome operations and resources.
289 | * Improved testing and linting
290 |
291 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.0.12...v0.1.0
292 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | type: software
3 | title: bioframe
4 | license: MIT
5 | repository-code: 'https://github.com/open2c/bioframe'
6 | message: >-
7 | If you use this software, please cite it using the
8 | metadata from this file.
9 | authors:
10 | - given-names: Nezar
11 | family-names: Abdennur
12 | orcid: 'https://orcid.org/0000-0001-5814-0864'
13 | - given-names: Geoffrey
14 | family-names: Fudenberg
15 | orcid: "https://orcid.org/0000-0001-5905-6517"
16 | - given-names: Ilya
17 | family-names: Flyamer
18 | orcid: "https://orcid.org/0000-0002-4892-4208"
19 | - given-names: Aleksandra
20 | family-names: Galitsyna
21 | orcid: "https://orcid.org/0000-0001-8969-5694"
22 | - given-names: Anton
23 | family-names: Goloborodko
24 | orcid: "https://orcid.org/0000-0002-2210-8616"
25 | - given-names: Maxim
26 | family-names: Imakaev
27 | orcid: "https://orcid.org/0000-0002-5320-2728"
28 | - given-names: Sergey
29 | family-names: Venev
30 | orcid: "https://orcid.org/0000-0002-1507-7460"
31 | abstract: >-
32 | Bioframe is a library to enable flexible and performant
33 | operations on genomic interval data frames in Python.
34 | keywords:
35 | - bioinformatics
36 | - genomics
37 | - ranges
38 | - intervals
39 | - dataframes
40 | - pandas
41 | - numpy
42 | - Python
43 | identifiers:
44 | - type: doi
45 | value: 10.5281/zenodo.3897573
46 | description: Zenodo
47 | - type: doi
48 | value: 10.1101/2022.02.16.480748
49 | description: bioRxiv preprint
50 | - type: doi
51 | value: 10.1093/bioinformatics/btae088
52 | description: Publication
53 | preferred-citation:
54 | type: article
55 | title: "Bioframe: Operations on Genomic Intervals in Pandas Dataframes"
56 | authors:
57 | - family-names: Open2C
58 | - given-names: Nezar
59 | family-names: Abdennur
60 | orcid: 'https://orcid.org/0000-0001-5814-0864'
61 | - given-names: Geoffrey
62 | family-names: Fudenberg
63 | orcid: "https://orcid.org/0000-0001-5905-6517"
64 | - given-names: Ilya
65 | family-names: Flyamer
66 | name-suffix: M
67 | orcid: "https://orcid.org/0000-0002-4892-4208"
68 | - given-names: Aleksandra
69 | family-names: Galitsyna
70 | name-suffix: A
71 | orcid: "https://orcid.org/0000-0001-8969-5694"
72 | - given-names: Anton
73 | family-names: Goloborodko
74 | orcid: "https://orcid.org/0000-0002-2210-8616"
75 | - given-names: Maxim
76 | family-names: Imakaev
77 | orcid: "https://orcid.org/0000-0002-5320-2728"
78 | - given-names: Sergey
79 | family-names: Venev
80 | orcid: "https://orcid.org/0000-0002-1507-7460"
81 | journal: Bioinformatics
82 | year: 2024
83 | url: "https://doi.org/10.1093/bioinformatics/btae088"
84 | doi: "10.1093/bioinformatics/btae088"
85 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 |
4 | ## General guidelines
5 |
6 | If you haven't contributed to open-source before, we recommend you read [this excellent guide by GitHub on how to contribute to open source](https://opensource.guide/how-to-contribute). The guide is long, so you can gloss over things you're familiar with.
7 |
8 | If you're not already familiar with it, we follow the [fork and pull model](https://help.github.com/articles/about-collaborative-development-models) on GitHub. Also, check out this recommended [git workflow](https://www.asmeurer.com/git-workflow/).
9 |
10 |
11 | ## Contributing Code
12 |
13 | This project has a number of requirements for all code contributed.
14 |
15 | * We follow the [PEP-8 style](https://www.python.org/dev/peps/pep-0008/) convention.
16 | * We use [NumPy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html).
17 | * It's ideal if user-facing API changes or new features have documentation added.
18 | * It is best if all new functionality and/or bug fixes have unit tests added with each use-case.
19 |
20 |
21 | ## Setting up Your Development Environment
22 |
23 | This project uses the [hatch](https://hatch.pypa.io/latest/) project manager and build system. We recommend you install `hatch` as a global isolated application using [pipx](https://pipx.pypa.io/stable/). See other installation options [here](https://hatch.pypa.io/latest/install/).
24 |
25 | ```sh
26 | pipx install hatch
27 | ```
28 |
29 | > [!NOTE]
30 | > Many custom command shortcuts are accessible through hatch (and shown below). See `tool.hatch.envs.default.scripts` in our project's `pyproject.toml` configuration file.
31 |
32 | After forking and cloning the repository, you can create an isolated Python development environment and install the package in "editable" (i.e. development) mode as follows:
33 |
34 | ```sh
35 | git clone https://github.com/open2c/bioframe.git
36 | cd bioframe
37 | hatch shell
38 | ```
39 |
40 | The first time you run `hatch shell` the environment will be created and activated, and the package will be installed. In future sessions, running `hatch shell` will reactivate your development environment.
41 |
42 | > [!TIP]
43 | > If you prefer to store your virtual environments in your working directory (like classic virtualenvs) rather than in a centralized location (similar to conda), configure hatch as follows:
44 | >
45 | > ```sh
46 | > hatch config set dirs.env.virtual .venv
47 | > ```
48 | >
49 | > This will make hatch set up its environments within the current working directory under `.venv`.
50 |
51 | Alternatively, if you prefer to manage your virtual environments yourself, you can install the package for development using, for example:
52 |
53 | ```sh
54 | python -m venv .venv
55 | source .venv/bin/activate
56 | pip install -e '.[dev,test,docs]'
57 | ```
58 |
59 | For all pull requests, linting and unit tests are automatically run using the [GitHub Actions](https://docs.github.com/en/actions) Continuous Integration service. However, you are still encouraged to run these checks locally before pushing code to a PR.
60 |
61 | ## Linting and Formatting
62 |
63 | We use [ruff](https://docs.astral.sh/ruff/) for style checking. Run `ruff check .` or:
64 |
65 | ```sh
66 | hatch run lint
67 | ```
68 |
69 | Ruff can fix a lot of errors itself. Run `ruff check --fix .` or:
70 |
71 | ```sh
72 | hatch run fix
73 | ```
74 |
75 | Ruff includes a formatter that mimics [black](https://black.readthedocs.io/en/stable/). To automatically reformat your code, you can use `ruff format {source_file}`.
76 |
77 | We use [pre-commit](https://github.com/pre-commit/pre-commit) to make sure the coding style is enforced. You first need to install pre-commit and the corresponding git commit hooks:
78 |
79 | ```sh
80 | pip install pre-commit
81 | pre-commit install
82 | ```
83 |
84 | The last command installs the hooks listed in `.pre-commit-config.yaml` locally into your git repo. If you do this, the checks will run automatically before every commit. You can also manually make sure your code satisfies the coding style:
85 |
86 | ```sh
87 | pre-commit run --all-files
88 | ```
89 |
90 | ## Testing
91 |
92 | It is best if all new functionality and/or bug fixes have unit tests added with each use-case.
93 |
94 | We use [pytest](https://docs.pytest.org/en/latest) as our unit testing framework. Once you've configured your environment, you can just `cd` to the root of your repository and run `pytest` or:
95 |
96 | ```sh
97 | hatch run test
98 | ```
99 |
100 | ## Documentation
101 |
102 | If a feature is stable and relatively finalized, it is time to add it to the documentation. If you are adding any private/public functions, it is best to add docstrings, to aid in reviewing code and also for the API reference.
103 |
104 | We use [NumPy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html) and [Sphinx](http://www.sphinx-doc.org/en/stable) to document this library. Sphinx, in turn, uses [reStructuredText](http://www.sphinx-doc.org/en/stable/rest.html) as its markup language for adding code.
105 |
106 | We use the [Sphinx Autosummary extension](http://www.sphinx-doc.org/en/stable/ext/autosummary.html) to generate API references. You may want to look at `docs/api-*.rst` files to see how they look and where to add new functions, classes or modules. We also use the [myst_nb extension](https://myst-nb.readthedocs.io/en/latest/) to render Jupyter notebooks in the documentation.
107 |
108 | To build the documentation, run `sphinx-autobuild` using:
109 |
110 | ```sh
111 | hatch run docs
112 | ```
113 |
114 | This will build the documentation and serve it on a local http server which listens for changes and automatically rebuilds.
115 |
116 | Documentation from the `main` branch and tagged releases is automatically built and hosted on [readthedocs](https://readthedocs.org/).
117 |
118 |
119 | ## Acknowledgments
120 |
121 | This document is based on the [guidelines from the sparse project](https://github.com/pydata/sparse/blob/master/docs/contributing.rst).
122 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Open2C Developers
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bioframe: Operations on Genomic Interval Dataframes
2 |
3 |
4 |
5 | 
6 | [](https://results.pre-commit.ci/latest/github/open2c/bioframe/main)
7 | [](https://bioframe.readthedocs.io/en/latest/)
8 | [](https://doi.org/10.1093/bioinformatics/btae088)
9 | [](https://zenodo.org/badge/latestdoi/69901992)
10 | [](https://bit.ly/open2c-slack)
11 | [](https://www.numfocus.org)
12 |
13 | Bioframe enables flexible and scalable operations on genomic interval dataframes in Python.
14 |
15 | Bioframe is built directly on top of [Pandas](https://pandas.pydata.org/). Bioframe provides:
16 |
17 | * A variety of genomic interval operations that work directly on dataframes.
18 | * Operations for special classes of genomic intervals, including chromosome arms and fixed-size bins.
19 | * Conveniences for diverse tabular genomic data formats and loading genome assembly summary information.
20 |
21 | Read the [documentation](https://bioframe.readthedocs.io/en/latest/), including the [guide](https://bioframe.readthedocs.io/en/latest/guide-intervalops.html), as well as the [publication](https://doi.org/10.1093/bioinformatics/btae088) for more information.
22 |
23 | Bioframe is an Affiliated Project of [NumFOCUS](https://www.numfocus.org).
24 |
25 | ## Installation
26 |
27 | Bioframe is available on [PyPI](https://pypi.org/project/bioframe/) and [bioconda](https://bioconda.github.io/recipes/bioframe/README.html):
28 |
29 | ```sh
30 | pip install bioframe
31 | ```
32 |
33 | ## Contributing
34 |
35 | Interested in contributing to bioframe? That's great! To get started, check out the [contributing guide](https://github.com/open2c/bioframe/blob/main/CONTRIBUTING.md). Discussions about the project roadmap take place on the [Open2C Slack](https://bit.ly/open2c-slack) and at regular developer meetings scheduled there. Anyone can join and participate!
36 |
37 |
38 | ## Interval operations
39 |
40 | Key genomic interval operations in bioframe include:
41 | - `overlap`: Find pairs of overlapping genomic intervals between two dataframes.
42 | - `closest`: For every interval in a dataframe, find the closest intervals in a second dataframe.
43 | - `cluster`: Group overlapping intervals in a dataframe into clusters.
44 | - `complement`: Find genomic intervals that are not covered by any interval from a dataframe.
45 |
46 | Bioframe additionally has functions that are frequently used for genomic interval operations and can be expressed as combinations of these core operations and dataframe operations, including: `coverage`, `expand`, `merge`, `select`, and `subtract`.
47 |
48 | To `overlap` two dataframes, call:
49 | ```python
50 | import bioframe as bf
51 |
52 | bf.overlap(df1, df2)
53 | ```
54 |
55 | For these two input dataframes, with intervals all on the same chromosome:
56 |
57 |
58 |
59 |
60 | `overlap` will return the following interval pairs as overlaps:
61 |
62 |
63 |
64 |
65 |
66 | To `merge` all overlapping intervals in a dataframe, call:
67 | ```python
68 | import bioframe as bf
69 |
70 | bf.merge(df1)
71 | ```
72 |
73 | For this input dataframe, with intervals all on the same chromosome:
74 |
75 |
76 |
77 | `merge` will return a new dataframe with these merged intervals:
78 |
79 |
80 |
81 | See the [guide](https://bioframe.readthedocs.io/en/latest/guide-intervalops.html) for visualizations of other interval operations in bioframe.
82 |
83 | ## File I/O
84 |
85 | Bioframe includes utilities for reading genomic file formats into dataframes and vice versa. One handy function is `read_table`, which mirrors pandas's `read_csv`/`read_table` but provides a [`schema`](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py) argument to populate column names for common tabular file formats.
86 |
87 | ```python
88 | jaspar_url = 'http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2022/hg38/MA0139.1.tsv.gz'
89 | ctcf_motif_calls = bioframe.read_table(jaspar_url, schema='jaspar', skiprows=1)
90 | ```
91 |
92 | ## Tutorials
93 | See this [Jupyter notebook](https://github.com/open2c/bioframe/blob/main/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb) for an example of how to assign TF motifs to ChIP-seq peaks using bioframe.
94 |
95 |
96 | ## Citing
97 |
98 | If you use ***bioframe*** in your work, please cite:
99 |
100 | ```bibtex
101 | @article{bioframe_2024,
102 | author = {Open2C and Abdennur, Nezar and Fudenberg, Geoffrey and Flyamer, Ilya M and Galitsyna, Aleksandra A and Goloborodko, Anton and Imakaev, Maxim and Venev, Sergey},
103 | doi = {10.1093/bioinformatics/btae088},
104 | journal = {Bioinformatics},
105 | title = {{Bioframe: Operations on Genomic Intervals in Pandas Dataframes}},
106 | year = {2024}
107 | }
108 | ```
109 |
--------------------------------------------------------------------------------
/bioframe/__init__.py:
--------------------------------------------------------------------------------
# Prefer the standard-library ``importlib.metadata``; fall back to the
# ``importlib_metadata`` backport when the stdlib module cannot be imported.
try:
    from importlib.metadata import PackageNotFoundError, version
except ImportError:
    from importlib_metadata import PackageNotFoundError, version

# Look up the version of the "bioframe" distribution; use a placeholder when
# no metadata for it can be found.
try:
    __version__ = version("bioframe")
except PackageNotFoundError:
    __version__ = "unknown"
10 |
11 | __all__ = [
12 | "arrops",
13 | "from_any",
14 | "from_dict",
15 | "from_list",
16 | "from_series",
17 | "is_bedframe",
18 | "is_cataloged",
19 | "is_chrom_dtype",
20 | "is_complete_ucsc_string",
21 | "is_contained",
22 | "is_covering",
23 | "is_overlapping",
24 | "is_sorted",
25 | "is_tiling",
26 | "is_viewframe",
27 | "make_viewframe",
28 | "parse_region",
29 | "parse_region_string",
30 | "sanitize_bedframe",
31 | "to_ucsc_string",
32 | "update_default_colnames",
33 | "binnify",
34 | "digest",
35 | "frac_gc",
36 | "frac_gene_coverage",
37 | "frac_mapped",
38 | "make_chromarms",
39 | "pair_by_distance",
40 | "seq_gc",
41 | "SCHEMAS",
42 | "UCSCClient",
43 | "assemblies_available",
44 | "assembly_info",
45 | "fetch_centromeres",
46 | "fetch_chromsizes",
47 | "load_fasta",
48 | "read_alignments",
49 | "read_bam",
50 | "read_bigbed",
51 | "read_bigwig",
52 | "read_chromsizes",
53 | "read_pairix",
54 | "read_tabix",
55 | "read_table",
56 | "to_bed",
57 | "to_bigbed",
58 | "to_bigwig",
59 | "assign_view",
60 | "closest",
61 | "cluster",
62 | "complement",
63 | "count_overlaps",
64 | "coverage",
65 | "expand",
66 | "merge",
67 | "overlap",
68 | "mark_runs",
69 | "merge_runs",
70 | "select",
71 | "select_indices",
72 | "select_labels",
73 | "select_mask",
74 | "setdiff",
75 | "sort_bedframe",
76 | "subtract",
77 | "trim",
78 | "plot_intervals",
79 | "to_ucsc_colorstring",
80 | ]
81 |
82 | from .core import (
83 | arrops,
84 | from_any,
85 | from_dict,
86 | from_list,
87 | from_series,
88 | is_bedframe,
89 | is_cataloged,
90 | is_chrom_dtype,
91 | is_complete_ucsc_string,
92 | is_contained,
93 | is_covering,
94 | is_overlapping,
95 | is_sorted,
96 | is_tiling,
97 | is_viewframe,
98 | make_viewframe,
99 | parse_region,
100 | parse_region_string,
101 | sanitize_bedframe,
102 | to_ucsc_string,
103 | update_default_colnames,
104 | )
105 | from .extras import (
106 | binnify,
107 | digest,
108 | frac_gc,
109 | frac_gene_coverage,
110 | frac_mapped,
111 | make_chromarms,
112 | mark_runs,
113 | merge_runs,
114 | pair_by_distance,
115 | seq_gc,
116 | )
117 | from .io import (
118 | SCHEMAS,
119 | UCSCClient,
120 | assemblies_available,
121 | assembly_info,
122 | fetch_centromeres,
123 | fetch_chromsizes,
124 | load_fasta,
125 | read_alignments,
126 | read_bam,
127 | read_bigbed,
128 | read_bigwig,
129 | read_chromsizes,
130 | read_pairix,
131 | read_tabix,
132 | read_table,
133 | to_bed,
134 | to_bigbed,
135 | to_bigwig,
136 | )
137 | from .ops import (
138 | assign_view,
139 | closest,
140 | cluster,
141 | complement,
142 | count_overlaps,
143 | coverage,
144 | expand,
145 | merge,
146 | overlap,
147 | select,
148 | select_indices,
149 | select_labels,
150 | select_mask,
151 | setdiff,
152 | sort_bedframe,
153 | subtract,
154 | trim,
155 | )
156 | from .vis import plot_intervals, to_ucsc_colorstring
157 |
# Remove the metadata helpers from the package namespace once __version__
# has been computed.
del version, PackageNotFoundError
159 |
--------------------------------------------------------------------------------
/bioframe/core/__init__.py:
--------------------------------------------------------------------------------
1 | from . import arrops
2 | from .checks import (
3 | is_bedframe,
4 | is_cataloged,
5 | is_contained,
6 | is_covering,
7 | is_overlapping,
8 | is_sorted,
9 | is_tiling,
10 | is_viewframe,
11 | )
12 | from .construction import (
13 | from_any,
14 | from_dict,
15 | from_list,
16 | from_series,
17 | make_viewframe,
18 | sanitize_bedframe,
19 | )
20 | from .specs import is_chrom_dtype, update_default_colnames
21 | from .stringops import (
22 | is_complete_ucsc_string,
23 | parse_region,
24 | parse_region_string,
25 | to_ucsc_string,
26 | )
27 |
28 | __all__ = [
29 | "arrops",
30 | "is_bedframe",
31 | "is_cataloged",
32 | "is_contained",
33 | "is_covering",
34 | "is_overlapping",
35 | "is_sorted",
36 | "is_tiling",
37 | "is_viewframe",
38 | "from_any",
39 | "from_dict",
40 | "from_list",
41 | "from_series",
42 | "make_viewframe",
43 | "sanitize_bedframe",
44 | "is_chrom_dtype",
45 | "update_default_colnames",
46 | "is_complete_ucsc_string",
47 | "parse_region",
48 | "parse_region_string",
49 | "to_ucsc_string",
50 | ]
51 |
--------------------------------------------------------------------------------
/bioframe/core/construction.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from . import checks
5 | from .specs import _get_default_colnames, _verify_columns, is_chrom_dtype
6 | from .stringops import is_complete_ucsc_string, parse_region_string, to_ucsc_string
7 |
8 | __all__ = [
9 | "from_dict",
10 | "from_series",
11 | "from_list",
12 | "from_any",
13 | "make_viewframe",
14 | "sanitize_bedframe",
15 | ]
16 |
17 | ### conversions from various input formats into dataframes ###
18 |
19 |
def from_dict(regions, cols=None):
    """
    Makes a dataframe from a dictionary of {str,int} pairs, interpreted as
    chromosome names and interval ends.

    Every key becomes one interval ``(key, 0, value)``.

    Note that {str,(int,int)} dictionaries of tuples are no longer supported!

    Parameters
    ----------
    regions : dict
        Mapping (or anything accepted by ``dict()``) from chromosome name to
        a scalar end coordinate.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. If None, the package-wide default column names
        are used.

    Returns
    -------
    df : pandas.DataFrame
        One row per key, with the three columns named by ``cols``.

    Raises
    ------
    ValueError
        If any value of ``regions`` is not a scalar.
    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
    data = []
    for chrom, end in dict(regions).items():
        if not np.isscalar(end):
            # f-string so that the offending value type is actually reported.
            raise ValueError(f"Unsupported dict format: {type(end)}")
        # Dictionary input always describes intervals starting at 0.
        data.append([chrom, 0, end])
    return pd.DataFrame(data, columns=[ck1, sk1, ek1])
55 |
56 |
def from_series(regions, cols=None):
    """
    Makes an interval dataframe from a series of end coordinates.

    The index of ``regions`` fills the chromosome column, its values fill the
    end column, and every start is set to 0.

    Parameters
    ----------
    regions : pandas.Series

    cols : (str, str, str) or None
        Names for the chromosome, start and end columns. If None, the
        package-wide default column names are used.

    Returns
    -------
    df : pandas.DataFrame
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_key, start_key, end_key = cols
    columns = {
        chrom_key: regions.index.values,
        start_key: 0,
        end_key: regions.values,
    }
    return pd.DataFrame(columns)
62 |
63 |
def from_list(regions, name_col="name", cols=None):
    """
    Makes an interval dataframe from a list of 3- or 4-field records.

    Parameters
    ----------
    regions : list
        Records of the form (chrom, start, end) or (chrom, start, end, name).

    name_col : str
        Label given to the fourth column, when present. Default "name".

    cols : (str, str, str) or None
        Names for the chromosome, start and end columns. If None, the
        package-wide default column names are used.

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    ValueError
        If the records do not produce exactly 3 or 4 columns.
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_key, start_key, end_key = cols
    frame = pd.DataFrame(regions)

    # Column labels to apply, keyed by the number of fields per record.
    labels_by_width = {
        3: [chrom_key, start_key, end_key],
        4: [chrom_key, start_key, end_key, name_col],
    }
    n_fields = frame.shape[1]
    if n_fields not in labels_by_width:
        raise ValueError("wrong number of columns for list input format")
    frame.columns = labels_by_width[n_fields]
    return frame
74 |
75 |
def from_ucsc_string_list(region_list, cols=None):
    """
    Makes an interval dataframe by parsing each string in ``region_list``
    with ``parse_region_string``.

    Parameters
    ----------
    region_list : iterable of str

    cols : (str, str, str) or None
        Names for the chromosome, start and end columns. If None, the
        package-wide default column names are used.

    Returns
    -------
    df : pandas.DataFrame
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_key, start_key, end_key = cols
    triples = list(map(parse_region_string, region_list))
    return pd.DataFrame(triples, columns=[chrom_key, start_key, end_key])
81 |
82 |
def from_any(regions, fill_null=False, name_col="name", cols=None):
    """
    Attempts to make a genomic interval dataframe with columns
    [chr, start, end, name_col] from a variety of input types.

    Parameters
    ----------
    regions : supported input
        Currently supported inputs:

        - dataframe
        - series of UCSC strings
        - dictionary of {str:int} key value pairs
        - pandas series where the index is interpreted as chromosomes and
          values are interpreted as end
        - list of tuples or lists, either [(chrom,start,end)] or
          [(chrom,start,end,name)]
        - tuple of tuples or lists, either [(chrom,start,end)] or
          [(chrom,start,end,name)]

        A flat list or tuple of exactly three elements is always treated as
        a single (chrom, start, end) record, even if all three elements are
        strings.

    fill_null : False or dictionary
        Accepts a dictionary of {str:int} pairs, interpreted as chromosome sizes.
        Kept for backwards compatibility. Default False.

    name_col : str
        Column name. Only used if 4 column list is provided. Default "name".

    cols : (str,str,str)
        Names for dataframe columns.
        Default None sets them with get_default_colnames().

    Returns
    -------
    out_df:dataframe

    Raises
    ------
    ValueError
        If the input type or dataframe layout is not recognized, or if
        missing ends cannot be filled from ``fill_null``.

    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

    if isinstance(regions, pd.DataFrame):
        if {ck1, sk1, ek1}.issubset(regions.columns):
            out_df = regions.copy()
        elif (
            # Guard the lookups below so that a frame without ``name_col``
            # (or without rows) reports an unknown format instead of leaking
            # a KeyError/IndexError.
            name_col in regions.columns
            and len(regions) > 0
            and len(regions[name_col].values.shape) == 1
            and is_complete_ucsc_string(regions[name_col].values[0])
        ):
            out_df = from_ucsc_string_list(
                regions[name_col].values, cols=[ck1, sk1, ek1]
            )
        else:
            raise ValueError("Unknown dataFrame format: check column names")

    elif isinstance(regions, dict):
        out_df = from_dict(regions, cols=[ck1, sk1, ek1])

    elif isinstance(regions, pd.Series):
        out_df = from_series(regions, cols=[ck1, sk1, ek1])

    elif isinstance(regions, (tuple, list)):
        # Tuples and lists are handled identically.
        if np.shape(regions) == (3,):
            # A single flat (chrom, start, end) record.
            out_df = from_list([regions], name_col=name_col, cols=[ck1, sk1, ek1])
        elif len(np.shape(regions)) == 1 and isinstance(regions[0], str):
            out_df = from_ucsc_string_list(regions, cols=[ck1, sk1, ek1])
        else:
            out_df = from_list(list(regions), name_col=name_col, cols=[ck1, sk1, ek1])
    else:
        raise ValueError(f"Unknown input format: {type(regions)}")

    if fill_null:
        out_df[sk1] = pd.to_numeric(out_df[sk1]).fillna(0)
        try:
            # Only ends that are literally None are looked up in fill_null.
            out_df[ek1] = [
                fill_null[chrom] if end is None else end
                for chrom, end in zip(out_df[ck1].values, out_df[ek1].values)
            ]
        except Exception as e:
            raise ValueError("could not fill ends with provided chromsizes") from e

    return out_df
172 |
173 |
def add_ucsc_name_column(reg_df, name_col="name", cols=None):
    """
    Returns a copy of ``reg_df`` with a UCSC-style 'chrom:start-end' name for
    each (chrom, start, end) row.

    The names are written to ``name_col``, replacing that column if it
    already exists. The input dataframe is not modified.
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_key, start_key, end_key = cols

    named = reg_df.copy()
    _verify_columns(named, [chrom_key, start_key, end_key])
    triples = zip(named[chrom_key], named[start_key], named[end_key])
    named[name_col] = list(map(to_ucsc_string, triples))
    return named
187 |
188 |
def make_viewframe(
    regions,
    check_bounds=None,
    name_style=None,
    view_name_col="name",
    cols=None,
):
    """
    Builds a dataframe `view_df` from regions and validates it as a view.

    Parameters
    ----------
    regions : supported input type
        Currently supported input types:

        - a dictionary where keys are strings and values are integers
          {str:int}, specifying regions (chrom, 0, end, chrom)
        - a pandas series of chromosomes lengths with index specifying region names
        - a list of tuples [(chrom,start,end), ...] or [(chrom,start,end,name), ...]
        - a pandas DataFrame, skips to validation step

    check_bounds : None, or chromosome sizes provided as any of valid formats above
        Optional. If provided, the view regions must be contained by the
        regions supplied here, typically a series of chromosome sizes.
        Default None.

    name_style : None or "ucsc"
        Only consulted when the view has no column ``view_name_col``.
        If None, names are copied from the chromosome column (cols[0]).
        If "ucsc" (case-insensitive), UCSC style names are created.

    view_name_col : str
        Specifies column name of the view regions. Default 'name'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. If None, the package-wide default column names
        are used.

    Returns
    -------
    view_df:dataframe satisfying properties of a view

    Raises
    ------
    ValueError
        If the regions are not contained by ``check_bounds``, if
        ``name_style`` is not recognized, or if the result is not a valid
        view.

    """
    chrom_key, _, _ = _get_default_colnames() if cols is None else cols

    view_df = from_any(regions, name_col=view_name_col, cols=cols)

    if check_bounds is not None:
        bounds_df = from_any(check_bounds, name_col="bounds", cols=cols)
        within_bounds = checks.is_contained(
            view_df,
            bounds_df,
            df_view_col=None,
            view_name_col="bounds",
            cols=cols,
        )
        if not within_bounds:
            raise ValueError(
                "Invalid input to make a viewFrame, regions not contained by bounds"
            )

    # Names are only generated when the input did not already carry them.
    needs_names = view_name_col not in view_df.columns
    if needs_names and name_style is None:
        view_df[view_name_col] = view_df[chrom_key].values
    elif needs_names and name_style.lower() == "ucsc":
        view_df = add_ucsc_name_column(view_df, name_col=view_name_col, cols=cols)
    elif needs_names:
        raise ValueError("unknown value for name_style")

    is_valid = checks.is_viewframe(
        view_df, view_name_col=view_name_col, cols=cols, raise_errors=True
    )
    if not is_valid:
        raise ValueError("could not make valid viewFrame, retry with new input")
    return view_df
263 |
264 |
def sanitize_bedframe(
    df1,
    recast_dtypes=True,
    drop_null=False,
    start_exceed_end_action=None,
    cols=None,
):
    """
    Attempts to clean a genomic interval dataframe to be a valid bedframe.

    The input dataframe is copied; the copy is cleaned and returned.

    Parameters
    ----------
    df1 : pandas.DataFrame

    recast_dtypes : bool
        Whether to attempt to recast column dtypes to pandas nullable dtypes.

    drop_null : bool
        Drops rows with pd.NA. Default False.

    start_exceed_end_action : str or None
        Options: 'flip' or 'drop' or None (case-insensitive). Default None.

        - If 'flip', attempts to sanitize by flipping intervals with start>end.
        - If 'drop' attempts to sanitize dropping intervals with start>end.
        - If None, does not alter these intervals if present.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. If None, the package-wide default column names
        are used.

    Returns
    -------
    out_df : pandas.DataFrame
        Sanitized dataframe satisfying the properties of a bedframe.

    Raises
    ------
    ValueError
        If ``start_exceed_end_action`` is not recognized while intervals with
        start>end are present, or if the result is still not a valid
        bedframe.

    Notes
    ------
    The option ``start_exceed_end_action='flip'`` may be useful for gff files
    with strand information but starts > ends.

    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

    out_df = df1.copy()

    _verify_columns(out_df, [ck1, sk1, ek1])

    if recast_dtypes:
        chrom_dtype, start_dtype, end_dtype = out_df.dtypes[[ck1, sk1, ek1]]
        if not is_chrom_dtype(chrom_dtype):
            out_df[ck1] = out_df[ck1].astype(str)
        # Compare dtypes by equality: ``pd.Int64Dtype()`` builds a new object
        # on every call, so an identity (``is``) test can never succeed.
        int64 = pd.Int64Dtype()
        if not ((start_dtype == int64) and (end_dtype == int64)):
            out_df[sk1] = out_df[sk1].astype(int64)
            out_df[ek1] = out_df[ek1].astype(int64)

    # An interval with any missing coordinate is blanked out entirely.
    nan_intervals = pd.isnull(out_df[[ck1, sk1, ek1]]).any(axis=1)
    out_df.loc[nan_intervals, [ck1, sk1, ek1]] = pd.NA
    if drop_null:
        out_df.dropna(axis=0, inplace=True)
        out_df.reset_index(drop=True, inplace=True)

    if start_exceed_end_action is not None:
        start_exceed_end_action = start_exceed_end_action.lower()
        negative_width = (out_df[ek1] - out_df[sk1]) < 0
        if negative_width.any():
            inds = negative_width.values
            if start_exceed_end_action == "drop":
                out_df = out_df.loc[inds == 0]
            elif start_exceed_end_action == "flip":
                out_df.loc[inds, [sk1, ek1]] = out_df.loc[inds, [ek1, sk1]].values
            else:
                raise ValueError("unknown action for intervals with start>end")
            out_df.reset_index(drop=True, inplace=True)

    if checks.is_bedframe(out_df, cols=cols):
        return out_df
    else:
        raise ValueError("could not sanitize")
344 |
--------------------------------------------------------------------------------
/bioframe/core/specs.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | __all__ = [
7 | "update_default_colnames",
8 | "is_chrom_dtype",
9 | ]
10 |
# Package-wide default names of the (chrom, start, end) columns. Read by
# ``_get_default_colnames`` and modified by ``update_default_colnames``.
_rc = {"colnames": {"chrom": "chrom", "start": "start", "end": "end"}}
12 |
13 |
14 | def _get_default_colnames():
15 | """
16 | Returns default column names.
17 |
18 | These defaults be updated with :func:`update_default_colnames`.
19 |
20 | Returns
21 | -------
22 | colnames : triplet (str, str, str)
23 |
24 | """
25 | return _rc["colnames"]["chrom"], _rc["colnames"]["start"], _rc["colnames"]["end"]
26 |
27 |
class update_default_colnames:
    """
    Sets the package-wide default (chrom, start, end) column names.

    The new names take effect as soon as the object is constructed. The
    object can also be used as a context manager, in which case the names
    that were in effect before construction are restored on exit.

    Parameters
    ----------
    new_colnames : mapping or iterable of 3 str
        Either a mapping with any of the keys 'chrom', 'start', 'end'
        (other keys are ignored, missing keys keep their current value), or
        an iterable of exactly three names in (chrom, start, end) order.

    Raises
    ------
    ValueError
        If ``new_colnames`` is a bare string, is not a mapping or iterable,
        or is an iterable that does not contain exactly three items.
    """

    def __init__(self, new_colnames):
        # Snapshot the current names so that __exit__ can restore them.
        self._old_colnames = dict(_rc["colnames"])
        # Mappings must be tested first: a dict is also an Iterable, so the
        # opposite order would unpack the dict's *keys* as the new names.
        if isinstance(new_colnames, collections.abc.Mapping):
            _rc["colnames"].update(
                {
                    k: v
                    for k, v in new_colnames.items()
                    if k in ["chrom", "start", "end"]
                }
            )
        elif isinstance(new_colnames, collections.abc.Iterable) and not isinstance(
            new_colnames, (str, bytes)
        ):
            # Materialize first so that iterables without len() also work.
            names = list(new_colnames)
            if len(names) != 3:
                raise ValueError(
                    "Please, specify new columns using a list of "
                    "3 strings or a dict!"
                )
            (
                _rc["colnames"]["chrom"],
                _rc["colnames"]["start"],
                _rc["colnames"]["end"],
            ) = names
        else:
            raise ValueError(
                "Please, specify new columns using a list of " "3 strings or a dict!"
            )

    def __enter__(self):
        return self

    def __exit__(self, *args):
        _rc["colnames"] = self._old_colnames
60 |
61 |
62 | def _verify_columns(df, colnames, unique_cols=False, return_as_bool=False):
63 | """
64 | Raises ValueError if columns with colnames are not present in dataframe df.
65 |
66 | Parameters
67 | ----------
68 | df: pandas.DataFrame
69 |
70 | colnames: list of column names
71 |
72 | return_as_bool : bool
73 | If True, returns as a boolean instead of raising errors. Default False.
74 |
75 | """
76 |
77 | if not isinstance(df, pd.DataFrame):
78 | if return_as_bool:
79 | return False
80 | raise ValueError("df is not a dataframe")
81 |
82 | if unique_cols:
83 | if len(set(colnames)) < len(colnames):
84 | raise ValueError("column names must be unique")
85 |
86 | if not set(colnames).issubset(df.columns):
87 | if return_as_bool:
88 | return False
89 | raise ValueError(
90 | ", ".join(set(colnames).difference(set(df.columns)))
91 | + " not in keys of df.columns"
92 | )
93 | if return_as_bool:
94 | return True
95 |
96 |
97 | def _verify_column_dtypes(df, cols=None, return_as_bool=False):
98 | """
99 | Checks that dataframe `df` has chrom, start, end columns with valid dtypes.
100 | Raises TypeErrors if cols have invalid dtypes.
101 |
102 | Parameters
103 | ----------
104 | df : pandas.DataFrame
105 |
106 | cols : (str, str, str) or None
107 | The names of columns containing the chromosome, start and end of the
108 | genomic intervals, provided separately for each set. The default
109 | values are 'chrom', 'start', 'end'.
110 |
111 | return_as_bool : bool
112 | If true, returns as a boolean instead of raising errors. Default False.
113 |
114 | """
115 | ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
116 | if not _verify_columns(df, [ck1, sk1, ek1], return_as_bool=True):
117 | if return_as_bool:
118 | return False
119 | raise ValueError("could not verify columns")
120 |
121 | chrom_dtype, start_dtype, end_dtype = df.dtypes[[ck1, sk1, ek1]]
122 |
123 | if not is_chrom_dtype(chrom_dtype):
124 | if return_as_bool:
125 | return False
126 | raise TypeError(
127 | "invalid df['chrom'] dtype, must be object, string, or categorical"
128 | )
129 | if not pd.api.types.is_integer_dtype(start_dtype):
130 | if return_as_bool:
131 | return False
132 | raise TypeError("invalid df['start'] dtype, must be integer")
133 |
134 | if not pd.api.types.is_integer_dtype(end_dtype):
135 | if return_as_bool:
136 | return False
137 | raise TypeError("invalid df['end'] dtype, must be integer")
138 |
139 | if return_as_bool:
140 | return True
141 |
142 |
def is_chrom_dtype(chrom_dtype):
    """
    Tells whether ``chrom_dtype`` is one of the dtypes bioframe allows for a
    chromosome column: a string dtype, the object dtype, or a categorical
    dtype.
    """
    dtype_checks = (
        pd.api.types.is_string_dtype(chrom_dtype),
        pd.api.types.is_object_dtype(chrom_dtype),
        isinstance(chrom_dtype, pd.api.types.CategoricalDtype),
    )
    return np.any(dtype_checks)
154 |
--------------------------------------------------------------------------------
/bioframe/core/stringops.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Optional, Tuple, Union
3 |
4 | import pandas as pd
5 |
6 | __all__ = [
7 | "parse_region",
8 | "parse_region_string",
9 | "is_complete_ucsc_string",
10 | "to_ucsc_string",
11 | ]
12 |
# Matches a run of digits, commas and dots. The capturing group makes
# ``NUMERIC_REGEX.split`` keep the matched run as its own list item.
NUMERIC_REGEX = re.compile("([0-9,.]+)")

# Ordered (token name, pattern) alternatives used to tokenize the
# "start-end" part of a range string:
#   HYPHEN - a literal "-"
#   COORD  - digits/commas, an optional fractional part, and an optional
#            alphabetic suffix
#   OTHER  - anything else
RANGE_TOKEN_SPEC = [
    ("HYPHEN", r"-"),
    ("COORD", r"[0-9,]+(\.[0-9]*)?(?:[a-z]+)?"),
    ("OTHER", r".+"),
]

# Alternation of the token patterns as named groups, each preceded by
# optional whitespace; matching is case-insensitive.
RANGE_REGEX = re.compile(
    r"\s*" + r"|\s*".join(rf"(?P<{name}>{token})" for name, token in RANGE_TOKEN_SPEC),
    re.IGNORECASE,
)
25 |
26 |
def to_ucsc_string(grange: Tuple[str, int, int]) -> str:
    """
    Format a genomic range as a UCSC-style string.

    Parameters
    ----------
    grange : tuple or other iterable
        chrom, start, end. Only the first three items are used.

    Returns
    -------
    str
        UCSC-style genomic range string, '{chrom}:{start}-{end}'
    """
    return "{0}:{1}-{2}".format(*grange)
42 |
43 |
def is_complete_ucsc_string(s: str) -> bool:
    """
    Returns True if a string can be parsed into a completely informative
    (chrom, start, end) format.

    Parameters
    ----------
    s : str

    Returns
    -------
    bool
        True if able to be parsed and ``end`` is known. False for anything
        that is not a string, for strings that cannot be parsed, and for
        strings without an end coordinate.

    """
    if not isinstance(s, str):
        return False
    try:
        _, _, end = parse_region_string(s)
    except ValueError:
        # A string that cannot be parsed is not a complete UCSC string;
        # as a predicate, answer False rather than propagate the error.
        return False
    return end is not None
65 |
66 |
67 | def _parse_humanized_int(s: str) -> int:
68 | _, value, unit = NUMERIC_REGEX.split(s.replace(",", ""))
69 |
70 | # No multiplier unit, just return the integer value
71 | if not len(unit):
72 | return int(value)
73 |
74 | # Parse and apply the multiplier. Remaining decimal places are dropped.
75 | value = float(value)
76 | unit = unit.upper().strip()
77 | if unit in ("K", "KB"):
78 | value *= 1_000
79 | elif unit in ("M", "MB"):
80 | value *= 1_000_000
81 | elif unit in ("G", "GB"):
82 | value *= 1_000_000_000
83 | else:
84 | raise ValueError(f"Unknown unit '{unit}'")
85 | return int(value)
86 |
87 |
def parse_region_string(s: str) -> Tuple[str, Optional[int], Optional[int]]:
    """
    Parse a UCSC-style genomic range string into a triple.

    Parameters
    ----------
    s : str
        UCSC-style genomic range string, e.g. "chr5:10,100,000-30,000,000".

    Returns
    -------
    tuple
        (str, int or None, int or None)

        Both coordinates are None when the string contains no colon. The
        end is None when nothing follows the hyphen, e.g. "chr5:100-".

    Raises
    ------
    ValueError
        If the chromosome name is empty, or if the text after the first
        colon does not start with a coordinate followed by a hyphen and
        then either nothing or another coordinate.

    See also
    --------
    parse_region
    """

    def _tokenize(s):
        # Yield (token name, token text) for each successive regex match.
        for match in RANGE_REGEX.finditer(s):
            name = match.lastgroup
            yield name, match.group(name)

    def _parse_range(token_stream):
        # Consume up to three tokens, in order: COORD, HYPHEN, [COORD].
        # An exhausted stream yields (None, None).
        name, token = next(token_stream, (None, None))
        if name != "COORD":
            raise ValueError(f"Expected COORD; got unexpected token: {name}: {token}")
        start = _parse_humanized_int(token)

        name, token = next(token_stream, (None, None))
        if name != "HYPHEN":
            raise ValueError(f"Expected HYPHEN; got unexpected token: {name}: {token}")

        name, token = next(token_stream, (None, None))
        if name is None:  # No end coordinate
            end = None
        elif name == "COORD":
            end = _parse_humanized_int(token)
        else:
            raise ValueError(f"Expected COORD; got unexpected token: {name}: {token}")

        # NOTE(review): any tokens after the end coordinate are not inspected.
        return start, end

    parts = s.split(":")

    chrom = parts[0].strip()
    if not len(chrom):
        raise ValueError("Chromosome name cannot be empty")

    if len(parts) < 2:
        return (chrom, None, None)

    # NOTE(review): only the text between the first and second colon is
    # parsed; anything after a second colon is ignored.
    start, end = _parse_range(_tokenize(parts[1]))

    return chrom, start, end
144 |
145 |
146 | def _parse_region_record(grange: tuple) -> Tuple[str, int, int]:
147 | """
148 | Coerce a genomic range record into a triple.
149 |
150 | Parameters
151 | ----------
152 | grange : str or tuple
153 | * A triple (chrom, start, end), where ``start`` or ``end`` may be
154 | ``None``.
155 | * A quadruple or higher-order tuple, e.g. (chrom, start, end, name).
156 | ``name`` and other fields will be ignored.
157 |
158 | Returns
159 | -------
160 | tuple
161 | A well-formed genomic range triple (str, int, int).
162 | """
163 | if len(grange) < 3:
164 | raise ValueError("Length of a range record should be at least 3")
165 | chrom, start, end = grange[:3]
166 | chrom = str(chrom)
167 | start = int(start) if start is not None else start
168 | end = int(end) if end is not None else end
169 | return chrom, start, end
170 |
171 |
def parse_region(
    grange: Union[str, tuple],
    chromsizes: Optional[Union[dict, pd.Series]] = None,
    *,
    check_bounds: bool = True,
) -> Tuple[str, int, int]:
    """
    Coerce a genomic range string or sequence type into a triple.

    Parameters
    ----------
    grange : str or tuple
        * A UCSC-style genomic range string, e.g. "chr5:10,100,000-30,000,000".
        * A triple (chrom, start, end), where ``start`` or ``end`` may be
          ``None``.
        * A quadruple or higher-order tuple, e.g. (chrom, start, end, name).
          ``name`` and other fields will be ignored.

    chromsizes : dict or Series, optional
        Lookup table of sequence lengths for bounds checking and for
        filling in a missing end coordinate.

    check_bounds : bool, optional [default: True]
        If True, check that the genomic range is within the bounds of the
        sequence.

    Returns
    -------
    tuple
        A genomic range triple (str, int, int). The end is ``None`` when it
        is missing from the input and no ``chromsizes`` are provided.

    Raises
    ------
    ValueError
        If the sequence name is not found in ``chromsizes``, if the end is
        less than the start, or if ``check_bounds`` is True and the range
        falls outside the sequence.

    Notes
    -----
    Genomic ranges are interpreted as half-open intervals (0-based starts,
    1-based ends) along the length coordinate of a sequence.

    Sequence names may contain any character except for whitespace and colon.

    The start coordinate should be 0 or greater and the end coordinate should
    be less than or equal to the length of the sequence, if the latter is
    known. These are enforced when ``check_bounds`` is ``True``.

    If the start coordinate is missing, it is assumed to be 0. If the end
    coordinate is missing and chromsizes are provided, it is replaced with the
    length of the sequence.

    The end coordinate **must** be greater than or equal to the start.

    The start and end coordinates may be suffixed with k(b), M(b), or G(b)
    multipliers, case-insensitive. e.g. "chr1:1K-2M" is equivalent to
    "chr1:1000-2000000".
    """
    # Strings and records are parsed by different helpers.
    parser = parse_region_string if isinstance(grange, str) else _parse_region_record
    chrom, start, end = parser(grange)

    # Look up the sequence length; it also stands in for a missing end.
    seq_len = None
    if chromsizes is not None:
        try:
            seq_len = chromsizes[chrom]
        except KeyError as e:
            raise ValueError(f"Unknown sequence label: {chrom}") from e
        end = seq_len if end is None else end

    # A missing start means the beginning of the sequence.
    start = 0 if start is None else start

    if end is not None and (end < start):
        raise ValueError("End cannot be less than start")

    if check_bounds:
        if start < 0 or (seq_len is not None and end > seq_len):
            raise ValueError(f"Genomic range out of bounds: [{start}, {end})")

    return chrom, start, end
250 |
--------------------------------------------------------------------------------
/bioframe/io/__init__.py:
--------------------------------------------------------------------------------
1 | from .assembly import assemblies_available, assembly_info
2 | from .bed import to_bed
3 | from .fileops import (
4 | load_fasta,
5 | read_alignments,
6 | read_bam,
7 | read_bigbed,
8 | read_bigwig,
9 | read_chromsizes,
10 | read_pairix,
11 | read_tabix,
12 | read_table,
13 | to_bigbed,
14 | to_bigwig,
15 | )
16 | from .resources import UCSCClient, fetch_centromeres, fetch_chromsizes
17 | from .schemas import SCHEMAS
18 |
19 | __all__ = [
20 | "assemblies_available",
21 | "assembly_info",
22 | "read_table",
23 | "read_chromsizes",
24 | "read_tabix",
25 | "read_pairix",
26 | "read_bam",
27 | "read_alignments",
28 | "load_fasta",
29 | "read_bigwig",
30 | "to_bed",
31 | "to_bigwig",
32 | "read_bigbed",
33 | "to_bigbed",
34 | "UCSCClient",
35 | "fetch_centromeres",
36 | "fetch_chromsizes",
37 | "SCHEMAS",
38 | ]
39 |
--------------------------------------------------------------------------------
/bioframe/io/assembly.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | try:
4 | from importlib.resources import files as resource_path
5 | except ImportError:
6 | from importlib_resources import files as resource_path
7 |
8 | from typing import Dict, List, Optional, Tuple, Union
9 |
10 | try:
11 | from typing import Literal
12 | except ImportError:
13 | from typing_extensions import Literal
14 |
15 | import numpy as np
16 | import pandas as pd
17 | import yaml
18 |
19 | from bioframe import make_viewframe
20 |
# Names exported by this module.
__all__ = ["assemblies_available", "assembly_info"]

# Location of the bundled assembly metadata: the ``data`` entry of the
# ``bioframe.io`` package, resolved through ``importlib.resources.files``
# (or the ``importlib_resources`` backport when that import fails).
ASSEMBLY_METADATA_ROOT = resource_path("bioframe.io") / "data"
24 |
25 |
@dataclass
class GenomeAssembly:
    """
    A dataclass containing information about sequences in a genome assembly.

    Attributes
    ----------
    organism : str
        Organism the assembly belongs to.
    provider : str
        Provider of the build.
    provider_build : str
        Name of the build as used by the provider.
    release_year : str
        Release year of the build.
    seqinfo : pandas.DataFrame
        Sequence table. Must contain the columns "name", "length" and
        "aliases"; "aliases" holds comma-separated alternative names and may
        be missing (NaN) for individual sequences.
    cytobands : pandas.DataFrame, optional
        Cytoband table, if one is available for the assembly.
    url : str, optional
        URL associated with the assembly.
    alias_dict : dict
        Mapping from each alias to the sequence name it refers to. It is
        always rebuilt from ``seqinfo`` after initialization, so a value
        passed to the constructor is discarded.
    """

    organism: str
    provider: str
    provider_build: str
    release_year: str
    seqinfo: pd.DataFrame
    cytobands: Optional[pd.DataFrame] = None
    url: Optional[str] = None
    alias_dict: Optional[Dict[str, str]] = None

    def __post_init__(self):
        # Rebuild the alias lookup from the sequence table.
        self.alias_dict = {}
        for name, aliases in zip(self.seqinfo["name"], self.seqinfo["aliases"]):
            # Sequences without aliases are read as NaN (a float), which
            # cannot be split or iterated; they simply contribute no aliases.
            if not isinstance(aliases, str):
                continue
            for alias in aliases.split(","):
                if alias:
                    # If an alias occurs more than once, the last row wins.
                    self.alias_dict[alias] = name

    @property
    def chromsizes(self) -> pd.Series:
        """Sequence lengths as a Series indexed by sequence name."""
        return self.seqinfo.set_index("name")["length"]

    @property
    def chromnames(self) -> List[str]:
        """Sequence names, in the order they appear in ``seqinfo``."""
        return self.seqinfo["name"].tolist()

    @property
    def viewframe(self) -> pd.DataFrame:
        """Result of ``make_viewframe`` applied to the name -> length dict."""
        return make_viewframe(self.chromsizes.to_dict())

    def __repr__(self) -> str:
        # Deliberately abbreviated: the tables are elided as "...".
        return (
            f"GenomeAssembly(organism='{self.organism}', provider='{self.provider}', "
            f"provider_build='{self.provider_build}', "
            f"release_year='{self.release_year}', ...)"
        )
67 |
68 |
def assemblies_available() -> pd.DataFrame:
    """
    Get a table of the genome assembly metadata available in local storage.

    Returns
    -------
    pandas.DataFrame
        A dataframe with one row per entry of the bundled
        ``_assemblies.yml`` manifest and its metadata fields as columns,
        including 'provider', 'provider_build', 'default_roles',
        'default_units', and names of seqinfo and cytoband files.
    """
    manifest = ASSEMBLY_METADATA_ROOT / "_assemblies.yml"
    # Open through the resource object rather than the builtin ``open`` so
    # that this does not depend on the resource being a filesystem path, and
    # pin the encoding instead of relying on the locale default.
    with manifest.open("r", encoding="utf-8") as f:
        assemblies = yaml.safe_load(f)
    return pd.DataFrame.from_records(assemblies)
83 |
84 |
def assembly_info(
    name: str,
    roles: Optional[Union[List, Tuple, Literal["all"]]] = None,
    units: Optional[Union[List, Tuple, Literal["all"]]] = None,
) -> GenomeAssembly:
    """
    Get information about a genome assembly.

    Parameters
    ----------
    name : str
        Name of the assembly. A name that exactly matches a build name is
        used as is, even if it contains a dot (e.g. "T2T-CHM13v2.0").
        Otherwise, if the name contains a dot, the part before the first dot
        is interpreted as the provider (case-insensitive) and the rest as
        the build, e.g. "ucsc.hg38". Without a provider, the build name must
        be unique among the available assemblies.
    roles : list or tuple or "all", optional
        Sequence roles to include in the assembly info. If not specified, only
        sequences with the default sequence roles for the assembly are shown.
        e.g. "assembled", "unlocalized", "unplaced"
    units : list or tuple or "all", optional
        Assembly units to include in the assembly info. If not specified, only
        sequences from the default units for the assembly are shown.
        e.g. "primary", "non-nuclear", "decoy"

    Returns
    -------
    GenomeAssembly
        A dataclass containing information about the assembly.

    Raises
    ------
    ValueError
        If the assembly name is not found or is not unique, or if ``roles``
        or ``units`` is a string other than "all".

    Examples
    --------
    >>> hg38 = assembly_info("hg38")
    >>> hg38.chromsizes
    name
    chr1    248956422
    chr2    242193529
    chr3    198295559
    ...     ...

    >>> assembly_info("hg38", roles=("assembled", "unlocalized"))

    >>> assembly_info("ucsc.hg38", units=("primary", "non-nuclear"))

    """
    assemblies = assemblies_available()

    # Only treat the text before the first dot as a provider when the full
    # name is not itself a known build; build names may legitimately contain
    # dots (e.g. "T2T-CHM13v2.0").
    provider = None
    build = name
    if "." in name and not (assemblies["provider_build"] == name).any():
        provider, build = name.split(".", 1)
        provider = provider.lower()

    # Plain equality masks instead of an interpolated query string, so that
    # arbitrary characters in the name cannot break the lookup.
    selected = assemblies["provider_build"] == build
    if provider is not None:
        selected &= assemblies["provider"] == provider

    result = assemblies.loc[selected]
    if len(result) == 0:
        raise ValueError(f"Assembly not found: {build}")
    elif len(result) > 1:
        raise ValueError(f"Assembly identifer not unique: {result}")

    # Fields absent from a manifest entry show up as NaN; normalize to None.
    assembly = result.iloc[0].replace([np.nan], [None]).to_dict()
    default_roles = assembly["default_roles"]
    default_units = assembly["default_units"]
    seqinfo_path = assembly["seqinfo"]
    seqinfo = pd.read_table(ASSEMBLY_METADATA_ROOT / seqinfo_path)

    # Start from "keep everything" and narrow down by role and by unit;
    # "all" leaves the corresponding dimension unfiltered.
    mask = np.ones(len(seqinfo), dtype=bool)
    if roles is None:
        mask &= seqinfo["role"].isin(default_roles)
    elif isinstance(roles, (tuple, list)):
        mask &= seqinfo["role"].isin(roles)
    elif isinstance(roles, str) and roles != "all":
        raise ValueError(f"roles must be a tuple or 'all', not {roles}")
    if units is None:
        mask &= seqinfo["unit"].isin(default_units)
    elif isinstance(units, (tuple, list)):
        mask &= seqinfo["unit"].isin(units)
    elif isinstance(units, str) and units != "all":
        raise ValueError(f"units must be a tuple or 'all', not {units}")
    seqinfo = seqinfo.loc[mask]

    # Cytobands are optional: only some manifest entries name a file.
    cytobands = None
    cytobands_path = assembly["cytobands"]
    if cytobands_path is not None:
        cytobands = pd.read_table(ASSEMBLY_METADATA_ROOT / cytobands_path)

    return GenomeAssembly(
        organism=assembly["organism"],
        provider=assembly["provider"],
        provider_build=assembly["provider_build"],
        release_year=assembly["release_year"],
        seqinfo=seqinfo,
        cytobands=cytobands,
        url=assembly["url"],
    )
185 |
--------------------------------------------------------------------------------
/bioframe/io/data/_assemblies.yml:
--------------------------------------------------------------------------------
# Manifest of the genome assemblies whose metadata is bundled in this
# directory. Each list item describes one build as named by one provider.
#
#   organism        organism the assembly belongs to
#   provider        provider of the build name, in lowercase (a provider
#                   prefix in a lookup is lowercased before comparison)
#   provider_build  build name as used by that provider
#   release_year    release year of the build
#   seqinfo         sequence table (TSV) in this directory
#   cytobands       optional cytoband table (TSV) in this directory
#   default_roles   values of the seqinfo "role" column kept when the caller
#                   does not ask for specific roles
#   default_units   values of the seqinfo "unit" column kept when the caller
#                   does not ask for specific units
#   url             URL associated with the build
- organism: homo sapiens
  provider: ncbi
  provider_build: GRCh37
  release_year: 2009
  seqinfo: hg19.seqinfo.tsv
  cytobands: hg19.cytoband.tsv
  default_roles: [assembled]
  default_units: [primary, non-nuclear-revised]
  url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.25_GRCh37.p13/GRCh37_seqs_for_alignment_pipelines/
10 | - organism: homo sapiens
11 | provider: ucsc
12 | provider_build: hg19
13 | release_year: 2009
14 | seqinfo: hg19.seqinfo.tsv
15 | cytobands: hg19.cytoband.tsv
16 | default_roles: [assembled]
17 | default_units: [primary, non-nuclear]
18 | url: https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/analysisSet/
19 | - organism: homo sapiens
20 | provider: ncbi
21 | provider_build: GRCh38
22 | release_year: 2013
23 | seqinfo: hg38.seqinfo.tsv
24 | cytobands: hg38.cytoband.tsv
25 | default_roles: [assembled]
26 | default_units: [primary, non-nuclear]
27 | url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.26_GRCh38/GRCh38_major_release_seqs_for_alignment_pipelines/
28 | - organism: homo sapiens
29 | provider: ucsc
30 | provider_build: hg38
31 | release_year: 2013
32 | seqinfo: hg38.seqinfo.tsv
33 | cytobands: hg38.cytoband.tsv
34 | default_roles: [assembled]
35 | default_units: [primary, non-nuclear]
36 | url: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/
37 | - organism: homo sapiens
38 | provider: ncbi
39 | provider_build: T2T-CHM13v2.0
40 | release_year: 2022
41 | seqinfo: hs1.seqinfo.tsv
42 | cytobands: hs1.cytoband.tsv
43 | default_roles: [assembled]
44 | default_units: [primary, non-nuclear]
45 | url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/
46 | - organism: homo sapiens
47 | provider: ucsc
48 | provider_build: hs1
49 | release_year: 2022
50 | seqinfo: hs1.seqinfo.tsv
51 | cytobands: hs1.cytoband.tsv
52 | default_roles: [assembled]
53 | default_units: [primary, non-nuclear]
54 | url: https://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/
55 | - organism: mus musculus
56 | provider: ncbi
57 | provider_build: MGSCv37
58 | release_year: 2010
59 | seqinfo: mm9.seqinfo.tsv
60 | default_roles: [assembled]
61 | default_units: [primary, non-nuclear]
62 | url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.18_MGSCv37/
63 | - organism: mus musculus
64 | provider: ucsc
65 | provider_build: mm9
66 | release_year: 2007
67 | seqinfo: mm9.seqinfo.tsv
68 | default_roles: [assembled]
69 | default_units: [primary, non-nuclear]
70 | url: https://hgdownload.soe.ucsc.edu/goldenPath/mm9/bigZips/
71 | - organism: mus musculus
72 | provider: ncbi
73 | provider_build: GRCm38
74 | release_year: 2011
75 | seqinfo: mm10.seqinfo.tsv
76 | default_roles: [assembled]
77 | default_units: [primary, non-nuclear]
78 | url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.20_GRCm38/
79 | - organism: mus musculus
80 | provider: ucsc
81 | provider_build: mm10
82 | release_year: 2011
83 | seqinfo: mm10.seqinfo.tsv
84 | default_roles: [assembled]
85 | default_units: [primary, non-nuclear]
86 | url: https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/
87 | - organism: mus musculus
88 | provider: ncbi
89 | provider_build: GRCm39
90 | release_year: 2020
91 | seqinfo: mm39.seqinfo.tsv
92 | default_roles: [assembled]
93 | default_units: [primary, non-nuclear]
94 | url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/
95 | - organism: mus musculus
96 | provider: ucsc
97 | provider_build: mm39
98 | release_year: 2020
99 | seqinfo: mm39.seqinfo.tsv
100 | default_roles: [assembled]
101 | default_units: [primary, non-nuclear]
102 | url: https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/
103 | - organism: drosophila melanogaster
104 | provider: ucsc
105 | provider_build: dm3
106 | release_year: 2006
107 | seqinfo: dm3.seqinfo.tsv
108 | default_roles: [assembled]
109 | default_units: [primary, non-nuclear]
110 | url: https://hgdownload.soe.ucsc.edu/goldenPath/dm3/bigZips/
111 | - organism: drosophila melanogaster
112 | provider: ucsc
113 | provider_build: dm6
114 | release_year: 2014
115 | seqinfo: dm6.seqinfo.tsv
116 | default_roles: [assembled]
117 | default_units: [primary, non-nuclear]
118 | url: https://hgdownload.soe.ucsc.edu/goldenPath/dm6/bigZips/
119 | - organism: caenorhabditis elegans
120 | provider: ucsc
121 | provider_build: ce10
122 | release_year: 2010
123 | seqinfo: ce10.seqinfo.tsv
124 | default_roles: [assembled]
125 | default_units: [primary, non-nuclear]
126 | url: https://hgdownload.soe.ucsc.edu/goldenPath/ce10/bigZips/
127 | - organism: caenorhabditis elegans
128 | provider: ucsc
129 | provider_build: ce11
130 | release_year: 2013
131 | seqinfo: ce11.seqinfo.tsv
132 | default_roles: [assembled]
133 | default_units: [primary, non-nuclear]
134 | url: https://hgdownload.soe.ucsc.edu/goldenPath/ce11/bigZips/
135 | - organism: danio rerio
136 | provider: ucsc
137 | provider_build: danRer10
138 | release_year: 2014
139 | seqinfo: danRer10.seqinfo.tsv
140 | default_roles: [assembled]
141 | default_units: [primary, non-nuclear]
142 | url: https://hgdownload.soe.ucsc.edu/goldenPath/danRer10/bigZips/
- organism: danio rerio
  provider: ucsc
  provider_build: danRer11
  release_year: 2017
  # Must point at the danRer11 table, not the danRer10 one used by the
  # previous entry; otherwise this build reports danRer10 sequences.
  seqinfo: danRer11.seqinfo.tsv
  default_roles: [assembled]
  default_units: [primary, non-nuclear]
  url: https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/bigZips/
151 | - organism: saccharomyces cerevisiae
152 | provider: ucsc
153 | provider_build: sacCer3
154 | release_year: 2011
155 | seqinfo: sacCer3.seqinfo.tsv
156 | default_roles: [assembled]
157 | default_units: [primary, non-nuclear]
158 | url: https://hgdownload.soe.ucsc.edu/goldenPath/sacCer3/bigZips/
159 |
--------------------------------------------------------------------------------
/bioframe/io/data/ce10.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chrI 15072423 assembled chrI primary NC_003279.7,I
3 | chrII 15279345 assembled chrII primary NC_003280.9,II
4 | chrIII 13783700 assembled chrIII primary NC_003281.9,III
5 | chrIV 17493793 assembled chrIV primary NC_003282.7,IV
6 | chrV 20924149 assembled chrV primary NC_003283.10,V
7 | chrX 17718866 assembled chrX primary NC_003284.8,X
8 | chrM 13794 assembled chrM non-nuclear NC_001328.1,MT,MtDNA
9 |
--------------------------------------------------------------------------------
/bioframe/io/data/ce11.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chrI 15072434 assembled chrI primary NC_003279.8,I
3 | chrII 15279421 assembled chrII primary NC_003280.10,II
4 | chrIII 13783801 assembled chrIII primary NC_003281.10,III
5 | chrIV 17493829 assembled chrIV primary NC_003282.8,IV
6 | chrV 20924180 assembled chrV primary NC_003283.11,V
7 | chrX 17718942 assembled chrX primary NC_003284.9,X
8 | chrM 13794 assembled chrM non-nuclear NC_001328.1,MT,MtDNA
9 |
--------------------------------------------------------------------------------
/bioframe/io/data/dm3.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chr2L 23011544 assembled chr2 primary 2L,NT_033779.4,AE014134.5
3 | chr2R 21146708 assembled chr2 primary 2R,NT_033778.3,AE013599.4
4 | chr3L 24543557 assembled chr3 primary 3L,NT_037436.3,AE014296.4
5 | chr3R 27905053 assembled chr3 primary 3R,NT_033777.2,AE014297.2
6 | chr4 1351857 assembled chr4 primary 4,NC_004353.3,AE014135.3
7 | chrX 22422827 assembled chrX primary X,NC_004354.3,AE014298.4
8 | chrM 19517 assembled chrM non-nuclear MT,NS_000188.1,FA000001.1
9 | chr2LHet 368872 unlocalized chr2 primary 2LHet,NW_001848855.1,CM000456.1
10 | chr2RHet 3288761 unlocalized chr2 primary 2RHet,NW_001848856.1,CM000457.1
11 | chr3LHet 2555491 unlocalized chr3 primary 3LHet,NW_001848857.1,CM000458.1
12 | chr3RHet 2517507 unlocalized chr3 primary 3RHet,NW_001848858.1,CM000459.1
13 | chrXHet 204112 unlocalized chrX primary XHet,NW_001848859.1,CM000460.1
14 | chrYHet 347038 unlocalized chrY primary YHet,NW_001848860.1,CM000461.1
15 | chrU 10049037 unplaced primary Un,NC_001709.1
16 | chrUextra 29004656 unplaced primary
17 |
--------------------------------------------------------------------------------
/bioframe/io/data/hg19.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chr1 249250621 assembled chr1 primary 1,CM000663.1,NC_000001.10
3 | chr2 243199373 assembled chr2 primary 2,CM000664.1,NC_000002.11
4 | chr3 198022430 assembled chr3 primary 3,CM000665.1,NC_000003.11
5 | chr4 191154276 assembled chr4 primary 4,CM000666.1,NC_000004.11
6 | chr5 180915260 assembled chr5 primary 5,CM000667.1,NC_000005.9
7 | chr6 171115067 assembled chr6 primary 6,CM000668.1,NC_000006.11
8 | chr7 159138663 assembled chr7 primary 7,CM000669.1,NC_000007.13
9 | chr8 146364022 assembled chr8 primary 8,CM000670.1,NC_000008.10
10 | chr9 141213431 assembled chr9 primary 9,CM000671.1,NC_000009.11
11 | chr10 135534747 assembled chr10 primary 10,CM000672.1,NC_000010.10
12 | chr11 135006516 assembled chr11 primary 11,CM000673.1,NC_000011.9
13 | chr12 133851895 assembled chr12 primary 12,CM000674.1,NC_000012.11
14 | chr13 115169878 assembled chr13 primary 13,CM000675.1,NC_000013.10
15 | chr14 107349540 assembled chr14 primary 14,CM000676.1,NC_000014.8
16 | chr15 102531392 assembled chr15 primary 15,CM000677.1,NC_000015.9
17 | chr16 90354753 assembled chr16 primary 16,CM000678.1,NC_000016.9
18 | chr17 81195210 assembled chr17 primary 17,CM000679.1,NC_000017.10
19 | chr18 78077248 assembled chr18 primary 18,CM000680.1,NC_000018.9
20 | chr19 59128983 assembled chr19 primary 19,CM000681.1,NC_000019.9
21 | chr20 63025520 assembled chr20 primary 20,CM000682.1,NC_000020.10
22 | chr21 48129895 assembled chr21 primary 21,CM000683.1,NC_000021.8
23 | chr22 51304566 assembled chr22 primary 22,CM000684.1,NC_000022.10
24 | chrX 155270560 assembled chrX primary X,CM000685.1,NC_000023.10
25 | chrY 59373566 assembled chrY primary Y,CM000686.1,NC_000024.9
26 | chrM 16571 assembled chrM non-nuclear NC_001807.4
27 | chrMT 16569 assembled chrMT non-nuclear-revised MT,J01415.2,NC_012920.1
28 | chr1_gl000191_random 106433 unlocalized chr1 primary GL000191.1,HSCHR1_RANDOM_CTG5,NT_113878.1
29 | chr1_gl000192_random 547496 unlocalized chr1 primary GL000192.1,HSCHR1_RANDOM_CTG12,NT_167207.1
30 | chr4_gl000193_random 189789 unlocalized chr4 primary GL000193.1,HSCHR4_RANDOM_CTG2,NT_113885.1
31 | chr4_gl000194_random 191469 unlocalized chr4 primary GL000194.1,HSCHR4_RANDOM_CTG3,NT_113888.1
32 | chr7_gl000195_random 182896 unlocalized chr7 primary GL000195.1,HSCHR7_RANDOM_CTG1,NT_113901.1
33 | chr8_gl000196_random 38914 unlocalized chr8 primary GL000196.1,HSCHR8_RANDOM_CTG1,NT_113909.1
34 | chr8_gl000197_random 37175 unlocalized chr8 primary GL000197.1,HSCHR8_RANDOM_CTG4,NT_113907.1
35 | chr9_gl000198_random 90085 unlocalized chr9 primary GL000198.1,HSCHR9_RANDOM_CTG1,NT_113914.1
36 | chr9_gl000199_random 169874 unlocalized chr9 primary GL000199.1,HSCHR9_RANDOM_CTG2,NT_113916.2
37 | chr9_gl000200_random 187035 unlocalized chr9 primary GL000200.1,HSCHR9_RANDOM_CTG4,NT_113915.1
38 | chr9_gl000201_random 36148 unlocalized chr9 primary GL000201.1,HSCHR9_RANDOM_CTG5,NT_113911.1
39 | chr11_gl000202_random 40103 unlocalized chr11 primary GL000202.1,HSCHR11_RANDOM_CTG2,NT_113921.2
40 | chr17_gl000203_random 37498 unlocalized chr17 primary GL000203.1,HSCHR17_RANDOM_CTG1,NT_113941.1
41 | chr17_gl000204_random 81310 unlocalized chr17 primary GL000204.1,HSCHR17_RANDOM_CTG2,NT_113943.1
42 | chr17_gl000205_random 174588 unlocalized chr17 primary GL000205.1,HSCHR17_RANDOM_CTG3,NT_113930.1
43 | chr17_gl000206_random 41001 unlocalized chr17 primary GL000206.1,HSCHR17_RANDOM_CTG4,NT_113945.1
44 | chr18_gl000207_random 4262 unlocalized chr18 primary GL000207.1,HSCHR18_RANDOM_CTG1,NT_113947.1
45 | chr19_gl000208_random 92689 unlocalized chr19 primary GL000208.1,HSCHR19_RANDOM_CTG1,NT_113948.1
46 | chr19_gl000209_random 159169 unlocalized chr19 primary GL000209.1,HSCHR19_RANDOM_CTG2,NT_113949.1
47 | chr21_gl000210_random 27682 unlocalized chr21 primary GL000210.1,HSCHR21_RANDOM_CTG9,NT_113950.2
48 | chrUn_gl000211 166566 unplaced primary GL000211.1,HSCHRUN_RANDOM_CTG1,NT_113961.1
49 | chrUn_gl000212 186858 unplaced primary GL000212.1,HSCHRUN_RANDOM_CTG2,NT_113923.1
50 | chrUn_gl000213 164239 unplaced primary GL000213.1,HSCHRUN_RANDOM_CTG3,NT_167208.1
51 | chrUn_gl000214 137718 unplaced primary GL000214.1,HSCHRUN_RANDOM_CTG4,NT_167209.1
52 | chrUn_gl000215 172545 unplaced primary GL000215.1,HSCHRUN_RANDOM_CTG5,NT_167210.1
53 | chrUn_gl000216 172294 unplaced primary GL000216.1,HSCHRUN_RANDOM_CTG6,NT_167211.1
54 | chrUn_gl000217 172149 unplaced primary GL000217.1,HSCHRUN_RANDOM_CTG7,NT_167212.1
55 | chrUn_gl000218 161147 unplaced primary GL000218.1,HSCHRUN_RANDOM_CTG9,NT_113889.1
56 | chrUn_gl000219 179198 unplaced primary GL000219.1,HSCHRUN_RANDOM_CTG10,NT_167213.1
57 | chrUn_gl000220 161802 unplaced primary GL000220.1,HSCHRUN_RANDOM_CTG11,NT_167214.1
58 | chrUn_gl000221 155397 unplaced primary GL000221.1,HSCHRUN_RANDOM_CTG13,NT_167215.1
59 | chrUn_gl000222 186861 unplaced primary GL000222.1,HSCHRUN_RANDOM_CTG14,NT_167216.1
60 | chrUn_gl000223 180455 unplaced primary GL000223.1,HSCHRUN_RANDOM_CTG15,NT_167217.1
61 | chrUn_gl000224 179693 unplaced primary GL000224.1,HSCHRUN_RANDOM_CTG16,NT_167218.1
62 | chrUn_gl000225 211173 unplaced primary GL000225.1,HSCHRUN_RANDOM_CTG17,NT_167219.1
63 | chrUn_gl000226 15008 unplaced primary GL000226.1,HSCHRUN_RANDOM_CTG19,NT_167220.1
64 | chrUn_gl000227 128374 unplaced primary GL000227.1,HSCHRUN_RANDOM_CTG20,NT_167221.1
65 | chrUn_gl000228 129120 unplaced primary GL000228.1,HSCHRUN_RANDOM_CTG21,NT_167222.1
66 | chrUn_gl000229 19913 unplaced primary GL000229.1,HSCHRUN_RANDOM_CTG22,NT_167223.1
67 | chrUn_gl000230 43691 unplaced primary GL000230.1,HSCHRUN_RANDOM_CTG23,NT_167224.1
68 | chrUn_gl000231 27386 unplaced primary GL000231.1,HSCHRUN_RANDOM_CTG24,NT_167225.1
69 | chrUn_gl000232 40652 unplaced primary GL000232.1,HSCHRUN_RANDOM_CTG25,NT_167226.1
70 | chrUn_gl000233 45941 unplaced primary GL000233.1,HSCHRUN_RANDOM_CTG26,NT_167227.1
71 | chrUn_gl000234 40531 unplaced primary GL000234.1,HSCHRUN_RANDOM_CTG27,NT_167228.1
72 | chrUn_gl000235 34474 unplaced primary GL000235.1,HSCHRUN_RANDOM_CTG28,NT_167229.1
73 | chrUn_gl000236 41934 unplaced primary GL000236.1,HSCHRUN_RANDOM_CTG29,NT_167230.1
74 | chrUn_gl000237 45867 unplaced primary GL000237.1,HSCHRUN_RANDOM_CTG30,NT_167231.1
75 | chrUn_gl000238 39939 unplaced primary GL000238.1,HSCHRUN_RANDOM_CTG31,NT_167232.1
76 | chrUn_gl000239 33824 unplaced primary GL000239.1,HSCHRUN_RANDOM_CTG32,NT_167233.1
77 | chrUn_gl000240 41933 unplaced primary GL000240.1,HSCHRUN_RANDOM_CTG33,NT_167234.1
78 | chrUn_gl000241 42152 unplaced primary GL000241.1,HSCHRUN_RANDOM_CTG34,NT_167235.1
79 | chrUn_gl000242 43523 unplaced primary GL000242.1,HSCHRUN_RANDOM_CTG35,NT_167236.1
80 | chrUn_gl000243 43341 unplaced primary GL000243.1,HSCHRUN_RANDOM_CTG36,NT_167237.1
81 | chrUn_gl000244 39929 unplaced primary GL000244.1,HSCHRUN_RANDOM_CTG37,NT_167238.1
82 | chrUn_gl000245 36651 unplaced primary GL000245.1,HSCHRUN_RANDOM_CTG38,NT_167239.1
83 | chrUn_gl000246 38154 unplaced primary GL000246.1,HSCHRUN_RANDOM_CTG39,NT_167240.1
84 | chrUn_gl000247 36422 unplaced primary GL000247.1,HSCHRUN_RANDOM_CTG40,NT_167241.1
85 | chrUn_gl000248 39786 unplaced primary GL000248.1,HSCHRUN_RANDOM_CTG41,NT_167242.1
86 | chrUn_gl000249 38502 unplaced primary GL000249.1,HSCHRUN_RANDOM_CTG42,NT_167243.1
87 |
--------------------------------------------------------------------------------
/bioframe/io/data/hs1.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chr1 248387328 assembled chr1 primary 1,CP068277.2,NC_060925.1
3 | chr2 242696752 assembled chr2 primary 2,CP068276.2,NC_060926.1
4 | chr3 201105948 assembled chr3 primary 3,CP068275.2,NC_060927.1
5 | chr4 193574945 assembled chr4 primary 4,CP068274.2,NC_060928.1
6 | chr5 182045439 assembled chr5 primary 5,CP068273.2,NC_060929.1
7 | chr6 172126628 assembled chr6 primary 6,CP068272.2,NC_060930.1
8 | chr7 160567428 assembled chr7 primary 7,CP068271.2,NC_060931.1
9 | chr8 146259331 assembled chr8 primary 8,CP068270.2,NC_060932.1
10 | chr9 150617247 assembled chr9 primary 9,CP068269.2,NC_060933.1
11 | chr10 134758134 assembled chr10 primary 10,CP068268.2,NC_060934.1
12 | chr11 135127769 assembled chr11 primary 11,CP068267.2,NC_060935.1
13 | chr12 133324548 assembled chr12 primary 12,CP068266.2,NC_060936.1
14 | chr13 113566686 assembled chr13 primary 13,CP068265.2,NC_060937.1
15 | chr14 101161492 assembled chr14 primary 14,CP068264.2,NC_060938.1
16 | chr15 99753195 assembled chr15 primary 15,CP068263.2,NC_060939.1
17 | chr16 96330374 assembled chr16 primary 16,CP068262.2,NC_060940.1
18 | chr17 84276897 assembled chr17 primary 17,CP068261.2,NC_060941.1
19 | chr18 80542538 assembled chr18 primary 18,CP068260.2,NC_060942.1
20 | chr19 61707364 assembled chr19 primary 19,CP068259.2,NC_060943.1
21 | chr20 66210255 assembled chr20 primary 20,CP068258.2,NC_060944.1
22 | chr21 45090682 assembled chr21 primary 21,CP068257.2,NC_060945.1
23 | chr22 51324926 assembled chr22 primary 22,CP068256.2,NC_060946.1
24 | chrX 154259566 assembled chrX primary X,CP068255.2,NC_060947.1
25 | chrY 62460029 assembled chrY primary Y,CP086569.2,NC_060948.1
26 | chrM 16569 assembled chrM non-nuclear MT,CP068254.1
27 |
--------------------------------------------------------------------------------
/bioframe/io/data/mm10.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chr1 195471971 assembled chr1 primary 1,CM000994.2,NC_000067.6
3 | chr2 182113224 assembled chr2 primary 2,CM000995.2,NC_000068.7
4 | chr3 160039680 assembled chr3 primary 3,CM000996.2,NC_000069.6
5 | chr4 156508116 assembled chr4 primary 4,CM000997.2,NC_000070.6
6 | chr5 151834684 assembled chr5 primary 5,CM000998.2,NC_000071.6
7 | chr6 149736546 assembled chr6 primary 6,CM000999.2,NC_000072.6
8 | chr7 145441459 assembled chr7 primary 7,CM001000.2,NC_000073.6
9 | chr8 129401213 assembled chr8 primary 8,CM001001.2,NC_000074.6
10 | chr9 124595110 assembled chr9 primary 9,CM001002.2,NC_000075.6
11 | chr10 130694993 assembled chr10 primary 10,CM001003.2,NC_000076.6
12 | chr11 122082543 assembled chr11 primary 11,CM001004.2,NC_000077.6
13 | chr12 120129022 assembled chr12 primary 12,CM001005.2,NC_000078.6
14 | chr13 120421639 assembled chr13 primary 13,CM001006.2,NC_000079.6
15 | chr14 124902244 assembled chr14 primary 14,CM001007.2,NC_000080.6
16 | chr15 104043685 assembled chr15 primary 15,CM001008.2,NC_000081.6
17 | chr16 98207768 assembled chr16 primary 16,CM001009.2,NC_000082.6
18 | chr17 94987271 assembled chr17 primary 17,CM001010.2,NC_000083.6
19 | chr18 90702639 assembled chr18 primary 18,CM001011.2,NC_000084.6
20 | chr19 61431566 assembled chr19 primary 19,CM001012.2,NC_000085.6
21 | chrX 171031299 assembled chrX primary X,CM001013.2,NC_000086.7
22 | chrY 91744698 assembled chrY primary Y,CM001014.2,NC_000087.7
23 | chrM 16299 assembled chrM non-nuclear MT,AY172335.1,NC_005089.1
24 | chr1_GL456210_random 169725 unlocalized chr1 primary GL456210.1,MMCHR1_RANDOM_CTG1,NT_166280.1
25 | chr1_GL456211_random 241735 unlocalized chr1 primary GL456211.1,MMCHR1_RANDOM_CTG2,NT_166281.1
26 | chr1_GL456212_random 153618 unlocalized chr1 primary GL456212.1,MMCHR1_RANDOM_CTG3,NT_166282.1
27 | chr1_GL456213_random 39340 unlocalized chr1 primary GL456213.1,MMCHR1_RANDOM_CTG4,NT_166283.1
28 | chr1_GL456221_random 206961 unlocalized chr1 primary GL456221.1,MMCHR1_RANDOM_CTG5,NT_162750.1
29 | chr4_GL456216_random 66673 unlocalized chr4 primary GL456216.1,MMCHR4UN_CTG1,NT_166291.1
30 | chr4_GL456350_random 227966 unlocalized chr4 primary GL456350.1,MMCHR4UN_CTG3,NT_166434.1
31 | chr4_JH584292_random 14945 unlocalized chr4 primary JH584292.1,MMCHR4UN_CTG2,NT_187052.1
32 | chr4_JH584293_random 207968 unlocalized chr4 primary JH584293.1,MMCHR4UN_CTG4,NT_187053.1
33 | chr4_JH584294_random 191905 unlocalized chr4 primary JH584294.1,MMCHR4UN_CTG5,NT_187054.1
34 | chr4_JH584295_random 1976 unlocalized chr4 primary JH584295.1,MMCHR4UN_CTG6,NT_187055.1
35 | chr5_GL456354_random 195993 unlocalized chr5 primary GL456354.1,MMCHR5_RANDOM_CTG4,NT_166438.1
36 | chr5_JH584296_random 199368 unlocalized chr5 primary JH584296.1,MMCHR5_RANDOM_CTG1,NT_187056.1
37 | chr5_JH584297_random 205776 unlocalized chr5 primary JH584297.1,MMCHR5_RANDOM_CTG2,NT_187057.1
38 | chr5_JH584298_random 184189 unlocalized chr5 primary JH584298.1,MMCHR5_RANDOM_CTG3,NT_187058.1
39 | chr5_JH584299_random 953012 unlocalized chr5 primary JH584299.1,MMCHR5_RANDOM_CTG5,NT_187059.1
40 | chr7_GL456219_random 175968 unlocalized chr7 primary GL456219.1,MMCHR7_RANDOM_CTG1,NT_166307.1
41 | chrX_GL456233_random 336933 unlocalized chrX primary GL456233.1,MMCHRX_RANDOM_CTG2,NT_165789.2
42 | chrY_JH584300_random 182347 unlocalized chrY primary JH584300.1,MMCHRY_CTGU1,NT_187060.1
43 | chrY_JH584301_random 259875 unlocalized chrY primary JH584301.1,MMCHRY_CTGU2,NT_187061.1
44 | chrY_JH584302_random 155838 unlocalized chrY primary JH584302.1,MMCHRY_CTGU3,NT_187062.1
45 | chrY_JH584303_random 158099 unlocalized chrY primary JH584303.1,MMCHRY_CTGU4,NT_187063.1
46 | chrUn_GL456239 40056 unplaced primary GL456239.1,MSCHRUN_CTG1,NT_166338.1
47 | chrUn_GL456359 22974 unplaced primary GL456359.1,MSCHRUN_CTG13,NT_166443.1
48 | chrUn_GL456360 31704 unplaced primary GL456360.1,MSCHRUN_CTG14,NT_166444.1
49 | chrUn_GL456366 47073 unplaced primary GL456366.1,MSCHRUN_CTG21,NT_166450.1
50 | chrUn_GL456367 42057 unplaced primary GL456367.1,MSCHRUN_CTG2,NT_166451.1
51 | chrUn_GL456368 20208 unplaced primary GL456368.1,MSCHRUN_CTG22,NT_166452.1
52 | chrUn_GL456370 26764 unplaced primary GL456370.1,MSCHRUN_CTG19,NT_166454.1
53 | chrUn_GL456372 28664 unplaced primary GL456372.1,MSCHRUN_CTG16,NT_166456.1
54 | chrUn_GL456378 31602 unplaced primary GL456378.1,MSCHRUN_CTG3,NT_166462.1
55 | chrUn_GL456379 72385 unplaced primary GL456379.1,MSCHRUN_CTG20,NT_166463.1
56 | chrUn_GL456381 25871 unplaced primary GL456381.1,MSCHRUN_CTG4,NT_166465.1
57 | chrUn_GL456382 23158 unplaced primary GL456382.1,MSCHRUN_CTG5,NT_166466.1
58 | chrUn_GL456383 38659 unplaced primary GL456383.1,MSCHRUN_CTG6,NT_166467.1
59 | chrUn_GL456385 35240 unplaced primary GL456385.1,MSCHRUN_CTG7,NT_166469.1
60 | chrUn_GL456387 24685 unplaced primary GL456387.1,MSCHRUN_CTG17,NT_166471.1
61 | chrUn_GL456389 28772 unplaced primary GL456389.1,MSCHRUN_CTG18,NT_166473.1
62 | chrUn_GL456390 24668 unplaced primary GL456390.1,MSCHRUN_CTG9,NT_166474.1
63 | chrUn_GL456392 23629 unplaced primary GL456392.1,MSCHRUN_CTG10,NT_166476.1
64 | chrUn_GL456393 55711 unplaced primary GL456393.1,MSCHRUN_CTG11,NT_166477.1
65 | chrUn_GL456394 24323 unplaced primary GL456394.1,MSCHRUN_CTG12,NT_166478.1
66 | chrUn_GL456396 21240 unplaced primary GL456396.1,MSCHRUN_CTG15,NT_166480.1
67 | chrUn_JH584304 114452 unplaced primary JH584304.1,MSCHRUN_CTG23,NT_187064.1
68 |
--------------------------------------------------------------------------------
/bioframe/io/data/mm39.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chr1 195154279 assembled chr1 primary 1,CM000994.3,NC_000067.7
3 | chr2 181755017 assembled chr2 primary 2,CM000995.3,NC_000068.8
4 | chr3 159745316 assembled chr3 primary 3,CM000996.3,NC_000069.7
5 | chr4 156860686 assembled chr4 primary 4,CM000997.3,NC_000070.7
6 | chr5 151758149 assembled chr5 primary 5,CM000998.3,NC_000071.7
7 | chr6 149588044 assembled chr6 primary 6,CM000999.3,NC_000072.7
8 | chr7 144995196 assembled chr7 primary 7,CM001000.3,NC_000073.7
9 | chr8 130127694 assembled chr8 primary 8,CM001001.3,NC_000074.7
10 | chr9 124359700 assembled chr9 primary 9,CM001002.3,NC_000075.7
11 | chr10 130530862 assembled chr10 primary 10,CM001003.3,NC_000076.7
12 | chr11 121973369 assembled chr11 primary 11,CM001004.3,NC_000077.7
13 | chr12 120092757 assembled chr12 primary 12,CM001005.3,NC_000078.7
14 | chr13 120883175 assembled chr13 primary 13,CM001006.3,NC_000079.7
15 | chr14 125139656 assembled chr14 primary 14,CM001007.3,NC_000080.7
16 | chr15 104073951 assembled chr15 primary 15,CM001008.3,NC_000081.7
17 | chr16 98008968 assembled chr16 primary 16,CM001009.3,NC_000082.7
18 | chr17 95294699 assembled chr17 primary 17,CM001010.3,NC_000083.7
19 | chr18 90720763 assembled chr18 primary 18,CM001011.3,NC_000084.7
20 | chr19 61420004 assembled chr19 primary 19,CM001012.3,NC_000085.7
21 | chrX 169476592 assembled chrX primary X,CM001013.3,NC_000086.8
22 | chrY 91455967 assembled chrY primary Y,CM001014.3,NC_000087.8
23 | chrM 16299 assembled chrM non-nuclear MT,AY172335.1,NC_005089.1
24 | chr1_GL456210v1_random 169725 unlocalized chr1 primary GL456210.1,MMCHR1_RANDOM_CTG1,NT_166280.1
25 | chr1_GL456211v1_random 241735 unlocalized chr1 primary GL456211.1,MMCHR1_RANDOM_CTG2,NT_166281.1
26 | chr1_GL456212v1_random 153618 unlocalized chr1 primary GL456212.1,MMCHR1_RANDOM_CTG3,NT_166282.1
27 | chr1_GL456221v1_random 206961 unlocalized chr1 primary GL456221.1,MMCHR1_RANDOM_CTG5,NT_162750.1
28 | chr1_GL456239v1_random 40056 unlocalized chr1 primary GL456239.1,MMCHR1_RANDOM_CTG7,NT_166338.1
29 | chr1_MU069434v1_random 8412 unlocalized chr1 primary MMCHR1_RANDOM_CTG6,MU069434.1,NW_023337853.1
30 | chr4_JH584295v1_random 1976 unlocalized chr4 primary JH584295.1,MMCHR4UN_CTG6,NT_187055.1
31 | chr5_GL456354v1_random 195993 unlocalized chr5 primary GL456354.1,MMCHR5_RANDOM_CTG4,NT_166438.1
32 | chr5_JH584296v1_random 199368 unlocalized chr5 primary JH584296.1,MMCHR5_RANDOM_CTG1,NT_187056.1
33 | chr5_JH584297v1_random 205776 unlocalized chr5 primary JH584297.1,MMCHR5_RANDOM_CTG2,NT_187057.1
34 | chr5_JH584298v1_random 184189 unlocalized chr5 primary JH584298.1,MMCHR5_RANDOM_CTG3,NT_187058.1
35 | chr5_JH584299v1_random 953012 unlocalized chr5 primary JH584299.1,MMCHR5_RANDOM_CTG5,NT_187059.1
36 | chr7_GL456219v1_random 175968 unlocalized chr7 primary GL456219.1,MMCHR7_RANDOM_CTG1,NT_166307.1
37 | chrX_GL456233v2_random 559103 unlocalized chrX primary GL456233.2,MMCHRX_RANDOM_CTG2,NT_165789.3
38 | chrY_JH584300v1_random 182347 unlocalized chrY primary JH584300.1,MMCHRY_CTGU1,NT_187060.1
39 | chrY_JH584301v1_random 259875 unlocalized chrY primary JH584301.1,MMCHRY_CTGU2,NT_187061.1
40 | chrY_JH584302v1_random 155838 unlocalized chrY primary JH584302.1,MMCHRY_CTGU3,NT_187062.1
41 | chrY_JH584303v1_random 158099 unlocalized chrY primary JH584303.1,MMCHRY_CTGU4,NT_187063.1
42 | chrUn_GL456359v1 22974 unplaced primary GL456359.1,MSCHRUN_CTG13,NT_166443.1
43 | chrUn_GL456360v1 31704 unplaced primary GL456360.1,MSCHRUN_CTG14,NT_166444.1
44 | chrUn_GL456366v1 47073 unplaced primary GL456366.1,MSCHRUN_CTG21,NT_166450.1
45 | chrUn_GL456367v1 42057 unplaced primary GL456367.1,MSCHRUN_CTG2,NT_166451.1
46 | chrUn_GL456368v1 20208 unplaced primary GL456368.1,MSCHRUN_CTG22,NT_166452.1
47 | chrUn_GL456370v1 26764 unplaced primary GL456370.1,MSCHRUN_CTG19,NT_166454.1
48 | chrUn_GL456372v1 28664 unplaced primary GL456372.1,MSCHRUN_CTG16,NT_166456.1
49 | chrUn_GL456378v1 31602 unplaced primary GL456378.1,MSCHRUN_CTG3,NT_166462.1
50 | chrUn_GL456379v1 72385 unplaced primary GL456379.1,MSCHRUN_CTG20,NT_166463.1
51 | chrUn_GL456381v1 25871 unplaced primary GL456381.1,MSCHRUN_CTG4,NT_166465.1
52 | chrUn_GL456382v1 23158 unplaced primary GL456382.1,MSCHRUN_CTG5,NT_166466.1
53 | chrUn_GL456383v1 38659 unplaced primary GL456383.1,MSCHRUN_CTG6,NT_166467.1
54 | chrUn_GL456385v1 35240 unplaced primary GL456385.1,MSCHRUN_CTG7,NT_166469.1
55 | chrUn_GL456387v1 24685 unplaced primary GL456387.1,MSCHRUN_CTG17,NT_166471.1
56 | chrUn_GL456389v1 28772 unplaced primary GL456389.1,MSCHRUN_CTG18,NT_166473.1
57 | chrUn_GL456390v1 24668 unplaced primary GL456390.1,MSCHRUN_CTG9,NT_166474.1
58 | chrUn_GL456392v1 23629 unplaced primary GL456392.1,MSCHRUN_CTG10,NT_166476.1
59 | chrUn_GL456394v1 24323 unplaced primary GL456394.1,MSCHRUN_CTG12,NT_166478.1
60 | chrUn_GL456396v1 21240 unplaced primary GL456396.1,MSCHRUN_CTG15,NT_166480.1
61 | chrUn_JH584304v1 114452 unplaced primary JH584304.1,MSCHRUN_CTG23,NT_187064.1
62 | chrUn_MU069435v1 31129 unplaced primary MU069435.1,MSCHRUN_CTG24,NW_023337853.1
63 |
--------------------------------------------------------------------------------
/bioframe/io/data/mm9.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chr1 197195432 assembled chr1 primary 1,CM000994.1,NC_000067.5
3 | chr2 181748087 assembled chr2 primary 2,CM000995.1,NC_000068.6
4 | chr3 159599783 assembled chr3 primary 3,CM000996.1,NC_000069.5
5 | chr4 155630120 assembled chr4 primary 4,CM000997.1,NC_000070.5
6 | chr5 152537259 assembled chr5 primary 5,CM000998.1,NC_000071.5
7 | chr6 149517037 assembled chr6 primary 6,CM000999.1,NC_000072.5
8 | chr7 152524553 assembled chr7 primary 7,CM001000.1,NC_000073.5
9 | chr8 131738871 assembled chr8 primary 8,CM001001.1,NC_000074.5
10 | chr9 124076172 assembled chr9 primary 9,CM001002.1,NC_000075.5
11 | chr10 129993255 assembled chr10 primary 10,CM001003.1,NC_000076.5
12 | chr11 121843856 assembled chr11 primary 11,CM001004.1,NC_000077.5
13 | chr12 121257530 assembled chr12 primary 12,CM001005.1,NC_000078.5
14 | chr13 120284312 assembled chr13 primary 13,CM001006.1,NC_000079.5
15 | chr14 125194864 assembled chr14 primary 14,CM001007.1,NC_000080.5
16 | chr15 103494974 assembled chr15 primary 15,CM001008.1,NC_000081.5
17 | chr16 98319150 assembled chr16 primary 16,CM001009.1,NC_000082.5
18 | chr17 95272651 assembled chr17 primary 17,CM001010.1,NC_000083.5
19 | chr18 90772031 assembled chr18 primary 18,CM001011.1,NC_000084.5
20 | chr19 61342430 assembled chr19 primary 19,CM001012.1,NC_000085.5
21 | chrX 166650296 assembled chrX primary X,CM001013.1,NC_000086.6
22 | chrY 15902555 assembled chrY primary Y,CM001014.1,NC_000087.6
23 | chrM 16299 assembled chrM non-nuclear MT,AY172335.1,NC_005089.1
24 | chr1_random 1231697 unlocalized chr1 primary
25 | chr3_random 41899 unlocalized chr3 primary
26 | chr4_random 160594 unlocalized chr4 primary
27 | chr5_random 357350 unlocalized chr5 primary
28 | chr7_random 362490 unlocalized chr7 primary
29 | chr8_random 849593 unlocalized chr8 primary
30 | chr9_random 449403 unlocalized chr9 primary
31 | chr13_random 400311 unlocalized chr13 primary
32 | chr16_random 3994 unlocalized chr16 primary
33 | chr17_random 628739 unlocalized chr17 primary
34 | chrX_random 1785075 unlocalized chrX primary
35 | chrY_random 58682461 unlocalized chrY primary
36 | chrUn_random 5900358 unplaced primary
37 |
--------------------------------------------------------------------------------
/bioframe/io/data/sacCer3.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | chrI 230218 assembled chrI primary I,BK006935.2,NC_001133.9
3 | chrII 813184 assembled chrII primary II,BK006936.2,NC_001134.8
4 | chrIII 316620 assembled chrIII primary III,BK006937.2,NC_001135.5
5 | chrIV 1531933 assembled chrIV primary IV,BK006938.2,NC_001136.10
6 | chrV 576874 assembled chrV primary V,BK006939.2,NC_001137.3
7 | chrVI 270161 assembled chrVI primary VI,BK006940.2,NC_001138.5
8 | chrVII 1090940 assembled chrVII primary VII,BK006941.2,NC_001139.9
9 | chrVIII 562643 assembled chrVIII primary VIII,BK006934.2,NC_001140.6
10 | chrIX 439888 assembled chrIX primary IX,BK006942.2,NC_001141.2
11 | chrX 745751 assembled chrX primary X,BK006943.2,NC_001142.9
12 | chrXI 666816 assembled chrXI primary XI,BK006944.2,NC_001143.9
13 | chrXII 1078177 assembled chrXII primary XII,BK006945.2,NC_001144.5
14 | chrXIII 924431 assembled chrXIII primary XIII,BK006946.2,NC_001145.3
15 | chrXIV 784333 assembled chrXIV primary XIV,BK006947.3,NC_001146.8
16 | chrXV 1091291 assembled chrXV primary XV,BK006948.2,NC_001147.6
17 | chrXVI 948066 assembled chrXVI primary XVI,BK006949.2,NC_001148.4
18 | chrM 85779 assembled chrM non-nuclear MT,Mito,AJ011856.1,NC_001224.1
19 |
--------------------------------------------------------------------------------
/bioframe/io/data/wuhCor1.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name length role molecule unit aliases
2 | NC_045512v2 29903 assembled NC_045512 primary NC_045512.2,MN908947.3
3 |
--------------------------------------------------------------------------------
/bioframe/io/resources.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | from functools import partial
3 | from typing import Union
4 | from urllib.parse import urljoin
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from .assembly import assembly_info
10 | from .fileops import read_chromsizes, read_table
11 | from .schemas import SCHEMAS
12 |
13 | __all__ = [
14 | "fetch_chromsizes",
15 | "fetch_centromeres",
16 | "UCSCClient",
17 | ]
18 |
19 |
def fetch_chromsizes(
    db: str,
    *,
    provider: str = "local",
    as_bed: bool = False,
    filter_chroms: bool = True,
    chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
    natsort: bool = True,
    **kwargs,
) -> Union[pd.Series, pd.DataFrame]:
    """
    Look up chromosome sizes for an assembly, locally or from UCSC.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        Where to get the chromsizes from: "local" (bundled assembly
        metadata) or "ucsc" (download via :class:`UCSCClient`).
    as_bed : bool, optional
        If True, return an interval DataFrame (chrom, start, end) rather
        than a Series of lengths.

    The options below are forwarded to the UCSC client and are ignored
    when provider="local".

    filter_chroms : bool, optional
        Keep only sequence names matching ``chrom_patterns``.
    chrom_patterns : sequence, optional
        Regular expressions selecting the desired sequence names.
    natsort : bool, optional
        Sort each group of captured names in natural order. Default is True.
    **kwargs :
        Passed to :func:`pandas.read_csv`

    Returns
    -------
    Series of integer bp lengths indexed by sequence name or BED3 DataFrame.

    Raises
    ------
    ValueError
        If ``provider`` is neither "local" nor "ucsc".

    Notes
    -----
    For more fine-grained control over the chromsizes from local storage,
    use :func:`bioframe.assembly_info`.

    Examples
    --------
    >>> fetch_chromsizes("hg38")
    name
    chr1     248956422
    chr2     242193529
    chr3     198295559
    ...     ...
    chrX     156040895
    chrY      57227415
    chrM         16569
    Name: length, dtype: int64

    >>> fetch_chromsizes("hg38", as_bed=True)
            chrom  start        end
    0        chr1      0  248956422
    1        chr2      0  242193529
    2        chr3      0  198295559
    ...     ...
    21       chrX      0  156040895
    22       chrY      0   57227415
    23       chrM      0      16569

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    if provider == "ucsc":
        # Remote lookup: all filtering/sorting options go to the client.
        client = UCSCClient(db)
        return client.fetch_chromsizes(
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )

    if provider != "local":
        raise ValueError(f"Unknown provider '{provider}'")

    # Local lookup: read from the bundled assembly description.
    info = assembly_info(db)
    if not as_bed:
        return info.chromsizes
    return info.viewframe[["chrom", "start", "end"]].copy()
107 |
108 |
109 | def _origins_from_cytoband(
110 | cyb: pd.DataFrame, band_col: str = "gieStain"
111 | ) -> pd.DataFrame:
112 | """
113 | Extract chromosomal origin positions separating chromosome arms from
114 | cytological band data. Takes the cytological origin, i.e. the boundary
115 | between the two bands labeled 'acen'.
116 |
117 | Parameters
118 | ----------
119 | cyb : pandas.DataFrame
120 | DataFrame with cytoband data.
121 |
122 | Returns
123 | -------
124 | pandas.DataFrame
125 | A dataframe with columns 'chrom', 'start', 'end', 'mid'.
126 | """
127 | cyb = cyb[cyb[band_col] == "acen"]
128 | grouped = cyb.groupby("chrom", sort=False)
129 | cens = []
130 | for chrom, group in grouped:
131 | if not len(group) == 2:
132 | raise ValueError(f"Expected 2 'acen' bands for {chrom}, found {len(group)}")
133 | acens = group.sort_values("start")
134 | cens.append(
135 | {
136 | "chrom": chrom,
137 | "start": acens.iloc[0]["start"],
138 | "end": acens.iloc[1]["end"],
139 | "mid": acens.iloc[0]["end"],
140 | }
141 | )
142 | return pd.DataFrame.from_records(cens)
143 |
144 |
145 | def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame:
146 | """
147 | Extract chromosomal origin positions from UCSC centromeres.txt table
148 | describing centromere model sequences. Takes the midpoint of all
149 | modeled centromere sequences.
150 |
151 | Parameters
152 | ----------
153 | cens : pandas.DataFrame
154 | DataFrame with centromeres.txt data.
155 |
156 | Returns
157 | -------
158 | pandas.DataFrame
159 | A dataframe with columns 'chrom', 'start', 'end', 'mid'.
160 | """
161 | cens = cens.groupby("chrom").agg({"start": np.min, "end": np.max}).reset_index()
162 | cens["mid"] = (cens["start"] + cens["end"]) // 2
163 | cens = (
164 | cens[["chrom", "start", "end", "mid"]]
165 | .sort_values("chrom")
166 | .reset_index(drop=True)
167 | )
168 | return cens
169 |
170 |
def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame:
    """
    Extract centromere locations for a given assembly 'db' from a variety
    of file formats in UCSC (cytoband, centromeres) depending on
    availability, returning a DataFrame.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of centromere data. Either "local" for local storage
        or "ucsc".

    Returns
    -------
    DataFrame with centromere 'chrom', 'start', 'end', 'mid'.

    Raises
    ------
    ValueError
        If ``provider`` is unknown, or if no centromere source is available
        from the chosen provider.

    Notes
    -----
    When provider="local", centromeres are derived from cytoband tables
    in local storage.

    When provider="ucsc", the fallback priority goes as follows:
    - UCSC cytoBand
    - UCSC cytoBandIdeo
    - UCSC centromeres.txt

    Note that UCSC "gap" files no longer provide centromere information.

    Currently only works for human assemblies.

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    # Import the exception explicitly: a bare ``import urllib`` does not by
    # itself load the ``urllib.error`` submodule.
    from urllib.error import HTTPError

    if provider == "local":
        assembly = assembly_info(db)
        cyb = assembly.cytobands
        if cyb is None:
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )
        # Local cytoband tables name the stain column "stain".
        return _origins_from_cytoband(cyb, band_col="stain")

    elif provider == "ucsc":
        client = UCSCClient(db)
        # (schema, fetcher) pairs in fallback order.
        fetchers = [
            ("cytoband", client.fetch_cytoband),
            ("cytoband", partial(client.fetch_cytoband, ideo=True)),
            ("centromeres", client.fetch_centromeres),
        ]

        # Use the first table that downloads; an HTTP error means "try the
        # next source". `schema` keeps the value of the winning entry.
        for schema, fetcher in fetchers:  # noqa: B007
            try:
                df = fetcher()
                break
            except HTTPError:
                continue
        else:
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )

        if schema == "centromeres":
            return _origins_from_ucsccentromeres(df)
        else:
            return _origins_from_cytoband(df)

    else:
        raise ValueError(f"Unknown provider '{provider}'")
243 |
244 |
class UCSCClient:
    """
    Thin client that builds download URLs for one assembly on the UCSC
    download server and reads the tables with the package's readers.

    Parameters
    ----------
    db : str
        Assembly name as used in the ``goldenPath/<db>/`` URL component.
    """

    BASE_URL = "https://hgdownload.soe.ucsc.edu/"

    def __init__(self, db: str):
        self._db = db
        self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/")

    def fetch_chromsizes(
        self,
        filter_chroms: bool = True,
        chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
        natsort: bool = True,
        as_bed: bool = False,
        **kwargs,
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        Read ``bigZips/<db>.chrom.sizes`` via ``read_chromsizes``; all
        arguments are forwarded unchanged.
        """
        url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes")
        return read_chromsizes(
            url,
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )

    def fetch_centromeres(self, **kwargs) -> pd.DataFrame:
        """Read ``database/centromeres.txt.gz`` with the "centromeres" schema."""
        url = urljoin(self._db_url, "database/centromeres.txt.gz")
        return read_table(url, schema="centromeres", **kwargs)

    def fetch_gaps(self, **kwargs):
        """
        Read ``database/gap.txt.gz`` with the "gap" schema.

        By default only the interval and gap-description columns are kept;
        pass ``usecols`` to override the selection.
        """
        url = urljoin(self._db_url, "database/gap.txt.gz")
        # setdefault (rather than a fixed keyword) lets callers supply their
        # own ``usecols`` without a duplicate-keyword TypeError.
        kwargs.setdefault(
            "usecols", ["chrom", "start", "end", "length", "type", "bridge"]
        )
        return read_table(url, schema="gap", **kwargs)

    def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame:
        """
        Read the cytoband table with the "cytoband" schema.

        Parameters
        ----------
        ideo : bool, optional
            If True read ``cytoBandIdeo.txt.gz`` instead of
            ``cytoBand.txt.gz``.
        **kwargs :
            Forwarded to ``read_table``.
        """
        table = "cytoBandIdeo" if ideo else "cytoBand"
        url = urljoin(self._db_url, f"database/{table}.txt.gz")
        return read_table(url, schema="cytoband", **kwargs)

    def fetch_mrna(self, **kwargs) -> pd.DataFrame:
        """Read ``database/all_mrna.txt.gz`` using the "all_mrna" field list."""
        url = urljoin(self._db_url, "database/all_mrna.txt.gz")
        return read_table(
            url,
            schema=SCHEMAS["all_mrna"],
            **kwargs,
        )
297 |
--------------------------------------------------------------------------------
/bioframe/io/schemas.py:
--------------------------------------------------------------------------------
1 | """
2 | Field names for various genomic tabular files
3 |
4 | """
5 |
6 | __all__ = ["SCHEMAS"]
7 |
8 |
# UCSC File Formats
# https://genome.ucsc.edu/FAQ/FAQformat.html
#
# Many formats below extend a BED prefix; those lists are spelled as
# "prefix + extra fields" so the shared part is written only once. Every
# constant is still its own list object.
BED12_FIELDS = [
    "chrom", "start", "end", "name", "score", "strand",
    "thickStart", "thickEnd", "itemRgb",
    "blockCount", "blockSizes", "blockStarts",
]  # fmt: skip

# BED6: the first six BED12 columns.
BED_FIELDS = BED12_FIELDS[:6]

BEDGRAPH_FIELDS = [*BED12_FIELDS[:3], "value"]

# Two anchors (chrom/start/end suffixed 1 and 2) followed by pair-level fields.
BEDPE_FIELDS = [
    f"{field}{side}" for side in (1, 2) for field in ("chrom", "start", "end")
] + ["name", "score", "strand1", "strand2"]

GFF_FIELDS = [
    "chrom", "source", "feature", "start", "end",
    "score", "strand", "frame", "attributes",
]  # fmt: skip

PGSNP_FIELDS = [*BED12_FIELDS[:4], "alleleCount", "alleleFreq", "alleleScores"]

BEDRNAELEMENTS_FIELDS = [*BED_FIELDS, "level", "signif", "score2"]

# ENCODE peak formats: BED6 (or BED12 for gappedPeak) plus signal statistics.
NARROWPEAK_FIELDS = [*BED_FIELDS, "fc", "-log10p", "-log10q", "relSummit"]

BROADPEAK_FIELDS = [*BED_FIELDS, "fc", "-log10p", "-log10q"]

GAPPEDPEAK_FIELDS = [*BED12_FIELDS, "fc", "-log10p", "-log10q"]

JASPAR_FIELDS = [*BED12_FIELDS[:5], "pval", "strand"]

GAP_FIELDS = ["bin", "chrom", "start", "end", "ix", "n", "length", "type", "bridge"]

CENTROMERES_FIELDS = ["bin", "chrom", "start", "end", "name"]

UCSC_MRNA_FIELDS = [
    "bin", "matches", "misMatches", "repMatches", "nCount",
    "qNumInsert", "qBaseInsert", "tNumInsert", "tBaseInsert",
    "strand",
    "qName", "qSize", "qStart", "qEnd",
    "tName", "tSize", "tStart", "tEnd",
    "blockCount", "blockSizes", "qStarts", "tStarts",
]  # fmt: skip

CYTOBAND_FIELDS = [*BED12_FIELDS[:4], "gieStain"]


# GA4GH File Formats
# http://ga4gh.org/#/fileformats-team
BAM_FIELDS = [
    "QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR",
    "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL", "TAGs",
]  # fmt: skip

VCF_FIELDS = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]


# Registry mapping a schema name to its ordered list of column names.
# "bed"/"bed6" and "gff"/"gtf" intentionally share the same list object.
SCHEMAS = {
    # BED family
    "bed": BED_FIELDS,
    "bed3": BED_FIELDS[:3],
    "bed4": BED_FIELDS[:4],
    "bedGraph": BEDGRAPH_FIELDS,
    "bed5": BED_FIELDS[:5],
    "bed6": BED_FIELDS,
    "bed9": BED12_FIELDS[:9],
    "bed12": BED12_FIELDS,
    "bedpe": BEDPE_FIELDS,
    # annotation formats
    "gff": GFF_FIELDS,
    "gtf": GFF_FIELDS,
    # ENCODE element/peak formats
    "bedRnaElements": BEDRNAELEMENTS_FIELDS,
    "narrowPeak": NARROWPEAK_FIELDS,
    "broadPeak": BROADPEAK_FIELDS,
    "gappedPeak": GAPPEDPEAK_FIELDS,
    # UCSC database tables and others
    "centromeres": CENTROMERES_FIELDS,
    "cytoband": CYTOBAND_FIELDS,
    "sam": BAM_FIELDS,
    "vcf": VCF_FIELDS,
    "jaspar": JASPAR_FIELDS,
    "gap": GAP_FIELDS,
    "all_mrna": UCSC_MRNA_FIELDS,
    "pgsnp": PGSNP_FIELDS,
}
199 |
200 |
# Per-assembly tuples of regular expressions, each describing one group of
# sequence names (autosomes, sex chromosomes, mitochondrion, unlocalized,
# unplaced, alternate haplotypes, ...).
#
# Every pattern is anchored at both ends so that it matches whole names only.
# Alternations are wrapped in a non-capturing group because in ``^a|b$`` the
# anchors bind to one branch each.
CHROM_NAME_PATTERNS = {
    "hg19": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_.*_random$",
        r"^chrUn_.*$",
        r"^chr(?!Un).+_.*_hap\d+$",
    ),
    "hg38": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chrEBV$",
        r"^chr(?!Un).+_.*_random$",
        r"^chrUn_.*$",
        r"^chr(?!Un).+_.*_alt$",
    ),
    "mm9": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_random$",
    ),
    "mm10": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "galGal4": (
        r"^chr[0-9]+$",
        r"^chr[ZW]$",
        r"^chrM$",
        r"^(?:chrLGE64|chrLGE22C19W28_E50C23)$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "galGal5": (
        r"^chr[0-9]+$",
        r"^chr[ZW]$",
        r"^chrM$",
        r"^chrLGE64$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "dm3": (
        # Chromosome arms with an optional "Het" suffix, e.g. chr2L, chr2LHet,
        # chr4; written with explicit optional groups instead of character
        # classes so that only these spellings match.
        r"^chr[234][LR]?(?:Het)?$",
        r"^chr[XY](?:Het)?$",
        r"^chrM$",
        r"^chrU(?:extra)?$",
    ),
    "dm6": (
        r"^chr[234][LR]?$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "ce10": (r"^chr[IV]+$", r"^chrX$", r"^chrM$"),
    "ce11": (r"^chr[IV]+$", r"^chrX$", r"^chrM$"),
    "sacCer3": (r"^chr[IXV]+$", r"^chrM$"),
    "TAIR10": (r"^\d+$", r"^(?:MT|Pltd)$"),
}
267 |
268 |
# Name patterns for UCSC-style sequence names ("chr"-prefixed).
UCSC_AUTOSOMES = r"^chr[0-9]+[A-Za-z]*$"
UCSC_SEXCHROMS = r"^chr[XYZW][A-Za-z]*$"
UCSC_NONNUCLEAR = r"^chrM$"
UCSC_OTHER = r"^chrLGE.*$"
UCSC_UNLOCALIZED = r"^chr(?!Un).+.*_random$"
UCSC_UNPLACED = r"^chrUn_.*$|^chrU[A-Za-z]*$"
UCSC_ALTCHROMS = r"^chr(?!Un).+_.*_hap\d+$|^chr(?!Un).+_.*_alt$"
# Name patterns for NCBI-style sequence names (no prefix).
NCBI_AUTOSOMES = r"^[0-9]+$"
NCBI_SEXCHROMS = r"^[XYZW]$"
NCBI_NONNUCLEAR = r"^MT$|^Pltd$"
# Roman-numeral chromosome names in numeric order: I-III, IV, V-VIII, IX ...
ROMAN_LT10 = [r"^chrI+$", r"^chrIV$", r"^chrVI*$", r"^chrIX$"]
# ... then X, XI-XIII, XIV, XV-XVIII, XIX. The XI-XIII pattern uses ``I+``
# (not ``I*``) so that chrX is matched by exactly one pattern in the list.
ROMAN_LT20 = [
    *ROMAN_LT10,
    r"^chrX$",
    r"^chrXI+$",
    r"^chrXIV$",
    r"^chrXVI*$",
    r"^chrXIX$",
]
281 |
--------------------------------------------------------------------------------
/bioframe/sandbox/clients.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import glob
3 | import os
4 | import os.path as op
5 | import posixpath as pp
6 | from urllib.parse import urlencode, urljoin
7 |
8 | import pandas as pd
9 | import requests
10 |
11 |
class EncodeClient:
    """
    Small caching client for files hosted by the ENCODE portal.

    Downloaded files are stored under ``<cachedir>/<assembly>/`` and the
    portal-wide metadata table under ``<cachedir>/metadata.tsv``.

    Parameters
    ----------
    cachedir : str
        Root directory of the local cache. The assembly sub-directory is
        created if needed.
    assembly : str
        One of :attr:`KNOWN_ASSEMBLIES`.
    metadata : pandas.DataFrame, optional
        Pre-loaded metadata table to use as-is. ``fetch`` expects it to be
        indexed by file accession and to have a "File download URL" column.
        If omitted, the table is read from the cache and downloaded first
        when it is not there yet.
    """

    BASE_URL = "http://www.encodeproject.org/"

    # 2020-05-15 compatible with ENCODE Metadata at:
    METADATA_URL = "https://www.encodeproject.org/metadata/type=Experiment&status=released/metadata.tsv"

    KNOWN_ASSEMBLIES = (
        "GRCh38",
        "GRCh38-minimal",
        "ce10",
        "ce11",
        "dm3",
        "dm6",
        "hg19",
        "mm10",
        "mm10-minimal",
        "mm9",
    )

    def __init__(self, cachedir, assembly, metadata=None):
        if assembly not in self.KNOWN_ASSEMBLIES:
            raise ValueError(f"assembly must be in: {self.KNOWN_ASSEMBLIES}")

        self.cachedir = op.join(cachedir, assembly)
        os.makedirs(self.cachedir, exist_ok=True)

        if metadata is None:
            metadata_path = op.join(cachedir, "metadata.tsv")

            if not op.exists(metadata_path):
                print(
                    "getting metadata from ENCODE, please wait while "
                    "(~240Mb) file downloads"
                )
                # Stream into a side file and rename once complete, so an
                # interrupted download is never mistaken for a cached table.
                partial_path = metadata_path + ".part"
                with requests.get(self.METADATA_URL, stream=True) as r:
                    r.raise_for_status()
                    with open(partial_path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, metadata_path)

            self._meta = pd.read_table(metadata_path, low_memory=False)
            table_assemblies = sorted(
                self._meta["File assembly"].dropna().unique().tolist()
            )

            # An unknown assembly label signals a metadata layout this
            # client was not written for.
            if not set(table_assemblies).issubset(set(self.KNOWN_ASSEMBLIES)):
                raise ValueError(
                    "Table assemblies do not match known assemblies, "
                    "check ENCODE metadata version"
                )
            self._meta = self._meta[self._meta["File assembly"] == assembly].copy()
            self._meta = self._meta.set_index("File accession")

        else:
            self._meta = metadata

    def _batch_download(self, args):
        """GET ``batch_download/<urlencoded args>`` and return the response."""
        params = urlencode(args)
        url = pp.join("batch_download", params)
        url = urljoin(self.BASE_URL, url)
        r = requests.get(url)
        r.raise_for_status()
        return r

    def _metadata(self, args):
        """GET ``metadata/<urlencoded args>/metadata.tsv`` and return the response."""
        params = urlencode(args)
        url = pp.join("metadata", params, "metadata.tsv")
        url = urljoin(self.BASE_URL, url)
        r = requests.get(url)
        r.raise_for_status()
        return r

    @property
    def meta(self):
        """A copy of the metadata table (safe for the caller to modify)."""
        return self._meta.copy()

    def info(self, accession, width=850, height=450):
        """
        Return an IPython ``HTML`` object embedding the portal page of an
        experiment in an iframe of the given pixel size.
        """
        from IPython.display import HTML

        url = urljoin(self.BASE_URL, pp.join("experiments", accession))
        return HTML(
            f'<iframe width="{width}px" height="{height}px" src="{url}"></iframe>'
        )

    def fetch(self, accession):
        """
        Return the local path of a file, downloading it into the cache if it
        is not there yet.
        """
        # Read from the table directly; the `meta` property would copy the
        # whole table for a single cell lookup.
        url = self._meta.loc[accession, "File download URL"]
        # sig = self.meta.loc[accession, 'md5sum']
        filename = op.split(url)[1]
        path = op.join(self.cachedir, filename)
        if not op.exists(path):
            print(f'Downloading "{filename}"')
            r = requests.get(url)
            r.raise_for_status()
            with open(path, "wb") as f:
                f.write(r.content)
        return path

    def fetch_all(self, accessions):
        """Fetch several accessions; returns the list of local paths."""
        return list(map(self.fetch, accessions))
115 |
116 |
class FDNClient:
    """
    Small caching client for files hosted by the 4DN data portal.

    Downloaded files are stored under ``<cachedir>/<assembly>/``, which must
    already exist.

    Parameters
    ----------
    cachedir : str
        Root directory of the local cache.
    assembly : str
        Assembly name; also the name of the cache sub-directory.
    metadata : pandas.DataFrame, optional
        Pre-loaded metadata table to use as-is. ``fetch`` expects it to be
        indexed by file accession and to have a "File Download URL" column.
        If omitted, the last ``metadata*.tsv`` file (in sorted name order)
        found in ``cachedir`` is read.
    key_id, key_secret : str, optional
        Access key pair. When ``key_id`` is given, downloads send it as an
        HTTP Basic ``Authorization`` header.
    """

    BASE_URL = "https://data.4dnucleome.org/"

    def __init__(self, cachedir, assembly, metadata=None, key_id=None, key_secret=None):
        self.cachedir = op.join(cachedir, assembly)
        if not op.isdir(self.cachedir):
            # Report the directory that was actually checked.
            raise OSError(f"Directory doesn't exist: '{self.cachedir}'")
        if metadata is None:
            metadata_paths = sorted(glob.glob(op.join(cachedir, "metadata*.tsv")))
            if not metadata_paths:
                raise FileNotFoundError(
                    f"No 'metadata*.tsv' file found in '{cachedir}'"
                )
            metadata_path = metadata_paths[-1]
            self._meta = pd.read_table(metadata_path, low_memory=False, comment="#")
            if assembly == "GRCh38":
                self._meta = self._meta[self._meta["Organism"] == "human"].copy()
            self._meta = self._meta.set_index("File Accession")
        else:
            self._meta = metadata
        if key_id is not None:
            credential = (key_id + ":" + key_secret).encode("utf-8")
            self._token = base64.b64encode(credential)
        else:
            self._token = None

    @property
    def meta(self):
        """A copy of the metadata table (safe for the caller to modify)."""
        return self._meta.copy()

    def info(self, accession, width=850, height=450):
        """
        Return an IPython ``HTML`` object embedding the portal page of an
        experiment in an iframe of the given pixel size.
        """
        from IPython.display import HTML

        url = urljoin(self.BASE_URL, pp.join("experiments", accession))
        return HTML(
            f'<iframe width="{width}px" height="{height}px" src="{url}"></iframe>'
        )

    def fetch(self, accession):
        """
        Return the local path of a file, downloading it into the cache if it
        is not there yet.
        """
        # Read from the table directly; the `meta` property would copy the
        # whole table for a single cell lookup.
        url = self._meta.loc[accession, "File Download URL"]
        # sig = self.meta.loc[accession, 'md5sum']
        filename = op.split(url)[1]
        path = op.join(self.cachedir, filename)
        if not op.exists(path):
            print(f'Downloading "{filename}"')
            if self._token:
                headers = {"Authorization": b"Basic " + self._token}
            else:
                headers = None
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            with open(path, "wb") as f:
                f.write(r.content)
        return path

    def fetch_all(self, accessions):
        """Fetch several accessions; returns the list of local paths."""
        return list(map(self.fetch, accessions))
173 |
--------------------------------------------------------------------------------
/bioframe/sandbox/gtf_io.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def parse_gtf_attributes(attrs, kv_sep="=", item_sep=";", quotechar='"', **kwargs):
    """
    Expand a column of GFF/GTF attribute strings into a DataFrame with one
    column per attribute key.

    Parameters
    ----------
    attrs : pandas.Series of str
        Attribute strings, e.g. ``'ID=gene1;Name="abc"'``.
    kv_sep : str, optional
        Separator between a key and its value. Only the first occurrence in
        an item splits it, so values may themselves contain the separator.
    item_sep : str, optional
        Separator between items (handled by ``Series.str.split``).
    quotechar : str, optional
        Character stripped, together with spaces, from both ends of keys and
        values.
    **kwargs :
        Passed to :meth:`pandas.DataFrame.from_records`.

    Returns
    -------
    pandas.DataFrame
        One row per input element, in input order, with a fresh default
        index. Keys missing from a row are NaN. Items without a key/value
        separator are ignored; null input elements give an all-NaN row.
    """
    stripchars = quotechar + " "

    records = []
    for items in attrs.str.split(item_sep):
        record = {}
        # `.str.split` yields a non-list (NaN/None) for null elements.
        if isinstance(items, list):
            for item in items:
                # maxsplit=1 keeps values that contain the separator, e.g.
                # 'Note=a=b' or 'gene_name "Abc def"' with kv_sep=" ".
                pair = item.strip().split(kv_sep, 1)
                if len(pair) == 2:
                    key, value = (part.strip(stripchars) for part in pair)
                    record[key] = value
        records.append(record)

    # A plain list (rather than a Series) makes the result independent of
    # the index labels of `attrs`.
    return pd.DataFrame.from_records(records, **kwargs)
17 |
--------------------------------------------------------------------------------
/bioframe/sandbox/parquet_io.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def to_parquet(
    pieces,
    outpath,
    row_group_size=None,
    compression="snappy",
    use_dictionary=True,
    version=2.0,
    **kwargs,
):
    """
    Save an iterable of dataframe chunks to a single Apache Parquet file. For
    more info about Parquet, see https://arrow.apache.org/docs/python/parquet.html.

    Parameters
    ----------
    pieces : DataFrame or iterable of DataFrame
        Chunks to write. The schema of the file is taken from the first
        chunk. The DataFrame index is not written.
    outpath : str
        Path to output file
    row_group_size : int
        Number of rows per row group
    compression : {'snappy', 'gzip', 'brotli', 'none'}, optional
        Compression algorithm. Can be set on a per-column basis with a
        dictionary of column names to compression lib.
    use_dictionary : bool, optional
        Use dictionary encoding. Can be set on a per-column basis with a list
        of column names.
    version : optional
        Parquet format version, passed through to the writer.
        NOTE(review): the accepted values depend on the installed pyarrow;
        confirm that the default is still supported.
    **kwargs :
        Passed to ``pyarrow.parquet.ParquetWriter``.

    Raises
    ------
    ImportError
        If `pyarrow` is not installed.
    ValueError
        If ``pieces`` yields no chunks (there is no schema to write).

    See also
    --------
    pyarrow.parquet.write_table
    pyarrow.parquet.ParquetFile
    fastparquet

    """
    try:
        import pyarrow as pa
        import pyarrow.parquet
    except ImportError:
        raise ImportError("Saving to parquet requires the `pyarrow` package") from None

    if isinstance(pieces, pd.DataFrame):
        pieces = (pieces,)

    # The writer is opened lazily because it needs the first chunk's schema.
    # Starting from None keeps the cleanup below valid when no chunk arrives
    # or when converting the first chunk fails.
    writer = None
    try:
        for piece in pieces:
            table = pa.Table.from_pandas(piece, preserve_index=False)
            if writer is None:
                writer = pa.parquet.ParquetWriter(
                    outpath,
                    table.schema,
                    compression=compression,
                    use_dictionary=use_dictionary,
                    version=version,
                    **kwargs,
                )
            writer.write_table(table, row_group_size=row_group_size)
    finally:
        if writer is not None:
            writer.close()

    if writer is None:
        raise ValueError("`pieces` is empty: nothing to write to a Parquet file")
63 |
64 |
def read_parquet(filepath, columns=None, iterator=False, **kwargs):
    """
    Read a Parquet file into a DataFrame, or iterate over its row groups.

    Parameters
    ----------
    filepath : str, pathlib.Path, pyarrow.NativeFile, or file-like object
        Readable source. For passing bytes or buffer-like file containing a
        Parquet file, use pyarrow.BufferReader
    columns: list
        If not None, only these columns will be read from the row groups. A
        column name may be a prefix of a nested field, e.g. 'a' will select
        'a.b', 'a.c', and 'a.d.e'
    iterator : boolean, default False
        If True, return an iterator that yields one DataFrame per row group
        and also exposes the ParquetFile interface.
    use_threads : boolean, default True
        Perform multi-threaded column reads
    **kwargs :
        Remaining keyword arguments (e.g. ``memory_map``) are passed to
        :func:`pandas.read_parquet`, or to the ``ParquetFile`` constructor
        when ``iterator=True``.

    Returns
    -------
    DataFrame or ParquetFileIterator

    """
    threaded = kwargs.pop("use_threads", True)

    if iterator:
        try:
            from pyarrow.parquet import ParquetFile
        except ImportError:
            raise ImportError(
                "Iterating over Parquet data requires the `pyarrow` package."
            ) from None

        class ParquetFileIterator(ParquetFile):
            """ParquetFile that yields its row groups as DataFrames, in order."""

            def __iter__(self):
                return self

            def __next__(self):
                # The cursor is created on first use and lives on the instance.
                self._rgid = getattr(self, "_rgid", 0)
                if self._rgid >= self.num_row_groups:
                    raise StopIteration
                row_group = self.read_row_group(
                    self._rgid,
                    columns=columns,
                    use_threads=threaded,
                    use_pandas_metadata=True,
                )
                self._rgid += 1
                return row_group.to_pandas()

        return ParquetFileIterator(filepath, **kwargs)

    # Whole-file read.
    return pd.read_parquet(filepath, columns=columns, use_threads=threaded, **kwargs)
126 |
--------------------------------------------------------------------------------
/bioframe/vis.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import Union
3 |
4 | import matplotlib as mpl
5 | import matplotlib.pyplot as plt
6 | import numpy as np
7 | import pandas as pd
8 | from matplotlib.colors import to_rgb
9 |
10 | from .core import arrops
11 |
# Fill color applied by `_plot_interval` when the caller passes no facecolor.
DEFAULT_FACECOLOR = "skyblue"
# Outline color applied by `_plot_interval` when the caller passes no edgecolor.
DEFAULT_EDGECOLOR = "dimgray"

# Names exported by `from bioframe.vis import *`.
__all__ = ["plot_intervals", "to_ucsc_colorstring"]
16 |
17 |
def to_ucsc_colorstring(color: Union[str, tuple]) -> str:
    """
    Translate a matplotlib color specification into a UCSC itemRgb string.

    Parameters
    ----------
    color : str or tuple
        Anything matplotlib accepts as a color (e.g. 'red', 'tomato',
        '#ff0000', '#ff00', "#ff000055", (1, 0, 0), (1, 0, 0, 0.5)).

    Returns
    -------
    str
        A string "r,g,b" whose components are integers in the range 0-255,
        or "0" when the input is null or the string "none".

    Notes
    -----
    Only the red, green and blue channels are used; an alpha (opacity)
    channel in the input does not appear in the output.

    Each channel is scaled by 255 and truncated to an integer.

    Null values map to "0", the UCSC shorthand for "0,0,0" (black). BED9+
    files whose itemRgb column carries no information should use "0" on every
    data line.

    Examples
    --------
    >>> to_ucsc_colorstring("red")
    '255,0,0'
    >>> to_ucsc_colorstring("tomato")
    '255,99,71'
    >>> df["itemRgb"] = df["color"].apply(to_ucsc_colorstring)
    >>> df
    chrom  start  end  color  itemRgb
    chr1   0      10   red    255,0,0
    chr1   10     20   blue   0,0,255
    chr2   0      10   green  0,128,0
    chr2   10     20   None   0
    """
    is_blank = pd.isnull(color) or color == "none"
    if is_blank:
        return "0"

    channels = [str(int(channel * 255)) for channel in to_rgb(color)]
    return ",".join(channels)
60 |
61 |
def _plot_interval(
    start, end, level, facecolor=None, edgecolor=None, height=0.6, ax=None
):
    """
    Add one rectangle representing the interval [start, end) to an axes.

    The rectangle spans `start` to `end` horizontally and is centered
    vertically on `level` with the given `height`. Missing colors fall back
    to DEFAULT_FACECOLOR / DEFAULT_EDGECOLOR, and a missing `ax` falls back to
    the current matplotlib axes.
    """
    if facecolor is None:
        facecolor = DEFAULT_FACECOLOR
    if edgecolor is None:
        edgecolor = DEFAULT_EDGECOLOR
    if ax is None:
        ax = plt.gca()

    lower_left = (start, level - height / 2)
    width = end - start
    box = mpl.patches.Rectangle(
        lower_left,
        width,
        height,
        facecolor=facecolor,
        edgecolor=edgecolor,
    )
    ax.add_patch(box)
78 |
79 |
def plot_intervals_arr(
    starts,
    ends,
    levels=None,
    labels=None,
    colors=None,
    xlim=None,
    show_coords=False,
    figsize=(10, 2),
):
    """
    Draw a set of intervals as rectangles in a single plot.

    Parameters
    ----------
    starts, ends : np.ndarray
        Start and end coordinates of the intervals.

    levels : iterable or None
        The y-coordinate at which each interval is drawn. If None, the levels
        are obtained from ``arrops.stack_intervals(starts, ends)``.

    labels : str or iterable or None
        Text drawn at the center of each interval. A single string is used
        for every interval; an iterable is cycled over the intervals.

    colors : str or iterable or None.
        Face color of each interval. A single string is used for every
        interval; an iterable is cycled over the intervals.

    xlim : (float, float) or None
        The x-span of the plot. If None, the span runs from -0.5 to the
        largest end coordinate plus 0.5.

    show_coords : bool
        If True, keep the x-ticks; otherwise they are removed.

    figsize : (float, float) or None.
        Size of a newly created figure. If None, draw in the current figure.

    """
    starts, ends = np.asarray(starts), np.asarray(ends)

    if figsize is not None:
        plt.figure(figsize=figsize)

    levels = (
        arrops.stack_intervals(starts, ends) if levels is None else np.asarray(levels)
    )

    def _repeat(values):
        # A lone string or None applies to every interval; any other iterable
        # is cycled element by element.
        if values is None or isinstance(values, str):
            return itertools.cycle([values])
        return itertools.cycle(values)

    color_cycle = _repeat(colors)
    label_cycle = _repeat(labels)

    for lo, hi, row, fill, text in zip(
        starts, ends, levels, color_cycle, label_cycle
    ):
        _plot_interval(lo, hi, row, facecolor=fill)
        if text is None:
            continue
        plt.text(
            (lo + hi) / 2,
            row,
            text,
            horizontalalignment="center",
            verticalalignment="center",
        )

    plt.ylim(-0.5, np.max(levels) + 0.5)
    if xlim is None:
        x_lo, x_hi = -0.5, np.max(ends) + 0.5
    else:
        x_lo, x_hi = xlim[0], xlim[1]
    plt.xlim(x_lo, x_hi)

    axes = plt.gca()
    axes.set_aspect(1)
    axes.set_frame_on(False)

    plt.yticks([])
    if not show_coords:
        plt.xticks([])
165 |
166 |
def plot_intervals(
    df,
    levels=None,
    labels=None,
    colors=None,
    xlim=None,
    show_coords=False,
    figsize=(10, 2),
):
    """
    Plot a collection of intervals, one plot per chromosome.

    Parameters
    ----------
    df : pandas.DataFrame
        A collection of intervals with "chrom", "start" and "end" columns.

    levels : list, pandas.Series, numpy.ndarray or None
        The level of each interval, i.e. the y-coordinate at which the interval
        must be plotted, given in the row order of `df`. If None, it will be
        determined automatically.

    labels : str, list, pandas.Series, numpy.ndarray or None
        The label of each interval, given in the row order of `df`. A single
        string is applied to every interval.

    colors : str, list, pandas.Series, numpy.ndarray or None.
        The color of each interval, given in the row order of `df`. A single
        string is applied to every interval.

    xlim : (float, float) or None
        The x-span of the plot.

    show_coords : bool
        If True, plot x-ticks.

    figsize : (float, float) or None.
        The size of the figure. If None, plot within the current figure.

    Raises
    ------
    ValueError
        If `levels`, `labels` or `colors` has an unsupported type.

    """
    per_interval_types = (list, pd.Series, np.ndarray)

    # Validate and convert once, up front, instead of once per chromosome.
    if isinstance(levels, per_interval_types):
        levels = np.asarray(levels)
    elif levels is not None:
        raise ValueError(f"Unknown type of levels: {type(levels)}")

    if isinstance(labels, per_interval_types):
        labels = np.asarray(labels)
    elif not (labels is None or isinstance(labels, str)):
        raise ValueError(f"Unknown type of labels: {type(labels)}")

    if isinstance(colors, per_interval_types):
        colors = np.asarray(colors)
    elif not (colors is None or isinstance(colors, str)):
        raise ValueError(f"Unknown type of colors: {type(colors)}")

    def _for_rows(values, rows):
        # Per-interval arrays are subset to the chromosome's rows; None and
        # single strings are passed through unchanged.
        return values[rows] if isinstance(values, np.ndarray) else values

    # Reset the index so that the group labels are row positions, which is how
    # the per-interval arrays are addressed regardless of the index of `df`.
    chrom_gb = df.reset_index(drop=True).groupby("chrom", observed=True)
    for chrom, chrom_df in chrom_gb:
        rows = chrom_gb.groups[chrom].to_numpy()
        plot_intervals_arr(
            chrom_df.start,
            chrom_df.end,
            levels=_for_rows(levels, rows),
            labels=_for_rows(labels, rows),
            colors=_for_rows(colors, rows),
            xlim=xlim,
            show_coords=show_coords,
            figsize=figsize,
        )
        plt.title(chrom)
240 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/api-construction.rst:
--------------------------------------------------------------------------------
1 | Construction
2 | ============
3 |
4 | .. automodule:: bioframe.core.construction
5 | :autosummary:
6 | :members:
7 |
--------------------------------------------------------------------------------
/docs/api-extras.rst:
--------------------------------------------------------------------------------
1 | Additional tools
2 | ================
3 |
4 | .. automodule:: bioframe.extras
5 | :autosummary:
6 | :members:
7 |
--------------------------------------------------------------------------------
/docs/api-fileops.rst:
--------------------------------------------------------------------------------
1 | .. _API_fileops:
2 |
3 | File I/O
4 | ========
5 |
6 | .. automodule:: bioframe.io.fileops
7 | :autosummary:
8 | :members:
9 |
10 | .. autofunction:: bioframe.io.bed.to_bed
11 |
--------------------------------------------------------------------------------
/docs/api-intervalops.rst:
--------------------------------------------------------------------------------
1 | .. _API_ops:
2 |
3 | Interval operations
4 | ===================
5 |
6 | .. automodule:: bioframe.ops
7 | :autosummary:
8 | :members:
9 |
--------------------------------------------------------------------------------
/docs/api-lowlevel.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: ipynb,md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: 0.13
8 | jupytext_version: 1.11.3
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 |
15 | # Low-level API
16 |
17 | ```{eval-rst}
18 | .. toctree::
19 | :maxdepth: 2
20 | :caption: Contents:
21 |
22 | lowlevel/arrops
23 | lowlevel/specs
24 | lowlevel/stringops
25 |
26 | ```
27 |
28 | Low-level array-based operations are used to implement the genomic interval operations on dataframes.
29 |
30 | ```{code-cell} ipython3
31 | import itertools
32 |
33 | import numpy as np
34 | import matplotlib
35 | import matplotlib.pyplot as plt
36 | import pandas as pd
37 |
38 | import bioframe as bf
39 | import bioframe.vis
40 |
41 |
42 | from bioframe.core import arrops
43 | ```
44 |
45 | ```{code-cell} ipython3
46 | starts1, ends1 = np.array([
47 | [1,5],
48 | [3,8],
49 | [8,10],
50 | [12,14]
51 | ]).T
52 |
53 | starts2, ends2 = np.array([
54 | [4,8],
55 | [10,11],
56 | ]).T
57 | ```
58 |
59 | ```{code-cell} ipython3
60 | bf.vis.plot_intervals_arr(
61 | starts = starts1,
62 | ends = ends1,
63 | xlim = (-0.5,14.5),
64 | labels = np.arange(0,starts1.shape[0]),
65 | show_coords = True)
66 |
67 | bf.vis.plot_intervals_arr(
68 | starts = starts2,
69 | ends = ends2,
70 | colors = 'lightpink',
71 | xlim = (-0.5,14.5),
72 | labels = np.arange(0,starts2.shape[0]),
73 | show_coords = True)
74 | ```
75 |
76 | ```{code-cell} ipython3
77 | arrops.overlap_intervals(starts1, ends1, starts2, ends2)
78 | ```
79 |
80 | ```{code-cell} ipython3
81 | arrops.overlap_intervals_outer(starts1, ends1, starts2, ends2)
82 | ```
83 |
84 | ```{code-cell} ipython3
85 | arrops.merge_intervals(starts1, ends1, min_dist=0)
86 | ```
87 |
88 | ```{code-cell} ipython3
89 | arrops.merge_intervals(starts1, ends1, min_dist=None)
90 | ```
91 |
92 | ```{code-cell} ipython3
93 | arrops.merge_intervals(starts1, ends1, min_dist=2)
94 | ```
95 |
96 | ```{code-cell} ipython3
97 | arrops.complement_intervals(starts1, ends1)
98 | ```
99 |
--------------------------------------------------------------------------------
/docs/api-resources.rst:
--------------------------------------------------------------------------------
1 | Resources
2 | =========
3 |
4 | Genome assembly metadata
5 | ------------------------
6 |
7 | Bioframe provides a collection of genome assembly metadata for commonly used
8 | genomes. These are accessible through a convenient dataclass interface via :func:`bioframe.assembly_info`.
9 |
10 | The assemblies are listed in a manifest YAML file, and each assembly
11 | has a mandatory companion file called `seqinfo` that contains the sequence
12 | names, lengths, and other information. The records in the manifest file contain
13 | the following fields:
14 |
15 | - ``organism``: the organism name
16 | - ``provider``: the genome assembly provider (e.g., ucsc, ncbi)
17 | - ``provider_build``: the genome assembly build name (e.g., hg19, GRCh37)
18 | - ``release_year``: the year of the assembly release
19 | - ``seqinfo``: path to the seqinfo file
20 | - ``cytobands``: path to the cytoband file, if available
21 | - ``default_roles``: default molecular roles to include from the seqinfo file
22 | - ``default_units``: default assembly units to include from the seqinfo file
23 | - ``url``: URL to where the corresponding sequence files can be downloaded
24 |
25 | The `seqinfo` file is a TSV file with the following columns (with header):
26 |
27 | - ``name``: canonical sequence name
28 | - ``length``: sequence length
29 | - ``role``: role of the sequence or scaffold (e.g., "assembled", "unlocalized", "unplaced")
30 | - ``molecule``: name of the molecule that the sequence belongs to, if placed
31 | - ``unit``: assembly unit of the chromosome (e.g., "primary", "non-nuclear", "decoy")
32 | - ``aliases``: comma-separated list of aliases for the sequence name
33 |
34 | We currently do not include sequences with "alt" or "patch" roles in `seqinfo` files, but we
35 | do support the inclusion of additional decoy sequences (as used by so-called NGS *analysis
36 | sets* for human genome assemblies) by marking them as members of a "decoy" assembly unit.
37 |
38 | The `cytoband` file is an optional TSV file with the following columns (with header):
39 |
40 | - ``chrom``: chromosome name
41 | - ``start``: start position
42 | - ``end``: end position
43 | - ``band``: cytogenetic coordinate (name of the band)
44 | - ``stain``: Giemsa stain result
45 |
46 | The order of the sequences in the `seqinfo` file is treated as canonical.
47 | The ordering of the chromosomes in the `cytobands` file should match the order
48 | of the chromosomes in the `seqinfo` file.
49 |
50 | The manifest and companion files are stored in the ``bioframe/io/data`` directory.
51 | New assemblies can be requested by opening an issue on GitHub or by submitting a pull request.
52 |
53 | .. automodule:: bioframe.io.assembly
54 | :autosummary:
55 | :members:
56 |
57 | .. autoclass:: bioframe.io.assembly.GenomeAssembly
58 | :members:
59 | :undoc-members:
60 |
61 |
62 | Remote resources
63 | ----------------
64 | These functions now default to using the local data store, but can be used to obtain chromsizes or
65 | centromere positions from UCSC by setting ``provider="ucsc"``.
66 |
67 | .. automodule:: bioframe.io.resources
68 | :autosummary:
69 | :members:
70 |
--------------------------------------------------------------------------------
/docs/api-validation.rst:
--------------------------------------------------------------------------------
1 | Validation
2 | ==========
3 |
4 | .. automodule:: bioframe.core.checks
5 | :autosummary:
6 | :members:
7 |
--------------------------------------------------------------------------------
/docs/api-vis.rst:
--------------------------------------------------------------------------------
1 | Plotting
2 | ===============
3 |
4 | .. automodule:: bioframe.vis
5 | :autosummary:
6 | :members:
7 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 | # import sys
9 | from datetime import datetime
10 | from importlib.metadata import metadata
11 |
12 | # autodoc_mock_imports = ["numpy", "pandas", "matplotlib", "requests"]
13 |
14 |
15 | # -- Project information -----------------------------------------------------
16 | # NOTE: If you installed your project in editable mode, this might be stale.
17 | # If this is the case, reinstall it to refresh the metadata
info = metadata("bioframe")
project_name = info["Name"]
author = "Open2C"
copyright = f"{datetime.now():%Y}, {author}."
version = info["Version"]
# Each Project-URL entry has the form "<label>, <url>". Split on the first
# separator only, so a URL that itself contains ", " still yields a pair.
# `get_all` returns None when the package declares no Project-URL fields,
# hence the fallback to an empty list.
urls = dict(pu.split(", ", 1) for pu in (info.get_all("Project-URL") or []))

# The full version, including alpha/beta/rc tags
release = info["Version"]
27 |
28 | # -- General configuration ---------------------------------------------------
29 |
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 | # "sphinx.ext.autodoc",
35 | # 'sphinx.ext.doctest',
36 | # 'sphinx.ext.todo',
37 | # 'sphinx.ext.coverage',
38 | # 'sphinx.ext.mathjax',
39 | # 'sphinx.ext.ifconfig',
40 | "autodocsumm",
41 | "sphinx.ext.viewcode",
42 | "sphinx.ext.autosummary",
43 | "sphinx.ext.napoleon", # 'numpydoc'
44 | "myst_nb",
45 | ]
46 | # Add any paths that contain templates here, relative to this directory.
47 | templates_path = ["_templates"]
48 |
49 | # List of patterns, relative to source directory, that match files and
50 | # directories to ignore when looking for source files.
51 | # This pattern also affects html_static_path and html_extra_path.
52 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"]
53 |
54 | # nbsphinx_custom_formats = {
55 | # '.md': ['jupytext.reads', {'fmt': 'MyST'}],
56 | # }
57 |
58 | # -- Options for HTML output -------------------------------------------------
59 |
60 | # The theme to use for HTML and HTML Help pages. See the documentation for
61 | # a list of builtin themes.
62 | #
63 | html_theme = "sphinx_rtd_theme"
64 |
65 | # Add any paths that contain custom static files (such as style sheets) here,
66 | # relative to this directory. They are copied after the builtin static files,
67 | # so a file named "default.css" will overwrite the builtin "default.css".
68 | html_static_path = ["_static"]
69 |
70 | master_doc = "index"
71 |
72 | autosummary_generate = True
73 |
74 | # Don't include fully qualified name prefixes in autodoc
75 | add_module_names = False
76 |
77 | # Cache MyST (.md or .ipynb) notebook outputs if unmodified
78 | jupyter_execute_notebooks = "cache"
79 | execution_excludepatterns = ["guide-performance.ipynb"]
80 |
--------------------------------------------------------------------------------
/docs/figs/._bioframe-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/._bioframe-logo.png
--------------------------------------------------------------------------------
/docs/figs/bioframe-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/bioframe-logo.png
--------------------------------------------------------------------------------
/docs/figs/bioframe_closest.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/bioframe_closest.pdf
--------------------------------------------------------------------------------
/docs/figs/closest0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest0.png
--------------------------------------------------------------------------------
/docs/figs/closest1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest1.png
--------------------------------------------------------------------------------
/docs/figs/closest2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest2.png
--------------------------------------------------------------------------------
/docs/figs/closest3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest3.png
--------------------------------------------------------------------------------
/docs/figs/df1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/df1.png
--------------------------------------------------------------------------------
/docs/figs/df2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/df2.png
--------------------------------------------------------------------------------
/docs/figs/df@.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/df@.png
--------------------------------------------------------------------------------
/docs/figs/merge_df1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/merge_df1.png
--------------------------------------------------------------------------------
/docs/figs/overlap_inner_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/overlap_inner_0.png
--------------------------------------------------------------------------------
/docs/figs/overlap_inner_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/overlap_inner_1.png
--------------------------------------------------------------------------------
/docs/guide-bedtools.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: 0.13
8 | jupytext_version: 1.11.3
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 |
15 | # Bioframe for bedtools users
16 |
17 |
18 | Bioframe is built around the analysis of genomic intervals as a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) in memory, rather than working with tab-delimited text files saved on disk.
19 |
20 | Bioframe supports reading a number of standard genomics text file formats via [`read_table`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.fileops.read_table), including BED files (see [schemas](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py)), which loads them as pandas DataFrames. A complete list of helper functions is [available here](API_fileops).
21 |
22 | Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support the genomic [interval operations in bioframe](API_ops). The names of these columns can also be customized via the `cols=` arguments in bioframe functions.
23 |
24 | For example, with gtf files, you do not need to turn them into bed files, you can directly read them into pandas (with e.g. [gtfparse](https://github.com/openvax/gtfparse/tree/master)). For gtfs, it is often convenient to rename the `'seqname'` column to `'chrom'`, the default column name used in bioframe.
25 |
26 | Finally, if needed, bioframe provides a convenience function to write dataframes to a standard BED file using [`to_bed`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.bed.to_bed).
27 |
28 |
29 | ## `bedtools intersect`
30 |
31 | ### Select unique entries from the first bed overlapping the second bed `-u`
32 |
33 | ```sh
34 | bedtools intersect -u -a A.bed -b B.bed > out.bed
35 | ```
36 |
37 | ```py
38 | overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True)
39 | out = A.loc[overlap['index_1'].unique()]
40 | ```
41 |
42 | ### Report the number of hits in B `-c`
43 |
44 | Reports 0 for A entries that have no overlap with B.
45 |
46 | ```sh
47 | bedtools intersect -c -a A.bed -b B.bed > out.bed
48 | ```
49 |
50 | ```py
51 | out = bf.count_overlaps(A, B)
52 | ```
53 |
54 | ### Return entries from both beds for each overlap `-wa -wb`
55 |
56 | ```sh
57 | bedtools intersect -wa -wb -a A.bed -b B.bed > out.bed
58 | ```
59 |
60 | ```py
61 | out = bf.overlap(A, B, how='inner')
62 | ```
63 |
64 | **Note:** This is called an "inner join", and is analogous to an inner pandas join or merge. The default column suffixes in the output dataframe are `''` (nothing) for A's columns and `'_'` for B's columns.
65 |
66 | ### Include all entries from the first bed, even if no overlap `-loj`
67 |
68 | ```sh
69 | bedtools intersect -wa -wb -loj -a A.bed -b B.bed > out.bed
70 | ```
71 |
72 | ```py
73 | out = bf.overlap(A, B, how='left')
74 | ```
75 |
76 | **Note:** This is called a "left-outer join".
77 |
78 | ### Select entries from the first bed for each overlap `-wa`
79 |
80 | ```sh
81 | bedtools intersect -wa -a A.bed -b B.bed > out.bed
82 | ```
83 |
84 | ```py
85 | overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True)
86 | out = A.loc[overlap['index_1']]
87 |
88 | # Alternatively
89 | out = bf.overlap(A, B, how='inner')[A.columns]
90 | ```
91 |
92 | > **Note:** This gives one row per overlap and can contain duplicates. The output dataframe of the former method will use the same pandas index as the input dataframe `A`, while the latter result --- the join output --- will have an integer range index, like a pandas merge.
93 |
94 | ### Select entries from the second bed for each overlap `-wb`
95 |
96 | ```sh
97 | bedtools intersect -wb -a A.bed -b B.bed > out.bed
98 | ```
99 |
100 | ```py
101 | overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True)
102 | out = B.loc[overlap['index_2']]
103 |
104 | # Alternatively
105 | out = bf.overlap(A, B, how='inner', suffixes=('_', ''))[B.columns]
106 | ```
107 |
108 | > **Note:** This gives one row per overlap and can contain duplicates. The output dataframe of the former method will use the same pandas index as the input dataframe `B`, while the latter result --- the join output --- will have an integer range index, like a pandas merge.
109 |
110 |
111 | ### Intersect multiple beds against A
112 |
113 | ```sh
114 | bedtools intersect -wa -a A.bed -b B.bed C.bed D.bed > out.bed
115 | ```
116 |
117 | ```py
118 | others = pd.concat([B, C, D])
119 | overlap = bf.overlap(A, others, how='inner', suffixes=('_1','_2'), return_index=True)
120 | out = A.loc[overlap['index_1']]
121 | ```
122 |
123 | ### Return everything in A that doesn't overlap with B `-v`
124 |
125 | ```sh
126 | bedtools intersect -wa -a A.bed -b B.bed -v > out.bed
127 | ```
128 |
129 | ```py
130 | out = bf.setdiff(A, B)
131 | ```
132 |
133 | **Note:** We call this a set difference.
134 |
135 | ### Force strandedness `-s`
136 |
137 | For intersection
138 |
139 | ```sh
140 | bedtools intersect -wa -a A.bed -b B.bed -s > out.bed
141 | ```
142 |
143 | ```py
144 | overlap = bf.overlap(A, B, on=['strand'], suffixes=('_1','_2'), return_index=True, how='inner')
145 | out = A.loc[overlap['index_1']]
146 | ```
147 |
148 | For non-intersection `-v`
149 |
150 | ```sh
151 | bedtools intersect -wa -a A.bed -b B.bed -v -s > out.bed
152 | ```
153 |
154 | ```py
155 | out = bf.setdiff(A, B, on=['strand'])
156 | ```
157 |
158 | ### Minimum overlap as a fraction of A `-f`
159 |
160 | We want to keep rows of A that are covered at least 70% by elements from B
161 |
162 | ```sh
163 | bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed
164 | ```
165 |
166 | ```py
167 | cov = bf.coverage(A, B)
168 | out = A.loc[(cov['coverage'] / (cov['end'] - cov['start'])) >= 0.70]
169 |
170 | # Alternatively
171 | out = bf.coverage(A, B).query('coverage / (end - start) >= 0.7')[A.columns]
172 | ```
173 |
--------------------------------------------------------------------------------
/docs/guide-definitions.rst:
--------------------------------------------------------------------------------
1 | .. _Definitions:
2 |
3 | Definitions
4 | ===========
5 |
6 | Interval:
7 | - An *interval* is a tuple of integers (start, end) with start <= end.
8 | - Coordinates are assumed to be 0-based and intervals half-open (1-based ends) i.e. [start, end).
9 | - An interval has a *length* equal to (end - start).
10 | - A special case where start and end are the same, i.e. [X, X), is interpreted as a *point* (aka an *empty interval*, i.e. an edge between 1-bp bins). A point has zero length.
11 | - Negative coordinates are permissible for both ends of an interval.
12 |
13 | Properties of a pair of intervals:
14 | - Two intervals can either *overlap*, or not. The overlap length = max(0, min(end1, end2) - max(start1, start2)). Empty intervals can have overlap length = 0.
15 | - When two intervals overlap, the shorter of the two intervals is said to be *contained* in the longer one if the length of their overlap equals the length of the shorter interval. This property is often referred to as nestedness, but we use the term “contained” as it is less ambiguous when describing the relationship of sets of intervals to one interval.
16 | - If two intervals do not overlap, they have a *distance* = max(0, max(start1, start2) - min(end1, end2)).
17 | - If two intervals have overlap=0 and distance=0, they are said to be *abutting*.
18 |
19 | Scaffold:
20 | - A chromosome, contig or, more generally, a *scaffold* is an interval defined by a unique string and has a length>=0, with start=0 and end=length, implicitly defining an interval [0, length).
21 |
22 | Genome assembly:
23 | - The complete set of scaffolds associated with a genome is called an *assembly* (e.g. defined by the reference sequence from NCBI, etc.).
24 |
25 | Genomic interval:
26 | - A *genomic interval* is an interval with an associated scaffold, or chromosome, defined by a string, i.e. a triple (chrom, start, end).
27 | - Genomic intervals on different scaffolds never overlap and do not have a defined distance.
28 | - Genomic intervals can extend beyond their associated scaffold (e.g. with negative values or values greater than the scaffold length), as this can be useful in downstream applications. If they do, they are not contained by their associated scaffold.
29 | - A *base-pair* is a special case of a genomic interval with length=1, i.e. (chrom, start, start+1)
30 | - *strand* is an (optional) property of a genomic interval which specifies an interval’s orientation on its scaffold. Note start and end are still defined with respect to the scaffold’s reference orientation (positive strand), even if the interval lies on the negative strand. Intervals on different strands can either be allowed to overlap or not.
31 |
32 | View (i.e. a set of Genomic Regions):
33 | - A genomic *view* is an ordered set of non-overlapping genomic intervals each having a unique name defined by a string. Individual named intervals in a view are *regions*, defined by a quadruple, e.g. (chrom, start, end, name).
34 | - A view thus specifies a unified 1D coordinate system, i.e. a projection of multiple genomic regions onto a single axis.
35 | - We define views separately from the scaffolds that make up a genome assembly, as a set of more constrained and ordered genomic regions are often useful for downstream analysis and visualization.
36 | - An assembly is a special case of a view, where the individual regions correspond to the assembly’s entire scaffolds.
37 |
38 | Associating genomic intervals with views
39 | - Similarly to how genomic intervals are associated with a scaffold, they can also be associated with a region from a view with an additional string, making a quadruple (chrom, start, end, view_region). This string must be *cataloged* in the view, i.e. it must match the name of a region in the view. Typically the interval would be contained in its associated view region, or, at the minimum, have a greater overlap with that region than other view regions.
40 | - If each interval in a set is contained in its associated view region, the set is *contained* in the view.
41 | - A set of intervals *covers* a view if each region in the view is contained by the union of its associated intervals. Conversely, if a set does not cover all of the view's regions, the interval set will have *gaps* relative to that view (stretches of bases not covered by an interval).
42 |
43 | Properties of sets of genomic intervals:
44 | - A set of genomic intervals may have overlaps or not. If it does not, it is said to be *overlap-free*.
45 | - A set of genomic intervals is *tiling* if it: (i) covers the associated view, (ii) is contained in that view, and (iii) is overlap-free. Equivalently, a tiling set of intervals (a) has an initial interval that begins at the start of each region and (b) a final interval that terminates at the end of each region, and (c) every base pair is associated with a unique interval.
46 |
--------------------------------------------------------------------------------
/docs/guide-quickstart.rst:
--------------------------------------------------------------------------------
1 | Quickstart
2 | ==========
3 |
4 | Installation
5 | ------------
6 |
7 | ::
8 |
9 | $ pip install bioframe
10 |
11 | To install the latest development version of `bioframe` from
12 | github, first make a local clone of the github repository:
13 |
14 | .. code-block:: bash
15 |
16 | $ git clone https://github.com/open2c/bioframe
17 |
18 | Then, compile and install `bioframe` in
19 | `development mode <https://setuptools.pypa.io/en/latest/userguide/development_mode.html>`_. This installs the package without moving it to a system folder, and thus allows for testing changes to the python code on the fly.
20 |
21 | .. code-block:: bash
22 |
23 | $ cd bioframe
24 | $ pip install -e ./
25 |
--------------------------------------------------------------------------------
/docs/guide-recipes.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: 0.13
8 | jupytext_version: 1.11.3
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 |
15 | # How do I
16 |
17 | ## Obtain overlapping intervals with matching strandedness?
18 | Use overlap with the ``on`` argument:
19 | ```
20 | df = bf.overlap(df1, df2, on=['strand'])
21 | ```
22 |
23 | ## Obtain overlapping intervals with opposite strandedness?
24 | Overlap then filter pairs of opposite strandedness:
25 | ```
26 | df = bf.overlap(df1, df2)
27 | df = df.loc[df["strand"]!=df["strand_"]]
28 | ```
29 | ## Obtain intervals that exceed 50% coverage by another set of intervals?
30 | Coverage, then filter pairs by fractional coverage:
31 | ```
32 | df = bf.coverage(df1, df2)
33 | df = df[ ( df["coverage"] / (df["end"]-df["start"]) ) >=0.50]
34 | ```
35 |
36 | ## Shift all intervals on the positive strand by 10bp?
37 | Use pandas indexing:
38 | ```
39 | df.loc[df.strand=="+",["start", "end"]] += 10
40 | ```
41 |
42 | ## Obtain intervals overlapped by at least 2 intervals from another set?
43 | Count overlaps, then filter:
44 | ```
45 | df = bf.count_overlaps(df1, df2)
46 | df = df[ df["count"] >= 2]
47 | ```
48 |
49 | ## Find strand-specific downstream genomic features?
50 | Use closest after filtering by strand, and pass the `ignore_upstream=True` argument.
51 | ```
52 | bioframe.closest(df1.loc[df1['strand']=='+'], df2, ignore_upstream=True)
53 | ```
54 |
55 | For genes, the upstream/downstream direction might be defined by the direction of transcription.
56 | Use `direction_col='strand'` to set up the direction:
57 | ```
58 | bioframe.closest(df1, df2, ignore_upstream=True, direction_col='strand')
59 | ```
60 |
61 | ## Drop non-autosomes from a bedframe?
62 | Use pandas DataFrame.isin(values):
63 | ```
64 | df[ ~df.chrom.isin(['chrX','chrY'])]
65 | ```
66 |
--------------------------------------------------------------------------------
/docs/guide-specifications.rst:
--------------------------------------------------------------------------------
1 | .. _Specifications:
2 |
3 | Specifications
4 | ==============
5 |
6 | BedFrame (i.e. genomic intervals stored in a pandas dataframe):
7 | - In a BedFrame, three required columns specify the set of genomic intervals (default column names = (‘chrom’, ‘start’, ‘end’)).
8 | - Other reserved but not required column names: (‘strand’, ‘name’, ‘view_region’).
9 |
10 | - entries in column ‘name’ are expected to be unique
11 | - ‘view_region’ is expected to point to an associated region in a view with a matching name
12 | - ‘strand’ is expected to be encoded with strings (‘+’, ‘-’, ‘.’).
13 |
14 | - Additional columns are allowed: ‘zodiac_sign’, ‘soundcloud’, ‘twitter_name’, etc.
15 | - Repeated intervals are allowed.
16 | - The native pandas DataFrame index is not intended to be used as an immutable lookup table for genomic intervals in BedFrame. This is because many common genomic interval operations change the number of intervals stored in a BedFrame.
17 | - Two useful sorting schemes for BedFrames are:
18 |
19 | - scaffold-sorted: on (chrom, start, end), where chrom is sorted lexicographically.
20 | - view-sorted: on (view_region, start, end) where view_region is sorted by order in the view.
21 |
22 | - Null values are allowed, but only as pd.NA (using np.nan is discouraged as it results in unwanted type re-casting).
23 | - Note if no ‘view_region’ is assigned to a genomic interval, then ‘chrom’ implicitly defines an associated region
24 | - Note the BedFrame specification is a natural extension of the BED format ( https://samtools.github.io/hts-specs/BEDv1.pdf ) for pandas DataFrames.
25 |
26 | ViewFrames (a genomic view stored in a pandas dataframe)
27 | - BedFrame where:
28 |
29 | - intervals are non-overlapping
30 | - “name” column is mandatory and contains a set of unique strings.
31 |
32 | - Note that a ViewFrame can potentially be indexed by the name column to serve as a lookup table. This functionality is currently not implemented, because within the current Pandas implementation indexing by a column removes the column from the table.
33 | - Note that views can be defined by:
34 |
35 | - dictionary of string:ints (start=0 assumed) or string:tuples (start,end)
36 | - pandas series of chromsizes (start=0, name=chrom)
37 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. bioframe documentation master file, created by
2 | sphinx-quickstart on Sat Apr 11 11:44:26 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | bioframe
7 | ========
8 |
9 | `Bioframe <https://github.com/open2c/bioframe>`_ is a library to enable flexible and scalable operations on genomic interval dataframes in python. Building bioframe directly on top of `pandas <https://pandas.pydata.org/>`_ enables immediate access to a rich set of dataframe operations. Working in python enables rapid visualization and iteration of genomic analyses.
10 |
11 |
12 | .. toctree::
13 | :maxdepth: 1
14 | :caption: Guide
15 |
16 | guide-quickstart
17 | guide-intervalops.md
18 | guide-io.ipynb
19 | guide-performance.ipynb
20 | guide-recipes.md
21 | guide-definitions
22 | guide-specifications
23 | guide-bedtools
24 |
25 | .. toctree::
26 | :maxdepth: 1
27 | :caption: Tutorials
28 |
29 | tutorials/tutorial_assign_motifs_to_peaks.ipynb
30 | tutorials/tutorial_assign_peaks_to_genes.ipynb
31 |
32 | .. toctree::
33 | :maxdepth: 3
34 | :caption: API
35 |
36 | api-construction
37 | api-validation
38 | api-intervalops
39 | api-fileops
40 | api-resources
41 | api-extras
42 | api-vis
43 | api-lowlevel.md
44 |
45 |
46 | Indices and tables
47 | ==================
48 |
49 | * :ref:`genindex`
50 | * :ref:`modindex`
51 | * :ref:`search`
52 |
--------------------------------------------------------------------------------
/docs/lowlevel/arrops.rst:
--------------------------------------------------------------------------------
1 | Array operations
2 | =================
3 |
4 | Low level operations that are used to implement the genomic interval operations.
5 |
6 | .. automodule:: bioframe.core.arrops
7 | :autosummary:
8 | :members:
9 |
--------------------------------------------------------------------------------
/docs/lowlevel/specs.rst:
--------------------------------------------------------------------------------
1 | Specifications
2 | =================
3 |
4 | .. automodule:: bioframe.core.specs
5 | :autosummary:
6 | :members:
7 |
8 | **Unexposed functions:**
9 |
10 | .. automethod:: bioframe.core.specs._verify_column_dtypes
11 | .. automethod:: bioframe.core.specs._verify_columns
12 | .. automethod:: bioframe.core.specs._get_default_colnames
13 |
--------------------------------------------------------------------------------
/docs/lowlevel/stringops.rst:
--------------------------------------------------------------------------------
1 | String operations
2 | =================
3 |
4 | .. automodule:: bioframe.core.stringops
5 | :autosummary:
6 | :members:
7 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

REM Work from the directory that contains this script; undone by popd at :end.
pushd %~dp0

REM Command file for Sphinx documentation

REM Use the sphinx-build found on PATH unless SPHINXBUILD was set by the caller.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

REM No target given on the command line: show the Sphinx help instead.
if "%1" == "" goto help

REM Probe the executable with output discarded; errorlevel 9009 is treated
REM as "command not found" and reported with install instructions.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Forward the requested target (first argument) to Sphinx make-mode.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
36 |
--------------------------------------------------------------------------------
/docs/times100.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/times100.bw
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling", "hatch-vcs"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "bioframe"
7 | version = "0.8.0"
8 | description = "Operations and utilities for Genomic Interval Dataframes."
9 | license = {text = "MIT"}
10 | authors = [
11 | {name = "Open2C", email = "open.chromosome.collective@gmail.com"},
12 | ]
13 | keywords = [
14 | "pandas",
15 | "dataframe",
16 | "genomics",
17 | "epigenomics",
18 | "bioinformatics",
19 | "interval operations",
20 | "genomic ranges",
21 | "bedtools",
22 | "bedframe",
23 | "viewframe",
24 | "bed",
25 | ]
26 | classifiers = [
27 | "Development Status :: 4 - Beta",
28 | "Operating System :: OS Independent",
29 | "Programming Language :: Python",
30 | "Programming Language :: Python :: 3",
31 | "Programming Language :: Python :: 3.8",
32 | "Programming Language :: Python :: 3.9",
33 | "Programming Language :: Python :: 3.10",
34 | "Programming Language :: Python :: 3.11",
35 | "Programming Language :: Python :: 3.12",
36 | ]
37 | readme = "README.md"
38 | requires-python = ">=3.8"
39 | dependencies = [
40 | "matplotlib",
41 | "numpy>=1.10, <3",
42 | "pandas>=1.3",
43 | "pyyaml",
44 | "requests",
45 | "typing-extensions ; python_version<'3.9'",
46 | "importlib-metadata ; python_version<'3.8'",
47 | "importlib-resources ; python_version<'3.9'",
48 | ]
49 |
50 | [project.optional-dependencies]
51 | dev = [
52 | "biopython",
53 | "pre-commit",
54 | "pysam",
55 | "pybbi",
56 | "pytest",
57 | "ruff",
58 | ]
59 | test = [
60 | "pytest",
61 | ]
62 | docs = [
63 | "autodocsumm",
64 | "myst_nb",
65 | "jinja2",
66 | "Sphinx",
67 | "sphinx-autobuild",
68 | "sphinx_rtd_theme",
69 | ]
70 |
71 | [project.urls]
72 | homepage = "https://github.com/open2c/bioframe"
73 | documentation = "https://bioframe.readthedocs.io/en/latest"
74 | repository = "https://github.com/open2c/bioframe"
75 | changelog = "https://github.com/open2c/bioframe/blob/main/CHANGES.md"
76 |
[tool.ruff]
# Must match the lowest interpreter allowed by `requires-python` in [project]
# (>=3.8). The previous value, "py37", targeted a version the package does not
# support.
target-version = "py38"
exclude = [
    ".venv",
]
82 |
83 | [tool.ruff.lint]
84 | extend-select = [
85 | "B", # bugbear
86 | # "C", # mccabe complexity
87 | # "D", # pydocstyle
88 | "E", # style errors
89 | "F", # pyflakes
90 | "I", # isort
91 | "RUF", # ruff-specific rules
92 | "UP", # pyupgrade
93 | "W", # style warnings
94 | ]
95 |
96 | [tool.ruff.lint.isort]
97 | known-first-party = ["bioframe"]
98 |
99 | [tool.ruff.lint.pydocstyle]
100 | convention = "numpy"
101 |
102 | [tool.pytest.ini_options]
103 | minversion = "7"
104 | log_cli_level = "info"
105 | xfail_strict = true
106 | addopts = [
107 | "-ra",
108 | "--showlocals",
109 | "--strict-config",
110 | "--strict-markers",
111 | ]
112 | filterwarnings = ["ignore::PendingDeprecationWarning"]
113 | testpaths = ["tests"]
114 |
115 | [tool.hatch.envs.default]
116 | features = ["dev", "test", "docs"]
117 |
118 | [tool.hatch.envs.default.scripts]
119 | fix = "ruff check --fix ."
120 | lint = "ruff check bioframe tests"
121 | format = "ruff format bioframe tests"
122 | test = "pytest ."
123 | docs = "sphinx-autobuild docs docs/_build/html"
124 |
125 | [tool.hatch.envs.test]
126 | features = ["dev", "test"]
127 |
128 | [[tool.hatch.envs.test.matrix]]
129 | python = ["3.9", "3.10", "3.11", "3.12"]
130 |
--------------------------------------------------------------------------------
/tests/test_assembly_info.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | from bioframe.io.assembly import GenomeAssembly, assemblies_available, assembly_info
5 |
6 |
def test_assemblies_available():
    """The assembly listing is a DataFrame carrying the expected metadata columns."""
    listing = assemblies_available()
    assert isinstance(listing, pd.DataFrame)

    required = ("provider", "provider_build", "default_roles", "default_units")
    for column in required:
        assert column in listing.columns
12 |
13 |
def test_assembly_info():
    """Check the attributes of the ``GenomeAssembly`` returned by ``assembly_info``.

    Covers lookup by bare build name, lookup by ``provider.build`` with explicit
    roles, rejection of a mismatched provider, and cytoband availability.
    """
    assembly = assembly_info("hg38")
    assert isinstance(assembly, GenomeAssembly)
    assert assembly.provider == "ucsc"
    assert assembly.provider_build == "hg38"

    typed_attributes = (
        ("chromsizes", pd.Series),
        ("chromnames", list),
        ("alias_dict", dict),
    )
    for attribute, expected_type in typed_attributes:
        assert isinstance(getattr(assembly, attribute), expected_type)

    seqinfo = assembly.seqinfo
    assert isinstance(seqinfo, pd.DataFrame)
    for column in ("name", "length", "aliases", "role", "unit"):
        assert column in seqinfo.columns

    viewframe = assembly.viewframe
    assert isinstance(viewframe, pd.DataFrame)
    for column in ("chrom", "start", "end", "name"):
        assert column in viewframe.columns

    # Provider-qualified name together with an explicit selection of roles.
    qualified = assembly_info("ucsc.hg38", roles=("assembled", "unlocalized"))
    assert isinstance(qualified, GenomeAssembly)

    with pytest.raises(ValueError):
        assembly_info("ncbi.hg38")  # provider-name mismatch

    cytobands = qualified.cytobands
    assert isinstance(cytobands, pd.DataFrame)
    for column in ("chrom", "start", "end", "band", "stain"):
        assert column in cytobands.columns

    # sacCer3 is expected to expose no cytoband table.
    yeast = assembly_info("sacCer3")
    assert yeast.cytobands is None
43 |
--------------------------------------------------------------------------------
/tests/test_bed.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 |
4 | import pandas as pd
5 | import pytest
6 |
7 | import bioframe
8 |
9 |
def test_involution():
    """Reading a BED file, writing it and reading it back yields the same frame."""
    with tempfile.TemporaryDirectory() as directory:
        for schema in ('narrowPeak', 'bed12'):
            source_path = f'tests/test_data/{schema}.bed'
            original = bioframe.read_table(source_path, schema=schema)

            copy_path = os.path.join(directory, f'{schema}.bed')
            bioframe.to_bed(original, copy_path)

            round_tripped = bioframe.read_table(copy_path, schema=schema)
            pd.testing.assert_frame_equal(original, round_tripped)
19 |
20 |
def test_chrom_validators():
    """``to_bed`` raises ``ValueError`` for each malformed ``chrom`` value."""
    invalid_chroms = (
        'value with space',
        '',  # must be non empty
        'a'*300,  # must be shorter than 256
    )
    with tempfile.TemporaryDirectory() as directory:
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        for chrom in invalid_chroms:
            frame.loc[0, 'chrom'] = chrom
            with pytest.raises(ValueError):
                bioframe.to_bed(frame, os.path.join(directory, 'foo.bed'))
35 |
36 |
def test_end_validators():
    """``to_bed`` raises ``ValueError`` when ``end`` precedes ``start``."""
    with tempfile.TemporaryDirectory() as directory:
        target = os.path.join(directory, 'foo.bed')
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')

        # end must be after start: make the first record end (10) before it starts (11)
        frame.loc[0, 'end'] = 10
        frame.loc[0, 'start'] = 11

        with pytest.raises(ValueError):
            bioframe.to_bed(frame, target)
44 |
45 |
def test_name_validators():
    """``to_bed`` raises ``ValueError`` for an empty or over-long ``name``."""
    invalid_names = (
        '',  # must not be empty
        'a'*300,  # must be less than 255 char
    )
    with tempfile.TemporaryDirectory() as directory:
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        for name in invalid_names:
            frame.loc[0, 'name'] = name
            with pytest.raises(ValueError):
                bioframe.to_bed(frame, os.path.join(directory, 'foo.bed'))
56 |
57 |
def test_score_validators():
    """``to_bed`` rejects an out-of-range score (strict mode) and a non-numeric one."""
    with tempfile.TemporaryDirectory() as directory:
        target = os.path.join(directory, 'foo.bed')
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        # negative value is enforced by the normal types

        # A score of 1001 is rejected when strict_score is requested.
        frame.loc[0, 'score'] = 1001
        with pytest.raises(ValueError):
            bioframe.to_bed(frame, target, strict_score=True)

        # enforced to be a number by the types
        frame['score'] = '.'
        with pytest.raises(TypeError):
            bioframe.to_bed(frame, target)
70 |
71 |
def test_strand_validators():
    """``to_bed`` raises ``ValueError`` for the strand value ``'*'``."""
    with tempfile.TemporaryDirectory() as directory:
        target = os.path.join(directory, 'foo.bed')
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        frame.loc[0, 'strand'] = '*'
        with pytest.raises(ValueError):
            bioframe.to_bed(frame, target)
78 |
79 |
def test_thick_validators():
    """``to_bed`` rejects ``thickStart``/``thickEnd`` outside ``[start, end]``."""
    with tempfile.TemporaryDirectory() as directory:
        for direction in ('Start', 'End'):
            thick_column = f'thick{direction}'
            frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
            frame.loc[0, 'start'] = 100
            frame.loc[0, 'end'] = 1000

            # First one past the end of the interval, then one before its start.
            for position in (1001, 99):
                frame.loc[0, thick_column] = position
                with pytest.raises(ValueError):
                    bioframe.to_bed(frame, os.path.join(directory, 'foo.bed'))
93 |
94 |
def test_itemRgb_validators():
    """``to_bed`` raises ``ValueError`` for each malformed ``itemRgb`` string."""
    invalid_colors = (
        'a,12,13',  # must be integers
        '12,13',  # must be 1 or 3 integers
        '12,13,14,15',  # must be 1 or 3 integers
        '12,13,300',  # must be between 0 and 255
        '300',  # must be between 0 and 255
    )
    with tempfile.TemporaryDirectory() as directory:
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        frame["itemRgb"] = frame["itemRgb"].astype(str)
        for color in invalid_colors:
            frame.loc[0, 'itemRgb'] = color
            with pytest.raises(ValueError):
                bioframe.to_bed(frame, os.path.join(directory, 'foo.bed'))
118 |
119 |
def test_blockCount_validators():
    """``to_bed`` raises ``ValueError`` when ``blockCount`` is zero."""
    with tempfile.TemporaryDirectory() as directory:
        target = os.path.join(directory, 'foo.bed')
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        frame.loc[0, 'blockCount'] = 0
        with pytest.raises(ValueError):
            bioframe.to_bed(frame, target)
126 |
127 |
def test_blockSizes_validators():
    """``to_bed`` rejects ``blockSizes`` that are non-numeric or miscounted."""
    invalid_block_sizes = (
        '2,a,',  # non-integer entry
        '2,2,2,',  # three sizes while blockCount is 2
    )
    with tempfile.TemporaryDirectory() as directory:
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        for sizes in invalid_block_sizes:
            frame.loc[0, 'blockCount'] = 2
            frame.loc[0, 'blockSizes'] = sizes
            with pytest.raises(ValueError):
                bioframe.to_bed(frame, os.path.join(directory, 'foo.bed'))
140 |
141 |
def test_blockStarts_validators():
    """``to_bed`` raises ``ValueError`` for each inconsistent block layout.

    Each scenario is a mapping of column -> value applied, in order, to the
    first record before attempting to write the frame.
    """
    scenarios = [
        # non-integer block start
        {'blockCount': 2, 'blockSizes': '2,4,', 'blockStarts': '0,a,'},
        # three starts while blockCount is 2
        {'blockCount': 2, 'blockSizes': '1,1,', 'blockStarts': '0,2,5,'},
        # ends after end
        {
            'start': 1,
            'end': 10,
            'blockCount': 1,
            'blockSizes': '100,',
            'blockStarts': '0,',
        },
        # ends before end
        {
            'start': 1,
            'end': 10,
            'blockCount': 1,
            'blockSizes': '1,',
            'blockStarts': '0,',
        },
        # overlap
        {
            'start': 1,
            'end': 10,
            'blockCount': 2,
            'blockSizes': '5,5,',
            'blockStarts': '0,1,',
        },
    ]
    with tempfile.TemporaryDirectory() as directory:
        frame = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
        for overrides in scenarios:
            for column, value in overrides.items():
                frame.loc[0, column] = value
            with pytest.raises(ValueError):
                bioframe.to_bed(frame, os.path.join(directory, 'foo.bed'))
183 |
--------------------------------------------------------------------------------
/tests/test_core_construction.py:
--------------------------------------------------------------------------------
1 | from io import StringIO
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 |
7 | from bioframe.core import construction
8 | from bioframe.core.construction import from_any
9 |
10 |
def test_add_ucsc_name_column():
    """``add_ucsc_name_column`` adds a ``name`` column of ``chrom:start-end`` strings."""
    intervals = pd.DataFrame(
        {
            "chrom": [f"chr{i}" for i in range(3)],
            "start": [1, 2, 3],
            "end": [4, 5, 6],
        }
    )
    named = construction.add_ucsc_name_column(intervals)

    expected_names = pd.Series(
        data=["chr0:1-4", "chr1:2-5", "chr2:3-6"], index=[0, 1, 2], name="name"
    )
    pd.testing.assert_series_equal(named["name"], expected_names)
22 |
23 |
def test_any():
    """Exercise ``from_any`` on the input shapes it accepts (and two it rejects)."""
    ### tests copied from old parse_regions
    coord_cols = ["chrom", "start", "end"]

    # main functionality: convert to dataframe and create name
    frame = pd.DataFrame(
        {
            "chrom": [f"chr{i}" for i in range(3)],
            "start": [1, 2, 3],
            "end": [4, 5, 6],
        }
    )
    parsed = from_any(frame)
    assert "name" not in parsed.columns
    assert parsed.iloc[0]["chrom"] == "chr0"

    # re-create dataframe from UCSC name alone
    with_names = construction.add_ucsc_name_column(parsed, name_col="regions")
    names_only = pd.DataFrame({"regions": with_names["regions"].values})
    reparsed = from_any(names_only, name_col="regions")
    assert (reparsed[coord_cols] == parsed).all().all()

    # re-parsing results yields the same
    assert (from_any(parsed) == parsed).all().all()

    # extra columns don't get overwritten
    frame["name"] = "test-value"
    assert (from_any(frame)["name"] == frame["name"]).all()

    # None or False will be parsed
    filled_start = from_any([("chr1", None, 5)], fill_null={"chr1": 10})
    assert filled_start["start"].values[0] == 0

    # pull end from chromsizes
    filled_end = from_any([("chr1", 5, None)], fill_null={"chr1": 40})
    assert list(filled_end.values[0]) == ["chr1", 5, 40]

    # We could keep things as None if chromsizes were not provided
    unfilled = from_any(["chr1", "chr2"], fill_null=False)
    assert list(unfilled.values[0]) == ["chr1", None, None]

    # parse the strange name
    humanized = from_any(["chr1:1,000,000-4M"])
    assert list(humanized.values[0]) == ["chr1", 1000000, 4000000]

    chrom_only = from_any(["chr1"])
    assert list(chrom_only.values[0]) == ["chr1", None, None]

    with pytest.raises(ValueError):
        from_any([("ch1", 1, 2, "chr1:1-2", "puppies")])  # puppies are not allowed

    # fill_null has no entry for the chromosome that needs filling
    with pytest.raises(ValueError):
        from_any([("chr1", 5, None)], fill_null={"chr2": 40})

    # Tuple-based inputs whose first parsed row is ("chr1", 5, 10) in each case.
    tuple_inputs = (
        (("chr1", 5, 10), ("chrX", 10, 20)),  # input tuple of tuples
        (["chr1", 5, 10], ["chrX", 10, 20]),  # input tuple of lists
        ("chr1:5-10",),  # input tuple of ucsc strings
        ("chr1", 5, 10),  # input single tuple
    )
    for regions in tuple_inputs:
        result = from_any(regions)
        assert list(result.values[0]) == ["chr1", 5, 10]
94 |
95 |
def test_sanitize_bedframe():
    """Exercise ``sanitize_bedframe`` null handling and start > end policies.

    Scenarios, in order: dropping rows with nulls, keeping them (recast to
    nullable dtypes), flipping inverted intervals, and dropping inverted
    intervals.
    """
    df1 = pd.DataFrame(
        [
            ["chr1", 10, 20],
            ["chr1", 10, 20],
            ["chr1", 15, np.nan],
            ["chr1", pd.NA, 25],
        ],
        columns=["chrom", "start", "end"],
    )

    # drop rows with null values
    sanitized_df1 = pd.DataFrame(
        [["chr1", 10, 20], ["chr1", 10, 20]], columns=["chrom", "start", "end"]
    )
    sanitized_df1 = sanitized_df1.astype(
        {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
    )
    pd.testing.assert_frame_equal(
        sanitized_df1, construction.sanitize_bedframe(df1, drop_null=True)
    )

    # keep rows with null, but recast: the expected frame has every field of an
    # incomplete row set to NA. Both sides are filled with -1 before comparing.
    sanitized_df1 = pd.DataFrame(
        [
            ["chr1", 10, 20],
            ["chr1", 10, 20],
            [pd.NA, pd.NA, pd.NA],
            [pd.NA, pd.NA, pd.NA],
        ],
        columns=["chrom", "start", "end"],
    )
    sanitized_df1 = sanitized_df1.astype(
        {"chrom": object, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
    )
    pd.testing.assert_frame_equal(
        sanitized_df1.fillna(-1), construction.sanitize_bedframe(df1).fillna(-1)
    )

    # flip intervals as well as drop NA
    # NOTE(review): the mixed-case "fLiP" presumably checks that the action name
    # is matched case-insensitively; confirm against sanitize_bedframe.
    df1 = pd.DataFrame(
        [
            ["chr1", 20, 10],
            ["chr1", pd.NA, 25],
        ],
        columns=["chrom", "start", "end"],
    )
    sanitized_df1 = pd.DataFrame([["chr1", 10, 20]], columns=["chrom", "start", "end"])
    sanitized_df1 = sanitized_df1.astype(
        {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
    )
    pd.testing.assert_frame_equal(
        sanitized_df1,
        construction.sanitize_bedframe(
            df1, start_exceed_end_action="fLiP", drop_null=True
        ),
    )

    # drop inverted intervals as well as drop NA: nothing is left. The input is
    # rebuilt so this scenario does not depend on the previous call.
    df1 = pd.DataFrame(
        [
            ["chr1", 20, 10],
            ["chr1", pd.NA, 25],
        ],
        columns=["chrom", "start", "end"],
    )
    assert construction.sanitize_bedframe(
        df1, start_exceed_end_action="drop", drop_null=True
    ).empty
169 |
170 |
def test_make_viewframe():
    """``make_viewframe`` accepts dicts, lists, Series and DataFrames."""
    view_cols = ["chrom", "start", "end", "name"]
    compare = pd.testing.assert_frame_equal

    # test dict input
    expected = pd.DataFrame(
        [
            ["chrTESTX", 0, 10, "chrTESTX:0-10"],
            ["chrTESTX_p", 0, 12, "chrTESTX_p:0-12"],
        ],
        columns=view_cols,
    )
    compare(
        expected.copy(),
        construction.make_viewframe(
            {"chrTESTX": 10, "chrTESTX_p": 12}, name_style="ucsc"
        ),
    )

    # test list input
    triples = [("chrTESTX", 0, 10), ("chrTESTX_p", 0, 12)]
    compare(expected.copy(), construction.make_viewframe(triples, name_style="ucsc"))

    # test pd.Series input
    chromsizes = pd.Series(data=[5, 8], index=["chrTESTXq", "chrTEST_2p"])

    # without a name style the expected names equal the chromosome names
    table = """        chrom  start  end        name
    0   chrTESTXq      0    5   chrTESTXq
    1  chrTEST_2p      0    8  chrTEST_2p"""
    expected = pd.read_csv(StringIO(table), sep=r"\s+")
    compare(expected.copy(), construction.make_viewframe(chromsizes, name_style=None))

    # with name_style "UCSC" the expected names are chrom:start-end
    table = """        chrom  start  end            name
    0   chrTESTXq      0    5   chrTESTXq:0-5
    1  chrTEST_2p      0    8  chrTEST_2p:0-8"""
    expected = pd.read_csv(StringIO(table), sep=r"\s+")
    compare(
        expected.copy(),
        construction.make_viewframe(chromsizes, name_style="UCSC"),
    )

    # test pd.DataFrame input
    compare(expected.copy(), construction.make_viewframe(expected))

    # if you provide unique names, this is accepted unchanged by make_viewframe
    expected = pd.DataFrame(
        [["chrTESTX", 0, 10, "chrTEST_1"], ["chrTESTY", 0, 12, "chrTEST_2"]],
        columns=view_cols,
    )
    quadruples = [("chrTESTX", 0, 10, "chrTEST_1"), ("chrTESTY", 0, 12, "chrTEST_2")]

    compare(expected.copy(), construction.make_viewframe(quadruples))
    compare(expected.copy(), construction.make_viewframe(expected))

    # bounds larger than every region are accepted
    compare(
        expected.copy(),
        construction.make_viewframe(
            expected, check_bounds={"chrTESTX": 11, "chrTESTY": 13}
        ),
    )

    # chrTESTX ends at 10, beyond the bound of 9
    with pytest.raises(ValueError):
        construction.make_viewframe(
            expected, check_bounds={"chrTESTX": 9, "chrTESTY": 13}
        )
241 |
--------------------------------------------------------------------------------
/tests/test_core_specs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 |
5 | import bioframe
6 | from bioframe.core import specs
7 |
8 |
def test_get_default_colnames():
    """The default column names are ``("chrom", "start", "end")``."""
    expected = ("chrom", "start", "end")
    assert specs._get_default_colnames() == expected
11 |
12 |
def test_update_default_colnames():
    """Updating the default column names is honoured by ops and validated.

    The defaults are module-wide state, so they are restored in a ``finally``
    clause: a failing assertion here must not leak the custom names into
    other tests.
    """
    new_names = ("C", "chromStart", "chromStop")
    specs.update_default_colnames(new_names)
    try:
        assert specs._get_default_colnames() == new_names

        # test that with updated default column names, bioframe.ops recognizes df1
        df1 = pd.DataFrame(
            [["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]],
            columns=list(new_names),
        )
        df1_chromsizes = {"chr1": 100, "chrX": 100}

        df1_complement = pd.DataFrame(
            [
                ["chr1", 0, 1, "chr1"],
                ["chr1", 10, 12, "chr1"],
                ["chr1", 14, 100, "chr1"],
                ["chrX", 0, 100, "chrX"],
            ],
            columns=[*list(new_names), "view_region"],
        )

        pd.testing.assert_frame_equal(
            bioframe.complement(df1, view_df=df1_chromsizes), df1_complement
        )

        # cannot update with just two columns
        with pytest.raises(ValueError):
            specs.update_default_colnames(("chromosome", "position"))

        # extra stuff is not allowed
        with pytest.raises(ValueError):
            specs.update_default_colnames(["chromosome", "start", "end", "extrasuff"])
    finally:
        # reset to default
        specs.update_default_colnames(("chrom", "start", "end"))
49 |
50 |
def test_verify_columns():
    """Check ``specs._verify_columns`` against a frame with custom column names."""
    custom_cols = ("C", "chromStart", "chromStop")
    rows = [["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]]
    frame = pd.DataFrame(rows, columns=list(custom_cols))

    # verifying against the current default names is expected to fail
    with pytest.raises(ValueError):
        specs._verify_columns(frame, specs._get_default_colnames())

    # verifying against the frame's own names is expected to succeed
    has_custom_cols = specs._verify_columns(frame, custom_cols, return_as_bool=True)
    assert has_custom_cols

    # no repeated column names
    with pytest.raises(ValueError):
        specs._verify_columns(frame, ["chromStart", "chromStart"], unique_cols=True)
70 |
71 |
def test_verify_column_dtypes():
    """Check ``specs._verify_column_dtypes`` as column dtypes are changed step by step."""
    custom_cols = ("C", "chromStart", "chromStop")
    frame = pd.DataFrame(
        [["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]],
        columns=list(custom_cols),
    )

    def dtypes_ok():
        # re-evaluated after every in-place dtype change below
        return specs._verify_column_dtypes(frame, custom_cols, return_as_bool=True)

    # verifying against the current default names is expected to raise
    with pytest.raises(ValueError):
        specs._verify_column_dtypes(frame, specs._get_default_colnames())

    assert dtypes_ok()

    # a float start column is expected to be rejected
    frame["chromStart"] = frame["chromStart"].astype(float)
    assert not dtypes_ok()

    # a nullable Int64 start column is expected to be accepted
    frame["chromStart"] = frame["chromStart"].astype(pd.Int64Dtype())
    assert dtypes_ok()

    # an integer chromosome column is expected to be rejected
    frame["C"] = frame["C"].str.replace("chr", "").astype(np.int64)
    assert not dtypes_ok()
92 |
93 |
def test_is_chrom_dtype():
    """``str`` and categorical dtypes should pass ``is_chrom_dtype``; numeric ones should not."""
    ordered_fruit = pd.CategoricalDtype(
        categories=["oranges", "grapefruit", "apples"], ordered=True
    )

    for accepted in (str, ordered_fruit):
        assert specs.is_chrom_dtype(accepted)

    for rejected in (int, float):
        assert not specs.is_chrom_dtype(rejected)
102 |
--------------------------------------------------------------------------------
/tests/test_core_stringops.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 |
4 | from bioframe.core import stringops
5 | from bioframe.core.stringops import parse_region
6 |
7 |
def test_to_ucsc_string():
    """A ``(chrom, start, end)`` triple should format as ``chrom:start-end``."""
    region = ("chr21", 1, 4)
    formatted = stringops.to_ucsc_string(region)
    assert formatted == "chr21:1-4"
10 |
11 |
def test_parse_region():
    """Check ``parse_region`` on accepted region strings and on malformed ones."""
    accepted = [
        # UCSC-style names
        ("chr21", ("chr21", 0, None)),
        ("chr21:1000-2000", ("chr21", 1000, 2000)),
        ("chr21:1,000-2,000", ("chr21", 1000, 2000)),
        # Ensembl style names
        ("6", ("6", 0, None)),
        ("6:1000-2000", ("6", 1000, 2000)),
        ("6:1,000-2,000", ("6", 1000, 2000)),
        # FASTA style names
        ("gb|accession|locus", ("gb|accession|locus", 0, None)),
        ("gb|accession|locus:1000-2000", ("gb|accession|locus", 1000, 2000)),
        ("gb|accession|locus:1,000-2,000", ("gb|accession|locus", 1000, 2000)),
        # Punctuation in names (aside from :)
        ("name-with-hyphens-", ("name-with-hyphens-", 0, None)),
        ("GL000207.1", ("GL000207.1", 0, None)),
        ("GL000207.1:1000-2000", ("GL000207.1", 1000, 2000)),
        # Trailing dash
        ("chr21:1000-", ("chr21", 1000, None)),
        # Humanized units
        ("6:1kb-2kb", ("6", 1000, 2000)),
        ("6:1k-2000", ("6", 1000, 2000)),
        ("6:1kb-2M", ("6", 1000, 2000000)),
        ("6:1Gb-", ("6", 1000000000, None)),
    ]
    for query, expected in accepted:
        assert parse_region(query) == expected

    rejected = [
        "chr1:2,000-1,000",  # reverse selection
        "chr1::1000-2000",  # more than one colon
    ]
    for query in rejected:
        with pytest.raises(ValueError):
            parse_region(query)
55 |
56 |
def test_parse_region_string():
    """``parse_region_string`` should handle unit suffixes, commas and plain integers."""
    cases = {
        "6:1kb-2kb": ("6", 1000, 2000),
        "6:1,000-2,000": ("6", 1000, 2000),
        "c6:1000-2000": ("c6", 1000, 2000),
    }
    for text, expected in cases.items():
        assert stringops.parse_region_string(text) == expected
61 |
62 |
def test_is_complete_ucsc_string():
    """Only a full ``chrom:start-end`` string should count as a complete UCSC string."""
    assert stringops.is_complete_ucsc_string("chrX:1M-2M")

    # partial strings and non-string inputs are all expected to be rejected
    incomplete = [
        "chrX",
        "1M-2M",
        1000,
        np.array([100, 200]),
        np.array(["chr1:100-200"]),
    ]
    for candidate in incomplete:
        assert not stringops.is_complete_ucsc_string(candidate)
70 |
--------------------------------------------------------------------------------
/tests/test_data/bed12.bed:
--------------------------------------------------------------------------------
1 | chr19 54331773 54620705 5C_304_ENm007_FOR_1.5C_304_ENm007_REV_40 1000 . 54331773 54620705 0 2 14528,19855, 0,269077,
2 | chr19 54461360 54620705 5C_304_ENm007_FOR_26.5C_304_ENm007_REV_40 1000 . 54461360 54620705 0 2 800,19855, 0,139490,
3 | chr5 131346229 132145236 5C_299_ENm002_FOR_241.5C_299_ENm002_REV_33 1000 . 131346229 132145236 0 2 2609,2105, 0,796902,
4 | chr21 35037188 35285017 5C_302_ENm005_FOR_339.5C_302_ENm005_REV_403 1000 . 35037188 35285017 0 2 10878,8825, 0,239004,
5 | chr19 54357838 54691409 5C_304_ENm007_FOR_4.5C_304_ENm007_REV_51 1000 . 54357838 54691409 0 2 1055,14125, 0,319446,
6 | chr7 115924626 116693495 5C_298_ENm001_FOR_286.5C_298_ENm001_REV_28 1000 . 115924626 116693495 0 2 4890,1441, 0,767428,
7 | chr19 54600850 54772278 5C_304_ENm007_FOR_62.5C_304_ENm007_REV_40 1000 . 54600850 54772278 0 2 19855,8187, 0,163241,
8 | chr19 54359237 54620705 5C_304_ENm007_FOR_6.5C_304_ENm007_REV_40 1000 . 54359237 54620705 0 2 1949,19855, 0,241613,
9 | chr19 54461360 54893239 5C_304_ENm007_FOR_26.5C_304_ENm007_REV_85 1000 . 54461360 54893239 0 2 800,11088, 0,420791,
10 | chr7 116754962 117005110 5C_298_ENm001_FOR_383.5C_298_ENm001_REV_305 1000 . 116754962 117005110 0 2 10635,363, 0,249785,
11 | chr11 116617499 116757175 5C_300_ENm003_FOR_46.5C_300_ENm003_REV_79 1000 . 116617499 116757175 0 2 2921,19431, 0,120245,
12 | chr22 32920308 33427592 5C_301_ENm004_FOR_338.5C_301_ENm004_REV_218 1000 . 32920308 33427592 0 2 7415,8621, 0,498663,
13 | chr11 1748200 2195481 5C_308_ENm011_FOR_3.5C_308_ENm011_REV_63 1000 . 1748200 2195481 0 2 5843,9589, 0,437692,
14 | chr7 115924626 116158598 5C_298_ENm001_FOR_106.5C_298_ENm001_REV_28 1000 . 115924626 116158598 0 2 4890,1491, 0,232481,
15 | chr16 62281851 62641443 5C_997_ENr313_FOR_118.5C_997_ENr313_REV_2 1000 . 62281851 62641443 0 2 2408,2547, 0,357045,
16 | chr7 116434729 117617181 5C_298_ENm001_FOR_590.5C_298_ENm001_REV_203 1000 . 116434729 117617181 0 2 19679,7324, 0,1175128,
17 | chr7 116544149 116693495 5C_298_ENm001_FOR_286.5C_298_ENm001_REV_236 1000 . 116544149 116693495 0 2 3475,1441, 0,147905,
18 | chr11 1789267 2195481 5C_308_ENm011_FOR_8.5C_308_ENm011_REV_63 1000 . 1789267 2195481 0 2 3188,9589, 0,396625,
19 | chr7 116434729 116693495 5C_298_ENm001_FOR_286.5C_298_ENm001_REV_203 1000 . 116434729 116693495 0 2 19679,1441, 0,257325,
20 | chr7 116849860 117617181 5C_298_ENm001_FOR_590.5C_298_ENm001_REV_342 1000 . 116849860 117617181 0 2 15082,7324, 0,759997,
21 | chr22 32544939 33427592 5C_301_ENm004_FOR_338.5C_301_ENm004_REV_131 1000 . 32544939 33427592 0 2 4212,8621, 0,874032,
22 | chr19 54429407 54620705 5C_304_ENm007_FOR_20.5C_304_ENm007_REV_40 1000 . 54429407 54620705 0 2 7487,19855, 0,171443,
23 | chr19 54764091 54893239 5C_304_ENm007_FOR_62.5C_304_ENm007_REV_85 1000 . 54764091 54893239 0 2 8187,11088, 0,118060,
24 | chr16 62431952 62769565 5C_997_ENr313_FOR_46.5C_997_ENr313_REV_159 1000 . 62431952 62769565 0 2 4031,3833, 0,333780,
25 | chr21 35029593 35285017 5C_302_ENm005_FOR_337.5C_302_ENm005_REV_403 1000 . 35029593 35285017 0 2 6085,8825, 0,246599,
26 | chr5 131346229 132146235 5C_299_ENm002_FOR_242.5C_299_ENm002_REV_33 1000 . 131346229 132146235 0 2 2609,999, 0,799007,
27 | chr19 54600850 54703388 5C_304_ENm007_FOR_55.5C_304_ENm007_REV_40 1000 . 54600850 54703388 0 2 19855,7848, 0,94690,
28 | chrX 153198557 153625659 5C_303_ENm006_FOR_84.5C_303_ENm006_REV_17 1000 . 153198557 153625659 0 2 15711,11331, 0,415771,
29 | chr7 115861595 116766876 5C_298_ENm001_FOR_306.5C_298_ENm001_REV_13 1000 . 115861595 116766876 0 2 9373,1279, 0,904002,
30 | chr22 32920308 33282103 5C_301_ENm004_FOR_300.5C_301_ENm004_REV_218 1000 . 32920308 33282103 0 2 7415,1101, 0,360694,
31 |
--------------------------------------------------------------------------------
/tests/test_data/bed9.bed:
--------------------------------------------------------------------------------
1 | chr1 193500 194500 . 400 + . . 179,45,0
2 | chr1 618500 619500 . 700 + . . 179,45,0
3 | chr1 974500 975500 . 1000 + . . 179,45,0
4 | chr1 1301500 1302500 . 1000 + . . 179,45,0
5 | chr1 1479500 1480500 . 1000 + . . 179,45,0
6 | chr1 2154500 2155500 . 800 + . . 179,45,0
7 | chr1 2450500 2451500 . 900 + . . 179,45,0
8 | chr1 3719500 3720500 . 700 + . . 179,45,0
9 | chr1 4084500 4085500 . 600 + . . 179,45,0
10 | chr1 6292500 6293500 . 900 + . . 179,45,0
11 | chr1 6507500 6508500 . 900 + . . 179,45,0
12 | chr1 8182500 8183500 . 700 + . . 179,45,0
13 | chr1 8988500 8989500 . 1000 + . . 179,45,0
14 | chr1 9483500 9484500 . 900 + . . 179,45,0
15 | chr1 9815500 9816500 . 900 + . . 179,45,0
16 | chr1 10146500 10147500 . 900 + . . 179,45,0
17 | chr1 11023500 11024500 . 1000 + . . 179,45,0
18 | chr1 11266500 11267500 . 800 + . . 179,45,0
19 | chr1 11971500 11972500 . 1000 + . . 179,45,0
20 | chr1 12172500 12173500 . 1000 + . . 179,45,0
21 | chr1 13145500 13146500 . 400 + . . 179,45,0
22 | chr1 13464500 13465500 . 400 + . . 179,45,0
23 | chr1 14030500 14031500 . 600 + . . 179,45,0
24 | chr1 16068500 16069500 . 900 + . . 179,45,0
25 | chr1 16486500 16487500 . 900 + . . 179,45,0
26 | chr1 16756500 16757500 . 1000 + . . 179,45,0
27 | chr1 17035500 17036500 . 700 + . . 179,45,0
28 | chr1 17306500 17307500 . 700 + . . 179,45,0
29 | chr1 18393500 18394500 . 400 + . . 179,45,0
30 | chr1 19383500 19384500 . 700 + . . 179,45,0
31 |
--------------------------------------------------------------------------------
/tests/test_data/jaspar.bed:
--------------------------------------------------------------------------------
1 | chr1 10470 10489 CTCF 803 390 -
2 | chr1 11163 11182 CTCF 811 406 -
3 | chr1 11222 11241 CTCF 959 804 -
4 | chr1 11280 11299 CTCF 939 728 -
5 | chr1 11339 11358 CTCF 837 455 -
6 | chr1 11401 11420 CTCF 829 439 -
7 | chr1 11413 11432 CTCF 803 390 +
8 | chr1 13282 13301 CTCF 800 385 -
9 | chr1 14230 14249 CTCF 817 416 -
10 | chr1 15227 15246 CTCF 806 396 -
11 | chr1 15626 15645 CTCF 830 442 -
12 | chr1 16650 16669 CTCF 826 433 +
13 | chr1 17091 17110 CTCF 821 423 +
14 | chr1 17925 17944 CTCF 806 395 +
15 | chr1 18119 18138 CTCF 807 398 +
16 | chr1 18357 18376 CTCF 808 400 -
17 | chr1 18487 18506 CTCF 810 403 -
18 | chr1 19817 19836 CTCF 804 392 -
19 | chr1 22561 22580 CTCF 806 396 +
20 | chr1 23446 23465 CTCF 800 385 +
21 | chr1 23872 23891 CTCF 823 428 -
22 | chr1 24781 24800 CTCF 892 584 -
23 | chr1 24939 24958 CTCF 828 438 +
24 | chr1 26053 26072 CTCF 832 446 -
25 | chr1 26085 26104 CTCF 843 468 -
26 | chr1 32074 32093 CTCF 803 391 -
27 | chr1 34397 34416 CTCF 803 391 -
28 | chr1 34941 34960 CTCF 815 412 +
29 | chr1 35952 35971 CTCF 807 397 -
30 | chr1 36202 36221 CTCF 807 397 +
31 |
--------------------------------------------------------------------------------
/tests/test_data/narrowPeak.bed:
--------------------------------------------------------------------------------
1 | chr19 48309541 48309911 . 1000 . 5.04924 -1.00000 0.00438 185
2 | chr4 130563716 130564086 . 993 . 5.05052 -1.00000 0.00432 185
3 | chr1 200622507 200622877 . 591 . 5.05489 -1.00000 0.00400 185
4 | chr5 112848447 112848817 . 869 . 5.05841 -1.00000 0.00441 185
5 | chr1 145960616 145960986 . 575 . 5.05955 -1.00000 0.00439 185
6 | chr9 9912714 9913084 . 563 . 5.06079 -1.00000 0.00434 185
7 | chr6 2744599 2744969 . 795 . 5.06457 -1.00000 0.00401 185
8 | chr9 124777413 124777783 . 1000 . 5.06479 -1.00000 0.00402 185
9 | chr1 67701045 67701415 . 780 . 5.06708 -1.00000 0.00416 185
10 | chr10 119859586 119859956 . 825 . 5.08015 -1.00000 0.00362 185
11 | chr3 66816327 66816697 . 1000 . 5.08233 -1.00000 0.00379 185
12 | chr16 50248791 50249161 . 579 . 5.08249 -1.00000 0.00380 185
13 | chr19 41431677 41432047 . 1000 . 5.11060 -1.00000 0.00876 185
14 | chr4 131644839 131645209 . 1000 . 5.11204 -1.00000 0.00855 185
15 | chr2 203239519 203239889 . 753 . 5.11817 -1.00000 0.00755 185
16 | chr1 108877017 108877387 . 1000 . 5.12519 -1.00000 0.00777 185
17 | chr1 23665426 23665796 . 1000 . 5.12618 -1.00000 0.00712 185
18 | chr15 78415607 78415977 . 1000 . 5.14402 -1.00000 0.00913 185
19 | chr9 3181837 3182207 . 1000 . 5.14438 -1.00000 0.00903 185
20 | chr10 50275876 50276246 . 1000 . 5.14891 -1.00000 0.00867 185
21 | chr17 27388554 27388924 . 1000 . 5.15031 -1.00000 0.00809 185
22 | chr1 241485905 241486275 . 1000 . 5.16030 -1.00000 0.00723 185
23 | chr18 56995779 56996149 . 827 . 5.16128 -1.00000 0.00708 185
24 | chr11 24558049 24558419 . 620 . 5.16788 -1.00000 0.00557 185
25 | chr4 109134575 109134945 . 567 . 5.16876 -1.00000 0.00550 185
26 | chr10 84214795 84215165 . 1000 . 5.17597 -1.00000 0.00540 185
27 | chr20 4233733 4234103 . 1000 . 5.17899 -1.00000 0.00497 185
28 | chr2 130356160 130356530 . 1000 . 5.18574 -1.00000 0.00660 185
29 | chr18 55322509 55322879 . 865 . 5.19245 -1.00000 0.00626 185
30 | chr8 126510457 126510827 . 552 . 5.19561 -1.00000 0.00554 185
31 |
--------------------------------------------------------------------------------
/tests/test_data/test.chrom.sizes:
--------------------------------------------------------------------------------
1 | chrTESTX 5
2 | chrTEST2 7
3 |
--------------------------------------------------------------------------------
/tests/test_data/test.fa:
--------------------------------------------------------------------------------
1 | >chrTESTX
2 | AtGcN
3 | >chrTEST2
4 | NGATCNN
5 |
--------------------------------------------------------------------------------
/tests/test_data/test.fa.fai:
--------------------------------------------------------------------------------
1 | chrTESTX 5 10 5 6
2 | chrTEST2 7 26 7 8
3 |
--------------------------------------------------------------------------------
/tests/test_data/toy.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/tests/test_data/toy.bam
--------------------------------------------------------------------------------
/tests/test_data/toy.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/tests/test_data/toy.bam.bai
--------------------------------------------------------------------------------
/tests/test_data/toy.sam:
--------------------------------------------------------------------------------
1 | @SQ SN:ref LN:45
2 | @SQ SN:ref2 LN:40
3 | r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112
4 | r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
5 | r003 0 ref 9 30 5H6M * 0 0 AGCTAA *
6 | r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
7 | r003 16 ref 29 30 6H5M * 0 0 TAGGC *
8 | r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
9 | x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
10 | x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
11 | x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
12 | x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
13 | x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
14 | x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
15 |
--------------------------------------------------------------------------------
/tests/test_fileops.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from io import StringIO
3 |
4 | import pandas as pd
5 | import pytest
6 |
7 | import bioframe
8 |
# True when the interpreter runs on a big-endian host; used to skip tests below.
is_big_endian = (sys.byteorder == "big")
10 |
11 |
12 | ############# tests #####################
def test_read_table():
    """Check ``bioframe.read_table`` shapes for several schemas and strictness modes."""
    whitespace = r"\s+"

    names_only = "chr1\nchr2\nchr2"
    lenient = bioframe.read_table(StringIO(names_only), schema="bed3")
    assert lenient.shape == (3, 3)

    # raise a value error if any columns are filled with all NA
    with pytest.raises(ValueError):
        bioframe.read_table(StringIO(names_only), schema="bed3", schema_is_strict=True)

    # fill with nans to appropriate size if schema_is_strict=False (aka the default)
    three_col = """chr1 5 10
chr1 10 20
chr2 30 40"""
    for schema, n_cols in (("bed3", 3), ("bed6", 6), ("bed12", 12)):
        table = bioframe.read_table(StringIO(three_col), schema=schema, sep=whitespace)
        assert table.shape == (3, n_cols)

    # bedpe has 10 columns
    bedpe_text = """chr1 5 10 chr2 5 10 interval1 . + -
chr1 10 20 chr1 5 10 interval2 . + -
chr2 30 40 chr2 5 10 interval3 12 + -
"""
    bedpe = bioframe.read_table(
        StringIO(bedpe_text), schema="bedpe", sep=whitespace, schema_is_strict=True
    )
    assert bedpe.shape == (3, 10)
37 |
38 |
def test_read_chromsizes():
    """Check ``bioframe.read_chromsizes`` on malformed and well-formed input."""
    # input with only chromosome names is expected to raise
    names_only = "chr1\nchr2\nchr2"
    with pytest.raises(ValueError):
        bioframe.read_chromsizes(StringIO(names_only))

    two_col = "chr1\t1\nchr3\t2\nchr2\t3\n "
    sizes = bioframe.read_chromsizes(StringIO(two_col))

    assert isinstance(sizes, pd.Series)
    assert sizes.name == "length"
    # the result is expected in chr1, chr2, chr3 order rather than file order
    assert list(sizes.index) == ["chr1", "chr2", "chr3"]
    assert list(sizes.values) == [1, 3, 2]
50 |
51 |
def test_read_beds():
    """Check that the example BED-like files parse under their strict schemas.

    Fixture paths are resolved relative to this test module rather than the
    current working directory, so the test does not depend on where pytest
    is launched from.
    """
    import os

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data")

    # Checking that we properly read common bed schemas
    schemas = ["narrowPeak", "jaspar", "bed9", "bed12"]

    for schema in schemas:
        _ = bioframe.read_table(
            os.path.join(data_dir, f"{schema}.bed"),
            schema=schema,
            schema_is_strict=True,
        )
59 |
60 |
@pytest.mark.skipif(is_big_endian, reason="Test skipped on big-endian systems")
def test_read_sam():
    """Smoke-test ``bioframe.read_alignments`` on the bundled SAM file.

    The test is skipped when ``pysam`` is not installed. The fixture path is
    resolved relative to this test module so the test does not depend on the
    current working directory.
    """
    import os

    pytest.importorskip("pysam")
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data")
    # SAM file taken from https://github.com/samtools/samtools/blob/develop/examples/toy.sam
    _ = bioframe.read_alignments(os.path.join(data_dir, "toy.sam"))
66 |
67 |
@pytest.mark.skipif(is_big_endian, reason="Test skipped on big-endian systems")
def test_read_bam():
    """Smoke-test ``bioframe.read_alignments`` on the bundled BAM file.

    The test is skipped when ``pysam`` is not installed. The fixture path is
    resolved relative to this test module so the test does not depend on the
    current working directory.
    """
    import os

    pytest.importorskip("pysam")
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data")
    # converted toy.sam via `samtools view -bS toy.sam > toy.bam`;
    # index file created with `samtools index toy.bam`
    _ = bioframe.read_alignments(os.path.join(data_dir, "toy.bam"))
74 |
--------------------------------------------------------------------------------
/tests/test_ops_select.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | import bioframe
5 |
6 |
def test_select():
    """Check ``bioframe.select`` for matching, non-matching and invalid region queries."""
    cols = ["chrom", "start", "end"]
    df = pd.DataFrame(
        [["chrX", 3, 8], ["chr1", 4, 5], ["chrX", 1, 5]],
        columns=cols,
    )

    def picked(region):
        # selection with a fresh 0..n-1 index, for comparison with expected frames
        return bioframe.select(df, region).reset_index(drop=True)

    only_chr1 = pd.DataFrame([["chr1", 4, 5]], columns=cols)
    pd.testing.assert_frame_equal(only_chr1, picked("chr1:4-10"))

    # whole-chromosome query
    both_chrx = pd.DataFrame([["chrX", 3, 8], ["chrX", 1, 5]], columns=cols)
    pd.testing.assert_frame_equal(both_chrx, picked("chrX"))

    # sub-range query that both chrX rows are expected to match
    both_chrx_again = pd.DataFrame([["chrX", 3, 8], ["chrX", 1, 5]], columns=cols)
    pd.testing.assert_frame_equal(both_chrx_again, picked("chrX:4-6"))

    # Query range not in the dataframe
    for absent in ("chrZ", "chr1:100-1000", "chr1:1-3"):
        assert len(bioframe.select(df, absent)) == 0

    # Invalid query range
    with pytest.raises(ValueError):
        bioframe.select(df, "chr1:1-0")
40 |
41 |
def test_select__with_colnames():
    """``bioframe.select`` should honor non-standard column names passed via ``cols``."""
    new_names = ["chr", "chrstart", "chrend"]
    df = pd.DataFrame(
        [["chrX", 3, 8], ["chr1", 4, 5], ["chrX", 1, 5]],
        columns=new_names,
    )
    expected = pd.DataFrame(
        [["chrX", 3, 8], ["chrX", 1, 5]],
        columns=new_names,
    )

    # both queries are expected to return the two chrX rows
    for region in ("chrX:4-6", "chrX"):
        selected = bioframe.select(df, region, cols=new_names)
        pd.testing.assert_frame_equal(expected, selected.reset_index(drop=True))
59 |
60 |
def test_select__with_nulls():
    """``bioframe.select`` should work on a frame that contains an all-NA interval row."""
    colnames = ["chrom", "start", "end", "view_region"]
    nullable_ints = {"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}

    rows = [
        ["chr1", -6, 12, "chr1p"],
        [pd.NA, pd.NA, pd.NA, "chr1q"],
        ["chrX", 1, 8, "chrX_0"],
    ]
    df = pd.DataFrame(rows, columns=colnames).astype(nullable_ints)

    expected = pd.DataFrame(
        [["chr1", -6, 12, "chr1p"]],
        columns=colnames,
    ).astype(nullable_ints)

    selected = bioframe.select(df, "chr1:0-1").reset_index(drop=True)
    pd.testing.assert_frame_equal(expected, selected)
81 |
82 |
def test_select__mask_indices_labels():
    """``select`` and its mask/labels/indices variants should pick the same rows."""
    cols = ["chrom", "start", "end"]
    df = pd.DataFrame(
        [["chrX", 3, 8], ["chr1", 4, 5], ["chrX", 1, 5]],
        columns=cols,
    )
    region = "chr1:4-10"
    answer = pd.DataFrame([["chr1", 4, 5]], columns=cols)

    selected = bioframe.select(df, region)
    pd.testing.assert_frame_equal(answer, selected.reset_index(drop=True))

    # mask and labels are both applied through ``.loc``
    via_mask = df.loc[bioframe.select_mask(df, region)]
    pd.testing.assert_frame_equal(answer, via_mask.reset_index(drop=True))

    via_labels = df.loc[bioframe.select_labels(df, region)]
    pd.testing.assert_frame_equal(answer, via_labels.reset_index(drop=True))

    # indices are positional, so they go through ``.iloc``
    via_positions = df.iloc[bioframe.select_indices(df, region)]
    pd.testing.assert_frame_equal(answer, via_positions.reset_index(drop=True))
100 |
101 |
def test_select__query_intervals_are_half_open():
    """Check which rows ``bioframe.select`` returns for a series of query ranges."""
    df = pd.DataFrame(
        {
            "chrom": ["chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2", "chr2"],
            "start": [0, 10, 10, 20, 30, 40, 50, 60],
            "end": [10, 20, 20, 30, 40, 50, 60, 70],
            "name": ["a", "b", "A", "B", "C", "D", "E", "F"],
        }
    )

    # (query, what to inspect on the result, expected values), in the original order
    cases = [
        ("chr1", "name", ["a", "b"]),
        ("chr2:20-70", "name", ["B", "C", "D", "E", "F"]),
        ("chr2:20-75", "name", ["B", "C", "D", "E", "F"]),
        ("chr2:20-", "index", [3, 4, 5, 6, 7]),
        ("chr2:20-30", "name", ["B"]),
        ("chr2:20-40", "name", ["B", "C"]),
        ("chr2:20-45", "name", ["B", "C", "D"]),
        ("chr2:19-45", "name", ["A", "B", "C", "D"]),
        ("chr2:25-45", "name", ["B", "C", "D"]),
        ("chr2:25-50", "name", ["B", "C", "D"]),
        ("chr2:25-51", "name", ["B", "C", "D", "E"]),
    ]
    for region, field, expected in cases:
        result = bioframe.select(df, region)
        observed = result.index if field == "index" else result[field]
        assert (observed == expected).all()
144 |
145 |
def test_select__with_point_intervals():
    """Check ``bioframe.select`` on a frame that includes zero-length intervals."""
    # Dataframe containing "point intervals"
    df = pd.DataFrame(
        {
            "chrom": ["chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2", "chr2"],
            "start": [0, 10, 10, 20, 30, 40, 50, 60],
            "end": [10, 10, 20, 30, 40, 50, 50, 70],
            "name": ["a", "b", "A", "B", "C", "D", "E", "F"],
        }
    )

    # (query, expected names), in the original order
    cases = [
        ("chr1", ["a", "b"]),
        ("chr1:4-10", ["a"]),
        ("chr1:4-4", ["a"]),
        ("chr1:10-15", ["b"]),
        ("chr2:20-70", ["B", "C", "D", "E", "F"]),
        ("chr2:49-70", ["D", "E", "F"]),
        ("chr2:50-70", ["E", "F"]),
        ("chr2:50-51", ["E"]),
        ("chr2:50-50", ["E"]),
    ]
    for region, expected_names in cases:
        result = bioframe.select(df, region)
        assert (result["name"] == expected_names).all()
182 |
183 |
def test_select__with_points():
    """Check ``bioframe.select`` when one column serves as both start and end."""
    # Dataframe of points
    df = pd.DataFrame(
        [["chrX", 3, "A"], ["chr1", 4, "C"], ["chrX", 1, "B"]],
        columns=["chrom", "pos", "name"],
    )
    point_cols = ["chrom", "pos", "pos"]

    # every query is expected to return only the chr1 point
    for region in ("chr1:4-10", "chr1:3-10", "chr1:4-4"):
        result = bioframe.select(df, region, cols=point_cols)
        assert (result["name"] == ["C"]).all()
199 |
--------------------------------------------------------------------------------
/tests/test_resources.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | import bioframe
4 |
5 |
def test_fetch_chromsizes():
    """Check the type, name and length of hg38 chromsizes from each provider."""
    assembly = "hg38"
    for source in ("local", "ucsc"):
        sizes = bioframe.fetch_chromsizes(assembly, provider=source)
        assert isinstance(sizes, pd.Series)
        assert sizes.name == "length"
        assert len(sizes) == 25

        # the same data requested as a BED-like frame
        sizes_bed = bioframe.fetch_chromsizes(assembly, provider=source, as_bed=True)
        assert isinstance(sizes_bed, pd.DataFrame)
        assert list(sizes_bed.columns) == ["chrom", "start", "end"]
        assert len(sizes_bed) == 25

    # Check synonymous local assemblies
    by_ucsc_name = bioframe.fetch_chromsizes("hg38", provider="local")
    by_grc_name = bioframe.fetch_chromsizes("GRCh38", provider="local")
    assert by_ucsc_name.equals(by_grc_name)
23 |
24 |
def test_fetch_chromsizes_local_vs_ucsc():
    """Local and UCSC chromsizes should be equal for several assemblies."""
    for assembly in ("hg19", "hg38", "mm9", "mm10"):
        from_local = bioframe.fetch_chromsizes(assembly, provider="local")
        from_ucsc = bioframe.fetch_chromsizes(assembly, provider="ucsc")
        assert from_local.equals(from_ucsc)
30 |
31 |
def test_fetch_centromeres():
    """Check the type, columns and row count of centromere tables per provider."""
    expected_columns = ["chrom", "start", "end", "mid"]
    for assembly in ("hg19", "hg38"):
        # Note: UCSC will usually have a different ordering of chromosomes
        for source in ("local", "ucsc"):
            table = bioframe.fetch_centromeres(assembly, provider=source)
            assert isinstance(table, pd.DataFrame)
            assert list(table.columns) == expected_columns
            assert len(table) == 24
40 |
--------------------------------------------------------------------------------
/tests/test_vis.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 |
5 | import bioframe
6 |
7 |
def test_to_ucsc_colorstring():
    """Check ``bioframe.to_ucsc_colorstring`` on names, hex codes, tuples and missing values."""
    # (input color, expected "R,G,B" string), in the original order
    cases = [
        ("red", "255,0,0"),
        ("blue", "0,0,255"),
        ("green", "0,128,0"),
        ("black", "0,0,0"),
        ("white", "255,255,255"),
        ("r", "255,0,0"),
        ("tomato", "255,99,71"),
        ("xkcd:sky blue", "117,187,253"),
        ("#abc", "170,187,204"),
        ("#ff0000", "255,0,0"),
        ("#ff000055", "255,0,0"),
        ((1, 0, 0), "255,0,0"),
        ((1, 0, 0, 0.5), "255,0,0"),
        ((0, 0, 1), "0,0,255"),
        # missing / "none" colors are expected to map to "0"
        (None, "0"),
        ("none", "0"),
        (np.nan, "0"),
        (pd.NA, "0"),
    ]
    for color, expected in cases:
        assert bioframe.to_ucsc_colorstring(color) == expected

    # an unrecognized color name is expected to raise
    with pytest.raises(ValueError):
        bioframe.to_ucsc_colorstring("notacolor")

    # applying the converter to a column of a frame
    rows = [
        ["chr1", 0, 10, "red"],
        ["chr1", 10, 20, "blue"],
        ["chr2", 0, 10, "green"],
        ["chr2", 10, 20, None],
    ]
    df = bioframe.from_any(rows)
    df["itemRgb"] = df["name"].apply(bioframe.to_ucsc_colorstring)
    assert df["itemRgb"].tolist() == ["255,0,0", "0,0,255", "0,128,0", "0"]
41 |
--------------------------------------------------------------------------------