├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── ci.yml
    │   └── publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CHANGES.md
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bioframe
    ├── __init__.py
    ├── core
    │   ├── __init__.py
    │   ├── arrops.py
    │   ├── checks.py
    │   ├── construction.py
    │   ├── specs.py
    │   └── stringops.py
    ├── extras.py
    ├── io
    │   ├── __init__.py
    │   ├── assembly.py
    │   ├── bed.py
    │   ├── data
    │   │   ├── _assemblies.yml
    │   │   ├── ce10.seqinfo.tsv
    │   │   ├── ce11.seqinfo.tsv
    │   │   ├── danRer10.seqinfo.tsv
    │   │   ├── danRer11.seqinfo.tsv
    │   │   ├── dm3.seqinfo.tsv
    │   │   ├── dm6.seqinfo.tsv
    │   │   ├── hg19.cytoband.tsv
    │   │   ├── hg19.seqinfo.tsv
    │   │   ├── hg38.cytoband.tsv
    │   │   ├── hg38.seqinfo.tsv
    │   │   ├── hs1.cytoband.tsv
    │   │   ├── hs1.seqinfo.tsv
    │   │   ├── mm10.seqinfo.tsv
    │   │   ├── mm39.seqinfo.tsv
    │   │   ├── mm9.seqinfo.tsv
    │   │   ├── sacCer3.seqinfo.tsv
    │   │   └── wuhCor1.seqinfo.tsv
    │   ├── fileops.py
    │   ├── resources.py
    │   └── schemas.py
    ├── ops.py
    ├── sandbox
    │   ├── clients.py
    │   ├── gtf_io.py
    │   └── parquet_io.py
    └── vis.py
├── docs
    ├── Makefile
    ├── api-construction.rst
    ├── api-extras.rst
    ├── api-fileops.rst
    ├── api-intervalops.rst
    ├── api-lowlevel.md
    ├── api-resources.rst
    ├── api-validation.rst
    ├── api-vis.rst
    ├── conf.py
    ├── figs
    │   ├── ._bioframe-logo.png
    │   ├── bioframe-logo.png
    │   ├── bioframe_closest.pdf
    │   ├── closest0.png
    │   ├── closest1.png
    │   ├── closest2.png
    │   ├── closest3.png
    │   ├── df1.png
    │   ├── df2.png
    │   ├── df@.png
    │   ├── merge_df1.png
    │   ├── overlap_inner_0.png
    │   └── overlap_inner_1.png
    ├── guide-bedtools.md
    ├── guide-definitions.rst
    ├── guide-intervalops.md
    ├── guide-io.ipynb
    ├── guide-performance.ipynb
    ├── guide-quickstart.rst
    ├── guide-recipes.md
    ├── guide-specifications.rst
    ├── index.rst
    ├── lowlevel
    │   ├── arrops.rst
    │   ├── specs.rst
    │   └── stringops.rst
    ├── make.bat
    ├── times100.bw
    └── tutorials
    │   ├── tutorial_assign_motifs_to_peaks.ipynb
    │   └── tutorial_assign_peaks_to_genes.ipynb
├── pyproject.toml
└── tests
    ├── test_assembly_info.py
    ├── test_bed.py
    ├── test_core_checks.py
    ├── test_core_construction.py
    ├── test_core_specs.py
    ├── test_core_stringops.py
    ├── test_data
        ├── bed12.bed
        ├── bed9.bed
        ├── jaspar.bed
        ├── narrowPeak.bed
        ├── test.chrom.sizes
        ├── test.fa
        ├── test.fa.fai
        ├── toy.bam
        ├── toy.bam.bai
        └── toy.sam
    ├── test_extras.py
    ├── test_fileops.py
    ├── test_ops.py
    ├── test_ops_select.py
    ├── test_resources.py
    └── test_vis.py


/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: "github-actions"
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: "weekly"
 7 |     groups:
 8 |           actions:
 9 |             patterns:
10 |               - "*"
11 |   - package-ecosystem: "pip"
12 |     directory: "/"
13 |     schedule:
14 |       interval: "weekly"
15 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 | 
 7 |   pull_request:
 8 |     branches: [ main ]
 9 | 
10 | concurrency:
11 |   group: ${{ github.workflow }}-${{ github.ref }}
12 |   cancel-in-progress: true
13 | 
14 | jobs:
15 | 
16 |   Test:
17 |     runs-on: ubuntu-latest
18 |     strategy:
19 |       matrix:
20 |         python-version: [ "3.9", "3.10", "3.11", "3.12" ]
21 |     steps:
22 |       - uses: actions/checkout@v4
23 |       - name: Set up Python ${{ matrix.python-version }}
24 |         uses: actions/setup-python@v5
25 |         with:
26 |           python-version: ${{ matrix.python-version }}
27 |       - run: |
28 |           python -m pip install --upgrade pip hatch
29 |           pip install -e .[dev]
30 |           hatch run test
31 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python Package to PyPI
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 |   workflow_dispatch:
 7 | 
 8 | jobs:
 9 |   Publish:
10 |     # prevents this action from running on forks
11 |     if: github.repository == 'open2c/bioframe'
12 | 
13 |     runs-on: ubuntu-latest
14 |     permissions:
15 |       id-token: write
16 | 
17 |     steps:
18 |       - name: Checkout
19 |         uses: actions/checkout@v4
20 | 
21 |       - name: Setup Python
22 |         uses: actions/setup-python@v5
23 |         with:
24 |           python-version: "3.x"
25 | 
26 |       - name: Install dependencies
27 |         run: |
28 |           python -m pip install --upgrade pip
29 |           pip install build
30 | 
31 |       - name: Build
32 |         run: python -m build
33 | 
34 |       - name: Publish distribution 📦 to PyPI
35 |         uses: pypa/gh-action-pypi-publish@release/v1
36 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.swp
 2 | *.swo
 3 | *~
 4 | 
 5 | *.py[cod]
 6 | __pycache__
 7 | .cache
 8 | .pytest_cache
 9 | .ipynb_checkpoints/
10 | .venv/*
11 | 
12 | # setup and build
13 | docs/_*
14 | *.egg-info/
15 | dist/
16 | build/
17 | MANIFEST
18 | 
19 | # OS-generated files
20 | .DS_Store
21 | .Spotlight-V100
22 | .Trashes
23 | ehthumbs.db
24 | Thumbs.db
25 | 
26 | _scratch/
27 | tmp/
28 | docs/notebooks/.ipynb_checkpoints
29 | .vscode
30 | .spyproject
31 | docs/notebooks/cgranges-test/*
32 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v5.0.0
 4 |     hooks:
 5 |       - id: check-ast
 6 |       - id: end-of-file-fixer
 7 |       - id: mixed-line-ending
 8 |       - id: trailing-whitespace
 9 |       - id: check-case-conflict
10 | 
11 |   - repo: https://github.com/astral-sh/ruff-pre-commit
12 |     rev: v0.7.0
13 |     hooks:
14 |       - id: ruff
15 |         types_or: [python, pyi, jupyter]
16 |         args: [--fix, --show-fixes, --exit-non-zero-on-fix]
17 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | build:
 8 |   os: ubuntu-22.04
 9 |   tools:
10 |     python: "3.10"
11 | # Build documentation in the docs/ directory with Sphinx
12 | sphinx:
13 |   configuration: docs/conf.py
14 | 
15 | # Build documentation with MkDocs
16 | #mkdocs:
17 | #  configuration: mkdocs.yml
18 | 
19 | # Optionally build your docs in additional formats such as PDF and ePub
20 | formats: all
21 | 
22 | # Optionally set the version of Python and requirements required to build your docs
23 | # setup_py_install: true
24 | python:
25 |   install:
26 |     - method: pip
27 |       path: .
28 |       extra_requirements:
29 |         - dev
30 |         - docs
31 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
  1 | # Release notes
  2 | 
  3 | ## [Upcoming release](https://github.com/open2c/bioframe/compare/v0.8.0...HEAD)
  4 | 
  5 | ## v0.8.0
  6 | 
  7 | Date: 2025-04-08
  8 | 
  9 | API changes:
 10 | * bigtools engine for bigwig and bigbed.
 11 | * run length functions `mark_runs` and `compress_runs`.
 12 | 
 13 | Maintenance:
 14 | * Numpy 2.x support.
 15 | 
 16 | ## v0.7.2
 17 | 
 18 | Date: 2024-06-19
 19 | 
 20 | API changes:
 21 | * `read_alignment` function introduced in v0.7.0 has been pluralized to `read_alignments`
 22 | 
 23 | Maintenance:
 24 | * Skip `read_alignments` tests on big-endian architectures by @nvictus in https://github.com/open2c/bioframe/pull/216
 25 | 
 26 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.7.1...v0.7.2
 27 | 
 28 | ## v0.7.1
 29 | 
 30 | Date: 2024-06-17
 31 | 
 32 | Maintenance:
 33 | * Refactor join arrayops and intidx internals by @nvictus in https://github.com/open2c/bioframe/pull/204
 34 | * NumPy 2.0 was released. Pin `numpy < 2` until we migrate.
 35 | 
 36 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.7.0...v0.7.1
 37 | 
 38 | ## v0.7.0
 39 | 
 40 | Date: 2024-05-20
 41 | 
 42 | API changes:
 43 | * Add `to_bed` function to validate and write standard BED files @gamazeps in https://github.com/open2c/bioframe/pull/203
 44 | * `read_bam` deprecated in favor of `read_alignments` @gamazeps in https://github.com/open2c/bioframe/pull/206
 45 | 
 46 | Documentation:
 47 | * Add "bioframe for bedtools users" guide to docs by @gamazeps in https://github.com/open2c/bioframe/pull/198
 48 | 
 49 | Bug fixes:
 50 | * Fix contig name and JSON issues in read_bam implementation by @gamazeps in https://github.com/open2c/bioframe/pull/206
 51 | 
 52 | New Contributors:
 53 | * @gamazeps made their first contribution in https://github.com/open2c/bioframe/pull/203
 54 | 
 55 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.4...v0.7.0
 56 | 
 57 | ## v0.6.4
 58 | 
 59 | Date: 2024-04-06
 60 | 
 61 | Maintenance:
 62 | * Migrate from setuptools `pkg_resources` to `importlib.resources` by @nvictus in https://github.com/open2c/bioframe/pull/194
 63 | * Use `importlib.metadata` for versioning by @nvictus in https://github.com/open2c/bioframe/pull/195
 64 | 
 65 | Bug fixes:
 66 | * Overlap point segment patch #183 by @smitkadvani in https://github.com/open2c/bioframe/pull/184
 67 | * #167: Replaced np.int with int as the attribute is deprecated by numpy by @harshit148 in https://github.com/open2c/bioframe/pull/192
 68 | 
 69 | New Contributors:
 70 | * @harshit148 made their first contribution in https://github.com/open2c/bioframe/pull/192
 71 | 
 72 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.3...v0.6.4
 73 | 
 74 | ## v0.6.3
 75 | 
 76 | Date: 2024-03-11
 77 | 
 78 | Fixes:
 79 | * Prevent dropout from `closest` in some cases of left intervals with no neighbors by @agalitsyna in https://github.com/open2c/bioframe/pull/185
 80 | * Fix overlap returning float indexes causing failing tests (numpy v1.22.4, pandas v1.5.2)  by @agalitsyna in https://github.com/open2c/bioframe/pull/185
 81 | 
 82 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.2...v0.6.3
 83 | 
 84 | ## v0.6.2
 85 | 
 86 | Date: 2024-02-08
 87 | 
 88 | Changes:
 89 | * cols and df_view_col passed to downstream functions by @smitkadvani in https://github.com/open2c/bioframe/pull/182
 90 | 
 91 | Fixes:
 92 | * Update to new UCSC hgdownload url by @golobor and @nvictus in https://github.com/open2c/bioframe/pull/187
 93 | 
 94 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.1...v0.6.2
 95 | 
 96 | ## v0.6.1
 97 | 
 98 | Date: 2024-01-08
 99 | 
100 | API changes:
101 | 
102 | Default behavior of `ensure_nullable` option in `overlap` was modified to minimize the possibility of regressions in libraries that depend on legacy behavior.
103 | 
104 | * The new option was renamed `ensure_int` and is `True` by default. It ensures that output coordinate columns are always returned with an integer dtype, as was the case in prior versions. This is achieved by converting columns having non-nullable NumPy dtypes to Pandas nullable ones in the specific case where the result of an **outer join** generates missing values; otherwise, column dtypes are preserved unchanged in the output.
105 | * Unlike previous minor versions of bioframe, the nullable dtype chosen will have the **same underlying type** as the corresponding column from the input (i.e, an input dataframe using `np.uint32` start coordinates may yield a `pd.UInt32` start column in the output).
106 | * This behavior can be turned off by setting `ensure_int` to `False`, in which case outer joins on dataframes using NumPy dtypes may produce floating point output columns when missing values are introduced (stored as `NaN`), following the native casting behavior of such columns.
107 | 
108 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.0...v0.6.1
109 | 
110 | ## v0.6.0
111 | 
112 | Date: 2024-01-04
113 | 
114 | API changes:
115 | * `overlap`: In previous versions, output coordinate columns were always converted to Pandas "nullable" `Int64` dtype before returning outer join results. In the interest of flexibility, memory efficiency, and least surprise, the coordinate columns returned in the output dataframe now preserve dtype from the input dataframes, following native type casting rules if missing data are introduced. We introduce the `ensure_nullable` argument to force Pandas nullable dtypes in the output coordinates. See the [docs](https://bioframe.readthedocs.io/en/latest/api-intervalops.html#bioframe.ops.overlap) for more details. (#178)
116 | 
117 | Bug fixes:
118 | * Fixed `coverage` with custom `cols1` (#170)
119 | 
120 | Documentation:
121 | * Added contributing guidelines and NumFOCUS affiliation.
122 | * Updated README and added CITATION.cff file.
123 | * Updated performance benchmarks.
124 | 
125 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.5.1...v0.6.0
126 | 
127 | ## v0.5.1
128 | 
129 | Date: 2023-11-08
130 | 
131 | Bug fixes:
132 | * Series are treated like dict in `make_chromarms`
133 | 
134 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.5.0...v0.5.1
135 | 
136 | ## v0.5.0
137 | 
138 | Date: 2023-10-05
139 | 
140 | API changes:
141 | * New builtin curated genome assembly database (metadata, chromsizes, cytobands):
142 |   * `bioframe.list_assemblies()`
143 |   * `bioframe.assembly_info()`
144 | * New UCSC RGB color converter utility #158
145 | * Options added to `pair_by_distance`
146 | 
147 | Bug fixes:
148 | * Make expand throw an error if both pad and scale are passed (#148)
149 | * Fixes to bioframe.select query interval semantics (#147)
150 | 
151 | Maintenance:
152 | * Migrate to hatch build system and pyproject.toml
153 | * Various refactorings
154 | 
155 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.4.1...v0.5.0
156 | 
157 | ## v0.4.1
158 | 
159 | Date: 2023-04-22
160 | 
161 | Bug fixes:
162 | * Fix bug introduced in the last release in `select` and `select_*` query interval semantics. Results of select are now consistent with the query interval being interpreted as half-open, closed on the left.
163 | 
164 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.4.0...v0.4.1
165 | 
166 | ## v0.4.0
167 | 
168 | Date: 2023-03-23
169 | 
170 | API changes:
171 | * New strand-aware directionality options for `closest()` via `direction_col` #129.
172 | * New index-based range query selectors on single bioframes to complement `select()` #128:
173 |     * `select_mask()` returns boolean indices corresponding to intervals that overlap the query region
174 |     * `select_indices()` returns integer indices corresponding to intervals that overlap the query region
175 |     * `select_labels()` returns pandas label indices corresponding to intervals that overlap the query region
176 | 
177 | Bug fixes:
178 | * Import fixes in sandbox
179 | * Relax bioframe validator to permit using same column as start and end (e.g. point variants).
180 | 
181 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.3...v0.4.0
182 | 
183 | ## v0.3.3
184 | 
185 | Date: 2022-02-28
186 | 
187 | Bug fixes:
188 | * fixed a couple functions returning an error instance instead of raising
189 | * fetch_mrna link fixed
190 | 
191 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.2...v0.3.3
192 | 
193 | ## v0.3.2
194 | 
195 | Date: 2022-02-01
196 | 
197 | Bug fixes:
198 | * fixed error in is_contained
199 | * tutorial updates
200 | 
201 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.1...v0.3.2
202 | 
203 | ## v0.3.1
204 | 
205 | Date: 2021-11-15
206 | 
207 | API changes:
208 | 
209 | * `bioframe.sort_bedframe` does not append columns or modify their dtypes.
210 | 
211 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.0...v0.3.1
212 | 
213 | ## v0.3.0
214 | 
215 | Date: 2021-08-31
216 | 
217 | Conceptual changes:
218 | * we formulated strict definitions for genomic intervals, dataframes, and
219 |     their various properties. All bioframe functions are expected to adhere
220 |     to these definitions strictly.
221 | 
222 | API changes:
223 | * reorganize modules:
224 |     * ops - operations on genomic interval dataframes
225 |     * extras - miscellaneous operations, most involving
226 |         genomic sequences and gene annotations
227 |     * vis - visualizations of genomic interval dataframes
228 |     * core.arrops - operations on genomic interval arrays
229 |     * core.checks - tests for definitions of genomic interval dataframes
230 |     * core.construction - construction and sanitation of genomic interval dataframes
231 |     * core.specs - specifications for the implementation of genomic intervals in pandas.dataframes
232 |         (i.e. column names, datatypes, etc)
233 |     * core.stringops - operations on genomic interval strings
234 |     * io.fileops - I/O on common file formats for genomic data
235 |     * io.schemas - schemas for standard tabular formats for genomic data storage
236 |     * io.resources - interfaces to popular online genomic data resources
237 | 
238 | * new functions: extras.pair_by_distance, ops.sort_bedframe, ops.assign_view,
239 |     dataframe constructors
240 | 
241 | * existing functions:
242 |     * expand: take negative values and fractional values
243 |     * overlap: change default suffixes, keep_order=True
244 |     * subtract: add return_index and keep_order
245 | 
246 | * enable pd.NA for missing values, typecasting
247 | 
248 | New data:
249 | * add schemas for bedpe, gap, UCSCmRNA, pgsnp
250 | * add tables with curated detailed genome assembly information
251 | 
252 | Bugfixes:
253 | * None?..
254 | 
255 | Miscellaneous:
256 | * `frac_gc` is faster now
257 | * drop support for Python 3.6, add support for 3.9
258 | 
259 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.2.0...v0.3.0
260 | 
261 | ## v0.2.0
262 | 
263 | Date: 2020-12-02
264 | 
265 | API changes:
266 | * `read_chromsizes` and `fetch_chromsizes`: add new `as_bed` parameter.
267 | * `read_chromsizes` and `fetch_chromsizes`: revert to filtering chromosome names by default, but clearly expose `filter_chroms` kwarg.
268 | 
269 | Bug fixes:
270 | * Fixed `bioframe.split`
271 | * Restored `frac_genome_coverage`
272 | 
273 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.1.0...v0.2.0
274 | 
275 | ## v0.1.0
276 | 
277 | Date: 2020-09-23
278 | 
279 | First beta release.
280 | 
281 | What's new:
282 | * New extensive dataframe genomic interval arithmetic toolsuite.
283 | * Improved region handling and region querying functions.
284 | * [Documentation!](https://bioframe.readthedocs.io/)
285 | 
286 | Maintenance:
287 | * Dropped Python 2 support
288 | * Refactoring of various genome operations and resources.
289 | * Improved testing and linting
290 | 
291 | **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.0.12...v0.1.0
292 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | type: software
 3 | title: bioframe
 4 | license: MIT
 5 | repository-code: 'https://github.com/open2c/bioframe'
 6 | message: >-
 7 |   If you use this software, please cite it using the
 8 |   metadata from this file.
 9 | authors:
10 |   - given-names: Nezar
11 |     family-names: Abdennur
12 |     orcid: 'https://orcid.org/0000-0001-5814-0864'
13 |   - given-names: Geoffrey
14 |     family-names: Fudenberg
15 |     orcid: "https://orcid.org/0000-0001-5905-6517"
16 |   - given-names: Ilya
17 |     family-names: Flyamer
18 |     orcid: "https://orcid.org/0000-0002-4892-4208"
19 |   - given-names: Aleksandra
20 |     family-names: Galitsyna
21 |     orcid: "https://orcid.org/0000-0001-8969-5694"
22 |   - given-names: Anton
23 |     family-names: Goloborodko
24 |     orcid: "https://orcid.org/0000-0002-2210-8616"
25 |   - given-names: Maxim
26 |     family-names: Imakaev
27 |     orcid: "https://orcid.org/0000-0002-5320-2728"
28 |   - given-names: Sergey
29 |     family-names: Venev
30 |     orcid: "https://orcid.org/0000-0002-1507-7460"
31 | abstract: >-
32 |   Bioframe is a library to enable flexible and performant
33 |   operations on genomic interval data frames in Python.
34 | keywords:
35 |   - bioinformatics
36 |   - genomics
37 |   - ranges
38 |   - intervals
39 |   - dataframes
40 |   - pandas
41 |   - numpy
42 |   - Python
43 | identifiers:
44 |   - type: doi
45 |     value: 10.5281/zenodo.3897573
46 |     description: Zenodo
47 |   - type: doi
48 |     value: 10.1101/2022.02.16.480748
49 |     description: bioRxiv preprint
50 |   - type: doi
51 |     value: 10.1093/bioinformatics/btae088
52 |     description: Publication
53 | preferred-citation:
54 |   type: article
55 |   title: "Bioframe: Operations on Genomic Intervals in Pandas Dataframes"
56 |   authors:
57 |     - family-names: Open2C
58 |     - given-names: Nezar
59 |       family-names: Abdennur
60 |       orcid: 'https://orcid.org/0000-0001-5814-0864'
61 |     - given-names: Geoffrey
62 |       family-names: Fudenberg
63 |       orcid: "https://orcid.org/0000-0001-5905-6517"
64 |     - given-names: Ilya
65 |       family-names: Flyamer
66 |       name-suffix: M
67 |       orcid: "https://orcid.org/0000-0002-4892-4208"
68 |     - given-names: Aleksandra
69 |       family-names: Galitsyna
70 |       name-suffix: A
71 |       orcid: "https://orcid.org/0000-0001-8969-5694"
72 |     - given-names: Anton
73 |       family-names: Goloborodko
74 |       orcid: "https://orcid.org/0000-0002-2210-8616"
75 |     - given-names: Maxim
76 |       family-names: Imakaev
77 |       orcid: "https://orcid.org/0000-0002-5320-2728"
78 |     - given-names: Sergey
79 |       family-names: Venev
80 |       orcid: "https://orcid.org/0000-0002-1507-7460"
81 |   journal: Bioinformatics
82 |   year: 2024
83 |   url: "https://doi.org/10.1093/bioinformatics/btae088"
84 |   doi: "10.1093/bioinformatics/btae088"
85 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | 
  4 | ## General guidelines
  5 | 
  6 | If you haven't contributed to open-source before, we recommend you read [this excellent guide by GitHub on how to contribute to open source](https://opensource.guide/how-to-contribute). The guide is long, so you can skim the parts you're already familiar with.
  7 | 
  8 | If you're not already familiar with it, we follow the [fork and pull model](https://help.github.com/articles/about-collaborative-development-models) on GitHub. Also, check out this recommended [git workflow](https://www.asmeurer.com/git-workflow/).
  9 | 
 10 | 
 11 | ## Contributing Code
 12 | 
 13 | This project has a number of requirements for all code contributed.
 14 | 
 15 | * We follow the [PEP-8 style](https://www.python.org/dev/peps/pep-0008/) convention.
 16 | * We use [NumPy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html).
 17 | * It's ideal if user-facing API changes or new features have documentation added.
 18 | * It is best if all new functionality and/or bug fixes have unit tests added with each use-case.
 19 | 
 20 | 
 21 | ## Setting up Your Development Environment
 22 | 
 23 | This project uses the [hatch](https://hatch.pypa.io/latest/) project manager and build system. We recommend you install `hatch` as a global isolated application using [pipx](https://pipx.pypa.io/stable/). See other installation options [here](https://hatch.pypa.io/latest/install/).
 24 | 
 25 | ```sh
 26 | pipx install hatch
 27 | ```
 28 | 
 29 | > [!NOTE]
 30 | > Many custom command shortcuts are accessible through hatch (and shown below). See `tool.hatch.envs.default.scripts` in our project's `pyproject.toml` configuration file.
 31 | 
 32 | After forking and cloning the repository, you can create an isolated Python development environment and install the package in "editable" (i.e. development) mode as follows:
 33 | 
 34 | ```sh
 35 | git clone https://github.com/open2c/bioframe.git
 36 | cd bioframe
 37 | hatch shell
 38 | ```
 39 | 
 40 | The first time you run `hatch shell` the environment will be created and activated, and the package will be installed. In future sessions, running `hatch shell` will reactivate your development environment.
 41 | 
 42 | > [!TIP]
 43 | > If you prefer to store your virtual environments in your working directory (like classic virtualenvs) rather than in a centralized location (similar to conda), configure hatch as follows:
 44 | >
 45 | > ```sh
 46 | > hatch config set dirs.env.virtual .venv
 47 | > ```
 48 | >
 49 | > This will make hatch set up its environments within the current working directory under `.venv`.
 50 | 
 51 | Alternatively, if you prefer to manage your virtual environments yourself, you can install the package for development using, for example:
 52 | 
 53 | ```sh
 54 | python -m venv .venv
 55 | source .venv/bin/activate
 56 | pip install -e '.[dev,test,docs]'
 57 | ```
 58 | 
 59 | For all pull requests, linting and unit tests are automatically run using the [GitHub Actions](https://docs.github.com/en/actions) Continuous Integration service. However, you are still encouraged to run these checks locally before pushing code to a PR.
 60 | 
 61 | ## Linting and Formatting
 62 | 
 63 | We use [ruff](https://docs.astral.sh/ruff/) for style checking. Run `ruff check .` or:
 64 | 
 65 | ```sh
 66 | hatch run lint
 67 | ```
 68 | 
 69 | Ruff can fix a lot of errors itself. Run `ruff check --fix .` or:
 70 | 
 71 | ```sh
 72 | hatch run fix
 73 | ```
 74 | 
 75 | Ruff includes a formatter that mimics [black](https://black.readthedocs.io/en/stable/). To automatically reformat your code, you can use `ruff format {source_file}`.
 76 | 
 77 | We use [pre-commit](https://github.com/pre-commit/pre-commit) to make sure the coding style is enforced. You first need to install pre-commit and the corresponding git commit hooks:
 78 | 
 79 | ```sh
 80 | pip install pre-commit
 81 | pre-commit install
 82 | ```
 83 | 
 84 | The last command installs the hooks listed in `.pre-commit-config.yaml` locally into your git repo. If you do this, the checks will run automatically before every commit. You can also manually make sure your code satisfies the coding style:
 85 | 
 86 | ```sh
 87 | pre-commit run --all-files
 88 | ```
 89 | 
 90 | ## Testing
 91 | 
 92 | It is best if all new functionality and/or bug fixes have unit tests added with each use-case.
 93 | 
 94 | We use [pytest](https://docs.pytest.org/en/latest) as our unit testing framework. Once you've configured your environment, you can just `cd` to the root of your repository and run `pytest` or:
 95 | 
 96 | ```sh
 97 | hatch run test
 98 | ```
 99 | 
100 | ## Documentation
101 | 
102 | If a feature is stable and relatively finalized, it is time to add it to the documentation. If you are adding any private/public functions, it is best to add docstrings, to aid in reviewing code and also for the API reference.
103 | 
104 | We use [NumPy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html) and [Sphinx](http://www.sphinx-doc.org/en/stable) to document this library. Sphinx, in turn, uses [reStructuredText](http://www.sphinx-doc.org/en/stable/rest.html) as its markup language for adding code.
105 | 
106 | We use the [Sphinx Autosummary extension](http://www.sphinx-doc.org/en/stable/ext/autosummary.html) to generate API references. You may want to look at `docs/api-*.rst` files to see how they look and where to add new functions, classes or modules. We also use the [myst_nb extension](https://myst-nb.readthedocs.io/en/latest/) to render Jupyter notebooks in the documentation.
107 | 
108 | To build the documentation, run `sphinx-autobuild` using:
109 | 
110 | ```sh
111 | hatch run docs
112 | ```
113 | 
114 | This will build the documentation and serve it on a local http server which listens for changes and automatically rebuilds.
115 | 
116 | Documentation from the `main` branch and tagged releases is automatically built and hosted on [readthedocs](https://readthedocs.org/).
117 | 
118 | 
119 | ## Acknowledgments
120 | 
121 | This document is based off of the [guidelines from the sparse project](https://github.com/pydata/sparse/blob/master/docs/contributing.rst).
122 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Open2C Developers
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Bioframe: Operations on Genomic Interval Dataframes
  2 | 
  3 | <img src="https://github.com/open2c/bioframe/raw/main/docs/figs/bioframe-logo.png" width=75%>
  4 | 
  5 | ![CI](https://github.com/open2c/bioframe/actions/workflows/ci.yml/badge.svg)
  6 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/open2c/bioframe/main.svg)](https://results.pre-commit.ci/latest/github/open2c/bioframe/main)
  7 | [![Docs status](https://readthedocs.org/projects/bioframe/badge/)](https://bioframe.readthedocs.io/en/latest/)
  8 | [![Paper](https://img.shields.io/badge/DOI-10.1093%2Fbioinformatics%2Fbtae088-blue)](https://doi.org/10.1093/bioinformatics/btae088)
  9 | [![Zenodo](https://zenodo.org/badge/69901992.svg)](https://zenodo.org/badge/latestdoi/69901992)
 10 | [![Slack](https://img.shields.io/badge/chat-slack-%233F0F3F?logo=slack)](https://bit.ly/open2c-slack)
 11 | [![NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://www.numfocus.org)
 12 | 
 13 | Bioframe enables flexible and scalable operations on genomic interval dataframes in Python.
 14 | 
 15 | Bioframe is built directly on top of [Pandas](https://pandas.pydata.org/). Bioframe provides:
 16 | 
 17 | * A variety of genomic interval operations that work directly on dataframes.
 18 | * Operations for special classes of genomic intervals, including chromosome arms and fixed-size bins.
 19 | * Conveniences for diverse tabular genomic data formats and loading genome assembly summary information.
 20 | 
 21 | Read the [documentation](https://bioframe.readthedocs.io/en/latest/), including the [guide](https://bioframe.readthedocs.io/en/latest/guide-intervalops.html), as well as the [publication](https://doi.org/10.1093/bioinformatics/btae088) for more information.
 22 | 
 23 | Bioframe is an Affiliated Project of [NumFOCUS](https://www.numfocus.org).
 24 | 
 25 | ## Installation
 26 | 
 27 | Bioframe is available on [PyPI](https://pypi.org/project/bioframe/) and [bioconda](https://bioconda.github.io/recipes/bioframe/README.html):
 28 | 
 29 | ```sh
 30 | pip install bioframe
 31 | ```
 32 | 
 33 | ## Contributing
 34 | 
 35 | Interested in contributing to bioframe? That's great! To get started, check out the [contributing guide](https://github.com/open2c/bioframe/blob/main/CONTRIBUTING.md). Discussions about the project roadmap take place on the [Open2C Slack](https://bit.ly/open2c-slack) and regular developer meetings scheduled there. Anyone can join and participate!
 36 | 
 37 | 
 38 | ## Interval operations
 39 | 
 40 | Key genomic interval operations in bioframe include:
 41 | - `overlap`: Find pairs of overlapping genomic intervals between two dataframes.
 42 | - `closest`: For every interval in a dataframe, find the closest intervals in a second dataframe.
 43 | - `cluster`: Group overlapping intervals in a dataframe into clusters.
 44 | - `complement`: Find genomic intervals that are not covered by any interval from a dataframe.
 45 | 
 46 | Bioframe additionally has functions that are frequently used for genomic interval operations and can be expressed as combinations of these core operations and dataframe operations, including: `coverage`, `expand`, `merge`, `select`, and `subtract`.
 47 | 
 48 | To `overlap` two dataframes, call:
 49 | ```python
 50 | import bioframe as bf
 51 | 
 52 | bf.overlap(df1, df2)
 53 | ```
 54 | 
 55 | For these two input dataframes, with intervals all on the same chromosome:
 56 | 
 57 | <img src="https://github.com/open2c/bioframe/raw/main/docs/figs/df1.png" width=60%>
 58 | <img src="https://github.com/open2c/bioframe/raw/main/docs/figs/df2.png" width=60%>
 59 | 
 60 | `overlap` will return the following interval pairs as overlaps:
 61 | 
 62 | <img src="https://github.com/open2c/bioframe/raw/main/docs/figs/overlap_inner_0.png" width=60%>
 63 | <img src="https://github.com/open2c/bioframe/raw/main/docs/figs/overlap_inner_1.png" width=60%>
 64 | 
 65 | 
 66 | To `merge` all overlapping intervals in a dataframe, call:
 67 | ```python
 68 | import bioframe as bf
 69 | 
 70 | bf.merge(df1)
 71 | ```
 72 | 
 73 | For this input dataframe, with intervals all on the same chromosome:
 74 | 
 75 | <img src="https://github.com/open2c/bioframe/raw/main/docs/figs/df1.png" width=60%>
 76 | 
 77 | `merge` will return a new dataframe with these merged intervals:
 78 | 
 79 | <img src="https://github.com/open2c/bioframe/raw/main/docs/figs/merge_df1.png" width=60%>
 80 | 
 81 | See the [guide](https://bioframe.readthedocs.io/en/latest/guide-intervalops.html) for visualizations of other interval operations in bioframe.
 82 | 
 83 | ## File I/O
 84 | 
 85 | Bioframe includes utilities for reading genomic file formats into dataframes and vice versa. One handy function is `read_table` which mirrors pandas’s read_csv/read_table but provides a [`schema`](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py) argument to populate column names for common tabular file formats.
 86 | 
 87 | ```python
 88 | jaspar_url = 'http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2022/hg38/MA0139.1.tsv.gz'
 89 | ctcf_motif_calls = bioframe.read_table(jaspar_url, schema='jaspar', skiprows=1)
 90 | ```
 91 | 
 92 | ## Tutorials
 93 | See this [jupyter notebook](https://github.com/open2c/bioframe/blob/main/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb) for an example of how to assign TF motifs to ChIP-seq peaks using bioframe.
 94 | 
 95 | 
 96 | ## Citing
 97 | 
 98 | If you use ***bioframe*** in your work, please cite:
 99 | 
100 | ```bibtex
101 | @article{bioframe_2024,
102 | author = {Open2C and Abdennur, Nezar and Fudenberg, Geoffrey and Flyamer, Ilya M and Galitsyna, Aleksandra A and Goloborodko, Anton and Imakaev, Maxim and Venev, Sergey},
103 | doi = {10.1093/bioinformatics/btae088},
104 | journal = {Bioinformatics},
105 | title = {{Bioframe: Operations on Genomic Intervals in Pandas Dataframes}},
106 | year = {2024}
107 | }
108 | ```
109 | 


--------------------------------------------------------------------------------
/bioframe/__init__.py:
--------------------------------------------------------------------------------
# Prefer the standard-library ``importlib.metadata``; fall back to the
# ``importlib_metadata`` package when the stdlib import fails.
try:
    from importlib.metadata import PackageNotFoundError, version
except ImportError:
    from importlib_metadata import PackageNotFoundError, version

# Look up the version of the installed "bioframe" distribution; use the
# placeholder "unknown" when no such distribution metadata is found.
try:
    __version__ = version("bioframe")
except PackageNotFoundError:
    __version__ = "unknown"
 10 | 
 11 | __all__ = [
 12 |     "arrops",
 13 |     "from_any",
 14 |     "from_dict",
 15 |     "from_list",
 16 |     "from_series",
 17 |     "is_bedframe",
 18 |     "is_cataloged",
 19 |     "is_chrom_dtype",
 20 |     "is_complete_ucsc_string",
 21 |     "is_contained",
 22 |     "is_covering",
 23 |     "is_overlapping",
 24 |     "is_sorted",
 25 |     "is_tiling",
 26 |     "is_viewframe",
 27 |     "make_viewframe",
 28 |     "parse_region",
 29 |     "parse_region_string",
 30 |     "sanitize_bedframe",
 31 |     "to_ucsc_string",
 32 |     "update_default_colnames",
 33 |     "binnify",
 34 |     "digest",
 35 |     "frac_gc",
 36 |     "frac_gene_coverage",
 37 |     "frac_mapped",
 38 |     "make_chromarms",
 39 |     "pair_by_distance",
 40 |     "seq_gc",
 41 |     "SCHEMAS",
 42 |     "UCSCClient",
 43 |     "assemblies_available",
 44 |     "assembly_info",
 45 |     "fetch_centromeres",
 46 |     "fetch_chromsizes",
 47 |     "load_fasta",
 48 |     "read_alignments",
 49 |     "read_bam",
 50 |     "read_bigbed",
 51 |     "read_bigwig",
 52 |     "read_chromsizes",
 53 |     "read_pairix",
 54 |     "read_tabix",
 55 |     "read_table",
 56 |     "to_bed",
 57 |     "to_bigbed",
 58 |     "to_bigwig",
 59 |     "assign_view",
 60 |     "closest",
 61 |     "cluster",
 62 |     "complement",
 63 |     "count_overlaps",
 64 |     "coverage",
 65 |     "expand",
 66 |     "merge",
 67 |     "overlap",
 68 |     "mark_runs",
 69 |     "merge_runs",
 70 |     "select",
 71 |     "select_indices",
 72 |     "select_labels",
 73 |     "select_mask",
 74 |     "setdiff",
 75 |     "sort_bedframe",
 76 |     "subtract",
 77 |     "trim",
 78 |     "plot_intervals",
 79 |     "to_ucsc_colorstring",
 80 | ]
 81 | 
 82 | from .core import (
 83 |     arrops,
 84 |     from_any,
 85 |     from_dict,
 86 |     from_list,
 87 |     from_series,
 88 |     is_bedframe,
 89 |     is_cataloged,
 90 |     is_chrom_dtype,
 91 |     is_complete_ucsc_string,
 92 |     is_contained,
 93 |     is_covering,
 94 |     is_overlapping,
 95 |     is_sorted,
 96 |     is_tiling,
 97 |     is_viewframe,
 98 |     make_viewframe,
 99 |     parse_region,
100 |     parse_region_string,
101 |     sanitize_bedframe,
102 |     to_ucsc_string,
103 |     update_default_colnames,
104 | )
105 | from .extras import (
106 |     binnify,
107 |     digest,
108 |     frac_gc,
109 |     frac_gene_coverage,
110 |     frac_mapped,
111 |     make_chromarms,
112 |     mark_runs,
113 |     merge_runs,
114 |     pair_by_distance,
115 |     seq_gc,
116 | )
117 | from .io import (
118 |     SCHEMAS,
119 |     UCSCClient,
120 |     assemblies_available,
121 |     assembly_info,
122 |     fetch_centromeres,
123 |     fetch_chromsizes,
124 |     load_fasta,
125 |     read_alignments,
126 |     read_bam,
127 |     read_bigbed,
128 |     read_bigwig,
129 |     read_chromsizes,
130 |     read_pairix,
131 |     read_tabix,
132 |     read_table,
133 |     to_bed,
134 |     to_bigbed,
135 |     to_bigwig,
136 | )
137 | from .ops import (
138 |     assign_view,
139 |     closest,
140 |     cluster,
141 |     complement,
142 |     count_overlaps,
143 |     coverage,
144 |     expand,
145 |     merge,
146 |     overlap,
147 |     select,
148 |     select_indices,
149 |     select_labels,
150 |     select_mask,
151 |     setdiff,
152 |     sort_bedframe,
153 |     subtract,
154 |     trim,
155 | )
156 | from .vis import plot_intervals, to_ucsc_colorstring
157 | 
158 | del version, PackageNotFoundError
159 | 


--------------------------------------------------------------------------------
/bioframe/core/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import arrops
 2 | from .checks import (
 3 |     is_bedframe,
 4 |     is_cataloged,
 5 |     is_contained,
 6 |     is_covering,
 7 |     is_overlapping,
 8 |     is_sorted,
 9 |     is_tiling,
10 |     is_viewframe,
11 | )
12 | from .construction import (
13 |     from_any,
14 |     from_dict,
15 |     from_list,
16 |     from_series,
17 |     make_viewframe,
18 |     sanitize_bedframe,
19 | )
20 | from .specs import is_chrom_dtype, update_default_colnames
21 | from .stringops import (
22 |     is_complete_ucsc_string,
23 |     parse_region,
24 |     parse_region_string,
25 |     to_ucsc_string,
26 | )
27 | 
28 | __all__ = [
29 |     "arrops",
30 |     "is_bedframe",
31 |     "is_cataloged",
32 |     "is_contained",
33 |     "is_covering",
34 |     "is_overlapping",
35 |     "is_sorted",
36 |     "is_tiling",
37 |     "is_viewframe",
38 |     "from_any",
39 |     "from_dict",
40 |     "from_list",
41 |     "from_series",
42 |     "make_viewframe",
43 |     "sanitize_bedframe",
44 |     "is_chrom_dtype",
45 |     "update_default_colnames",
46 |     "is_complete_ucsc_string",
47 |     "parse_region",
48 |     "parse_region_string",
49 |     "to_ucsc_string",
50 | ]
51 | 


--------------------------------------------------------------------------------
/bioframe/core/construction.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | from . import checks
  5 | from .specs import _get_default_colnames, _verify_columns, is_chrom_dtype
  6 | from .stringops import is_complete_ucsc_string, parse_region_string, to_ucsc_string
  7 | 
  8 | __all__ = [
  9 |     "from_dict",
 10 |     "from_series",
 11 |     "from_list",
 12 |     "from_any",
 13 |     "make_viewframe",
 14 |     "sanitize_bedframe",
 15 | ]
 16 | 
 17 | ### conversions from various input formats into dataframes ###
 18 | 
 19 | 
 20 | def from_dict(regions, cols=None):
 21 |     """
 22 |     Makes a dataframe from a dictionary of {str,int} pairs, interpreted as
 23 |     chromosome names.
 24 | 
 25 |     Note that {str,(int,int)} dictionaries of tuples are no longer supported!
 26 | 
 27 |     Parameters
 28 |     ----------
 29 | 
 30 |     regions : dict
 31 | 
 32 |     name_col : str
 33 |         Default 'name'.
 34 | 
 35 |     cols : (str, str, str) or None
 36 |         The names of columns containing the chromosome, start and end of the
 37 |         genomic intervals, provided separately for each set. The default
 38 |         values are 'chrom', 'start', 'end'.
 39 | 
 40 |     Returns
 41 |     -------
 42 |     df : pandas.DataFrame
 43 |     """
 44 |     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
 45 |     data = []
 46 |     for k, v in dict(regions).items():
 47 |         chrom = k
 48 |         if np.isscalar(v):
 49 |             start = 0
 50 |             end = v
 51 |         else:
 52 |             raise ValueError("Unsupported dict format: {type(v)}")
 53 |         data.append([chrom, start, end])
 54 |     return pd.DataFrame(data, columns=[ck1, sk1, ek1])
 55 | 
 56 | 
 57 | def from_series(regions, cols=None):
 58 |     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
 59 |     chroms = regions.index.values
 60 |     data = {ck1: chroms, sk1: 0, ek1: regions.values}
 61 |     return pd.DataFrame(data)
 62 | 
 63 | 
 64 | def from_list(regions, name_col="name", cols=None):
 65 |     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
 66 |     df = pd.DataFrame(regions)
 67 |     if df.shape[1] == 3:
 68 |         df.columns = [ck1, sk1, ek1]
 69 |     elif df.shape[1] == 4:
 70 |         df.columns = [ck1, sk1, ek1, name_col]
 71 |     else:
 72 |         raise ValueError("wrong number of columns for list input format")
 73 |     return df
 74 | 
 75 | 
 76 | def from_ucsc_string_list(region_list, cols=None):
 77 |     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
 78 |     parsed = [parse_region_string(i) for i in region_list]
 79 |     df = pd.DataFrame(parsed, columns=[ck1, sk1, ek1])
 80 |     return df
 81 | 
 82 | 
 83 | def from_any(regions, fill_null=False, name_col="name", cols=None):
 84 |     """
 85 |     Attempts to make a genomic interval dataframe with columns
 86 |     [chr, start, end, name_col] from a variety of input types.
 87 | 
 88 |     Parameters
 89 |     ----------
 90 |     regions : supported input
 91 |         Currently supported inputs:
 92 | 
 93 |             - dataframe
 94 |             - series of UCSC strings
 95 |             - dictionary of {str:int} key value pairs
 96 |             - pandas series where the index is interpreted as chromosomes and
 97 |               values are interpreted as end
 98 |             - list of tuples or lists, either [(chrom,start,end)] or
 99 |               [(chrom,start,end,name)]
100 |             - tuple of tuples or lists, either [(chrom,start,end)] or
101 |               [(chrom,start,end,name)]
102 | 
103 |     fill_null : False or dictionary
104 |         Accepts a dictionary of {str:int} pairs, interpreted as chromosome sizes.
105 |         Kept or backwards compatibility. Default False.
106 | 
107 |     name_col : str
108 |         Column name. Only used if 4 column list is provided. Default "name".
109 | 
110 |     cols : (str,str,str)
111 |         Names for dataframe columns.
112 |         Default None sets them with get_default_colnames().
113 | 
114 |     Returns
115 |     -------
116 |     out_df:dataframe
117 | 
118 |     """
119 |     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
120 | 
121 |     if isinstance(regions, pd.DataFrame):
122 |         if {ck1, sk1, ek1}.issubset(regions.columns):
123 |             out_df = regions.copy()
124 |         elif (len(regions[name_col].values.shape) == 1) and is_complete_ucsc_string(
125 |             regions[name_col].values[0]
126 |         ):
127 |             out_df = from_ucsc_string_list(
128 |                 regions[name_col].values, cols=[ck1, sk1, ek1]
129 |             )
130 |         else:
131 |             raise ValueError("Unknown dataFrame format: check column names")
132 | 
133 |     elif isinstance(regions, dict):
134 |         out_df = from_dict(regions, cols=[ck1, sk1, ek1])
135 | 
136 |     elif isinstance(regions, pd.Series):
137 |         out_df = from_series(regions, cols=[ck1, sk1, ek1])
138 | 
139 |     elif isinstance(regions, tuple):
140 |         if np.shape(regions) == (3,):
141 |             out_df = from_list([regions], name_col=name_col, cols=[ck1, sk1, ek1])
142 | 
143 |         elif len(np.shape(regions)) == 1 and isinstance(regions[0], str):
144 |             out_df = from_ucsc_string_list(regions, cols=[ck1, sk1, ek1])
145 |         else:
146 |             out_df = from_list(list(regions), name_col=name_col, cols=[ck1, sk1, ek1])
147 | 
148 |     elif isinstance(regions, list):
149 |         if np.shape(regions) == (3,):
150 |             out_df = from_list([regions], name_col=name_col, cols=[ck1, sk1, ek1])
151 |         elif len(np.shape(regions)) == 1 and isinstance(regions[0], str):
152 |             out_df = from_ucsc_string_list(regions, cols=[ck1, sk1, ek1])
153 |         else:
154 |             out_df = from_list(regions, name_col=name_col, cols=[ck1, sk1, ek1])
155 |     else:
156 |         raise ValueError(f"Unknown input format: {type(regions)}")
157 | 
158 |     if fill_null:
159 |         out_df[sk1] = pd.to_numeric(out_df[sk1]).fillna(0)
160 |         try:
161 |             ends = []
162 |             for i in range(len(out_df)):
163 |                 if out_df[ek1].values[i] is None:
164 |                     ends.append(fill_null[out_df[ck1].values[i]])
165 |                 else:
166 |                     ends.append(out_df[ek1].values[i])
167 |             out_df[ek1] = ends
168 |         except Exception as e:
169 |             raise ValueError("could not fill ends with provided chromsizes") from e
170 | 
171 |     return out_df
172 | 
173 | 
def add_ucsc_name_column(reg_df, name_col="name", cols=None):
    """
    Returns a copy of ``reg_df`` with a name column built from each row's
    (chrom, start, end) via ``to_ucsc_string``.

    An existing ``name_col`` column is overwritten; the input dataframe
    itself is not modified.

    Parameters
    ----------
    reg_df : pandas.DataFrame

    name_col : str
        Column that receives the generated names. Default 'name'.

    cols : (str, str, str) or None
        Names of the chromosome, start and end columns. When None, the
        names returned by ``_get_default_colnames()`` are used.

    Returns
    -------
    df : pandas.DataFrame
    """
    chrom_key, start_key, end_key = (
        _get_default_colnames() if cols is None else cols
    )
    named = reg_df.copy()
    _verify_columns(named, [chrom_key, start_key, end_key])
    triples = zip(named[chrom_key], named[start_key], named[end_key])
    named[name_col] = list(map(to_ucsc_string, triples))
    return named
187 | 
188 | 
def make_viewframe(
    regions,
    check_bounds=None,
    name_style=None,
    view_name_col="name",
    cols=None,
):
    """
    Builds a dataframe `view_df` from `regions` and validates it as a view.

    Parameters
    ----------
    regions : supported input type
        Anything accepted by ``from_any``, for example:

            - a dictionary {str:int} of region names to end coordinates
            - a pandas series of chromosome lengths indexed by region name
            - a list of tuples [(chrom,start,end), ...] or
              [(chrom,start,end,name), ...]
            - a pandas DataFrame

    check_bounds : None, or chromosome sizes in any format accepted for regions
        If provided, the view regions must pass ``checks.is_contained``
        against these bounds, otherwise a ValueError is raised.
        Default None.

    name_style : None or "ucsc"
        Only consulted when `view_name_col` is not already a column.
        If None, names are copied from the chromosome column (cols[0]).
        If "ucsc" (case-insensitive), UCSC style names are generated.

    view_name_col : str
        Specifies column name of the view regions. Default 'name'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    view_df : dataframe that passed ``checks.is_viewframe``

    Raises
    ------
    ValueError
        If the regions are not contained by `check_bounds`, if `name_style`
        is not recognised, or if the result is not a valid view.

    """
    chrom_key, _start_key, _end_key = (
        _get_default_colnames() if cols is None else cols
    )

    view_df = from_any(regions, name_col=view_name_col, cols=cols)

    if check_bounds is not None:
        bounds_df = from_any(check_bounds, name_col="bounds", cols=cols)
        within_bounds = checks.is_contained(
            view_df,
            bounds_df,
            df_view_col=None,
            view_name_col="bounds",
            cols=cols,
        )
        if not within_bounds:
            raise ValueError(
                "Invalid input to make a viewFrame, regions not contained by bounds"
            )

    # Names are only generated when the input did not bring its own.
    lacks_names = view_name_col not in view_df.columns
    if lacks_names and name_style is None:
        view_df[view_name_col] = view_df[chrom_key].values
    elif lacks_names and name_style.lower() == "ucsc":
        view_df = add_ucsc_name_column(view_df, name_col=view_name_col, cols=cols)
    elif lacks_names:
        raise ValueError("unknown value for name_style")

    valid = checks.is_viewframe(
        view_df, view_name_col=view_name_col, cols=cols, raise_errors=True
    )
    if not valid:
        raise ValueError("could not make valid viewFrame, retry with new input")
    return view_df
263 | 
264 | 
def sanitize_bedframe(
    df1,
    recast_dtypes=True,
    drop_null=False,
    start_exceed_end_action=None,
    cols=None,
):
    """
    Attempts to clean a genomic interval dataframe to be a valid bedframe.

    The input is copied; `df1` itself is not modified.

    Parameters
    ----------
    df1 : pandas.DataFrame

    recast_dtypes : bool
        Whether to attempt to recast column dtypes to pandas nullable dtypes.
        The chromosome column is cast to ``str`` unless it already has an
        accepted chromosome dtype; start and end are cast to ``Int64``.

    drop_null : bool
        Drops rows with pd.NA (rows containing a null in any column) and
        resets the index. Default False.

    start_exceed_end_action : str or None
        Options: 'flip' or 'drop' or None (case-insensitive). Default None.

            - If 'flip', attempts to sanitize by flipping intervals with start>end.
            - If 'drop' attempts to sanitize dropping intervals with start>end.
            - If None, does not alter these intervals if present.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    out_df : pandas.DataFrame
        Sanitized dataframe satisfying the properties of a bedframe.

    Raises
    ------
    ValueError
        If the interval columns are missing, if `start_exceed_end_action` is
        not recognised while intervals with start>end are present, or if the
        result still fails ``checks.is_bedframe``.

    Notes
    ------
    The option ``start_exceed_end_action='flip'`` may be useful for gff files
    with strand information but starts > ends.

    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

    out_df = df1.copy()

    _verify_columns(out_df, [ck1, sk1, ek1])

    if recast_dtypes:
        chrom_dtype, start_dtype, end_dtype = out_df.dtypes[[ck1, sk1, ek1]]
        if not is_chrom_dtype(chrom_dtype):
            out_df[ck1] = out_df[ck1].astype(str)
        # Test the dtype by type, not by identity against a freshly built
        # ``pd.Int64Dtype()`` instance, which is not a reliable equality test.
        already_int64 = isinstance(start_dtype, pd.Int64Dtype) and isinstance(
            end_dtype, pd.Int64Dtype
        )
        if not already_int64:
            out_df[sk1] = out_df[sk1].astype(pd.Int64Dtype())
            out_df[ek1] = out_df[ek1].astype(pd.Int64Dtype())

    # An interval with any null coordinate is nulled out entirely.
    nan_intervals = pd.isnull(out_df[[ck1, sk1, ek1]]).any(axis=1)
    out_df.loc[nan_intervals, [ck1, sk1, ek1]] = pd.NA
    if drop_null:
        out_df.dropna(axis=0, inplace=True)
        out_df.reset_index(drop=True, inplace=True)

    if start_exceed_end_action is not None:
        start_exceed_end_action = start_exceed_end_action.lower()
        # Computed once and reused for both the test and the row selection.
        inverted = (out_df[ek1] - out_df[sk1]) < 0
        if inverted.any():
            inds = inverted.values
            if start_exceed_end_action == "drop":
                out_df = out_df.loc[inds == 0]
            elif start_exceed_end_action == "flip":
                out_df.loc[inds, [sk1, ek1]] = out_df.loc[inds, [ek1, sk1]].values
            else:
                raise ValueError("unknown action for intervals with start>end")
            out_df.reset_index(drop=True, inplace=True)

    if checks.is_bedframe(out_df, cols=cols):
        return out_df
    else:
        raise ValueError("could not sanitize")
344 | 


--------------------------------------------------------------------------------
/bioframe/core/specs.py:
--------------------------------------------------------------------------------
  1 | import collections
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | 
  6 | __all__ = [
  7 |     "update_default_colnames",
  8 |     "is_chrom_dtype",
  9 | ]
 10 | 
# Module-level settings: the default names of the chromosome, start and end
# columns. Read by _get_default_colnames() and changed by
# update_default_colnames.
_rc = {"colnames": {"chrom": "chrom", "start": "start", "end": "end"}}
 12 | 
 13 | 
 14 | def _get_default_colnames():
 15 |     """
 16 |     Returns default column names.
 17 | 
 18 |     These defaults be updated with :func:`update_default_colnames`.
 19 | 
 20 |     Returns
 21 |     -------
 22 |     colnames : triplet (str, str, str)
 23 | 
 24 |     """
 25 |     return _rc["colnames"]["chrom"], _rc["colnames"]["start"], _rc["colnames"]["end"]
 26 | 
 27 | 
 28 | class update_default_colnames:
 29 |     def __init__(self, new_colnames):
 30 |         self._old_colnames = dict(_rc["colnames"])
 31 |         if isinstance(new_colnames, collections.abc.Iterable):
 32 |             if len(new_colnames) != 3:
 33 |                 raise ValueError(
 34 |                     "Please, specify new columns using a list of "
 35 |                     "3 strings or a dict!"
 36 |                 )
 37 |             (
 38 |                 _rc["colnames"]["chrom"],
 39 |                 _rc["colnames"]["start"],
 40 |                 _rc["colnames"]["end"],
 41 |             ) = new_colnames
 42 |         elif isinstance(new_colnames, collections.abc.Mapping):
 43 |             _rc["colnames"].update(
 44 |                 {
 45 |                     k: v
 46 |                     for k, v in new_colnames.items()
 47 |                     if k in ["chrom", "start", "end"]
 48 |                 }
 49 |             )
 50 |         else:
 51 |             raise ValueError(
 52 |                 "Please, specify new columns using a list of " "3 strings or a dict!"
 53 |             )
 54 | 
 55 |     def __enter__(self):
 56 |         return self
 57 | 
 58 |     def __exit__(self, *args):
 59 |         _rc["colnames"] = self._old_colnames
 60 | 
 61 | 
 62 | def _verify_columns(df, colnames, unique_cols=False, return_as_bool=False):
 63 |     """
 64 |     Raises ValueError if columns with colnames are not present in dataframe df.
 65 | 
 66 |     Parameters
 67 |     ----------
 68 |     df: pandas.DataFrame
 69 | 
 70 |     colnames: list of column names
 71 | 
 72 |     return_as_bool : bool
 73 |         If True, returns as a boolean instead of raising errors. Default False.
 74 | 
 75 |     """
 76 | 
 77 |     if not isinstance(df, pd.DataFrame):
 78 |         if return_as_bool:
 79 |             return False
 80 |         raise ValueError("df is not a dataframe")
 81 | 
 82 |     if unique_cols:
 83 |         if len(set(colnames)) < len(colnames):
 84 |             raise ValueError("column names must be unique")
 85 | 
 86 |     if not set(colnames).issubset(df.columns):
 87 |         if return_as_bool:
 88 |             return False
 89 |         raise ValueError(
 90 |             ", ".join(set(colnames).difference(set(df.columns)))
 91 |             + " not in keys of df.columns"
 92 |         )
 93 |     if return_as_bool:
 94 |         return True
 95 | 
 96 | 
 97 | def _verify_column_dtypes(df, cols=None, return_as_bool=False):
 98 |     """
 99 |     Checks that dataframe `df` has chrom, start, end columns with valid dtypes.
100 |     Raises TypeErrors if cols have invalid dtypes.
101 | 
102 |     Parameters
103 |     ----------
104 |     df : pandas.DataFrame
105 | 
106 |     cols : (str, str, str) or None
107 |         The names of columns containing the chromosome, start and end of the
108 |         genomic intervals, provided separately for each set. The default
109 |         values are 'chrom', 'start', 'end'.
110 | 
111 |     return_as_bool : bool
112 |         If true, returns as a boolean instead of raising errors. Default False.
113 | 
114 |     """
115 |     ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
116 |     if not _verify_columns(df, [ck1, sk1, ek1], return_as_bool=True):
117 |         if return_as_bool:
118 |             return False
119 |         raise ValueError("could not verify columns")
120 | 
121 |     chrom_dtype, start_dtype, end_dtype = df.dtypes[[ck1, sk1, ek1]]
122 | 
123 |     if not is_chrom_dtype(chrom_dtype):
124 |         if return_as_bool:
125 |             return False
126 |         raise TypeError(
127 |             "invalid df['chrom'] dtype, must be object, string, or categorical"
128 |         )
129 |     if not pd.api.types.is_integer_dtype(start_dtype):
130 |         if return_as_bool:
131 |             return False
132 |         raise TypeError("invalid df['start'] dtype, must be integer")
133 | 
134 |     if not pd.api.types.is_integer_dtype(end_dtype):
135 |         if return_as_bool:
136 |             return False
137 |         raise TypeError("invalid df['end'] dtype, must be integer")
138 | 
139 |     if return_as_bool:
140 |         return True
141 | 
142 | 
def is_chrom_dtype(chrom_dtype):
    """
    Tell whether `chrom_dtype` is acceptable for a bioframe chrom column.

    A dtype qualifies if pandas reports it as a string dtype or an object
    dtype, or if it is a ``CategoricalDtype`` instance.
    """
    accepted = (
        pd.api.types.is_string_dtype(chrom_dtype),
        pd.api.types.is_object_dtype(chrom_dtype),
        isinstance(chrom_dtype, pd.api.types.CategoricalDtype),
    )
    return np.any(accepted)
154 | 


--------------------------------------------------------------------------------
/bioframe/core/stringops.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import Optional, Tuple, Union
  3 | 
  4 | import pandas as pd
  5 | 
# Public names exported by this module.
__all__ = [
    "parse_region",
    "parse_region_string",
    "is_complete_ucsc_string",
    "to_ucsc_string",
]

# Matches a run of digits, commas and periods. The capturing group makes
# ``re.split`` keep the numeric run in its result, which is how
# ``_parse_humanized_int`` separates a value from its trailing unit suffix.
NUMERIC_REGEX = re.compile("([0-9,.]+)")

# Token kinds for the "start-end" part of a range string. Order matters: the
# patterns are joined into one alternation, so earlier kinds win.
#   HYPHEN - the separator between start and end
#   COORD  - digits/commas, an optional decimal part, an optional letter suffix
#   OTHER  - anything else (catch-all, so unexpected text still yields a token)
RANGE_TOKEN_SPEC = [
    ("HYPHEN", r"-"),
    ("COORD", r"[0-9,]+(\.[0-9]*)?(?:[a-z]+)?"),
    ("OTHER", r".+"),
]

# One named group per token kind, each allowed to be preceded by whitespace.
# IGNORECASE lets the COORD letter suffix match upper-case units as well.
RANGE_REGEX = re.compile(
    r"\s*" + r"|\s*".join(rf"(?P<{name}>{token})" for name, token in RANGE_TOKEN_SPEC),
    re.IGNORECASE,
)
 25 | 
 26 | 
 27 | def to_ucsc_string(grange: Tuple[str, int, int]) -> str:
 28 |     """
 29 |     Convert a grange to a UCSC string.
 30 | 
 31 |     Parameters
 32 |     ----------
 33 |     grange : tuple or other iterable
 34 |         chrom, start, end
 35 | 
 36 |     Returns
 37 |     -------
 38 |     str
 39 |         UCSC-style genomic range string, '{chrom}:{start}-{end}'
 40 |     """
 41 |     return "{}:{}-{}".format(*grange)
 42 | 
 43 | 
 44 | def is_complete_ucsc_string(s: str) -> bool:
 45 |     """
 46 |     Returns True if a string can be parsed into a completely informative
 47 |     (chrom, start, end) format.
 48 | 
 49 |     Parameters
 50 |     ----------
 51 |     s : str
 52 | 
 53 |     Returns
 54 |     -------
 55 |     bool
 56 |         True if able to be parsed and ``end`` is known.
 57 | 
 58 |     """
 59 |     if not isinstance(s, str):
 60 |         return False
 61 |     _, _, end = parse_region_string(s)
 62 |     if end is None:
 63 |         return False
 64 |     return True
 65 | 
 66 | 
 67 | def _parse_humanized_int(s: str) -> int:
 68 |     _, value, unit = NUMERIC_REGEX.split(s.replace(",", ""))
 69 | 
 70 |     # No multiplier unit, just return the integer value
 71 |     if not len(unit):
 72 |         return int(value)
 73 | 
 74 |     # Parse and apply the multiplier. Remaining decimal places are dropped.
 75 |     value = float(value)
 76 |     unit = unit.upper().strip()
 77 |     if unit in ("K", "KB"):
 78 |         value *= 1_000
 79 |     elif unit in ("M", "MB"):
 80 |         value *= 1_000_000
 81 |     elif unit in ("G", "GB"):
 82 |         value *= 1_000_000_000
 83 |     else:
 84 |         raise ValueError(f"Unknown unit '{unit}'")
 85 |     return int(value)
 86 | 
 87 | 
 88 | def parse_region_string(s: str) -> Tuple[str, int, int]:
 89 |     """
 90 |     Parse a UCSC-style genomic range string into a triple.
 91 | 
 92 |     Parameters
 93 |     ----------
 94 |     s : str
 95 |         UCSC-style genomic range string, e.g. "chr5:10,100,000-30,000,000".
 96 | 
 97 |     Returns
 98 |     -------
 99 |     tuple
100 |         (str, int or None, int or None)
101 | 
102 |     See also
103 |     --------
104 |     parse_region
105 |     """
106 | 
107 |     def _tokenize(s):
108 |         for match in RANGE_REGEX.finditer(s):
109 |             name = match.lastgroup
110 |             yield name, match.group(name)
111 | 
112 |     def _parse_range(token_stream):
113 |         name, token = next(token_stream, (None, None))
114 |         if name != "COORD":
115 |             raise ValueError(f"Expected COORD; got unexpected token: {name}: {token}")
116 |         start = _parse_humanized_int(token)
117 | 
118 |         name, token = next(token_stream, (None, None))
119 |         if name != "HYPHEN":
120 |             raise ValueError(f"Expected HYPHEN; got unexpected token: {name}: {token}")
121 | 
122 |         name, token = next(token_stream, (None, None))
123 |         if name is None:  # No end coordinate
124 |             end = None
125 |         elif name == "COORD":
126 |             end = _parse_humanized_int(token)
127 |         else:
128 |             raise ValueError(f"Expected COORD; got unexpected token: {name}: {token}")
129 | 
130 |         return start, end
131 | 
132 |     parts = s.split(":")
133 | 
134 |     chrom = parts[0].strip()
135 |     if not len(chrom):
136 |         raise ValueError("Chromosome name cannot be empty")
137 | 
138 |     if len(parts) < 2:
139 |         return (chrom, None, None)
140 | 
141 |     start, end = _parse_range(_tokenize(parts[1]))
142 | 
143 |     return chrom, start, end
144 | 
145 | 
146 | def _parse_region_record(grange: tuple) -> Tuple[str, int, int]:
147 |     """
148 |     Coerce a genomic range record into a triple.
149 | 
150 |     Parameters
151 |     ----------
152 |     grange : str or tuple
153 |         * A triple (chrom, start, end), where ``start`` or ``end`` may be
154 |           ``None``.
155 |         * A quadruple or higher-order tuple, e.g. (chrom, start, end, name).
156 |           ``name`` and other fields will be ignored.
157 | 
158 |     Returns
159 |     -------
160 |     tuple
161 |         A well-formed genomic range triple (str, int, int).
162 |     """
163 |     if len(grange) < 3:
164 |         raise ValueError("Length of a range record should be at least 3")
165 |     chrom, start, end = grange[:3]
166 |     chrom = str(chrom)
167 |     start = int(start) if start is not None else start
168 |     end = int(end) if end is not None else end
169 |     return chrom, start, end
170 | 
171 | 
def parse_region(
    grange: Union[str, tuple],
    chromsizes: Optional[Union[dict, pd.Series]] = None,
    *,
    check_bounds: bool = True,
) -> Tuple[str, int, int]:
    """
    Turn a genomic range string or record into a (chrom, start, end) triple.

    Parameters
    ----------
    grange : str or tuple
        * A UCSC-style genomic range string, e.g. "chr5:10,100,000-30,000,000".
        * A triple (chrom, start, end), where ``start`` or ``end`` may be
          ``None``.
        * A longer record, e.g. (chrom, start, end, name); fields after the
          third are ignored.

    chromsizes : dict or Series, optional
        Lookup table of sequence lengths, used to fill in a missing end
        coordinate and for the upper bounds check.

    check_bounds : bool, optional [default: True]
        If True, require ``start >= 0`` and, when the sequence length is
        known, ``end <= length``.

    Returns
    -------
    tuple
        (chrom, start, end). ``end`` stays ``None`` when it is missing and
        no ``chromsizes`` are given.

    Raises
    ------
    ValueError
        If ``chromsizes`` is given but has no entry for ``chrom``, if
        ``end`` is less than ``start``, or if the bounds check fails.

    Notes
    -----
    Genomic ranges are interpreted as half-open intervals (0-based starts,
    1-based ends) along the length coordinate of a sequence.

    A missing start coordinate is replaced with 0. A missing end coordinate
    is replaced with the sequence length when ``chromsizes`` are provided.

    In range strings, the start and end coordinates may carry k(b), M(b) or
    G(b) multipliers, case-insensitive, e.g. "chr1:1K-2M" is equivalent to
    "chr1:1000-2000000".
    """
    parser = parse_region_string if isinstance(grange, str) else _parse_region_record
    chrom, start, end = parser(grange)

    # Look up the sequence length; it doubles as the default end coordinate.
    clen = None
    if chromsizes is not None:
        try:
            clen = chromsizes[chrom]
        except KeyError as e:
            raise ValueError(f"Unknown sequence label: {chrom}") from e
        end = clen if end is None else end

    # A missing start means "from the beginning of the sequence".
    start = 0 if start is None else start

    if end is not None and end < start:
        raise ValueError("End cannot be less than start")

    if check_bounds:
        if start < 0 or (clen is not None and end > clen):
            raise ValueError(f"Genomic range out of bounds: [{start}, {end})")

    return chrom, start, end
250 | 


--------------------------------------------------------------------------------
/bioframe/io/__init__.py:
--------------------------------------------------------------------------------
 1 | from .assembly import assemblies_available, assembly_info
 2 | from .bed import to_bed
 3 | from .fileops import (
 4 |     load_fasta,
 5 |     read_alignments,
 6 |     read_bam,
 7 |     read_bigbed,
 8 |     read_bigwig,
 9 |     read_chromsizes,
10 |     read_pairix,
11 |     read_tabix,
12 |     read_table,
13 |     to_bigbed,
14 |     to_bigwig,
15 | )
16 | from .resources import UCSCClient, fetch_centromeres, fetch_chromsizes
17 | from .schemas import SCHEMAS
18 | 
# Public API of the ``bioframe.io`` subpackage. Every name listed here is
# imported above from one of the sibling submodules (assembly, bed, fileops,
# resources, schemas).
__all__ = [
    "assemblies_available",
    "assembly_info",
    "read_table",
    "read_chromsizes",
    "read_tabix",
    "read_pairix",
    "read_bam",
    "read_alignments",
    "load_fasta",
    "read_bigwig",
    "to_bed",
    "to_bigwig",
    "read_bigbed",
    "to_bigbed",
    "UCSCClient",
    "fetch_centromeres",
    "fetch_chromsizes",
    "SCHEMAS",
]
39 | 


--------------------------------------------------------------------------------
/bioframe/io/assembly.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | 
  3 | try:
  4 |     from importlib.resources import files as resource_path
  5 | except ImportError:
  6 |     from importlib_resources import files as resource_path
  7 | 
  8 | from typing import Dict, List, Optional, Tuple, Union
  9 | 
 10 | try:
 11 |     from typing import Literal
 12 | except ImportError:
 13 |     from typing_extensions import Literal
 14 | 
 15 | import numpy as np
 16 | import pandas as pd
 17 | import yaml
 18 | 
 19 | from bioframe import make_viewframe
 20 | 
# Public names exported by this module.
__all__ = ["assemblies_available", "assembly_info"]

# Location of the ``data`` directory inside the ``bioframe.io`` package; the
# metadata files read by this module are resolved relative to it.
ASSEMBLY_METADATA_ROOT = resource_path("bioframe.io") / "data"
 24 | 
 25 | 
 26 | @dataclass
 27 | class GenomeAssembly:
 28 |     """
 29 |     A dataclass containing information about sequences in a genome assembly.
 30 |     """
 31 | 
 32 |     organism: str
 33 |     provider: str
 34 |     provider_build: str
 35 |     release_year: str
 36 |     seqinfo: pd.DataFrame
 37 |     cytobands: pd.DataFrame = None
 38 |     url: str = None
 39 |     alias_dict: Dict[str, str] = None
 40 | 
 41 |     def __post_init__(self):
 42 |         self.alias_dict = {}
 43 |         alias_lists = self.seqinfo["aliases"].str.split(",")
 44 |         names = self.seqinfo["name"]
 45 |         for aliases, name in zip(alias_lists, names):
 46 |             for alias in aliases:
 47 |                 self.alias_dict[alias] = name
 48 | 
 49 |     @property
 50 |     def chromsizes(self) -> pd.Series:
 51 |         return self.seqinfo.set_index("name")["length"]
 52 | 
 53 |     @property
 54 |     def chromnames(self) -> List[str]:
 55 |         return self.seqinfo["name"].tolist()
 56 | 
 57 |     @property
 58 |     def viewframe(self) -> pd.DataFrame:
 59 |         return make_viewframe(self.chromsizes.to_dict())
 60 | 
 61 |     def __repr__(self) -> str:
 62 |         return (
 63 |             f"GenomeAssembly(organism='{self.organism}', provider='{self.provider}', "
 64 |             f"provider_build='{self.provider_build}', "
 65 |             f"release_year='{self.release_year}', ...)"
 66 |         )
 67 | 
 68 | 
 69 | def assemblies_available() -> pd.DataFrame:
 70 |     """
 71 |     Get a list of available genome assembly metadata in local storage.
 72 | 
 73 |     Returns
 74 |     -------
 75 |     pandas.DataFrame
 76 |         A dataframe with metadata fields for available assemblies, including
 77 |         'provider', 'provider_build', 'default_roles', 'default_units',
 78 |         and names of seqinfo and cytoband files.
 79 |     """
 80 |     with open(ASSEMBLY_METADATA_ROOT / "_assemblies.yml") as f:
 81 |         assemblies = yaml.safe_load(f)
 82 |     return pd.DataFrame.from_records(assemblies)
 83 | 
 84 | 
 85 | def assembly_info(
 86 |     name: str,
 87 |     roles: Optional[Union[List, Tuple, Literal["all"]]] = None,
 88 |     units: Optional[Union[List, Tuple, Literal["all"]]] = None,
 89 | ) -> GenomeAssembly:
 90 |     """
 91 |     Get information about a genome assembly.
 92 | 
 93 |     Parameters
 94 |     ----------
 95 |     name : str
 96 |         Name of the assembly. If the name contains a dot, it is interpreted as
 97 |         a provider name and a build, e.g. "hg38". Otherwise, the provider
 98 |         is inferred if the build name is unique.
 99 |     roles : list or tuple or "all", optional
100 |         Sequence roles to include in the assembly info. If not specified, only
101 |         sequences with the default sequence roles for the assembly are shown.
102 |         e.g. "assembled", "unlocalized", "unplaced"
103 |     units : list or tuple or "all", optional
104 |         Assembly units to include in the assembly info. If not specified, only
105 |         sequences from the default units for the assembly are shown.
106 |         e.g. "primary", "non-nuclear", "decoy"
107 | 
108 |     Returns
109 |     -------
110 |     GenomeAssembly
111 |         A dataclass containing information about the assembly.
112 | 
113 |     Raises
114 |     ------
115 |     ValueError
116 |         If the assembly name is not found or is not unique.
117 | 
118 |     Examples
119 |     --------
120 |     >>> hg38 = assembly_info("hg38")
121 |     >>> hg38.chromsizes
122 |     name
123 |     chr1    248956422
124 |     chr2    242193529
125 |     chr3    198295559
126 |     ...     ...
127 | 
128 |     >>> assembly_info("hg38", roles=("assembled", "non-nuclear"))
129 | 
130 |     >>> assembly_info("ucsc.hg38", units=("unplaced",))
131 | 
132 |     """
133 |     assemblies = assemblies_available()
134 |     provider = None
135 |     if "." in name:
136 |         provider, name = name.split(".", 1)
137 |         provider = provider.lower()
138 | 
139 |     if provider is None:
140 |         q = f"provider_build == '{name}'"
141 |     else:
142 |         q = f"provider == '{provider}' and provider_build == '{name}'"
143 | 
144 |     result = assemblies.query(q)
145 |     if len(result) == 0:
146 |         raise ValueError(f"Assembly not found: {name}")
147 |     elif len(result) > 1:
148 |         raise ValueError(f"Assembly identifer not unique: {result}")
149 | 
150 |     assembly = result.iloc[0].replace([np.nan], [None]).to_dict()
151 |     default_roles = assembly["default_roles"]
152 |     default_units = assembly["default_units"]
153 |     seqinfo_path = assembly["seqinfo"]
154 |     seqinfo = pd.read_table(ASSEMBLY_METADATA_ROOT / seqinfo_path)
155 | 
156 |     mask = np.ones(len(seqinfo), dtype=bool)
157 |     if roles is None:
158 |         mask &= seqinfo["role"].isin(default_roles)
159 |     elif isinstance(roles, (tuple, list)):
160 |         mask &= seqinfo["role"].isin(roles)
161 |     elif isinstance(roles, str) and roles != "all":
162 |         raise ValueError(f"roles must be a tuple or 'all', not {roles}")
163 |     if units is None:
164 |         mask &= seqinfo["unit"].isin(default_units)
165 |     elif isinstance(units, (tuple, list)):
166 |         mask &= seqinfo["unit"].isin(units)
167 |     elif isinstance(units, str) and units != "all":
168 |         raise ValueError(f"units must be a tuple or 'all', not {units}")
169 |     seqinfo = seqinfo.loc[mask]
170 | 
171 |     cytobands = None
172 |     cytobands_path = assembly["cytobands"]
173 |     if cytobands_path is not None:
174 |         cytobands = pd.read_table(ASSEMBLY_METADATA_ROOT / cytobands_path)
175 | 
176 |     return GenomeAssembly(
177 |         organism=assembly["organism"],
178 |         provider=assembly["provider"],
179 |         provider_build=assembly["provider_build"],
180 |         release_year=assembly["release_year"],
181 |         seqinfo=seqinfo,
182 |         cytobands=cytobands,
183 |         url=assembly["url"],
184 |     )
185 | 


--------------------------------------------------------------------------------
/bioframe/io/data/_assemblies.yml:
--------------------------------------------------------------------------------
# Registry of the genome assemblies bundled with the package. Each list entry
# describes one provider/build pair:
#   organism, provider, provider_build, release_year: descriptive metadata
#   seqinfo: file name of the sequence table, relative to this directory
#   cytobands: (optional) file name of the cytoband table
#   default_roles, default_units: sequence roles and assembly units selected
#     when the caller does not ask for specific ones
#   url: reference URL for the assembly
- organism: homo sapiens
  provider: ncbi
  provider_build: GRCh37
  release_year: 2009
  seqinfo: hg19.seqinfo.tsv
  cytobands: hg19.cytoband.tsv
  default_roles: [assembled]
  default_units: [primary, non-nuclear-revised]
  url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.25_GRCh37.p13/GRCh37_seqs_for_alignment_pipelines/
 10 | - organism: homo sapiens
 11 |   provider: ucsc
 12 |   provider_build: hg19
 13 |   release_year: 2009
 14 |   seqinfo: hg19.seqinfo.tsv
 15 |   cytobands: hg19.cytoband.tsv
 16 |   default_roles: [assembled]
 17 |   default_units: [primary, non-nuclear]
 18 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/analysisSet/
 19 | - organism: homo sapiens
 20 |   provider: ncbi
 21 |   provider_build: GRCh38
 22 |   release_year: 2013
 23 |   seqinfo: hg38.seqinfo.tsv
 24 |   cytobands: hg38.cytoband.tsv
 25 |   default_roles: [assembled]
 26 |   default_units: [primary, non-nuclear]
 27 |   url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.26_GRCh38/GRCh38_major_release_seqs_for_alignment_pipelines/
 28 | - organism: homo sapiens
 29 |   provider: ucsc
 30 |   provider_build: hg38
 31 |   release_year: 2013
 32 |   seqinfo: hg38.seqinfo.tsv
 33 |   cytobands: hg38.cytoband.tsv
 34 |   default_roles: [assembled]
 35 |   default_units: [primary, non-nuclear]
 36 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/
 37 | - organism: homo sapiens
 38 |   provider: ncbi
 39 |   provider_build: T2T-CHM13v2.0
 40 |   release_year: 2022
 41 |   seqinfo: hs1.seqinfo.tsv
 42 |   cytobands: hs1.cytoband.tsv
 43 |   default_roles: [assembled]
 44 |   default_units: [primary, non-nuclear]
 45 |   url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/
 46 | - organism: homo sapiens
 47 |   provider: ucsc
 48 |   provider_build: hs1
 49 |   release_year: 2022
 50 |   seqinfo: hs1.seqinfo.tsv
 51 |   cytobands: hs1.cytoband.tsv
 52 |   default_roles: [assembled]
 53 |   default_units: [primary, non-nuclear]
 54 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/
 55 | - organism: mus musculus
 56 |   provider: ncbi
 57 |   provider_build: MGSCv37
 58 |   release_year: 2010
 59 |   seqinfo: mm9.seqinfo.tsv
 60 |   default_roles: [assembled]
 61 |   default_units: [primary, non-nuclear]
 62 |   url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.18_MGSCv37/
 63 | - organism: mus musculus
 64 |   provider: ucsc
 65 |   provider_build: mm9
 66 |   release_year: 2007
 67 |   seqinfo: mm9.seqinfo.tsv
 68 |   default_roles: [assembled]
 69 |   default_units: [primary, non-nuclear]
 70 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/mm9/bigZips/
 71 | - organism: mus musculus
 72 |   provider: ncbi
 73 |   provider_build: GRCm38
 74 |   release_year: 2011
 75 |   seqinfo: mm10.seqinfo.tsv
 76 |   default_roles: [assembled]
 77 |   default_units: [primary, non-nuclear]
 78 |   url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.20_GRCm38/
 79 | - organism: mus musculus
 80 |   provider: ucsc
 81 |   provider_build: mm10
 82 |   release_year: 2011
 83 |   seqinfo: mm10.seqinfo.tsv
 84 |   default_roles: [assembled]
 85 |   default_units: [primary, non-nuclear]
 86 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/
 87 | - organism: mus musculus
 88 |   provider: ncbi
 89 |   provider_build: GRCm39
 90 |   release_year: 2020
 91 |   seqinfo: mm39.seqinfo.tsv
 92 |   default_roles: [assembled]
 93 |   default_units: [primary, non-nuclear]
 94 |   url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/
 95 | - organism: mus musculus
 96 |   provider: ucsc
 97 |   provider_build: mm39
 98 |   release_year: 2020
 99 |   seqinfo: mm39.seqinfo.tsv
100 |   default_roles: [assembled]
101 |   default_units: [primary, non-nuclear]
102 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/
103 | - organism: drosophila melanogaster
104 |   provider: ucsc
105 |   provider_build: dm3
106 |   release_year: 2006
107 |   seqinfo: dm3.seqinfo.tsv
108 |   default_roles: [assembled]
109 |   default_units: [primary, non-nuclear]
110 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/dm3/bigZips/
111 | - organism: drosophila melanogaster
112 |   provider: ucsc
113 |   provider_build: dm6
114 |   release_year: 2014
115 |   seqinfo: dm6.seqinfo.tsv
116 |   default_roles: [assembled]
117 |   default_units: [primary, non-nuclear]
118 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/dm6/bigZips/
119 | - organism: caenorhabditis elegans
120 |   provider: ucsc
121 |   provider_build: ce10
122 |   release_year: 2010
123 |   seqinfo: ce10.seqinfo.tsv
124 |   default_roles: [assembled]
125 |   default_units: [primary, non-nuclear]
126 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/ce10/bigZips/
127 | - organism: caenorhabditis elegans
128 |   provider: ucsc
129 |   provider_build: ce11
130 |   release_year: 2013
131 |   seqinfo: ce11.seqinfo.tsv
132 |   default_roles: [assembled]
133 |   default_units: [primary, non-nuclear]
134 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/ce11/bigZips/
135 | - organism: danio rerio
136 |   provider: ucsc
137 |   provider_build: danRer10
138 |   release_year: 2014
139 |   seqinfo: danRer10.seqinfo.tsv
140 |   default_roles: [assembled]
141 |   default_units: [primary, non-nuclear]
142 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/danRer10/bigZips/
# danRer11 has its own bundled sequence table; it must not reuse danRer10's.
- organism: danio rerio
  provider: ucsc
  provider_build: danRer11
  release_year: 2017
  seqinfo: danRer11.seqinfo.tsv
  default_roles: [assembled]
  default_units: [primary, non-nuclear]
  url: https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/bigZips/
151 | - organism: saccharomyces cerevisiae
152 |   provider: ucsc
153 |   provider_build: sacCer3
154 |   release_year: 2011
155 |   seqinfo: sacCer3.seqinfo.tsv
156 |   default_roles: [assembled]
157 |   default_units: [primary, non-nuclear]
158 |   url: https://hgdownload.soe.ucsc.edu/goldenPath/sacCer3/bigZips/
159 | 


--------------------------------------------------------------------------------
/bioframe/io/data/ce10.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name	length	role	molecule	unit	aliases
2 | chrI	15072423	assembled	chrI	primary	NC_003279.7,I
3 | chrII	15279345	assembled	chrII	primary	NC_003280.9,II
4 | chrIII	13783700	assembled	chrIII	primary	NC_003281.9,III
5 | chrIV	17493793	assembled	chrIV	primary	NC_003282.7,IV
6 | chrV	20924149	assembled	chrV	primary	NC_003283.10,V
7 | chrX	17718866	assembled	chrX	primary	NC_003284.8,X
8 | chrM	13794	assembled	chrM	non-nuclear	NC_001328.1,MT,MtDNA
9 | 


--------------------------------------------------------------------------------
/bioframe/io/data/ce11.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name	length	role	molecule	unit	aliases
2 | chrI	15072434	assembled	chrI	primary	NC_003279.8,I
3 | chrII	15279421	assembled	chrII	primary	NC_003280.10,II
4 | chrIII	13783801	assembled	chrIII	primary	NC_003281.10,III
5 | chrIV	17493829	assembled	chrIV	primary	NC_003282.8,IV
6 | chrV	20924180	assembled	chrV	primary	NC_003283.11,V
7 | chrX	17718942	assembled	chrX	primary	NC_003284.9,X
8 | chrM	13794	assembled	chrM	non-nuclear	NC_001328.1,MT,MtDNA
9 | 


--------------------------------------------------------------------------------
/bioframe/io/data/dm3.seqinfo.tsv:
--------------------------------------------------------------------------------
 1 | name	length	role	molecule	unit	aliases
 2 | chr2L	23011544	assembled	chr2	primary	2L,NT_033779.4,AE014134.5
 3 | chr2R	21146708	assembled	chr2	primary	2R,NT_033778.3,AE013599.4
 4 | chr3L	24543557	assembled	chr3	primary	3L,NT_037436.3,AE014296.4
 5 | chr3R	27905053	assembled	chr3	primary	3R,NT_033777.2,AE014297.2
 6 | chr4	1351857	assembled	chr4	primary	4,NC_004353.3,AE014135.3
 7 | chrX	22422827	assembled	chrX	primary	X,NC_004354.3,AE014298.4
 8 | chrM	19517	assembled	chrM	non-nuclear	MT,NS_000188.1,FA000001.1
 9 | chr2LHet	368872	unlocalized	chr2	primary	2LHet,NW_001848855.1,CM000456.1
10 | chr2RHet	3288761	unlocalized	chr2	primary	2RHet,NW_001848856.1,CM000457.1
11 | chr3LHet	2555491	unlocalized	chr3	primary	3LHet,NW_001848857.1,CM000458.1
12 | chr3RHet	2517507	unlocalized	chr3	primary	3RHet,NW_001848858.1,CM000459.1
13 | chrXHet	204112	unlocalized	chrX	primary	XHet,NW_001848859.1,CM000460.1
14 | chrYHet	347038	unlocalized	chrY	primary	YHet,NW_001848860.1,CM000461.1
15 | chrU	10049037	unplaced		primary	Un,NC_001709.1
16 | chrUextra	29004656	unplaced		primary
17 | 


--------------------------------------------------------------------------------
/bioframe/io/data/hg19.seqinfo.tsv:
--------------------------------------------------------------------------------
 1 | name	length	role	molecule	unit	aliases
 2 | chr1	249250621	assembled	chr1	primary	1,CM000663.1,NC_000001.10
 3 | chr2	243199373	assembled	chr2	primary	2,CM000664.1,NC_000002.11
 4 | chr3	198022430	assembled	chr3	primary	3,CM000665.1,NC_000003.11
 5 | chr4	191154276	assembled	chr4	primary	4,CM000666.1,NC_000004.11
 6 | chr5	180915260	assembled	chr5	primary	5,CM000667.1,NC_000005.9
 7 | chr6	171115067	assembled	chr6	primary	6,CM000668.1,NC_000006.11
 8 | chr7	159138663	assembled	chr7	primary	7,CM000669.1,NC_000007.13
 9 | chr8	146364022	assembled	chr8	primary	8,CM000670.1,NC_000008.10
10 | chr9	141213431	assembled	chr9	primary	9,CM000671.1,NC_000009.11
11 | chr10	135534747	assembled	chr10	primary	10,CM000672.1,NC_000010.10
12 | chr11	135006516	assembled	chr11	primary	11,CM000673.1,NC_000011.9
13 | chr12	133851895	assembled	chr12	primary	12,CM000674.1,NC_000012.11
14 | chr13	115169878	assembled	chr13	primary	13,CM000675.1,NC_000013.10
15 | chr14	107349540	assembled	chr14	primary	14,CM000676.1,NC_000014.8
16 | chr15	102531392	assembled	chr15	primary	15,CM000677.1,NC_000015.9
17 | chr16	90354753	assembled	chr16	primary	16,CM000678.1,NC_000016.9
18 | chr17	81195210	assembled	chr17	primary	17,CM000679.1,NC_000017.10
19 | chr18	78077248	assembled	chr18	primary	18,CM000680.1,NC_000018.9
20 | chr19	59128983	assembled	chr19	primary	19,CM000681.1,NC_000019.9
21 | chr20	63025520	assembled	chr20	primary	20,CM000682.1,NC_000020.10
22 | chr21	48129895	assembled	chr21	primary	21,CM000683.1,NC_000021.8
23 | chr22	51304566	assembled	chr22	primary	22,CM000684.1,NC_000022.10
24 | chrX	155270560	assembled	chrX	primary	X,CM000685.1,NC_000023.10
25 | chrY	59373566	assembled	chrY	primary	Y,CM000686.1,NC_000024.9
26 | chrM	16571	assembled	chrM	non-nuclear	NC_001807.4
27 | chrMT	16569	assembled	chrMT	non-nuclear-revised	MT,J01415.2,NC_012920.1
28 | chr1_gl000191_random	106433	unlocalized	chr1	primary	GL000191.1,HSCHR1_RANDOM_CTG5,NT_113878.1
29 | chr1_gl000192_random	547496	unlocalized	chr1	primary	GL000192.1,HSCHR1_RANDOM_CTG12,NT_167207.1
30 | chr4_gl000193_random	189789	unlocalized	chr4	primary	GL000193.1,HSCHR4_RANDOM_CTG2,NT_113885.1
31 | chr4_gl000194_random	191469	unlocalized	chr4	primary	GL000194.1,HSCHR4_RANDOM_CTG3,NT_113888.1
32 | chr7_gl000195_random	182896	unlocalized	chr7	primary	GL000195.1,HSCHR7_RANDOM_CTG1,NT_113901.1
33 | chr8_gl000196_random	38914	unlocalized	chr8	primary	GL000196.1,HSCHR8_RANDOM_CTG1,NT_113909.1
34 | chr8_gl000197_random	37175	unlocalized	chr8	primary	GL000197.1,HSCHR8_RANDOM_CTG4,NT_113907.1
35 | chr9_gl000198_random	90085	unlocalized	chr9	primary	GL000198.1,HSCHR9_RANDOM_CTG1,NT_113914.1
36 | chr9_gl000199_random	169874	unlocalized	chr9	primary	GL000199.1,HSCHR9_RANDOM_CTG2,NT_113916.2
37 | chr9_gl000200_random	187035	unlocalized	chr9	primary	GL000200.1,HSCHR9_RANDOM_CTG4,NT_113915.1
38 | chr9_gl000201_random	36148	unlocalized	chr9	primary	GL000201.1,HSCHR9_RANDOM_CTG5,NT_113911.1
39 | chr11_gl000202_random	40103	unlocalized	chr11	primary	GL000202.1,HSCHR11_RANDOM_CTG2,NT_113921.2
40 | chr17_gl000203_random	37498	unlocalized	chr17	primary	GL000203.1,HSCHR17_RANDOM_CTG1,NT_113941.1
41 | chr17_gl000204_random	81310	unlocalized	chr17	primary	GL000204.1,HSCHR17_RANDOM_CTG2,NT_113943.1
42 | chr17_gl000205_random	174588	unlocalized	chr17	primary	GL000205.1,HSCHR17_RANDOM_CTG3,NT_113930.1
43 | chr17_gl000206_random	41001	unlocalized	chr17	primary	GL000206.1,HSCHR17_RANDOM_CTG4,NT_113945.1
44 | chr18_gl000207_random	4262	unlocalized	chr18	primary	GL000207.1,HSCHR18_RANDOM_CTG1,NT_113947.1
45 | chr19_gl000208_random	92689	unlocalized	chr19	primary	GL000208.1,HSCHR19_RANDOM_CTG1,NT_113948.1
46 | chr19_gl000209_random	159169	unlocalized	chr19	primary	GL000209.1,HSCHR19_RANDOM_CTG2,NT_113949.1
47 | chr21_gl000210_random	27682	unlocalized	chr21	primary	GL000210.1,HSCHR21_RANDOM_CTG9,NT_113950.2
48 | chrUn_gl000211	166566	unplaced		primary	GL000211.1,HSCHRUN_RANDOM_CTG1,NT_113961.1
49 | chrUn_gl000212	186858	unplaced		primary	GL000212.1,HSCHRUN_RANDOM_CTG2,NT_113923.1
50 | chrUn_gl000213	164239	unplaced		primary	GL000213.1,HSCHRUN_RANDOM_CTG3,NT_167208.1
51 | chrUn_gl000214	137718	unplaced		primary	GL000214.1,HSCHRUN_RANDOM_CTG4,NT_167209.1
52 | chrUn_gl000215	172545	unplaced		primary	GL000215.1,HSCHRUN_RANDOM_CTG5,NT_167210.1
53 | chrUn_gl000216	172294	unplaced		primary	GL000216.1,HSCHRUN_RANDOM_CTG6,NT_167211.1
54 | chrUn_gl000217	172149	unplaced		primary	GL000217.1,HSCHRUN_RANDOM_CTG7,NT_167212.1
55 | chrUn_gl000218	161147	unplaced		primary	GL000218.1,HSCHRUN_RANDOM_CTG9,NT_113889.1
56 | chrUn_gl000219	179198	unplaced		primary	GL000219.1,HSCHRUN_RANDOM_CTG10,NT_167213.1
57 | chrUn_gl000220	161802	unplaced		primary	GL000220.1,HSCHRUN_RANDOM_CTG11,NT_167214.1
58 | chrUn_gl000221	155397	unplaced		primary	GL000221.1,HSCHRUN_RANDOM_CTG13,NT_167215.1
59 | chrUn_gl000222	186861	unplaced		primary	GL000222.1,HSCHRUN_RANDOM_CTG14,NT_167216.1
60 | chrUn_gl000223	180455	unplaced		primary	GL000223.1,HSCHRUN_RANDOM_CTG15,NT_167217.1
61 | chrUn_gl000224	179693	unplaced		primary	GL000224.1,HSCHRUN_RANDOM_CTG16,NT_167218.1
62 | chrUn_gl000225	211173	unplaced		primary	GL000225.1,HSCHRUN_RANDOM_CTG17,NT_167219.1
63 | chrUn_gl000226	15008	unplaced		primary	GL000226.1,HSCHRUN_RANDOM_CTG19,NT_167220.1
64 | chrUn_gl000227	128374	unplaced		primary	GL000227.1,HSCHRUN_RANDOM_CTG20,NT_167221.1
65 | chrUn_gl000228	129120	unplaced		primary	GL000228.1,HSCHRUN_RANDOM_CTG21,NT_167222.1
66 | chrUn_gl000229	19913	unplaced		primary	GL000229.1,HSCHRUN_RANDOM_CTG22,NT_167223.1
67 | chrUn_gl000230	43691	unplaced		primary	GL000230.1,HSCHRUN_RANDOM_CTG23,NT_167224.1
68 | chrUn_gl000231	27386	unplaced		primary	GL000231.1,HSCHRUN_RANDOM_CTG24,NT_167225.1
69 | chrUn_gl000232	40652	unplaced		primary	GL000232.1,HSCHRUN_RANDOM_CTG25,NT_167226.1
70 | chrUn_gl000233	45941	unplaced		primary	GL000233.1,HSCHRUN_RANDOM_CTG26,NT_167227.1
71 | chrUn_gl000234	40531	unplaced		primary	GL000234.1,HSCHRUN_RANDOM_CTG27,NT_167228.1
72 | chrUn_gl000235	34474	unplaced		primary	GL000235.1,HSCHRUN_RANDOM_CTG28,NT_167229.1
73 | chrUn_gl000236	41934	unplaced		primary	GL000236.1,HSCHRUN_RANDOM_CTG29,NT_167230.1
74 | chrUn_gl000237	45867	unplaced		primary	GL000237.1,HSCHRUN_RANDOM_CTG30,NT_167231.1
75 | chrUn_gl000238	39939	unplaced		primary	GL000238.1,HSCHRUN_RANDOM_CTG31,NT_167232.1
76 | chrUn_gl000239	33824	unplaced		primary	GL000239.1,HSCHRUN_RANDOM_CTG32,NT_167233.1
77 | chrUn_gl000240	41933	unplaced		primary	GL000240.1,HSCHRUN_RANDOM_CTG33,NT_167234.1
78 | chrUn_gl000241	42152	unplaced		primary	GL000241.1,HSCHRUN_RANDOM_CTG34,NT_167235.1
79 | chrUn_gl000242	43523	unplaced		primary	GL000242.1,HSCHRUN_RANDOM_CTG35,NT_167236.1
80 | chrUn_gl000243	43341	unplaced		primary	GL000243.1,HSCHRUN_RANDOM_CTG36,NT_167237.1
81 | chrUn_gl000244	39929	unplaced		primary	GL000244.1,HSCHRUN_RANDOM_CTG37,NT_167238.1
82 | chrUn_gl000245	36651	unplaced		primary	GL000245.1,HSCHRUN_RANDOM_CTG38,NT_167239.1
83 | chrUn_gl000246	38154	unplaced		primary	GL000246.1,HSCHRUN_RANDOM_CTG39,NT_167240.1
84 | chrUn_gl000247	36422	unplaced		primary	GL000247.1,HSCHRUN_RANDOM_CTG40,NT_167241.1
85 | chrUn_gl000248	39786	unplaced		primary	GL000248.1,HSCHRUN_RANDOM_CTG41,NT_167242.1
86 | chrUn_gl000249	38502	unplaced		primary	GL000249.1,HSCHRUN_RANDOM_CTG42,NT_167243.1
87 | 


--------------------------------------------------------------------------------
/bioframe/io/data/hs1.seqinfo.tsv:
--------------------------------------------------------------------------------
 1 | name	length	role	molecule	unit	aliases
 2 | chr1	248387328	assembled	chr1	primary	1,CP068277.2,NC_060925.1
 3 | chr2	242696752	assembled	chr2	primary	2,CP068276.2,NC_060926.1
 4 | chr3	201105948	assembled	chr3	primary	3,CP068275.2,NC_060927.1
 5 | chr4	193574945	assembled	chr4	primary	4,CP068274.2,NC_060928.1
 6 | chr5	182045439	assembled	chr5	primary	5,CP068273.2,NC_060929.1
 7 | chr6	172126628	assembled	chr6	primary	6,CP068272.2,NC_060930.1
 8 | chr7	160567428	assembled	chr7	primary	7,CP068271.2,NC_060931.1
 9 | chr8	146259331	assembled	chr8	primary	8,CP068270.2,NC_060932.1
10 | chr9	150617247	assembled	chr9	primary	9,CP068269.2,NC_060933.1
11 | chr10	134758134	assembled	chr10	primary	10,CP068268.2,NC_060934.1
12 | chr11	135127769	assembled	chr11	primary	11,CP068267.2,NC_060935.1
13 | chr12	133324548	assembled	chr12	primary	12,CP068266.2,NC_060936.1
14 | chr13	113566686	assembled	chr13	primary	13,CP068265.2,NC_060937.1
15 | chr14	101161492	assembled	chr14	primary	14,CP068264.2,NC_060938.1
16 | chr15	99753195	assembled	chr15	primary	15,CP068263.2,NC_060939.1
17 | chr16	96330374	assembled	chr16	primary	16,CP068262.2,NC_060940.1
18 | chr17	84276897	assembled	chr17	primary	17,CP068261.2,NC_060941.1
19 | chr18	80542538	assembled	chr18	primary	18,CP068260.2,NC_060942.1
20 | chr19	61707364	assembled	chr19	primary	19,CP068259.2,NC_060943.1
21 | chr20	66210255	assembled	chr20	primary	20,CP068258.2,NC_060944.1
22 | chr21	45090682	assembled	chr21	primary	21,CP068257.2,NC_060945.1
23 | chr22	51324926	assembled	chr22	primary	22,CP068256.2,NC_060946.1
24 | chrX	154259566	assembled	chrX	primary	X,CP068255.2,NC_060947.1
25 | chrY	62460029	assembled	chrY	primary	Y,CP086569.2,NC_060948.1
26 | chrM	16569	assembled	chrM	non-nuclear	MT,CP068254.1
27 | 


--------------------------------------------------------------------------------
/bioframe/io/data/mm10.seqinfo.tsv:
--------------------------------------------------------------------------------
 1 | name	length	role	molecule	unit	aliases
 2 | chr1	195471971	assembled	chr1	primary	1,CM000994.2,NC_000067.6
 3 | chr2	182113224	assembled	chr2	primary	2,CM000995.2,NC_000068.7
 4 | chr3	160039680	assembled	chr3	primary	3,CM000996.2,NC_000069.6
 5 | chr4	156508116	assembled	chr4	primary	4,CM000997.2,NC_000070.6
 6 | chr5	151834684	assembled	chr5	primary	5,CM000998.2,NC_000071.6
 7 | chr6	149736546	assembled	chr6	primary	6,CM000999.2,NC_000072.6
 8 | chr7	145441459	assembled	chr7	primary	7,CM001000.2,NC_000073.6
 9 | chr8	129401213	assembled	chr8	primary	8,CM001001.2,NC_000074.6
10 | chr9	124595110	assembled	chr9	primary	9,CM001002.2,NC_000075.6
11 | chr10	130694993	assembled	chr10	primary	10,CM001003.2,NC_000076.6
12 | chr11	122082543	assembled	chr11	primary	11,CM001004.2,NC_000077.6
13 | chr12	120129022	assembled	chr12	primary	12,CM001005.2,NC_000078.6
14 | chr13	120421639	assembled	chr13	primary	13,CM001006.2,NC_000079.6
15 | chr14	124902244	assembled	chr14	primary	14,CM001007.2,NC_000080.6
16 | chr15	104043685	assembled	chr15	primary	15,CM001008.2,NC_000081.6
17 | chr16	98207768	assembled	chr16	primary	16,CM001009.2,NC_000082.6
18 | chr17	94987271	assembled	chr17	primary	17,CM001010.2,NC_000083.6
19 | chr18	90702639	assembled	chr18	primary	18,CM001011.2,NC_000084.6
20 | chr19	61431566	assembled	chr19	primary	19,CM001012.2,NC_000085.6
21 | chrX	171031299	assembled	chrX	primary	X,CM001013.2,NC_000086.7
22 | chrY	91744698	assembled	chrY	primary	Y,CM001014.2,NC_000087.7
23 | chrM	16299	assembled	chrM	non-nuclear	MT,AY172335.1,NC_005089.1
24 | chr1_GL456210_random	169725	unlocalized	chr1	primary	GL456210.1,MMCHR1_RANDOM_CTG1,NT_166280.1
25 | chr1_GL456211_random	241735	unlocalized	chr1	primary	GL456211.1,MMCHR1_RANDOM_CTG2,NT_166281.1
26 | chr1_GL456212_random	153618	unlocalized	chr1	primary	GL456212.1,MMCHR1_RANDOM_CTG3,NT_166282.1
27 | chr1_GL456213_random	39340	unlocalized	chr1	primary	GL456213.1,MMCHR1_RANDOM_CTG4,NT_166283.1
28 | chr1_GL456221_random	206961	unlocalized	chr1	primary	GL456221.1,MMCHR1_RANDOM_CTG5,NT_162750.1
29 | chr4_GL456216_random	66673	unlocalized	chr4	primary	GL456216.1,MMCHR4UN_CTG1,NT_166291.1
30 | chr4_GL456350_random	227966	unlocalized	chr4	primary	GL456350.1,MMCHR4UN_CTG3,NT_166434.1
31 | chr4_JH584292_random	14945	unlocalized	chr4	primary	JH584292.1,MMCHR4UN_CTG2,NT_187052.1
32 | chr4_JH584293_random	207968	unlocalized	chr4	primary	JH584293.1,MMCHR4UN_CTG4,NT_187053.1
33 | chr4_JH584294_random	191905	unlocalized	chr4	primary	JH584294.1,MMCHR4UN_CTG5,NT_187054.1
34 | chr4_JH584295_random	1976	unlocalized	chr4	primary	JH584295.1,MMCHR4UN_CTG6,NT_187055.1
35 | chr5_GL456354_random	195993	unlocalized	chr5	primary	GL456354.1,MMCHR5_RANDOM_CTG4,NT_166438.1
36 | chr5_JH584296_random	199368	unlocalized	chr5	primary	JH584296.1,MMCHR5_RANDOM_CTG1,NT_187056.1
37 | chr5_JH584297_random	205776	unlocalized	chr5	primary	JH584297.1,MMCHR5_RANDOM_CTG2,NT_187057.1
38 | chr5_JH584298_random	184189	unlocalized	chr5	primary	JH584298.1,MMCHR5_RANDOM_CTG3,NT_187058.1
39 | chr5_JH584299_random	953012	unlocalized	chr5	primary	JH584299.1,MMCHR5_RANDOM_CTG5,NT_187059.1
40 | chr7_GL456219_random	175968	unlocalized	chr7	primary	GL456219.1,MMCHR7_RANDOM_CTG1,NT_166307.1
41 | chrX_GL456233_random	336933	unlocalized	chrX	primary	GL456233.1,MMCHRX_RANDOM_CTG2,NT_165789.2
42 | chrY_JH584300_random	182347	unlocalized	chrY	primary	JH584300.1,MMCHRY_CTGU1,NT_187060.1
43 | chrY_JH584301_random	259875	unlocalized	chrY	primary	JH584301.1,MMCHRY_CTGU2,NT_187061.1
44 | chrY_JH584302_random	155838	unlocalized	chrY	primary	JH584302.1,MMCHRY_CTGU3,NT_187062.1
45 | chrY_JH584303_random	158099	unlocalized	chrY	primary	JH584303.1,MMCHRY_CTGU4,NT_187063.1
46 | chrUn_GL456239	40056	unplaced		primary	GL456239.1,MSCHRUN_CTG1,NT_166338.1
47 | chrUn_GL456359	22974	unplaced		primary	GL456359.1,MSCHRUN_CTG13,NT_166443.1
48 | chrUn_GL456360	31704	unplaced		primary	GL456360.1,MSCHRUN_CTG14,NT_166444.1
49 | chrUn_GL456366	47073	unplaced		primary	GL456366.1,MSCHRUN_CTG21,NT_166450.1
50 | chrUn_GL456367	42057	unplaced		primary	GL456367.1,MSCHRUN_CTG2,NT_166451.1
51 | chrUn_GL456368	20208	unplaced		primary	GL456368.1,MSCHRUN_CTG22,NT_166452.1
52 | chrUn_GL456370	26764	unplaced		primary	GL456370.1,MSCHRUN_CTG19,NT_166454.1
53 | chrUn_GL456372	28664	unplaced		primary	GL456372.1,MSCHRUN_CTG16,NT_166456.1
54 | chrUn_GL456378	31602	unplaced		primary	GL456378.1,MSCHRUN_CTG3,NT_166462.1
55 | chrUn_GL456379	72385	unplaced		primary	GL456379.1,MSCHRUN_CTG20,NT_166463.1
56 | chrUn_GL456381	25871	unplaced		primary	GL456381.1,MSCHRUN_CTG4,NT_166465.1
57 | chrUn_GL456382	23158	unplaced		primary	GL456382.1,MSCHRUN_CTG5,NT_166466.1
58 | chrUn_GL456383	38659	unplaced		primary	GL456383.1,MSCHRUN_CTG6,NT_166467.1
59 | chrUn_GL456385	35240	unplaced		primary	GL456385.1,MSCHRUN_CTG7,NT_166469.1
60 | chrUn_GL456387	24685	unplaced		primary	GL456387.1,MSCHRUN_CTG17,NT_166471.1
61 | chrUn_GL456389	28772	unplaced		primary	GL456389.1,MSCHRUN_CTG18,NT_166473.1
62 | chrUn_GL456390	24668	unplaced		primary	GL456390.1,MSCHRUN_CTG9,NT_166474.1
63 | chrUn_GL456392	23629	unplaced		primary	GL456392.1,MSCHRUN_CTG10,NT_166476.1
64 | chrUn_GL456393	55711	unplaced		primary	GL456393.1,MSCHRUN_CTG11,NT_166477.1
65 | chrUn_GL456394	24323	unplaced		primary	GL456394.1,MSCHRUN_CTG12,NT_166478.1
66 | chrUn_GL456396	21240	unplaced		primary	GL456396.1,MSCHRUN_CTG15,NT_166480.1
67 | chrUn_JH584304	114452	unplaced		primary	JH584304.1,MSCHRUN_CTG23,NT_187064.1
68 | 


--------------------------------------------------------------------------------
/bioframe/io/data/mm39.seqinfo.tsv:
--------------------------------------------------------------------------------
 1 | name	length	role	molecule	unit	aliases
 2 | chr1	195154279	assembled	chr1	primary	1,CM000994.3,NC_000067.7
 3 | chr2	181755017	assembled	chr2	primary	2,CM000995.3,NC_000068.8
 4 | chr3	159745316	assembled	chr3	primary	3,CM000996.3,NC_000069.7
 5 | chr4	156860686	assembled	chr4	primary	4,CM000997.3,NC_000070.7
 6 | chr5	151758149	assembled	chr5	primary	5,CM000998.3,NC_000071.7
 7 | chr6	149588044	assembled	chr6	primary	6,CM000999.3,NC_000072.7
 8 | chr7	144995196	assembled	chr7	primary	7,CM001000.3,NC_000073.7
 9 | chr8	130127694	assembled	chr8	primary	8,CM001001.3,NC_000074.7
10 | chr9	124359700	assembled	chr9	primary	9,CM001002.3,NC_000075.7
11 | chr10	130530862	assembled	chr10	primary	10,CM001003.3,NC_000076.7
12 | chr11	121973369	assembled	chr11	primary	11,CM001004.3,NC_000077.7
13 | chr12	120092757	assembled	chr12	primary	12,CM001005.3,NC_000078.7
14 | chr13	120883175	assembled	chr13	primary	13,CM001006.3,NC_000079.7
15 | chr14	125139656	assembled	chr14	primary	14,CM001007.3,NC_000080.7
16 | chr15	104073951	assembled	chr15	primary	15,CM001008.3,NC_000081.7
17 | chr16	98008968	assembled	chr16	primary	16,CM001009.3,NC_000082.7
18 | chr17	95294699	assembled	chr17	primary	17,CM001010.3,NC_000083.7
19 | chr18	90720763	assembled	chr18	primary	18,CM001011.3,NC_000084.7
20 | chr19	61420004	assembled	chr19	primary	19,CM001012.3,NC_000085.7
21 | chrX	169476592	assembled	chrX	primary	X,CM001013.3,NC_000086.8
22 | chrY	91455967	assembled	chrY	primary	Y,CM001014.3,NC_000087.8
23 | chrM	16299	assembled	chrM	non-nuclear	MT,AY172335.1,NC_005089.1
24 | chr1_GL456210v1_random	169725	unlocalized	chr1	primary	GL456210.1,MMCHR1_RANDOM_CTG1,NT_166280.1
25 | chr1_GL456211v1_random	241735	unlocalized	chr1	primary	GL456211.1,MMCHR1_RANDOM_CTG2,NT_166281.1
26 | chr1_GL456212v1_random	153618	unlocalized	chr1	primary	GL456212.1,MMCHR1_RANDOM_CTG3,NT_166282.1
27 | chr1_GL456221v1_random	206961	unlocalized	chr1	primary	GL456221.1,MMCHR1_RANDOM_CTG5,NT_162750.1
28 | chr1_GL456239v1_random	40056	unlocalized	chr1	primary	GL456239.1,MMCHR1_RANDOM_CTG7,NT_166338.1
29 | chr1_MU069434v1_random	8412	unlocalized	chr1	primary	MMCHR1_RANDOM_CTG6,MU069434.1,NW_023337853.1
30 | chr4_JH584295v1_random	1976	unlocalized	chr4	primary	JH584295.1,MMCHR4UN_CTG6,NT_187055.1
31 | chr5_GL456354v1_random	195993	unlocalized	chr5	primary	GL456354.1,MMCHR5_RANDOM_CTG4,NT_166438.1
32 | chr5_JH584296v1_random	199368	unlocalized	chr5	primary	JH584296.1,MMCHR5_RANDOM_CTG1,NT_187056.1
33 | chr5_JH584297v1_random	205776	unlocalized	chr5	primary	JH584297.1,MMCHR5_RANDOM_CTG2,NT_187057.1
34 | chr5_JH584298v1_random	184189	unlocalized	chr5	primary	JH584298.1,MMCHR5_RANDOM_CTG3,NT_187058.1
35 | chr5_JH584299v1_random	953012	unlocalized	chr5	primary	JH584299.1,MMCHR5_RANDOM_CTG5,NT_187059.1
36 | chr7_GL456219v1_random	175968	unlocalized	chr7	primary	GL456219.1,MMCHR7_RANDOM_CTG1,NT_166307.1
37 | chrX_GL456233v2_random	559103	unlocalized	chrX	primary	GL456233.2,MMCHRX_RANDOM_CTG2,NT_165789.3
38 | chrY_JH584300v1_random	182347	unlocalized	chrY	primary	JH584300.1,MMCHRY_CTGU1,NT_187060.1
39 | chrY_JH584301v1_random	259875	unlocalized	chrY	primary	JH584301.1,MMCHRY_CTGU2,NT_187061.1
40 | chrY_JH584302v1_random	155838	unlocalized	chrY	primary	JH584302.1,MMCHRY_CTGU3,NT_187062.1
41 | chrY_JH584303v1_random	158099	unlocalized	chrY	primary	JH584303.1,MMCHRY_CTGU4,NT_187063.1
42 | chrUn_GL456359v1	22974	unplaced		primary	GL456359.1,MSCHRUN_CTG13,NT_166443.1
43 | chrUn_GL456360v1	31704	unplaced		primary	GL456360.1,MSCHRUN_CTG14,NT_166444.1
44 | chrUn_GL456366v1	47073	unplaced		primary	GL456366.1,MSCHRUN_CTG21,NT_166450.1
45 | chrUn_GL456367v1	42057	unplaced		primary	GL456367.1,MSCHRUN_CTG2,NT_166451.1
46 | chrUn_GL456368v1	20208	unplaced		primary	GL456368.1,MSCHRUN_CTG22,NT_166452.1
47 | chrUn_GL456370v1	26764	unplaced		primary	GL456370.1,MSCHRUN_CTG19,NT_166454.1
48 | chrUn_GL456372v1	28664	unplaced		primary	GL456372.1,MSCHRUN_CTG16,NT_166456.1
49 | chrUn_GL456378v1	31602	unplaced		primary	GL456378.1,MSCHRUN_CTG3,NT_166462.1
50 | chrUn_GL456379v1	72385	unplaced		primary	GL456379.1,MSCHRUN_CTG20,NT_166463.1
51 | chrUn_GL456381v1	25871	unplaced		primary	GL456381.1,MSCHRUN_CTG4,NT_166465.1
52 | chrUn_GL456382v1	23158	unplaced		primary	GL456382.1,MSCHRUN_CTG5,NT_166466.1
53 | chrUn_GL456383v1	38659	unplaced		primary	GL456383.1,MSCHRUN_CTG6,NT_166467.1
54 | chrUn_GL456385v1	35240	unplaced		primary	GL456385.1,MSCHRUN_CTG7,NT_166469.1
55 | chrUn_GL456387v1	24685	unplaced		primary	GL456387.1,MSCHRUN_CTG17,NT_166471.1
56 | chrUn_GL456389v1	28772	unplaced		primary	GL456389.1,MSCHRUN_CTG18,NT_166473.1
57 | chrUn_GL456390v1	24668	unplaced		primary	GL456390.1,MSCHRUN_CTG9,NT_166474.1
58 | chrUn_GL456392v1	23629	unplaced		primary	GL456392.1,MSCHRUN_CTG10,NT_166476.1
59 | chrUn_GL456394v1	24323	unplaced		primary	GL456394.1,MSCHRUN_CTG12,NT_166478.1
60 | chrUn_GL456396v1	21240	unplaced		primary	GL456396.1,MSCHRUN_CTG15,NT_166480.1
61 | chrUn_JH584304v1	114452	unplaced		primary	JH584304.1,MSCHRUN_CTG23,NT_187064.1
62 | chrUn_MU069435v1	31129	unplaced		primary	MU069435.1,MSCHRUN_CTG24,NW_023337853.1
63 | 


--------------------------------------------------------------------------------
/bioframe/io/data/mm9.seqinfo.tsv:
--------------------------------------------------------------------------------
 1 | name	length	role	molecule	unit	aliases
 2 | chr1	197195432	assembled	chr1	primary	1,CM000994.1,NC_000067.5
 3 | chr2	181748087	assembled	chr2	primary	2,CM000995.1,NC_000068.6
 4 | chr3	159599783	assembled	chr3	primary	3,CM000996.1,NC_000069.5
 5 | chr4	155630120	assembled	chr4	primary	4,CM000997.1,NC_000070.5
 6 | chr5	152537259	assembled	chr5	primary	5,CM000998.1,NC_000071.5
 7 | chr6	149517037	assembled	chr6	primary	6,CM000999.1,NC_000072.5
 8 | chr7	152524553	assembled	chr7	primary	7,CM001000.1,NC_000073.5
 9 | chr8	131738871	assembled	chr8	primary	8,CM001001.1,NC_000074.5
10 | chr9	124076172	assembled	chr9	primary	9,CM001002.1,NC_000075.5
11 | chr10	129993255	assembled	chr10	primary	10,CM001003.1,NC_000076.5
12 | chr11	121843856	assembled	chr11	primary	11,CM001004.1,NC_000077.5
13 | chr12	121257530	assembled	chr12	primary	12,CM001005.1,NC_000078.5
14 | chr13	120284312	assembled	chr13	primary	13,CM001006.1,NC_000079.5
15 | chr14	125194864	assembled	chr14	primary	14,CM001007.1,NC_000080.5
16 | chr15	103494974	assembled	chr15	primary	15,CM001008.1,NC_000081.5
17 | chr16	98319150	assembled	chr16	primary	16,CM001009.1,NC_000082.5
18 | chr17	95272651	assembled	chr17	primary	17,CM001010.1,NC_000083.5
19 | chr18	90772031	assembled	chr18	primary	18,CM001011.1,NC_000084.5
20 | chr19	61342430	assembled	chr19	primary	19,CM001012.1,NC_000085.5
21 | chrX	166650296	assembled	chrX	primary	X,CM001013.1,NC_000086.6
22 | chrY	15902555	assembled	chrY	primary	Y,CM001014.1,NC_000087.6
23 | chrM	16299	assembled	chrM	non-nuclear	MT,AY172335.1,NC_005089.1
24 | chr1_random	1231697	unlocalized	chr1	primary
25 | chr3_random	41899	unlocalized	chr3	primary
26 | chr4_random	160594	unlocalized	chr4	primary
27 | chr5_random	357350	unlocalized	chr5	primary
28 | chr7_random	362490	unlocalized	chr7	primary
29 | chr8_random	849593	unlocalized	chr8	primary
30 | chr9_random	449403	unlocalized	chr9	primary
31 | chr13_random	400311	unlocalized	chr13	primary
32 | chr16_random	3994	unlocalized	chr16	primary
33 | chr17_random	628739	unlocalized	chr17	primary
34 | chrX_random	1785075	unlocalized	chrX	primary
35 | chrY_random	58682461	unlocalized	chrY	primary
36 | chrUn_random	5900358	unplaced		primary
37 | 


--------------------------------------------------------------------------------
/bioframe/io/data/sacCer3.seqinfo.tsv:
--------------------------------------------------------------------------------
 1 | name	length	role	molecule	unit	aliases
 2 | chrI	230218	assembled	chrI	primary	I,BK006935.2,NC_001133.9
 3 | chrII	813184	assembled	chrII	primary	II,BK006936.2,NC_001134.8
 4 | chrIII	316620	assembled	chrIII	primary	III,BK006937.2,NC_001135.5
 5 | chrIV	1531933	assembled	chrIV	primary	IV,BK006938.2,NC_001136.10
 6 | chrV	576874	assembled	chrV	primary	V,BK006939.2,NC_001137.3
 7 | chrVI	270161	assembled	chrVI	primary	VI,BK006940.2,NC_001138.5
 8 | chrVII	1090940	assembled	chrVII	primary	VII,BK006941.2,NC_001139.9
 9 | chrVIII	562643	assembled	chrVIII	primary	VIII,BK006934.2,NC_001140.6
10 | chrIX	439888	assembled	chrIX	primary	IX,BK006942.2,NC_001141.2
11 | chrX	745751	assembled	chrX	primary	X,BK006943.2,NC_001142.9
12 | chrXI	666816	assembled	chrXI	primary	XI,BK006944.2,NC_001143.9
13 | chrXII	1078177	assembled	chrXII	primary	XII,BK006945.2,NC_001144.5
14 | chrXIII	924431	assembled	chrXIII	primary	XIII,BK006946.2,NC_001145.3
15 | chrXIV	784333	assembled	chrXIV	primary	XIV,BK006947.3,NC_001146.8
16 | chrXV	1091291	assembled	chrXV	primary	XV,BK006948.2,NC_001147.6
17 | chrXVI	948066	assembled	chrXVI	primary	XVI,BK006949.2,NC_001148.4
18 | chrM	85779	assembled	chrM	non-nuclear	MT,Mito,AJ011856.1,NC_001224.1
19 | 


--------------------------------------------------------------------------------
/bioframe/io/data/wuhCor1.seqinfo.tsv:
--------------------------------------------------------------------------------
1 | name	length	role	molecule	unit	aliases
2 | NC_045512v2	29903	assembled	NC_045512	primary	NC_045512.2,MN908947.3
3 | 


--------------------------------------------------------------------------------
/bioframe/io/resources.py:
--------------------------------------------------------------------------------
  1 | import urllib
  2 | from functools import partial
  3 | from typing import Union
  4 | from urllib.parse import urljoin
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | from .assembly import assembly_info
 10 | from .fileops import read_chromsizes, read_table
 11 | from .schemas import SCHEMAS
 12 | 
# Names exported by ``from <this module> import *``. The private
# ``_origins_from_*`` helpers defined below are intentionally left out.
__all__ = [
    "fetch_chromsizes",
    "fetch_centromeres",
    "UCSCClient",
]
 18 | 
 19 | 
 20 | def fetch_chromsizes(
 21 |     db: str,
 22 |     *,
 23 |     provider: str = "local",
 24 |     as_bed: bool = False,
 25 |     filter_chroms: bool = True,
 26 |     chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
 27 |     natsort: bool = True,
 28 |     **kwargs,
 29 | ) -> Union[pd.Series, pd.DataFrame]:
 30 |     """
 31 |     Fetch chromsizes from local storage or the UCSC database.
 32 | 
 33 |     Parameters
 34 |     ----------
 35 |     db : str
 36 |         Assembly name.
 37 |     provider : str, optional [default: "local"]
 38 |         The provider of chromsizes. Either "local" for local storage or "ucsc".
 39 |     as_bed : bool, optional
 40 |         If True, return chromsizes as an interval DataFrame (chrom, start, end)
 41 |         instead of a Series.
 42 | 
 43 |     The remaining options only apply to provider="ucsc".
 44 | 
 45 |     filter_chroms : bool, optional
 46 |         Filter for chromosome names given in ``chrom_patterns``.
 47 |     chrom_patterns : sequence, optional
 48 |         Sequence of regular expressions to capture desired sequence names.
 49 |     natsort : bool, optional
 50 |         Sort each captured group of names in natural order. Default is True.
 51 |     **kwargs :
 52 |         Passed to :func:`pandas.read_csv`
 53 | 
 54 |     Returns
 55 |     -------
 56 |     Series of integer bp lengths indexed by sequence name or BED3 DataFrame.
 57 | 
 58 |     Notes
 59 |     -----
 60 |     For more fine-grained control over the chromsizes from local storage,
 61 |     use :func:`bioframe.assembly_info`.
 62 | 
 63 |     Examples
 64 |     --------
 65 |     >>> fetch_chromsizes("hg38")
 66 |     name
 67 |     chr1     248956422
 68 |     chr2     242193529
 69 |     chr3     198295559
 70 |     ...      ...
 71 |     chrX     156040895
 72 |     chrY      57227415
 73 |     chrM         16569
 74 |     Name: length, dtype: int64
 75 | 
 76 |     >>> fetch_chromsizes("hg38", as_bed=True)
 77 |             chrom      start        end
 78 |     0        chr1          0  248956422
 79 |     1        chr2          0  242193529
 80 |     2        chr3          0  198295559
 81 |     ...      ...
 82 |     21       chrX          0  156040895
 83 |     22       chrY          0   57227415
 84 |     23       chrM          0      16569
 85 | 
 86 |     See also
 87 |     --------
 88 |     bioframe.assembly_info
 89 |     bioframe.UCSCClient
 90 |     """
 91 |     if provider == "local":
 92 |         assembly = assembly_info(db)
 93 |         if as_bed:
 94 |             return assembly.viewframe[["chrom", "start", "end"]].copy()
 95 |         else:
 96 |             return assembly.chromsizes
 97 |     elif provider == "ucsc":
 98 |         return UCSCClient(db).fetch_chromsizes(
 99 |             filter_chroms=filter_chroms,
100 |             chrom_patterns=chrom_patterns,
101 |             natsort=natsort,
102 |             as_bed=as_bed,
103 |             **kwargs,
104 |         )
105 |     else:
106 |         raise ValueError(f"Unknown provider '{provider}'")
107 | 
108 | 
109 | def _origins_from_cytoband(
110 |     cyb: pd.DataFrame, band_col: str = "gieStain"
111 | ) -> pd.DataFrame:
112 |     """
113 |     Extract chromosomal origin positions separating chromosome arms from
114 |     cytological band data. Takes the cytological origin, i.e. the boundary
115 |     between the two bands labeled 'acen'.
116 | 
117 |     Parameters
118 |     ----------
119 |     cyb : pandas.DataFrame
120 |         DataFrame with cytoband data.
121 | 
122 |     Returns
123 |     -------
124 |     pandas.DataFrame
125 |         A dataframe with columns 'chrom', 'start', 'end', 'mid'.
126 |     """
127 |     cyb = cyb[cyb[band_col] == "acen"]
128 |     grouped = cyb.groupby("chrom", sort=False)
129 |     cens = []
130 |     for chrom, group in grouped:
131 |         if not len(group) == 2:
132 |             raise ValueError(f"Expected 2 'acen' bands for {chrom}, found {len(group)}")
133 |         acens = group.sort_values("start")
134 |         cens.append(
135 |             {
136 |                 "chrom": chrom,
137 |                 "start": acens.iloc[0]["start"],
138 |                 "end": acens.iloc[1]["end"],
139 |                 "mid": acens.iloc[0]["end"],
140 |             }
141 |         )
142 |     return pd.DataFrame.from_records(cens)
143 | 
144 | 
145 | def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame:
146 |     """
147 |     Extract chromosomal origin positions from UCSC centromeres.txt table
148 |     describing centromere model sequences. Takes the midpoint of all
149 |     modeled centromere sequences.
150 | 
151 |     Parameters
152 |     ----------
153 |     cens : pandas.DataFrame
154 |         DataFrame with centromeres.txt data.
155 | 
156 |     Returns
157 |     -------
158 |     pandas.DataFrame
159 |         A dataframe with columns 'chrom', 'start', 'end', 'mid'.
160 |     """
161 |     cens = cens.groupby("chrom").agg({"start": np.min, "end": np.max}).reset_index()
162 |     cens["mid"] = (cens["start"] + cens["end"]) // 2
163 |     cens = (
164 |         cens[["chrom", "start", "end", "mid"]]
165 |         .sort_values("chrom")
166 |         .reset_index(drop=True)
167 |     )
168 |     return cens
169 | 
170 | 
def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame:
    """
    Extract centromere locations for a given assembly 'db' from a variety
    of file formats in UCSC (cytoband, centromeres) depending on
    availability, returning a DataFrame.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of centromere data. Either "local" for local storage
        or "ucsc".

    Returns
    -------
    DataFrame with centromere 'chrom', 'start', 'end', 'mid'.

    Raises
    ------
    ValueError
        If ``provider`` is unknown, or if the chosen provider has no source
        of centromere data for ``db``.

    Notes
    -----
    When provider="local", centromeres are derived from cytoband tables
    in local storage.

    When provider="ucsc", the fallback priority goes as follows:

    - UCSC cytoBand
    - UCSC cytoBandIdeo
    - UCSC centromeres.txt

    Note that UCSC "gap" files no longer provide centromere information.

    Currently only works for human assemblies.

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.error`` submodule has been loaded.
    from urllib.error import HTTPError

    if provider == "local":
        assembly = assembly_info(db)
        cyb = assembly.cytobands
        if cyb is None:
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )
        # Local cytoband tables name their stain column "stain".
        return _origins_from_cytoband(cyb, band_col="stain")

    elif provider == "ucsc":
        client = UCSCClient(db)
        # (schema of the returned table, zero-argument fetcher), by priority.
        fetchers = [
            ("cytoband", client.fetch_cytoband),
            ("cytoband", partial(client.fetch_cytoband, ideo=True)),
            ("centromeres", client.fetch_centromeres),
        ]

        for schema, fetcher in fetchers:  # noqa: B007
            try:
                df = fetcher()
                break
            except HTTPError:
                # This table is not published for the assembly; fall back to
                # the next candidate.
                continue
        else:
            # Every candidate failed with an HTTP error.
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )

        # ``schema`` still names the source that succeeded.
        if schema == "centromeres":
            return _origins_from_ucsccentromeres(df)
        else:
            return _origins_from_cytoband(df)

    else:
        raise ValueError(f"Unknown provider '{provider}'")
243 | 
244 | 
class UCSCClient:
    """
    Small client for downloading tables of a single assembly from the UCSC
    download server.

    Every URL is resolved relative to ``BASE_URL`` + ``goldenPath/<db>/``.

    Parameters
    ----------
    db : str
        Assembly name; it is inserted into the download URLs.
    """

    BASE_URL = "https://hgdownload.soe.ucsc.edu/"

    def __init__(self, db: str):
        self._db = db
        # Trailing slash matters: later ``urljoin`` calls append to this path.
        self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/")

    def fetch_chromsizes(
        self,
        filter_chroms: bool = True,
        chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
        natsort: bool = True,
        as_bed: bool = False,
        **kwargs,
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        Read ``bigZips/<db>.chrom.sizes`` for this assembly.

        All arguments, including ``**kwargs``, are forwarded unchanged to
        ``read_chromsizes``.
        """
        url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes")
        return read_chromsizes(
            url,
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )

    def fetch_centromeres(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/centromeres.txt.gz`` using the "centromeres" schema.

        ``**kwargs`` are forwarded to ``read_table``.
        """
        url = urljoin(self._db_url, "database/centromeres.txt.gz")
        return read_table(url, schema="centromeres", **kwargs)

    def fetch_gaps(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/gap.txt.gz`` using the "gap" schema, keeping only the
        'chrom', 'start', 'end', 'length', 'type' and 'bridge' columns.

        ``**kwargs`` are forwarded to ``read_table``.
        """
        url = urljoin(self._db_url, "database/gap.txt.gz")
        return read_table(
            url,
            schema="gap",
            usecols=["chrom", "start", "end", "length", "type", "bridge"],
            **kwargs,
        )

    def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame:
        """
        Read the cytoband table of this assembly using the "cytoband" schema.

        Parameters
        ----------
        ideo : bool, optional
            If True, read ``database/cytoBandIdeo.txt.gz`` instead of
            ``database/cytoBand.txt.gz``.
        **kwargs :
            Forwarded to ``read_table``.
        """
        if ideo:
            url = urljoin(self._db_url, "database/cytoBandIdeo.txt.gz")
        else:
            url = urljoin(self._db_url, "database/cytoBand.txt.gz")
        # Forward the caller's options, as the sibling fetch_* methods do.
        return read_table(url, schema="cytoband", **kwargs)

    def fetch_mrna(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/all_mrna.txt.gz``, naming the columns after
        ``SCHEMAS["all_mrna"]``.

        ``**kwargs`` are forwarded to ``read_table``.
        """
        url = urljoin(self._db_url, "database/all_mrna.txt.gz")
        return read_table(
            url,
            schema=SCHEMAS["all_mrna"],
            **kwargs,
        )
297 | 


--------------------------------------------------------------------------------
/bioframe/io/schemas.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Field names for various genomic tabular files
  3 | 
  4 | """
  5 | 
# Only the SCHEMAS lookup table is exported by a star import; the individual
# *_FIELDS lists below remain importable by name.
__all__ = ["SCHEMAS"]
  7 | 
  8 | 
  9 | # UCSC File Formats
 10 | # https://genome.ucsc.edu/FAQ/FAQformat.html
 11 | BED12_FIELDS = [
 12 |     "chrom",
 13 |     "start",
 14 |     "end",
 15 |     "name",
 16 |     "score",
 17 |     "strand",
 18 |     "thickStart",
 19 |     "thickEnd",
 20 |     "itemRgb",
 21 |     "blockCount",
 22 |     "blockSizes",
 23 |     "blockStarts",
 24 | ]
 25 | 
 26 | BED_FIELDS = BED12_FIELDS[:6]
 27 | 
 28 | BEDGRAPH_FIELDS = ["chrom", "start", "end", "value"]
 29 | 
 30 | BEDPE_FIELDS = [
 31 |     "chrom1",
 32 |     "start1",
 33 |     "end1",
 34 |     "chrom2",
 35 |     "start2",
 36 |     "end2",
 37 |     "name",
 38 |     "score",
 39 |     "strand1",
 40 |     "strand2",
 41 | ]
 42 | 
 43 | GFF_FIELDS = [
 44 |     "chrom",
 45 |     "source",
 46 |     "feature",
 47 |     "start",
 48 |     "end",
 49 |     "score",
 50 |     "strand",
 51 |     "frame",
 52 |     "attributes",
 53 | ]
 54 | 
 55 | PGSNP_FIELDS = [
 56 |     "chrom",
 57 |     "start",
 58 |     "end",
 59 |     "name",
 60 |     "alleleCount",
 61 |     "alleleFreq",
 62 |     "alleleScores",
 63 | ]
 64 | 
 65 | BEDRNAELEMENTS_FIELDS = [
 66 |     "chrom",
 67 |     "start",
 68 |     "end",
 69 |     "name",
 70 |     "score",
 71 |     "strand",
 72 |     "level",
 73 |     "signif",
 74 |     "score2",
 75 | ]
 76 | 
 77 | NARROWPEAK_FIELDS = [
 78 |     "chrom",
 79 |     "start",
 80 |     "end",
 81 |     "name",
 82 |     "score",
 83 |     "strand",
 84 |     "fc",
 85 |     "-log10p",
 86 |     "-log10q",
 87 |     "relSummit",
 88 | ]
 89 | 
 90 | BROADPEAK_FIELDS = [
 91 |     "chrom",
 92 |     "start",
 93 |     "end",
 94 |     "name",
 95 |     "score",
 96 |     "strand",
 97 |     "fc",
 98 |     "-log10p",
 99 |     "-log10q",
100 | ]
101 | 
102 | GAPPEDPEAK_FIELDS = [
103 |     "chrom",
104 |     "start",
105 |     "end",
106 |     "name",
107 |     "score",
108 |     "strand",
109 |     "thickStart",
110 |     "thickEnd",
111 |     "itemRgb",
112 |     "blockCount",
113 |     "blockSizes",
114 |     "blockStarts",
115 |     "fc",
116 |     "-log10p",
117 |     "-log10q",
118 | ]
119 | 
120 | JASPAR_FIELDS = ["chrom", "start", "end", "name", "score", "pval", "strand"]
121 | 
122 | GAP_FIELDS = ["bin", "chrom", "start", "end", "ix", "n", "length", "type", "bridge"]
123 | 
124 | CENTROMERES_FIELDS = ["bin", "chrom", "start", "end", "name"]
125 | 
126 | UCSC_MRNA_FIELDS = [
127 |     "bin",
128 |     "matches",
129 |     "misMatches",
130 |     "repMatches",
131 |     "nCount",
132 |     "qNumInsert",
133 |     "qBaseInsert",
134 |     "tNumInsert",
135 |     "tBaseInsert",
136 |     "strand",
137 |     "qName",
138 |     "qSize",
139 |     "qStart",
140 |     "qEnd",
141 |     "tName",
142 |     "tSize",
143 |     "tStart",
144 |     "tEnd",
145 |     "blockCount",
146 |     "blockSizes",
147 |     "qStarts",
148 |     "tStarts",
149 | ]
150 | 
151 | CYTOBAND_FIELDS = ["chrom", "start", "end", "name", "gieStain"]
152 | 
153 | 
154 | # GA4GH File Formats
155 | # http://ga4gh.org/#/fileformats-team
156 | BAM_FIELDS = [
157 |     "QNAME",
158 |     "FLAG",
159 |     "RNAME",
160 |     "POS",
161 |     "MAPQ",
162 |     "CIGAR",
163 |     "RNEXT",
164 |     "PNEXT",
165 |     "TLEN",
166 |     "SEQ",
167 |     "QUAL",
168 |     "TAGs",
169 | ]
170 | 
171 | VCF_FIELDS = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
172 | 
173 | 
174 | SCHEMAS = {
175 |     "bed": BED_FIELDS,
176 |     "bed3": BED_FIELDS[:3],
177 |     "bed4": BED_FIELDS[:4],
178 |     "bedGraph": BEDGRAPH_FIELDS,
179 |     "bed5": BED_FIELDS[:5],
180 |     "bed6": BED_FIELDS,
181 |     "bed9": BED12_FIELDS[:9],
182 |     "bed12": BED12_FIELDS,
183 |     "bedpe": BEDPE_FIELDS,
184 |     "gff": GFF_FIELDS,
185 |     "gtf": GFF_FIELDS,
186 |     "bedRnaElements": BEDRNAELEMENTS_FIELDS,
187 |     "narrowPeak": NARROWPEAK_FIELDS,
188 |     "broadPeak": BROADPEAK_FIELDS,
189 |     "gappedPeak": GAPPEDPEAK_FIELDS,
190 |     "centromeres": CENTROMERES_FIELDS,
191 |     "cytoband": CYTOBAND_FIELDS,
192 |     "sam": BAM_FIELDS,
193 |     "vcf": VCF_FIELDS,
194 |     "jaspar": JASPAR_FIELDS,
195 |     "gap": GAP_FIELDS,
196 |     "all_mrna": UCSC_MRNA_FIELDS,
197 |     "pgsnp": PGSNP_FIELDS,
198 | }
199 | 
200 | 
# Regular expressions describing sequence names, keyed by assembly name.
# Each value is a tuple of patterns, one per group of sequence names.
#
# Every pattern is anchored at both ends (and alternations are wrapped in a
# non-capturing group) so that a name is classified the same way whether the
# pattern is applied with ``re.match``, ``re.search`` or ``re.fullmatch``.
# Literal suffixes such as "Het" and "extra" are spelled as optional groups
# rather than character classes, which would accept any mix of those letters.
CHROM_NAME_PATTERNS = {
    "hg19": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_.*_random$",
        r"^chrUn_.*$",
        r"^chr(?!Un).+_.*_hap\d+$",
    ),
    "hg38": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chrEBV$",
        r"^chr(?!Un).+_.*_random$",
        r"^chrUn_.*$",
        r"^chr(?!Un).+_.*_alt$",
    ),
    "mm9": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_random$",
    ),
    "mm10": (
        r"^chr[0-9]+$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "galGal4": (
        r"^chr[0-9]+$",
        r"^chr[ZW]$",
        r"^chrM$",
        # Grouped so that both alternatives are anchored on both sides.
        r"^(?:chrLGE64|chrLGE22C19W28_E50C23)$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "galGal5": (
        r"^chr[0-9]+$",
        r"^chr[ZW]$",
        r"^chrM$",
        r"^chrLGE64$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "dm3": (
        # e.g. chr2L, chr2LHet, chr4
        r"^chr[234][LR]?(?:Het)?$",
        # e.g. chrX, chrXHet, chrYHet
        r"^chr[XY](?:Het)?$",
        r"^chrM$",
        # chrU, chrUextra
        r"^chrU(?:extra)?$",
    ),
    "dm6": (
        # Terminated with "$" so that scaffolds such as "chr2_..._random"
        # are not also classified as assembled autosome arms.
        r"^chr[234][LR]?$",
        r"^chr[XY]$",
        r"^chrM$",
        r"^chr(?!Un).+_random$",
        r"^chrUn_.*$",
    ),
    "ce10": (r"^chr[IV]+$", r"^chrX$", r"^chrM$"),
    "ce11": (r"^chr[IV]+$", r"^chrX$", r"^chrM$"),
    "sacCer3": (r"^chr[IXV]+$", r"^chrM$"),
    "TAIR10": (r"^\d+$", r"^(?:MT|Pltd)$"),
}
267 | 
268 | 
# Generic name patterns for UCSC-style ("chr"-prefixed) sequence names.
UCSC_AUTOSOMES = r"^chr[0-9]+[A-Za-z]*$"
UCSC_SEXCHROMS = r"^chr[XYZW][A-Za-z]*$"
UCSC_NONNUCLEAR = r"^chrM$"
UCSC_OTHER = r"^chrLGE.*$"
UCSC_UNLOCALIZED = r"^chr(?!Un).+.*_random$"
UCSC_UNPLACED = r"^chrUn_.*$|^chrU[A-Za-z]*$"
UCSC_ALTCHROMS = r"^chr(?!Un).+_.*_hap\d+$|^chr(?!Un).+_.*_alt$"

# Generic name patterns for NCBI-style (unprefixed) sequence names.
NCBI_AUTOSOMES = r"^[0-9]+$"
NCBI_SEXCHROMS = r"^[XYZW]$"
NCBI_NONNUCLEAR = r"^MT$|^Pltd$"

# Patterns for "chr" + roman numeral names; the second list extends the first.
ROMAN_LT10 = [r"^chrI+$", r"^chrIV$", r"^chrVI*$", r"^chrIX$"]
ROMAN_LT20 = ROMAN_LT10 + [
    r"^chrX$",
    r"^chrXI*$",
    r"^chrXIV$",
    r"^chrXVI*$",
    r"^chrXIX$",
]
281 | 


--------------------------------------------------------------------------------
/bioframe/sandbox/clients.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import glob
  3 | import os
  4 | import os.path as op
  5 | import posixpath as pp
  6 | from urllib.parse import urlencode, urljoin
  7 | 
  8 | import pandas as pd
  9 | import requests
 10 | 
 11 | 
 12 | class EncodeClient:
 13 |     BASE_URL = "http://www.encodeproject.org/"
 14 | 
 15 |     # 2020-05-15 compatible with ENCODE Metadata at:
 16 |     METADATA_URL = "https://www.encodeproject.org/metadata/type=Experiment&status=released/metadata.tsv"
 17 | 
 18 |     KNOWN_ASSEMBLIES = (
 19 |         "GRCh38",
 20 |         "GRCh38-minimal",
 21 |         "ce10",
 22 |         "ce11",
 23 |         "dm3",
 24 |         "dm6",
 25 |         "hg19",
 26 |         "mm10",
 27 |         "mm10-minimal",
 28 |         "mm9",
 29 |     )
 30 | 
 31 |     def __init__(self, cachedir, assembly, metadata=None):
 32 |         if assembly not in self.KNOWN_ASSEMBLIES:
 33 |             raise ValueError("assembly must be in:", self.KNOWN_ASSEMBLIES)
 34 | 
 35 |         self.cachedir = op.join(cachedir, assembly)
 36 |         if not op.isdir(self.cachedir):
 37 |             os.makedirs(self.cachedir, exist_ok=True)
 38 | 
 39 |         if metadata is None:
 40 |             metadata_path = op.join(cachedir, "metadata.tsv")
 41 | 
 42 |             if not op.exists(metadata_path):
 43 |                 print(
 44 |                     "getting metadata from ENCODE, please wait while "
 45 |                     "(~240Mb) file downloads"
 46 |                 )
 47 |                 with requests.get(self.METADATA_URL, stream=True) as r:
 48 |                     r.raise_for_status()
 49 |                     with open(metadata_path, "wb") as f:
 50 |                         for chunk in r.iter_content(chunk_size=8192):
 51 |                             f.write(chunk)
 52 | 
 53 |             self._meta = pd.read_table(metadata_path, low_memory=False)
 54 |             table_assemblies = sorted(
 55 |                 self._meta["File assembly"].dropna().unique().tolist()
 56 |             )
 57 | 
 58 |             if not set(table_assemblies).issubset(set(self.KNOWN_ASSEMBLIES)):
 59 |                 raise ValueError(
 60 |                     "Table assemblies do not match known assemblies, "
 61 |                     "check ENCODE metadata version"
 62 |                 )
 63 |             self._meta = self._meta[self._meta["File assembly"] == assembly].copy()
 64 |             self._meta = self._meta.set_index("File accession")
 65 | 
 66 |         else:
 67 |             self._meta = metadata
 68 | 
 69 |     def _batch_download(self, args):
 70 |         params = urlencode(args)
 71 |         url = pp.join("batch_download", params)
 72 |         url = urljoin(self.BASE_URL, url)
 73 |         r = requests.get(url)
 74 |         r.raise_for_status()
 75 |         return r
 76 | 
 77 |     def _metadata(self, args):
 78 |         params = urlencode(args)
 79 |         url = pp.join("metadata", params, "metadata.tsv")
 80 |         url = urljoin(self.BASE_URL, url)
 81 |         r = requests.get(url)
 82 |         r.raise_for_status()
 83 |         return r
 84 | 
 85 |     @property
 86 |     def meta(self):
 87 |         return self._meta.copy()
 88 | 
 89 |     def info(self, accession, width=850, height=450):
 90 |         from IPython.display import HTML
 91 | 
 92 |         url = urljoin(self.BASE_URL, pp.join("experiments", accession))
 93 |         return HTML(
 94 |             f'<iframe width="{width}px" height="{height}px" src={url}></iframe>'
 95 |         )
 96 | 
 97 |     def fetch(self, accession):
 98 |         url = self.meta.loc[accession, "File download URL"]
 99 |         # sig = self.meta.loc[accession, 'md5sum']
100 |         filename = op.split(url)[1]
101 |         path = op.join(self.cachedir, filename)
102 |         if op.exists(path):
103 |             pass
104 |             # print('File "{}" available'.format(filename))
105 |         else:
106 |             print(f'Downloading "{filename}"')
107 |             r = requests.get(url)
108 |             r.raise_for_status()
109 |             with open(path, "wb") as f:
110 |                 f.write(r.content)
111 |         return path
112 | 
113 |     def fetch_all(self, accessions):
114 |         return list(map(self.fetch, accessions))
115 | 
116 | 
class FDNClient:
    """
    Small client for the 4DN data portal with an on-disk, per-assembly cache.

    Parameters
    ----------
    cachedir : str
        Root cache directory. ``<cachedir>/<assembly>`` must already exist;
        downloaded files are stored there. Metadata tables are looked up as
        ``<cachedir>/metadata*.tsv``.
    assembly : str
        Name of the assembly sub-directory. For "GRCh38" the metadata table is
        restricted to rows whose "Organism" is "human".
    metadata : pandas.DataFrame, optional
        Pre-loaded metadata table to use as-is. ``fetch`` looks rows up by
        index label and reads the "File Download URL" column. If omitted, the
        last ``metadata*.tsv`` file (in sorted name order) is loaded and
        indexed by "File Accession".
    key_id, key_secret : str, optional
        Access key pair. When ``key_id`` is given, both are combined into an
        HTTP Basic authorization token that is sent with downloads.

    Raises
    ------
    OSError
        If the assembly cache directory does not exist.
    FileNotFoundError
        If ``metadata`` is omitted and no ``metadata*.tsv`` file is found.
    """

    BASE_URL = "https://data.4dnucleome.org/"

    def __init__(self, cachedir, assembly, metadata=None, key_id=None, key_secret=None):
        self.cachedir = op.join(cachedir, assembly)
        if not op.isdir(self.cachedir):
            # Report the directory that was actually checked.
            raise OSError(f"Directory doesn't exist: '{self.cachedir}'")
        if metadata is None:
            metadata_paths = sorted(glob.glob(op.join(cachedir, "metadata*.tsv")))
            if not metadata_paths:
                raise FileNotFoundError(
                    f"No 'metadata*.tsv' file found in '{cachedir}'"
                )
            metadata_path = metadata_paths[-1]
            self._meta = pd.read_table(metadata_path, low_memory=False, comment="#")
            if assembly == "GRCh38":
                self._meta = self._meta[self._meta["Organism"] == "human"].copy()
            self._meta = self._meta.set_index("File Accession")
        else:
            self._meta = metadata
        if key_id is not None:
            credential = (key_id + ":" + key_secret).encode("utf-8")
            self._token = base64.b64encode(credential)
        else:
            self._token = None

    @staticmethod
    def _download(url, path, headers=None, chunk_size=8192):
        """
        Stream ``url`` to ``path`` without leaving a truncated file behind.

        The payload is written to ``path + ".part"`` and only renamed to
        ``path`` once the transfer has finished, so an interrupted download is
        never mistaken for a cached file on the next call.
        """
        partial = path + ".part"
        try:
            with requests.get(url, headers=headers, stream=True) as r:
                r.raise_for_status()
                with open(partial, "wb") as f:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
            os.replace(partial, path)
        finally:
            # Only present if the transfer failed before the rename.
            if op.exists(partial):
                os.remove(partial)

    @property
    def meta(self):
        """A copy of the metadata table, safe for the caller to modify."""
        return self._meta.copy()

    def info(self, accession, width=850, height=450):
        """Return an IPython ``HTML`` iframe showing the experiment page."""
        from IPython.display import HTML

        url = urljoin(self.BASE_URL, pp.join("experiments", accession))
        return HTML(
            f'<iframe width="{width}px" height="{height}px" src="{url}"></iframe>'
        )

    def fetch(self, accession):
        """
        Return the local path of a file, downloading it if it is not cached.

        Parameters
        ----------
        accession : str
            Index label of the file in the metadata table.

        Returns
        -------
        str
            Path of the file inside the assembly cache directory.
        """
        # Read from the internal table directly: the public ``meta`` property
        # would copy the whole table on every call.
        url = self._meta.loc[accession, "File Download URL"]
        # sig = self.meta.loc[accession, 'md5sum']
        filename = op.split(url)[1]
        path = op.join(self.cachedir, filename)
        if not op.exists(path):
            print(f'Downloading "{filename}"')
            if self._token:
                headers = {"Authorization": b"Basic " + self._token}
            else:
                headers = None
            self._download(url, path, headers=headers)
        return path

    def fetch_all(self, accessions):
        """Fetch every accession in order and return the list of local paths."""
        return [self.fetch(accession) for accession in accessions]
173 | 


--------------------------------------------------------------------------------
/bioframe/sandbox/gtf_io.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | 
 4 | def parse_gtf_attributes(attrs, kv_sep="=", item_sep=";", quotechar='"', **kwargs):
 5 |     item_lists = attrs.str.split(item_sep)
 6 |     item_lists = item_lists.apply(
 7 |         lambda items: [item.strip().split(kv_sep) for item in items]
 8 |     )
 9 |     stripchars = quotechar + " "
10 |     item_lists = item_lists.apply(
11 |         lambda items: [
12 |             [x.strip(stripchars) for x in item] for item in items if len(item) == 2
13 |         ]
14 |     )
15 |     kv_records = item_lists.apply(dict)
16 |     return pd.DataFrame.from_records(kv_records, **kwargs)
17 | 


--------------------------------------------------------------------------------
/bioframe/sandbox/parquet_io.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | 
  4 | def to_parquet(
  5 |     pieces,
  6 |     outpath,
  7 |     row_group_size=None,
  8 |     compression="snappy",
  9 |     use_dictionary=True,
 10 |     version=2.0,
 11 |     **kwargs,
 12 | ):
 13 |     """
 14 |     Save an iterable of dataframe chunks to a single Apache Parquet file. For
 15 |     more info about Parquet, see https://arrow.apache.org/docs/python/parquet.html.
 16 | 
 17 |     Parameters
 18 |     ----------
 19 |     pieces : DataFrame or iterable of DataFrame
 20 |         Chunks to write
 21 |     outpath : str
 22 |         Path to output file
 23 |     row_group_size : int
 24 |         Number of rows per row group
 25 |     compression : {'snappy', 'gzip', 'brotli', 'none'}, optional
 26 |         Compression algorithm. Can be set on a per-column basis with a
 27 |         dictionary of column names to compression lib.
 28 |     use_dictionary : bool, optional
 29 |         Use dictionary encoding. Can be set on a per-column basis with a list
 30 |         of column names.
 31 | 
 32 |     See also
 33 |     --------
 34 |     pyarrow.parquet.write_table
 35 |     pyarrow.parquet.ParquetFile
 36 |     fastparquet
 37 | 
 38 |     """
 39 |     try:
 40 |         import pyarrow as pa
 41 |         import pyarrow.parquet
 42 |     except ImportError:
 43 |         raise ImportError("Saving to parquet requires the `pyarrow` package") from None
 44 | 
 45 |     if isinstance(pieces, pd.DataFrame):
 46 |         pieces = (pieces,)
 47 | 
 48 |     try:
 49 |         for i, piece in enumerate(pieces):
 50 |             table = pa.Table.from_pandas(piece, preserve_index=False)
 51 |             if i == 0:
 52 |                 writer = pa.parquet.ParquetWriter(
 53 |                     outpath,
 54 |                     table.schema,
 55 |                     compression=compression,
 56 |                     use_dictionary=use_dictionary,
 57 |                     version=version,
 58 |                     **kwargs,
 59 |                 )
 60 |             writer.write_table(table, row_group_size=row_group_size)
 61 |     finally:
 62 |         writer.close()
 63 | 
 64 | 
 65 | def read_parquet(filepath, columns=None, iterator=False, **kwargs):
 66 |     """
 67 |     Load DataFrames from Parquet files, optionally in pieces.
 68 | 
 69 |     Parameters
 70 |     ----------
 71 |     filepath : str, pathlib.Path, pyarrow.NativeFile, or file-like object
 72 |         Readable source. For passing bytes or buffer-like file containing a
 73 |         Parquet file, use pyarorw.BufferReader
 74 |     columns: list
 75 |         If not None, only these columns will be read from the row groups. A
 76 |         column name may be a prefix of a nested field, e.g. 'a' will select
 77 |         'a.b', 'a.c', and 'a.d.e'
 78 |     iterator : boolean, default False
 79 |         Return an iterator object that yields row group DataFrames and
 80 |         provides the ParquetFile interface.
 81 |     use_threads : boolean, default True
 82 |         Perform multi-threaded column reads
 83 |     memory_map : boolean, default True
 84 |         If the source is a file path, use a memory map to read file, which can
 85 |         improve performance in some environments
 86 | 
 87 |     Returns
 88 |     -------
 89 |     DataFrame or ParquetFileIterator
 90 | 
 91 |     """
 92 |     use_threads = kwargs.pop("use_threads", True)
 93 | 
 94 |     if not iterator:
 95 |         return pd.read_parquet(
 96 |             filepath, columns=columns, use_threads=use_threads, **kwargs
 97 |         )
 98 |     else:
 99 |         try:
100 |             from pyarrow.parquet import ParquetFile
101 |         except ImportError:
102 |             raise ImportError(
103 |                 "Iterating over Parquet data requires the `pyarrow` package."
104 |             ) from None
105 | 
106 |         class ParquetFileIterator(ParquetFile):
107 |             def __iter__(self):
108 |                 return self
109 | 
110 |             def __next__(self):
111 |                 if not hasattr(self, "_rgid"):
112 |                     self._rgid = 0
113 |                 if self._rgid < self.num_row_groups:
114 |                     rg = self.read_row_group(
115 |                         self._rgid,
116 |                         columns=columns,
117 |                         use_threads=use_threads,
118 |                         use_pandas_metadata=True,
119 |                     )
120 |                     self._rgid += 1
121 |                 else:
122 |                     raise StopIteration
123 |                 return rg.to_pandas()
124 | 
125 |         return ParquetFileIterator(filepath, **kwargs)
126 | 


--------------------------------------------------------------------------------
/bioframe/vis.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | from typing import Union
  3 | 
  4 | import matplotlib as mpl
  5 | import matplotlib.pyplot as plt
  6 | import numpy as np
  7 | import pandas as pd
  8 | from matplotlib.colors import to_rgb
  9 | 
 10 | from .core import arrops
 11 | 
 12 | DEFAULT_FACECOLOR = "skyblue"
 13 | DEFAULT_EDGECOLOR = "dimgray"
 14 | 
 15 | __all__ = ["plot_intervals", "to_ucsc_colorstring"]
 16 | 
 17 | 
 18 | def to_ucsc_colorstring(color: Union[str, tuple]) -> str:
 19 |     """
 20 |     Convert any matplotlib color identifier into a UCSC itemRgb color string.
 21 | 
 22 |     Parameters
 23 |     ----------
 24 |     color : str or tuple
 25 |         Any valid matplotlib color representation (e.g. 'red', 'tomato',
 26 |         '#ff0000', '#ff00', "#ff000055", (1, 0, 0), (1, 0, 0, 0.5))
 27 | 
 28 |     Returns
 29 |     -------
 30 |     str
 31 |         A UCSC itemRgb colorstring of the form "r,g,b" where r, g, and b are
 32 |         integers between 0 and 255, inclusive.
 33 | 
 34 |     Notes
 35 |     -----
 36 |     The alpha (opacity) channel is ignored if represented in the input.
 37 | 
 38 |     Null values are converted to "0", which is shorthand for "0,0,0" (black).
 39 |     Note that BED9+ files with uninformative itemRgb values should use "0" as
 40 |     the itemRgb value on every data line.
 41 | 
 42 |     Examples
 43 |     --------
 44 |     >>> to_ucsc_colorstring("red")
 45 |     '255,0,0'
 46 |     >>> to_ucsc_colorstring("tomato")
 47 |     '255,99,71'
 48 |     >>> df["itemRgb"] = df["color"].apply(to_ucsc_colorstring)
 49 |     >>> df
 50 |     chrom  start  end  color  itemRgb
 51 |     chr1   0      10   red    255,0,0
 52 |     chr1   10     20   blue   0,0,255
 53 |     chr2   0      10   green  0,128,0
 54 |     chr2   10     20   None   0
 55 |     """
 56 |     if pd.isnull(color) or color == "none":
 57 |         return "0"
 58 |     else:
 59 |         return ",".join(str(int(x * 255)) for x in to_rgb(color))
 60 | 
 61 | 
 62 | def _plot_interval(
 63 |     start, end, level, facecolor=None, edgecolor=None, height=0.6, ax=None
 64 | ):
 65 |     facecolor = DEFAULT_FACECOLOR if facecolor is None else facecolor
 66 |     edgecolor = DEFAULT_EDGECOLOR if edgecolor is None else edgecolor
 67 | 
 68 |     ax = plt.gca() if ax is None else ax
 69 |     ax.add_patch(
 70 |         mpl.patches.Rectangle(
 71 |             (start, level - height / 2),
 72 |             end - start,
 73 |             height,
 74 |             facecolor=facecolor,
 75 |             edgecolor=edgecolor,
 76 |         )
 77 |     )
 78 | 
 79 | 
 80 | def plot_intervals_arr(
 81 |     starts,
 82 |     ends,
 83 |     levels=None,
 84 |     labels=None,
 85 |     colors=None,
 86 |     xlim=None,
 87 |     show_coords=False,
 88 |     figsize=(10, 2),
 89 | ):
 90 |     """
 91 |     Plot a collection of intervals.
 92 | 
 93 |     Parameters
 94 |     ----------
 95 |     starts, ends : np.ndarray
 96 |         A collection of intervals.
 97 | 
 98 |     levels : iterable or None
 99 |         The level of each interval, i.e. the y-coordinate at which the interval
100 |         must be plotted. If None, it will be determined automatically.
101 | 
102 |     labels : str or iterable or None
103 |         The label of each interval.
104 | 
105 |     colors : str or iterable or None.
106 |         The color of each interval.
107 | 
108 |     xlim : (float, float) or None
109 |         The x-span of the plot.
110 | 
111 |     show_coords : bool
112 |         If True, plot x-ticks.
113 | 
114 |     figsize : (float, float) or None.
115 |         The size of the figure. If None, plot within the current figure.
116 | 
117 |     """
118 |     starts = np.asarray(starts)
119 |     ends = np.asarray(ends)
120 | 
121 |     if figsize is not None:
122 |         plt.figure(figsize=figsize)
123 | 
124 |     if levels is None:
125 |         levels = arrops.stack_intervals(starts, ends)
126 |     else:
127 |         levels = np.asarray(levels)
128 | 
129 |     if isinstance(colors, str) or (colors is None):
130 |         colors = itertools.cycle([colors])
131 |     else:
132 |         colors = itertools.cycle(colors)
133 | 
134 |     if isinstance(labels, str) or (labels is None):
135 |         labels = itertools.cycle([labels])
136 |     else:
137 |         labels = itertools.cycle(labels)
138 | 
139 |     for (start, end, level, color, label) in zip(
140 |         starts, ends, levels, colors, labels
141 |     ):
142 |         _plot_interval(start, end, level, facecolor=color)
143 |         if label is not None:
144 |             plt.text(
145 |                 (start + end) / 2,
146 |                 level,
147 |                 label,
148 |                 horizontalalignment="center",
149 |                 verticalalignment="center",
150 |             )
151 | 
152 |     plt.ylim(-0.5, np.max(levels) + 0.5)
153 |     if xlim is None:
154 |         plt.xlim(-0.5, np.max(ends) + 0.5)
155 |     else:
156 |         plt.xlim(xlim[0], xlim[1])
157 |     plt.gca().set_aspect(1)
158 | 
159 |     plt.gca().set_frame_on(False)
160 |     plt.yticks([])
161 |     if show_coords:
162 |         pass
163 |     else:
164 |         plt.xticks([])
165 | 
166 | 
def _select_for_chrom(values, positions, name, allow_str):
    """
    Pick the entries of a per-interval argument that belong to one chromosome.

    ``None`` (and a single string, when ``allow_str`` is set) applies to all
    intervals and is returned unchanged; lists, Series and arrays are indexed
    positionally with ``positions``. Anything else raises ``ValueError``
    naming the offending argument.
    """
    if values is None:
        return None
    if allow_str and isinstance(values, str):
        return values
    if isinstance(values, (list, pd.Series, np.ndarray)):
        return np.asarray(values)[positions]
    raise ValueError(f"Unknown type of {name}: {type(values)}")


def plot_intervals(
    df,
    levels=None,
    labels=None,
    colors=None,
    xlim=None,
    show_coords=False,
    figsize=(10, 2),
):
    """
    Plot a collection of intervals, one plot per chromosome.

    Parameters
    ----------
    df : pandas.DataFrame
        A collection of intervals with "chrom", "start" and "end" columns.

    levels : list, pandas.Series, np.ndarray or None
        The level of each interval, i.e. the y-coordinate at which the interval
        must be plotted, given in the row order of ``df``. If None, it will be
        determined automatically.

    labels : str, list, pandas.Series, np.ndarray or None
        The label of each interval, given in the row order of ``df``. A single
        string is used for every interval.

    colors : str, list, pandas.Series, np.ndarray or None.
        The color of each interval, given in the row order of ``df``. A single
        string is used for every interval.

    xlim : (float, float) or None
        The x-span of the plot.

    show_coords : bool
        If True, plot x-ticks.

    figsize : (float, float) or None.
        The size of the figure. If None, plot within the current figure.

    Raises
    ------
    ValueError
        If ``levels``, ``labels`` or ``colors`` has an unsupported type.

    """
    # Drop the index so that row labels are positions into the per-interval
    # arguments, whatever index the caller's dataframe carries.
    chrom_gb = df.reset_index(drop=True).groupby("chrom", observed=True)
    for chrom, chrom_df in chrom_gb:
        chrom_indices = chrom_df.index.to_numpy()

        chrom_levels = _select_for_chrom(
            levels, chrom_indices, "levels", allow_str=False
        )
        chrom_labels = _select_for_chrom(
            labels, chrom_indices, "labels", allow_str=True
        )
        chrom_colors = _select_for_chrom(
            colors, chrom_indices, "colors", allow_str=True
        )

        plot_intervals_arr(
            chrom_df.start,
            chrom_df.end,
            levels=chrom_levels,
            labels=chrom_labels,
            colors=chrom_colors,
            xlim=xlim,
            show_coords=show_coords,
            figsize=figsize,
        )
        plt.title(chrom)
240 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#
# Every target is forwarded to sphinx-build in "make mode" (-M), so the
# available targets are whatever sphinx-build itself supports.

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# Neither name refers to a file this makefile should build.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
# $@ expands to the name of the requested target.
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/api-construction.rst:
--------------------------------------------------------------------------------
1 | Construction
2 | ============
3 | 
4 | .. automodule:: bioframe.core.construction
5 |    :autosummary:
6 |    :members:
7 | 


--------------------------------------------------------------------------------
/docs/api-extras.rst:
--------------------------------------------------------------------------------
1 | Additional tools
2 | ================
3 | 
4 | .. automodule:: bioframe.extras
5 |    :autosummary:
6 |    :members:
7 | 


--------------------------------------------------------------------------------
/docs/api-fileops.rst:
--------------------------------------------------------------------------------
 1 | .. _API_fileops:
 2 | 
 3 | File I/O
 4 | ========
 5 | 
 6 | .. automodule:: bioframe.io.fileops
 7 |    :autosummary:
 8 |    :members:
 9 | 
10 | .. autofunction:: bioframe.io.bed.to_bed
11 | 


--------------------------------------------------------------------------------
/docs/api-intervalops.rst:
--------------------------------------------------------------------------------
1 | .. _API_ops:
2 | 
3 | Interval operations
4 | ===================
5 | 
6 | .. automodule:: bioframe.ops
7 |    :autosummary:
8 |    :members:
9 | 


--------------------------------------------------------------------------------
/docs/api-lowlevel.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | jupytext:
 3 |   formats: ipynb,md:myst
 4 |   text_representation:
 5 |     extension: .md
 6 |     format_name: myst
 7 |     format_version: 0.13
 8 |     jupytext_version: 1.11.3
 9 | kernelspec:
10 |   display_name: Python 3
11 |   language: python
12 |   name: python3
13 | ---
14 | 
15 | # Low-level API
16 | 
17 | ```{eval-rst}
18 | .. toctree::
19 |    :maxdepth: 2
20 |    :caption: Contents:
21 | 
22 |    lowlevel/arrops
23 |    lowlevel/specs
24 |    lowlevel/stringops
25 | 
26 | ```
27 | 
28 | Low-level array-based operations are used to implement the genomic interval operations on dataframes.
29 | 
30 | ```{code-cell} ipython3
31 | import itertools
32 | 
33 | import numpy as np
34 | import matplotlib
35 | import matplotlib.pyplot as plt
36 | import pandas as pd
37 | 
38 | import bioframe as bf
39 | import bioframe.vis
40 | 
41 | 
42 | from bioframe.core import arrops
43 | ```
44 | 
45 | ```{code-cell} ipython3
46 | starts1, ends1 = np.array([
47 |     [1,5],
48 |     [3,8],
49 |     [8,10],
50 |     [12,14]
51 | ]).T
52 | 
53 | starts2, ends2 = np.array([
54 |     [4,8],
55 |     [10,11],
56 | ]).T
57 | ```
58 | 
59 | ```{code-cell} ipython3
60 | bf.vis.plot_intervals_arr(
61 |     starts = starts1,
62 |     ends = ends1,
63 |     xlim = (-0.5,14.5),
64 |     labels = np.arange(0,starts1.shape[0]),
65 |     show_coords = True)
66 | 
67 | bf.vis.plot_intervals_arr(
68 |     starts = starts2,
69 |     ends = ends2,
70 |     colors = 'lightpink',
71 |     xlim = (-0.5,14.5),
72 |     labels = np.arange(0,starts2.shape[0]),
73 |     show_coords = True)
74 | ```
75 | 
76 | ```{code-cell} ipython3
77 | arrops.overlap_intervals(starts1, ends1, starts2, ends2)
78 | ```
79 | 
80 | ```{code-cell} ipython3
81 | arrops.overlap_intervals_outer(starts1, ends1, starts2, ends2)
82 | ```
83 | 
84 | ```{code-cell} ipython3
85 | arrops.merge_intervals(starts1, ends1, min_dist=0)
86 | ```
87 | 
88 | ```{code-cell} ipython3
89 | arrops.merge_intervals(starts1, ends1, min_dist=None)
90 | ```
91 | 
92 | ```{code-cell} ipython3
93 | arrops.merge_intervals(starts1, ends1, min_dist=2)
94 | ```
95 | 
96 | ```{code-cell} ipython3
97 | arrops.complement_intervals(starts1, ends1)
98 | ```
99 | 


--------------------------------------------------------------------------------
/docs/api-resources.rst:
--------------------------------------------------------------------------------
 1 | Resources
 2 | =========
 3 | 
 4 | Genome assembly metadata
 5 | ------------------------
 6 | 
 7 | Bioframe provides a collection of genome assembly metadata for commonly used
 8 | genomes. These are accessible through a convenient dataclass interface via :func:`bioframe.assembly_info`.
 9 | 
10 | The assemblies are listed in a manifest YAML file, and each assembly
11 | has a mandatory companion file called `seqinfo` that contains the sequence
12 | names, lengths, and other information. The records in the manifest file contain
13 | the following fields:
14 | 
15 | - ``organism``: the organism name
16 | - ``provider``: the genome assembly provider (e.g., ucsc, ncbi)
17 | - ``provider_build``: the genome assembly build name (e.g., hg19, GRCh37)
18 | - ``release_year``: the year of the assembly release
19 | - ``seqinfo``: path to the seqinfo file
20 | - ``cytobands``: path to the cytoband file, if available
21 | - ``default_roles``: default molecular roles to include from the seqinfo file
22 | - ``default_units``: default assembly units to include from the seqinfo file
23 | - ``url``: URL to where the corresponding sequence files can be downloaded
24 | 
25 | The `seqinfo` file is a TSV file with the following columns (with header):
26 | 
27 | - ``name``: canonical sequence name
28 | - ``length``: sequence length
29 | - ``role``: role of the sequence or scaffold (e.g., "assembled", "unlocalized", "unplaced")
30 | - ``molecule``: name of the molecule that the sequence belongs to, if placed
31 | - ``unit``: assembly unit of the chromosome (e.g., "primary", "non-nuclear", "decoy")
32 | - ``aliases``: comma-separated list of aliases for the sequence name
33 | 
34 | We currently do not include sequences with "alt" or "patch" roles in `seqinfo` files, but we
35 | do support the inclusion of additional decoy sequences (as used by so-called NGS *analysis
36 | sets* for human genome assemblies) by marking them as members of a "decoy" assembly unit.
37 | 
38 | The `cytoband` file is an optional TSV file with the following columns (with header):
39 | 
40 | - ``chrom``: chromosome name
41 | - ``start``: start position
42 | - ``end``: end position
43 | - ``band``: cytogenetic coordinate (name of the band)
44 | - ``stain``: Giemsa stain result
45 | 
46 | The order of the sequences in the `seqinfo` file is treated as canonical.
47 | The ordering of the chromosomes in the `cytobands` file should match the order
48 | of the chromosomes in the `seqinfo` file.
49 | 
50 | The manifest and companion files are stored in the ``bioframe/io/data`` directory.
51 | New assemblies can be requested by opening an issue on GitHub or by submitting a pull request.
52 | 
53 | .. automodule:: bioframe.io.assembly
54 |    :autosummary:
55 |    :members:
56 | 
57 | .. autoclass:: bioframe.io.assembly.GenomeAssembly
58 |    :members:
59 |    :undoc-members:
60 | 
61 | 
62 | Remote resources
63 | ----------------
64 | These functions now default to using the local data store, but can be used to obtain chromsizes or
65 | centromere positions from UCSC by setting ``provider="ucsc"``.
66 | 
67 | .. automodule:: bioframe.io.resources
68 |    :autosummary:
69 |    :members:
70 | 


--------------------------------------------------------------------------------
/docs/api-validation.rst:
--------------------------------------------------------------------------------
1 | Validation
2 | ==========
3 | 
4 | .. automodule:: bioframe.core.checks
5 |    :autosummary:
6 |    :members:
7 | 


--------------------------------------------------------------------------------
/docs/api-vis.rst:
--------------------------------------------------------------------------------
1 | Plotting
2 | ===============
3 | 
4 | .. automodule:: bioframe.vis
5 |    :autosummary:
6 |    :members:
7 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | # import sys
 9 | from datetime import datetime
10 | from importlib.metadata import metadata
11 | 
12 | # autodoc_mock_imports = ["numpy", "pandas", "matplotlib", "requests"]
13 | 
14 | 
15 | # -- Project information -----------------------------------------------------
16 | # NOTE: If you installed your project in editable mode, this might be stale.
17 | #       If this is the case, reinstall it to refresh the metadata
18 | info = metadata("bioframe")
19 | project_name = info["Name"]
20 | author = "Open2C"
21 | copyright = f"{datetime.now():%Y}, {author}."
22 | version = info["Version"]
23 | urls = dict(pu.split(", ") for pu in info.get_all("Project-URL"))
24 | 
25 | # The full version, including alpha/beta/rc tags
26 | release = info["Version"]
27 | 
28 | # -- General configuration ---------------------------------------------------
29 | 
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 |     # "sphinx.ext.autodoc",
35 |     # 'sphinx.ext.doctest',
36 |     # 'sphinx.ext.todo',
37 |     # 'sphinx.ext.coverage',
38 |     # 'sphinx.ext.mathjax',
39 |     # 'sphinx.ext.ifconfig',
40 |     "autodocsumm",
41 |     "sphinx.ext.viewcode",
42 |     "sphinx.ext.autosummary",
43 |     "sphinx.ext.napoleon",  # 'numpydoc'
44 |     "myst_nb",
45 | ]
46 | # Add any paths that contain templates here, relative to this directory.
47 | templates_path = ["_templates"]
48 | 
49 | # List of patterns, relative to source directory, that match files and
50 | # directories to ignore when looking for source files.
51 | # This pattern also affects html_static_path and html_extra_path.
52 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"]
53 | 
54 | # nbsphinx_custom_formats = {
55 | #     '.md': ['jupytext.reads', {'fmt': 'MyST'}],
56 | # }
57 | 
58 | # -- Options for HTML output -------------------------------------------------
59 | 
60 | # The theme to use for HTML and HTML Help pages.  See the documentation for
61 | # a list of builtin themes.
62 | #
63 | html_theme = "sphinx_rtd_theme"
64 | 
65 | # Add any paths that contain custom static files (such as style sheets) here,
66 | # relative to this directory. They are copied after the builtin static files,
67 | # so a file named "default.css" will overwrite the builtin "default.css".
68 | html_static_path = ["_static"]
69 | 
70 | master_doc = "index"
71 | 
72 | autosummary_generate = True
73 | 
74 | # Don't include fully qualified name prefixes in autodoc
75 | add_module_names = False
76 | 
77 | # Cache MyST (.md or .ipynb) notebook outputs if unmodified
78 | jupyter_execute_notebooks = "cache"
79 | execution_excludepatterns = ["guide-performance.ipynb"]
80 | 


--------------------------------------------------------------------------------
/docs/figs/._bioframe-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/._bioframe-logo.png


--------------------------------------------------------------------------------
/docs/figs/bioframe-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/bioframe-logo.png


--------------------------------------------------------------------------------
/docs/figs/bioframe_closest.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/bioframe_closest.pdf


--------------------------------------------------------------------------------
/docs/figs/closest0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest0.png


--------------------------------------------------------------------------------
/docs/figs/closest1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest1.png


--------------------------------------------------------------------------------
/docs/figs/closest2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest2.png


--------------------------------------------------------------------------------
/docs/figs/closest3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/closest3.png


--------------------------------------------------------------------------------
/docs/figs/df1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/df1.png


--------------------------------------------------------------------------------
/docs/figs/df2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/df2.png


--------------------------------------------------------------------------------
/docs/figs/df@.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/df@.png


--------------------------------------------------------------------------------
/docs/figs/merge_df1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/merge_df1.png


--------------------------------------------------------------------------------
/docs/figs/overlap_inner_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/overlap_inner_0.png


--------------------------------------------------------------------------------
/docs/figs/overlap_inner_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/figs/overlap_inner_1.png


--------------------------------------------------------------------------------
/docs/guide-bedtools.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | jupytext:
  3 |   formats: md:myst
  4 |   text_representation:
  5 |     extension: .md
  6 |     format_name: myst
  7 |     format_version: 0.13
  8 |     jupytext_version: 1.11.3
  9 | kernelspec:
 10 |   display_name: Python 3
 11 |   language: python
 12 |   name: python3
 13 | ---
 14 | 
 15 | # Bioframe for bedtools users
 16 | 
 17 | 
 18 | Bioframe is built around the analysis of genomic intervals as a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) in memory, rather than working with tab-delimited text files saved on disk.
 19 | 
 20 | Bioframe supports reading a number of standard genomics text file formats via [`read_table`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.fileops.read_table), including BED files (see [schemas](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py)), which will load them as pandas DataFrames. A complete list of helper functions is [available here](API_fileops).
 21 | 
 22 | Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support the genomic [interval operations in bioframe](API_ops). The names of these columns can also be customized via the `cols=` arguments in bioframe functions.
 23 | 
 24 | For example, with gtf files, you do not need to turn them into bed files, you can directly read them into pandas (with e.g. [gtfparse](https://github.com/openvax/gtfparse/tree/master)). For gtfs, it is often convenient to rename the `'seqname'` column to `'chrom'`, the default column name used in bioframe.
 25 | 
 26 | Finally, if needed, bioframe provides a convenience function to write dataframes to a standard BED file using [`to_bed`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.bed.to_bed).
 27 | 
 28 | 
 29 | ## `bedtools intersect`
 30 | 
 31 | ### Select unique entries from the first bed overlapping the second bed `-u`
 32 | 
 33 | ```sh
 34 | bedtools intersect -u -a A.bed -b B.bed > out.bed
 35 | ```
 36 | 
 37 | ```py
 38 | overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True)
 39 | out = A.loc[overlap['index_1'].unique()]
 40 | ```
 41 | 
 42 | ### Report the number of hits in B `-c`
 43 | 
 44 | Reports 0 for A entries that have no overlap with B.
 45 | 
 46 | ```sh
 47 | bedtools intersect -c -a A.bed -b B.bed > out.bed
 48 | ```
 49 | 
 50 | ```py
 51 | out = bf.count_overlaps(A, B)
 52 | ```
 53 | 
 54 | ### Return entries from both beds for each overlap `-wa -wb`
 55 | 
 56 | ```sh
 57 | bedtools intersect -wa -wb -a A.bed -b B.bed > out.bed
 58 | ```
 59 | 
 60 | ```py
 61 | out = bf.overlap(A, B, how='inner')
 62 | ```
 63 | 
 64 | **Note:** This is called an "inner join", and is analogous to an inner pandas join or merge. The default column suffixes in the output dataframe are `''` (nothing) for A's columns and `'_'` for B's columns.
 65 | 
 66 | ### Include all entries from the first bed, even if no overlap `-loj`
 67 | 
 68 | ```sh
 69 | bedtools intersect -wa -wb -loj -a A.bed -b B.bed > out.bed
 70 | ```
 71 | 
 72 | ```py
 73 | out = bf.overlap(A, B, how='left')
 74 | ```
 75 | 
 76 | **Note:** This is called a "left-outer join".
 77 | 
 78 | ### Select entries from the first bed for each overlap `-wa`
 79 | 
 80 | ```sh
 81 | bedtools intersect -wa -a A.bed -b B.bed > out.bed
 82 | ```
 83 | 
 84 | ```py
 85 | overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True)
 86 | out = A.loc[overlap['index_1']]
 87 | 
 88 | # Alternatively
 89 | out = bf.overlap(A, B, how='inner')[A.columns]
 90 | ```
 91 | 
 92 | > **Note:** This gives one row per overlap and can contain duplicates. The output dataframe of the former method will use the same pandas index as the input dataframe `A`, while the latter result --- the join output --- will have an integer range index, like a pandas merge.
 93 | 
 94 | ### Select entries from the second bed for each overlap `-wb`
 95 | 
 96 | ```sh
 97 | bedtools intersect -wb -a A.bed -b B.bed > out.bed
 98 | ```
 99 | 
100 | ```py
101 | overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True)
102 | out = B.loc[overlap['index_2']]
103 | 
104 | # Alternatively
105 | out = bf.overlap(A, B, how='inner', suffixes=('_', ''))[B.columns]
106 | ```
107 | 
108 | > **Note:** This gives one row per overlap and can contain duplicates. The output dataframe of the former method will use the same pandas index as the input dataframe `B`, while the latter result --- the join output --- will have an integer range index, like a pandas merge.
109 | 
110 | 
111 | ### Intersect multiple beds against A
112 | 
113 | ```sh
114 | bedtools intersect -wa -a A.bed -b B.bed C.bed D.bed > out.bed
115 | ```
116 | 
117 | ```py
118 | others = pd.concat([B, C, D])
119 | overlap = bf.overlap(A, others, how='inner', suffixes=('_1','_2'), return_index=True)
120 | out = A.loc[overlap['index_1']]
121 | ```
122 | 
123 | ### Return everything in A that doesn't overlap with B `-v`
124 | 
125 | ```sh
126 | bedtools intersect -wa -a A.bed -b B.bed -v > out.bed
127 | ```
128 | 
129 | ```py
130 | out = bf.setdiff(A, B)
131 | ```
132 | 
133 | **Note:** We call this a set difference.
134 | 
135 | ### Force strandedness `-s`
136 | 
137 | For intersection
138 | 
139 | ```sh
140 | bedtools intersect -wa -a A.bed -b B.bed -s > out.bed
141 | ```
142 | 
143 | ```py
144 | overlap = bf.overlap(A, B, on=['strand'], suffixes=('_1','_2'), return_index=True, how='inner')
145 | out = A.loc[overlap['index_1']]
146 | ```
147 | 
148 | For non-intersection `-v`
149 | 
150 | ```sh
151 | bedtools intersect -wa -a A.bed -b B.bed -v -s > out.bed
152 | ```
153 | 
154 | ```py
155 | out = bf.setdiff(A, B, on=['strand'])
156 | ```
157 | 
158 | ### Minimum overlap as a fraction of A `-f`
159 | 
160 | We want to keep rows of A that are covered at least 70% by elements from B
161 | 
162 | ```sh
163 | bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed
164 | ```
165 | 
166 | ```py
167 | cov = bf.coverage(A, B)
168 | out = A.loc[(cov['coverage'] / (cov['end'] - cov['start'])) >= 0.70]
169 | 
170 | # Alternatively
171 | out = bf.coverage(A, B).query('coverage / (end - start) >= 0.7')[A.columns]
172 | ```
173 | 


--------------------------------------------------------------------------------
/docs/guide-definitions.rst:
--------------------------------------------------------------------------------
 1 | .. _Definitions:
 2 | 
 3 | Definitions
 4 | ===========
 5 | 
 6 | Interval:
 7 |     - An *interval* is a tuple of integers (start, end) with start <= end.
 8 |     - Coordinates are assumed to be 0-based and intervals half-open (1-based ends) i.e. [start, end).
 9 |     - An interval has a *length* equal to (end - start).
10 |     - A special case where start and end are the same, i.e. [X, X), is interpreted as a *point* (aka an *empty interval*, i.e. an edge between 1-bp bins). A point has zero length.
11 |     - Negative coordinates are permissible for both ends of an interval.
12 | 
13 | Properties of a pair of intervals:
14 |     - Two intervals can either *overlap*, or not. The overlap length = max(0, min(end1, end2) - max(start1, start2)). Empty intervals can have overlap length = 0.
15 |     - When two intervals overlap, the shorter of the two intervals is said to be *contained* in the longer one if the length of their overlap equals the length of the shorter interval. This property is often referred to as nestedness, but we use the term “contained” as it is less ambiguous when describing the relationship of sets of intervals to one interval.
16 |     - If two intervals do not overlap, they have a *distance* = max(0, max(start1, start2) - min(end1, end2)).
17 |     - If two intervals have overlap=0 and distance=0, they are said to be *abutting*.
18 | 
19 | Scaffold:
20 |     - A chromosome, contig or, more generally, a *scaffold* is an interval defined by a unique string and has a length>=0, with start=0 and end=length, implicitly defining an interval [0, length).
21 | 
22 | Genome assembly:
23 |     - The complete set of scaffolds associated with a genome is called an *assembly* (e.g. defined by the reference sequence from NCBI, etc.).
24 | 
25 | Genomic interval:
26 |     - A *genomic interval* is an interval with an associated scaffold, or chromosome, defined by a string, i.e. a triple (chrom, start, end).
27 |     - Genomic intervals on different scaffolds never overlap and do not have a defined distance.
28 |     - Genomic intervals can extend beyond their associated scaffold (e.g. with negative values or values greater than the scaffold length), as this can be useful in downstream applications. If they do, they are not contained by their associated scaffold.
29 |     - A *base-pair* is a special case of a genomic interval with length=1, i.e. (chrom, start, start+1)
30 |     - *strand* is an (optional) property of a genomic interval which specifies an interval’s orientation on its scaffold. Note start and end are still defined with respect to the scaffold’s reference orientation (positive strand), even if the interval lies on the negative strand. Intervals on different strands can either be allowed to overlap or not.
31 | 
32 | View (i.e. a set of Genomic Regions):
33 |     - A genomic *view* is an ordered set of non-overlapping genomic intervals each having a unique name defined by a string. Individual named intervals in a view are *regions*, defined by a quadruple, e.g. (chrom, start, end, name).
34 |     - A view thus specifies a unified 1D coordinate system, i.e. a projection of multiple genomic regions onto a single axis.
35 |     - We define views separately from the scaffolds that make up a genome assembly, as a set of more constrained and ordered genomic regions is often useful for downstream analysis and visualization.
36 |     - An assembly is a special case of a view, where the individual regions correspond to the assembly’s entire scaffolds.
37 | 
38 | Associating genomic intervals with views
39 |     - Similarly to how genomic intervals are associated with a scaffold, they can also be associated with a region from a view with an additional string, making a quadruple (chrom, start, end, view_region). This string must be *cataloged* in the view, i.e. it must match the name of a region in the view. Typically the interval would be contained in its associated view region, or, at the minimum, have a greater overlap with that region than other view regions.
40 |     - If each interval in a set is contained in its associated view region, the set is *contained* in the view.
41 |     - A set of intervals *covers* a view if each region in the view is contained by the union of its associated intervals. Conversely, if a set does not cover all of the view regions, the interval set will have *gaps* relative to that view (stretches of bases not covered by an interval).
42 | 
43 | Properties of sets of genomic intervals:
44 |     - A set of genomic intervals may have overlaps or not. If it does not, it is said to be *overlap-free*.
45 |     - A set of genomic intervals is *tiling* if it: (i) covers the associated view, (ii) is contained in that view, and (iii) is overlap-free. Equivalently, a tiling set of intervals (a) has an initial interval that begins at the start of each region and (b) a final interval that terminates at the end of each region, and (c) every base pair is associated with a unique interval.
46 | 


--------------------------------------------------------------------------------
/docs/guide-quickstart.rst:
--------------------------------------------------------------------------------
 1 | Quickstart
 2 | ==========
 3 | 
 4 | Installation
 5 | ------------
 6 | 
 7 | ::
 8 | 
 9 |     $ pip install bioframe
10 | 
11 | To install the latest development version of `bioframe` from
12 | github, first make a local clone of the github repository:
13 | 
14 | .. code-block:: bash
15 | 
16 |     $ git clone https://github.com/open2c/bioframe
17 | 
18 | Then, compile and install `bioframe` in
19 | `development mode <https://setuptools.readthedocs.io/en/latest/setuptools.html#development-mode>`_. This installs the package without moving it to a system folder, and thus allows for testing changes to the python code on the fly.
20 | 
21 | .. code-block:: bash
22 | 
23 |     $ cd bioframe
24 |     $ pip install -e ./
25 | 


--------------------------------------------------------------------------------
/docs/guide-recipes.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | jupytext:
 3 |   formats: md:myst
 4 |   text_representation:
 5 |     extension: .md
 6 |     format_name: myst
 7 |     format_version: 0.13
 8 |     jupytext_version: 1.11.3
 9 | kernelspec:
10 |   display_name: Python 3
11 |   language: python
12 |   name: python3
13 | ---
14 | 
15 | # How do I
16 | 
17 | ## Obtain overlapping intervals with matching strandedness?
18 | Use overlap with the ``on`` argument:
19 | ```
20 | df = bf.overlap(df1, df2, on=['strand'])
21 | ```
22 | 
23 | ## Obtain overlapping intervals with opposite strandedness?
24 | Overlap then filter pairs of opposite strandedness:
25 | ```
26 | df = bf.overlap(df1, df2)
27 | df = df.loc[df["strand"]!=df["strand_"]]
28 | ```
29 | ## Obtain intervals that exceed 50% coverage by another set of intervals?
30 | Coverage, then filter pairs by fractional coverage:
31 | ```
32 | df = bf.coverage(df1, df2)
33 | df = df[ ( df["coverage"] / (df["end"]-df["start"]) ) >=0.50]
34 | ```
35 | 
36 | ## Shift all intervals on the positive strand by 10bp?
37 | Use pandas indexing:
38 | ```
39 | df.loc[df.strand=="+",["start", "end"]] += 10
40 | ```
41 | 
42 | ## Obtain intervals overlapped by at least 2 intervals from another set?
43 | Count overlaps, then filter:
44 | ```
45 | df = bf.count_overlaps(df1, df2)
46 | df = df[ df["count"] >= 2]
47 | ```
48 | 
49 | ## Find strand-specific downstream genomic features?
50 | Use closest after filtering by strand, and passing the `ignore_upstream=True` argument.
51 | ```
52 | bioframe.closest(df1.loc[df1['strand']=='+'], df2, ignore_upstream=True)
53 | ```
54 | 
55 | For genes, the upstream/downstream direction might be defined by the direction of transcription.
56 | Use `direction_col='strand'` to set up the direction:
57 | ```
58 | bioframe.closest(df1, df2, ignore_upstream=True, direction_col='strand')
59 | ```
60 | 
61 | ## Drop non-autosomes from a bedframe?
62 | Use pandas DataFrame.isin(values):
63 | ```
64 | df[ ~df.chrom.isin(['chrX','chrY'])]
65 | ```
66 | 


--------------------------------------------------------------------------------
/docs/guide-specifications.rst:
--------------------------------------------------------------------------------
 1 | .. _Specifications:
 2 | 
 3 | Specifications
 4 | ==============
 5 | 
 6 | BedFrame (i.e. genomic intervals stored in a pandas dataframe):
 7 |     - In a BedFrame, three required columns specify the set of genomic intervals (default column names = (‘chrom’, ‘start’, ‘end’)).
 8 |     - Other reserved but not required column names: (‘strand’, ‘name’, ‘view_region’).
 9 | 
10 |         - entries in column ‘name’ are expected to be unique
11 |         - ‘view_region’ is expected to point to an associated region in a view with a matching name
12 |         - ‘strand’ is expected to be encoded with strings (‘+’, ‘-’, ‘.’).
13 | 
14 |     - Additional columns are allowed: ‘zodiac_sign’, ‘soundcloud’, ‘twitter_name’, etc.
15 |     - Repeated intervals are allowed.
16 |     - The native pandas DataFrame index is not intended to be used as an immutable lookup table for genomic intervals in BedFrame. This is because many common genomic interval operations change the number of intervals stored in a BedFrame.
17 |     - Two useful sorting schemes for BedFrames are:
18 | 
19 |         - scaffold-sorted: on (chrom, start, end), where chrom is sorted lexicographically.
20 |         - view-sorted: on (view_region, start, end) where view_region is sorted by order in the view.
21 | 
22 |     - Null values are allowed, but only as pd.NA (using np.nan is discouraged as it results in unwanted type re-casting).
23 |     - Note if no ‘view_region’ is assigned to a genomic interval, then ‘chrom’ implicitly defines an associated region.
24 |     - Note the BedFrame specification is a natural extension of the BED format (https://samtools.github.io/hts-specs/BEDv1.pdf) for pandas DataFrames.
25 | 
26 | ViewFrames (a genomic view stored in a pandas dataframe)
27 |     - BedFrame where:
28 | 
29 |            - intervals are non-overlapping
30 |            - “name” column is mandatory and contains a set of unique strings.
31 | 
32 |     - Note that a ViewFrame can potentially be indexed by the name column to serve as a lookup table. This functionality is currently not implemented, because within the current Pandas implementation indexing by a column removes the column from the table.
33 |     - Note that views can be defined by:
34 | 
35 |         - dictionary of string:ints (start=0 assumed) or string:tuples (start,end)
36 |         - pandas series of chromsizes (start=0, name=chrom)
37 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. bioframe documentation master file, created by
 2 |    sphinx-quickstart on Sat Apr 11 11:44:26 2020.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | bioframe
 7 | ========
 8 | 
 9 | `Bioframe <https://github.com/open2c/bioframe>`_ is a library to enable flexible and scalable operations on genomic interval dataframes in python. Building bioframe directly on top of `pandas <https://pandas.pydata.org/>`_ enables immediate access to a rich set of dataframe operations. Working in python enables rapid visualization and iteration of genomic analyses.
10 | 
11 | 
12 | .. toctree::
13 |    :maxdepth: 1
14 |    :caption: Guide
15 | 
16 |    guide-quickstart
17 |    guide-intervalops.md
18 |    guide-io.ipynb
19 |    guide-performance.ipynb
20 |    guide-recipes.md
21 |    guide-definitions
22 |    guide-specifications
23 |    guide-bedtools
24 | 
25 | .. toctree::
26 |    :maxdepth: 1
27 |    :caption: Tutorials
28 | 
29 |    tutorials/tutorial_assign_motifs_to_peaks.ipynb
30 |    tutorials/tutorial_assign_peaks_to_genes.ipynb
31 | 
32 | .. toctree::
33 |    :maxdepth: 3
34 |    :caption: API
35 | 
36 |    api-construction
37 |    api-validation
38 |    api-intervalops
39 |    api-fileops
40 |    api-resources
41 |    api-extras
42 |    api-vis
43 |    api-lowlevel.md
44 | 
45 | 
46 | Indices and tables
47 | ==================
48 | 
49 | * :ref:`genindex`
50 | * :ref:`modindex`
51 | * :ref:`search`
52 | 


--------------------------------------------------------------------------------
/docs/lowlevel/arrops.rst:
--------------------------------------------------------------------------------
1 | Array operations
2 | =================
3 | 
4 | Low level operations that are used to implement the genomic interval operations.
5 | 
6 | .. automodule:: bioframe.core.arrops
7 |    :autosummary:
8 |    :members:
9 | 


--------------------------------------------------------------------------------
/docs/lowlevel/specs.rst:
--------------------------------------------------------------------------------
 1 | Specifications
 2 | =================
 3 | 
 4 | .. automodule:: bioframe.core.specs
 5 |    :autosummary:
 6 |    :members:
 7 | 
 8 | **Unexposed functions:**
 9 | 
10 | .. automethod:: bioframe.core.specs._verify_column_dtypes
11 | .. automethod:: bioframe.core.specs._verify_columns
12 | .. automethod:: bioframe.core.specs._get_default_colnames
13 | 


--------------------------------------------------------------------------------
/docs/lowlevel/stringops.rst:
--------------------------------------------------------------------------------
1 | String operations
2 | =================
3 | 
4 | .. automodule:: bioframe.core.stringops
5 |    :autosummary:
6 |    :members:
7 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/times100.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/docs/times100.bw


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["hatchling", "hatch-vcs"]
  3 | build-backend = "hatchling.build"
  4 | 
  5 | [project]
  6 | name = "bioframe"
  7 | version = "0.8.0"
  8 | description = "Operations and utilities for Genomic Interval Dataframes."
  9 | license = {text = "MIT"}
 10 | authors = [
 11 |   {name = "Open2C", email = "open.chromosome.collective@gmail.com"},
 12 | ]
 13 | keywords = [
 14 |     "pandas",
 15 |     "dataframe",
 16 |     "genomics",
 17 |     "epigenomics",
 18 |     "bioinformatics",
 19 |     "interval operations",
 20 |     "genomic ranges",
 21 |     "bedtools",
 22 |     "bedframe",
 23 |     "viewframe",
 24 |     "bed",
 25 | ]
 26 | classifiers = [
 27 |     "Development Status :: 4 - Beta",
 28 |     "Operating System :: OS Independent",
 29 |     "Programming Language :: Python",
 30 |     "Programming Language :: Python :: 3",
 31 |     "Programming Language :: Python :: 3.8",
 32 |     "Programming Language :: Python :: 3.9",
 33 |     "Programming Language :: Python :: 3.10",
 34 |     "Programming Language :: Python :: 3.11",
 35 |     "Programming Language :: Python :: 3.12",
 36 | ]
 37 | readme = "README.md"
 38 | requires-python = ">=3.8"
 39 | dependencies = [
 40 |     "matplotlib",
 41 |     "numpy>=1.10, <3",
 42 |     "pandas>=1.3",
 43 |     "pyyaml",
 44 |     "requests",
 45 |     "typing-extensions ; python_version<'3.9'",
 46 |     "importlib-metadata ; python_version<'3.8'",
 47 |     "importlib-resources ; python_version<'3.9'",
 48 | ]
 49 | 
 50 | [project.optional-dependencies]
 51 | dev = [
 52 |     "biopython",
 53 |     "pre-commit",
 54 |     "pysam",
 55 |     "pybbi",
 56 |     "pytest",
 57 |     "ruff",
 58 | ]
 59 | test = [
 60 |     "pytest",
 61 | ]
 62 | docs = [
 63 |     "autodocsumm",
 64 |     "myst_nb",
 65 |     "jinja2",
 66 |     "Sphinx",
 67 |     "sphinx-autobuild",
 68 |     "sphinx_rtd_theme",
 69 | ]
 70 | 
 71 | [project.urls]
 72 | homepage = "https://github.com/open2c/bioframe"
 73 | documentation = "https://bioframe.readthedocs.io/en/latest"
 74 | repository = "https://github.com/open2c/bioframe"
 75 | changelog = "https://github.com/open2c/bioframe/blob/main/CHANGES.md"
 76 | 
 77 | [tool.ruff]
 78 | target-version = "py38"  # keep in sync with requires-python (>=3.8)
 79 | exclude = [
 80 |     ".venv",
 81 | ]
 82 | 
 83 | [tool.ruff.lint]
 84 | extend-select = [
 85 |     "B",  # bugbear
 86 |     # "C",  # mccabe complexity
 87 |     # "D",  # pydocstyle
 88 |     "E",  # style errors
 89 |     "F",  # pyflakes
 90 |     "I",  # isort
 91 |     "RUF", # ruff-specific rules
 92 |     "UP", # pyupgrade
 93 |     "W",  # style  warnings
 94 | ]
 95 | 
 96 | [tool.ruff.lint.isort]
 97 | known-first-party = ["bioframe"]
 98 | 
 99 | [tool.ruff.lint.pydocstyle]
100 | convention = "numpy"
101 | 
102 | [tool.pytest.ini_options]
103 | minversion = "7"
104 | log_cli_level = "info"
105 | xfail_strict = true
106 | addopts = [
107 |     "-ra",
108 |     "--showlocals",
109 |     "--strict-config",
110 |     "--strict-markers",
111 | ]
112 | filterwarnings = ["ignore::PendingDeprecationWarning"]
113 | testpaths = ["tests"]
114 | 
115 | [tool.hatch.envs.default]
116 | features = ["dev", "test", "docs"]
117 | 
118 | [tool.hatch.envs.default.scripts]
119 | fix = "ruff check --fix ."
120 | lint = "ruff check bioframe tests"
121 | format = "ruff format bioframe tests"
122 | test = "pytest ."
123 | docs = "sphinx-autobuild docs docs/_build/html"
124 | 
125 | [tool.hatch.envs.test]
126 | features = ["dev", "test"]
127 | 
128 | [[tool.hatch.envs.test.matrix]]
129 | python = ["3.9", "3.10", "3.11", "3.12"]
130 | 


--------------------------------------------------------------------------------
/tests/test_assembly_info.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import pytest
 3 | 
 4 | from bioframe.io.assembly import GenomeAssembly, assemblies_available, assembly_info
 5 | 
 6 | 
 7 | def test_assemblies_available():
 8 |     assemblies = assemblies_available()
 9 |     assert isinstance(assemblies, pd.DataFrame)
10 |     for col in ["provider", "provider_build", "default_roles", "default_units"]:
11 |         assert col in assemblies.columns
12 | 
13 | 
14 | def test_assembly_info():
15 |     hg38 = assembly_info("hg38")
16 |     assert isinstance(hg38, GenomeAssembly)
17 |     assert hg38.provider == "ucsc"
18 |     assert hg38.provider_build == "hg38"
19 |     assert isinstance(hg38.chromsizes, pd.Series)
20 |     assert isinstance(hg38.chromnames, list)
21 |     assert isinstance(hg38.alias_dict, dict)
22 | 
23 |     assert isinstance(hg38.seqinfo, pd.DataFrame)
24 |     for col in ["name", "length", "aliases", "role", "unit"]:
25 |         assert col in hg38.seqinfo.columns
26 | 
27 |     assert isinstance(hg38.viewframe, pd.DataFrame)
28 |     for col in ["chrom", "start", "end", "name"]:
29 |         assert col in hg38.viewframe.columns
30 | 
31 |     hg38 = assembly_info("ucsc.hg38", roles=("assembled", "unlocalized"))
32 |     assert isinstance(hg38, GenomeAssembly)
33 | 
34 |     with pytest.raises(ValueError):
35 |         assembly_info("ncbi.hg38")  # provider-name mismatch
36 | 
37 |     assert isinstance(hg38.cytobands, pd.DataFrame)
38 |     for col in ["chrom", "start", "end", "band", "stain"]:
39 |         assert col in hg38.cytobands.columns
40 | 
41 |     sacCer3 = assembly_info("sacCer3")
42 |     assert sacCer3.cytobands is None
43 | 


--------------------------------------------------------------------------------
/tests/test_bed.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tempfile
  3 | 
  4 | import pandas as pd
  5 | import pytest
  6 | 
  7 | import bioframe
  8 | 
  9 | 
 10 | def test_involution():
 11 |     with tempfile.TemporaryDirectory() as directory:
 12 |         for schema in ['narrowPeak', 'bed12']:
 13 |             bf = bioframe.read_table(f'tests/test_data/{schema}.bed',
 14 |                                      schema=schema)
 15 |             fname = os.path.join(directory, f'{schema}.bed')
 16 |             bioframe.to_bed(bf, fname)
 17 |             involution = bioframe.read_table(fname, schema=schema)
 18 |             pd.testing.assert_frame_equal(bf, involution)
 19 | 
 20 | 
 21 | def test_chrom_validators():
 22 |     with tempfile.TemporaryDirectory() as directory:
 23 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
 24 |         bf.loc[0, 'chrom'] = 'value with space'
 25 |         with pytest.raises(ValueError):
 26 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 27 | 
 28 |         bf.loc[0, 'chrom'] = '' # must be non empty
 29 |         with pytest.raises(ValueError):
 30 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 31 | 
 32 |         bf.loc[0, 'chrom'] = 'a'*300 # must be shorter than 256
 33 |         with pytest.raises(ValueError):
 34 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 35 | 
 36 | 
 37 | def test_end_validators():
 38 |     with tempfile.TemporaryDirectory() as directory:
 39 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
 40 |         bf.loc[0, 'end'] = 10 # end must be after start
 41 |         bf.loc[0, 'start'] = 11
 42 |         with pytest.raises(ValueError):
 43 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 44 | 
 45 | 
 46 | def test_name_validators():
 47 |     with tempfile.TemporaryDirectory() as directory:
 48 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
 49 |         bf.loc[0, 'name'] = '' # must not be empty
 50 |         with pytest.raises(ValueError):
 51 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 52 | 
 53 |         bf.loc[0, 'name'] = 'a'*300 # must be less than 255 char
 54 |         with pytest.raises(ValueError):
 55 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 56 | 
 57 | 
 58 | def test_score_validators():
 59 |     with tempfile.TemporaryDirectory() as directory:
 60 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
 61 |         # negative value is enforced by the normal types
 62 | 
 63 |         bf.loc[0, 'score'] = 1001
 64 |         with pytest.raises(ValueError):
 65 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'), strict_score=True)
 66 | 
 67 |         bf['score'] = '.' # enforced to be a number by the types
 68 |         with pytest.raises(TypeError):
 69 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 70 | 
 71 | 
 72 | def test_strand_validators():
 73 |     with tempfile.TemporaryDirectory() as directory:
 74 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
 75 |         bf.loc[0, 'strand'] = '*'
 76 |         with pytest.raises(ValueError):
 77 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 78 | 
 79 | 
 80 | def test_thick_validators():
 81 |     with tempfile.TemporaryDirectory() as directory:
 82 |         for direction in ['Start', 'End']:
 83 |             bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
 84 |             bf.loc[0, 'start'] = 100
 85 |             bf.loc[0, 'end'] = 1000
 86 |             bf.loc[0, f'thick{direction}'] = 1001
 87 |             with pytest.raises(ValueError):
 88 |                 bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 89 | 
 90 |             bf.loc[0, f'thick{direction}'] = 99
 91 |             with pytest.raises(ValueError):
 92 |                 bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
 93 | 
 94 | 
 95 | def test_itemRgb_validators():
 96 |     with tempfile.TemporaryDirectory() as directory:
 97 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
 98 |         bf["itemRgb"] = bf["itemRgb"].astype(str)
 99 |         bf.loc[0, 'itemRgb'] = 'a,12,13' # must be integers
100 |         with pytest.raises(ValueError):
101 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
102 | 
103 |         bf.loc[0, 'itemRgb'] = '12,13' # must be 1 or 3 integers
104 |         with pytest.raises(ValueError):
105 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
106 | 
107 |         bf.loc[0, 'itemRgb'] = '12,13,14,15' # must be 1 or 3 integers
108 |         with pytest.raises(ValueError):
109 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
110 | 
111 |         bf.loc[0, 'itemRgb'] = '12,13,300' # must be between 0 and 255
112 |         with pytest.raises(ValueError):
113 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
114 | 
115 |         bf.loc[0, 'itemRgb'] = '300' # must be between 0 and 255
116 |         with pytest.raises(ValueError):
117 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
118 | 
119 | 
120 | def test_blockCount_validators():
121 |     with tempfile.TemporaryDirectory() as directory:
122 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
123 |         bf.loc[0, 'blockCount'] = 0
124 |         with pytest.raises(ValueError):
125 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
126 | 
127 | 
128 | def test_blockSizes_validators():
129 |     with tempfile.TemporaryDirectory() as directory:
130 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
131 |         bf.loc[0, 'blockCount'] = 2
132 |         bf.loc[0, 'blockSizes'] = '2,a,'
133 |         with pytest.raises(ValueError):
134 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
135 | 
136 |         bf.loc[0, 'blockCount'] = 2
137 |         bf.loc[0, 'blockSizes'] = '2,2,2,'
138 |         with pytest.raises(ValueError):
139 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
140 | 
141 | 
142 | def test_blockStarts_validators():
143 |     with tempfile.TemporaryDirectory() as directory:
144 |         bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12')
145 |         bf.loc[0, 'blockCount'] = 2
146 |         bf.loc[0, 'blockSizes'] = '2,4,'
147 |         bf.loc[0, 'blockStarts'] = '0,a,'  # non-integer entry
148 |         with pytest.raises(ValueError):
149 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
150 | 
151 |         bf.loc[0, 'blockCount'] = 2
152 |         bf.loc[0, 'blockSizes'] = '1,1,'
153 |         bf.loc[0, 'blockStarts'] = '0,2,5,'  # three starts for a blockCount of 2
154 |         with pytest.raises(ValueError):
155 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
156 | 
157 |         # the only block (size 100) runs past the interval end (10)
158 |         bf.loc[0, 'start'] = 1
159 |         bf.loc[0, 'end'] = 10
160 |         bf.loc[0, 'blockCount'] = 1
161 |         bf.loc[0, 'blockSizes'] = '100,'
162 |         bf.loc[0, 'blockStarts'] = '0,'
163 |         with pytest.raises(ValueError):
164 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
165 | 
166 |         # the only block (size 1) stops short of the interval end (10)
167 |         bf.loc[0, 'start'] = 1
168 |         bf.loc[0, 'end'] = 10
169 |         bf.loc[0, 'blockCount'] = 1
170 |         bf.loc[0, 'blockSizes'] = '1,'
171 |         bf.loc[0, 'blockStarts'] = '0,'
172 |         with pytest.raises(ValueError):
173 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
174 | 
175 |         # the two blocks overlap (starts 0 and 1, each of size 5)
176 |         bf.loc[0, 'start'] = 1
177 |         bf.loc[0, 'end'] = 10
178 |         bf.loc[0, 'blockCount'] = 2
179 |         bf.loc[0, 'blockSizes'] = '5,5,'
180 |         bf.loc[0, 'blockStarts'] = '0,1,'
181 |         with pytest.raises(ValueError):
182 |             bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'))
183 | 


--------------------------------------------------------------------------------
/tests/test_core_construction.py:
--------------------------------------------------------------------------------
  1 | from io import StringIO
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import pytest
  6 | 
  7 | from bioframe.core import construction
  8 | from bioframe.core.construction import from_any
  9 | 
 10 | 
 11 | def test_add_ucsc_name_column():
 12 |     df = pd.DataFrame(
 13 |         {"chrom": [f"chr{i}" for i in range(3)], "start": [1, 2, 3], "end": [4, 5, 6]}
 14 |     )
 15 | 
 16 |     pd.testing.assert_series_equal(
 17 |         construction.add_ucsc_name_column(df)["name"],
 18 |         pd.Series(
 19 |             data=["chr0:1-4", "chr1:2-5", "chr2:3-6"], index=[0, 1, 2], name="name"
 20 |         ),
 21 |     )
 22 | 
 23 | 
 24 | def test_any():
 25 |     ### tests copied from old parse_regions
 26 |     # main functionality: convert to dataframe without adding a name column
 27 |     df = pd.DataFrame(
 28 |         {"chrom": [f"chr{i}" for i in range(3)], "start": [1, 2, 3], "end": [4, 5, 6]}
 29 |     )
 30 |     parsed = from_any(df)
 31 |     assert "name" not in parsed.columns
 32 |     assert parsed.iloc[0]["chrom"] == "chr0"
 33 | 
 34 |     # re-create dataframe from UCSC name alone
 35 |     df2 = pd.DataFrame(
 36 |         {
 37 |             "regions": construction.add_ucsc_name_column(parsed, name_col="regions")[
 38 |                 "regions"
 39 |             ].values
 40 |         }
 41 |     )
 42 |     assert (
 43 |         (from_any(df2, name_col="regions")[["chrom", "start", "end"]] == parsed)
 44 |         .all()
 45 |         .all()
 46 |     )
 47 | 
 48 |     # re-parsing results yields the same
 49 |     assert (from_any(parsed) == parsed).all().all()
 50 | 
 51 |     # extra columns don't get overwritten
 52 |     df["name"] = "test-value"
 53 |     assert (from_any(df)["name"] == df["name"]).all()
 54 | 
 55 |     # a null start is filled in as 0 when fill_null is supplied
 56 |     assert from_any([("chr1", None, 5)], fill_null={"chr1": 10})["start"].values[0] == 0
 57 | 
 58 |     # pull end from chromsizes
 59 |     p2 = from_any([("chr1", 5, None)], fill_null={"chr1": 40})
 60 |     assert list(p2.values[0]) == ["chr1", 5, 40]
 61 | 
 62 |     # We could keep things as None if chromsizes were not provided
 63 |     p3 = from_any(["chr1", "chr2"], fill_null=False)
 64 |     assert list(p3.values[0]) == ["chr1", None, None]
 65 | 
 66 |     # parse a UCSC string with thousands separators and a unit suffix
 67 |     p8 = from_any(["chr1:1,000,000-4M"])
 68 |     assert list(p8.values[0]) == ["chr1", 1000000, 4000000]
 69 |     # a bare chromosome name leaves start and end as None
 70 |     p9 = from_any(["chr1"])
 71 |     assert list(p9.values[0]) == ["chr1", None, None]
 72 | 
 73 |     with pytest.raises(ValueError):
 74 |         from_any([("ch1", 1, 2, "chr1:1-2", "puppies")])  # puppies are not allowed
 75 |     # a fill_null mapping that lacks the requested chromosome is an error
 76 |     with pytest.raises(ValueError):
 77 |         from_any([("chr1", 5, None)], fill_null={"chr2": 40})
 78 | 
 79 |     # input tuple of tuples
 80 |     p2 = from_any((("chr1", 5, 10), ("chrX", 10, 20)))
 81 |     assert list(p2.values[0]) == ["chr1", 5, 10]
 82 | 
 83 |     # input tuple of lists
 84 |     p2 = from_any((["chr1", 5, 10], ["chrX", 10, 20]))
 85 |     assert list(p2.values[0]) == ["chr1", 5, 10]
 86 | 
 87 |     # input tuple of ucsc strings
 88 |     p2 = from_any(("chr1:5-10",))
 89 |     assert list(p2.values[0]) == ["chr1", 5, 10]
 90 | 
 91 |     # input single tuple
 92 |     p2 = from_any(("chr1", 5, 10))
 93 |     assert list(p2.values[0]) == ["chr1", 5, 10]
 94 | 
 95 | 
 96 | def test_sanitize_bedframe():
 97 |     df1 = pd.DataFrame(
 98 |         [
 99 |             ["chr1", 10, 20],
100 |             ["chr1", 10, 20],
101 |             ["chr1", 15, np.nan],
102 |             ["chr1", pd.NA, 25],
103 |         ],
104 |         columns=["chrom", "start", "end"],
105 |     )
106 | 
107 |     # drop rows with null values
108 |     sanitized_df1 = pd.DataFrame(
109 |         [["chr1", 10, 20], ["chr1", 10, 20]], columns=["chrom", "start", "end"]
110 |     )
111 |     sanitized_df1 = sanitized_df1.astype(
112 |         {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
113 |     )
114 |     pd.testing.assert_frame_equal(
115 |         sanitized_df1, construction.sanitize_bedframe(df1, drop_null=True)
116 |     )
117 | 
118 |     # keep rows with null, but recast
119 |     sanitized_df1 = pd.DataFrame(
120 |         [
121 |             ["chr1", 10, 20],
122 |             ["chr1", 10, 20],
123 |             [pd.NA, pd.NA, pd.NA],
124 |             [pd.NA, pd.NA, pd.NA],
125 |         ],
126 |         columns=["chrom", "start", "end"],
127 |     )
128 |     sanitized_df1 = sanitized_df1.astype(
129 |         {"chrom": object, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
130 |     )
131 |     pd.testing.assert_frame_equal(
132 |         sanitized_df1.fillna(-1), construction.sanitize_bedframe(df1).fillna(-1)
133 |     )
134 | 
135 |     # flip intervals as well as drop NA
136 |     df1 = pd.DataFrame(
137 |         [
138 |             ["chr1", 20, 10],
139 |             ["chr1", pd.NA, 25],
140 |         ],
141 |         columns=["chrom", "start", "end"],
142 |     )
143 |     sanitized_df1 = pd.DataFrame([["chr1", 10, 20]], columns=["chrom", "start", "end"])
144 |     sanitized_df1 = sanitized_df1.astype(
145 |         {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
146 |     )
147 |     pd.testing.assert_frame_equal(
148 |         sanitized_df1,
149 |         construction.sanitize_bedframe(
150 |             df1, start_exceed_end_action="fLiP", drop_null=True
151 |         ),
152 |     )
153 | 
154 |     # flip intervals as well as drop NA
155 |     df1 = pd.DataFrame(
156 |         [
157 |             ["chr1", 20, 10],
158 |             ["chr1", pd.NA, 25],
159 |         ],
160 |         columns=["chrom", "start", "end"],
161 |     )
162 |     sanitized_df1 = pd.DataFrame([["chr1", 10, 20]], columns=["chrom", "start", "end"])
163 |     sanitized_df1 = sanitized_df1.astype(
164 |         {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
165 |     )
166 |     assert construction.sanitize_bedframe(
167 |         df1, start_exceed_end_action="drop", drop_null=True
168 |     ).empty
169 | 
170 | 
171 | def test_make_viewframe():
172 |     # dict input (chrom -> length): regions span 0..length, named UCSC-style
173 |     view_df = pd.DataFrame(
174 |         [
175 |             ["chrTESTX", 0, 10, "chrTESTX:0-10"],
176 |             ["chrTESTX_p", 0, 12, "chrTESTX_p:0-12"],
177 |         ],
178 |         columns=["chrom", "start", "end", "name"],
179 |     )
180 |     pd.testing.assert_frame_equal(
181 |         view_df.copy(),
182 |         construction.make_viewframe(
183 |             {"chrTESTX": 10, "chrTESTX_p": 12}, name_style="ucsc"
184 |         ),
185 |     )
186 | 
187 |     # list input: (chrom, start, end) tuples give the same frame
188 |     region_list = [("chrTESTX", 0, 10), ("chrTESTX_p", 0, 12)]
189 |     pd.testing.assert_frame_equal(
190 |         view_df.copy(),
191 |         construction.make_viewframe(region_list, name_style="ucsc"),
192 |     )
193 | 
194 |     # pd.Series input; with name_style=None the chrom doubles as the name
195 |     chromsizes = pd.Series(data=[5, 8], index=["chrTESTXq", "chrTEST_2p"])
196 |     d = """      chrom  start  end        name
197 |     0  chrTESTXq      0    5   chrTESTXq
198 |     1   chrTEST_2p      0    8  chrTEST_2p"""
199 |     view_df = pd.read_csv(StringIO(d), sep=r"\s+")
200 |     pd.testing.assert_frame_equal(
201 |         view_df.copy(), construction.make_viewframe(chromsizes, name_style=None)
202 |     )
203 |     # same Series with UCSC-style names; name_style is given in upper case here
204 |     d = """          chrom   start   end name
205 |     0   chrTESTXq   0   5   chrTESTXq:0-5
206 |     1   chrTEST_2p  0   8   chrTEST_2p:0-8"""
207 |     view_df = pd.read_csv(StringIO(d), sep=r"\s+")
208 |     pd.testing.assert_frame_equal(
209 |         view_df.copy(),
210 |         construction.make_viewframe(chromsizes, name_style="UCSC"),
211 |     )
212 | 
213 |     # pd.DataFrame input: a frame that already has names comes back unchanged
214 |     pd.testing.assert_frame_equal(view_df.copy(), construction.make_viewframe(view_df))
215 | 
216 |     # if you provide unique names, this is accepted unchanged by make_viewframe
217 |     view_df = pd.DataFrame(
218 |         [["chrTESTX", 0, 10, "chrTEST_1"], ["chrTESTY", 0, 12, "chrTEST_2"]],
219 |         columns=["chrom", "start", "end", "name"],
220 |     )
221 | 
222 |     region_list = [("chrTESTX", 0, 10, "chrTEST_1"), ("chrTESTY", 0, 12, "chrTEST_2")]
223 | 
224 |     pd.testing.assert_frame_equal(
225 |         view_df.copy(), construction.make_viewframe(region_list)
226 |     )
227 | 
228 |     pd.testing.assert_frame_equal(view_df.copy(), construction.make_viewframe(view_df))
229 |     # regions that fit inside check_bounds are accepted unchanged
230 |     pd.testing.assert_frame_equal(
231 |         view_df.copy(),
232 |         construction.make_viewframe(
233 |             view_df, check_bounds={"chrTESTX": 11, "chrTESTY": 13}
234 |         ),
235 |     )
236 |     # a region past its bound (chrTESTX ends at 10, bound is 9) raises
237 |     with pytest.raises(ValueError):
238 |         construction.make_viewframe(
239 |             view_df, check_bounds={"chrTESTX": 9, "chrTESTY": 13}
240 |         )
241 | 


--------------------------------------------------------------------------------
/tests/test_core_specs.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import pytest
  4 | 
  5 | import bioframe
  6 | from bioframe.core import specs
  7 | 
  8 | 
  9 | def test_get_default_colnames():
 10 |     assert specs._get_default_colnames() == ("chrom", "start", "end")
 11 | 
 12 | 
 13 | def test_update_default_colnames():
 14 |     new_names = ("C", "chromStart", "chromStop")
 15 |     specs.update_default_colnames(new_names)
 16 |     assert specs._get_default_colnames() == new_names
 17 | 
 18 |     # test that with updated default column names, bioframe.ops recognizes df1
 19 |     df1 = pd.DataFrame(
 20 |         [["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]],
 21 |         columns=list(new_names),
 22 |     )
 23 |     df1_chromsizes = {"chr1": 100, "chrX": 100}
 24 | 
 25 |     df1_complement = pd.DataFrame(
 26 |         [
 27 |             ["chr1", 0, 1, "chr1"],
 28 |             ["chr1", 10, 12, "chr1"],
 29 |             ["chr1", 14, 100, "chr1"],
 30 |             ["chrX", 0, 100, "chrX"],
 31 |         ],
 32 |         columns=[*list(new_names), "view_region"],
 33 |     )
 34 | 
 35 |     pd.testing.assert_frame_equal(
 36 |         bioframe.complement(df1, view_df=df1_chromsizes), df1_complement
 37 |     )
 38 | 
 39 |     # cannot update with just two colujmns
 40 |     with pytest.raises(ValueError):
 41 |         specs.update_default_colnames(("chromosome", "position"))
 42 | 
 43 |     # extra stuff is not allowed
 44 |     with pytest.raises(ValueError):
 45 |         specs.update_default_colnames(["chromosome", "start", "end", "extrasuff"])
 46 | 
 47 |     # reset to default
 48 |     specs.update_default_colnames(("chrom", "start", "end"))
 49 | 
 50 | 
 51 | def test_verify_columns():
 52 |     new_names = ("C", "chromStart", "chromStop")
 53 |     df1 = pd.DataFrame(
 54 |         [["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]],
 55 |         columns=list(new_names),
 56 |     )
 57 | 
 58 |     with pytest.raises(ValueError):
 59 |         specs._verify_columns(df1, specs._get_default_colnames())
 60 | 
 61 |     assert specs._verify_columns(
 62 |         df1,
 63 |         new_names,
 64 |         return_as_bool=True,
 65 |     )
 66 | 
 67 |     # no repeated column names
 68 |     with pytest.raises(ValueError):
 69 |         specs._verify_columns(df1, ["chromStart", "chromStart"], unique_cols=True)
 70 | 
 71 | 
 72 | def test_verify_column_dtypes():
 73 |     new_names = ("C", "chromStart", "chromStop")
 74 |     df1 = pd.DataFrame(
 75 |         [["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]],
 76 |         columns=list(new_names),
 77 |     )
 78 | 
 79 |     with pytest.raises(ValueError):
 80 |         specs._verify_column_dtypes(df1, specs._get_default_colnames())
 81 | 
 82 |     assert specs._verify_column_dtypes(df1, new_names, return_as_bool=True)
 83 | 
 84 |     df1["chromStart"] = df1["chromStart"].astype(float)
 85 |     assert not specs._verify_column_dtypes(df1, new_names, return_as_bool=True)
 86 | 
 87 |     df1["chromStart"] = df1["chromStart"].astype(pd.Int64Dtype())
 88 |     assert specs._verify_column_dtypes(df1, new_names, return_as_bool=True)
 89 | 
 90 |     df1["C"] = df1["C"].str.replace("chr", "").astype(np.int64)
 91 |     assert not specs._verify_column_dtypes(df1, new_names, return_as_bool=True)
 92 | 
 93 | 
 94 | def test_is_chrom_dtype():
 95 |     assert specs.is_chrom_dtype(str)
 96 |     fruit = pd.CategoricalDtype(
 97 |         categories=["oranges", "grapefruit", "apples"], ordered=True
 98 |     )
 99 |     assert specs.is_chrom_dtype(fruit)
100 |     assert not specs.is_chrom_dtype(int)
101 |     assert not specs.is_chrom_dtype(float)
102 | 


--------------------------------------------------------------------------------
/tests/test_core_stringops.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pytest
 3 | 
 4 | from bioframe.core import stringops
 5 | from bioframe.core.stringops import parse_region
 6 | 
 7 | 
 8 | def test_to_ucsc_string():
 9 |     assert stringops.to_ucsc_string(("chr21", 1, 4)) == "chr21:1-4"
10 | 
11 | 
12 | def test_parse_region():
13 |     # UCSC-style names
14 |     assert parse_region("chr21") == ("chr21", 0, None)
15 |     assert parse_region("chr21:1000-2000") == ("chr21", 1000, 2000)
16 |     assert parse_region("chr21:1,000-2,000") == ("chr21", 1000, 2000)
17 | 
18 |     # Ensembl style names
19 |     assert parse_region("6") == ("6", 0, None)
20 |     assert parse_region("6:1000-2000") == ("6", 1000, 2000)
21 |     assert parse_region("6:1,000-2,000") == ("6", 1000, 2000)
22 | 
23 |     # FASTA style names
24 |     assert parse_region("gb|accession|locus") == ("gb|accession|locus", 0, None)
25 |     assert parse_region("gb|accession|locus:1000-2000") == (
26 |         "gb|accession|locus",
27 |         1000,
28 |         2000,
29 |     )
30 |     assert parse_region("gb|accession|locus:1,000-2,000") == (
31 |         "gb|accession|locus",
32 |         1000,
33 |         2000,
34 |     )
35 | 
36 |     # Punctuation in names (aside from :)
37 |     assert parse_region("name-with-hyphens-") == ("name-with-hyphens-", 0, None)
38 |     assert parse_region("GL000207.1") == ("GL000207.1", 0, None)
39 |     assert parse_region("GL000207.1:1000-2000") == ("GL000207.1", 1000, 2000)
40 | 
41 |     # Trailing dash
42 |     assert parse_region("chr21:1000-") == ("chr21", 1000, None)
43 | 
44 |     # Humanized units
45 |     assert parse_region("6:1kb-2kb") == ("6", 1000, 2000)
46 |     assert parse_region("6:1k-2000") == ("6", 1000, 2000)
47 |     assert parse_region("6:1kb-2M") == ("6", 1000, 2000000)
48 |     assert parse_region("6:1Gb-") == ("6", 1000000000, None)
49 | 
50 |     with pytest.raises(ValueError):
51 |         parse_region("chr1:2,000-1,000")  # reverse selection
52 | 
53 |     with pytest.raises(ValueError):
54 |         parse_region("chr1::1000-2000")  # more than one colon
55 | 
56 | 
57 | def test_parse_region_string():
58 |     assert stringops.parse_region_string("6:1kb-2kb") == ("6", 1000, 2000)
59 |     assert stringops.parse_region_string("6:1,000-2,000") == ("6", 1000, 2000)
60 |     assert stringops.parse_region_string("c6:1000-2000") == ("c6", 1000, 2000)
61 | 
62 | 
63 | def test_is_complete_ucsc_string():
64 |     assert stringops.is_complete_ucsc_string("chrX:1M-2M")
65 |     assert not stringops.is_complete_ucsc_string("chrX")
66 |     assert not stringops.is_complete_ucsc_string("1M-2M")
67 |     assert not stringops.is_complete_ucsc_string(1000)
68 |     assert not stringops.is_complete_ucsc_string(np.array([100, 200]))
69 |     assert not stringops.is_complete_ucsc_string(np.array(["chr1:100-200"]))
70 | 


--------------------------------------------------------------------------------
/tests/test_data/bed12.bed:
--------------------------------------------------------------------------------
 1 | chr19	54331773	54620705	5C_304_ENm007_FOR_1.5C_304_ENm007_REV_40	1000	.	54331773	54620705	0	2	14528,19855,	0,269077,
 2 | chr19	54461360	54620705	5C_304_ENm007_FOR_26.5C_304_ENm007_REV_40	1000	.	54461360	54620705	0	2	800,19855,	0,139490,
 3 | chr5	131346229	132145236	5C_299_ENm002_FOR_241.5C_299_ENm002_REV_33	1000	.	131346229	132145236	0	2	2609,2105,	0,796902,
 4 | chr21	35037188	35285017	5C_302_ENm005_FOR_339.5C_302_ENm005_REV_403	1000	.	35037188	35285017	0	2	10878,8825,	0,239004,
 5 | chr19	54357838	54691409	5C_304_ENm007_FOR_4.5C_304_ENm007_REV_51	1000	.	54357838	54691409	0	2	1055,14125,	0,319446,
 6 | chr7	115924626	116693495	5C_298_ENm001_FOR_286.5C_298_ENm001_REV_28	1000	.	115924626	116693495	0	2	4890,1441,	0,767428,
 7 | chr19	54600850	54772278	5C_304_ENm007_FOR_62.5C_304_ENm007_REV_40	1000	.	54600850	54772278	0	2	19855,8187,	0,163241,
 8 | chr19	54359237	54620705	5C_304_ENm007_FOR_6.5C_304_ENm007_REV_40	1000	.	54359237	54620705	0	2	1949,19855,	0,241613,
 9 | chr19	54461360	54893239	5C_304_ENm007_FOR_26.5C_304_ENm007_REV_85	1000	.	54461360	54893239	0	2	800,11088,	0,420791,
10 | chr7	116754962	117005110	5C_298_ENm001_FOR_383.5C_298_ENm001_REV_305	1000	.	116754962	117005110	0	2	10635,363,	0,249785,
11 | chr11	116617499	116757175	5C_300_ENm003_FOR_46.5C_300_ENm003_REV_79	1000	.	116617499	116757175	0	2	2921,19431,	0,120245,
12 | chr22	32920308	33427592	5C_301_ENm004_FOR_338.5C_301_ENm004_REV_218	1000	.	32920308	33427592	0	2	7415,8621,	0,498663,
13 | chr11	1748200	2195481	5C_308_ENm011_FOR_3.5C_308_ENm011_REV_63	1000	.	1748200	2195481	0	2	5843,9589,	0,437692,
14 | chr7	115924626	116158598	5C_298_ENm001_FOR_106.5C_298_ENm001_REV_28	1000	.	115924626	116158598	0	2	4890,1491,	0,232481,
15 | chr16	62281851	62641443	5C_997_ENr313_FOR_118.5C_997_ENr313_REV_2	1000	.	62281851	62641443	0	2	2408,2547,	0,357045,
16 | chr7	116434729	117617181	5C_298_ENm001_FOR_590.5C_298_ENm001_REV_203	1000	.	116434729	117617181	0	2	19679,7324,	0,1175128,
17 | chr7	116544149	116693495	5C_298_ENm001_FOR_286.5C_298_ENm001_REV_236	1000	.	116544149	116693495	0	2	3475,1441,	0,147905,
18 | chr11	1789267	2195481	5C_308_ENm011_FOR_8.5C_308_ENm011_REV_63	1000	.	1789267	2195481	0	2	3188,9589,	0,396625,
19 | chr7	116434729	116693495	5C_298_ENm001_FOR_286.5C_298_ENm001_REV_203	1000	.	116434729	116693495	0	2	19679,1441,	0,257325,
20 | chr7	116849860	117617181	5C_298_ENm001_FOR_590.5C_298_ENm001_REV_342	1000	.	116849860	117617181	0	2	15082,7324,	0,759997,
21 | chr22	32544939	33427592	5C_301_ENm004_FOR_338.5C_301_ENm004_REV_131	1000	.	32544939	33427592	0	2	4212,8621,	0,874032,
22 | chr19	54429407	54620705	5C_304_ENm007_FOR_20.5C_304_ENm007_REV_40	1000	.	54429407	54620705	0	2	7487,19855,	0,171443,
23 | chr19	54764091	54893239	5C_304_ENm007_FOR_62.5C_304_ENm007_REV_85	1000	.	54764091	54893239	0	2	8187,11088,	0,118060,
24 | chr16	62431952	62769565	5C_997_ENr313_FOR_46.5C_997_ENr313_REV_159	1000	.	62431952	62769565	0	2	4031,3833,	0,333780,
25 | chr21	35029593	35285017	5C_302_ENm005_FOR_337.5C_302_ENm005_REV_403	1000	.	35029593	35285017	0	2	6085,8825,	0,246599,
26 | chr5	131346229	132146235	5C_299_ENm002_FOR_242.5C_299_ENm002_REV_33	1000	.	131346229	132146235	0	2	2609,999,	0,799007,
27 | chr19	54600850	54703388	5C_304_ENm007_FOR_55.5C_304_ENm007_REV_40	1000	.	54600850	54703388	0	2	19855,7848,	0,94690,
28 | chrX	153198557	153625659	5C_303_ENm006_FOR_84.5C_303_ENm006_REV_17	1000	.	153198557	153625659	0	2	15711,11331,	0,415771,
29 | chr7	115861595	116766876	5C_298_ENm001_FOR_306.5C_298_ENm001_REV_13	1000	.	115861595	116766876	0	2	9373,1279,	0,904002,
30 | chr22	32920308	33282103	5C_301_ENm004_FOR_300.5C_301_ENm004_REV_218	1000	.	32920308	33282103	0	2	7415,1101,	0,360694,
31 | 


--------------------------------------------------------------------------------
/tests/test_data/bed9.bed:
--------------------------------------------------------------------------------
 1 | chr1	193500	194500	.	400	+	.	.	179,45,0
 2 | chr1	618500	619500	.	700	+	.	.	179,45,0
 3 | chr1	974500	975500	.	1000	+	.	.	179,45,0
 4 | chr1	1301500	1302500	.	1000	+	.	.	179,45,0
 5 | chr1	1479500	1480500	.	1000	+	.	.	179,45,0
 6 | chr1	2154500	2155500	.	800	+	.	.	179,45,0
 7 | chr1	2450500	2451500	.	900	+	.	.	179,45,0
 8 | chr1	3719500	3720500	.	700	+	.	.	179,45,0
 9 | chr1	4084500	4085500	.	600	+	.	.	179,45,0
10 | chr1	6292500	6293500	.	900	+	.	.	179,45,0
11 | chr1	6507500	6508500	.	900	+	.	.	179,45,0
12 | chr1	8182500	8183500	.	700	+	.	.	179,45,0
13 | chr1	8988500	8989500	.	1000	+	.	.	179,45,0
14 | chr1	9483500	9484500	.	900	+	.	.	179,45,0
15 | chr1	9815500	9816500	.	900	+	.	.	179,45,0
16 | chr1	10146500	10147500	.	900	+	.	.	179,45,0
17 | chr1	11023500	11024500	.	1000	+	.	.	179,45,0
18 | chr1	11266500	11267500	.	800	+	.	.	179,45,0
19 | chr1	11971500	11972500	.	1000	+	.	.	179,45,0
20 | chr1	12172500	12173500	.	1000	+	.	.	179,45,0
21 | chr1	13145500	13146500	.	400	+	.	.	179,45,0
22 | chr1	13464500	13465500	.	400	+	.	.	179,45,0
23 | chr1	14030500	14031500	.	600	+	.	.	179,45,0
24 | chr1	16068500	16069500	.	900	+	.	.	179,45,0
25 | chr1	16486500	16487500	.	900	+	.	.	179,45,0
26 | chr1	16756500	16757500	.	1000	+	.	.	179,45,0
27 | chr1	17035500	17036500	.	700	+	.	.	179,45,0
28 | chr1	17306500	17307500	.	700	+	.	.	179,45,0
29 | chr1	18393500	18394500	.	400	+	.	.	179,45,0
30 | chr1	19383500	19384500	.	700	+	.	.	179,45,0
31 | 


--------------------------------------------------------------------------------
/tests/test_data/jaspar.bed:
--------------------------------------------------------------------------------
 1 | chr1	10470	10489	CTCF	803	390	-
 2 | chr1	11163	11182	CTCF	811	406	-
 3 | chr1	11222	11241	CTCF	959	804	-
 4 | chr1	11280	11299	CTCF	939	728	-
 5 | chr1	11339	11358	CTCF	837	455	-
 6 | chr1	11401	11420	CTCF	829	439	-
 7 | chr1	11413	11432	CTCF	803	390	+
 8 | chr1	13282	13301	CTCF	800	385	-
 9 | chr1	14230	14249	CTCF	817	416	-
10 | chr1	15227	15246	CTCF	806	396	-
11 | chr1	15626	15645	CTCF	830	442	-
12 | chr1	16650	16669	CTCF	826	433	+
13 | chr1	17091	17110	CTCF	821	423	+
14 | chr1	17925	17944	CTCF	806	395	+
15 | chr1	18119	18138	CTCF	807	398	+
16 | chr1	18357	18376	CTCF	808	400	-
17 | chr1	18487	18506	CTCF	810	403	-
18 | chr1	19817	19836	CTCF	804	392	-
19 | chr1	22561	22580	CTCF	806	396	+
20 | chr1	23446	23465	CTCF	800	385	+
21 | chr1	23872	23891	CTCF	823	428	-
22 | chr1	24781	24800	CTCF	892	584	-
23 | chr1	24939	24958	CTCF	828	438	+
24 | chr1	26053	26072	CTCF	832	446	-
25 | chr1	26085	26104	CTCF	843	468	-
26 | chr1	32074	32093	CTCF	803	391	-
27 | chr1	34397	34416	CTCF	803	391	-
28 | chr1	34941	34960	CTCF	815	412	+
29 | chr1	35952	35971	CTCF	807	397	-
30 | chr1	36202	36221	CTCF	807	397	+
31 | 


--------------------------------------------------------------------------------
/tests/test_data/narrowPeak.bed:
--------------------------------------------------------------------------------
 1 | chr19	48309541	48309911	.	1000	.	5.04924	-1.00000	0.00438	185
 2 | chr4	130563716	130564086	.	993	.	5.05052	-1.00000	0.00432	185
 3 | chr1	200622507	200622877	.	591	.	5.05489	-1.00000	0.00400	185
 4 | chr5	112848447	112848817	.	869	.	5.05841	-1.00000	0.00441	185
 5 | chr1	145960616	145960986	.	575	.	5.05955	-1.00000	0.00439	185
 6 | chr9	9912714	9913084	.	563	.	5.06079	-1.00000	0.00434	185
 7 | chr6	2744599	2744969	.	795	.	5.06457	-1.00000	0.00401	185
 8 | chr9	124777413	124777783	.	1000	.	5.06479	-1.00000	0.00402	185
 9 | chr1	67701045	67701415	.	780	.	5.06708	-1.00000	0.00416	185
10 | chr10	119859586	119859956	.	825	.	5.08015	-1.00000	0.00362	185
11 | chr3	66816327	66816697	.	1000	.	5.08233	-1.00000	0.00379	185
12 | chr16	50248791	50249161	.	579	.	5.08249	-1.00000	0.00380	185
13 | chr19	41431677	41432047	.	1000	.	5.11060	-1.00000	0.00876	185
14 | chr4	131644839	131645209	.	1000	.	5.11204	-1.00000	0.00855	185
15 | chr2	203239519	203239889	.	753	.	5.11817	-1.00000	0.00755	185
16 | chr1	108877017	108877387	.	1000	.	5.12519	-1.00000	0.00777	185
17 | chr1	23665426	23665796	.	1000	.	5.12618	-1.00000	0.00712	185
18 | chr15	78415607	78415977	.	1000	.	5.14402	-1.00000	0.00913	185
19 | chr9	3181837	3182207	.	1000	.	5.14438	-1.00000	0.00903	185
20 | chr10	50275876	50276246	.	1000	.	5.14891	-1.00000	0.00867	185
21 | chr17	27388554	27388924	.	1000	.	5.15031	-1.00000	0.00809	185
22 | chr1	241485905	241486275	.	1000	.	5.16030	-1.00000	0.00723	185
23 | chr18	56995779	56996149	.	827	.	5.16128	-1.00000	0.00708	185
24 | chr11	24558049	24558419	.	620	.	5.16788	-1.00000	0.00557	185
25 | chr4	109134575	109134945	.	567	.	5.16876	-1.00000	0.00550	185
26 | chr10	84214795	84215165	.	1000	.	5.17597	-1.00000	0.00540	185
27 | chr20	4233733	4234103	.	1000	.	5.17899	-1.00000	0.00497	185
28 | chr2	130356160	130356530	.	1000	.	5.18574	-1.00000	0.00660	185
29 | chr18	55322509	55322879	.	865	.	5.19245	-1.00000	0.00626	185
30 | chr8	126510457	126510827	.	552	.	5.19561	-1.00000	0.00554	185
31 | 


--------------------------------------------------------------------------------
/tests/test_data/test.chrom.sizes:
--------------------------------------------------------------------------------
1 | chrTESTX	5
2 | chrTEST2	7
3 | 


--------------------------------------------------------------------------------
/tests/test_data/test.fa:
--------------------------------------------------------------------------------
1 | >chrTESTX
2 | AtGcN
3 | >chrTEST2
4 | NGATCNN
5 | 


--------------------------------------------------------------------------------
/tests/test_data/test.fa.fai:
--------------------------------------------------------------------------------
1 | chrTESTX	5	10	5	6
2 | chrTEST2	7	26	7	8
3 | 


--------------------------------------------------------------------------------
/tests/test_data/toy.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/tests/test_data/toy.bam


--------------------------------------------------------------------------------
/tests/test_data/toy.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/bioframe/4fe9b255547e4dc47f8d7b4c9a3438f12738cd3d/tests/test_data/toy.bam.bai


--------------------------------------------------------------------------------
/tests/test_data/toy.sam:
--------------------------------------------------------------------------------
 1 | @SQ	SN:ref	LN:45
 2 | @SQ	SN:ref2	LN:40
 3 | r001	163	ref	7	30	8M4I4M1D3M	=	37	39	TTAGATAAAGAGGATACTG	*	XX:B:S,12561,2,20,112
 4 | r002	0	ref	9	30	1S2I6M1P1I1P1I4M2I	*	0	0	AAAAGATAAGGGATAAA	*
 5 | r003	0	ref	9	30	5H6M	*	0	0	AGCTAA	*
 6 | r004	0	ref	16	30	6M14N1I5M	*	0	0	ATAGCTCTCAGC	*
 7 | r003	16	ref	29	30	6H5M	*	0	0	TAGGC	*
 8 | r001	83	ref	37	30	9M	=	7	-39	CAGCGCCAT	*
 9 | x1	0	ref2	1	30	20M	*	0	0	aggttttataaaacaaataa	????????????????????
10 | x2	0	ref2	2	30	21M	*	0	0	ggttttataaaacaaataatt	?????????????????????
11 | x3	0	ref2	6	30	9M4I13M	*	0	0	ttataaaacAAATaattaagtctaca	??????????????????????????
12 | x4	0	ref2	10	30	25M	*	0	0	CaaaTaattaagtctacagagcaac	?????????????????????????
13 | x5	0	ref2	12	30	24M	*	0	0	aaTaattaagtctacagagcaact	????????????????????????
14 | x6	0	ref2	14	30	23M	*	0	0	Taattaagtctacagagcaacta	???????????????????????
15 | 


--------------------------------------------------------------------------------
/tests/test_fileops.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from io import StringIO
 3 | 
 4 | import pandas as pd
 5 | import pytest
 6 | 
 7 | import bioframe
 8 | 
 9 | is_big_endian = sys.byteorder == "big"
10 | 
11 | 
12 | ############# tests #####################
13 | def test_read_table():  # read_table: schema padding, strict mode, bedpe
14 |     d = """chr1\nchr2\nchr2"""  # chromosome names only, one per line
15 |     assert bioframe.read_table(StringIO(d), schema="bed3").shape == (3, 3)
16 | 
17 |     # strict mode: a schema column left entirely NA must raise ValueError
18 |     with pytest.raises(ValueError):
19 |         bioframe.read_table(StringIO(d), schema="bed3", schema_is_strict=True)
20 | 
21 |     # schema_is_strict=False (the default): pad with NaNs up to the schema width
22 |     d = """chr1      5    10
23 |            chr1     10   20
24 |            chr2    30  40"""
25 |     assert bioframe.read_table(StringIO(d), schema="bed3", sep="\\s+").shape == (3, 3)
26 |     assert bioframe.read_table(StringIO(d), schema="bed6", sep="\\s+").shape == (3, 6)
27 |     assert bioframe.read_table(StringIO(d), schema="bed12", sep="\\s+").shape == (3, 12)
28 | 
29 |     # bedpe has 10 columns; all 10 are supplied here, so strict mode must pass
30 |     d = """chr1    5    10  chr2   5   10   interval1  .  +  -
31 |            chr1    10   20  chr1   5   10   interval2  .  +  -
32 |            chr2    30   40  chr2   5   10   interval3  12  +  -
33 |         """
34 |     assert bioframe.read_table(
35 |         StringIO(d), schema="bedpe", sep=r"\s+", schema_is_strict=True
36 |     ).shape == (3, 10)
37 | 
38 | 
39 | def test_read_chromsizes():  # read_chromsizes: bad input raises, good input -> Series
40 |     d = """chr1\nchr2\nchr2"""  # names only (chr2 repeated), no length column
41 |     with pytest.raises(ValueError):
42 |         bioframe.read_chromsizes(StringIO(d))
43 | 
44 |     d = """chr1\t1\nchr3\t2\nchr2\t3\n """  # input order: chr1, chr3, chr2
45 |     chromsizes = bioframe.read_chromsizes(StringIO(d))
46 |     assert isinstance(chromsizes, pd.Series)
47 |     assert chromsizes.name == "length"
48 |     assert list(chromsizes.index) == ["chr1", "chr2", "chr3"]  # not input order
49 |     assert list(chromsizes.values) == [1, 3, 2]  # lengths stay with their names
50 | 
51 | 
52 | def test_read_beds():
53 |     # Smoke test: each fixture <schema>.bed must load under its own strict schema
54 |     schemas = ['narrowPeak', 'jaspar', 'bed9', 'bed12']
55 |     # NOTE(review): relative paths; presumably pytest runs from the repository root
56 |     for schema in schemas:
57 |         _ = bioframe.read_table(f'tests/test_data/{schema}.bed', schema=schema,
58 |                                 schema_is_strict=True)
59 | 
60 | 
61 | @pytest.mark.skipif(is_big_endian, reason="Test skipped on big-endian systems")
62 | def test_read_sam():  # smoke test: result is discarded, only "no exception" matters
63 |     pytest.importorskip("pysam")  # skip rather than fail when pysam is unavailable
64 |     # SAM file taken from https://github.com/samtools/samtools/blob/develop/examples/toy.sam
65 |     _ = bioframe.read_alignments('tests/test_data/toy.sam')
66 | 
67 | 
68 | @pytest.mark.skipif(is_big_endian, reason="Test skipped on big-endian systems")
69 | def test_read_bam():  # smoke test: result is discarded, only "no exception" matters
70 |     pytest.importorskip("pysam")  # skip rather than fail when pysam is unavailable
71 |     # toy.bam was converted from toy.sam via `samtools view -bS toy.sam > toy.bam`;
72 |     # the index file was created with `samtools index toy.bam`
73 |     _ = bioframe.read_alignments('tests/test_data/toy.bam')
74 | 


--------------------------------------------------------------------------------
/tests/test_ops_select.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import pytest
  3 | 
  4 | import bioframe
  5 | 
  6 | 
  7 | def test_select():  # select() by region string: bounded, whole-chrom, empty, invalid
  8 |     df = pd.DataFrame(
  9 |         [["chrX", 3, 8], ["chr1", 4, 5], ["chrX", 1, 5]],
 10 |         columns=["chrom", "start", "end"],
 11 |     )
 12 |     # bounded query on chr1: only the chr1 row (4-5) lies inside 4-10
 13 |     result = pd.DataFrame([["chr1", 4, 5]], columns=["chrom", "start", "end"])
 14 |     pd.testing.assert_frame_equal(
 15 |         result, bioframe.select(df, "chr1:4-10").reset_index(drop=True)
 16 |     )
 17 |     # chromosome-only query: both chrX rows, in their original (unsorted) order
 18 |     result = pd.DataFrame(
 19 |         [["chrX", 3, 8], ["chrX", 1, 5]], columns=["chrom", "start", "end"]
 20 |     )
 21 |     pd.testing.assert_frame_equal(
 22 |         result, bioframe.select(df, "chrX").reset_index(drop=True)
 23 |     )
 24 |     # bounded chrX query: 4-6 overlaps both (3-8) and (1-5)
 25 |     result = pd.DataFrame(
 26 |         [["chrX", 3, 8], ["chrX", 1, 5]], columns=["chrom", "start", "end"]
 27 |     )
 28 |     pd.testing.assert_frame_equal(
 29 |         result, bioframe.select(df, "chrX:4-6").reset_index(drop=True)
 30 |     )
 31 | 
 32 |     # No matches: unknown chromosome, range beyond the chr1 row, range before it
 33 |     assert len(bioframe.select(df, "chrZ")) == 0
 34 |     assert len(bioframe.select(df, "chr1:100-1000")) == 0
 35 |     assert len(bioframe.select(df, "chr1:1-3")) == 0
 36 | 
 37 |     # Invalid query range (end < start) must raise ValueError
 38 |     with pytest.raises(ValueError):
 39 |         bioframe.select(df, "chr1:1-0")
 40 | 
 41 | 
 42 | def test_select__with_colnames():  # same chrX queries, custom column names
 43 |     ### select with non-standard column names, passed through cols=
 44 |     new_names = ["chr", "chrstart", "chrend"]
 45 |     df = pd.DataFrame(
 46 |         [["chrX", 3, 8], ["chr1", 4, 5], ["chrX", 1, 5]],
 47 |         columns=new_names,
 48 |     )
 49 |     result = pd.DataFrame(
 50 |         [["chrX", 3, 8], ["chrX", 1, 5]],
 51 |         columns=new_names,
 52 |     )
 53 |     pd.testing.assert_frame_equal(
 54 |         result, bioframe.select(df, "chrX:4-6", cols=new_names).reset_index(drop=True)
 55 |     )
 56 |     pd.testing.assert_frame_equal(
 57 |         result, bioframe.select(df, "chrX", cols=new_names).reset_index(drop=True)
 58 |     )
 59 | 
 60 | 
 61 | def test_select__with_nulls():  # a row of pd.NA coordinates is not selected
 62 |     ### select from a DataFrame whose middle row has pd.NA for chrom/start/end
 63 |     colnames = ["chrom", "start", "end", "view_region"]
 64 |     df = pd.DataFrame(
 65 |         [
 66 |             ["chr1", -6, 12, "chr1p"],
 67 |             [pd.NA, pd.NA, pd.NA, "chr1q"],
 68 |             ["chrX", 1, 8, "chrX_0"],
 69 |         ],
 70 |         columns=colnames,
 71 |     ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})
 72 | 
 73 |     result = pd.DataFrame(
 74 |         [["chr1", -6, 12, "chr1p"]],
 75 |         columns=colnames,
 76 |     ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})
 77 |     # chr1:0-1 lies inside the chr1 row (-6 to 12); the all-NA row must not appear
 78 |     pd.testing.assert_frame_equal(
 79 |         result, bioframe.select(df, "chr1:0-1").reset_index(drop=True)
 80 |     )
 81 | 
 82 | 
 83 | def test_select__mask_indices_labels():  # all four select variants must agree
 84 |     df = pd.DataFrame(
 85 |         [["chrX", 3, 8], ["chr1", 4, 5], ["chrX", 1, 5]],
 86 |         columns=["chrom", "start", "end"],
 87 |     )
 88 | 
 89 |     region = "chr1:4-10"
 90 |     answer = pd.DataFrame([["chr1", 4, 5]], columns=["chrom", "start", "end"])
 91 | 
 92 |     result = bioframe.select(df, region)  # returns the matching rows directly
 93 |     pd.testing.assert_frame_equal(answer, result.reset_index(drop=True))
 94 |     mask = bioframe.select_mask(df, region)  # consumed through df.loc
 95 |     pd.testing.assert_frame_equal(answer, df.loc[mask].reset_index(drop=True))
 96 |     labels = bioframe.select_labels(df, region)  # consumed through df.loc
 97 |     pd.testing.assert_frame_equal(answer, df.loc[labels].reset_index(drop=True))
 98 |     idx = bioframe.select_indices(df, region)  # consumed through df.iloc
 99 |     pd.testing.assert_frame_equal(answer, df.iloc[idx].reset_index(drop=True))
100 | 
101 | 
102 | def test_select__query_intervals_are_half_open():  # touching ends do not overlap
103 |     df = pd.DataFrame(
104 |         {
105 |             "chrom": ["chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2", "chr2"],
106 |             "start": [0, 10, 10, 20, 30, 40, 50, 60],
107 |             "end": [10, 20, 20, 30, 40, 50, 60, 70],
108 |             "name": ["a", "b", "A", "B", "C", "D", "E", "F"],
109 |         }
110 |     )
111 | 
112 |     result = bioframe.select(df, "chr1")  # whole chromosome
113 |     assert (result["name"] == ["a", "b"]).all()
114 | 
115 |     result = bioframe.select(df, "chr2:20-70")  # A ends at 20: excluded
116 |     assert (result["name"] == ["B", "C", "D", "E", "F"]).all()
117 | 
118 |     result = bioframe.select(df, "chr2:20-75")  # query end past the last row
119 |     assert (result["name"] == ["B", "C", "D", "E", "F"]).all()
120 | 
121 |     result = bioframe.select(df, "chr2:20-")  # no end given; original labels kept
122 |     assert (result.index == [3, 4, 5, 6, 7]).all()
123 | 
124 |     result = bioframe.select(df, "chr2:20-30")  # exactly B; C starts at 30
125 |     assert (result["name"] == ["B"]).all()
126 | 
127 |     result = bioframe.select(df, "chr2:20-40")  # D starts at 40: excluded
128 |     assert (result["name"] == ["B", "C"]).all()
129 | 
130 |     result = bioframe.select(df, "chr2:20-45")  # D (40-50) partially overlapped
131 |     assert (result["name"] == ["B", "C", "D"]).all()
132 | 
133 |     result = bioframe.select(df, "chr2:19-45")  # start one lower now reaches A
134 |     assert (result["name"] == ["A", "B", "C", "D"]).all()
135 | 
136 |     result = bioframe.select(df, "chr2:25-45")  # B (20-30) partially overlapped
137 |     assert (result["name"] == ["B", "C", "D"]).all()
138 | 
139 |     result = bioframe.select(df, "chr2:25-50")  # E starts at 50: excluded
140 |     assert (result["name"] == ["B", "C", "D"]).all()
141 | 
142 |     result = bioframe.select(df, "chr2:25-51")  # end one higher now reaches E
143 |     assert (result["name"] == ["B", "C", "D", "E"]).all()
144 | 
145 | 
146 | def test_select__with_point_intervals():  # zero-width rows and zero-width queries
147 |     # Dataframe containing "point intervals" (start == end): b at 10 and E at 50
148 |     df = pd.DataFrame(
149 |         {
150 |             "chrom": ["chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2", "chr2"],
151 |             "start": [0, 10, 10, 20, 30, 40, 50, 60],
152 |             "end": [10, 10, 20, 30, 40, 50, 50, 70],
153 |             "name": ["a", "b", "A", "B", "C", "D", "E", "F"],
154 |         }
155 |     )
156 |     result = bioframe.select(df, "chr1")  # whole chromosome includes point b
157 |     assert (result["name"] == ["a", "b"]).all()
158 | 
159 |     result = bioframe.select(df, "chr1:4-10")  # point b sits at the query end: out
160 |     assert (result["name"] == ["a"]).all()
161 | 
162 |     result = bioframe.select(df, "chr1:4-4")  # zero-width query inside a (0-10)
163 |     assert (result["name"] == ["a"]).all()
164 | 
165 |     result = bioframe.select(df, "chr1:10-15")  # point b at the query start: in
166 |     assert (result["name"] == ["b"]).all()
167 | 
168 |     result = bioframe.select(df, "chr2:20-70")  # point E is inside the range
169 |     assert (result["name"] == ["B", "C", "D", "E", "F"]).all()
170 | 
171 |     result = bioframe.select(df, "chr2:49-70")  # D (40-50) overlaps from 49
172 |     assert (result["name"] == ["D", "E", "F"]).all()
173 | 
174 |     result = bioframe.select(df, "chr2:50-70")  # D ends at 50: out; point E: in
175 |     assert (result["name"] == ["E", "F"]).all()
176 | 
177 |     result = bioframe.select(df, "chr2:50-51")  # F starts at 60: out
178 |     assert (result["name"] == ["E"]).all()
179 | 
180 |     result = bioframe.select(df, "chr2:50-50")  # zero-width query on point E
181 |     assert (result["name"] == ["E"]).all()
182 | 
183 | 
184 | def test_select__with_points():  # a single "pos" column used as start and end
185 |     # Dataframe of points: cols= names "pos" for both the start and the end column
186 |     df = pd.DataFrame(
187 |         [["chrX", 3, "A"], ["chr1", 4, "C"], ["chrX", 1, "B"]],
188 |         columns=["chrom", "pos", "name"],
189 |     )
190 |     # point C (chr1, pos 4) at the query start is selected
191 |     result = bioframe.select(df, "chr1:4-10", cols=["chrom", "pos", "pos"])
192 |     assert (result["name"] == ["C"]).all()
193 |     # point C strictly inside the query range is selected
194 |     result = bioframe.select(df, "chr1:3-10", cols=["chrom", "pos", "pos"])
195 |     assert (result["name"] == ["C"]).all()
196 |     # a zero-width query placed exactly on point C still selects it
197 |     result = bioframe.select(df, "chr1:4-4", cols=["chrom", "pos", "pos"])
198 |     assert (result["name"] == ["C"]).all()
199 | 


--------------------------------------------------------------------------------
/tests/test_resources.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | import bioframe
 4 | 
 5 | 
 6 | def test_fetch_chromsizes():  # fetch_chromsizes return types and sizes for hg38
 7 |     db = "hg38"
 8 |     for provider in ["local", "ucsc"]:  # NOTE(review): "ucsc" presumably needs network
 9 |         chromsizes = bioframe.fetch_chromsizes(db, provider=provider)
10 |         assert isinstance(chromsizes, pd.Series)
11 |         assert chromsizes.name == "length"
12 |         assert len(chromsizes) == 25
13 |         # as_bed=True: same 25 entries, returned as a chrom/start/end DataFrame
14 |         chromsizes_df = bioframe.fetch_chromsizes(db, provider=provider, as_bed=True)
15 |         assert isinstance(chromsizes_df, pd.DataFrame)
16 |         assert list(chromsizes_df.columns) == ["chrom", "start", "end"]
17 |         assert len(chromsizes_df) == 25
18 | 
19 |     # Synonymous assembly names: "GRCh38" must give the same local result as "hg38"
20 |     assert bioframe.fetch_chromsizes("hg38", provider="local").equals(
21 |         bioframe.fetch_chromsizes("GRCh38", provider="local")
22 |     )
23 | 
24 | 
25 | def test_fetch_chromsizes_local_vs_ucsc():  # local and ucsc results must be equal
26 |     for db in ["hg19", "hg38", "mm9", "mm10"]:
27 |         assert bioframe.fetch_chromsizes(db, provider="local").equals(
28 |             bioframe.fetch_chromsizes(db, provider="ucsc")
29 |         )
30 | 
31 | 
32 | def test_fetch_centromeres():  # fetch_centromeres for hg19/hg38, both providers
33 |     for db in ["hg19", "hg38"]:
34 |         # UCSC usually orders chromosomes differently; only type/columns/length checked
35 |         for provider in ["local", "ucsc"]:
36 |             centromeres = bioframe.fetch_centromeres(db, provider=provider)
37 |             assert isinstance(centromeres, pd.DataFrame)
38 |             assert list(centromeres.columns) == ["chrom", "start", "end", "mid"]
39 |             assert len(centromeres) == 24
40 | 


--------------------------------------------------------------------------------
/tests/test_vis.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import pytest
 4 | 
 5 | import bioframe
 6 | 
 7 | 
 8 | def test_to_ucsc_colorstring():  # color spec -> "R,G,B" string with 0-255 values
 9 |     assert bioframe.to_ucsc_colorstring("red") == "255,0,0"  # named colors
10 |     assert bioframe.to_ucsc_colorstring("blue") == "0,0,255"
11 |     assert bioframe.to_ucsc_colorstring("green") == "0,128,0"
12 |     assert bioframe.to_ucsc_colorstring("black") == "0,0,0"
13 |     assert bioframe.to_ucsc_colorstring("white") == "255,255,255"
14 |     assert bioframe.to_ucsc_colorstring("r") == "255,0,0"  # one-letter name
15 |     assert bioframe.to_ucsc_colorstring("tomato") == "255,99,71"
16 |     assert bioframe.to_ucsc_colorstring("xkcd:sky blue") == "117,187,253"
17 |     assert bioframe.to_ucsc_colorstring("#abc") == "170,187,204"  # == #aabbcc
18 |     assert bioframe.to_ucsc_colorstring("#ff0000") == "255,0,0"
19 |     assert bioframe.to_ucsc_colorstring("#ff000055") == "255,0,0"  # alpha dropped
20 |     assert bioframe.to_ucsc_colorstring((1, 0, 0)) == "255,0,0"  # 0-1 tuple scaled
21 |     assert bioframe.to_ucsc_colorstring((1, 0, 0, 0.5)) == "255,0,0"  # alpha dropped
22 |     assert bioframe.to_ucsc_colorstring((0, 0, 1)) == "0,0,255"
23 |     assert bioframe.to_ucsc_colorstring(None) == "0"  # missing values map to "0"
24 |     assert bioframe.to_ucsc_colorstring("none") == "0"
25 |     assert bioframe.to_ucsc_colorstring(np.nan) == "0"
26 |     assert bioframe.to_ucsc_colorstring(pd.NA) == "0"
27 | 
28 |     with pytest.raises(ValueError):  # an unrecognised color name is an error
29 |         bioframe.to_ucsc_colorstring("notacolor")
30 | 
31 |     df = bioframe.from_any(  # the 4th value of each row is read back as df["name"]
32 |         [
33 |             ["chr1", 0, 10, "red"],
34 |             ["chr1", 10, 20, "blue"],
35 |             ["chr2", 0, 10, "green"],
36 |             ["chr2", 10, 20, None],
37 |         ]
38 |     )
39 |     df["itemRgb"] = df["name"].apply(bioframe.to_ucsc_colorstring)  # element-wise
40 |     assert df["itemRgb"].tolist() == ["255,0,0", "0,0,255", "0,128,0", "0"]
41 | 


--------------------------------------------------------------------------------