├── .coveragerc ├── .flake8 ├── .github └── workflows │ ├── pypi-publish.yml │ ├── pytest.yml │ └── python-publish-test.yml ├── .gitignore ├── .readthedocs.yml ├── .zenodo.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── cooltools ├── __init__.py ├── __main__.py ├── api │ ├── __init__.py │ ├── coverage.py │ ├── directionality.py │ ├── dotfinder.py │ ├── eigdecomp.py │ ├── expected.py │ ├── insulation.py │ ├── rearrange.py │ ├── saddle.py │ ├── sample.py │ ├── snipping.py │ └── virtual4c.py ├── cli │ ├── __init__.py │ ├── coverage.py │ ├── dots.py │ ├── eigs_cis.py │ ├── eigs_trans.py │ ├── expected_cis.py │ ├── expected_trans.py │ ├── genome.py │ ├── insulation.py │ ├── logbin_expected.py │ ├── pileup.py │ ├── rearrange.py │ ├── saddle.py │ ├── sample.py │ ├── util.py │ └── virtual4c.py ├── lib │ ├── __init__.py │ ├── _numutils.pyx │ ├── _query.py │ ├── checks.py │ ├── common.py │ ├── io.py │ ├── numutils.py │ ├── peaks.py │ ├── plotting.py │ ├── runlength.py │ └── schemas.py └── sandbox │ ├── __init__.py │ ├── balance.py │ ├── contrast.py │ ├── cool2cworld.py │ ├── cooler_filters │ ├── Example_usage.ipynb │ ├── pixel_filter_util.py │ └── test_data_util.cool │ ├── cross_score.py │ ├── expected_smoothing.py │ ├── expected_smoothing_example.ipynb │ ├── fastsavetxt.pyx │ ├── obs_over_exp_cooler.py │ ├── observed_over_expected_example.ipynb │ ├── pairs_scaling_functions.py │ └── rearrange_cooler_example.ipynb ├── datasets └── external_test_files.tsv ├── docs ├── Makefile ├── cli.rst ├── conf.py ├── cooltools.lib.rst ├── cooltools.rst ├── figs │ └── cooltools-logo-futura.png ├── index.rst ├── make.bat ├── notebooks_old │ ├── 01_scaling-curves.ipynb │ ├── 02_expected.ipynb │ ├── 03_eigendecomposition.ipynb │ ├── 04_saddle-plots.ipynb │ ├── 05_insulation-score.ipynb │ ├── 06_snipping-pileups.ipynb │ ├── 07_pileups2.ipynb │ ├── 08_dot-calling-internals.ipynb │ └── data │ │ └── encode_motifs.hg38.ctcf_known1.liftover.bed.gz ├── releases.md └── requirements.txt ├── pyproject.toml ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── data ├── CN.mm9.10000kb.cool ├── CN.mm9.1000kb.cool ├── CN.mm9.toy_expected.chromnamed.tsv ├── CN.mm9.toy_expected.tsv ├── CN.mm9.toy_features.bed ├── CN.mm9.toy_regions.bed ├── dotfinder_mock_inputs.npz ├── dotfinder_mock_res.csv.gz ├── make_test_compartments.py ├── mm9.chrom.sizes.reduced ├── mm9.named_nonoverlap_regions.bed ├── sin_eigs_mat.bg2.gz ├── sin_eigs_mat.cool ├── sin_eigs_track.tsv ├── test.10.bins └── test.chrom.sizes ├── test_call-dots.py ├── test_checks.py ├── test_compartments_saddle.py ├── test_coverage.py ├── test_dotfinder_chunking.py ├── test_dotfinder_stats.py ├── test_expected.py ├── test_insulation.py ├── test_io.py ├── test_lazy_toeplitz.py ├── test_lib_common.py ├── test_rearrange_cooler.py ├── test_sample.py ├── test_snipping.py └── test_virtual4c.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source= 3 | cooltools/ 4 | 5 | omit= 6 | cooltools/__main__.py 7 | 8 | [report] 9 | exclude_lines = 10 | pragma: no cover 11 | return NotImplemented 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | __init__.py 4 | __main__.py 5 | 6 | max-line-length = 80 7 | ignore = 8 | # whitespace before ':' 9 | E203 10 | # too many 
leading '#' for block comment 11 | E266 12 | # line too long 13 | E501 14 | # line break before binary operator 15 | W503 16 | select = 17 | # mccabe complexity 18 | C 19 | # pycodestyle 20 | E 21 | # pyflakes error 22 | F 23 | # pyflakes warning 24 | W 25 | # bugbear 26 | B 27 | # line exceeds max-line-length + 10% 28 | B950 29 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload Python Package to PyPI 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | release: 7 | types: [released] 8 | 9 | jobs: 10 | Publish: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | id-token: write 14 | 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | 19 | - name: Setup Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.x" 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install build 28 | 29 | - name: Build 30 | run: python -m build --sdist 31 | 32 | - name: Publish distribution 📦 to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Pytest 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [ '3.9', '3.10', '3.11', '3.12' ] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v1 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | pip install --upgrade pip wheel setuptools 29 | pip install numpy cython 30 | pip install -r requirements-dev.txt 31 | pip install -e . 32 | - name: Lint with flake8 33 | run: | 34 | pip install flake8 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pip install pytest
42 |         pytest
43 | 
--------------------------------------------------------------------------------
/.github/workflows/python-publish-test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Publish Python Package to Test PyPI
5 | 
6 | on:
7 |   release:
8 |     types: [prereleased]
9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install cython numpy setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist
31 |         twine upload --repository-url https://test.pypi.org/legacy/ dist/*
32 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Vim's cache
2 | *.un~
3 | 
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__
6 | cooltools/__pycache__
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | *.c
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | .venv/
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | # lib/ - not ignored, since we use cooltools/lib
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | 
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other info into it.
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | .hypothesis/ 53 | .pytest_cache 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | docs/notebooks 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | tmp/ 101 | 102 | # Downloaded data 103 | datasets/* 104 | !datasets/external_test_files.tsv 105 | tmp.npz 106 | .gitignore 107 | tmp.hdf5 108 | cooltools/sandbox/test.mcool 109 | 110 | .vscode/ 111 | .idea/ -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.10" 12 | 13 | # Build documentation in the docs/ directory with Sphinx 14 | sphinx: 15 | configuration: docs/conf.py 16 | 17 | # Build documentation with MkDocs 18 | #mkdocs: 19 | # configuration: mkdocs.yml 20 | 21 | # Optionally build your docs in additional formats such as PDF and ePub 22 | formats: all 23 | 24 | # Optionally set the version of Python and requirements required to build your docs 25 | python: 26 | install: 27 | - requirements: docs/requirements.txt 28 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "CoolTools: the tools for your .cools", 3 | "license": "MIT", 4 | "upload_type": "software", 5 | "access_right": "open", 6 | "creators": [ 7 | { 8 | "name": "Sergey Venev", 9 | "affiliation": "University of Massachusetts Medical School" 10 | }, 11 | { 12 | "name": "Nezar Abdennur", 13 | "affiliation": "MIT" 14 | }, 15 | { 16 | "name": "Anton Goloborodko", 17 | "affiliation": "IMBA" 18 | }, 19 | { 20 | "name": "Ilya Flyamer", 21 | "affiliation": "FMI" 22 | }, 23 | { 24 | "name": "Geoffrey Fudenberg", 25 | "affiliation": "University of Southern California" 26 | }, 27 | { 28 | "name": "Johannes Nuebler", 29 | "affiliation": "MIT" 30 | }, 31 | { 32 | "name": "Aleksandra Galitsyna", 33 | "affiliation": "Skolkovo Institute of Science and Technology" 34 | }, 35 | { 36 | "name": "Betul Akgol", 37 | "affiliation": "University of Massachusetts Medical School" 38 | }, 39 | { 40 | "name": "Sameer Abraham", 41 | "affiliation": "MIT" 42 | }, 43 | { 44 | "name": "Peter Kerpedjiev", 45 | "affiliation": "Harvard Medical School" 46 | }, 47 | { 48 | "name": "Maksim Imakaev", 49 | "affiliation": "MIT" 50 | } 51 | ], 52 | "keywords": [ 53 | "genomics", 54 | "bioinformatics", 55 | "Hi-C", 56 | "data", 57 | "analysis", 58 | "cooler" 59 | ] 60 | } 61 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | ## General guidelines
4 | 
5 | If you haven't contributed to open-source before, we recommend you read [this excellent guide by GitHub on how to contribute to open source](https://opensource.guide/how-to-contribute). The guide is long, so you can gloss over things you're familiar with.
6 | 
7 | If you're not already familiar with it, we follow the [fork and pull model](https://help.github.com/articles/about-collaborative-development-models) on GitHub. Also, check out this recommended [git workflow](https://www.asmeurer.com/git-workflow/).
8 | 
9 | As a rough guide for cooltools:
10 | - contributors should preferably work on their forks and submit pull requests to the main branch
11 | - core maintainers can work on feature branches in the main fork and then submit pull requests to the main branch
12 | - core maintainers can push directly to the main branch if it's urgently needed
13 | 
14 | 
15 | ## Contributing Code
16 | 
17 | This project has a number of requirements for all code contributed.
18 | 
19 | * We follow the [PEP-8 style](https://www.python.org/dev/peps/pep-0008/) convention.
20 | * We use [flake8](http://flake8.pycqa.org/en/latest/) to automatically lint the code and maintain code style. You can use a code formatter like [black](https://github.com/psf/black) or [autopep8](https://github.com/hhatto/autopep8) to help keep the linter happy.
21 | * We use [Numpy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html).
22 | * User-facing API changes or new features should have documentation added.
23 | 
24 | Ideally, provide full test coverage for new code submitted in PRs.
25 | 
26 | 
27 | ## Setting up Your Development Environment
28 | 
29 | For setting up an isolated virtual environment for development, we recommend using [conda](https://docs.conda.io/en/latest/miniconda.html). After forking and cloning the repository, install in "editable" (i.e. development) mode using the `-e` option:
30 | 
31 | ```sh
32 | $ git clone https://github.com/open2c/cooltools.git
33 | $ cd cooltools
34 | $ pip install -e .
35 | ```
36 | 
37 | Editable mode installs the package by creating a "link" to your working (repo) directory.
38 | 
39 | 
40 | ## Unit Tests
41 | 
42 | It is best if all new functionality and/or bug fixes have unit tests added with each use-case.
43 | 
44 | We use [pytest](https://docs.pytest.org/en/latest) as our unit testing framework, with the `pytest-cov` extension to check code coverage and `pytest-flake8` to check code style. You don't need to configure these extensions yourself.
45 | These automatically check code style and functionality and print code coverage, although the test run does not fail on low coverage.
46 | 
47 | Once you've configured your environment, you can just `cd` to the root of your repository and run
48 | 
49 | ```sh
50 | $ pytest
51 | ```
52 | 
53 | Unit tests are automatically run via GitHub Actions for pull requests.
54 | 
55 | 
56 | ## Coverage
57 | 
58 | The `pytest` script automatically reports coverage, both on the terminal for missing line numbers, and in annotated HTML form in `htmlcov/index.html`.
59 | 
60 | 
61 | ## Documentation
62 | 
63 | If a feature is stable and relatively finalized, it is time to add it to the documentation. If you are adding any private/public functions, it is best to add docstrings, both to aid code review and for the API reference.
64 | 
65 | We use [Numpy style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html) and [Sphinx](http://www.sphinx-doc.org/en/stable) to document this library. Sphinx, in turn, uses [reStructuredText](http://www.sphinx-doc.org/en/stable/rest.html) as its markup language.
66 | 
67 | We use the [Sphinx Autosummary extension](http://www.sphinx-doc.org/en/stable/ext/autosummary.html) to generate API references. You may want to look at `docs/cooltools.rst` to see how these files look and where to add new functions, classes or modules.
68 | 
69 | We also use the [nbsphinx extension](https://nbsphinx.readthedocs.io/en/0.5.0/) to render tutorial pages from Jupyter notebooks.
70 | 
71 | To build the documentation:
72 | 
73 | ```sh
74 | $ make docs
75 | ```
76 | 
77 | After this, you can find an HTML version of the documentation in `docs/_build/html/index.html`.
78 | 
79 | Documentation from `master` and tagged releases is automatically built and hosted thanks to [readthedocs](https://readthedocs.org/).
80 | 
81 | 
82 | ## Acknowledgement
83 | 
84 | If you've contributed significantly and would like your authorship to be included in subsequent uploads to [Zenodo](https://zenodo.org), please make a separate PR to add your name and affiliation to the `.zenodo.json` file.
85 | 
86 | ---
87 | 
88 | This document was modified from the [guidelines from the sparse project](https://github.com/pydata/sparse/blob/master/docs/contributing.rst).
89 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Cooltools developers
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG.md 2 | include README.md 3 | include LICENSE 4 | include requirements.txt 5 | include requirements-dev.txt 6 | include environment.yml 7 | 8 | include cooltools/lib/_numutils.pyx 9 | graft tests 10 | graft docs 11 | prune docs/_build 12 | prune docs/_static 13 | prune docs/_templates 14 | 15 | global-exclude __pycache__/* 16 | global-exclude *.so 17 | global-exclude *.pyd 18 | global-exclude *.pyc 19 | global-exclude .git* 20 | global-exclude .deps/* 21 | global-exclude .DS_Store 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build install test docs clean clean-pyc clean-dist build-dist publish-test publish 2 | 3 | 4 | build: 5 | python setup.py build_ext --inplace 6 | 7 | install: 8 | pip install -e . 9 | 10 | test: 11 | pytest 12 | 13 | docs: 14 | cd docs && make html 15 | 16 | 17 | clean-pyc: 18 | find . -name '*.pyc' -exec rm --force {} + 19 | find . -name '*.pyo' -exec rm --force {} + 20 | find . -name '*~' -exec rm --force {} + 21 | 22 | clean-dist: 23 | rm -rf build/ 24 | rm -rf dist/ 25 | 26 | clean: clean-pyc clean-dist 27 | 28 | 29 | build-dist: clean-dist 30 | python setup.py sdist 31 | # python setup.py bdist_wheel 32 | 33 | publish-test: build-dist 34 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* 35 | 36 | publish: build-dist 37 | twine upload dist/* 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cooltools: enabling high-resolution Hi-C analysis in Python 2 | 3 | 4 | 5 | 6 | [![Pytest](https://github.com/open2c/cooltools/actions/workflows/pytest.yml/badge.svg)](https://github.com/open2c/cooltools/actions/workflows/pytest.yml) 7 | [![Documentation Status](https://readthedocs.org/projects/cooltools/badge/?version=latest)](https://cooltools.readthedocs.io/en/latest/?badge=latest) 8 | [![Latest Release PyPI](https://img.shields.io/pypi/v/cooltools?color=blue&label=PyPI%20package)](https://pypi.org/project/cooltools) 9 | [![Latest Release Bioconda](https://img.shields.io/conda/vn/bioconda/cooltools?color=blue)](https://bioconda.github.io/recipes/cooltools/README.html) 10 | [![DOI](https://zenodo.org/badge/82413481.svg)](https://zenodo.org/badge/latestdoi/82413481) 11 | 12 | > tools for your .cools 13 | 14 | Chromosome conformation capture technologies reveal the incredible complexity of genome folding. A growing number of labs and multiple consortia, including the 4D Nucleome, the International Nucleome Consortium, and ENCODE, are generating higher-resolution datasets to probe genome architecture across cell states, types, and organisms. Larger datasets increase the challenges at each step of computational analysis, from storage, to memory, to researchers’ time. The recently-introduced [***cooler***](https://github.com/open2c/cooler/tree/master/cooler) format readily handles storage of high-resolution datasets via a sparse data model. 15 | 16 | ***cooltools*** leverages this format to enable flexible and reproducible analysis of high-resolution data. 
***cooltools*** provides a suite of computational tools with a paired Python API and command line access, which facilitates workflows either on high-performance computing clusters or via custom analysis notebooks. As part of the [***Open2C*** ecosystem](https://open2c.github.io/), ***cooltools*** also provides detailed introductions to key concepts in Hi-C data analysis with interactive notebook documentation. For more information, see the [preprint](https://doi.org/10.1101/2022.10.31.514564): https://doi.org/10.1101/2022.10.31.514564.
17 | 
18 | ## Requirements
19 | 
20 | The following are required before installing cooltools:
21 | 
22 | * Python 3.7+
23 | * `numpy`
24 | * `cython`
25 | 
26 | ## Installation
27 | 
28 | ```sh
29 | pip install cooltools
30 | ```
31 | 
32 | or install the latest version directly from GitHub:
33 | 
34 | ```
35 | $ pip install https://github.com/open2c/cooltools/archive/refs/heads/master.zip
36 | ```
37 | 
38 | See the [requirements.txt](https://github.com/open2c/cooltools/blob/master/requirements.txt) file for information on compatible dependencies, especially for [cooler](https://github.com/open2c/cooler/tree/master/cooler) and [bioframe](https://github.com/open2c/bioframe).
39 | 
40 | 
41 | ## Documentation and Tutorials
42 | 
43 | Documentation can be found here: https://cooltools.readthedocs.io/en/latest/.
44 | 
45 | Cooltools offers a number of tutorials using the [Open2C code ecosystem](https://github.com/open2c/). For users who are new to Hi-C analysis, we recommend going through example notebooks in the following order:
46 | 
47 | - [Visualization](https://cooltools.readthedocs.io/en/latest/notebooks/viz.html): how to load and visualize Hi-C data stored in coolers.
48 | - [Contacts vs Distance](https://cooltools.readthedocs.io/en/latest/notebooks/contacts_vs_distance.html): how to calculate contact frequency as a function of genomic distance, the most prominent feature in Hi-C maps.
49 | - [Compartments and Saddles](https://cooltools.readthedocs.io/en/latest/notebooks/compartments_and_saddles.html): how to extract eigenvectors and create saddleplots reflecting A/B compartments.
50 | - [Insulation and Boundaries](https://cooltools.readthedocs.io/en/latest/notebooks/insulation_and_boundaries.html): how to extract insulation profiles and call boundaries using insulation profile minima.
51 | - [Pileups and Average Patterns](https://cooltools.readthedocs.io/en/latest/notebooks/pileup_CTCF.html): how to create average maps around genomic features like CTCF.
52 | 
53 | For users interested in running analyses from the command line:
54 | - [Command line interface](https://cooltools.readthedocs.io/en/latest/notebooks/command_line_interface.html): how to use the cooltools CLI.
55 | 
56 | Note that these notebooks currently focus on mammalian interphase Hi-C analysis, but are readily extensible to other organisms and cellular contexts. To clone notebooks for interactive analysis, visit https://github.com/open2c/open2c_examples. Docs for cooltools are built directly from these notebooks.
57 | 
58 | ## Contributing
59 | Cooltools welcomes contributions. The guiding principles for tools are that they should be (i) as simple as possible, (ii) as interpretable as possible, and (iii) not involve visualization. The following applies for contributing new functionality to cooltools.
60 | 
61 | New functionality should:
62 | - clearly define the problem
63 | - discuss alternative solutions
64 | - include a separate example (as a gist/notebook/etc.) explaining its use cases on multiple datasets
65 | - be compatible with the latest versions of cooler and cooltools (e.g. should be able to be run on any cooler generated by the latest version of cooler)
66 | 
67 | New functionality should either:
68 | - generalize or extend an existing tool without impairing user experience, and be submitted as a PR to the relevant tool
69 | - or extract a distinct feature of genome organization, and be submitted as a pull request to the sandbox
70 | 
71 | Vignettes, which use existing tools in new ways, should be submitted as pull requests to open2c_vignettes as distinct Jupyter notebooks, rather than to the cooltools sandbox. The bar for contributions to this repository is minimal. We recommend that each vignette include package version information and raise an error for other versions. If it makes sense, the example data available for download using cooltools can be used to allow an easy way to try out the analysis. Otherwise, the source of data can be specified for others to obtain it.
72 | 
73 | Practical aspects for contributing can be found in the guide [here](https://github.com/open2c/cooltools/blob/master/CONTRIBUTING.md).
74 | 
75 | ## Citing `cooltools`
76 | 
77 | Open2C*, Nezar Abdennur*, Sameer Abraham, Geoffrey Fudenberg*, Ilya M. Flyamer*, Aleksandra A. Galitsyna*, Anton Goloborodko*, Maxim Imakaev, Betul A. Oksuz, and Sergey V. Venev*. “Cooltools: Enabling High-Resolution Hi-C Analysis in Python.” bioRxiv, November 1, 2022. https://doi.org/10.1101/2022.10.31.514564.
78 | 
--------------------------------------------------------------------------------
/cooltools/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Cool tools
4 | ~~~~~~~~~~
5 | 
6 | The tools for your .cool's.
7 | 
8 | :author: Cooltools developers
9 | :license: MIT
10 | 
11 | """
12 | import logging
13 | 
14 | __version__ = "0.7.1"
15 | 
16 | from . import lib
17 | 
18 | from .lib import (
19 |     numutils,
20 |     download_data,
21 |     print_available_datasets,
22 |     get_data_dir,
23 |     download_file,
24 |     get_md5sum,
25 | )
26 | 
27 | from .api.expected import expected_cis, expected_trans
28 | from .api.coverage import coverage
29 | from .api.eigdecomp import eigs_cis, eigs_trans
30 | from .api.saddle import digitize, saddle
31 | from .api.sample import sample
32 | from .api.snipping import pileup
33 | from .api.directionality import directionality
34 | from .api.insulation import insulation
35 | from .api.dotfinder import dots
36 | from .api.virtual4c import virtual4c
37 | 
--------------------------------------------------------------------------------
/cooltools/__main__.py:
--------------------------------------------------------------------------------
1 | from .cli import cli
2 | 
3 | if __name__ == "__main__":
4 |     cli()
5 | 
--------------------------------------------------------------------------------
/cooltools/api/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import pathlib
3 | 
4 | __all__ = [
5 |     f.stem
6 |     for f in pathlib.Path(__file__).parent.glob("*.py")
7 |     if f.is_file() and not f.name == "__init__.py"
8 | ]
9 | 
10 | for _ in __all__:
11 |     importlib.import_module("." + _, "cooltools.api")
12 | 
13 | del pathlib
14 | del importlib
15 | 
--------------------------------------------------------------------------------
/cooltools/api/directionality.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | def _dirscore(
6 |     pixels, bins, window=10, ignore_diags=2, balanced=True, signed_chi2=False
7 | ):
8 |     lo_bin_id = bins.index.min()
9 |     hi_bin_id = bins.index.max() + 1
10 |     N = hi_bin_id - lo_bin_id
11 | 
12 |     bad_bin_mask = (
13 |         bins["weight"].isnull().values if balanced else np.zeros(N, dtype=bool)
14 |     )
15 | 
16 |     diag_pixels = pixels[pixels["bin2_id"] - pixels["bin1_id"] <= (window - 1) * 2]
17 |     if balanced:
18 |         diag_pixels = diag_pixels[~diag_pixels["balanced"].isnull()]
19 | 
20 |     i = diag_pixels["bin1_id"].values - lo_bin_id
21 |     j = diag_pixels["bin2_id"].values - lo_bin_id
22 |     val = diag_pixels["balanced"].values if balanced else diag_pixels["count"].values
23 | 
24 |     sum_pixels_left = np.zeros(N)
25 |     n_pixels_left = np.zeros(N)
26 |     for i_shift in range(0, window):
27 |         if i_shift < ignore_diags:
28 |             continue
29 | 
30 |         mask = (i + i_shift == j) & (i + i_shift < N) & (j >= 0)
31 |         sum_pixels_left += np.bincount(i[mask] + i_shift, val[mask], minlength=N)
32 | 
33 |         loc_bad_bin_mask = np.zeros(N, dtype=bool)
34 |         if i_shift == 0:
35 |             loc_bad_bin_mask |= bad_bin_mask
36 |         else:
37 |             loc_bad_bin_mask[i_shift:] |= bad_bin_mask[:-i_shift]
38 |             loc_bad_bin_mask |= bad_bin_mask
39 |         n_pixels_left[i_shift:] += 1 - loc_bad_bin_mask[i_shift:]
40 | 
41 |     sum_pixels_right = np.zeros(N)
42 |     n_pixels_right = np.zeros(N)
43 |     for j_shift in range(0, window):
44 |         if j_shift < ignore_diags:
45 |             continue
46 | 
47 |         mask = (i == j - j_shift) & (i < N) & (j - j_shift >= 0)
48 | 
49 |         sum_pixels_right += np.bincount(i[mask], val[mask], minlength=N)
50 | 
51 |         loc_bad_bin_mask = np.zeros(N, dtype=bool)
52 |         loc_bad_bin_mask |= bad_bin_mask
53 |         if j_shift == 0:
54 |             loc_bad_bin_mask |= bad_bin_mask
55 |         else:
56 |             loc_bad_bin_mask[:-j_shift] |= bad_bin_mask[j_shift:]
57 | 
58 |         n_pixels_right[: (-j_shift if j_shift else None)] += (
59 |             1 - loc_bad_bin_mask[: (-j_shift if j_shift else None)]
60 |         )
61 | 
62 |     with warnings.catch_warnings():
63 |         warnings.simplefilter("ignore")
64 | 
65 |         a = sum_pixels_left
66 |         b = sum_pixels_right
67 |         if signed_chi2:
68 |             e = (a + b) / 2.0
69 |             score = np.sign(b - a) * ((a - e) ** 2 + (b - e) ** 2) / e
70 |         else:
71 |             score = (b - a) / (a + b)
72 | 
73 |     return score
74 | 
75 | 
76 | def _dirscore_dense(A, window=10, signed_chi2=False):
77 |     N = A.shape[0]
78 |     di = np.zeros(N)
79 |     for i in range(0, N):
80 |         lo = max(0, i - window)
81 |         hi = min((i + window) + 1, N)
82 |         b, a = np.nansum(A[i, i:hi]), np.nansum(A[i, lo : i + 1])
83 |         if signed_chi2:
84 |             e = (a + b) / 2.0
85 |             if e:
86 |                 di[i] = np.sign(b - a) * ((a - e) ** 2 + (b - e) ** 2) / e
87 |         else:
88 |             di[i] = (b - a) / (a + b)
89 |     mask = np.nansum(A, axis=0) == 0
90 |     di[mask] = np.nan
91 |     return di
92 | 
93 | 
94 | def directionality(
95 |     clr,
96 |     window_bp=100000,
97 |     balance="weight",
98 |     min_dist_bad_bin=2,
99 |     ignore_diags=None,
100 |     chromosomes=None,
101 | ):
102 |     """Calculate directionality scores (ratio and index) for genomic bins.
103 | 
104 |     Parameters
105 |     ----------
106 |     clr : cooler.Cooler
107 |         A cooler with balanced Hi-C data.
108 |     window_bp : int
109 |         The size of the sliding window (in bp) used to calculate the
110 |         directionality scores.
111 |     min_dist_bad_bin : int
112 |         The minimal allowed distance to a bad bin. Do not calculate directionality
113 |         scores for bins having a bad bin closer than this distance.
114 |     ignore_diags : int
115 |         The number of diagonals to ignore. If None, equals the number of
116 |         diagonals ignored during IC balancing.
117 | 
118 |     Returns
119 |     -------
120 |     dir_table : pandas.DataFrame
121 |         A table containing the directionality ratio and directionality index
122 |         of the genomic bins.
123 |     """
124 |     if chromosomes is None:
125 |         chromosomes = clr.chromnames
126 | 
127 |     bin_size = clr.info["bin-size"]
128 |     ignore_diags = (
129 |         ignore_diags
130 |         if ignore_diags is not None
131 |         else clr._load_attrs(clr.root.rstrip("/") + "/bins/weight")["ignore_diags"]
132 |     )
133 |     window_bins = window_bp // bin_size
134 | 
135 |     if window_bp % bin_size != 0:
136 |         raise Exception(
137 |             "The window size ({}) has to be a multiple of the bin size {}".format(
138 |                 window_bp, bin_size
139 |             )
140 |         )
141 | 
142 |     dir_chrom_tables = []
143 |     for chrom in chromosomes:
144 |         chrom_bins = clr.bins().fetch(chrom)
145 |         chrom_pixels = clr.matrix(as_pixels=True, balance=balance).fetch(chrom)
146 | 
147 |         # mask neighbors of bad bins
148 |         is_bad_bin = np.isnan(chrom_bins["weight"].values)
149 |         bad_bin_neighbor = np.zeros_like(is_bad_bin)
150 |         for i in range(0, min_dist_bad_bin):
151 |             if i == 0:
152 |                 bad_bin_neighbor = bad_bin_neighbor | is_bad_bin
153 |             else:
154 |                 bad_bin_neighbor = bad_bin_neighbor | np.r_[[True] * i, is_bad_bin[:-i]]
155 |                 bad_bin_neighbor = bad_bin_neighbor | np.r_[is_bad_bin[i:], [True] * i]
156 | 
157 |         dir_chrom = chrom_bins[["chrom", "start", "end"]].copy()
158 |         dir_chrom["bad_bin_masked"] = bad_bin_neighbor
159 | 
160 |         with warnings.catch_warnings():
161 |             warnings.simplefilter("ignore", RuntimeWarning)
162 |             dir_track = _dirscore(
163 |                 chrom_pixels, chrom_bins, window=window_bins, ignore_diags=ignore_diags
164 |             )
165 |             dir_track[bad_bin_neighbor] = np.nan
166 |             dir_track[~np.isfinite(dir_track)] = np.nan
167 |             dir_chrom["directionality_ratio_{}".format(window_bp)] = dir_track
168 | 
169 |             dir_track = _dirscore(
170 |                 chrom_pixels,
171 |                 chrom_bins,
172 |                 window=window_bins,
173 |                 ignore_diags=ignore_diags,
174 |                 signed_chi2=True,
175 |             )
176 |             dir_track[bad_bin_neighbor] = np.nan
177 |             dir_track[~np.isfinite(dir_track)] = np.nan
178 |             dir_chrom["directionality_index_{}".format(window_bp)] = dir_track
179 | 
180 |         dir_chrom_tables.append(dir_chrom)
181 | 
182 |     dir_table = pd.concat(dir_chrom_tables)
183 |     return dir_table
184 | 
--------------------------------------------------------------------------------
/cooltools/api/sample.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | 
4 | import cooler
5 | import cooler.parallel
6 | from .coverage import coverage
7 | from ..lib.common import pool_decorator
8 | 
9 | 
10 | 
11 | def sample_pixels_approx(pixels, frac):
12 |     pixels["count"] = np.random.binomial(pixels["count"], frac)
13 |     mask = pixels["count"] > 0
14 | 
15 |     if issubclass(type(pixels), pd.DataFrame):
16 |         pixels = pixels[mask]
17 |     elif issubclass(type(pixels), dict):
18 |         pixels = {k: arr[mask] for k, arr in pixels.items()}
19 |     return pixels
20 | 
21 | 
22 | def sample_pixels_exact(pixels, count):
23 |     cumcount = np.cumsum(np.asarray(pixels["count"]))
24 |     total = cumcount[-1]
25 |     n_pixels = cumcount.shape[0]
26 | 
27 |     # sample a given number of distinct contacts
28 |     random_contacts = np.random.choice(total, size=count, replace=False)
29 | 
30 |     # find where those contacts live in the cumcount array
31 |     loc = np.searchsorted(cumcount, random_contacts, side="right")
32 | 
33 |     # re-bin those locations to get new counts
34 |     new_counts = np.bincount(loc, minlength=n_pixels)
35 | 
36 |     pixels["count"] = new_counts
37 |     mask = pixels["count"] > 0
38 |     if issubclass(type(pixels), pd.DataFrame):
39 |         pixels = pixels[mask]
40 |     elif issubclass(type(pixels), dict):
41 |         pixels = {k: arr[mask] for k, arr in pixels.items()}
42 |     return pixels
43 | 
44 | 
45 | def _extract_pixel_chunk(chunk):
46 |     return chunk["pixels"]
47 | 
48 | @pool_decorator
49 | def sample(
50 |     clr,
51 |     out_clr_path,
52 |     count=None,
53 |     cis_count=None,
54 |     frac=None,
55 |     exact=False,
56 |     chunksize=int(1e7),
57 |     nproc=1,
58 |     map_functor=map,
59 | ):
60 |     """
61 |     Pick a random subset of contacts from a Hi-C map.
62 | 
63 |     Parameters
64 |     ----------
65 |     clr : cooler.Cooler or str
66 |         A Cooler or a path/URI to a Cooler with input data.
67 | 
68 |     out_clr_path : str
69 |         A path/URI to the output.
70 | 
71 |     count : int
72 |         The target number of contacts in the sample.
73 |         Mutually exclusive with `cis_count` and `frac`.
74 | 
75 |     cis_count : int
76 |         The target number of cis contacts in the sample.
77 |         Mutually exclusive with `count` and `frac`.
78 | 
79 |     frac : float
80 |         The target sample size as a fraction of contacts in the original
81 |         dataset. Mutually exclusive with `count` and `cis_count`.
82 | 
83 |     exact : bool
84 |         If True, the resulting sample size will exactly match the target value.
85 |         Exact sampling will load the whole pixel table into memory!
86 |         If False, binomial sampling will be used instead and the sample size
87 |         will be randomly distributed around the target value.
88 | 
89 |     chunksize : int
90 |         The number of pixels loaded and processed per step of computation.
91 | 
92 |     nproc : int, optional
93 |         How many processes to use for calculation. Ignored if map_functor is passed.
94 | 
95 |     map_functor : callable, optional
96 |         Map function to dispatch the matrix chunks to workers.
97 |         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
98 |         if nproc=1 this defaults to the builtin map.
99 | 
100 |     """
101 |     if issubclass(type(clr), str):
102 |         clr = cooler.Cooler(clr)
103 | 
104 |     if frac is not None and count is None and cis_count is None:
105 |         pass
106 |     elif frac is None and count is not None and cis_count is None:
107 |         frac = count / clr.info["sum"]
108 |     elif frac is None and count is None and cis_count is not None:
109 |         # note division by two, since coverage() counts each side separately
110 |         cis_total = clr.info.get("cis", np.sum(coverage(clr)[0] // 2, dtype=int))
111 |         frac = cis_count / cis_total
112 |     else:
113 |         raise ValueError(
114 |             "Please specify exactly one argument among `count`, `cis_count`"
115 |             " and `frac`"
116 |         )
117 | 
118 |     if frac > 1.0:
119 |         raise ValueError(
120 |             "The number of contacts in a sample cannot exceed "
121 |             "that in the original dataset."
122 |         )
123 | 
124 |     if exact:
125 |         count = np.round(frac * clr.info["sum"]).astype(int)
126 |         pixels = sample_pixels_exact(clr.pixels()[:], count)
127 |         cooler.create_cooler(out_clr_path, clr.bins()[:], pixels, ordered=True)
128 | 
129 |     else:
130 |         pipeline = (
131 |             cooler.parallel.split(
132 |                 clr, include_bins=False, map=map_functor, chunksize=chunksize
133 |             )
134 |             .pipe(_extract_pixel_chunk)
135 |             .pipe(sample_pixels_approx, frac=frac)
136 |         )
137 | 
138 |         cooler.create_cooler(
139 |             out_clr_path,
140 |             clr.bins()[:][["chrom", "start", "end"]],
141 |             iter(pipeline),
142 |             ordered=True,
143 |         )
144 | 
--------------------------------------------------------------------------------
/cooltools/api/virtual4c.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logging.basicConfig(level=logging.INFO)
4 | 
5 | from functools import partial
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import bioframe
10 | 
11 | 
12 | from ..lib.checks import is_cooler_balanced
13 | from ..lib.common import pool_decorator
14 | 
15 | 
16 | 
17 | def _extract_profile(chrom, clr, clr_weight_name, viewpoint):
18 |     to_return = []
19 |     if clr_weight_name:
20 |         colname = "balanced"
21 |     else:
22 |         colname = "count"
23 |     pxls1 = clr.matrix(balance=clr_weight_name, as_pixels=True, join=True).fetch(
24 |         chrom, viewpoint
25 |     )
26 |     pxls1[["chrom2"]] = viewpoint[0]
27 |     pxls1[["start2"]] = viewpoint[1]
28 |     pxls1[["end2"]] = viewpoint[2]
29 | 
30 |     pxls1 = (
31 |         pxls1.groupby(["chrom1", "start1", "end1"], observed=True)[colname]
32 |         .mean()
33 |         .reset_index()
34 |     )
35 |     pxls1.columns = ["chrom", "start", "end", colname]
36 |     if pxls1.shape[0] > 0:
37 |         to_return.append(pxls1)
38 | 
39 |     pxls2 = clr.matrix(balance=clr_weight_name, as_pixels=True, join=True).fetch(
40 |         viewpoint, chrom
41 |     )
42 |     pxls2[["chrom1"]] = viewpoint[0]
43 |     pxls2[["start1"]] = viewpoint[1]
44 |     pxls2[["end1"]] = viewpoint[2]
45 |     pxls2 = (
46 |         pxls2.groupby(["chrom2", "start2", "end2"], observed=True)[colname]
47 |         .mean()
48 |         .reset_index()
49 |     )
50 |     pxls2.columns = ["chrom", "start", "end", colname]
51 |     if pxls2.shape[0] > 0:
52 |         to_return.append(pxls2)
53 |     if len(to_return) == 0:
54 |         return pd.DataFrame(columns=["chrom", "start", "end", colname])
55 |     else:
56 |         return pd.concat(to_return, ignore_index=True)
57 | 
58 | @pool_decorator
59 | def virtual4c(
60 |     clr,
61 |     viewpoint,
62 |     clr_weight_name="weight",
63 |     nproc=1,
64 |     map_functor=map,
65 | ):
66 |     """Generate a genome-wide contact profile for a given viewpoint.
67 | 
68 |     Extract all contacts of a given viewpoint from a cooler file.
69 | 
70 |     Parameters
71 |     ----------
72 |     clr : cooler.Cooler
73 |         A cooler with balanced Hi-C data.
74 |     viewpoint : tuple or str
75 |         Coordinates of the viewpoint.
76 |     clr_weight_name : str
77 |         Name of the column in the bin table with balancing weights.
78 |     nproc : int, optional
79 |         How many processes to use for calculation. Ignored if map_functor is passed.
80 |     map_functor : callable, optional
81 |         Map function to dispatch the matrix chunks to workers.
82 |         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
83 |         if nproc=1 this defaults to the builtin map.
84 | 
85 |     Returns
86 |     -------
87 |     v4C_table : pandas.DataFrame
88 |         A table containing the interaction frequency of the viewpoint with the rest of
89 |         the genome.
90 | 
91 |     Note
92 |     ----
93 |     This is a new (experimental) function; the interface or output might change in
94 |     a future version.
95 |     """
96 |     if clr_weight_name not in [None, False]:
97 |         # check if cooler is balanced
98 |         try:
99 |             _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True)
100 | 
101 |         except Exception as e:
102 |             raise ValueError(
103 |                 f"provided cooler is not balanced or {clr_weight_name} is missing"
104 |             ) from e
105 |         colname = "balanced"
106 |     else:
107 |         colname = "count"
108 |     viewpoint = bioframe.core.stringops.parse_region(viewpoint)
109 | 
110 |     f = partial(
111 |         _extract_profile, clr=clr, clr_weight_name=clr_weight_name, viewpoint=viewpoint
112 |     )
113 | 
114 |     counts = list(map_functor(f, clr.chromnames))
115 | 
116 |     # Concatenate all chromosome dfs into one
117 |     v4c = pd.concat(counts, ignore_index=True)
118 |     if v4c.shape[0] == 0:
119 |         logging.warning(f"No contacts found for viewpoint {viewpoint}")
120 |         v4c = clr.bins()[:][["chrom", "start", "end"]]
121 |         v4c[colname] = np.nan
122 |     else:
123 |         v4c["chrom"] = v4c["chrom"].astype("category")
124 |         v4c["start"] = v4c["start"].astype(int)
125 |         v4c["end"] = v4c["end"].astype(int)
126 |         v4c = bioframe.sort_bedframe(
127 |             v4c,
128 |             view_df=bioframe.make_viewframe(clr.chromsizes),
129 |         )  # sort according to clr.chromsizes order; sort_bedframe returns a copy
130 |         v4c.loc[
131 |             (v4c["chrom"] == viewpoint[0])
132 |             & (v4c["start"] >= viewpoint[1])
133 |             & (v4c["end"] <= viewpoint[2]),
134 |             colname,
135 |         ] = np.nan  # Set within-viewpoint bins to nan
136 |         v4c = (
137 |             pd.merge(
138 |                 clr.bins()[:][["chrom", "start", "end"]],
139 |                 v4c,
140 |                 on=["chrom", "start", "end"],
141 |                 how="left",
142 |             )
143 |             .drop_duplicates()
144 |             .reset_index(drop=True)
145 |         )  # Ensure we return all bins even if empty
146 |     return v4c
--------------------------------------------------------------------------------
/cooltools/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division, print_function
3 | import click
4 | import sys
5 | from .. import __version__
6 | 
7 | # Monkey patch
8 | click.core._verify_python3_env = lambda: None
9 | 
10 | 
11 | CONTEXT_SETTINGS = {
12 |     "help_option_names": ["-h", "--help"],
13 | }
14 | 
15 | 
16 | @click.version_option(__version__, "-V", "--version")
17 | @click.group(context_settings=CONTEXT_SETTINGS)
18 | @click.option("-v", "--verbose", help="Verbose logging", is_flag=True, default=False)
19 | @click.option(
20 |     "-d", "--debug", help="Post mortem debugging", is_flag=True, default=False
21 | )
22 | def cli(verbose, debug):
23 |     """
24 |     Type -h or --help after any subcommand for more information.
25 | 
26 |     """
27 |     if verbose:
28 |         pass
29 |         # logger.setLevel(logging.DEBUG)
30 | 
31 |     if debug:
32 |         import traceback
33 | 
34 |         try:
35 |             import ipdb as pdb
36 |         except ImportError:
37 |             import pdb
38 | 
39 |         def _excepthook(exc_type, value, tb):
40 |             traceback.print_exception(exc_type, value, tb)
41 |             print()
42 |             pdb.pm()
43 | 
44 |         sys.excepthook = _excepthook
45 | 
46 | 
47 | from . import (
48 |     expected_cis,
49 |     expected_trans,
50 |     insulation,
51 |     pileup,
52 |     eigs_cis,
53 |     eigs_trans,
54 |     saddle,
55 |     dots,
56 |     genome,
57 |     sample,
58 |     coverage,
59 |     virtual4c,
60 |     rearrange,
61 | )
62 | 
--------------------------------------------------------------------------------
/cooltools/cli/coverage.py:
--------------------------------------------------------------------------------
1 | import click
2 | import cooler
3 | 
4 | from . import cli
5 | from .. import api
6 | 
7 | import bioframe
8 | 
9 | 
10 | 
11 | @cli.command()
12 | @click.argument(
13 |     "cool_path", metavar="COOL_PATH", type=str, nargs=1,
14 | )
15 | @click.option(
16 |     "--output",
17 |     "-o",
18 |     help="Specify output file name to store the coverage in a tsv format.",
19 |     type=str,
20 |     required=False,
21 | )
22 | @click.option(
23 |     "--ignore-diags",
24 |     help="The number of diagonals to ignore. By default, equals"
25 |     " the number of diagonals ignored during IC balancing.",
26 |     type=int,
27 |     default=None,
28 |     show_default=True,
29 | )
30 | @click.option(
31 |     "--store",
32 |     help="Append columns with coverage (cov_cis_raw, cov_tot_raw), or"
33 |     " (cov_cis_clr_weight_name, cov_tot_clr_weight_name) if calculating"
34 |     " balanced coverage, to the cooler bin table. If clr_weight_name=None,"
35 |     " also stores total cis counts in the cooler info",
36 |     is_flag=True,
37 | )
38 | @click.option(
39 |     "--chunksize",
40 |     help="Split the contact matrix pixel records into equally sized chunks to"
41 |     " save memory and/or parallelize. Default is 10^7",
42 |     type=int,
43 |     default=1e7,
44 |     show_default=True,
45 | )
46 | @click.option(
47 |     "--bigwig",
48 |     help="Also save output as bigWig files for cis and total coverage"
49 |     " with the names <output>.cis.bw and <output>.tot.bw",
50 |     is_flag=True,
51 |     default=False,
52 | )
53 | @click.option(
54 |     "--clr_weight_name",
55 |     help="Name of the weight column. Specify to calculate coverage of"
56 |     " balanced cooler.",
57 |     type=str,
58 |     default=None,
59 |     show_default=False,
60 | )
61 | @click.option(
62 |     "-p",
63 |     "--nproc",
64 |     help="Number of processes to split the work between."
65 |     " [default: 1, i.e. no process pool]",
66 |     default=1,
67 |     type=int,
68 | )
69 | def coverage(
70 |     cool_path, output, ignore_diags, store, chunksize, bigwig, clr_weight_name, nproc,
71 | ):
72 |     """
73 |     Calculate the sums of cis and genome-wide contacts (aka coverage aka marginals) for
74 |     a sparse Hi-C contact map in Cooler HDF5 format.
75 |     Note that the sum(tot_cov) from this function is two times the number of reads
76 |     contributing to the cooler, as each side contributes to the coverage.
77 | 
78 |     COOL_PATH : The path to a .cool file with a balanced Hi-C map.
79 | 
80 |     """
81 | 
82 |     clr = cooler.Cooler(cool_path)
83 | 
84 |     cis_cov, tot_cov = api.coverage.coverage(
85 |         clr, ignore_diags=ignore_diags, chunksize=chunksize, nproc=nproc, store=store, clr_weight_name=clr_weight_name
86 |     )
87 | 
88 | 
89 |     coverage_table = clr.bins()[:][["chrom", "start", "end"]]
90 |     if clr_weight_name is None:
91 |         store_names = ["cov_cis_raw", "cov_tot_raw"]
92 |         coverage_table[store_names[0]] = cis_cov.astype(int)
93 |         coverage_table[store_names[1]] = tot_cov.astype(int)
94 |     else:
95 |         store_names = [f"cov_cis_{clr_weight_name}", f"cov_tot_{clr_weight_name}"]
96 |         coverage_table[store_names[0]] = cis_cov.astype(float)
97 |         coverage_table[store_names[1]] = tot_cov.astype(float)
98 | 
99 |     # output to file if specified:
100 |     if output:
101 |         coverage_table.to_csv(output, sep="\t", index=False, na_rep="nan")
102 |     # or print into stdout otherwise:
103 |     else:
104 |         print(coverage_table.to_csv(sep="\t", index=False, na_rep="nan"))
105 | 
106 |     # Write the coverage tracks as bigWigs:
107 |     if bigwig:
108 |         bioframe.to_bigwig(
109 |             coverage_table,
110 |             clr.chromsizes,
111 |             f"{output}.cis.bw",
112 |             value_field=store_names[0],
113 |         )
114 |         bioframe.to_bigwig(
115 |             coverage_table,
116 |             clr.chromsizes,
117 |             f"{output}.tot.bw",
118 |             value_field=store_names[1],
119 |         )
120 | 
--------------------------------------------------------------------------------
/cooltools/cli/dots.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import cooler
3 | import logging
4 | 
5 | import click
6 | from . import cli
7 | from .. import api
8 | 
9 | from ..lib.common import make_cooler_view
10 | from ..lib.io import read_viewframe_from_file, read_expected_from_file
11 | 
12 | from .util import validate_csv
13 | 
14 | logging.basicConfig(level=logging.INFO)
15 | 
16 | 
17 | @cli.command()
18 | @click.argument(
19 |     "cool_path",
20 |     metavar="COOL_PATH",
21 |     type=str,
22 |     nargs=1,
23 | )
24 | @click.argument(
25 |     "expected_path",
26 |     metavar="EXPECTED_PATH",
27 |     type=str,
28 |     callback=partial(validate_csv, default_column="balanced.avg"),
29 | )
30 | @click.option(
31 |     "--view",
32 |     "--regions",
33 |     help="Path to a BED file with the definition of viewframe (regions)"
34 |     " used in the calculation of EXPECTED_PATH. Dot-calling will be"
35 |     " performed for these regions independently, e.g. chromosome arms."
36 |     " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ",
37 |     type=click.Path(exists=False, dir_okay=False),
38 |     default=None,
39 |     show_default=True,
40 | )
41 | @click.option(
42 |     "--clr-weight-name",
43 |     help="Use cooler balancing weight with this name.",
44 |     type=str,
45 |     default="weight",
46 |     show_default=True,
47 | )
48 | @click.option(
49 |     "-p",
50 |     "--nproc",
51 |     help="Number of processes to split the work between."
52 |     " [default: 1, i.e. no process pool]",
53 |     default=1,
54 |     type=int,
55 | )
56 | @click.option(
57 |     "--max-loci-separation",
58 |     help="Limit loci separation for dot-calling, i.e., do not call dots for"
59 |     " loci that are further than max_loci_separation base pairs apart."
60 |     " 2-20MB is reasonable and would capture most of CTCF-dots.",
61 |     type=int,
62 |     default=2000000,
63 |     show_default=True,
64 | )
65 | @click.option(
66 |     "--max-nans-tolerated",
67 |     help="Maximum number of NaNs tolerated in a footprint of every used filter."
68 |     " Must be controlled with caution, as a large max-nans-tolerated might lead to"
69 |     ' pixels scored in the padding area of the tiles to "penetrate" to the list'
70 |     " of scored pixels for the statistical testing. [max-nans-tolerated <= 2*w ]",
71 |     type=int,
72 |     default=1,
73 |     show_default=True,
74 | )
75 | @click.option(
76 |     "--tile-size",
77 |     help="Tile size for the Hi-C heatmap tiling."
78 |     " Typically on the order of several megabases, and <= max_loci_separation.",
79 |     type=int,
80 |     default=6000000,
81 |     show_default=True,
82 | )
83 | @click.option(
84 |     "--num-lambda-bins",
85 |     help="Number of log-spaced bins to divide your adjusted expected"
86 |     " between. Same as HiCCUPS_W1_MAX_INDX (40) in the original HiCCUPS.",
87 |     type=int,
88 |     default=45,
89 |     show_default=True,
90 | )
91 | @click.option(
92 |     "--fdr",
93 |     help="False discovery rate (FDR) to control in the multiple"
94 |     " hypothesis testing BH-FDR procedure.",
95 |     type=float,
96 |     default=0.02,
97 |     show_default=True,
98 | )
99 | @click.option(
100 |     "--clustering-radius",
101 |     help="Radius for clustering dots that have been called too close to each other."
102 |     " Typically on the order of 40 kilobases, and >= binsize.",
103 |     type=int,
104 |     default=39000,
105 |     show_default=True,
106 | )
107 | @click.option(
108 |     "-v", "--verbose", help="Enable verbose output", is_flag=True, default=False
109 | )
110 | @click.option(
111 |     "-o",
112 |     "--output",
113 |     help="Specify output file name to store called dots in a BEDPE-like format",
114 |     type=str,
115 |     required=True,
116 | )
117 | def dots(
118 |     cool_path,
119 |     expected_path,
120 |     view,
121 |     clr_weight_name,
122 |     nproc,
123 |     max_loci_separation,
124 |     max_nans_tolerated,
125 |     tile_size,
126 |     num_lambda_bins,
127 |     fdr,
128 |     clustering_radius,
129 |     verbose,
130 |     output,
131 | ):
132 |     """
133 |     Call dots on a Hi-C heatmap, at separations no larger than max_loci_separation.
134 | 
135 |     COOL_PATH : The path to a .cool file with a balanced Hi-C map.
136 | 
137 |     EXPECTED_PATH : The path to a tsv-like file with expected signal,
138 |     including a header. Use the '::' syntax to specify a column name.
139 | 
140 |     Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
141 |     therefore these chromosomes must be a subset of chromosomes referred to in
142 |     COOL_PATH. Also chromosomes referred to in EXPECTED_PATH must be non-trivial,
143 |     i.e., contain non-NaN signal. Thus, make sure to prune your EXPECTED_PATH
144 |     before applying this script.
145 | 
146 |     COOL_PATH and EXPECTED_PATH must be binned at the same resolution.
147 | 
148 |     EXPECTED_PATH must contain at least the following columns for cis contacts:
149 |     'region1/2', 'dist', 'n_valid', value_name. value_name is controlled using
150 |     options. A header must be present in the file.
151 | 
152 |     """
153 |     clr = cooler.Cooler(cool_path)
154 |     expected_path, expected_value_col = expected_path
155 | 
156 |     # Either use view from file or all chromosomes in the provided cooler
157 |     if view is None:
158 |         view_df = make_cooler_view(clr)
159 |     else:
160 |         view_df = read_viewframe_from_file(view, clr, check_sorting=True)
161 | 
162 |     expected = read_expected_from_file(
163 |         expected_path,
164 |         contact_type="cis",
165 |         expected_value_cols=[expected_value_col],
166 |         verify_view=view_df,
167 |         verify_cooler=clr,
168 |     )
169 | 
170 |     dot_calls_df = api.dotfinder.dots(
171 |         clr,
172 |         expected,
173 |         expected_value_col=expected_value_col,
174 |         clr_weight_name=clr_weight_name,
175 |         view_df=view_df,
176 |         kernels=None,  # engaging default HiCCUPS kernels
177 |         max_loci_separation=max_loci_separation,
178 |         max_nans_tolerated=max_nans_tolerated,  # test if this has desired behavior
179 |         n_lambda_bins=num_lambda_bins,  # update this eventually
180 |         lambda_bin_fdr=fdr,
181 |         clustering_radius=clustering_radius,
182 |         cluster_filtering=None,
183 |         tile_size=tile_size,
184 |         nproc=nproc,
185 |     )
186 | 
187 |     # output results in a file, when specified
188 |     if output:
189 |         dot_calls_df.to_csv(output, sep="\t", header=True, index=False, na_rep="nan")
190 |     # or print into stdout otherwise:
191 |     else:
192 |         print(
193 |             dot_calls_df.to_csv(
194 |                 output, sep="\t", header=True, index=False, na_rep="nan"
195 |             )
196 |         )
197 | 
--------------------------------------------------------------------------------
/cooltools/cli/eigs_cis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import cooler
4 | import bioframe
5 | from ..api import eigdecomp
6 | from ..lib.common import make_cooler_view
7 | from ..lib.io import read_viewframe_from_file
8 | 
9 | import click
10 | from .util import TabularFilePath, sniff_for_header
11 | from . import cli
12 | 
13 | 
14 | @cli.command()
15 | @click.argument("cool_path", metavar="COOL_PATH", type=str)
16 | @click.option(
17 |     "--phasing-track",
18 |     help="Phasing track for orienting and ranking eigenvectors,"
19 |     " provided as /path/to/track::track_value_column_name.",
20 |     type=TabularFilePath(exists=True, default_column_index=3),
21 |     metavar="TRACK_PATH",
22 | )
23 | @click.option(
24 |     "--view",
25 |     "--regions",
26 |     help="Path to a BED file which defines which regions of the chromosomes to use"
27 |     " (only implemented for cis contacts)."
28 |     " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ",
29 |     default=None,
30 |     type=str,
31 | )
32 | @click.option(
33 |     "--n-eigs",
34 |     help="Number of eigenvectors to compute.",
35 |     type=int,
36 |     default=3,
37 |     show_default=True,
38 | )
39 | @click.option(
40 |     "--clr-weight-name",
41 |     help="Use balancing weight with this name. "
42 |     "Using raw unbalanced data is not currently supported for eigenvectors.",
43 |     type=str,
44 |     default="weight",
45 |     show_default=True,
46 | )
47 | @click.option(
48 |     "--ignore-diags",
49 |     help="The number of diagonals to ignore. By default, equals"
50 |     " the number of diagonals ignored during IC balancing.",
51 |     type=int,
52 |     default=None,
53 |     show_default=True,
54 | )
55 | @click.option(
56 |     "-v", "--verbose", help="Enable verbose output", is_flag=True, default=False
57 | )
58 | @click.option(
59 |     "-o",
60 |     "--out-prefix",
61 |     help="Save compartment track as a BED-like file."
62 | " Eigenvectors and corresponding eigenvalues are stored in" 63 | " out_prefix.contact_type.vecs.tsv and out_prefix.contact_type.lam.txt", 64 | required=True, 65 | ) 66 | @click.option( 67 | "--bigwig", 68 | help="Also save compartment track (E1) as a bigWig file" 69 | " with the name out_prefix.contact_type.bw", 70 | is_flag=True, 71 | default=False, 72 | ) 73 | def eigs_cis( 74 | cool_path, 75 | phasing_track, 76 | view, 77 | n_eigs, 78 | clr_weight_name, 79 | ignore_diags, 80 | verbose, 81 | out_prefix, 82 | bigwig, 83 | ): 84 | """ 85 | Perform eigen value decomposition on a cooler matrix to calculate 86 | compartment signal by finding the eigenvector that correlates best with the 87 | phasing track. 88 | 89 | 90 | COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the 91 | '::' syntax to specify a group path in a multicooler file. 92 | 93 | TRACK_PATH : the path to a BedGraph-like file that stores phasing track as 94 | track-name named column. 95 | 96 | BedGraph-like format assumes tab-separated columns chrom, start, stop and 97 | track-name. 98 | 99 | """ 100 | clr = cooler.Cooler(cool_path) 101 | 102 | if phasing_track is not None: 103 | 104 | # TODO: This all needs to be refactored into a more generic tabular file parser 105 | # Needs to handle stdin case too. 106 | track_path, col = phasing_track 107 | buf, names = sniff_for_header(track_path) 108 | 109 | if names is None: 110 | if not isinstance(col, int): 111 | raise click.BadParameter( 112 | "No header found. " 113 | 'Cannot find "{}" column without a header.'.format(col) 114 | ) 115 | 116 | track_name = "ref" 117 | kwargs = dict( 118 | header=None, 119 | usecols=[0, 1, 2, col], 120 | names=["chrom", "start", "end", track_name], 121 | ) 122 | else: 123 | if isinstance(col, int): 124 | try: 125 | col = names[col] 126 | except IndexError: 127 | raise click.BadParameter( 128 | 'Column #{} not compatible with header "{}".'.format( 129 | col, ",".join(names) 130 | ) 131 | ) 132 | else: 133 | if col not in names: 134 | raise click.BadParameter( 135 | 'Column "{}" not found in header "{}"'.format( 136 | col, ",".join(names) 137 | ) 138 | ) 139 | 140 | track_name = col 141 | kwargs = dict(header="infer", usecols=["chrom", "start", "end", track_name]) 142 | 143 | track_df = pd.read_table( 144 | buf, 145 | dtype={ 146 | "chrom": str, 147 | "start": np.int64, 148 | "end": np.int64, 149 | track_name: np.float64, 150 | }, 151 | comment="#", 152 | verbose=verbose, 153 | **kwargs 154 | ) 155 | phasing_track = track_df 156 | 157 | # define view for cis compartment-calling 158 | # use input "view" BED file or all chromosomes mentioned in "track": 159 | if view is None: 160 | cooler_view_df = make_cooler_view(clr) 161 | view_df = cooler_view_df 162 | else: 163 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 164 | 165 | # TODO: Add check that view_df has the same bins as track 166 | eigvals, eigvec_table = eigdecomp.eigs_cis( 167 | clr=clr, 168 | phasing_track=phasing_track, 169 | view_df=view_df, 170 | n_eigs=n_eigs, 171 | clr_weight_name=clr_weight_name, 172 | ignore_diags=ignore_diags, 173 | clip_percentile=99.9, 174 | sort_metric=None, 175 | ) 176 | 177 | # Output 178 | eigvals.to_csv(out_prefix + ".cis" + ".lam.txt", sep="\t", index=False) 179 | eigvec_table.to_csv(out_prefix + ".cis" + ".vecs.tsv", sep="\t", index=False) 180 | if bigwig: 181 | bioframe.to_bigwig( 182 | eigvec_table, 183 | clr.chromsizes, 184 | out_prefix + ".cis" + ".bw", 185 | value_field="E1", 186 | ) 187 | 
--------------------------------------------------------------------------------
/cooltools/cli/eigs_trans.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import cooler
4 | import bioframe
5 | from ..api import eigdecomp
6 | from ..lib.common import make_cooler_view
7 | 
8 | import click
9 | from .util import TabularFilePath, sniff_for_header
10 | from . import cli
11 | 
12 | 
13 | @cli.command()
14 | @click.argument("cool_path", metavar="COOL_PATH", type=str)
15 | @click.option(
16 |     "--phasing-track",
17 |     help="Phasing track for orienting and ranking eigenvectors,"
18 |     " provided as /path/to/track::track_value_column_name.",
19 |     type=TabularFilePath(exists=True, default_column_index=3),
20 |     metavar="TRACK_PATH",
21 | )
22 | @click.option(
23 |     "--view",
24 |     "--regions",
25 |     help="Path to a BED file which defines which regions of the chromosomes to use"
26 |     " (only implemented for cis contacts). "
27 |     " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ",
28 |     default=None,
29 |     type=str,
30 | )
31 | @click.option(
32 |     "--n-eigs",
33 |     help="Number of eigenvectors to compute.",
34 |     type=int,
35 |     default=3,
36 |     show_default=True,
37 | )
38 | @click.option(
39 |     "--clr-weight-name",
40 |     help="Use balancing weight with this name. Using raw unbalanced data is not currently supported for eigenvectors.",
41 |     type=str,
42 |     default="weight",
43 |     show_default=True,
44 | )
45 | @click.option(
46 |     "-v", "--verbose", help="Enable verbose output", is_flag=True, default=False
47 | )
48 | @click.option(
49 |     "-o",
50 |     "--out-prefix",
51 |     help="Save compartment track as a BED-like file."
52 |     " Eigenvectors and corresponding eigenvalues are stored in"
53 |     " out_prefix.contact_type.vecs.tsv and out_prefix.contact_type.lam.txt",
54 |     required=True,
55 | )
56 | @click.option(
57 |     "--bigwig",
58 |     help="Also save compartment track (E1) as a bigWig file"
59 |     " with the name out_prefix.contact_type.bw",
60 |     is_flag=True,
61 |     default=False,
62 | )
63 | def eigs_trans(
64 |     cool_path,
65 |     phasing_track,
66 |     view,
67 |     n_eigs,
68 |     clr_weight_name,
69 |     verbose,
70 |     out_prefix,
71 |     bigwig,
72 | ):
73 |     """
74 |     Perform eigenvalue decomposition on a cooler matrix to calculate
75 |     compartment signal by finding the eigenvector that correlates best with the
76 |     phasing track.
77 | 
78 | 
79 |     COOL_PATH : the path to a .cool file with a balanced Hi-C map. Use the
80 |     '::' syntax to specify a group path in a multicooler file.
81 | 
82 |     TRACK_PATH : the path to a BedGraph-like file that stores the phasing
83 |     track as a column named track-name.
84 | 
85 |     BedGraph-like format assumes tab-separated columns chrom, start, stop and
86 |     track-name.
87 | 
88 |     """
89 |     clr = cooler.Cooler(cool_path)
90 |     # full chromosome view, based on cooler
91 |     cooler_view_df = make_cooler_view(clr)
92 | 
93 |     if phasing_track is not None:
94 | 
95 |         # TODO: This all needs to be refactored into a more generic tabular file parser
96 |         # Needs to handle stdin case too.
97 |         track_path, col = phasing_track
98 |         buf, names = sniff_for_header(track_path)
99 | 
100 |         if names is None:
101 |             if not isinstance(col, int):
102 |                 raise click.BadParameter(
103 |                     "No header found. 
" 104 | 'Cannot find "{}" column without a header.'.format(col) 105 | ) 106 | 107 | track_name = "ref" 108 | kwargs = dict( 109 | header=None, 110 | usecols=[0, 1, 2, col], 111 | names=["chrom", "start", "end", track_name], 112 | ) 113 | else: 114 | if isinstance(col, int): 115 | try: 116 | col = names[col] 117 | except IndexError: 118 | raise click.BadParameter( 119 | 'Column #{} not compatible with header "{}".'.format( 120 | col, ",".join(names) 121 | ) 122 | ) 123 | else: 124 | if col not in names: 125 | raise click.BadParameter( 126 | 'Column "{}" not found in header "{}"'.format( 127 | col, ",".join(names) 128 | ) 129 | ) 130 | 131 | track_name = col 132 | kwargs = dict(header="infer", usecols=["chrom", "start", "end", track_name]) 133 | 134 | track_df = pd.read_table( 135 | buf, 136 | dtype={ 137 | "chrom": str, 138 | "start": np.int64, 139 | "end": np.int64, 140 | track_name: np.float64, 141 | }, 142 | comment="#", 143 | verbose=verbose, 144 | **kwargs 145 | ) 146 | phasing_track = track_df 147 | 148 | # TODO: implement view for eigs-trans instead of current "partition" 149 | # use input "view" BED file or all chromosomes mentioned in "track": 150 | if view is None: 151 | view_df = cooler_view_df 152 | else: 153 | raise NotImplementedError( 154 | "views are not currently implemented for CLI eigs-trans" 155 | ) 156 | # view_df = read_viewframe_from_file(view, clr, check_sorting=True) 157 | 158 | # TODO: Add check that view_df has the same bins as track 159 | eigvals, eigvec_table = eigdecomp.eigs_trans( 160 | clr=clr, 161 | phasing_track=phasing_track, 162 | n_eigs=n_eigs, 163 | clr_weight_name=clr_weight_name, 164 | partition=None, 165 | sort_metric=None, 166 | ) 167 | 168 | # Output 169 | eigvals.to_csv(out_prefix + ".trans" + ".lam.txt", sep="\t", index=False) 170 | eigvec_table.to_csv(out_prefix + ".trans" + ".vecs.tsv", sep="\t", index=False) 171 | if bigwig: 172 | bioframe.to_bigwig( 173 | eigvec_table, 174 | clr.chromsizes, 175 | out_prefix + ".trans" + ".bw", 176 | value_field="E1", 177 | ) 178 | -------------------------------------------------------------------------------- /cooltools/cli/expected_cis.py: -------------------------------------------------------------------------------- 1 | import cooler 2 | from .. import api 3 | from ..lib.common import make_cooler_view 4 | from ..lib.io import read_viewframe_from_file 5 | 6 | import click 7 | from . import cli 8 | 9 | 10 | @cli.command() 11 | @click.argument("cool_path", metavar="COOL_PATH", type=str, nargs=1) 12 | @click.option( 13 | "--nproc", 14 | "-p", 15 | help="Number of processes to split the work between." 16 | "[default: 1, i.e. no process pool]", 17 | default=1, 18 | type=int, 19 | ) 20 | @click.option( 21 | "--chunksize", 22 | "-c", 23 | help="Control the number of pixels handled by each worker process at a time.", 24 | type=int, 25 | default=int(10e6), 26 | show_default=True, 27 | ) 28 | @click.option( 29 | "--output", 30 | "-o", 31 | help="Specify output file name to store the expected in a tsv format.", 32 | type=str, 33 | required=False, 34 | ) 35 | @click.option( 36 | "--view", 37 | "--regions", 38 | help="Path to a 3 or 4-column BED file with genomic regions" 39 | " to calculated cis-expected on. When region names are not provided" 40 | " (no 4th column), UCSC-style region names are generated." 41 | " Cis-expected is calculated for all chromosomes, when this is not specified." 42 | " Note that '--regions' is the deprecated name of the option. 
Use '--view' instead.", 43 | type=click.Path(exists=True), 44 | required=False, 45 | ) 46 | @click.option( 47 | "--smooth", 48 | help="If set, cis-expected is smoothed and result stored in an additional column" 49 | " e.g. balanced.avg.smoothed", 50 | is_flag=True, 51 | ) 52 | @click.option( 53 | "--aggregate-smoothed", 54 | help="If set, cis-expected is averaged over all regions and then smoothed." 55 | " Result is stored in an additional column, e.g. balanced.avg.smoothed.agg." 56 | " Ignored without smoothing", 57 | is_flag=True, 58 | ) 59 | @click.option( 60 | "--smooth-sigma", 61 | help="Control smoothing with the standard deviation of the smoothing Gaussian kernel," 62 | " ignored without smoothing.", 63 | type=float, 64 | default=0.1, 65 | show_default=True, 66 | ) 67 | @click.option( 68 | "--clr-weight-name", 69 | help="Use balancing weight with this name stored in cooler." 70 | "Provide empty argument to calculate cis-expected on raw data", 71 | type=str, 72 | default="weight", 73 | show_default=True, 74 | ) 75 | @click.option( 76 | "--ignore-diags", 77 | help="Number of diagonals to neglect for cis contact type", 78 | type=int, 79 | default=2, 80 | show_default=True, 81 | ) 82 | def expected_cis( 83 | cool_path, 84 | nproc, 85 | chunksize, 86 | output, 87 | view, 88 | smooth, 89 | aggregate_smoothed, 90 | smooth_sigma, 91 | clr_weight_name, 92 | ignore_diags, 93 | ): 94 | """ 95 | Calculate expected Hi-C signal for cis regions of chromosomal interaction map: 96 | average of interactions separated by the same genomic distance, i.e. 97 | are on the same diagonal on the cis-heatmap. 98 | 99 | When balancing weights are not applied to the data, there is no 100 | masking of bad bins performed. 101 | 102 | COOL_PATH : The paths to a .cool file with a balanced Hi-C map. 103 | 104 | """ 105 | 106 | clr = cooler.Cooler(cool_path) 107 | 108 | if view is None: 109 | # full chromosome case 110 | view_df = make_cooler_view(clr) 111 | else: 112 | # Read view_df dataframe, and verify against cooler 113 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 114 | 115 | result = api.expected.expected_cis( 116 | clr, 117 | view_df=view_df, 118 | intra_only=True, 119 | smooth=smooth, 120 | aggregate_smoothed=aggregate_smoothed, 121 | smooth_sigma=smooth_sigma, 122 | clr_weight_name=clr_weight_name if clr_weight_name else None, 123 | ignore_diags=ignore_diags, 124 | chunksize=chunksize, 125 | nproc=nproc, 126 | ) 127 | 128 | # output to file if specified: 129 | if output: 130 | result.to_csv(output, sep="\t", index=False, na_rep="nan") 131 | # or print into stdout otherwise: 132 | else: 133 | print(result.to_csv(sep="\t", index=False, na_rep="nan")) 134 | -------------------------------------------------------------------------------- /cooltools/cli/expected_trans.py: -------------------------------------------------------------------------------- 1 | import cooler 2 | from .. import api 3 | from ..lib.common import make_cooler_view 4 | from ..lib.io import read_viewframe_from_file 5 | 6 | 7 | import click 8 | from . import cli 9 | 10 | 11 | @cli.command() 12 | @click.argument("cool_path", metavar="COOL_PATH", type=str, nargs=1) 13 | @click.option( 14 | "--nproc", 15 | "-p", 16 | help="Number of processes to split the work between." 17 | "[default: 1, i.e. 
no process pool]", 18 | default=1, 19 | type=int, 20 | ) 21 | @click.option( 22 | "--chunksize", 23 | "-c", 24 | help="Control the number of pixels handled by each worker process at a time.", 25 | type=int, 26 | default=int(10e6), 27 | show_default=True, 28 | ) 29 | @click.option( 30 | "--output", 31 | "-o", 32 | help="Specify output file name to store the expected in a tsv format.", 33 | type=str, 34 | required=False, 35 | ) 36 | @click.option( 37 | "--view", 38 | "--regions", 39 | help="Path to a 3 or 4-column BED file with genomic regions. Trans-expected" 40 | " is calculated on all pairwise combinations of these regions." 41 | " When region names are not provided (no 4th column)," 42 | " UCSC-style region names are generated. Trans-expected is calculated " 43 | " for all inter-chromosomal pairs, when view is not specified." 44 | " Note that '--regions' is the deprecated name of the option. Use '--view' instead.", 45 | type=click.Path(exists=True), 46 | required=False, 47 | ) 48 | @click.option( 49 | "--clr-weight-name", 50 | help="Use balancing weight with this name stored in cooler." 51 | "Provide empty argument to calculate cis-expected on raw data", 52 | type=str, 53 | default="weight", 54 | show_default=True, 55 | ) 56 | def expected_trans( 57 | cool_path, 58 | nproc, 59 | chunksize, 60 | output, 61 | view, 62 | clr_weight_name, 63 | ): 64 | """ 65 | Calculate expected Hi-C signal for trans regions of chromosomal interaction map: 66 | average of interactions in a rectangular block defined by a pair of regions, e.g. 67 | inter-chromosomal blocks. 68 | 69 | When balancing weights are not applied to the data, there is no 70 | masking of bad bins performed. 71 | 72 | COOL_PATH : The paths to a .cool file with a balanced Hi-C map. 73 | 74 | """ 75 | 76 | clr = cooler.Cooler(cool_path) 77 | 78 | if view is None: 79 | # full chromosome case 80 | view_df = make_cooler_view(clr) 81 | else: 82 | # Read view_df dataframe, and verify against cooler 83 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 84 | 85 | result = api.expected.expected_trans( 86 | clr, 87 | view_df=view_df, 88 | clr_weight_name=clr_weight_name if clr_weight_name else None, 89 | chunksize=chunksize, 90 | nproc=nproc, 91 | ) 92 | 93 | # output to file if specified: 94 | if output: 95 | result.to_csv(output, sep="\t", index=False, na_rep="nan") 96 | # or print into stdout otherwise: 97 | else: 98 | print(result.to_csv(sep="\t", index=False, na_rep="nan")) 99 | -------------------------------------------------------------------------------- /cooltools/cli/genome.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import click 3 | from . import cli 4 | 5 | 6 | @cli.group() 7 | def genome(): 8 | """ 9 | Utilities for binned genome assemblies. 10 | 11 | """ 12 | 13 | 14 | @genome.command() 15 | @click.argument("db") 16 | def fetch_chromsizes(db): 17 | import bioframe 18 | 19 | chromsizes = bioframe.fetch_chromsizes(db) 20 | print(chromsizes.to_csv(sep="\t")) 21 | 22 | 23 | @genome.command() 24 | @click.argument("chromsizes_path") 25 | @click.argument("binsize", type=int) 26 | @click.option( 27 | "--all-names", 28 | help='Parse all chromosome names from file, not only default r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$". 
', 29 | is_flag=True, 30 | ) 31 | def binnify(chromsizes_path, binsize, all_names): 32 | import bioframe 33 | 34 | chromsizes = bioframe.read_chromsizes( 35 | chromsizes_path, filter_chroms=not (all_names) 36 | ) 37 | bins = bioframe.binnify(chromsizes, binsize) 38 | print(bins.to_csv(sep="\t", index=False)) 39 | 40 | 41 | @genome.command() 42 | @click.argument("chromsizes_path") 43 | @click.argument("fasta_path") 44 | @click.argument("enzyme_name") 45 | def digest(chromsizes_path, fasta_path, enzyme_name): 46 | import bioframe 47 | 48 | chromsizes = bioframe.read_chromsizes(chromsizes_path, filter_chroms=False) 49 | fasta_records = bioframe.load_fasta(fasta_path, engine="pyfaidx", as_raw=True) 50 | if not chromsizes.index.isin(fasta_records).all(): 51 | raise ValueError( 52 | "Some chromosomes mentioned in {}" 53 | " are not found in {}".format(chromsizes_path, fasta_path) 54 | ) 55 | frags = bioframe.digest(fasta_records, enzyme_name) 56 | print(frags.to_csv(sep="\t", index=False)) 57 | 58 | 59 | @genome.command() 60 | @click.argument("bins_path") 61 | @click.argument("fasta_path") 62 | @click.option("--mapped-only", is_flag=True, default=True) 63 | def gc(bins_path, fasta_path, mapped_only): 64 | import bioframe 65 | import pandas as pd 66 | 67 | if bins_path == "-": 68 | bins_path = sys.stdin 69 | bins = pd.read_table(bins_path) 70 | chromosomes = bins["chrom"].unique() 71 | fasta_records = bioframe.load_fasta(fasta_path, engine="pyfaidx", as_raw=True) 72 | if any(chrom not in fasta_records.keys() for chrom in chromosomes): 73 | raise ValueError( 74 | "Some chromosomes mentioned in {}" 75 | " are not found in {}".format(bins_path, fasta_path) 76 | ) 77 | bins = bioframe.frac_gc(bins, fasta_records, mapped_only) 78 | print(bins.to_csv(sep="\t", index=False)) 79 | 80 | 81 | @genome.command() 82 | @click.argument("bins_path") 83 | @click.argument("db") 84 | def genecov(bins_path, db): 85 | """ 86 | BINS_PATH is the path to bintable. 87 | 88 | DB is the name of the genome assembly. 89 | The gene locations will be automatically downloaded from teh UCSC goldenPath. 90 | """ 91 | import bioframe 92 | import pandas as pd 93 | 94 | bins = pd.read_table(bins_path) 95 | bins = bioframe.frac_gene_coverage(bins, db) 96 | print(bins.to_csv(sep="\t", index=False)) 97 | -------------------------------------------------------------------------------- /cooltools/cli/insulation.py: -------------------------------------------------------------------------------- 1 | import click 2 | import cooler 3 | 4 | from . import cli 5 | from .. import api 6 | from ..lib.common import make_cooler_view 7 | from ..lib.io import read_viewframe_from_file 8 | import bioframe 9 | 10 | 11 | @cli.command() 12 | @click.argument("in_path", metavar="IN_PATH", type=str, nargs=1) 13 | @click.argument("window", nargs=-1, metavar="WINDOW", type=int) 14 | @click.option( 15 | "--nproc", 16 | "-p", 17 | help="Number of processes to split the work between." 18 | "[default: 1, i.e. no process pool]", 19 | default=1, 20 | type=int, 21 | ) 22 | @click.option( 23 | "--output", 24 | "-o", 25 | help="Specify output file name to store the insulation in a tsv format.", 26 | type=str, 27 | required=False, 28 | ) 29 | @click.option( 30 | "--view", 31 | "--regions", 32 | help="Path to a BED file containing genomic regions " 33 | "for which insulation scores will be calculated. Region names can " 34 | "be provided in a 4th column and should match regions and " 35 | "their names in expected." 
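# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the two expected commands above,
# called through the API. The path is hypothetical and the remaining keyword
# arguments are assumed to keep their API defaults.
import cooler
from cooltools import api
from cooltools.lib.common import make_cooler_view

clr = cooler.Cooler("test.100000.cool")
view_df = make_cooler_view(clr)
cis_exp = api.expected.expected_cis(clr, view_df=view_df, nproc=4)
trans_exp = api.expected.expected_trans(clr, view_df=view_df, nproc=4)
cis_exp.to_csv("expected.tsv", sep="\t", index=False, na_rep="nan")
# ---------------------------------------------------------------------------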
36 | " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ", 37 | type=click.Path(exists=True), 38 | required=False, 39 | ) 40 | @click.option( 41 | "--ignore-diags", 42 | help="The number of diagonals to ignore. By default, equals" 43 | " the number of diagonals ignored during IC balancing.", 44 | type=int, 45 | default=None, 46 | show_default=True, 47 | ) 48 | @click.option( 49 | "--clr-weight-name", 50 | help="Use balancing weight with this name. " 51 | "Provide empty argument to calculate insulation on raw data (no masking bad pixels).", 52 | type=str, 53 | default="weight", 54 | show_default=True, 55 | ) 56 | @click.option( 57 | "--min-frac-valid-pixels", 58 | help="The minimal fraction of valid pixels in a sliding diamond. " 59 | "Used to mask bins during boundary detection.", 60 | type=float, 61 | default=0.66, 62 | show_default=True, 63 | ) 64 | @click.option( 65 | "--min-dist-bad-bin", 66 | help="The minimal allowed distance to a bad bin. " 67 | "Use to mask bins after insulation calculation and during boundary detection.", 68 | type=int, 69 | default=0, 70 | show_default=True, 71 | ) 72 | @click.option( 73 | "--threshold", 74 | help="Rule used to threshold the histogram of boundary strengths to exclude weak" 75 | "boundaries. 'Li' or 'Otsu' use corresponding methods from skimage.thresholding." 76 | "Providing a float value will filter by a fixed threshold", 77 | type=str, 78 | default=0, 79 | show_default=True, 80 | ) 81 | @click.option( 82 | "--window-pixels", 83 | help="If set then the window sizes are provided in units of pixels.", 84 | is_flag=True, 85 | ) 86 | @click.option( 87 | "--append-raw-scores", 88 | help="Append columns with raw scores (sum_counts, sum_balanced, n_pixels) " 89 | "to the output table.", 90 | is_flag=True, 91 | ) 92 | @click.option("--chunksize", help="", type=int, default=20000000, show_default=True) 93 | @click.option("--verbose", help="Report real-time progress.", is_flag=True) 94 | @click.option( 95 | "--bigwig", 96 | help="Also save insulation tracks as a bigWig files for different window sizes" 97 | " with the names output..bw", 98 | is_flag=True, 99 | default=False, 100 | ) 101 | def insulation( 102 | in_path, 103 | window, 104 | output, 105 | view, 106 | ignore_diags, 107 | clr_weight_name, 108 | min_frac_valid_pixels, 109 | min_dist_bad_bin, 110 | threshold, 111 | window_pixels, 112 | append_raw_scores, 113 | chunksize, 114 | verbose, 115 | bigwig, 116 | nproc, 117 | ): 118 | """ 119 | Calculate the diamond insulation scores and call insulating boundaries. 120 | 121 | IN_PATH : The path to a .cool file with a balanced Hi-C map. 122 | 123 | WINDOW : The window size for the insulation score calculations. 124 | Multiple space-separated values can be provided. 125 | By default, the window size must be provided in units of bp. 126 | When the flag --window-pixels is set, the window sizes must 127 | be provided in units of pixels instead. 
128 | """ 129 | 130 | clr = cooler.Cooler(in_path) 131 | 132 | # Create view: 133 | cooler_view_df = make_cooler_view(clr) 134 | if view is None: 135 | # full chromosomes: 136 | view_df = cooler_view_df 137 | else: 138 | # read view_df dataframe, and verify against cooler 139 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 140 | 141 | # Read list with windows: 142 | if window_pixels: 143 | window = [win * clr.info["bin-size"] for win in window] 144 | 145 | ins_table = api.insulation.insulation( 146 | clr, 147 | view_df=view_df, 148 | window_bp=window, 149 | ignore_diags=ignore_diags, 150 | clr_weight_name=clr_weight_name if clr_weight_name else None, 151 | min_frac_valid_pixels=min_frac_valid_pixels, 152 | min_dist_bad_bin=min_dist_bad_bin, 153 | threshold=threshold, 154 | append_raw_scores=append_raw_scores, 155 | chunksize=chunksize, 156 | verbose=verbose, 157 | nproc=nproc, 158 | ) 159 | 160 | # output to file if specified: 161 | if output: 162 | ins_table.to_csv(output, sep="\t", index=False, na_rep="nan") 163 | # or print into stdout otherwise: 164 | else: 165 | print(ins_table.to_csv(sep="\t", index=False, na_rep="nan")) 166 | 167 | # Write the insulation track as a bigwig: 168 | if bigwig: 169 | for w in window: 170 | bioframe.to_bigwig( 171 | ins_table, 172 | clr.chromsizes, 173 | output + "." + str(w) + ".bw", 174 | value_field=f"log2_insulation_score_{w}", 175 | ) 176 | -------------------------------------------------------------------------------- /cooltools/cli/logbin_expected.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from ..api import expected 3 | from ..lib.io import read_expected_from_file 4 | 5 | import click 6 | from .util import validate_csv 7 | from . import cli 8 | 9 | 10 | @cli.command() 11 | @click.argument( 12 | "expected_path", 13 | metavar="EXPECTED_PATH", 14 | type=str, 15 | callback=partial(validate_csv, default_column="balanced.sum"), 16 | ) 17 | @click.argument("output_prefix", metavar="OUTPUT_PREFIX", type=str, nargs=1) 18 | @click.option( 19 | "--bins-per-order-magnitude", 20 | metavar="bins_per_order_magnitude", 21 | help="How many bins per order of magnitude. " 22 | "Default of 10 has a ratio of neighboring bins of about 1.25", 23 | type=int, 24 | nargs=1, 25 | default=10, 26 | show_default=True, 27 | ) 28 | @click.option( 29 | "--bin-layout", 30 | metavar="bin_layout", 31 | help="'fixed' means that bins are exactly the same for different datasets, " 32 | "and only depend on bins_per_order_magnitude " 33 | "'longest_regio' means that the last bin will end at size of the longest region. " 34 | "\nGOOD: the last bin will have as much data as possible. " 35 | "\nBAD: bin edges will end up different for different datasets, " 36 | "you can't divide them by each other", 37 | type=click.Choice(["fixed", "longest_region"]), 38 | nargs=1, 39 | default="fixed", 40 | show_default=True, 41 | ) 42 | @click.option( 43 | "--min-nvalid", 44 | metavar="min_nvalid", 45 | help="For each region, throw out bins (log-spaced) that have less than min_nvalid " 46 | "valid pixels. This will ensure that each entree in Pc by region has at least " 47 | "n_valid valid pixels. " 48 | "Don't set it to zero, or it will introduce bugs. 
Setting it to 1 is OK, but "
49 |     "not recommended.",
50 |     type=int,
51 |     nargs=1,
52 |     default=200,
53 |     show_default=True,
54 | )
55 | @click.option(
56 |     "--min-count",
57 |     metavar="min_count",
58 |     help="If counts are found in the data, then for each region, throw out bins "
59 |     "(log-spaced) that have fewer than min_count of counts.sum (raw Hi-C counts). "
60 |     "This will ensure that each entry in P(s) by region has at least min_count "
61 |     "raw Hi-C reads",
62 |     type=int,
63 |     nargs=1,
64 |     default=50,
65 |     show_default=True,
66 | )
67 | @click.option(
68 |     "--spread-funcs",
69 |     metavar="spread_funcs",
70 |     help="A way to estimate the spread of the P(s) curves between regions. "
71 |     "* 'minmax' - the minimum/maximum of by-region P(s)\n"
72 |     "* 'std' - weighted standard deviation of P(s) curves (may produce negative results)\n "
73 |     "* 'logstd' (recommended) weighted standard deviation in logspace",
74 |     type=click.Choice(["minmax", "std", "logstd"]),
75 |     default="logstd",
76 |     show_default=True,
77 |     nargs=1,
78 | )
79 | @click.option(
80 |     "--spread-funcs-slope",
81 |     metavar="spread_funcs_slope",
82 |     help="Same as spread-funcs, but for the slope (derivative) rather than P(s)",
83 |     type=click.Choice(["minmax", "std", "logstd"]),
84 |     default="std",
85 |     show_default=True,
86 |     nargs=1,
87 | )
88 | @click.option(
89 |     "--resolution",
90 |     metavar="resolution",
91 |     help="Data resolution in bp. If provided, an additional column of separation in bp "
92 |     "(s_bp) will be added to the outputs",
93 |     type=int,
94 |     nargs=1,
95 | )
96 | def logbin_expected(
97 |     expected_path,
98 |     output_prefix,
99 |     bins_per_order_magnitude,
100 |     bin_layout,
101 |     min_nvalid,
102 |     min_count,
103 |     spread_funcs,
104 |     spread_funcs_slope,
105 |     resolution,
106 | ):
107 |     """
108 |     Logarithmically bin expected values generated using compute_expected for cis data.
109 | 
110 |     This smoothes the data, resulting in clearer plots and more robust analysis results.
111 |     Also calculates the derivative after Gaussian smoothing.
112 |     For a very detailed description, see
113 |     https://github.com/open2c/cooltools/blob/51b95c3bed8d00a5f1f91370fc5192d9a7face7c/cooltools/expected.py#L988
114 | 
115 |     EXPECTED_PATH : The path to a .tsv file with the output of compute_expected.
116 |     Must include a header. Use the '::' syntax to specify a summary column name.
117 | 
118 |     OUTPUT_PREFIX: Output file name prefix to store the logbinned expected
119 |     (prefix.log.tsv) and derivative (prefix.der.tsv) in the tsv format.
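# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the same log-binning pipeline in
# Python, mirroring the body below. "balanced.sum" is this command's default
# summary column; the file name is hypothetical and the remaining keyword
# arguments are assumed to keep their API defaults.
from cooltools.api import expected
from cooltools.lib.io import read_expected_from_file

cvd = read_expected_from_file(
    "expected.tsv",
    contact_type="cis",
    expected_value_cols=["balanced.sum", "count.sum"],
)
lb_cvd, lb_slopes, lb_distbins = expected.logbin_expected(cvd, summary_name="balanced.sum")
lb_cvd_agg, lb_slopes_agg = expected.combine_binned_expected(
    lb_cvd, Pc_name="balanced.avg", binned_exp_slope=lb_slopes
)
# ---------------------------------------------------------------------------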
120 | """ 121 | 122 | # unpack expected path and name as generated by click's callback to validate_csv: 123 | expected_path, exp_summary_name = expected_path 124 | # make sure "count.sum" is present in the expected file: 125 | expected_summary_cols = [exp_summary_name] 126 | if "count.sum" not in expected_summary_cols: 127 | expected_summary_cols.append("count.sum") 128 | 129 | cvd = read_expected_from_file( 130 | expected_path, 131 | contact_type="cis", 132 | expected_value_cols=expected_summary_cols, 133 | ) 134 | 135 | # name of the column with Probability of contacts is 136 | # based on the name of the column with the diagonal-summary 137 | # stats in the input expected DataFrame: 138 | exp_summary_base, *_ = exp_summary_name.split(".") 139 | Pc_name = f"{exp_summary_base}.avg" 140 | 141 | lb_cvd, lb_slopes, lb_distbins = expected.logbin_expected( 142 | cvd, 143 | summary_name=exp_summary_name, 144 | bins_per_order_magnitude=bins_per_order_magnitude, 145 | bin_layout=bin_layout, 146 | min_nvalid=min_nvalid, 147 | min_count=min_count, 148 | ) 149 | # combine Probabilities of contact for the regions: 150 | lb_cvd_agg, lb_slopes_agg = expected.combine_binned_expected( 151 | lb_cvd, 152 | Pc_name=Pc_name, 153 | binned_exp_slope=lb_slopes, 154 | spread_funcs=spread_funcs, 155 | spread_funcs_slope=spread_funcs_slope, 156 | ) 157 | if resolution is not None: 158 | lb_cvd_agg["s_bp"] = lb_cvd_agg["dist.avg"] * resolution 159 | lb_slopes_agg["s_bp"] = lb_slopes_agg["dist.avg"] * resolution 160 | 161 | lb_cvd_agg.to_csv( 162 | f"{output_prefix}.log.tsv", 163 | sep="\t", 164 | index=False, 165 | na_rep="nan", 166 | ) 167 | lb_slopes_agg.to_csv( 168 | f"{output_prefix}.der.tsv", 169 | sep="\t", 170 | index=False, 171 | na_rep="nan", 172 | ) 173 | -------------------------------------------------------------------------------- /cooltools/cli/rearrange.py: -------------------------------------------------------------------------------- 1 | import click 2 | import cooler 3 | import pandas as pd 4 | 5 | from .. import api 6 | from . import cli 7 | from .util import sniff_for_header 8 | 9 | 10 | @cli.command() 11 | @click.argument("in_path", metavar="IN_PATH", type=str, nargs=1) 12 | @click.argument("out_path", metavar="OUT_PATH", type=str, nargs=1) 13 | @click.option( 14 | "--view", 15 | help="Path to a BED-like file which defines which regions of the chromosomes to use" 16 | " and in what order. Using --new-chrom-col and --orientation-col you can specify the" 17 | " new chromosome names and whether to invert each region (optional)", 18 | default=None, 19 | required=True, 20 | type=str, 21 | ) 22 | @click.option( 23 | "--new-chrom-col", 24 | help="Column name in the view with new chromosome names." 25 | " If not provided and there is no column named 'new_chrom' in the view file, uses" 26 | " original chromosome names", 27 | default=None, 28 | type=str, 29 | ) 30 | @click.option( 31 | "--orientation-col", 32 | help="Columns name in the view with orientations of each region (+ or -)." 33 | " If not providedand there is no column named 'strand' in the view file, assumes" 34 | " all are forward oriented", 35 | default=None, 36 | type=str, 37 | ) 38 | @click.option( 39 | "--assembly", 40 | help="The name of the assembly for the new cooler. 
If None, uses the same as in the"
41 |     " original cooler.",
42 |     default=None,
43 |     type=str,
44 | )
45 | @click.option(
46 |     "--chunksize",
47 |     help="The number of pixels loaded and processed per step of computation.",
48 |     type=int,
49 |     default=int(1e7),
50 |     show_default=True,
51 | )
52 | @click.option(
53 |     "--mode",
54 |     help="(w)rite or (a)ppend to the output file (default: w)",
55 |     default="w",
56 |     type=click.Choice(["w", "a"], case_sensitive=False),
57 | )
58 | def rearrange(
59 |     in_path, out_path, view, new_chrom_col, orientation_col, assembly, chunksize, mode
60 | ):
61 |     """Rearrange data from a cooler according to a new genomic view
62 | 
63 |     Parameters
64 |     ----------
65 |     IN_PATH : str
66 |         .cool file (or URI) with data to rearrange.
67 |     OUT_PATH : str
68 |         .cool file (or URI) to save the rearranged data.
69 |     view : str
70 |         Path to a BED-like file which defines which regions of the chromosomes to use
71 |         and in what order. Has to be a valid viewframe (columns corresponding to region
72 |         coordinates followed by the region name), with potential additional columns.
73 |         Using --new-chrom-col and --orientation-col you can specify the new chromosome
74 |         names and whether to invert each region (optional).
75 |         If it has no header with column names, assumes the `new-chrom-col` is the fifth
76 |         column and `--orientation-col` is the sixth, if they exist.
77 |     new_chrom_col : str
78 |         Column name in the view with new chromosome names.
79 |         If not provided and there is no column named 'new_chrom' in the view file, uses
80 |         original chromosome names.
81 |     orientation_col : str
82 |         Column name in the view with orientations of each region (+ or -). - means the
83 |         region will be inverted.
84 |         If not provided and there is no column named 'strand' in the view file, assumes
85 |         all are forward oriented.
86 |     assembly : str
87 |         The name of the assembly for the new cooler. If None, uses the same as in the
88 |         original cooler.
89 |     chunksize : int
90 |         The number of pixels loaded and processed per step of computation.
91 |     mode : str
92 |         (w)rite or (a)ppend to the output file (default: w)
93 |     """
94 |     clr = cooler.Cooler(in_path)
95 |     default_names = ["chrom", "start", "end", "name", "new_chrom", "strand"]
96 |     buf, names = sniff_for_header(view)
97 |     if names is not None:
98 |         # Simply take column names from the file
99 |         view_df = pd.read_table(buf, header=0, sep="\t")
100 |     else:
101 |         # Use default names
102 |         # If some are missing, pandas creates them with all NaNs
103 |         view_df = pd.read_csv(buf, names=default_names, sep="\t")
104 |         names = view_df.columns
105 |     # If additional column names are not provided, set them to defaults
106 |     # If additional columns are not in the view, raise
107 |     if new_chrom_col is None:
108 |         new_chrom_col = "new_chrom"
109 |     elif new_chrom_col not in view_df.columns:
110 |         raise ValueError(f"New chrom col {new_chrom_col} not found in view columns")
111 |     if orientation_col is None:
112 |         orientation_col = "strand"
113 |     elif orientation_col not in view_df.columns:
114 |         raise ValueError(f"Orientation col {orientation_col} not found in view columns")
115 | 
116 |     # Fill NaNs in additional columns: if they were created here, will be filled with
117 |     # default values. Allows not specifying default values in the file, i.e.
only 118 | # regions that need to be inverted need to have "-" in orientation_col 119 | view_df[new_chrom_col] = view_df[new_chrom_col].fillna(view_df["chrom"]) 120 | view_df[orientation_col] = view_df[orientation_col].fillna("+") 121 | api.rearrange.rearrange_cooler( 122 | clr, 123 | view_df, 124 | out_path, 125 | new_chrom_col=new_chrom_col, 126 | orientation_col=orientation_col, 127 | assembly=assembly, 128 | chunksize=chunksize, 129 | mode=mode, 130 | ) 131 | -------------------------------------------------------------------------------- /cooltools/cli/sample.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from . import cli 4 | from .. import api 5 | 6 | 7 | @cli.command() 8 | @click.argument("in_path", metavar="IN_PATH", type=str, nargs=1) 9 | @click.argument("out_path", metavar="OUT_PATH", type=str, nargs=1) 10 | @click.option( 11 | "-c", 12 | "--count", 13 | help="The target number of contacts in the sample. " 14 | "The resulting sample size will not match it precisely. " 15 | "Mutually exclusive with --frac and --cis-count", 16 | type=int, 17 | default=None, 18 | show_default=False, 19 | ) 20 | @click.option( 21 | "--cis-count", 22 | help="The target number of cis contacts in the sample. " 23 | "The resulting sample size will not match it precisely. " 24 | "Mutually exclusive with --count and --frac", 25 | type=int, 26 | default=None, 27 | show_default=False, 28 | ) 29 | @click.option( 30 | "-f", 31 | "--frac", 32 | help="The target sample size as a fraction of contacts in the original dataset. " 33 | "Mutually exclusive with --count and --cis-count", 34 | type=float, 35 | default=None, 36 | show_default=False, 37 | ) 38 | @click.option( 39 | "--exact", 40 | help="If specified, use exact sampling that guarantees the size of the output sample. " 41 | "Otherwise, binomial sampling will be used and the sample size will be distributed around the target value. ", 42 | is_flag=True, 43 | ) 44 | @click.option( 45 | "--nproc", 46 | "-p", 47 | help="Number of processes to split the work between." 48 | "[default: 1, i.e. no process pool]", 49 | default=1, 50 | type=int, 51 | ) 52 | @click.option( 53 | "--chunksize", 54 | help="The number of pixels loaded and processed per step of computation.", 55 | type=int, 56 | default=int(1e7), 57 | show_default=True, 58 | ) 59 | def random_sample(in_path, out_path, count, cis_count, frac, exact, nproc, chunksize): 60 | """ 61 | Pick a random sample of contacts from a Hi-C map. 62 | 63 | IN_PATH : Input cooler path or URI. 64 | 65 | OUT_PATH : Output cooler path or URI. 66 | 67 | Specify the target sample size with either --count or --frac. 68 | 69 | """ 70 | 71 | api.sample.sample( 72 | in_path, 73 | out_path, 74 | count=count, 75 | cis_count=cis_count, 76 | frac=frac, 77 | exact=exact, 78 | chunksize=chunksize, 79 | nproc=nproc 80 | ) 81 | -------------------------------------------------------------------------------- /cooltools/cli/util.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import csv 3 | import io 4 | import click 5 | 6 | 7 | class TabularFilePath(click.Path): 8 | def __init__( 9 | self, default_column_index, exists=False, resolve_path=False, allow_dash=False 10 | ): 11 | """ 12 | Parameters 13 | ---------- 14 | default_column : str or int 15 | Name of desired column or 0-based column index. 
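# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the random_sample command above,
# via the API; the paths are hypothetical. With exact=True the sample size is
# guaranteed, otherwise binomial sampling scatters it around the target.
from cooltools import api

api.sample.sample("test.cool", "test.50pct.cool", frac=0.5, exact=True, nproc=4)
# ---------------------------------------------------------------------------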
16 | exists : bool 17 | resolve_path : bool 18 | 19 | Returns 20 | ------- 21 | path to file, column name or index 22 | 23 | """ 24 | self.default_column_index = default_column_index 25 | super().__init__( 26 | exists=exists, resolve_path=resolve_path, allow_dash=allow_dash 27 | ) 28 | 29 | def convert(self, value, param, ctx): 30 | if value is None: 31 | return 32 | file_path, _, field = value.partition("::") 33 | file_path = super().convert(file_path, param, ctx) 34 | if not field: 35 | col = self.default_column_index 36 | elif field.isdigit(): 37 | col = int(field) - 1 # assume one-based from command line 38 | if col < 0: 39 | self.fail('Expected one-based column number, received "0".', param, ctx) 40 | else: 41 | col = field 42 | return file_path, col 43 | 44 | 45 | def sniff_for_header(file_path, sep="\t", comment="#"): 46 | """ 47 | Warning: reads the entire file into a StringIO buffer! 48 | 49 | """ 50 | with open(file_path, "r") as f: 51 | buf = io.StringIO(f.read()) 52 | 53 | sample_lines = [] 54 | for line in buf: 55 | if not line.startswith(comment): 56 | sample_lines.append(line) 57 | break 58 | for _ in range(10): 59 | sample_lines.append(buf.readline()) 60 | buf.seek(0) 61 | 62 | has_header = csv.Sniffer().has_header("\n".join(sample_lines)) 63 | if has_header: 64 | names = sample_lines[0].strip().split(sep) 65 | else: 66 | names = None 67 | 68 | return buf, names 69 | 70 | 71 | def validate_csv(ctx, param, value, default_column): 72 | if value is None: 73 | return 74 | file_path, _, field_name = value.partition("::") 75 | if not op.exists(file_path): 76 | raise click.BadParameter( 77 | 'Path not found: "{}"'.format(file_path), ctx=ctx, param=param 78 | ) 79 | if not field_name: 80 | field_name = default_column 81 | elif field_name.isdigit(): 82 | field_name = int(field_name) 83 | return file_path, field_name 84 | -------------------------------------------------------------------------------- /cooltools/cli/virtual4c.py: -------------------------------------------------------------------------------- 1 | import cooler 2 | import bioframe 3 | from .. import api 4 | 5 | 6 | import click 7 | from . import cli 8 | 9 | 10 | @cli.command() 11 | @click.argument("cool_path", metavar="COOL_PATH", type=str, nargs=1) 12 | @click.argument("viewpoint", metavar="VIEWPOINT", type=str, nargs=1) 13 | @click.option( 14 | "--clr-weight-name", 15 | help="Use balancing weight with this name. " 16 | "Provide empty argument to calculate insulation on raw data (no masking bad pixels).", 17 | type=str, 18 | default="weight", 19 | show_default=True, 20 | ) 21 | @click.option( 22 | "-o", 23 | "--out-prefix", 24 | help="Save virtual 4C track as a BED-like file." 25 | " Contact frequency is stored in out_prefix.v4C.tsv", 26 | required=True, 27 | ) 28 | @click.option( 29 | "--bigwig", 30 | help="Also save virtual 4C track as a bigWig file with the name out_prefix.v4C.bw", 31 | is_flag=True, 32 | default=False, 33 | ) 34 | @click.option( 35 | "-p", 36 | "--nproc", 37 | help="Number of processes to split the work between." 38 | " [default: 1, i.e. no process pool]", 39 | default=1, 40 | type=int, 41 | ) 42 | def virtual4c( 43 | cool_path, 44 | viewpoint, 45 | clr_weight_name, 46 | out_prefix, 47 | bigwig, 48 | nproc, 49 | ): 50 | """ 51 | Generate virtual 4C profile from a contact map by extracting all interactions of a 52 | given viewpoint with the rest of the genome. 53 | 54 | 55 | COOL_PATH : the paths to a .cool file with a Hi-C map. 
Use the '::' syntax to 56 | specify a group path in a multicooler file. 57 | 58 | VIEWPOINT : the viewpoint to use for the virtual 4C profile. Provide as a UCSC-string 59 | (e.g. chr1:1-1000) 60 | 61 | 62 | Note: this is a new (experimental) tool, the interface or output might change in a 63 | future version. 64 | """ 65 | clr = cooler.Cooler(cool_path) 66 | 67 | viewpoint = bioframe.core.stringops.parse_region_string(viewpoint) 68 | v4c = api.virtual4c.virtual4c( 69 | clr, 70 | viewpoint, 71 | clr_weight_name=clr_weight_name if clr_weight_name else None, 72 | nproc=nproc, 73 | ) 74 | # Output 75 | if out_prefix: 76 | v4c.to_csv(out_prefix + ".tsv", sep="\t", index=False, na_rep="nan") 77 | if bigwig: 78 | bioframe.to_bigwig( 79 | v4c.dropna(), 80 | clr.chromsizes, 81 | out_prefix + ".bw", 82 | value_field=v4c.columns[3], 83 | ) 84 | else: 85 | print(v4c.to_csv(sep="\t", index=False, na_rep="nan")) 86 | return 87 | -------------------------------------------------------------------------------- /cooltools/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import * 2 | from .io import * 3 | from .checks import * 4 | -------------------------------------------------------------------------------- /cooltools/lib/_query.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # from scipy.sparse import coo_matrix 6 | from cooler.core import _IndexingMixin 7 | 8 | 9 | def arg_prune_partition(seq, step): 10 | """ 11 | Take a monotonic sequence of integers and downsample it such that they 12 | are at least ``step`` apart (roughly), preserving the first and last 13 | elements. Returns indices, not values. 14 | 15 | """ 16 | lo, hi = seq[0], seq[-1] 17 | num = 2 + (hi - lo) // step 18 | cuts = np.linspace(lo, hi, num, dtype=int) 19 | return np.unique(np.searchsorted(seq, cuts)) 20 | 21 | 22 | class CSRSelector(_IndexingMixin): 23 | """ 24 | Instantiates 2D range queries. 25 | 26 | Example 27 | ------- 28 | >>> selector = CSRSelector(h5, (100, 100), 'count', 10000) 29 | >>> query = selector[lo1:hi1, lo2:hi2] 30 | 31 | """ 32 | 33 | def __init__(self, grp, shape, field, chunksize): 34 | self.grp = grp 35 | self.shape = shape 36 | self.field = field 37 | self.chunksize = chunksize 38 | self.offset_selector = grp["indexes"]["bin1_offset"] 39 | self.bin1_selector = grp["pixels"]["bin1_id"] 40 | self.bin2_selector = grp["pixels"]["bin2_id"] 41 | self.data_selector = grp["pixels"][field] 42 | 43 | def _make_getchunk(self, ispan, jspan): 44 | # Factory for function that executes any piece of a 2D range query by 45 | # index. 
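# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the virtual4c command above, via
# the API; the path and viewpoint are hypothetical, and clr_weight_name is
# assumed to keep its API default.
import cooler
import bioframe
from cooltools import api

clr = cooler.Cooler("test.10000.cool")
viewpoint = bioframe.core.stringops.parse_region_string("chr1:30000000-30100000")
v4c = api.virtual4c.virtual4c(clr, viewpoint, nproc=1)
# ---------------------------------------------------------------------------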
46 | 47 | bin1_selector = self.bin1_selector 48 | bin2_selector = self.bin2_selector 49 | data_selector = self.data_selector 50 | field = self.field 51 | i0, i1 = ispan 52 | j0, j1 = jspan 53 | 54 | # coarsegrain the offsets to extract a big chunk of rows at a time 55 | if (i1 - i0 < 1) or (j1 - j0 < 1): 56 | offsets = [] 57 | loc_pruned_offsets = [] 58 | else: 59 | offsets = self.offset_selector[i0 : i1 + 1] 60 | loc_pruned_offsets = arg_prune_partition(offsets, self.chunksize) 61 | 62 | self._loc_pruned_offsets = loc_pruned_offsets 63 | # i0 -- matrix row number offset 64 | # o0 -- corresponding pixel id offset = offsets[0] 65 | 66 | # let's take the downsampled subset of pixel id offsets [o0, ...., o1] 67 | # each successive pair corresponds to a "piece" of the query 68 | def getchunk(chunk_id, include_index=False): 69 | out = {"bin1_id": [], "bin2_id": [], field: []} 70 | if include_index: 71 | out["__index"] = [] 72 | 73 | # extract a chunk of on-disk rows 74 | oi, of = loc_pruned_offsets[chunk_id], loc_pruned_offsets[chunk_id + 1] 75 | p0, p1 = offsets[oi], offsets[of] 76 | slc = slice(p0, p1) 77 | 78 | bin2_extracted = bin2_selector[slc] 79 | data_extracted = data_selector[slc] 80 | if include_index: 81 | ind_extracted = np.arange(slc.start, slc.stop) 82 | 83 | # go row by row and filter 84 | for i in range(oi, of): 85 | # correct the offsets 86 | lo = offsets[i] - p0 87 | hi = offsets[i + 1] - p0 88 | 89 | # this row 90 | bin2 = bin2_extracted[lo:hi] 91 | 92 | # filter for the range of j values we want 93 | mask = (bin2 >= j0) & (bin2 < j1) 94 | cols = bin2[mask] 95 | 96 | # apply same mask for data 97 | data = data_extracted[lo:hi][mask] 98 | 99 | # shortcut for row data 100 | rows = np.full(len(cols), i0 + i, dtype=bin1_selector.dtype) 101 | 102 | out["bin1_id"].append(rows) 103 | out["bin2_id"].append(cols) 104 | out[field].append(data) 105 | if include_index: 106 | out["__index"].append(ind_extracted[lo:hi][mask]) 107 | 108 | if len(out): 109 | for k in out.keys(): 110 | out[k] = np.concatenate(out[k], axis=0) 111 | else: 112 | out["bin1_id"] = np.array([], dtype=bin1_selector.dtype) 113 | out["bin2_id"] = np.array([], dtype=bin2_selector.dtype) 114 | out[field] = np.array([], dtype=data_selector.dtype) 115 | if include_index: 116 | out["__index"] = np.array([], dtype=np.int64) 117 | 118 | return out 119 | 120 | return getchunk, loc_pruned_offsets 121 | 122 | def __getitem__(self, key): 123 | s1, s2 = self._unpack_index(key) 124 | ispan = self._process_slice(s1, self.shape[0]) 125 | jspan = self._process_slice(s2, self.shape[1]) 126 | getchunk, loc_pruned_offsets = self._make_getchunk(ispan, jspan) 127 | return RangeQuery(self, ispan, jspan, self.field, getchunk, loc_pruned_offsets) 128 | 129 | 130 | class RangeQuery(object): 131 | """ 132 | Executor that fulfills a partitioned 2D range query using a variety of outputs. 
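# ---------------------------------------------------------------------------
# Illustrative sketch: a chunked 2D range query over a cooler's pixel table,
# following the CSRSelector docstring above. The file path is hypothetical and
# chunk handling mirrors the RangeQuery methods defined below.
import h5py
import cooler
from cooltools.lib._query import CSRSelector

clr = cooler.Cooler("test.1000000.cool")
n_bins = clr.info["nbins"]
with h5py.File(clr.filename, "r") as h5:
    selector = CSRSelector(h5[clr.root], (n_bins, n_bins), "count", chunksize=10000)
    query = selector[0:100, 0:100]
    for chunk in query.read_chunked():
        print(chunk["bin1_id"].size, "pixels in this piece")
# ---------------------------------------------------------------------------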
133 | 134 | """ 135 | 136 | def __init__(self, selector, ispan, jspan, field, getchunk, loc_pruned_offsets): 137 | self.selector = selector 138 | self.ispan = ispan 139 | self.jspan = jspan 140 | self.field = field 141 | self.n_chunks = len(loc_pruned_offsets) - 1 142 | self._locs = loc_pruned_offsets 143 | self._getchunk = getchunk 144 | 145 | def read_chunk(self, i, include_index=False): 146 | """Read any chunk of the partitioned query as a dictionary.""" 147 | if not 0 <= i < self.n_chunks: 148 | raise IndexError(i) 149 | return self._getchunk(i, include_index) 150 | 151 | def read_chunked(self, include_index=False): 152 | """Iterator over chunks (as dictionaries).""" 153 | for i in range(self.n_chunks): 154 | yield self._getchunk(i, include_index) 155 | 156 | def read(self, include_index=False): 157 | """Read the complete range query as a dictionary""" 158 | result = list(self.read_chunked(include_index)) 159 | return { 160 | k: np.concatenate([d[k] for d in result], axis=0) 161 | for k in ["bin1_id", "bin2_id", self.field] 162 | } 163 | 164 | def __repr__(self): 165 | return ( 166 | "{self.__class__.__name__}" 167 | '({self.ispan}, {self.jspan}, "{self.field}", ...) ' 168 | "[{n} piece(s)]" 169 | ).format(self=self, n=self.n_chunks) 170 | -------------------------------------------------------------------------------- /cooltools/lib/plotting.py: -------------------------------------------------------------------------------- 1 | """ 2 | Migrated from :mod:`mirnylib.plotting`. 3 | 4 | """ 5 | try: 6 | from matplotlib.cm import register_cmap 7 | except ImportError: 8 | from matplotlib import colormaps 9 | register_cmap = colormaps.register 10 | 11 | import matplotlib as mpl 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | 15 | 16 | PALETTES = { 17 | "fall": np.array( 18 | ( 19 | (255, 255, 255), 20 | (255, 255, 204), 21 | (255, 237, 160), 22 | (254, 217, 118), 23 | (254, 178, 76), 24 | (253, 141, 60), 25 | (252, 78, 42), 26 | (227, 26, 28), 27 | (189, 0, 38), 28 | (128, 0, 38), 29 | (0, 0, 0), 30 | ) 31 | ) 32 | / 255, 33 | "blues": np.array( 34 | ( 35 | (255, 255, 255), 36 | (180, 204, 225), 37 | (116, 169, 207), 38 | (54, 144, 192), 39 | (5, 112, 176), 40 | (4, 87, 135), 41 | (3, 65, 100), 42 | (2, 40, 66), 43 | (1, 20, 30), 44 | (0, 0, 0), 45 | ) 46 | ) 47 | / 255, 48 | "acidblues": np.array( 49 | ( 50 | (255, 255, 255), 51 | (162, 192, 222), 52 | (140, 137, 187), 53 | (140, 87, 167), 54 | (140, 45, 143), 55 | (120, 20, 120), 56 | (90, 15, 90), 57 | (60, 10, 60), 58 | (30, 5, 30), 59 | (0, 0, 0), 60 | ) 61 | ) 62 | / 255, 63 | "nmeth": np.array( 64 | ( 65 | (236, 250, 255), 66 | (148, 189, 217), 67 | (118, 169, 68), 68 | (131, 111, 43), 69 | (122, 47, 25), 70 | (41, 0, 20), 71 | ) 72 | ) 73 | / 255, 74 | } 75 | 76 | 77 | def list_to_colormap(color_list, name=None): 78 | color_list = np.array(color_list) 79 | if color_list.min() < 0: 80 | raise ValueError("Colors should be 0 to 1, or 0 to 255") 81 | if color_list.max() > 1.0: 82 | if color_list.max() > 255: 83 | raise ValueError("Colors should be 0 to 1 or 0 to 255") 84 | else: 85 | color_list = color_list / 255.0 86 | return mpl.colors.LinearSegmentedColormap.from_list(name, color_list, 256) 87 | 88 | 89 | def get_cmap(name): 90 | is_reversed = name.endswith("_r") 91 | try: 92 | if is_reversed: 93 | pal = PALETTES[name[:-2]][::-1] 94 | else: 95 | pal = PALETTES[name] 96 | except KeyError: 97 | raise ValueError('Palette not found "{}"'.format(name)) 98 | return list_to_colormap(pal) 99 | 100 | 101 | def 
_register_cmaps(): 102 | for name, pal in PALETTES.items(): 103 | register_cmap(cmap=list_to_colormap(pal), name=name) 104 | register_cmap(cmap=list_to_colormap(pal[::-1]), name=name + "_r") 105 | 106 | 107 | _register_cmaps() 108 | 109 | 110 | def gridspec_inches(wcols, hrows, fig_kwargs={}): 111 | 112 | fig_height_inches = sum(hrows) 113 | 114 | fig_width_inches = sum(wcols) 115 | 116 | fig = plt.figure( 117 | figsize=(fig_width_inches, fig_height_inches), 118 | subplotpars=mpl.figure.SubplotParams( 119 | left=0, right=1, bottom=0, top=1, wspace=0, hspace=0.0 120 | ), 121 | # frameon=False, 122 | **fig_kwargs 123 | ) 124 | fig.set_size_inches(fig_width_inches, fig_height_inches, forward=True) 125 | 126 | gs = mpl.gridspec.GridSpec( 127 | len(hrows), 128 | len(wcols), 129 | left=0, 130 | right=1, 131 | top=1, 132 | bottom=0, 133 | wspace=0, 134 | hspace=0, 135 | width_ratios=wcols, 136 | height_ratios=hrows, 137 | ) 138 | 139 | return fig, gs 140 | -------------------------------------------------------------------------------- /cooltools/lib/runlength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def isrle(starts, lengths, values): 6 | if not (len(starts) == len(lengths) == len(values)): 7 | return False 8 | 9 | if np.any(np.diff(starts) < 0): 10 | return False 11 | 12 | ends = starts + lengths 13 | if np.any(ends[:-1] > starts[1:]): 14 | return False 15 | 16 | return True 17 | 18 | 19 | def rlencode(x, dropna=False): 20 | """ 21 | Run length encoding. 22 | Based on http://stackoverflow.com/a/32681075, which is based on the rle 23 | function from R. 24 | 25 | Parameters 26 | ---------- 27 | x : 1D array_like 28 | Input array to encode 29 | dropna: bool, optional 30 | Drop all runs of NaNs. 31 | 32 | Returns 33 | ------- 34 | start positions, run lengths, run values 35 | 36 | """ 37 | where = np.flatnonzero 38 | x = np.asarray(x) 39 | n = len(x) 40 | if n == 0: 41 | return ( 42 | np.array([], dtype=int), 43 | np.array([], dtype=int), 44 | np.array([], dtype=x.dtype), 45 | ) 46 | 47 | isnumeric = np.issubdtype(x.dtype, np.number) 48 | 49 | if isnumeric: 50 | starts = np.r_[0, where(~np.isclose(x[1:], x[:-1], equal_nan=True)) + 1] 51 | else: 52 | starts = np.r_[0, where(x[1:] != x[:-1]) + 1] 53 | lengths = np.diff(np.r_[starts, n]) 54 | values = x[starts] 55 | 56 | if isnumeric and dropna: 57 | mask = ~np.isnan(values) 58 | starts, lengths, values = starts[mask], lengths[mask], values[mask] 59 | 60 | return starts, lengths, values 61 | 62 | 63 | def rldecode(starts, lengths, values, minlength=None): 64 | """ 65 | Decode a run-length encoding of a 1D array. 66 | 67 | Parameters 68 | ---------- 69 | starts, lengths, values : 1D array_like 70 | The run-length encoding. 71 | minlength : int, optional 72 | Minimum length of the output array. 73 | 74 | Returns 75 | ------- 76 | 1D array. Missing data will be filled with NaNs. 
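# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the palettes registered above
# become available to matplotlib by name once the plotting module is imported.
import numpy as np
import matplotlib.pyplot as plt
import cooltools.lib.plotting  # noqa: F401 -- triggers colormap registration

plt.imshow(np.random.rand(10, 10), cmap="fall")
plt.colorbar()
plt.show()
# ---------------------------------------------------------------------------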
77 | 
78 |     """
79 |     starts, lengths, values = map(np.asarray, (starts, lengths, values))
80 |     # TODO: check validity of rle
81 |     ends = starts + lengths
82 |     n = ends[-1]
83 |     if minlength is not None:
84 |         n = max(minlength, n)
85 |     x = np.full(n, np.nan)
86 |     for lo, hi, val in zip(starts, ends, values):
87 |         x[lo:hi] = val
88 |     return x
89 | 
90 | 
91 | def iterruns(x, value=None, **kwargs):
92 |     starts, lengths, values = rlencode(x, **kwargs)
93 |     if value is None:
94 |         ends = starts + lengths
95 |         return zip(starts, ends, values)
96 |     else:
97 |         mask = values == value
98 |         starts, lengths = starts[mask], lengths[mask]
99 |         ends = starts + lengths
100 |         return zip(starts, ends)
101 | 
102 | 
103 | def fillgaps(starts, lengths, values, minlength=None, fill_value=np.nan):
104 |     """
105 |     Add additional runs to fill in spaces between runs. Defaults to runs of NaN.
106 |     """
107 |     where = np.flatnonzero
108 |     n = starts[-1] + lengths[-1]
109 |     if minlength is not None:
110 |         n = max(minlength, n)
111 | 
112 |     ends = starts + lengths
113 |     lo = np.r_[0, ends]
114 |     hi = np.r_[starts, n]
115 |     gap_locs = where((hi - lo) > 0)
116 |     if len(gap_locs):
117 |         starts = np.insert(starts, gap_locs, lo[gap_locs])
118 |         lengths = np.insert(lengths, gap_locs, hi[gap_locs] - lo[gap_locs])
119 |         values = np.insert(values, gap_locs, fill_value)
120 |     return starts, lengths, values
121 | 
122 | 
123 | def dropgaps(starts, lengths, values):
124 |     """
125 |     Discard runs of NaN.
126 |     """
127 |     mask = ~np.isnan(values)  # keep only non-NaN runs
128 |     starts, lengths, values = starts[mask], lengths[mask], values[mask]
129 |     return starts, lengths, values
130 | 
131 | 
132 | def align(slv1, slv2, minlength=None):
133 |     """
134 |     Overlay two run-length encodings on a common axis: gaps are filled with
135 |     NaN runs, then runs from both encodings are interleaved by start position.
136 | 
137 |     """
138 |     starts1, lengths1, values1 = fillgaps(*slv1)
139 |     starts2, lengths2, values2 = fillgaps(*slv2)
140 |     n1 = starts1[-1] + lengths1[-1]
141 |     n2 = starts2[-1] + lengths2[-1]
142 |     n = max(n1, n2)
143 |     if minlength is not None:
144 |         n = max(minlength, n)
145 |     starts = np.concatenate([starts1, starts2])
146 |     values = np.concatenate([values1, values2])
147 |     idx = np.argsort(starts)
148 |     starts = starts[idx]
149 |     values = values[idx]
150 |     lengths = np.diff(np.r_[starts, n])
151 |     return starts, lengths, values
152 | 
153 | 
154 | def simplify(starts, lengths, values, minlength=None):
155 |     """
156 |     Remove NaN runs and runs of length zero and stitch together consecutive runs
157 |     of the same value.
158 | 159 | """ 160 | starts, lengths, values = fillgaps(starts, lengths, values, minlength) 161 | n = starts[-1] + lengths[-1] 162 | 163 | is_nontrivial = lengths > 0 164 | starts = starts[is_nontrivial] 165 | values = values[is_nontrivial] 166 | 167 | is_new_run = np.r_[True, ~np.isclose(values[:-1], values[1:], equal_nan=True)] 168 | starts = starts[is_new_run] 169 | values = values[is_new_run] 170 | 171 | lengths = np.r_[starts[1:] - starts[:-1], n - starts[-1]] 172 | 173 | mask = ~np.isnan(values) 174 | return starts[mask], lengths[mask], values[mask] 175 | -------------------------------------------------------------------------------- /cooltools/lib/schemas.py: -------------------------------------------------------------------------------- 1 | # schemas of data structures commonly used in cooltools, 2 | # including descriptions of DataFrame dtypes and column definitions 3 | diag_expected_dtypes = { 4 | "region1": "string", 5 | "region2": "string", 6 | "dist": "Int64", 7 | "n_valid": "Int64", 8 | } 9 | 10 | block_expected_dtypes = { 11 | "region1": "string", 12 | "region2": "string", 13 | "n_valid": "Int64", 14 | } 15 | 16 | # cooler weight names that are potentially divisive 17 | # cooltools supports only multiplicative weights for now 18 | DIVISIVE_WEIGHTS_4DN = ["KR", "VC", "VC_SQRT"] 19 | -------------------------------------------------------------------------------- /cooltools/sandbox/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/cooltools/sandbox/__init__.py -------------------------------------------------------------------------------- /cooltools/sandbox/balance.py: -------------------------------------------------------------------------------- 1 | from functools import partial, reduce 2 | from multiprocess import Pool 3 | from operator import add 4 | 5 | import numpy as np 6 | import pandas 7 | 8 | import h5py 9 | 10 | from scipy.sparse import linalg 11 | from cooler.parallel import split, partition 12 | import cooler 13 | 14 | 15 | def bnewt(matvec, mask, tol=1e-6, x0=None, delta=0.1, Delta=3, fl=0): 16 | """ 17 | A balancing algorithm for symmetric matrices. 18 | 19 | X = BNEWT(A) attempts to find a vector X such that 20 | diag(X)*A*diag(X) is close to doubly stochastic. A must 21 | be symmetric and nonnegative. 22 | 23 | Parameters 24 | ---------- 25 | matvec : callable 26 | Linear operator that returns the matrix-vector product with x 27 | mask : 1D array of bool 28 | Mask of good bins 29 | tol : float 30 | Error tolerance 31 | x0 : 1D array 32 | Initial guess 33 | delta : float 34 | How close balancing vectors can get to the edge of the positive cone 35 | Delta : float 36 | How far balancing vectors can get from the edge of the positive cone. Both distances use a relative measure on the size of elements. 37 | fl : int 38 | Verbosity flag; if 1, print convergence info at each outer iteration. 39 | 40 | Returns 41 | ------- 42 | x : 1D array 43 | balancing weights 44 | res : 1D array 45 | residual error norms, measured by norm(diag(x)*A*x - e), one per outer iteration 46 | 47 | """ 48 | # Initialize 49 | n = mask.sum() 50 | 51 | e = np.ones(n) 52 | if x0 is None: 53 | x0 = e.copy() 54 | res = [] 55 | 56 | # Inner stopping criterion parameters.
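    # A sketch of the scheme, assuming this is a port of the bnewt() routine
    # of Knight & Ruiz (2013): the outer loop is a damped Newton iteration for
    # the balancing fixed point x * matvec(x) = e, and each Newton step is
    # solved approximately with conjugate gradient (the inner loop below).
    # `g` damps how quickly the inner CG tolerance `eta` is tightened between
    # outer iterations, and `etamax` caps that tolerance.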
57 | g = 0.9 58 | etamax = 0.1 59 | eta = etamax 60 | stop_tol = tol * 0.5 61 | x = x0 62 | rt = tol ** 2 63 | v = x * matvec(x, mask) 64 | 65 | rk = 1 - v 66 | rho_km1 = np.dot(rk, rk) 67 | rho_km2 = None # will be defined later 68 | rout = rho_km1 69 | rold = rout 70 | 71 | MVP = 0 # We’ll count matrix vector products. 72 | i = 0 # Outer iteration count. 73 | 74 | if fl == 1: 75 | print("it in. it res", flush=True) 76 | 77 | # Outer iteration 78 | while rout > rt: 79 | i += 1 80 | k = 0 81 | y = e.copy() 82 | innertol = max((eta ** 2) * rout, rt) 83 | 84 | # Inner iteration by Conjugate Gradient 85 | while rho_km1 > innertol: 86 | k += 1 87 | 88 | if k == 1: 89 | Z = rk / v 90 | p = Z.copy() 91 | rho_km1 = np.dot(rk, Z) 92 | else: 93 | beta = rho_km1 / rho_km2 94 | p = Z + beta * p 95 | 96 | # Update search direction efficiently. 97 | w = x * matvec(x * p, mask) + v * p 98 | 99 | alpha = rho_km1 / np.dot(p, w) 100 | ap = alpha * p 101 | 102 | # Test distance to boundary of cone. 103 | ynew = y + ap 104 | if min(ynew) <= delta: 105 | if delta == 0: 106 | break 107 | idx = ap < 0 108 | gamma = np.min((delta - y[idx]) / ap[idx]) 109 | y = y + gamma * ap 110 | break 111 | 112 | if max(ynew) >= Delta: 113 | idx = ynew > Delta 114 | gamma = np.min((Delta - y[idx]) / ap[idx]) 115 | y = y + gamma * ap 116 | break 117 | 118 | y = ynew.copy() 119 | rk = rk - alpha * w 120 | rho_km2 = rho_km1 121 | Z = rk / v 122 | rho_km1 = np.dot(rk, Z) 123 | 124 | x = x * y 125 | v = x * matvec(x, mask) 126 | 127 | rk = 1 - v 128 | rho_km1 = np.dot(rk, rk) 129 | rout = rho_km1 130 | MVP += k + 1 131 | 132 | # Update inner iteration stopping criterion. 133 | rat = rout / rold 134 | rold = rout 135 | res_norm = np.sqrt(rout) 136 | eta_o = eta 137 | eta = g * rat 138 | if g * (eta_o ** 2) > 0.1: 139 | eta = max(eta, g * (eta_o ** 2)) 140 | 141 | eta = max(min(eta, etamax), stop_tol / res_norm) 142 | if fl == 1: 143 | print("%3d\t%6d\t%.3e" % (i, k, res_norm), flush=True) 144 | res.append(res_norm) 145 | 146 | print("Matrix-vector products = %6d" % (MVP,), flush=True) 147 | 148 | x_full = np.zeros(len(mask)) 149 | x_full[mask] = x 150 | return x_full, np.array(res) 151 | -------------------------------------------------------------------------------- /cooltools/sandbox/cool2cworld.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import gzip 4 | import tarfile 5 | import tempfile 6 | 7 | from . import fastsavetxt 8 | 9 | import cooler 10 | 11 | 12 | def dump_cworld( 13 | in_cooler, 14 | out=None, 15 | region=None, 16 | iced=False, 17 | iced_unity=False, 18 | buffer_size=int(1e8), 19 | ): 20 | """ 21 | Dump a genome-wide contact matrix from cooler into a CWorld-format 22 | text matrix. 23 | 24 | Parameters 25 | ---------- 26 | in_cooler : str or cooler 27 | A cooler object or the path to the file. 28 | 29 | out : str or file object 30 | Either: 31 | -- a path to the output file. If ends with .gz the output is gzipped 32 | -- a file object 33 | -- a stdin of a Popen object 34 | -- None, in which case the data is dumped into a string and returned 35 | TIP: when using files/stdin do not forget to flush()/communicate(). 36 | 37 | region : str 38 | The region to dump. By default is None, dump the genome-wide matrix. 39 | 40 | iced : bool, optional 41 | If True, dump the balanced matrix. 42 | 43 | iced_unity : bool, optional 44 | If True and `iced` is True, dump the matrix balanced to a unity. 
45 | 46 | buffer_size : int 47 | The chunk size for iterating over the rows of the Hi-C matrix. 48 | """ 49 | 50 | # Prepare the out pipe and the clean-up function. 51 | if not (out): 52 | out = io.BytesIO(b"") 53 | if issubclass(type(out), str) or issubclass(type(out), bytearray): 54 | if out.endswith(".gz"): 55 | writer = fastsavetxt.gzipWriter(out) 56 | out_pipe = writer.stdin 57 | close_out_func = writer.communicate 58 | else: 59 | writer = open(out, "wb") 60 | out_pipe = writer 61 | close_out_func = writer.flush 62 | elif hasattr(out, "write"): 63 | out_pipe = out 64 | close_out_func = fastsavetxt.empty_func 65 | 66 | # Make headers 67 | if not issubclass(type(in_cooler), cooler.Cooler): 68 | c = cooler.Cooler(in_cooler) 69 | else: 70 | c = in_cooler 71 | 72 | res = c.info["bin-size"] 73 | gname = c.info["genome-assembly"] 74 | 75 | bins = c.bins()[:] if not (region) else c.bins().fetch(region) 76 | nbins = len(bins) 77 | 78 | col_headers = "\t".join( 79 | ["{}x{}".format(nbins, nbins)] 80 | + [ 81 | "{}|{}|{}:{}-{}".format(binidx, gname, b.chrom, b.start + 1, b.end) 82 | for binidx, b in bins.iterrows() 83 | ] 84 | ).encode() 85 | 86 | row_headers = [ 87 | "{}|{}|{}:{}-{}".format(binidx1, gname, b1.chrom, b1.start + 1, b1.end).encode() 88 | for binidx1, b1 in bins.iterrows() 89 | ] 90 | 91 | # Iterate over a matrix one block at a time. 92 | nrows_per_step = max(1, buffer_size // nbins) 93 | for i in range(nbins // nrows_per_step + 1): 94 | lo = min(nbins, i * nrows_per_step) 95 | hi = min(nbins, (i + 1) * nrows_per_step) 96 | if hi <= lo: 97 | break 98 | mat = ( 99 | c.matrix(balance=iced) 100 | if not (region) 101 | else c.matrix(balance=iced).fetch(region) 102 | )[lo:hi] 103 | if iced and (not iced_unity): 104 | mat *= c._load_attrs("/bins/weight")["scale"] 105 | 106 | fastsavetxt.array2txt( 107 | mat, 108 | out_pipe, 109 | format_string=b"%.8f" if iced_unity else b"%.4lf", 110 | header=col_headers if i == 0 else None, 111 | row_headers=row_headers[lo:hi], 112 | ) 113 | 114 | if issubclass(type(out), io.BytesIO): 115 | return out.getvalue() 116 | else: 117 | close_out_func() 118 | 119 | 120 | def dump_cworld_tar( 121 | cooler_paths, 122 | out_path, 123 | ): 124 | """ 125 | Makes a CWorld .tar archive with binned contact maps at multiple resolutions 126 | in .matrix.txt.gz format. 127 | 128 | Parameters 129 | ---------- 130 | cooler_paths : a list of str 131 | The paths to all coolers to dump into a single CWorld tar archive. 132 | Must correspond to the same dataset and have different resolutions. 133 | 134 | out_path : str 135 | The path to the output file. 
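    A minimal usage sketch (the cooler paths here are hypothetical):

    >>> dump_cworld_tar(
    ...     ["sample.1000.cool", "sample.10000.cool"],
    ...     "sample.cworld.tar",
    ... )  # doctest: +SKIP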
136 | 137 | """ 138 | 139 | dataset_name = os.path.splitext(os.path.split(out_path)[1])[0] 140 | 141 | with tempfile.TemporaryDirectory() as cworld_tmp_path: 142 | for cooler_path in cooler_paths: 143 | res = cooler.Cooler(cooler_path).info["bin-size"] 144 | os.mkdir(os.path.join(cworld_tmp_path, "C-" + str(res))) 145 | for iced, iced_label in [(True, "iced"), (False, "raw")]: 146 | folder_path = os.path.join(cworld_tmp_path, "C-" + str(res), iced_label) 147 | os.mkdir(folder_path) 148 | 149 | mat_path = os.path.join( 150 | folder_path, 151 | "{}__C-{}-{}.matrix.gz".format(dataset_name, res, iced_label), 152 | ) 153 | 154 | dump_cworld( 155 | in_cooler=cooler_path, out=mat_path, iced=iced, iced_unity=False 156 | ) 157 | 158 | with tarfile.open(out_path, mode="w") as archive: 159 | archive.add(cworld_tmp_path, arcname=dataset_name, recursive=True) 160 | -------------------------------------------------------------------------------- /cooltools/sandbox/cooler_filters/Example_usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pixel_filter_util\n", 10 | "import cooler" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "- cis_total_ratio_filter() allows partial evaluation. However, if you provide only the threshold, you need to pass it with the keyword 'threshold', as in the example below. This returns a function that can then be applied to a cooler object" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "test_filter = pixel_filter_util.cis_total_ratio_filter(threshold=0.5)\n", 27 | "clr = cooler.Cooler('test_data_util.cool')\n", 28 | "bin_mask = test_filter(clr)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "- If you have multiple filters, you can use generate_bin_mask() to apply them sequentially to the bin table and output a combined mask" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "test_filter2 = pixel_filter_util.cis_total_ratio_filter(threshold=0.7)\n", 45 | "bin_mask = pixel_filter_util.generate_bin_mask(clr, [test_filter, test_filter2])\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "- The bin_mask can then be used in create_filtered_cooler()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "output_path = \"./test_data_util_filtered.cool\"\n", 62 | "pixel_filter_util.create_filtered_cooler(\n", 63 | " output_path, clr, bin_mask, chunksize=10_000_000, nproc=1\n", 64 | " )" 65 | ] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "akita", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.9.18" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /cooltools/sandbox/cooler_filters/pixel_filter_util.py:
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from toolz import curry 3 | import cooltools 4 | import cooler 5 | import functools 6 | from multiprocessing import Pool 7 | import logging 8 | from cooltools.lib.common import pool_decorator 9 | 10 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s : %(message)s") 11 | logger = logging.getLogger("data_util") 12 | logger.propagate = False  # Disable propagation to the root logger 13 | 14 | ch = logging.StreamHandler() 15 | ch.setLevel(logging.INFO) 16 | ch.setFormatter(formatter) 17 | logger.addHandler(ch) 18 | 19 | @curry 20 | def cis_total_ratio_filter(clr, threshold=0.5): 21 | """ 22 | Filter out bins with low cis-to-total coverage ratio from a Cooler object. 23 | 24 | Parameters 25 | ---------- 26 | clr : cooler.Cooler 27 | A Cooler object containing Hi-C contact matrices. 28 | threshold : float 29 | The threshold cis-to-total coverage ratio below which bins are considered bad. 30 | 31 | Returns 32 | ------- 33 | numpy.ndarray 34 | A boolean bin mask (True for bins that pass the filter). 35 | 36 | Note 37 | ---- 38 | This function is curried: it can be partially evaluated by providing only the threshold, e.g. ``cis_total_ratio_filter(threshold=0.5)``. 39 | """ 40 | if isinstance(clr, float): 41 | raise TypeError( 42 | "If only threshold value is provided, please use 'threshold' keyword to set threshold value (e.g. threshold=0.2)" 43 | ) 44 | coverage = cooltools.coverage(clr) 45 | cis_total_cov = coverage[0] / coverage[1] 46 | bin_mask = cis_total_cov > threshold 47 | 48 | return bin_mask 49 | 50 | 51 | def generate_bin_mask( 52 | clr, filters=(), store=False, store_name="cis_total_ratio_>_0.5_thres" 53 | ): 54 | """ 55 | Generates a binary mask for a given `clr` object based on a list of filters and thresholds. 56 | 57 | Parameters 58 | ---------- 59 | clr : cooler.Cooler 60 | A cooler object containing Hi-C contact matrices. 61 | filters : list 62 | A list of filter functions to apply to the contact matrices. Defaults to no filters, i.e. all bins pass. 63 | store : bool, optional 64 | If True, store the results in the input cooler file when finished. Default is False. 65 | store_name : str, optional 66 | Name of the bin-table column used to store the bin mask. 67 | 68 | Returns 69 | ------- 70 | bin_mask : numpy.ndarray 71 | A binary mask indicating which genomic bins pass all filters. 72 | """ 73 | if not isinstance(filters, (list, tuple)): 74 | raise TypeError("the 'filters' parameter takes a list of filter functions") 75 | 76 | bin_mask = np.array([True] * clr.bins().shape[0]) 77 | for filter in filters: 78 | bin_mask *= filter(clr) 79 | 80 | if store: 81 | with clr.open("r+") as grp: 82 | if store_name in grp["bins"]: 83 | del grp["bins"][store_name] 84 | h5opts = dict(compression="gzip", compression_opts=6) 85 | grp["bins"].create_dataset(store_name, data=bin_mask, **h5opts, dtype=bool) 86 | 87 | return bin_mask 88 | 89 | 90 | def _pixel_filter(chunk_pixels, good_bins_index): 91 | """ 92 | Filters a chunk of pixels based on a list of good bin indices. 93 | 94 | Parameters 95 | ---------- 96 | chunk_pixels : pandas.DataFrame 97 | A DataFrame containing the pixels to be filtered. It must have columns 'bin1_id' and 'bin2_id'. 98 | good_bins_index : list of int 99 | A list of indices representing the good bins. 100 | 101 | Returns 102 | ------- 103 | pandas.DataFrame 104 | A DataFrame containing only the pixels whose bin1_id and bin2_id are in good_bins_index.
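Examples
--------
A minimal sketch with a hand-made chunk of pixels:

>>> import pandas as pd
>>> chunk = pd.DataFrame(
...     {"bin1_id": [0, 1, 2], "bin2_id": [1, 3, 2], "count": [5, 2, 7]}
... )
>>> _pixel_filter(chunk, [0, 1, 2]).index.tolist()
[0, 2]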
105 | """ 106 | 107 | pixels_mask = chunk_pixels["bin1_id"].isin(good_bins_index) * chunk_pixels[ 108 | "bin2_id" 109 | ].isin(good_bins_index) 110 | return chunk_pixels[pixels_mask] 111 | 112 | 113 | def pixel_iter_chunks(clr, chunksize): 114 | """ 115 | Iterate over the pixels of a cooler object in chunks of a given size. 116 | 117 | Parameters 118 | ---------- 119 | clr : cooler.Cooler 120 | A cooler object containing Hi-C data. 121 | chunksize : int 122 | The size of each chunk of pixels to iterate over. 123 | 124 | Yields 125 | ------ 126 | chunk : numpy.ndarray 127 | A chunk of pixels of size `chunksize`. 128 | """ 129 | selector = clr.pixels() 130 | for lo, hi in cooler.util.partition(0, len(selector), chunksize): 131 | chunk = selector[lo:hi] 132 | yield chunk 133 | 134 | @pool_decorator 135 | def create_filtered_cooler( 136 | output_uri, clr, bin_mask, chunksize=10_000_000, nproc=1, map=map 137 | ): 138 | """ 139 | Create a filtered cooler file from a given cooler object and a binary mask of good bins. 140 | 141 | Parameters 142 | ---------- 143 | output_uri : str 144 | The URI of the output cooler file to be created. 145 | clr : cooler.Cooler 146 | The cooler object to be filtered. 147 | bin_mask : numpy.ndarray 148 | A boolean array indicating which bins to keep (True) and which to discard (False). 149 | Must have the same length as the number of bins in the cooler object. 150 | nproc : int, optional 151 | The number of processes to use for parallelization. Default is 16. 152 | chunksize : int, optional 153 | The number of pixels to process per chunk. Default is 10,000,000. 154 | 155 | Returns 156 | ------- 157 | None 158 | """ 159 | if len(bin_mask) != clr.bins().shape[0]: 160 | raise ValueError( 161 | "bin_mask should have the same length as bin table in cool file" 162 | ) 163 | logger.debug("Start to create cooler file...") 164 | bin_table = clr.bins()[:][['chrom','start','end']].copy() 165 | good_bins_index = np.array(range(clr.bins().shape[0]))[bin_mask] 166 | pixels_filter = functools.partial(_pixel_filter, good_bins_index=good_bins_index) 167 | 168 | cooler.create_cooler( 169 | output_uri, 170 | bins=bin_table, 171 | pixels=map(pixels_filter, pixel_iter_chunks(clr, chunksize)), 172 | ordered=True, 173 | columns=["count"], 174 | ) 175 | 176 | logger.debug("done") 177 | -------------------------------------------------------------------------------- /cooltools/sandbox/cooler_filters/test_data_util.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/cooltools/sandbox/cooler_filters/test_data_util.cool -------------------------------------------------------------------------------- /cooltools/sandbox/expected_smoothing.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import numpy as np 4 | import numba 5 | 6 | 7 | def _log_interp(xs, xp, fp): 8 | """ 9 | Interpolate a function in the log-log space. 10 | Equivalent to np.exp(np.interp(np.log(xs), np.log(xp), np.log(fp))). 11 | 12 | Parameters 13 | ---------- 14 | xs : array-like 15 | The x-coordinates at which to evaluate the interpolated values. 16 | xp : 1-D sequence of floats 17 | The x-coordinates of the data points, must be increasing. 18 | fp : 1D array 19 | The y-coordinates of the data points, same length as xp. 20 | 21 | Returns 22 | ------- 23 | ys : 1D array 24 | The interpolated values, same shape as x. 
25 | """ 26 | with np.errstate(divide="ignore"): 27 | ys = np.exp( 28 | np.interp( 29 | np.log(xs), 30 | np.log(xp), 31 | np.log(fp), 32 | ) 33 | ) 34 | 35 | return ys 36 | 37 | 38 | @numba.njit 39 | def _log_thin(xs, min_log10_step=0.1): 40 | """ 41 | Thin out a sorted array, by selecting a subset of elements that are uniformly spaced in log-space. 42 | 43 | Parameters 44 | ---------- 45 | xs : array-like 46 | An array of elements to thin out. 47 | min_log10_step : float, optional 48 | The minimal log10 ratio between consecutive elements in the output, by default 0.1 49 | 50 | Returns 51 | ------- 52 | xs_thinned : array-like 53 | A subset of elements from xs, whose logs are approx. uniformly spaced. 54 | """ 55 | xs_thinned = [xs[0]] 56 | prev = xs[0] 57 | min_ratio = 10**min_log10_step 58 | for x in xs[1:]: 59 | if x > prev * min_ratio: 60 | xs_thinned.append(x) 61 | prev = x 62 | 63 | if xs_thinned[-1] != xs[-1]: 64 | xs_thinned.append(xs[-1]) 65 | return np.array(xs_thinned) 66 | 67 | 68 | @numba.njit 69 | def _log_smooth_numba( 70 | xs, 71 | ys, 72 | sigma_log10=0.1, 73 | window_sigma=5, 74 | points_per_sigma=10, 75 | ): 76 | xs_thinned = xs 77 | if points_per_sigma: 78 | xs_thinned = _log_thin(xs, sigma_log10 / points_per_sigma) 79 | 80 | N = xs_thinned.size 81 | N_FUNCS = ys.shape[0] 82 | 83 | log_xs = np.log10(xs) 84 | log_thinned_xs = np.log10(xs_thinned) 85 | 86 | ys_smoothed = np.zeros((N_FUNCS, N)) 87 | 88 | for i in range(N): 89 | cur_log_x = log_thinned_xs[i] 90 | lo = np.searchsorted(log_xs, cur_log_x - sigma_log10 * window_sigma) 91 | hi = np.searchsorted(log_xs, cur_log_x + sigma_log10 * window_sigma) 92 | smooth_weights = np.exp( 93 | -((cur_log_x - log_xs[lo:hi]) ** 2) / 2 / sigma_log10 / sigma_log10 94 | ) 95 | norm = smooth_weights.sum() 96 | 97 | if norm > 0: 98 | smooth_weights /= norm 99 | 100 | for k in range(N_FUNCS): 101 | ys_smoothed[k, i] = np.sum(ys[k, lo:hi] * smooth_weights) 102 | 103 | return xs_thinned, ys_smoothed 104 | 105 | 106 | def log_smooth( 107 | xs, 108 | ys, 109 | sigma_log10=0.1, 110 | window_sigma=5, 111 | points_per_sigma=10, 112 | ): 113 | """ 114 | Convolve a function or multiple functions with a gaussian kernel in the log space. 115 | 116 | Parameters 117 | ---------- 118 | xs : 1D array 119 | The x-coordinates (function arguments) of the data points, must be increasing. 120 | ys : 1D or 2D array 121 | The y-coordinates (function values) of the data points. 122 | If 2D, rows correspond to multiple functions, columns correspond to different points. 123 | sigma_log10 : float, optional 124 | The standard deviation of the smoothing Gaussian kernel, applied over log10(xs), by default 0.1 125 | window_sigma : int, optional 126 | Width of the smoothing window, expressed in sigmas, by default 5 127 | points_per_sigma : int, optional 128 | If provided, smoothing is done only for `points_per_sigma` points per sigma and the 129 | rest of the values are interpolated (this results in a major speed-up). By default 10 130 | 131 | Returns 132 | ------- 133 | xs_thinned : 1D array 134 | The subset of arguments, uniformly spaced in log-space. 135 | ys_smoothed : 1D or 2D array 136 | The gaussian-smoothed function values. 
137 | 138 | """ 139 | xs = np.asarray(xs) 140 | ys = np.asarray(ys) 141 | 142 | if xs.ndim != 1: 143 | raise ValueError("xs must be a 1D vector") 144 | if ys.ndim not in (1, 2): 145 | raise ValueError('ys must be either a 1D vector or a "tall" 2D matrix') 146 | if xs.shape[0] != ys.shape[-1]: 147 | raise ValueError("xs and ys must have the same number of observations") 148 | 149 | ys = ys[np.newaxis, :] if ys.ndim == 1 else ys 150 | 151 | xs_thinned, ys_smoothed = _log_smooth_numba( 152 | xs, ys, sigma_log10, window_sigma, points_per_sigma 153 | ) 154 | 155 | if points_per_sigma: 156 | ys_smoothed = np.asarray( 157 | [_log_interp(xs, xs_thinned, ys_row) for ys_row in ys_smoothed] 158 | ) 159 | 160 | ys_smoothed = ys_smoothed[0] if ys.shape[0] == 1 else ys_smoothed 161 | 162 | return ys_smoothed 163 | 164 | 165 | def _smooth_cvd_group( 166 | cvd, sigma_log10, window_sigma, points_per_sigma, cols=None, suffix="" 167 | ): 168 | cvd_smoothed = ( 169 | cvd.groupby(cols["dist"]) 170 | .agg( 171 | { 172 | cols["n_pixels"]: "sum", 173 | cols["n_contacts"]: "sum", 174 | } 175 | ) 176 | .reset_index() 177 | ) 178 | 179 | smoothed_balanced_sum, smoothed_n_valid = log_smooth( 180 | cvd_smoothed[cols["dist"]].values.astype(np.float64), 181 | [ 182 | cvd_smoothed[cols["n_contacts"]].values.astype(np.float64), 183 | cvd_smoothed[cols["n_pixels"]].values.astype(np.float64), 184 | ], 185 | sigma_log10=sigma_log10, 186 | window_sigma=window_sigma, 187 | points_per_sigma=points_per_sigma, 188 | ) 189 | 190 | cvd_smoothed[cols["n_pixels"] + suffix] = smoothed_n_valid 191 | cvd_smoothed[cols["n_contacts"] + suffix] = smoothed_balanced_sum 192 | cvd_smoothed[cols["output_prefix"] + suffix] = ( 193 | cvd_smoothed[cols["n_contacts"] + suffix] 194 | / cvd_smoothed[cols["n_pixels"] + suffix] 195 | ) 196 | 197 | return cvd_smoothed -------------------------------------------------------------------------------- /cooltools/sandbox/fastsavetxt.pyx: -------------------------------------------------------------------------------- 1 | ### Adaptation of Max Imakaev's fast txt matrix writer. 2 | 3 | cimport cython 4 | import os 5 | import subprocess 6 | 7 | from libc.stdlib cimport malloc, free 8 | from libc.string cimport strcpy, strlen 9 | 10 | import numpy as np 11 | cimport numpy as np 12 | 13 | cdef extern from "stdio.h": 14 | int sprintf(char *str, char *format, ...) 15 | 16 | def commandExists(command): 17 | """ 18 | Checks if the bash command exists. 19 | """ 20 | command = command.split()[0] 21 | if subprocess.call(['which', command]) != 0: 22 | return False 23 | return True 24 | 25 | def gzipWriter(filepath): 26 | """ 27 | Creates a writing process with gzip or parallel gzip (pigz) attached to it. 28 | """ 29 | filepath = os.path.abspath(filepath) 30 | with open(filepath, 'wb') as outFile: 31 | if commandExists("pigz"): 32 | writer = ["pigz", "-c", "-9"] 33 | else: 34 | writer = ["gzip", "-c", "-2"] 35 | 36 | pwrite = subprocess.Popen( 37 | writer, 38 | stdin=subprocess.PIPE, 39 | stdout=outFile, 40 | shell=False, 41 | bufsize=-1) 42 | return pwrite 43 | 44 | def empty_func(): 45 | return None 46 | 47 | @cython.boundscheck(False) 48 | @cython.nonecheck(False) 49 | @cython.wraparound(False) 50 | 51 | def array2txt( 52 | mat, 53 | out, 54 | format_string=b'%.4lf', 55 | sep=b'\t', 56 | newline=b'\n', 57 | header=None, 58 | row_headers=None, 59 | max_element_len=100): 60 | """ 61 | Dump a 2d array into a text file, optionally gzipped. 
62 | This implementation is faster than np.savetxt and allows the user 63 | to provide column/row headers. 64 | 65 | Parameters 66 | ---------- 67 | mat : a 2D numpy array or a list of lists of numbers (float/integer) 68 | 69 | out : str or file object 70 | Either: 71 | -- a path to the output file. If ends with .gz the output is gzipped 72 | -- a file object 73 | -- the stdin of a Popen object 74 | TIP: when using files/stdin do not forget to flush()/communicate(). 75 | 76 | format_string : bytes, optional 77 | A printf-style formatting string to specify the conversion of 78 | the elements of the matrix into strings. 79 | 80 | sep : bytes, optional 81 | The column separator. 82 | 83 | newline : bytes, optional 84 | The newline separator. 85 | 86 | header : bytes, optional 87 | A header to prepend to the output file; it is separated from the main table 88 | by a `newline`. 89 | 90 | row_headers : a list of bytes, optional 91 | Row headers to prepend to the output file, one for each row in `mat`. 92 | 93 | max_element_len : int 94 | The maximal length of the string representation of a matrix element, 95 | produced by sprintf(`format_string`). Used to preallocate memory. 96 | """ 97 | 98 | 99 | cdef int N = len(mat) 100 | cdef int M = len(mat[0]) 101 | 102 | if issubclass(type(out), str) or issubclass(type(out), bytearray): 103 | if out.endswith('.gz'): 104 | writer = gzipWriter(out) 105 | out_pipe = writer.stdin 106 | close_out_func = writer.communicate 107 | else: 108 | writer = open(out, 'wb') 109 | out_pipe = writer 110 | close_out_func = writer.flush 111 | elif hasattr(out, 'write'): 112 | out_pipe = out 113 | close_out_func = empty_func 114 | else: 115 | raise Exception('`out` must be either a file path or a file handle/stream') 116 | 117 | cdef np.ndarray[np.double_t, ndim=2] mat_ndarray = np.array(mat, dtype=np.double, order="C") 118 | 119 | cdef char* newline_cstr = newline 120 | cdef char* sep_cstr = sep 121 | cdef char* next_row_header 122 | cdef char* s_start 123 | cdef char* s_cur 124 | 125 | cdef int max_header_len = 0 126 | if row_headers is not None: 127 | max_header_len = max([len(row_header) for row_header in row_headers]) 128 | 129 | s_start = malloc((max_element_len * M + max_header_len) * sizeof(char)) 130 | 131 | cdef double element 132 | cdef char* curStringTemplate 133 | template = b''.join([format_string, sep]) 134 | curStringTemplate = template 135 | 136 | if header is not None: 137 | out_pipe.write(header) 138 | out_pipe.write(newline_cstr) 139 | 140 | cdef int i,j 141 | for i in xrange(N): 142 | s_cur = s_start 143 | if row_headers is not None: 144 | 145 | next_row_header = row_headers[i] 146 | s_cur = strcpy(s_cur, next_row_header) 147 | s_cur += sizeof(char) * strlen(next_row_header) 148 | 149 | s_cur = strcpy(s_cur, sep_cstr) 150 | s_cur += sizeof(char) * strlen(sep_cstr) 151 | 152 | for j in xrange(M): 153 | element = mat_ndarray[i,j] 154 | s_cur = s_cur + sprintf(s_cur, curStringTemplate, element) 155 | 156 | s_cur = strcpy(s_cur, newline_cstr) 157 | s_cur += sizeof(char) * strlen(newline_cstr) 158 | 159 | out_pipe.write(s_start) 160 | free(s_start) 161 | 162 | close_out_func() 163 | -------------------------------------------------------------------------------- /cooltools/sandbox/pairs_scaling_functions.py: -------------------------------------------------------------------------------- 1 | ################################### 2 | # 3 | # several functions for calculating scalings using pairs 4 | # they used to reside in the cooltools.expected module 5 | # 6 |
#################################### 7 | 8 | import numpy as np 9 | from ..lib import numutils 10 | 11 | def _contact_areas(distbins, scaffold_length): 12 | distbins = distbins.astype(float) 13 | scaffold_length = float(scaffold_length) 14 | outer_areas = np.maximum(scaffold_length - distbins[:-1], 0) ** 2 15 | inner_areas = np.maximum(scaffold_length - distbins[1:], 0) ** 2 16 | return 0.5 * (outer_areas - inner_areas) 17 | 18 | 19 | def contact_areas(distbins, region1, region2): 20 | if region1 == region2: 21 | start, end = region1 22 | areas = _contact_areas(distbins, end - start) 23 | else: 24 | start1, end1 = region1 25 | start2, end2 = region2 26 | if start2 <= start1: 27 | start1, start2 = start2, start1 28 | end1, end2 = end2, end1 29 | areas = ( 30 | _contact_areas(distbins, end2 - start1) 31 | - _contact_areas(distbins, start2 - start1) 32 | - _contact_areas(distbins, end2 - end1) 33 | ) 34 | if end1 < start2: 35 | areas += _contact_areas(distbins, start2 - end1) 36 | 37 | return areas 38 | 39 | 40 | def compute_scaling(df, region1, region2=None, dmin=int(1e1), dmax=int(1e7), n_bins=50): 41 | 42 | import dask.array as da 43 | 44 | if region2 is None: 45 | region2 = region1 46 | 47 | distbins = numutils.logbins(dmin, dmax, N=n_bins) 48 | areas = contact_areas(distbins, region1, region2) 49 | 50 | df = df[ 51 | (df["pos1"] >= region1[0]) 52 | & (df["pos1"] < region1[1]) 53 | & (df["pos2"] >= region2[0]) 54 | & (df["pos2"] < region2[1]) 55 | ] 56 | dists = (df["pos2"] - df["pos1"]).values 57 | 58 | if isinstance(dists, da.Array): 59 | obs, _ = da.histogram(dists[(dists >= dmin) & (dists < dmax)], bins=distbins) 60 | else: 61 | obs, _ = np.histogram(dists[(dists >= dmin) & (dists < dmax)], bins=distbins) 62 | 63 | return distbins, obs, areas 64 | -------------------------------------------------------------------------------- /datasets/external_test_files.tsv: -------------------------------------------------------------------------------- 1 | # key filename checksum link comment 2 | HFF_MicroC test.mcool e4a0fc25c8dc3d38e9065fd74c565dd1 https://osf.io/3h9js/download Micro-C data from HFF human cells for two chromosomes (hg38) in a multi-resolution mcool format. Krietenstein et al. 2021 data. 3 | hESC_MicroC test_hESC.mcool ac0e636605505fb76fac25fa08784d5b https://osf.io/3kdyj/download Micro-C data from human ES cells for two chromosomes (hg38) in a multi-resolution mcool format. Krietenstein et al. 2021 data. 4 | HFF_CTCF_fc test_CTCF.bigWig 62429de974b5b4a379578cc85adc65a3 https://osf.io/w92u3/download ChIP-Seq fold change over input with CTCF antibodies in HFF cells (hg38). Downloaded from ENCODE ENCSR000DWQ, ENCFF761RHS.bigWig file 5 | HFF_CTCF_binding test_CTCF.bed.gz 61ecfdfa821571a8e0ea362e8fd48f63 https://osf.io/c9pwe/download Binding sites called from CTCF ChIP-Seq peaks for HFF cells (hg38). Peaks are from ENCODE ENCSR000DWQ, ENCFF498QCT.bed file. The motifs are called with gimmemotifs (options --nreport 1 --cutoff 0), with JASPAR pwm MA0139. 6 | mESC_dRAD21_IAA dRAD21_IAA.mm10.mapq_30.mcool 40087388c443aae19110fdf099738c06 https://osf.io/5xaut/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dRad21 IAA treatment, degraded Rad21. 7 | mESC_dRAD21_UT dRAD21_UT.mm10.mapq_30.mcool 2ff91a7def1a9dd3e1f9b62d89d579a7 https://osf.io/u75pd/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dRad21 untreated (UT), control for Rad21 degradation. 
8 | mESC_dCTCF_IAA dCTCF_IAA.mm10.mapq_30.mcool 33ec02cafa9f1f31d2cbba227cf38cc6 https://osf.io/xwy9j/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dCTCF IAA treatment, degraded CTCF. 9 | mESC_dWAPL_IAA dWAPL_IAA.mm10.mapq_30.mcool 11088c9a6d10826a23a69807fc296005 https://osf.io/fk74t/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dWapl IAA treatment, degraded Wapl. 10 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @cd ..; python setup.py build_ext --inplace; cd docs 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | CLI Reference 2 | ============= 3 | 4 | .. click:: cooltools.cli:cli 5 | :prog: cooltools 6 | :show-nested: 7 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | from unittest.mock import Mock 18 | MOCK_MODULES = [ 19 | 'cooltools.io.fastsavetxt', 20 | 'cooltools.lib._numutils', 21 | 'cooler', 22 | 'cooler.core', 23 | 'cooler.tools', 24 | 'cooler.parallel', 25 | 'cooler.util', 26 | 'cython', 27 | 'dask', 28 | 'h5py', 29 | 'matplotlib', 30 | 'matplotlib.cm', 31 | 'matplotlib.pyplot', 32 | 'matplotlib.colors', 33 | 'numba', 34 | # 'numpy', 35 | 'pandas', 36 | 'scipy', 37 | 'scipy.interpolate', 38 | 'scipy.linalg', 39 | 'scipy.sparse', 40 | 'scipy.sparse.linalg', 41 | 'scipy.ndimage', 42 | 'scipy.ndimage.filters', 43 | 'scipy.ndimage.interpolation', 44 | 'scipy.signal', 45 | 'scipy.stats', 46 | 'sklearn', 47 | 'sklearn.cluster', 48 | 'skimage', 49 | 'skimage.filters', 50 | ] 51 | for mod_name in MOCK_MODULES: 52 | sys.modules[mod_name] = Mock() 53 | 54 | 55 | # -- Project information ----------------------------------------------------- 56 | 57 | project = 'cooltools' 58 | copyright = '2020, cooltoolers' 59 | author = 'cooltoolers' 60 | 61 | 62 | # -- General configuration --------------------------------------------------- 63 | 64 | # Apparently readthedocs looks for contents.rst by default if this isn't set. 65 | master_doc = 'index' 66 | 67 | # Add any paths that contain templates here, relative to this directory. 68 | templates_path = ['_templates'] 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path. 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 74 | 75 | # Add any Sphinx extension module names here, as strings. They can be 76 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 77 | # ones. 78 | extensions = [ 79 | 'sphinx.ext.todo', 80 | 'sphinx.ext.coverage', 81 | 'sphinx.ext.autodoc', 82 | 'sphinx.ext.viewcode', 83 | 'sphinx.ext.autosummary', 84 | 'sphinx.ext.napoleon', 85 | 'sphinx.ext.mathjax', 86 | 'sphinx_click.ext', 87 | 'recommonmark', 88 | 'nbsphinx', 89 | 'sphinx_rtd_theme' 90 | ] 91 | 92 | # Extension configuration 93 | napoleon_google_docstring = False 94 | # napoleon_use_param = False 95 | # napoleon_use_ivar = True 96 | napoleon_use_rtype = False 97 | 98 | # Notebook prolog and epilog 99 | nbsphinx_prolog = """""" 100 | nbsphinx_epilog = r""" 101 | ---- 102 | {% set docname = env.doc2path(env.docname, base='docs') %} 103 | 104 | This page was generated with nbsphinx_ from `{{ docname }}`__ 105 | 106 | __ https://github.com/open2c/cooltools/blob/master{{ env.config.release }}/{{ docname }} 107 | 108 | .. _nbsphinx: https://nbsphinx.readthedocs.io/ 109 | 110 | """ 111 | 112 | # -- Options for HTML output ------------------------------------------------- 113 | 114 | # The theme to use for HTML and HTML Help pages. See the documentation for 115 | # a list of builtin themes. 116 | # 117 | html_theme = 'sphinx_rtd_theme' 118 | 119 | # Add any paths that contain custom static files (such as style sheets) here, 120 | # relative to this directory. They are copied after the builtin static files, 121 | # so a file named "default.css" will overwrite the builtin "default.css". 
122 | html_static_path = ['_static'] 123 | 124 | 125 | # -- Style overrides ---------------------------------------------------------- 126 | # Place CSS in _static directory 127 | # def setup(app): 128 | # app.add_stylesheet('theme_overrides.css') 129 | 130 | 131 | # Pull jupyter notebooks from the open2c_examples repo 132 | def setup(app): 133 | from subprocess import run 134 | 135 | if os.path.isdir('notebooks'): 136 | cmd = 'cd notebooks && git pull' 137 | else: 138 | cmd = 'git clone https://github.com/open2c/open2c_examples.git notebooks' 139 | 140 | print("Updating Open2C examples...") 141 | run(cmd, check=True, shell=True) 142 | -------------------------------------------------------------------------------- /docs/cooltools.lib.rst: -------------------------------------------------------------------------------- 1 | cooltools.lib package 2 | ===================== 3 | 4 | common 5 | -------- 6 | 7 | .. automodule:: cooltools.lib.common 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | numutils 13 | -------- 14 | 15 | .. automodule:: cooltools.lib.numutils 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | peaks 21 | ----- 22 | 23 | .. automodule:: cooltools.lib.peaks 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | plotting 29 | -------- 30 | 31 | .. automodule:: cooltools.lib.plotting 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | schemas 37 | -------- 38 | 39 | .. automodule:: cooltools.lib.schemas 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: -------------------------------------------------------------------------------- /docs/cooltools.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | cooltools.lib 10 | 11 | cooltools.api.coverage module 12 | ----------------------------- 13 | 14 | .. automodule:: cooltools.api.coverage 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | cooltools.api.directionality module 20 | ----------------------------------- 21 | 22 | .. automodule:: cooltools.api.directionality 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | cooltools.api.dotfinder module 28 | ------------------------------ 29 | 30 | .. automodule:: cooltools.api.dotfinder 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | cooltools.api.eigdecomp module 36 | ------------------------------ 37 | 38 | .. automodule:: cooltools.api.eigdecomp 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | 43 | cooltools.api.expected module 44 | ----------------------------- 45 | 46 | .. automodule:: cooltools.api.expected 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | 51 | cooltools.api.insulation module 52 | ------------------------------- 53 | 54 | .. automodule:: cooltools.api.insulation 55 | :members: 56 | :undoc-members: 57 | :show-inheritance: 58 | 59 | cooltools.api.saddle module 60 | --------------------------- 61 | 62 | .. automodule:: cooltools.api.saddle 63 | :members: 64 | :undoc-members: 65 | :show-inheritance: 66 | 67 | cooltools.api.sample module 68 | --------------------------- 69 | 70 | .. automodule:: cooltools.api.sample 71 | :members: 72 | :undoc-members: 73 | :show-inheritance: 74 | 75 | cooltools.api.snipping module 76 | ----------------------------- 77 | 78 | .. 
automodule:: cooltools.api.snipping 79 | :members: 80 | :undoc-members: 81 | :show-inheritance: 82 | 83 | cooltools.api.virtual4c module 84 | ----------------------------- 85 | 86 | .. automodule:: cooltools.api.virtual4c 87 | :members: 88 | :undoc-members: 89 | :show-inheritance: -------------------------------------------------------------------------------- /docs/figs/cooltools-logo-futura.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/docs/figs/cooltools-logo-futura.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. cooltools documentation master file, created by 2 | sphinx-quickstart on Wed Jun 12 16:42:43 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. toctree:: 7 | :caption: Overview 8 | :hidden: 9 | :maxdepth: 2 10 | 11 | self 12 | 13 | Getting started 14 | *************** 15 | 16 | The tools for your *.cool*\ s 17 | 18 | Chromosome conformation capture technologies reveal the incredible complexity of genome folding. A growing number of labs and multiple consortia, including the 4D Nucleome, the International Nucleome Consortium, and ENCODE, are generating higher-resolution datasets to probe genome architecture across cell states, types, and organisms. Larger datasets increase the challenges at each step of computational analysis, from storage, to memory, to researchers’ time. The recently-introduced `cooler `_ format readily handles storage of high-resolution datasets via a sparse data model. 19 | 20 | **cooltools** leverages this format to enable flexible and reproducible analysis of high-resolution data. **cooltools** provides a suite of computational tools with a paired python API and command line access, which facilitates workflows either on high-performance computing clusters or via custom analysis notebooks. As part of the `Open2C ecosystem `_, **cooltools** also provides detailed introductions to key concepts in Hi-C-data analysis with interactive notebook documentation. 21 | 22 | If you use **cooltools** in your work, please cite **cooltools**: https://doi.org/10.1101/2022.10.31.514564. 23 | 24 | Installation 25 | ============ 26 | 27 | Requirements 28 | ------------ 29 | 30 | - Python 3.7+ 31 | - Scientific Python packages 32 | 33 | Install using pip 34 | ----------------- 35 | 36 | Compile and install `cooltools` and its Python dependencies from 37 | PyPI using pip: 38 | 39 | .. code-block:: bash 40 | 41 | $ pip install cooltools 42 | 43 | or install the latest version directly from github: 44 | 45 | .. code-block:: bash 46 | 47 | $ pip install https://github.com/open2c/cooltools/archive/refs/heads/master.zip 48 | 49 | 50 | Install the development version 51 | ------------------------------- 52 | 53 | Finally, you can install the latest development version of `cooltools` from 54 | github. First, make a local clone of the github repository: 55 | 56 | .. code-block:: bash 57 | 58 | $ git clone https://github.com/open2c/cooltools 59 | 60 | Then, you can compile and install `cooltools` in 61 | `development mode `_, 62 | which installs the package without moving it to a system folder and thus allows 63 | immediate live-testing any changes in the python code. 64 | 65 | .. 
code-block:: bash 66 | 67 | $ cd cooltools 68 | $ pip install -e ./ 69 | 70 | 71 | .. toctree:: 72 | :maxdepth: 2 73 | :caption: Tutorials 74 | :titlesonly: 75 | 76 | ./notebooks/viz.ipynb 77 | ./notebooks/contacts_vs_distance.ipynb 78 | ./notebooks/compartments_and_saddles.ipynb 79 | ./notebooks/insulation_and_boundaries.ipynb 80 | ./notebooks/dots.ipynb 81 | ./notebooks/pileup_CTCF.ipynb 82 | ./notebooks/command_line_interface.ipynb 83 | 84 | Note that these notebooks currently focus on mammalian interphase Hi-C analysis, but are readily extendible to other organisms and cellular contexts. To clone and work interactively with these notebooks, visit: https://github.com/open2c/open2c_examples. 85 | 86 | 87 | .. toctree:: 88 | :maxdepth: 1 89 | :caption: Reference 90 | 91 | cli 92 | cooltools 93 | releases 94 | 95 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/notebooks_old/data/encode_motifs.hg38.ctcf_known1.liftover.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/docs/notebooks_old/data/encode_motifs.hg38.ctcf_known1.liftover.bed.gz -------------------------------------------------------------------------------- /docs/releases.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | bioframe 2 | click>=7.0 3 | ipython 4 | nbsphinx 5 | multiprocess 6 | numpy 7 | pygments<3,>=2.4.1 8 | recommonmark 9 | Sphinx 10 | sphinx-rtd-theme 11 | sphinx-click 12 | docutils<=0.16 13 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "cython", "numpy"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = 3 | --cov cooltools 4 | --cov-config .coveragerc 5 | --cov-report term-missing 6 | --cov-report html 7 | --cov-report xml 8 | filterwarnings = 9 | 
ignore::PendingDeprecationWarning 10 | testpaths = 11 | tests 12 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest 3 | pytest-flake8 4 | pytest-cov 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bioframe>=0.4.1 2 | click>=7 3 | cooler>=0.9.1 4 | cython 5 | joblib 6 | matplotlib 7 | multiprocess 8 | numba 9 | numpy 10 | pandas>=1.5.1 11 | scikit-learn>=1.1.2 12 | scipy 13 | scikit-image 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import io 4 | import os 5 | import re 6 | 7 | from setuptools import setup, find_packages 8 | from setuptools.extension import Extension 9 | from Cython.Build import cythonize 10 | import numpy as np 11 | 12 | 13 | classifiers = """\ 14 | Development Status :: 4 - Beta 15 | Programming Language :: Python 16 | Programming Language :: Python :: 3 17 | Programming Language :: Python :: 3.8 18 | Programming Language :: Python :: 3.9 19 | Programming Language :: Python :: 3.10 20 | """ 21 | 22 | 23 | def _read(*parts, **kwargs): 24 | filepath = os.path.join(os.path.dirname(__file__), *parts) 25 | encoding = kwargs.pop("encoding", "utf-8") 26 | with io.open(filepath, encoding=encoding) as fh: 27 | text = fh.read() 28 | return text 29 | 30 | 31 | def get_version(): 32 | version = re.search( 33 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 34 | _read("cooltools", "__init__.py"), 35 | re.MULTILINE, 36 | ).group(1) 37 | return version 38 | 39 | 40 | def get_long_description(): 41 | return _read("README.md") 42 | 43 | 44 | def get_requirements(path): 45 | content = _read(path) 46 | return [ 47 | req 48 | for req in content.split("\n") 49 | if req != "" and not (req.startswith("#") or req.startswith("-")) 50 | ] 51 | 52 | 53 | setup_requires = [ 54 | "cython", 55 | "numpy", 56 | ] 57 | 58 | 59 | install_requires = get_requirements("requirements.txt") 60 | 61 | 62 | extensions = [ 63 | Extension( 64 | "cooltools.lib._numutils", 65 | ["cooltools/lib/_numutils.pyx"], 66 | include_dirs=[np.get_include()], 67 | ), 68 | ] 69 | 70 | 71 | packages = find_packages() 72 | 73 | 74 | setup( 75 | name="cooltools", 76 | author="Open2C", 77 | author_email="open.chromosome.collective@gmail.com", 78 | version=get_version(), 79 | license="MIT", 80 | description="Analysis tools for genomic interaction data stored in .cool format", 81 | long_description=get_long_description(), 82 | long_description_content_type="text/markdown", 83 | keywords=["genomics", "bioinformatics", "Hi-C", "analysis", "cooler"], 84 | url="https://github.com/open2c/cooltools", 85 | zip_safe=False, 86 | classifiers=[s.strip() for s in classifiers.split("\n") if s], 87 | python_requires=">=3.7.1", # same as pandas 88 | packages=packages, 89 | ext_modules=cythonize(extensions), 90 | include_dirs=[np.get_include()], 91 | setup_requires=setup_requires, 92 | install_requires=install_requires, 93 | entry_points={ 94 | "console_scripts": [ 95 | "cooltools = cooltools.cli:cli", 96 | ] 97 | }, 98 | ) 99 | -------------------------------------------------------------------------------- /tests/data/CN.mm9.10000kb.cool: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/CN.mm9.10000kb.cool -------------------------------------------------------------------------------- /tests/data/CN.mm9.1000kb.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/CN.mm9.1000kb.cool -------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_expected.chromnamed.tsv: -------------------------------------------------------------------------------- 1 | region1 region2 dist n_valid count.sum balanced.sum balanced.avg 2 | chr1 chr1 0 50 3 | chr1 chr1 1 49 4 | chr1 chr1 2 48 448255.0 2.3205076553434987 0.04834390948632289 5 | chr1 chr1 3 47 271497.0 1.38339695992966 0.02943397787084383 6 | chr1 chr1 4 46 179491.0 0.900655795691491 0.01957947381938024 7 | chr1 chr1 5 45 135426.0 0.6826130105698165 0.015169178012662588 8 | chr1 chr1 6 44 96841.0 0.48167647260294866 0.010947192559157925 9 | chr1 chr1 7 43 74458.0 0.36747755422094075 0.008545989633045134 10 | chr1 chr1 8 42 56431.0 0.2767897183400133 0.0065902313890479364 11 | chr1 chr1 9 41 46579.0 0.23020444753273792 0.005614742622749705 12 | chr1 chr1 10 40 42800.0 0.21407619204942857 0.005351904801235714 13 | chr1 chr1 11 39 38893.0 0.1931769021342914 0.0049532539008792665 14 | chr1 chr1 12 38 35915.0 0.1760485882134026 0.004632857584563227 15 | chr1 chr1 13 37 31507.0 0.15432815796541483 0.0041710312963625625 16 | chr1 chr1 14 36 28275.0 0.13916825128679033 0.003865784757966398 17 | chr1 chr1 15 35 26582.0 0.13214461875460215 0.0037755605358457756 18 | chr1 chr1 16 34 24080.0 0.1200420079045525 0.0035306472913103674 19 | chr1 chr1 17 33 22554.0 0.1123809167677425 0.0034054823262952274 20 | chr1 chr1 18 32 21069.0 0.10519693902501005 0.003287404344531564 21 | chr1 chr1 19 31 19565.0 0.09730388315158268 0.003138834940373635 22 | chr1 chr1 20 30 18830.0 0.09344118037915836 0.003114706012638612 23 | chr1 chr1 21 29 18180.0 0.09181365603513099 0.003165988139142448 24 | chr1 chr1 22 28 16817.0 0.0857312761411997 0.003061831290757132 25 | chr1 chr1 23 27 15637.0 0.08088906104487427 0.0029958911498101583 26 | chr1 chr1 24 26 13554.0 0.0696931607808895 0.0026805061838803654 27 | chr1 chr1 25 25 12151.0 0.062133968853916574 0.002485358754156663 28 | chr1 chr1 26 24 10641.0 0.053908741063492124 0.002246197544312172 29 | chr1 chr1 27 23 9371.0 0.04780835937733471 0.002078624320753683 30 | chr1 chr1 28 22 8684.0 0.04565538936132342 0.0020752449709692464 31 | chr1 chr1 29 21 7883.0 0.04194264489363847 0.0019972688044589747 32 | chr1 chr1 30 20 7602.0 0.04117335917285604 0.002058667958642802 33 | chr1 chr1 31 19 6783.0 0.03642786791651601 0.0019172562061324217 34 | chr1 chr1 32 18 6220.0 0.033609930607101324 0.0018672183670611847 35 | chr1 chr1 33 17 5752.0 0.03126540105125592 0.0018391412383091717 36 | chr1 chr1 34 16 5236.0 0.02870993254323146 0.0017943707839519663 37 | chr1 chr1 35 15 4806.0 0.026732726358511393 0.0017821817572340928 38 | chr1 chr1 36 14 4562.0 0.025516336044875902 0.0018225954317768502 39 | chr1 chr1 37 13 4484.0 0.025173064987642168 0.001936389614434013 40 | chr1 chr1 38 12 4322.0 0.024324300745100825 0.0020270250620917354 41 | chr1 chr1 39 11 3797.0 0.02095540632794532 0.00190503693890412 42 | chr1 chr1 40 10 3403.0 0.018630663941423948 0.0018630663941423947 43 
| chr1 chr1 41 9 3044.0 0.016810995031025552 0.001867888336780617 44 | chr1 chr1 42 8 2716.0 0.015316241229781234 0.0019145301537226542 45 | chr1 chr1 43 7 2461.0 0.014124488058201323 0.002017784008314475 46 | chr1 chr1 44 6 2060.0 0.011782977088540664 0.0019638295147567774 47 | chr1 chr1 45 5 1629.0 0.009356770724295723 0.0018713541448591446 48 | chr1 chr1 46 4 1325.0 0.007777107004193509 0.0019442767510483773 49 | chr1 chr1 47 3 950.0 0.005574745304582236 0.0018582484348607453 50 | chr1 chr1 48 2 629.0 0.003669007156579109 0.0018345035782895544 51 | chr1 chr1 49 1 326.0 0.0020415942196967394 0.0020415942196967394 52 | chr2 chr2 0 49 53 | chr2 chr2 1 48 54 | chr2 chr2 2 47 450107.0 2.1180050802546933 0.04506393787775943 55 | chr2 chr2 3 46 238644.0 1.1182026520831783 0.02430875330615605 56 | chr2 chr2 4 45 151877.0 0.7065426657897472 0.01570094812866105 57 | chr2 chr2 5 44 105862.0 0.4889639900117408 0.01111281795481229 58 | chr2 chr2 6 43 84565.0 0.3886687958491317 0.00903880920579376 59 | chr2 chr2 7 42 67656.0 0.305587801420597 0.007275900033823738 60 | chr2 chr2 8 41 56605.0 0.2536802573536893 0.006187323350089984 61 | chr2 chr2 9 40 49125.0 0.21940452543596367 0.005485113135899092 62 | chr2 chr2 10 39 43256.0 0.19302073776471373 0.004949249686274711 63 | chr2 chr2 11 38 38908.0 0.17213966992023477 0.0045299913136903885 64 | chr2 chr2 12 37 33613.0 0.1494114335367291 0.00403814685234403 65 | chr2 chr2 13 36 29008.0 0.1286862020151156 0.0035746167226421 66 | chr2 chr2 14 35 28208.0 0.1257340707416353 0.0035924020211895802 67 | chr2 chr2 15 34 26130.0 0.11682046178278417 0.0034358959347877698 68 | chr2 chr2 16 33 24355.0 0.10848220502658447 0.0032873395462601354 69 | chr2 chr2 17 32 21902.0 0.09720413992092795 0.0030376293725289986 70 | chr2 chr2 18 31 19754.0 0.08921457365055102 0.00287788947259842 71 | chr2 chr2 19 30 17506.0 0.0798108423392565 0.00266036141130855 72 | chr2 chr2 20 29 16951.0 0.07831020324831016 0.002700351836148626 73 | chr2 chr2 21 28 16124.0 0.07470713314986098 0.0026681118982093206 74 | chr2 chr2 22 27 16237.0 0.07516147832181286 0.002783758456363439 75 | chr2 chr2 23 26 15583.0 0.07144738725071081 0.0027479764327196466 76 | chr2 chr2 24 25 14864.0 0.06801519019393452 0.0027206076077573808 77 | chr2 chr2 25 24 14174.0 0.06516873511627985 0.002715363963178327 78 | chr2 chr2 26 23 14169.0 0.06554949528961256 0.002849978056070111 79 | chr2 chr2 27 22 13561.0 0.06221042530718225 0.0028277466048719207 80 | chr2 chr2 28 21 12073.0 0.055813578961226296 0.0026577894743441094 81 | chr2 chr2 29 20 11032.0 0.05118868034313225 0.0025594340171566127 82 | chr2 chr2 30 19 10723.0 0.050269590871060296 0.0026457679405821207 83 | chr2 chr2 31 18 10646.0 0.04998712073522266 0.0027770622630679254 84 | chr2 chr2 32 17 10320.0 0.04943531274185869 0.002907959573050511 85 | chr2 chr2 33 16 9664.0 0.04604888783607321 0.0028780554897545755 86 | chr2 chr2 34 15 9227.0 0.04425307710295975 0.0029502051401973164 87 | chr2 chr2 35 14 9111.0 0.04421548066666439 0.0031582486190474567 88 | chr2 chr2 36 13 9923.0 0.04945120961837048 0.003803939201413114 89 | chr2 chr2 37 12 9219.0 0.04674824569212995 0.0038956871410108294 90 | chr2 chr2 38 11 8027.0 0.04077733321358686 0.0037070302921442602 91 | chr2 chr2 39 10 6756.0 0.03230495094148628 0.0032304950941486276 92 | chr2 chr2 40 9 5996.0 0.027699878189309274 0.003077764243256586 93 | chr2 chr2 41 8 5280.0 0.023833680900535406 0.0029792101125669258 94 | chr2 chr2 42 7 4560.0 0.019837282406156377 0.002833897486593768 95 | chr2 chr2 43 6 3911.0 
0.01627847374007839 0.0027130789566797314 96 | chr2 chr2 44 5 3155.0 0.012966661266117605 0.002593332253223521 97 | chr2 chr2 45 4 2335.0 0.008792759755829107 0.0021981899389572766 98 | chr2 chr2 46 3 1518.0 0.005519380548429014 0.0018397935161430046 99 | chr2 chr2 47 2 1142.0 0.003471630969823881 0.0017358154849119406 100 | chr2 chr2 48 1 756.0 0.0019403909506671992 0.0019403909506671992 101 | chr2 chr2 49 0 361.0 0.0 102 | -------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_expected.tsv: -------------------------------------------------------------------------------- 1 | region1 region2 dist n_valid count.sum balanced.sum balanced.avg 2 | foo foo 0 50 3 | foo foo 1 49 4 | foo foo 2 48 448255.0 2.3205076553434987 0.04834390948632289 5 | foo foo 3 47 271497.0 1.38339695992966 0.02943397787084383 6 | foo foo 4 46 179491.0 0.900655795691491 0.01957947381938024 7 | foo foo 5 45 135426.0 0.6826130105698165 0.015169178012662588 8 | foo foo 6 44 96841.0 0.48167647260294866 0.010947192559157925 9 | foo foo 7 43 74458.0 0.36747755422094075 0.008545989633045134 10 | foo foo 8 42 56431.0 0.2767897183400133 0.0065902313890479364 11 | foo foo 9 41 46579.0 0.23020444753273792 0.005614742622749705 12 | foo foo 10 40 42800.0 0.21407619204942857 0.005351904801235714 13 | foo foo 11 39 38893.0 0.1931769021342914 0.0049532539008792665 14 | foo foo 12 38 35915.0 0.1760485882134026 0.004632857584563227 15 | foo foo 13 37 31507.0 0.15432815796541483 0.0041710312963625625 16 | foo foo 14 36 28275.0 0.13916825128679033 0.003865784757966398 17 | foo foo 15 35 26582.0 0.13214461875460215 0.0037755605358457756 18 | foo foo 16 34 24080.0 0.1200420079045525 0.0035306472913103674 19 | foo foo 17 33 22554.0 0.1123809167677425 0.0034054823262952274 20 | foo foo 18 32 21069.0 0.10519693902501005 0.003287404344531564 21 | foo foo 19 31 19565.0 0.09730388315158268 0.003138834940373635 22 | foo foo 20 30 18830.0 0.09344118037915836 0.003114706012638612 23 | foo foo 21 29 18180.0 0.09181365603513099 0.003165988139142448 24 | foo foo 22 28 16817.0 0.0857312761411997 0.003061831290757132 25 | foo foo 23 27 15637.0 0.08088906104487427 0.0029958911498101583 26 | foo foo 24 26 13554.0 0.0696931607808895 0.0026805061838803654 27 | foo foo 25 25 12151.0 0.062133968853916574 0.002485358754156663 28 | foo foo 26 24 10641.0 0.053908741063492124 0.002246197544312172 29 | foo foo 27 23 9371.0 0.04780835937733471 0.002078624320753683 30 | foo foo 28 22 8684.0 0.04565538936132342 0.0020752449709692464 31 | foo foo 29 21 7883.0 0.04194264489363847 0.0019972688044589747 32 | foo foo 30 20 7602.0 0.04117335917285604 0.002058667958642802 33 | foo foo 31 19 6783.0 0.03642786791651601 0.0019172562061324217 34 | foo foo 32 18 6220.0 0.033609930607101324 0.0018672183670611847 35 | foo foo 33 17 5752.0 0.03126540105125592 0.0018391412383091717 36 | foo foo 34 16 5236.0 0.02870993254323146 0.0017943707839519663 37 | foo foo 35 15 4806.0 0.026732726358511393 0.0017821817572340928 38 | foo foo 36 14 4562.0 0.025516336044875902 0.0018225954317768502 39 | foo foo 37 13 4484.0 0.025173064987642168 0.001936389614434013 40 | foo foo 38 12 4322.0 0.024324300745100825 0.0020270250620917354 41 | foo foo 39 11 3797.0 0.02095540632794532 0.00190503693890412 42 | foo foo 40 10 3403.0 0.018630663941423948 0.0018630663941423947 43 | foo foo 41 9 3044.0 0.016810995031025552 0.001867888336780617 44 | foo foo 42 8 2716.0 0.015316241229781234 0.0019145301537226542 45 | foo foo 43 7 2461.0 0.014124488058201323 
0.002017784008314475 46 | foo foo 44 6 2060.0 0.011782977088540664 0.0019638295147567774 47 | foo foo 45 5 1629.0 0.009356770724295723 0.0018713541448591446 48 | foo foo 46 4 1325.0 0.007777107004193509 0.0019442767510483773 49 | foo foo 47 3 950.0 0.005574745304582236 0.0018582484348607453 50 | foo foo 48 2 629.0 0.003669007156579109 0.0018345035782895544 51 | foo foo 49 1 326.0 0.0020415942196967394 0.0020415942196967394 52 | bar bar 0 49 53 | bar bar 1 48 54 | bar bar 2 47 450107.0 2.1180050802546933 0.04506393787775943 55 | bar bar 3 46 238644.0 1.1182026520831783 0.02430875330615605 56 | bar bar 4 45 151877.0 0.7065426657897472 0.01570094812866105 57 | bar bar 5 44 105862.0 0.4889639900117408 0.01111281795481229 58 | bar bar 6 43 84565.0 0.3886687958491317 0.00903880920579376 59 | bar bar 7 42 67656.0 0.305587801420597 0.007275900033823738 60 | bar bar 8 41 56605.0 0.2536802573536893 0.006187323350089984 61 | bar bar 9 40 49125.0 0.21940452543596367 0.005485113135899092 62 | bar bar 10 39 43256.0 0.19302073776471373 0.004949249686274711 63 | bar bar 11 38 38908.0 0.17213966992023477 0.0045299913136903885 64 | bar bar 12 37 33613.0 0.1494114335367291 0.00403814685234403 65 | bar bar 13 36 29008.0 0.1286862020151156 0.0035746167226421 66 | bar bar 14 35 28208.0 0.1257340707416353 0.0035924020211895802 67 | bar bar 15 34 26130.0 0.11682046178278417 0.0034358959347877698 68 | bar bar 16 33 24355.0 0.10848220502658447 0.0032873395462601354 69 | bar bar 17 32 21902.0 0.09720413992092795 0.0030376293725289986 70 | bar bar 18 31 19754.0 0.08921457365055102 0.00287788947259842 71 | bar bar 19 30 17506.0 0.0798108423392565 0.00266036141130855 72 | bar bar 20 29 16951.0 0.07831020324831016 0.002700351836148626 73 | bar bar 21 28 16124.0 0.07470713314986098 0.0026681118982093206 74 | bar bar 22 27 16237.0 0.07516147832181286 0.002783758456363439 75 | bar bar 23 26 15583.0 0.07144738725071081 0.0027479764327196466 76 | bar bar 24 25 14864.0 0.06801519019393452 0.0027206076077573808 77 | bar bar 25 24 14174.0 0.06516873511627985 0.002715363963178327 78 | bar bar 26 23 14169.0 0.06554949528961256 0.002849978056070111 79 | bar bar 27 22 13561.0 0.06221042530718225 0.0028277466048719207 80 | bar bar 28 21 12073.0 0.055813578961226296 0.0026577894743441094 81 | bar bar 29 20 11032.0 0.05118868034313225 0.0025594340171566127 82 | bar bar 30 19 10723.0 0.050269590871060296 0.0026457679405821207 83 | bar bar 31 18 10646.0 0.04998712073522266 0.0027770622630679254 84 | bar bar 32 17 10320.0 0.04943531274185869 0.002907959573050511 85 | bar bar 33 16 9664.0 0.04604888783607321 0.0028780554897545755 86 | bar bar 34 15 9227.0 0.04425307710295975 0.0029502051401973164 87 | bar bar 35 14 9111.0 0.04421548066666439 0.0031582486190474567 88 | bar bar 36 13 9923.0 0.04945120961837048 0.003803939201413114 89 | bar bar 37 12 9219.0 0.04674824569212995 0.0038956871410108294 90 | bar bar 38 11 8027.0 0.04077733321358686 0.0037070302921442602 91 | bar bar 39 10 6756.0 0.03230495094148628 0.0032304950941486276 92 | bar bar 40 9 5996.0 0.027699878189309274 0.003077764243256586 93 | bar bar 41 8 5280.0 0.023833680900535406 0.0029792101125669258 94 | bar bar 42 7 4560.0 0.019837282406156377 0.002833897486593768 95 | bar bar 43 6 3911.0 0.01627847374007839 0.0027130789566797314 96 | bar bar 44 5 3155.0 0.012966661266117605 0.002593332253223521 97 | bar bar 45 4 2335.0 0.008792759755829107 0.0021981899389572766 98 | bar bar 46 3 1518.0 0.005519380548429014 0.0018397935161430046 99 | bar bar 47 2 1142.0 0.003471630969823881 
0.0017358154849119406 100 | bar bar 48 1 756.0 0.0019403909506671992 0.0019403909506671992 101 | bar bar 49 0 361.0 0.0 102 |
-------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_features.bed: --------------------------------------------------------------------------------
1 | chr1 100100000 100150000 2 | chr2 100200000 100250000 3 |
-------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_regions.bed: --------------------------------------------------------------------------------
1 | chr1 100000000 150000000 foo 2 | chr2 100000000 150000000 bar 3 |
-------------------------------------------------------------------------------- /tests/data/dotfinder_mock_inputs.npz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/dotfinder_mock_inputs.npz
-------------------------------------------------------------------------------- /tests/data/dotfinder_mock_res.csv.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/dotfinder_mock_res.csv.gz
-------------------------------------------------------------------------------- /tests/data/make_test_compartments.py: --------------------------------------------------------------------------------
1 | import subprocess 2 | import pandas as pd 3 | import numpy as np 4 | import h5py 5 | 6 |
7 | # make chromsizes
8 | with open("./test.chrom.sizes", "w") as chromsizes: 9 | chromsizes.write("chr1\t1000\n") 10 | chromsizes.write("chr2\t2000\n") 11 | chromsizes.write("chr3\t3000")
12 | 13 | BIN_SIZE = 10 14 | # make bins
15 | subprocess.check_output( 16 | f"cooltools genome binnify ./test.chrom.sizes {BIN_SIZE} > ./test.10.bins", 17 | shell=True, 18 | )
19 | 20 | # make Hi-C data 21 | bins = pd.read_table("./test.10.bins", sep="\t")
22 | EIG_PERIOD_BP = 500 23 | EIG_AMPLITUDE = np.sqrt(0.5) 24 | SCALING = -2 25 | MAX_CIS_COUNTS = 1e8 26 | MAX_TRANS_COUNTS = 1e5
27 | 28 | bins["eig"] = EIG_AMPLITUDE * np.sin(bins.start * 2 * np.pi / EIG_PERIOD_BP) 29 | bins["key"] = 0
30 | pixels = pd.merge(bins, bins, on="key", how="outer", suffixes=("1", "2")) 31 | pixels.drop("key", axis="columns", inplace=True) 32 | pixels["count"] = np.nan
33 | 34 | cis = pixels.chrom1 == pixels.chrom2
35 | pixels.loc[cis, "count"] = pixels[cis].eval( 36 | "@MAX_CIS_COUNTS * ((abs(start1-start2)+@BIN_SIZE)**@SCALING) * (1.0+eig1*eig2)" 37 | )
38 | pixels.loc[~cis, "count"] = pixels[~cis].eval("@MAX_TRANS_COUNTS * (1.0+eig1*eig2)")
39 | 40 | pixels["count"] = pixels["count"].astype(int)
41 | pixels[["chrom1", "start1", "end1", "chrom2", "start2", "end2", "count"]].to_csv( 42 | "./sin_eigs_mat.bg2.gz", sep="\t", index=False, header=None, compression="gzip" 43 | )
44 | 45 | 46 | # make a cooler
47 | subprocess.check_output( 48 | "cooler load -f bg2 --count-as-float --tril-action drop " 49 | + f"./test.chrom.sizes:{BIN_SIZE} ./sin_eigs_mat.bg2.gz " 50 | + "./sin_eigs_mat.cool", 51 | shell=True, 52 | )
53 | 54 | # fake IC: write unit balancing weights, so that "balanced" values equal the raw counts
55 | f = h5py.File("./sin_eigs_mat.cool", "r+") 56 | f["bins/weight"] = np.ones_like(f["bins/start"], dtype=float) 57 | f["bins/weight"].attrs["ignore_diags"] = 2 58 | f.close() 59 |
-------------------------------------------------------------------------------- /tests/data/mm9.chrom.sizes.reduced:
-------------------------------------------------------------------------------- 1 | chr1 197195432 2 | chr2 181748087 3 | chr3 159599783 4 | chr4 155630120 5 | chr5 152537259 6 | chr6 149517037 7 | chr7 152524553 8 | chr8 131738871 9 | chr9 124076172 10 | chr10 129993255 11 | chr11 121843856 12 | chr12 121257530 13 | chr13 120284312 14 | chr14 125194864 15 | chr15 103494974 16 | chr16 98319150 17 | chr17 95272651 18 | chr18 90772031 19 | chr19 61342430 20 | chrX 166650296 21 | chrY 15902555 22 | chrM 16299 23 | -------------------------------------------------------------------------------- /tests/data/mm9.named_nonoverlap_regions.bed: -------------------------------------------------------------------------------- 1 | chr1 0 99000000 chr1_firsthalf 2 | chr1 100000000 197195432 chr1_secondhalf 3 | chr2 0 99000000 chr2_firsthalf 4 | chr2 100000000 181748087 chr2_secondhalf 5 | chr10 0 129993255 chr10_full 6 | -------------------------------------------------------------------------------- /tests/data/sin_eigs_mat.bg2.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/sin_eigs_mat.bg2.gz -------------------------------------------------------------------------------- /tests/data/sin_eigs_mat.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/sin_eigs_mat.cool -------------------------------------------------------------------------------- /tests/data/test.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 1000 2 | chr2 2000 3 | chr3 3000 -------------------------------------------------------------------------------- /tests/test_call-dots.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | 3 | from click.testing import CliRunner 4 | from cooltools.cli import cli 5 | import cooler 6 | import numpy as np 7 | from cooltools import api 8 | from cooltools.lib.io import read_viewframe_from_file, read_expected_from_file 9 | 10 | 11 | # test user-facing API for calling dots 12 | def test_dots(request): 13 | # Note that call-dots requires ucsc named expected and view 14 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 15 | in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") 16 | in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") 17 | 18 | # read data for the test: 19 | clr = cooler.Cooler(in_cool) 20 | view_df = read_viewframe_from_file(in_regions, clr, check_sorting=True) 21 | expected_df = read_expected_from_file( 22 | in_exp, 23 | expected_value_cols=["balanced.avg"], 24 | verify_view=view_df, 25 | verify_cooler=clr, 26 | ) 27 | 28 | # generate dot-calls 29 | dot_calls_df = api.dotfinder.dots( 30 | clr, 31 | expected_df, 32 | view_df=view_df, 33 | kernels={ 34 | "d": np.array([[1, 0, 1], [0, 0, 0], [1, 0, 1]]), 35 | "v": np.array([[0, 1, 0], [0, 0, 0], [0, 1, 0]]), 36 | "h": np.array([[0, 0, 0], [1, 0, 1], [0, 0, 0]]), 37 | }, 38 | max_loci_separation=100_000_000, 39 | max_nans_tolerated=1, 40 | n_lambda_bins=50, 41 | lambda_bin_fdr=0.1, 42 | clustering_radius=False, 43 | cluster_filtering=None, 44 | tile_size=50_000_000, 45 | nproc=1, 46 | ) 47 | 48 | # no comparison with reference results yet 49 | # just checking if it runs without errors 50 | 
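# (the 3x3 "d"/"v"/"h" kernels above are toy stand-ins for the larger
# donut-style kernels used by default; here we only require that some
# candidate dots come back)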
assert not dot_calls_df.empty 51 |
52 | dot_calls_df_pooled = api.dotfinder.dots( 53 | clr, 54 | expected_df, 55 | view_df=view_df,
56 | kernels={ 57 | "d": np.array([[1, 0, 1], [0, 0, 0], [1, 0, 1]]), 58 | "v": np.array([[0, 1, 0], [0, 0, 0], [0, 1, 0]]), 59 | "h": np.array([[0, 0, 0], [1, 0, 1], [0, 0, 0]]), 60 | },
61 | max_loci_separation=100_000_000, 62 | max_nans_tolerated=1, 63 | n_lambda_bins=50, 64 | lambda_bin_fdr=0.1, 65 | clustering_radius=False, 66 | cluster_filtering=None, 67 | tile_size=50_000_000, 68 | nproc=3, 69 | )
70 | 71 | assert dot_calls_df.equals(dot_calls_df_pooled)
72 | 73 | 74 | def test_call_dots_cli(request, tmpdir):
75 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 76 | in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") 77 | out_dots = op.join(tmpdir, "test.dots")
78 | 79 | runner = CliRunner()
80 | result = runner.invoke( 81 | cli, 82 | [ 83 | "dots", 84 | "-p", 85 | 1, 86 | "--tile-size", 87 | 60_000_000, 88 | "--max-loci-separation", 89 | 100_000_000, 90 | "--output", 91 | out_dots, 92 | in_cool, 93 | in_exp, 94 | ], 95 | )
96 | # This command should fail because the viewframe interpreted from the cooler does not correspond to toy_expected:
97 | assert result.exit_code == 1
98 | 99 | 100 | # This test is commented out for now, until we swap out the input data and/or allow custom kernels
101 | 102 | # def test_call_dots_view_cli(request, tmpdir):
103 | # # Note that call-dots requires a UCSC-named expected and view
104 | # in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 105 | # in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") 106 | # in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") 107 | # out_dots = op.join(tmpdir, "test.dots")
108 | 109 | # runner = CliRunner()
110 | # cmd = [ 111 | # "dots", 112 | # "--view", 113 | # in_regions, 114 | # "-p", 115 | # 1, 116 | # "--tile-size", 117 | # 60_000_000, 118 | # "--max-loci-separation", 119 | # 100_000_000, 120 | # "--output", 121 | # out_dots, 122 | # in_cool, 123 | # in_exp, 124 | # ]
125 | # result = runner.invoke(cli, cmd) 126 | # assert result.exit_code == 0 127 | # # make sure output is generated: 128 | # assert op.isfile(out_dots) 129 | 130 |
-------------------------------------------------------------------------------- /tests/test_coverage.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | import cooler 3 | 4 | import cooltools 5 | import cooltools.api 6 | from numpy import testing 7 | import numpy as np 8 | import pandas as pd
9 | 10 | 11 | def test_coverage_symmetric_upper(request): 12 | # perform test:
13 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
14 | cis_cov, tot_cov = cooltools.api.coverage.coverage( 15 | clr, ignore_diags=2, chunksize=int(1e7) 16 | )
17 | 18 | # Test that the minimal non-zero total coverage is at least 1 (raw counts)
19 | assert tot_cov[tot_cov > 0].min() >= 1
20 | # Check multiprocessed result
21 | cis_cov_pooled, tot_cov_pooled = cooltools.api.coverage.coverage( 22 | clr, ignore_diags=2, chunksize=int(1e7), nproc=3 23 | )
24 | assert np.array_equal(cis_cov, cis_cov_pooled, equal_nan=True) 25 | assert np.array_equal(tot_cov, tot_cov_pooled, equal_nan=True)
26 | 27 | 28 | # Test that dense matrix marginal is the same:
29 | mtx = clr.matrix(balance=False, as_pixels=False)[:] 30 | np.fill_diagonal(mtx, 0) 31 | np.fill_diagonal(mtx[1:, :], 0) 32 | np.fill_diagonal(mtx[:, 1:], 0) 33 | cov_dense =
np.sum(mtx, axis=1) 34 | testing.assert_allclose( 35 | actual=tot_cov, 36 | desired=cov_dense, 37 | equal_nan=True, 38 | ) 39 | 40 | """ generate the following cooler to test coverage: 41 | array([[0, 1, 2], 42 | [1, 0, 0], 43 | [2, 0, 0]], dtype=int32) 44 | """ 45 | 46 | bins = pd.DataFrame( 47 | [["chr1", 0, 1], ["chr1", 1, 2], ["chrX", 1, 2]], 48 | columns=["chrom", "start", "end"], 49 | ) 50 | 51 | pixels = pd.DataFrame( 52 | [[0, 1, 1], [0, 2, 2]], columns=["bin1_id", "bin2_id", "count"] 53 | ) 54 | 55 | clr_file = op.join(request.fspath.dirname, "data/test_coverage.cool") 56 | cooler.create_cooler(clr_file, bins, pixels) 57 | clr = cooler.Cooler(clr_file) 58 | cis_cov, tot_cov = cooltools.coverage(clr, ignore_diags=0, store=True) 59 | assert (cis_cov == np.array([1, 1, 0])).all() 60 | assert (tot_cov == np.array([3, 1, 2])).all() 61 | assert clr.info["cis"] == 1 62 | assert clr.info["sum"] == 3 63 | 64 | def test_balanced_coverage(request): 65 | # perform test: 66 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")) 67 | cis_cov_weight, tot_cov_weight = cooltools.api.coverage.coverage( 68 | clr, ignore_diags=2, chunksize=int(1e7), clr_weight_name="weight" 69 | ) 70 | 71 | # Test that mean total balanced coverage is 1.0 72 | assert np.nanmean(tot_cov_weight) == 1.0 73 | 74 | cis_cov_weight_pooled, tot_cov_weight_pooled = cooltools.api.coverage.coverage( 75 | clr, ignore_diags=2, chunksize=int(1e7), clr_weight_name="weight", nproc=3 76 | ) 77 | assert np.array_equal(cis_cov_weight, cis_cov_weight_pooled, equal_nan=True) 78 | assert np.array_equal(tot_cov_weight, tot_cov_weight_pooled, equal_nan=True) 79 | 80 | # Generate test matrix with weights 81 | bins=pd.DataFrame( 82 | [["chr1", 0, 1, 0.5], 83 | ["chr1", 1, 2, 1], 84 | ["chrX", 1, 2, 0.2], 85 | ["chrX", 2, 3, np.nan]], 86 | columns=["chrom", "start", "end", "weight"], 87 | ) 88 | 89 | pixels = pd.DataFrame( 90 | [[0, 1, 1], [0, 2, 2], [1, 3, 2], [2, 3, 1]], 91 | columns=["bin1_id", "bin2_id", "count"] 92 | ) 93 | 94 | clr_file = op.join(request.fspath.dirname, "data/test_coverage.cool") 95 | cooler.create_cooler(clr_file, bins, pixels) 96 | clr = cooler.Cooler(clr_file) 97 | cis_cov_weight, tot_cov_weight = cooltools.coverage(clr, ignore_diags=0, store=True, clr_weight_name="weight") 98 | assert np.allclose(cis_cov_weight, np.array([0.5, 0.5, 0, np.nan]), 99 | equal_nan=True) 100 | assert np.allclose(tot_cov_weight, np.array([0.7, 0.5, 0.2, np.nan]), 101 | equal_nan=True) 102 | -------------------------------------------------------------------------------- /tests/test_insulation.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from click.testing import CliRunner 6 | from cooltools.cli import cli 7 | 8 | from cooltools.api.insulation import ( 9 | calculate_insulation_score, 10 | find_boundaries, 11 | insul_diamond, 12 | _find_insulating_boundaries_dense, 13 | ) 14 | import cooler 15 | 16 | def test_insulation_cli(request, tmpdir): 17 | 18 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 19 | window = 10_000_000 20 | out_prefix = op.join(tmpdir, "CN.insulation.tsv") 21 | runner = CliRunner() 22 | result = runner.invoke(cli, ["insulation", "-o", out_prefix, in_cool, window]) 23 | assert result.exit_code == 1 24 | 25 | 26 | def test_insulation_cli_nobalance(request, tmpdir): 27 | 28 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 29 | window = 
10_000_000 30 | out_prefix = op.join(tmpdir, "CN.insulation.tsv") 31 | runner = CliRunner() 32 | result = runner.invoke( 33 | cli, 34 | [ 35 | "insulation", 36 | "-o", 37 | out_prefix, 38 | "--clr-weight-name", 39 | "", 40 | "--ignore-diags", 41 | 1, 42 | in_cool, 43 | window, 44 | ], 45 | ) 46 | assert result.exit_code == 1 47 | 48 | 49 | def test_calculate_insulation_score(request): 50 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 51 | clr = cooler.Cooler(clr_path) 52 | windows = [10_000_000, 20_000_000] 53 | 54 | # I. Regular insulation, check presence of columns for each window: 55 | insulation = calculate_insulation_score(clr, windows) 56 | assert {f"log2_insulation_score_{window}" for window in windows}.issubset( 57 | insulation.columns 58 | ) 59 | assert {f"n_valid_pixels_{window}" for window in windows}.issubset( 60 | insulation.columns 61 | ) 62 | # check multiprocessed result 63 | insulation_pooled = calculate_insulation_score(clr, windows, nproc=3) 64 | assert insulation.equals(insulation_pooled) 65 | 66 | # II. Insulation with masking bad bins 67 | insulation = calculate_insulation_score(clr, 10_000_000, min_dist_bad_bin=1) 68 | # All bins closer than 1 to bad bins are filled with np.nans: 69 | assert np.all( 70 | np.isnan(insulation.query("dist_bad_bin==0")["log2_insulation_score_10000000"]) 71 | ) 72 | # Some of the bins at the distance 1 (above threshold) are not np.nans: 73 | assert np.any( 74 | ~np.isnan(insulation.query("dist_bad_bin==1")["log2_insulation_score_10000000"]) 75 | ) 76 | # check multiprocessed result 77 | insulation_pooled = calculate_insulation_score(clr, 10_000_000, min_dist_bad_bin=1, nproc=3) 78 | assert insulation.equals(insulation_pooled) 79 | 80 | # III. Insulation for separate view: 81 | region = pd.DataFrame( 82 | {"chrom": ["chr1"], "start": [0], "end": [10_000_000], "name": ["fragment01"]} 83 | ) 84 | insulation = calculate_insulation_score( 85 | clr, 10_000_000, min_dist_bad_bin=0, view_df=region 86 | ) 87 | assert len(insulation) == 10 88 | # check multiprocessed result 89 | insulation_pooled = calculate_insulation_score( 90 | clr, 10_000_000, min_dist_bad_bin=0, view_df=region, nproc=3 91 | ) 92 | assert insulation.equals(insulation_pooled) 93 | 94 | # IV. Insulation with string or float inputs for window sizes should work. 95 | calculate_insulation_score(clr, '10_000_000') 96 | calculate_insulation_score(clr, '10_000_000', nproc=3) 97 | 98 | 99 | def test_find_boundaries(request): 100 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 101 | clr = cooler.Cooler(clr_path) 102 | windows = [10_000_000, 20_000_000] 103 | 104 | # I. 
Regular boundaries, check presence of columns for each window: 105 | insulation = calculate_insulation_score(clr, windows) 106 | boundaries = find_boundaries(insulation) 107 | assert {f"boundary_strength_{window}" for window in windows}.issubset( 108 | boundaries.columns 109 | ) 110 | 111 | 112 | def test_insul_diamond(request): 113 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 114 | clr = cooler.Cooler(clr_path) 115 | 116 | # Pixel query 117 | from cooltools.lib._query import CSRSelector 118 | 119 | nbins = len(clr.bins()) 120 | chunksize = 10_000 121 | selector = CSRSelector( 122 | clr.open("r"), shape=(nbins, nbins), field="count", chunksize=chunksize 123 | ) 124 | c0 = 0 125 | c1 = 10 126 | pixel_query = selector[c0:c1, c0:c1] 127 | 128 | # Define bins with different weights: 129 | bins = pd.DataFrame( 130 | [ 131 | ["chr1", 0, 1000000, 1, 0.1, 0.01], 132 | ["chr1", 1000000, 2000000, 1, 0.1, 0.01], 133 | ["chr1", 2000000, 3000000, 1, 0.1, 0.01], 134 | ["chr1", 3000000, 4000000, 1, 0.1, 0.01], 135 | ["chr1", 4000000, 5000000, 1, 0.1, 0.01], 136 | ["chr1", 5000000, 6000000, 1, 0.1, 0.01], 137 | ["chr1", 6000000, 7000000, 1, 0.1, 0.01], 138 | ["chr1", 7000000, 8000000, 1, 0.1, 0.01], 139 | ["chr1", 8000000, 9000000, 1, 0.1, 0.01], 140 | ["chr1", 9000000, 10000000, 1, 0.1, 0.01], 141 | ], 142 | columns=["chrom", "start", "end", "weight", "weight_cis", "weight_trans"], 143 | ) 144 | 145 | # Run insul_diamond: 146 | score, n_pixels, sum_balanced, sum_counts = insul_diamond( 147 | pixel_query, 148 | bins, 149 | window=3, 150 | ignore_diags=2, 151 | norm_by_median=False, 152 | clr_weight_name="weight", 153 | ) 154 | 155 | assert np.allclose(sum_balanced, sum_counts) 156 | 157 | score, n_pixels, sum_balanced, sum_counts = insul_diamond( 158 | pixel_query, 159 | bins, 160 | window=3, 161 | ignore_diags=2, 162 | norm_by_median=False, 163 | clr_weight_name="weight_cis", 164 | ) 165 | 166 | assert np.allclose(sum_balanced, 0.01 * sum_counts) 167 | 168 | 169 | def test_insulation_sparse_vs_dense(request): 170 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 171 | clr = cooler.Cooler(clr_path) 172 | insul_dense = _find_insulating_boundaries_dense( 173 | clr, 174 | 10_000_000, 175 | clr_weight_name="weight", 176 | min_dist_bad_bin=0, 177 | ignore_diags=2, 178 | ) 179 | 180 | insulation_sparse = calculate_insulation_score( 181 | clr, 10_000_000, clr_weight_name="weight", min_dist_bad_bin=0, ignore_diags=2 182 | ) 183 | boundaries_sparse = find_boundaries(insulation_sparse) 184 | 185 | assert np.allclose( 186 | insul_dense["log2_insulation_score_10000000"], 187 | boundaries_sparse["log2_insulation_score_10000000"], 188 | equal_nan=True, 189 | ) -------------------------------------------------------------------------------- /tests/test_io.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import pandas as pd 3 | from cooltools.lib.io import read_expected_from_file, read_viewframe_from_file 4 | from cooltools.lib import is_valid_expected 5 | import bioframe 6 | import pytest 7 | 8 | 9 | def test_read_expected_from_file(request, tmpdir): 10 | 11 | expected_file = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") 12 | expected_df = read_expected_from_file(expected_file, expected_value_cols=["balanced.avg"]) 13 | 14 | assert is_valid_expected( 15 | expected_df, "cis", expected_value_cols=["balanced.avg"] 16 | ) 17 | 18 | # test for error when string in one row of "n_valid" 
column (supposed to be Int64 dtype): 19 | expected_df_wrongdtype = expected_df.copy()
20 | expected_df_wrongdtype["n_valid"] = expected_df_wrongdtype["n_valid"].astype(str) 21 | expected_df_wrongdtype.loc[0,"n_valid"] = "string"
22 | expected_df_wrongdtype.to_csv(op.join(tmpdir, "CN.mm9.toy_expected_wrongdtype.tsv"), 23 | sep="\t", index=False)
24 | with pytest.raises(ValueError): 25 | read_expected_from_file( 26 | op.join(tmpdir, "CN.mm9.toy_expected_wrongdtype.tsv"), 27 | expected_value_cols=["balanced.avg"], 28 | )
29 | 30 | # test that read_expected_from_file works if chroms are a mix of str and int
31 | expected_df_intchr = expected_df.copy() 32 | expected_df_intchr["region1"] = expected_df_intchr["region1"].str.replace('chr1','1') 33 | expected_df_intchr["region2"] = expected_df_intchr["region2"].str.replace('chr1','1')
34 | expected_df_intchr.to_csv(op.join(tmpdir, "CN.mm9.toy_expected_intchr.tsv"), 35 | sep="\t", index=False)
36 | expected_df_intchr = read_expected_from_file(op.join(tmpdir, "CN.mm9.toy_expected_intchr.tsv"), 37 | expected_value_cols=["balanced.avg"])
38 | assert is_valid_expected( 39 | expected_df_intchr, "cis", expected_value_cols=["balanced.avg"] 40 | )
41 | 42 | 43 | def test_read_viewframe_from_file(request, tmpdir):
44 | 45 | # test a viewframe with 4 columns - i.e. with unique names
46 | view_file_wnames = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") 47 | view_df = read_viewframe_from_file(view_file_wnames, verify_cooler=None, check_sorting=False) 48 | assert bioframe.is_viewframe(view_df)
49 | 50 | # test a viewframe with 3 columns - i.e. without unique names
51 | view_file_wonames = op.join(request.fspath.dirname, "data/CN.mm9.toy_features.bed") 52 | view_df = read_viewframe_from_file(view_file_wonames, verify_cooler=None, check_sorting=False) 53 | assert bioframe.is_viewframe(view_df)
54 | # for a 3-column viewframe, UCSC strings should be assigned as names
55 | assert view_df["name"].apply(bioframe.is_complete_ucsc_string).all() 56 |
-------------------------------------------------------------------------------- /tests/test_lazy_toeplitz.py: --------------------------------------------------------------------------------
1 | from scipy.linalg import toeplitz 2 | import numpy as np 3 | from cooltools.lib.numutils import LazyToeplitz
4 | 5 | 6 | n = 100 7 | m = 150 8 | c = np.arange(1, n + 1) 9 | r = np.r_[1, np.arange(-2, -m, -1)]
10 | 11 | L = LazyToeplitz(c, r) 12 | T = toeplitz(c, r)
13 | 14 | 15 | def test_symmetric(): 16 | for si in [ 17 | slice(10, 20), 18 | slice(0, 150), 19 | slice(0, 0), 20 | slice(150, 150), 21 | slice(10, 10), 22 | ]: 23 | assert np.allclose(L[si, si], T[si, si])
24 | 25 | 26 | def test_triu_no_overlap(): 27 | for si, sj in [ 28 | (slice(10, 20), slice(30, 40)), 29 | (slice(10, 15), slice(30, 40)), 30 | (slice(10, 20), slice(30, 45)), 31 | ]: 32 | assert np.allclose(L[si, sj], T[si, sj])
33 | 34 | 35 | def test_tril_no_overlap(): 36 | for si, sj in [ 37 | (slice(30, 40), slice(10, 20)), 38 | (slice(30, 40), slice(10, 15)), 39 | (slice(30, 45), slice(10, 20)), 40 | ]: 41 | assert np.allclose(L[si, sj], T[si, sj])
42 | 43 | 44 | def test_triu_with_overlap(): 45 | for si, sj in [ 46 | (slice(10, 20), slice(15, 25)), 47 | (slice(13, 22), slice(15, 25)), 48 | (slice(10, 20), slice(18, 22)), 49 | ]: 50 | assert np.allclose(L[si, sj], T[si, sj])
51 | 52 | 53 | def test_tril_with_overlap(): 54 | for si, sj in [ 55 | (slice(15, 25), slice(10, 20)), 56 | (slice(15, 22), slice(10, 20)), 57 | (slice(15, 25), slice(10, 18)), 58 |
]: 59 | assert np.allclose(L[si, sj], T[si, sj])
60 | 61 | 62 | def test_nested(): 63 | for si, sj in [ 64 | (slice(10, 40), slice(20, 30)), 65 | (slice(10, 35), slice(20, 30)), 66 | (slice(10, 40), slice(20, 25)), 67 | (slice(20, 30), slice(10, 40)), 68 | ]: 69 | assert np.allclose(L[si, sj], T[si, sj]) 70 |
-------------------------------------------------------------------------------- /tests/test_lib_common.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | import pandas as pd 3 | import cooler 4 | import cooltools 5 | import pytest
6 | 7 | # TODO: tests for 8 | # assign_supports, or assign_regions, or deprecate & remove both 9 | # assign_regions_to_bins 10 | # make_cooler_view 11 | # view_from_track
12 | 13 | 14 | def test_align_track_with_cooler(request, tmpdir):
15 | 16 | clr_file = op.join(request.fspath.dirname, "data/sin_eigs_mat.cool") 17 | clr = cooler.Cooler(clr_file)
18 | 19 | # valid track with three entries that can all be aligned
20 | track = pd.DataFrame( 21 | [ 22 | ["chr1", 990, 995, 22], 23 | ["chr2", 20, 30, -1], 24 | ["chr3", 0, 10, 0.1], 25 | ], 26 | columns=["chrom", "start", "end", "value"], 27 | )
28 | assert ( 29 | ~cooltools.lib.align_track_with_cooler(track, clr)["value"].isna() 30 | ).sum() == 3
31 | 32 | # not a track, is not sorted
33 | track = pd.DataFrame( 34 | [["chr3", 0, 10, 0.1], ["chr2", 20, 30, -1], ["chr2", 0, 10, 21]], 35 | columns=["chrom", "start", "end", "value"], 36 | )
37 | with pytest.raises(ValueError): 38 | cooltools.lib.align_track_with_cooler(track, clr)
39 | 40 | # not a track, is overlapping
41 | track = pd.DataFrame( 42 | [ 43 | ["chr1", 990, 1000, 22], 44 | ["chr2", 5, 15, 0.1], 45 | ["chr2", 20, 30, -1], 46 | ], 47 | columns=["chrom", "start", "end", "value"], 48 | )
49 | with pytest.raises(ValueError): 50 | cooltools.lib.align_track_with_cooler(track, clr)
51 | 52 | # bin size mismatch
53 | track = pd.DataFrame( 54 | [["chr1", 990, 995, 22], ["chr2", 20, 25, -1], ["chr3", 0, 5, 0.1]], 55 | columns=["chrom", "start", "end", "value"], 56 | )
57 | with pytest.raises(ValueError): 58 | cooltools.lib.align_track_with_cooler(track, clr)
59 | 60 | # clr_weight_name mismatch
61 | track = pd.DataFrame( 62 | [ 63 | ["chr1", 990, 995, 22], 64 | ["chr2", 20, 30, -1], 65 | ["chr3", 0, 10, 0.1], 66 | ], 67 | columns=["chrom", "start", "end", "value"], 68 | )
69 | with pytest.raises(ValueError): 70 | cooltools.lib.align_track_with_cooler( 71 | track, clr, clr_weight_name="invalid_weight_name" 72 | )
73 | 74 | # regions with no assigned values
75 | track = pd.DataFrame( 76 | [["chr1", 0, 10, 0.1], ["chr1", 20, 30, -1], ["chr1", 990, 995, 22]], 77 | columns=["chrom", "start", "end", "value"], 78 | )
79 | with pytest.raises(ValueError): 80 | cooltools.lib.align_track_with_cooler(track, clr)
81 | 82 | # using a restricted view only considers chr1, avoiding the ValueError from no assigned values
83 | view_df = cooltools.lib.make_cooler_view(clr)
84 | assert ( 85 | ~cooltools.lib.align_track_with_cooler(track, clr, view_df=view_df[:1])[ 86 | "value" 87 | ].isna() 88 | ).sum() == 3
89 | 90 | # testing the mask_clr_bad_bins option
91 | clr_file = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 92 | clr = cooler.Cooler(clr_file) 93 | view_df = cooltools.lib.make_cooler_view(clr)[:1]
94 | 95 | track = pd.DataFrame( 96 | [["chr1", 0, 1000000, 1], ["chr1", 3000000, 4000000, 10]], 97 | columns=["chrom", "start", "end", "value"], 98 | )
99 | # without masking, both get assigned
100 | assert ( 101 |
cooltools.lib.align_track_with_cooler( 102 | track, clr, view_df=view_df, mask_clr_bad_bins=False 103 | )["value"].sum() 104 | == 11 105 | )
106 | 107 | # with masking, only the second value from the track gets assigned
108 | assert ( 109 | cooltools.lib.align_track_with_cooler( 110 | track, clr, view_df=view_df, mask_clr_bad_bins=True 111 | )["value"].sum() 112 | == 10 113 | ) 114 |
-------------------------------------------------------------------------------- /tests/test_rearrange_cooler.py: --------------------------------------------------------------------------------
1 | import cooler 2 | import bioframe 3 | import os.path as op
4 | 5 | import numpy as np
6 | 7 | from cooltools.api.rearrange import rearrange_cooler 8 | from pandas.testing import assert_frame_equal
9 | 10 | 11 | def test_rearrange_cooler(request): 12 | # Read cool file and create view_df out of it:
13 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.10000kb.cool")) 14 | orig_view = bioframe.make_viewframe(clr.chromsizes)
15 | 16 | # I. 17 | # Check that with the same view, nothing changes
18 | rearrange_cooler(clr, orig_view, "test_not_reordered.cool") 19 | new_clr = cooler.Cooler("test_not_reordered.cool")
20 | assert_frame_equal(new_clr.pixels()[:], clr.pixels()[:]) 21 | assert_frame_equal(new_clr.bins()[:], clr.bins()[:]) 22 | assert_frame_equal(new_clr.chroms()[:], clr.chroms()[:])
23 | 24 | # II. 25 | # Check that when just getting one chrom, all is as expected
26 | new_view = orig_view.iloc[:1, :] 27 | rearrange_cooler(clr, new_view, "test_chrom1_reordered.cool") 28 | new_clr = cooler.Cooler("test_chrom1_reordered.cool")
29 | old_bins = clr.bins()[:].query('chrom=="chr1"') 30 | old_bins["chrom"] = old_bins["chrom"].astype(str) 31 | new_bins = new_clr.bins()[:] 32 | new_bins["chrom"] = new_bins["chrom"].astype(str) 33 | assert_frame_equal(old_bins, new_bins)
34 | 35 | old_pixels = clr.matrix(as_pixels=True).fetch("chr1").drop(columns=["balanced"]) 36 | new_pixels = new_clr.pixels()[:] 37 | assert_frame_equal(old_pixels, new_pixels) 38 | assert_frame_equal(clr.chroms()[:1], new_clr.chroms()[:])
39 | 40 | # III. 41 | # Check that when just getting one chrom and inverting it, all is as expected
42 | inverted_view = new_view.copy() 43 | inverted_view["strand"] = "-" 44 | rearrange_cooler(clr, inverted_view, "test_chrom1_reordered_inverted.cool") 45 | inverted_clr = cooler.Cooler("test_chrom1_reordered_inverted.cool")
46 | inverted_bins = inverted_clr.bins()[:] 47 | inverted_bins[["end", "start"]] = ( 48 | inverted_bins.iloc[-1]["end"] - inverted_bins[["start", "end"]] 49 | )
50 | inverted_bins = inverted_bins.iloc[::-1].reset_index(drop=True) 51 | inverted_bins["chrom"] = inverted_bins["chrom"].astype(str) 52 | assert_frame_equal(new_bins, inverted_bins)
53 | inverted_pixels = inverted_clr.pixels()[:] 54 | inverted_pixels[["bin1_id", "bin2_id"]] = np.sort( 55 | inverted_bins.index[-1] - inverted_pixels[["bin1_id", "bin2_id"]] 56 | )
57 | inverted_pixels = inverted_pixels.sort_values(["bin1_id", "bin2_id"]).reset_index( 58 | drop=True 59 | )
60 | assert_frame_equal(new_clr.pixels()[:], inverted_pixels) 61 | assert_frame_equal(new_clr.chroms()[:1], inverted_clr.chroms()[:])
62 | 63 | # IV.
64 | # Check that when taking two chromosomes in a different order and inverting one,
65 | # all is as expected
66 | 67 | reorder_invert_view = ( 68 | orig_view.iloc[1::-1].assign(strand=["+", "-"]).reset_index(drop=True) 69 | )
70 | rearrange_cooler(clr, reorder_invert_view, "test_chr2chr1_reordered_inverted.cool") 71 | reordered_inverted_clr = cooler.Cooler("test_chr2chr1_reordered_inverted.cool")
72 | 73 | # compare chr2 bins
74 | old_bins_chr2 = clr.bins().fetch("chr2").reset_index(drop=True) 75 | old_bins_chr2["chrom"] = old_bins_chr2["chrom"].astype(str)
76 | reordered_inverted_bins_chr2 = reordered_inverted_clr.bins().fetch("chr2") 77 | reordered_inverted_bins_chr2["chrom"] = reordered_inverted_bins_chr2[ 78 | "chrom" 79 | ].astype(str)
80 | assert_frame_equal(old_bins_chr2, reordered_inverted_bins_chr2)
81 | # compare chr2 pixels
82 | old_pixels_chr2 = ( 83 | clr.pixels() 84 | .fetch("chr2") 85 | .query(f'bin2_id<={clr.bins().fetch("chr2").index[-1]}') 86 | .reset_index(drop=True) 87 | )
88 | reordered_inverted_pixels_chr2 = ( 89 | reordered_inverted_clr.pixels() 90 | .fetch("chr2") 91 | .query(f'bin2_id<={reordered_inverted_clr.bins().fetch("chr2").index[-1]}') 92 | .reset_index(drop=True) 93 | )
94 | reordered_inverted_pixels_chr2[["bin1_id", "bin2_id"]] += ( 95 | clr.bins().fetch("chr1").index[-1] + 1 96 | )
97 | assert_frame_equal(old_pixels_chr2, reordered_inverted_pixels_chr2)
98 | # Compare chr1 bins
99 | old_bins_chr1 = clr.bins().fetch("chr1") 100 | old_bins_chr1["chrom"] = old_bins_chr1["chrom"].astype(str)
101 | 102 | reordered_inverted_bins_chr1 = reordered_inverted_clr.bins().fetch("chr1")
103 | reordered_inverted_bins_chr1[["end", "start"]] = ( 104 | reordered_inverted_bins_chr1.iloc[-1]["end"] 105 | - reordered_inverted_bins_chr1[["start", "end"]] 106 | )
107 | reordered_inverted_bins_chr1.index = ( 108 | reordered_inverted_bins_chr1.index[::-1] - old_bins_chr1.index[-1] 109 | )
110 | reordered_inverted_bins_chr1 = reordered_inverted_bins_chr1.iloc[::-1]
111 | reordered_inverted_bins_chr1["chrom"] = reordered_inverted_bins_chr1[ 112 | "chrom" 113 | ].astype(str)
114 | 115 | assert_frame_equal(old_bins_chr1, reordered_inverted_bins_chr1)
116 | # Compare chr1 pixels
117 | old_pixels_chr1 = ( 118 | clr.pixels() 119 | .fetch("chr1") 120 | .query(f'bin2_id<={clr.bins().fetch("chr1").index[-1]}') 121 | .reset_index(drop=True) 122 | )
123 | reordered_inverted_pixels_chr1 = reordered_inverted_clr.pixels().fetch("chr1")
124 | 125 | reordered_inverted_pixels_chr1[["bin1_id", "bin2_id"]] = np.sort( 126 | reordered_inverted_bins_chr1.index[-1] 127 | - reordered_inverted_pixels_chr1[["bin1_id", "bin2_id"]] 128 | + reordered_inverted_bins_chr2.index[-1] 129 | + 1 130 | )
131 | reordered_inverted_pixels_chr1 = reordered_inverted_pixels_chr1.sort_values(["bin1_id", "bin2_id"]).reset_index(drop=True)
132 | assert_frame_equal(old_pixels_chr1, reordered_inverted_pixels_chr1)
133 | # Compare trans matrix (easier than pixels)
134 | old_trans_m = clr.matrix().fetch("chr1", "chr2")
135 | reordered_inverted_trans_m = reordered_inverted_clr.matrix().fetch("chr1", "chr2")[ 136 | ::-1, : 137 | ]
138 | assert np.array_equal(old_trans_m, reordered_inverted_trans_m, equal_nan=True) 139 |
-------------------------------------------------------------------------------- /tests/test_sample.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | import cooler
3 | 4 | import cooltools 5 | import cooltools.api 6 | from numpy import testing
7 | 8 | 9 | def test_sample(request): 10 | #
perform test: 11 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
12 | 13 | cooltools.api.sample.sample( 14 | clr, 15 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"), 16 | frac=0.2, 17 | nproc=3 18 | )
19 | clr_result = cooler.Cooler( 20 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool") 21 | )
22 | # Test that the deviation from the expected total is very small
23 | testing.assert_allclose(clr_result.info["sum"], clr.info["sum"] / 5, rtol=1e-3)
24 | 25 | cooltools.api.sample.sample( 26 | clr, 27 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"), 28 | count=20000000, 29 | nproc=3 30 | )
31 | clr_result = cooler.Cooler( 32 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool") 33 | )
34 | # Test that the deviation from the expected total is very small
35 | testing.assert_allclose(clr_result.info["sum"], 20000000, rtol=1e-2)
36 | 37 | 38 | def test_sample_exact(request): 39 | # Exact sampling is very slow, so test it on the coarse 10000kb cooler
40 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.10000kb.cool"))
41 | 42 | cooltools.api.sample.sample( 43 | clr, 44 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"), 45 | frac=0.2, 46 | exact=True, 47 | nproc=3 48 | )
49 | clr_result = cooler.Cooler( 50 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool") 51 | )
52 | # Test that result matches expectation exactly
53 | testing.assert_equal(clr_result.info["sum"], round(clr.info["sum"] * 0.2))
54 | 55 | cooltools.api.sample.sample( 56 | clr, 57 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"), 58 | count=2000000, 59 | exact=True, 60 | nproc=3 61 | )
62 | clr_result = cooler.Cooler( 63 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool") 64 | )
65 | # Test that result matches expectation exactly
66 | testing.assert_equal(clr_result.info["sum"], 2000000) 67 |
-------------------------------------------------------------------------------- /tests/test_virtual4c.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | 3 | from click.testing import CliRunner 4 | from cooltools.cli import cli
5 | 6 | from cooltools.api import virtual4c 7 | import cooler
8 | 9 | 10 | def test_virtual4c(request):
11 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 12 | clr = cooler.Cooler(clr_path) 13 | viewpoint = "chr1:30000000-40000000"
14 | 15 | v4c = virtual4c.virtual4c(clr, viewpoint)
16 | 17 | assert v4c.shape[0] == clr.bins()[:].shape[0]
18 | 19 | # check multiprocessed result
20 | pooled_v4c = virtual4c.virtual4c(clr, viewpoint, nproc=3) 21 | assert v4c.equals(pooled_v4c)
22 | 23 | 24 | def test_virtual4c_cli(request, tmpdir):
25 | 26 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 27 | out_prefix = op.join(tmpdir, "CN.virtual4c") 28 | viewpoint = "chr1:30000000-40000000"
29 | 30 | runner = CliRunner() 31 | result = runner.invoke(cli, ["virtual4c", "-o", out_prefix, in_cool, viewpoint]) 32 | assert result.exit_code == 0
33 | 34 | 35 | def test_virtual4c_cli_nobalance(request, tmpdir):
36 | 37 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 38 | out_prefix = op.join(tmpdir, "CN.virtual4c") 39 | viewpoint = "chr1:30000000-40000000"
40 | 41 | runner = CliRunner() 42 | result = runner.invoke( 43 | cli, 44 | ["virtual4c", "--clr-weight-name", "", "-o", out_prefix, in_cool, viewpoint], 45 | ) 46 |
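    # an empty --clr-weight-name should request raw (unbalanced) counts rather
    # than erroring out, so the command is still expected to succeed: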
assert result.exit_code == 0 47 | --------------------------------------------------------------------------------
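A minimal usage sketch (not a repository file) tying the bundled test data to the public API that the tests above exercise; the relative paths and the top-level cooltools.coverage / cooltools.insulation aliases are assumptions inferred from this section:

import cooler
import cooltools
from cooltools.api import virtual4c

# open the bundled 1 Mb test cooler (path relative to the repository root)
clr = cooler.Cooler("tests/data/CN.mm9.1000kb.cool")

# per-bin cis and total coverage, as exercised by tests/test_coverage.py
cis_cov, tot_cov = cooltools.coverage(clr, ignore_diags=2)

# log2 insulation scores for a 10 Mb diamond window, as in tests/test_insulation.py
insulation_df = cooltools.insulation(clr, [10_000_000])

# a virtual-4C profile anchored at a fixed viewpoint, as in tests/test_virtual4c.py
v4c_df = virtual4c.virtual4c(clr, "chr1:30000000-40000000")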