├── .coveragerc ├── .flake8 ├── .github └── workflows │ ├── pypi-publish.yml │ ├── pytest.yml │ └── python-publish-test.yml ├── .gitignore ├── .readthedocs.yml ├── .zenodo.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── cooltools ├── __init__.py ├── __main__.py ├── api │ ├── __init__.py │ ├── coverage.py │ ├── directionality.py │ ├── dotfinder.py │ ├── eigdecomp.py │ ├── expected.py │ ├── insulation.py │ ├── rearrange.py │ ├── saddle.py │ ├── sample.py │ ├── snipping.py │ └── virtual4c.py ├── cli │ ├── __init__.py │ ├── coverage.py │ ├── dots.py │ ├── eigs_cis.py │ ├── eigs_trans.py │ ├── expected_cis.py │ ├── expected_trans.py │ ├── genome.py │ ├── insulation.py │ ├── logbin_expected.py │ ├── pileup.py │ ├── rearrange.py │ ├── saddle.py │ ├── sample.py │ ├── util.py │ └── virtual4c.py ├── lib │ ├── __init__.py │ ├── _numutils.pyx │ ├── _query.py │ ├── checks.py │ ├── common.py │ ├── io.py │ ├── numutils.py │ ├── peaks.py │ ├── plotting.py │ ├── runlength.py │ └── schemas.py └── sandbox │ ├── __init__.py │ ├── balance.py │ ├── contrast.py │ ├── cool2cworld.py │ ├── cooler_filters │ ├── Example_usage.ipynb │ ├── pixel_filter_util.py │ └── test_data_util.cool │ ├── cross_score.py │ ├── expected_smoothing.py │ ├── expected_smoothing_example.ipynb │ ├── fastsavetxt.pyx │ ├── obs_over_exp_cooler.py │ ├── observed_over_expected_example.ipynb │ ├── pairs_scaling_functions.py │ └── rearrange_cooler_example.ipynb ├── datasets └── external_test_files.tsv ├── docs ├── Makefile ├── cli.rst ├── conf.py ├── cooltools.lib.rst ├── cooltools.rst ├── figs │ └── cooltools-logo-futura.png ├── index.rst ├── make.bat ├── notebooks_old │ ├── 01_scaling-curves.ipynb │ ├── 02_expected.ipynb │ ├── 03_eigendecomposition.ipynb │ ├── 04_saddle-plots.ipynb │ ├── 05_insulation-score.ipynb │ ├── 06_snipping-pileups.ipynb │ ├── 07_pileups2.ipynb │ ├── 08_dot-calling-internals.ipynb │ └── data │ │ └── encode_motifs.hg38.ctcf_known1.liftover.bed.gz ├── releases.md └── requirements.txt ├── pyproject.toml ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── data ├── CN.mm9.10000kb.cool ├── CN.mm9.1000kb.cool ├── CN.mm9.toy_expected.chromnamed.tsv ├── CN.mm9.toy_expected.tsv ├── CN.mm9.toy_features.bed ├── CN.mm9.toy_regions.bed ├── dotfinder_mock_inputs.npz ├── dotfinder_mock_res.csv.gz ├── make_test_compartments.py ├── mm9.chrom.sizes.reduced ├── mm9.named_nonoverlap_regions.bed ├── sin_eigs_mat.bg2.gz ├── sin_eigs_mat.cool ├── sin_eigs_track.tsv ├── test.10.bins └── test.chrom.sizes ├── test_call-dots.py ├── test_checks.py ├── test_compartments_saddle.py ├── test_coverage.py ├── test_dotfinder_chunking.py ├── test_dotfinder_stats.py ├── test_expected.py ├── test_insulation.py ├── test_io.py ├── test_lazy_toeplitz.py ├── test_lib_common.py ├── test_rearrange_cooler.py ├── test_sample.py ├── test_snipping.py └── test_virtual4c.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source= 3 | cooltools/ 4 | 5 | omit= 6 | cooltools/__main__.py 7 | 8 | [report] 9 | exclude_lines = 10 | pragma: no cover 11 | return NotImplemented 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | __init__.py 4 | __main__.py 5 | 6 | max-line-length = 80 7 | ignore = 8 | # whitespace before ':' 9 | E203 10 | # too many 
leading '#' for block comment 11 | E266 12 | # line too long 13 | E501 14 | # line break before binary operator 15 | W503 16 | select = 17 | # mccabe complexity 18 | C 19 | # pycodestyle 20 | E 21 | # pyflakes error 22 | F 23 | # pyflakes warning 24 | W 25 | # bugbear 26 | B 27 | # line exceeds max-line-length + 10% 28 | B950 29 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload Python Package to PyPI 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | release: 7 | types: [released] 8 | 9 | jobs: 10 | Publish: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | id-token: write 14 | 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | 19 | - name: Setup Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.x" 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install build 28 | 29 | - name: Build 30 | run: python -m build --sdist 31 | 32 | - name: Publish distribution 📦 to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Pytest 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [ '3.9', '3.10', '3.11', '3.12' ] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v1 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | pip install --upgrade pip wheel setuptools 29 | pip install numpy cython 30 | pip install -r requirements-dev.txt 31 | pip install -e . 32 | - name: Lint with flake8 33 | run: | 34 | pip install flake8 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pip install pytest
42 |         pytest
43 | 
--------------------------------------------------------------------------------
/.github/workflows/python-publish-test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Publish Python Package to Test PyPI
5 | 
6 | on:
7 |   release:
8 |     types: [prereleased]
9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install cython numpy setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist
31 |         twine upload --repository-url https://test.pypi.org/legacy/ dist/*
32 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Vim's cache
2 | *.un~
3 | 
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__
6 | cooltools/__pycache__
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | *.c
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | .venv/
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | # lib/ - not ignored, since we use cooltools/lib
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | 
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other info into it.
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | .hypothesis/ 53 | .pytest_cache 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | docs/notebooks 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | tmp/ 101 | 102 | # Downloaded data 103 | datasets/* 104 | !datasets/external_test_files.tsv 105 | tmp.npz 106 | .gitignore 107 | tmp.hdf5 108 | cooltools/sandbox/test.mcool 109 | 110 | .vscode/ 111 | .idea/ -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.10" 12 | 13 | # Build documentation in the docs/ directory with Sphinx 14 | sphinx: 15 | configuration: docs/conf.py 16 | 17 | # Build documentation with MkDocs 18 | #mkdocs: 19 | # configuration: mkdocs.yml 20 | 21 | # Optionally build your docs in additional formats such as PDF and ePub 22 | formats: all 23 | 24 | # Optionally set the version of Python and requirements required to build your docs 25 | python: 26 | install: 27 | - requirements: docs/requirements.txt 28 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "CoolTools: the tools for your .cools", 3 | "license": "MIT", 4 | "upload_type": "software", 5 | "access_right": "open", 6 | "creators": [ 7 | { 8 | "name": "Sergey Venev", 9 | "affiliation": "University of Massachusetts Medical School" 10 | }, 11 | { 12 | "name": "Nezar Abdennur", 13 | "affiliation": "MIT" 14 | }, 15 | { 16 | "name": "Anton Goloborodko", 17 | "affiliation": "IMBA" 18 | }, 19 | { 20 | "name": "Ilya Flyamer", 21 | "affiliation": "FMI" 22 | }, 23 | { 24 | "name": "Geoffrey Fudenberg", 25 | "affiliation": "University of Southern California" 26 | }, 27 | { 28 | "name": "Johannes Nuebler", 29 | "affiliation": "MIT" 30 | }, 31 | { 32 | "name": "Aleksandra Galitsyna", 33 | "affiliation": "Skolkovo Institute of Science and Technology" 34 | }, 35 | { 36 | "name": "Betul Akgol", 37 | "affiliation": "University of Massachusetts Medical School" 38 | }, 39 | { 40 | "name": "Sameer Abraham", 41 | "affiliation": "MIT" 42 | }, 43 | { 44 | "name": "Peter Kerpedjiev", 45 | "affiliation": "Harvard Medical School" 46 | }, 47 | { 48 | "name": "Maksim Imakaev", 49 | "affiliation": "MIT" 50 | } 51 | ], 52 | "keywords": [ 53 | "genomics", 54 | "bioinformatics", 55 | "Hi-C", 56 | "data", 57 | "analysis", 58 | "cooler" 59 | ] 60 | } 61 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | ## General guidelines
4 | 
5 | If you haven't contributed to open-source before, we recommend you read [this excellent guide by GitHub on how to contribute to open source](https://opensource.guide/how-to-contribute). The guide is long, so you can gloss over things you're familiar with.
6 | 
7 | If you're not already familiar with it, we follow the [fork and pull model](https://help.github.com/articles/about-collaborative-development-models) on GitHub. Also, check out this recommended [git workflow](https://www.asmeurer.com/git-workflow/).
8 | 
9 | As a rough guide for cooltools:
10 | - contributors should preferably work on their forks and submit pull requests to the main branch
11 | - core maintainers can work on feature branches in the main fork and then submit pull requests to the main branch
12 | - core maintainers can push directly to the main branch if it's urgently needed
13 | 
14 | 
15 | ## Contributing Code
16 | 
17 | This project has a number of requirements for all code contributed.
18 | 
19 | * We follow the [PEP-8 style](https://www.python.org/dev/peps/pep-0008/) convention.
20 | * We use [flake8](http://flake8.pycqa.org/en/latest/) to automatically lint the code and maintain code style. You can use a code formatter like [black](https://github.com/psf/black) or [autopep8](https://github.com/hhatto/autopep8) to help keep the linter happy.
21 | * We use [Numpy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html).
22 | * User-facing API changes or new features should have documentation added.
23 | 
24 | Ideally, provide full test coverage for new code submitted in PRs.
25 | 
26 | 
27 | ## Setting up Your Development Environment
28 | 
29 | For setting up an isolated virtual environment for development, we recommend using [conda](https://docs.conda.io/en/latest/miniconda.html). After forking and cloning the repository, install in "editable" (i.e. development) mode using the `-e` option:
30 | 
31 | ```sh
32 | $ git clone https://github.com/open2c/cooltools.git
33 | $ cd cooltools
34 | $ pip install -e .
35 | ```
36 | 
37 | Editable mode installs the package by creating a "link" to your working (repo) directory.
38 | 
39 | 
40 | ## Unit Tests
41 | 
42 | It is best if all new functionality and/or bug fixes have unit tests added with each use-case.
43 | 
44 | We use [pytest](https://docs.pytest.org/en/latest) as our unit testing framework, with the `pytest-cov` extension to check code coverage and `pytest-flake8` to check code style. You don't need to configure these extensions yourself.
45 | These automatically check code style and functionality and print code coverage, although the test run does not fail on low coverage.
46 | 
47 | Once you've configured your environment, you can just `cd` to the root of your repository and run
48 | 
49 | ```sh
50 | $ pytest
51 | ```
52 | 
53 | Unit tests are automatically run via GitHub Actions for pull requests.
54 | 
55 | 
56 | ## Coverage
57 | 
58 | The `pytest` script automatically reports coverage, both on the terminal for missing line numbers, and in annotated HTML form in `htmlcov/index.html`.
59 | 
60 | 
61 | ## Documentation
62 | 
63 | If a feature is stable and relatively finalized, it is time to add it to the documentation. If you are adding any private/public functions, it is best to add docstrings, both to aid code review and for the API reference.
64 | 
65 | We use [Numpy style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html) and [Sphinx](http://www.sphinx-doc.org/en/stable) to document this library. Sphinx, in turn, uses [reStructuredText](http://www.sphinx-doc.org/en/stable/rest.html) as its markup language.
66 | 
67 | We use the [Sphinx Autosummary extension](http://www.sphinx-doc.org/en/stable/ext/autosummary.html) to generate API references. You may want to look at `docs/cooltools.rst` to see how these files look and where to add new functions, classes or modules.
68 | 
69 | We also use the [nbsphinx extension](https://nbsphinx.readthedocs.io/en/0.5.0/) to render tutorial pages from Jupyter notebooks.
70 | 
71 | To build the documentation:
72 | 
73 | ```sh
74 | $ make docs
75 | ```
76 | 
77 | After this, you can find an HTML version of the documentation in `docs/_build/html/index.html`.
78 | 
79 | Documentation from `master` and tagged releases is automatically built and hosted thanks to [readthedocs](https://readthedocs.org/).
80 | 
81 | 
82 | ## Acknowledgement
83 | 
84 | If you've contributed significantly and would like your authorship to be included in subsequent uploads to [Zenodo](https://zenodo.org), please make a separate PR to add your name and affiliation to the `.zenodo.json` file.
85 | 
86 | ---
87 | 
88 | This document was modified from the [guidelines from the sparse project](https://github.com/pydata/sparse/blob/master/docs/contributing.rst).
89 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Cooltools developers
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG.md 2 | include README.md 3 | include LICENSE 4 | include requirements.txt 5 | include requirements-dev.txt 6 | include environment.yml 7 | 8 | include cooltools/lib/_numutils.pyx 9 | graft tests 10 | graft docs 11 | prune docs/_build 12 | prune docs/_static 13 | prune docs/_templates 14 | 15 | global-exclude __pycache__/* 16 | global-exclude *.so 17 | global-exclude *.pyd 18 | global-exclude *.pyc 19 | global-exclude .git* 20 | global-exclude .deps/* 21 | global-exclude .DS_Store 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build install test docs clean clean-pyc clean-dist build-dist publish-test publish 2 | 3 | 4 | build: 5 | python setup.py build_ext --inplace 6 | 7 | install: 8 | pip install -e . 9 | 10 | test: 11 | pytest 12 | 13 | docs: 14 | cd docs && make html 15 | 16 | 17 | clean-pyc: 18 | find . -name '*.pyc' -exec rm --force {} + 19 | find . -name '*.pyo' -exec rm --force {} + 20 | find . -name '*~' -exec rm --force {} + 21 | 22 | clean-dist: 23 | rm -rf build/ 24 | rm -rf dist/ 25 | 26 | clean: clean-pyc clean-dist 27 | 28 | 29 | build-dist: clean-dist 30 | python setup.py sdist 31 | # python setup.py bdist_wheel 32 | 33 | publish-test: build-dist 34 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* 35 | 36 | publish: build-dist 37 | twine upload dist/* 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cooltools: enabling high-resolution Hi-C analysis in Python 2 | 3 | 4 | 5 | 6 | [![Pytest](https://github.com/open2c/cooltools/actions/workflows/pytest.yml/badge.svg)](https://github.com/open2c/cooltools/actions/workflows/pytest.yml) 7 | [![Documentation Status](https://readthedocs.org/projects/cooltools/badge/?version=latest)](https://cooltools.readthedocs.io/en/latest/?badge=latest) 8 | [![Latest Release PyPI](https://img.shields.io/pypi/v/cooltools?color=blue&label=PyPI%20package)](https://pypi.org/project/cooltools) 9 | [![Latest Release Bioconda](https://img.shields.io/conda/vn/bioconda/cooltools?color=blue)](https://bioconda.github.io/recipes/cooltools/README.html) 10 | [![DOI](https://zenodo.org/badge/82413481.svg)](https://zenodo.org/badge/latestdoi/82413481) 11 | 12 | > tools for your .cools 13 | 14 | Chromosome conformation capture technologies reveal the incredible complexity of genome folding. A growing number of labs and multiple consortia, including the 4D Nucleome, the International Nucleome Consortium, and ENCODE, are generating higher-resolution datasets to probe genome architecture across cell states, types, and organisms. Larger datasets increase the challenges at each step of computational analysis, from storage, to memory, to researchers’ time. The recently-introduced [***cooler***](https://github.com/open2c/cooler/tree/master/cooler) format readily handles storage of high-resolution datasets via a sparse data model. 15 | 16 | ***cooltools*** leverages this format to enable flexible and reproducible analysis of high-resolution data. 
***cooltools*** provides a suite of computational tools with a paired Python API and command line access, which facilitates workflows either on high-performance computing clusters or via custom analysis notebooks. As part of the [***Open2C*** ecosystem](https://open2c.github.io/), ***cooltools*** also provides detailed introductions to key concepts in Hi-C data analysis with interactive notebook documentation. For more information, see the [preprint](https://doi.org/10.1101/2022.10.31.514564): https://doi.org/10.1101/2022.10.31.514564.
17 | 
18 | ## Requirements
19 | 
20 | The following are required before installing cooltools:
21 | 
22 | * Python 3.7+
23 | * `numpy`
24 | * `cython`
25 | 
26 | ## Installation
27 | 
28 | ```sh
29 | pip install cooltools
30 | ```
31 | 
32 | or install the latest version directly from GitHub:
33 | 
34 | ```
35 | $ pip install https://github.com/open2c/cooltools/archive/refs/heads/master.zip
36 | ```
37 | 
38 | See the [requirements.txt](https://github.com/open2c/cooltools/blob/master/requirements.txt) file for information on compatible dependencies, especially for [cooler](https://github.com/open2c/cooler/tree/master/cooler) and [bioframe](https://github.com/open2c/bioframe).
39 | 
40 | 
41 | ## Documentation and Tutorials
42 | 
43 | Documentation can be found here: https://cooltools.readthedocs.io/en/latest/.
44 | 
45 | Cooltools offers a number of tutorials using the [Open2C code ecosystem](https://github.com/open2c/). For users who are new to Hi-C analysis, we recommend going through example notebooks in the following order:
46 | 
47 | - [Visualization](https://cooltools.readthedocs.io/en/latest/notebooks/viz.html): how to load and visualize Hi-C data stored in coolers.
48 | - [Contacts vs Distance](https://cooltools.readthedocs.io/en/latest/notebooks/contacts_vs_distance.html): how to calculate contact frequency as a function of genomic distance, the most prominent feature in Hi-C maps.
49 | - [Compartments and Saddles](https://cooltools.readthedocs.io/en/latest/notebooks/compartments_and_saddles.html): how to extract eigenvectors and create saddleplots reflecting A/B compartments.
50 | - [Insulation and Boundaries](https://cooltools.readthedocs.io/en/latest/notebooks/insulation_and_boundaries.html): how to extract insulation profiles and call boundaries using insulation profile minima.
51 | - [Pileups and Average Patterns](https://cooltools.readthedocs.io/en/latest/notebooks/pileup_CTCF.html): how to create average maps around genomic features like CTCF.
52 | 
53 | For users interested in running analyses from the command line:
54 | - [Command line interface](https://cooltools.readthedocs.io/en/latest/notebooks/command_line_interface.html): how to use the cooltools CLI.
55 | 
56 | Note that these notebooks currently focus on mammalian interphase Hi-C analysis, but are readily extensible to other organisms and cellular contexts. To clone notebooks for interactive analysis, visit https://github.com/open2c/open2c_examples. Docs for cooltools are built directly from these notebooks.
57 | 
58 | ## Contributing
59 | Cooltools welcomes contributions. The guiding principles for tools are that they should be (i) as simple as possible, (ii) as interpretable as possible, and (iii) not involve visualization. The following applies for contributing new functionality to cooltools.
60 | 
61 | New functionality should:
62 | - clearly define the problem
63 | - discuss alternative solutions
64 | - include a separate example (as a gist/notebook/etc.) explaining its use cases on multiple datasets
65 | - be compatible with the latest versions of cooler and cooltools (e.g. should be able to be run on any cooler generated by the latest version of cooler)
66 | 
67 | New functionality should either:
68 | - generalize or extend an existing tool without impairing user experience, and be submitted as a PR to the relevant tool
69 | - or extract a distinct feature of genome organization, and be submitted as a pull request to the sandbox
70 | 
71 | Vignettes, which use existing tools in new ways, should be submitted as pull requests to open2c_vignettes as distinct Jupyter notebooks, rather than to the cooltools sandbox. The bar for contributions to this repository is minimal. We recommend that each vignette include package version information and raise an error for other versions. If it makes sense, the example data available for download using cooltools can be used to allow an easy way to try out the analysis. Otherwise, the source of data can be specified for others to obtain it.
72 | 
73 | Practical aspects for contributing can be found in the guide [here](https://github.com/open2c/cooltools/blob/master/CONTRIBUTING.md).
74 | 
75 | ## Citing `cooltools`
76 | 
77 | Open2C*, Nezar Abdennur*, Sameer Abraham, Geoffrey Fudenberg*, Ilya M. Flyamer*, Aleksandra A. Galitsyna*, Anton Goloborodko*, Maxim Imakaev, Betul A. Oksuz, and Sergey V. Venev*. “Cooltools: Enabling High-Resolution Hi-C Analysis in Python.” bioRxiv, November 1, 2022. https://doi.org/10.1101/2022.10.31.514564.
78 | 
--------------------------------------------------------------------------------
/cooltools/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Cool tools
4 | ~~~~~~~~~~
5 | 
6 | The tools for your .cool's.
7 | 
8 | :author: Cooltools developers
9 | :license: MIT
10 | 
11 | """
12 | import logging
13 | 
14 | __version__ = "0.7.1"
15 | 
16 | from . import lib
17 | 
18 | from .lib import (
19 |     numutils,
20 |     download_data,
21 |     print_available_datasets,
22 |     get_data_dir,
23 |     download_file,
24 |     get_md5sum,
25 | )
26 | 
27 | from .api.expected import expected_cis, expected_trans
28 | from .api.coverage import coverage
29 | from .api.eigdecomp import eigs_cis, eigs_trans
30 | from .api.saddle import digitize, saddle
31 | from .api.sample import sample
32 | from .api.snipping import pileup
33 | from .api.directionality import directionality
34 | from .api.insulation import insulation
35 | from .api.dotfinder import dots
36 | from .api.virtual4c import virtual4c
37 | 
--------------------------------------------------------------------------------
/cooltools/__main__.py:
--------------------------------------------------------------------------------
1 | from .cli import cli
2 | 
3 | if __name__ == "__main__":
4 |     cli()
5 | 
--------------------------------------------------------------------------------
/cooltools/api/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import pathlib
3 | 
4 | __all__ = [
5 |     f.stem
6 |     for f in pathlib.Path(__file__).parent.glob("*.py")
7 |     if f.is_file() and not f.name == "__init__.py"
8 | ]
9 | 
10 | for _ in __all__:
11 |     importlib.import_module("." + _, "cooltools.api")
12 | 
13 | del pathlib
14 | del importlib
15 | 
--------------------------------------------------------------------------------
/cooltools/api/directionality.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | def _dirscore(
6 |     pixels, bins, window=10, ignore_diags=2, balanced=True, signed_chi2=False
7 | ):
8 |     lo_bin_id = bins.index.min()
9 |     hi_bin_id = bins.index.max() + 1
10 |     N = hi_bin_id - lo_bin_id
11 | 
12 |     bad_bin_mask = (
13 |         bins["weight"].isnull().values if balanced else np.zeros(N, dtype=bool)
14 |     )
15 | 
16 |     diag_pixels = pixels[pixels["bin2_id"] - pixels["bin1_id"] <= (window - 1) * 2]
17 |     if balanced:
18 |         diag_pixels = diag_pixels[~diag_pixels["balanced"].isnull()]
19 | 
20 |     i = diag_pixels["bin1_id"].values - lo_bin_id
21 |     j = diag_pixels["bin2_id"].values - lo_bin_id
22 |     val = diag_pixels["balanced"].values if balanced else diag_pixels["count"].values
23 | 
24 |     sum_pixels_left = np.zeros(N)
25 |     n_pixels_left = np.zeros(N)
26 |     for i_shift in range(0, window):
27 |         if i_shift < ignore_diags:
28 |             continue
29 | 
30 |         mask = (i + i_shift == j) & (i + i_shift < N) & (j >= 0)
31 |         sum_pixels_left += np.bincount(i[mask] + i_shift, val[mask], minlength=N)
32 | 
33 |         loc_bad_bin_mask = np.zeros(N, dtype=bool)
34 |         if i_shift == 0:
35 |             loc_bad_bin_mask |= bad_bin_mask
36 |         else:
37 |             loc_bad_bin_mask[i_shift:] |= bad_bin_mask[:-i_shift]
38 |             loc_bad_bin_mask |= bad_bin_mask
39 |         n_pixels_left[i_shift:] += 1 - loc_bad_bin_mask[i_shift:]
40 | 
41 |     sum_pixels_right = np.zeros(N)
42 |     n_pixels_right = np.zeros(N)
43 |     for j_shift in range(0, window):
44 |         if j_shift < ignore_diags:
45 |             continue
46 | 
47 |         mask = (i == j - j_shift) & (i < N) & (j - j_shift >= 0)
48 | 
49 |         sum_pixels_right += np.bincount(i[mask], val[mask], minlength=N)
50 | 
51 |         loc_bad_bin_mask = np.zeros(N, dtype=bool)
52 |         loc_bad_bin_mask |= bad_bin_mask
53 |         if j_shift == 0:
54 |             loc_bad_bin_mask |= bad_bin_mask
55 |         else:
56 |             loc_bad_bin_mask[:-j_shift] |= bad_bin_mask[j_shift:]
57 | 
58 |         n_pixels_right[: (-j_shift if j_shift else None)] += (
59 |             1 - loc_bad_bin_mask[: (-j_shift if j_shift else None)]
60 |         )
61 | 
62 |     with warnings.catch_warnings():
63 |         warnings.simplefilter("ignore")
64 | 
65 |         a = sum_pixels_left
66 |         b = sum_pixels_right
67 |         if signed_chi2:
68 |             e = (a + b) / 2.0
69 |             score = np.sign(b - a) * ((a - e) ** 2 + (b - e) ** 2) / e
70 |         else:
71 |             score = (b - a) / (a + b)
72 | 
73 |     return score
74 | 
75 | 
76 | def _dirscore_dense(A, window=10, signed_chi2=False):
77 |     N = A.shape[0]
78 |     di = np.zeros(N)
79 |     for i in range(0, N):
80 |         lo = max(0, i - window)
81 |         hi = min((i + window) + 1, N)
82 |         b, a = np.nansum(A[i, i:hi]), np.nansum(A[i, lo : i + 1])
83 |         if signed_chi2:
84 |             e = (a + b) / 2.0
85 |             if e:
86 |                 di[i] = np.sign(b - a) * ((a - e) ** 2 + (b - e) ** 2) / e
87 |         else:
88 |             di[i] = (b - a) / (a + b)
89 |     mask = np.nansum(A, axis=0) == 0
90 |     di[mask] = np.nan
91 |     return di
92 | 
93 | 
94 | def directionality(
95 |     clr,
96 |     window_bp=100000,
97 |     balance="weight",
98 |     min_dist_bad_bin=2,
99 |     ignore_diags=None,
100 |     chromosomes=None,
101 | ):
102 |     """Calculate directionality scores (ratio and index) for genomic bins.
103 | 
104 |     Parameters
105 |     ----------
106 |     clr : cooler.Cooler
107 |         A cooler with balanced Hi-C data.
108 |     window_bp : int
109 |         The size of the sliding window (in bp) used to calculate the
110 |         directionality scores.
111 |     min_dist_bad_bin : int
112 |         The minimal allowed distance to a bad bin. Do not calculate directionality
113 |         scores for bins having a bad bin closer than this distance.
114 |     ignore_diags : int
115 |         The number of diagonals to ignore. If None, equals the number of
116 |         diagonals ignored during IC balancing.
117 | 
118 |     Returns
119 |     -------
120 |     dir_table : pandas.DataFrame
121 |         A table containing the directionality ratio and directionality index
122 |         of the genomic bins.
123 |     """
124 |     if chromosomes is None:
125 |         chromosomes = clr.chromnames
126 | 
127 |     bin_size = clr.info["bin-size"]
128 |     ignore_diags = (
129 |         ignore_diags
130 |         if ignore_diags is not None
131 |         else clr._load_attrs(clr.root.rstrip("/") + "/bins/weight")["ignore_diags"]
132 |     )
133 |     window_bins = window_bp // bin_size
134 | 
135 |     if window_bp % bin_size != 0:
136 |         raise Exception(
137 |             "The window size ({}) has to be a multiple of the bin size {}".format(
138 |                 window_bp, bin_size
139 |             )
140 |         )
141 | 
142 |     dir_chrom_tables = []
143 |     for chrom in chromosomes:
144 |         chrom_bins = clr.bins().fetch(chrom)
145 |         chrom_pixels = clr.matrix(as_pixels=True, balance=balance).fetch(chrom)
146 | 
147 |         # mask neighbors of bad bins
148 |         is_bad_bin = np.isnan(chrom_bins["weight"].values)
149 |         bad_bin_neighbor = np.zeros_like(is_bad_bin)
150 |         for i in range(0, min_dist_bad_bin):
151 |             if i == 0:
152 |                 bad_bin_neighbor = bad_bin_neighbor | is_bad_bin
153 |             else:
154 |                 bad_bin_neighbor = bad_bin_neighbor | np.r_[[True] * i, is_bad_bin[:-i]]
155 |                 bad_bin_neighbor = bad_bin_neighbor | np.r_[is_bad_bin[i:], [True] * i]
156 | 
157 |         dir_chrom = chrom_bins[["chrom", "start", "end"]].copy()
158 |         dir_chrom["bad_bin_masked"] = bad_bin_neighbor
159 | 
160 |         with warnings.catch_warnings():
161 |             warnings.simplefilter("ignore", RuntimeWarning)
162 |             dir_track = _dirscore(
163 |                 chrom_pixels, chrom_bins, window=window_bins, ignore_diags=ignore_diags
164 |             )
165 |             dir_track[bad_bin_neighbor] = np.nan
166 |             dir_track[~np.isfinite(dir_track)] = np.nan
167 |             dir_chrom["directionality_ratio_{}".format(window_bp)] = dir_track
168 | 
169 |             dir_track = _dirscore(
170 |                 chrom_pixels,
171 |                 chrom_bins,
172 |                 window=window_bins,
173 |                 ignore_diags=ignore_diags,
174 |                 signed_chi2=True,
175 |             )
176 |             dir_track[bad_bin_neighbor] = np.nan
177 |             dir_track[~np.isfinite(dir_track)] = np.nan
178 |             dir_chrom["directionality_index_{}".format(window_bp)] = dir_track
179 | 
180 |         dir_chrom_tables.append(dir_chrom)
181 | 
182 |     dir_table = pd.concat(dir_chrom_tables)
183 |     return dir_table
184 | 
--------------------------------------------------------------------------------
/cooltools/api/sample.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | 
4 | import cooler
5 | import cooler.parallel
6 | from .coverage import coverage
7 | from ..lib.common import pool_decorator
8 | 
9 | 
10 | 
11 | def sample_pixels_approx(pixels, frac):
12 |     pixels["count"] = np.random.binomial(pixels["count"], frac)
13 |     mask = pixels["count"] > 0
14 | 
15 |     if issubclass(type(pixels), pd.DataFrame):
16 |         pixels = pixels[mask]
17 |     elif issubclass(type(pixels), dict):
18 |         pixels = {k: arr[mask] for k, arr in pixels.items()}
19 |     return pixels
20 | 
21 | 
22 | def sample_pixels_exact(pixels, count):
23 |     cumcount = np.cumsum(np.asarray(pixels["count"]))
24 |     total = cumcount[-1]
25 |     n_pixels = cumcount.shape[0]
26 | 
27 |     # sample a given number of distinct contacts
28 |     random_contacts = np.random.choice(total, size=count, replace=False)
29 | 
30 |     # find where those contacts live in the cumcount array
31 |     loc = np.searchsorted(cumcount, random_contacts, side="right")
32 | 
33 |     # re-bin those locations to get new counts
34 |     new_counts = np.bincount(loc, minlength=n_pixels)
35 | 
36 |     pixels["count"] = new_counts
37 |     mask = pixels["count"] > 0
38 |     if issubclass(type(pixels), pd.DataFrame):
39 |         pixels = pixels[mask]
40 |     elif issubclass(type(pixels), dict):
41 |         pixels = {k: arr[mask] for k, arr in pixels.items()}
42 |     return pixels
43 | 
44 | 
45 | def _extract_pixel_chunk(chunk):
46 |     return chunk["pixels"]
47 | 
48 | @pool_decorator
49 | def sample(
50 |     clr,
51 |     out_clr_path,
52 |     count=None,
53 |     cis_count=None,
54 |     frac=None,
55 |     exact=False,
56 |     chunksize=int(1e7),
57 |     nproc=1,
58 |     map_functor=map,
59 | ):
60 |     """
61 |     Pick a random subset of contacts from a Hi-C map.
62 | 
63 |     Parameters
64 |     ----------
65 |     clr : cooler.Cooler or str
66 |         A Cooler or a path/URI to a Cooler with input data.
67 | 
68 |     out_clr_path : str
69 |         A path/URI to the output.
70 | 
71 |     count : int
72 |         The target number of contacts in the sample.
73 |         Mutually exclusive with `cis_count` and `frac`.
74 | 
75 |     cis_count : int
76 |         The target number of cis contacts in the sample.
77 |         Mutually exclusive with `count` and `frac`.
78 | 
79 |     frac : float
80 |         The target sample size as a fraction of contacts in the original
81 |         dataset. Mutually exclusive with `count` and `cis_count`.
82 | 
83 |     exact : bool
84 |         If True, the resulting sample size will exactly match the target value.
85 |         Exact sampling will load the whole pixel table into memory!
86 |         If False, binomial sampling will be used instead and the sample size
87 |         will be randomly distributed around the target value.
88 | 
89 |     chunksize : int
90 |         The number of pixels loaded and processed per step of computation.
91 | 
92 |     nproc : int, optional
93 |         How many processes to use for calculation. Ignored if map_functor is passed.
94 | 
95 |     map_functor : callable, optional
96 |         Map function to dispatch the matrix chunks to workers.
97 |         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
98 |         if nproc=1 this defaults to the builtin map.
99 | 
100 |     """
101 |     if issubclass(type(clr), str):
102 |         clr = cooler.Cooler(clr)
103 | 
104 |     if frac is not None and count is None and cis_count is None:
105 |         pass
106 |     elif frac is None and count is not None and cis_count is None:
107 |         frac = count / clr.info["sum"]
108 |     elif frac is None and count is None and cis_count is not None:
109 |         # note division by two, since coverage() counts each side separately
110 |         cis_total = clr.info.get("cis", np.sum(coverage(clr)[0] // 2, dtype=int))
111 |         frac = cis_count / cis_total
112 |     else:
113 |         raise ValueError(
114 |             "Please specify exactly one argument among `count`, `cis_count`"
115 |             " and `frac`"
116 |         )
117 | 
118 |     if frac > 1.0:
119 |         raise ValueError(
120 |             "The number of contacts in a sample cannot exceed "
121 |             "that in the original dataset."
122 |         )
123 | 
124 |     if exact:
125 |         count = np.round(frac * clr.info["sum"]).astype(int)
126 |         pixels = sample_pixels_exact(clr.pixels()[:], count)
127 |         cooler.create_cooler(out_clr_path, clr.bins()[:], pixels, ordered=True)
128 | 
129 |     else:
130 |         pipeline = (
131 |             cooler.parallel.split(
132 |                 clr, include_bins=False, map=map_functor, chunksize=chunksize
133 |             )
134 |             .pipe(_extract_pixel_chunk)
135 |             .pipe(sample_pixels_approx, frac=frac)
136 |         )
137 | 
138 |         cooler.create_cooler(
139 |             out_clr_path,
140 |             clr.bins()[:][["chrom", "start", "end"]],
141 |             iter(pipeline),
142 |             ordered=True,
143 |         )
144 | 
--------------------------------------------------------------------------------
/cooltools/api/virtual4c.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logging.basicConfig(level=logging.INFO)
4 | 
5 | from functools import partial
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import bioframe
10 | 
11 | 
12 | from ..lib.checks import is_cooler_balanced
13 | from ..lib.common import pool_decorator
14 | 
15 | 
16 | 
17 | def _extract_profile(chrom, clr, clr_weight_name, viewpoint):
18 |     to_return = []
19 |     if clr_weight_name:
20 |         colname = "balanced"
21 |     else:
22 |         colname = "count"
23 |     pxls1 = clr.matrix(balance=clr_weight_name, as_pixels=True, join=True).fetch(
24 |         chrom, viewpoint
25 |     )
26 |     pxls1[["chrom2"]] = viewpoint[0]
27 |     pxls1[["start2"]] = viewpoint[1]
28 |     pxls1[["end2"]] = viewpoint[2]
29 | 
30 |     pxls1 = (
31 |         pxls1.groupby(["chrom1", "start1", "end1"], observed=True)[colname]
32 |         .mean()
33 |         .reset_index()
34 |     )
35 |     pxls1.columns = ["chrom", "start", "end", colname]
36 |     if pxls1.shape[0] > 0:
37 |         to_return.append(pxls1)
38 | 
39 |     pxls2 = clr.matrix(balance=clr_weight_name, as_pixels=True, join=True).fetch(
40 |         viewpoint, chrom
41 |     )
42 |     pxls2[["chrom1"]] = viewpoint[0]
43 |     pxls2[["start1"]] = viewpoint[1]
44 |     pxls2[["end1"]] = viewpoint[2]
45 |     pxls2 = (
46 |         pxls2.groupby(["chrom2", "start2", "end2"], observed=True)[colname]
47 |         .mean()
48 |         .reset_index()
49 |     )
50 |     pxls2.columns = ["chrom", "start", "end", colname]
51 |     if pxls2.shape[0] > 0:
52 |         to_return.append(pxls2)
53 |     if len(to_return) == 0:
54 |         return pd.DataFrame(columns=["chrom", "start", "end", colname])
55 |     else:
56 |         return pd.concat(to_return, ignore_index=True)
57 | 
58 | @pool_decorator
59 | def virtual4c(
60 |     clr,
61 |     viewpoint,
62 |     clr_weight_name="weight",
63 |     nproc=1,
64 |     map_functor=map,
65 | ):
66 |     """Generate a genome-wide contact profile for a given viewpoint.
67 | 
68 |     Extract all contacts of a given viewpoint from a cooler file.
69 | 
70 |     Parameters
71 |     ----------
72 |     clr : cooler.Cooler
73 |         A cooler with balanced Hi-C data.
74 |     viewpoint : tuple or str
75 |         Coordinates of the viewpoint.
76 |     clr_weight_name : str
77 |         Name of the column in the bin table with balancing weights.
78 |     nproc : int, optional
79 |         How many processes to use for calculation. Ignored if map_functor is passed.
80 |     map_functor : callable, optional
81 |         Map function to dispatch the matrix chunks to workers.
82 |         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
83 |         if nproc=1 this defaults to the builtin map.
84 | 
85 |     Returns
86 |     -------
87 |     v4C_table : pandas.DataFrame
88 |         A table containing the interaction frequency of the viewpoint with the rest of
89 |         the genome.
90 | 
91 |     Note
92 |     ----
93 |     This is a new (experimental) function; the interface or output might change in
94 |     a future version.
95 |     """
96 |     if clr_weight_name not in [None, False]:
97 |         # check if cooler is balanced
98 |         try:
99 |             _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True)
100 | 
101 |         except Exception as e:
102 |             raise ValueError(
103 |                 f"provided cooler is not balanced or {clr_weight_name} is missing"
104 |             ) from e
105 |         colname = "balanced"
106 |     else:
107 |         colname = "count"
108 |     viewpoint = bioframe.core.stringops.parse_region(viewpoint)
109 | 
110 |     f = partial(
111 |         _extract_profile, clr=clr, clr_weight_name=clr_weight_name, viewpoint=viewpoint
112 |     )
113 | 
114 |     counts = list(map_functor(f, clr.chromnames))
115 | 
116 |     # Concatenate all chromosome dfs into one
117 |     v4c = pd.concat(counts, ignore_index=True)
118 |     if v4c.shape[0] == 0:
119 |         logging.warning(f"No contacts found for viewpoint {viewpoint}")
120 |         v4c = clr.bins()[:][["chrom", "start", "end"]]
121 |         v4c[colname] = np.nan
122 |     else:
123 |         v4c["chrom"] = v4c["chrom"].astype("category")
124 |         v4c["start"] = v4c["start"].astype(int)
125 |         v4c["end"] = v4c["end"].astype(int)
126 |         v4c = bioframe.sort_bedframe(
127 |             v4c,
128 |             view_df=bioframe.make_viewframe(clr.chromsizes),
129 |         )  # sort according to clr.chromsizes order; sort_bedframe returns a copy
130 |         v4c.loc[
131 |             (v4c["chrom"] == viewpoint[0])
132 |             & (v4c["start"] >= viewpoint[1])
133 |             & (v4c["end"] <= viewpoint[2]),
134 |             colname,
135 |         ] = np.nan  # Set within-viewpoint bins to nan
136 |         v4c = (
137 |             pd.merge(
138 |                 clr.bins()[:][["chrom", "start", "end"]],
139 |                 v4c,
140 |                 on=["chrom", "start", "end"],
141 |                 how="left",
142 |             )
143 |             .drop_duplicates()
144 |             .reset_index(drop=True)
145 |         )  # Ensure we return all bins even if empty
146 |     return v4c
--------------------------------------------------------------------------------
/cooltools/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division, print_function
3 | import click
4 | import sys
5 | from .. import __version__
6 | 
7 | # Monkey patch
8 | click.core._verify_python3_env = lambda: None
9 | 
10 | 
11 | CONTEXT_SETTINGS = {
12 |     "help_option_names": ["-h", "--help"],
13 | }
14 | 
15 | 
16 | @click.version_option(__version__, "-V", "--version")
17 | @click.group(context_settings=CONTEXT_SETTINGS)
18 | @click.option("-v", "--verbose", help="Verbose logging", is_flag=True, default=False)
19 | @click.option(
20 |     "-d", "--debug", help="Post mortem debugging", is_flag=True, default=False
21 | )
22 | def cli(verbose, debug):
23 |     """
24 |     Type -h or --help after any subcommand for more information.
25 | 
26 |     """
27 |     if verbose:
28 |         pass
29 |         # logger.setLevel(logging.DEBUG)
30 | 
31 |     if debug:
32 |         import traceback
33 | 
34 |         try:
35 |             import ipdb as pdb
36 |         except ImportError:
37 |             import pdb
38 | 
39 |         def _excepthook(exc_type, value, tb):
40 |             traceback.print_exception(exc_type, value, tb)
41 |             print()
42 |             pdb.pm()
43 | 
44 |         sys.excepthook = _excepthook
45 | 
46 | 
47 | from . import (
48 |     expected_cis,
49 |     expected_trans,
50 |     insulation,
51 |     pileup,
52 |     eigs_cis,
53 |     eigs_trans,
54 |     saddle,
55 |     dots,
56 |     genome,
57 |     sample,
58 |     coverage,
59 |     virtual4c,
60 |     rearrange,
61 | )
62 | 
--------------------------------------------------------------------------------
/cooltools/cli/coverage.py:
--------------------------------------------------------------------------------
1 | import click
2 | import cooler
3 | 
4 | from . import cli
5 | from .. import api
6 | 
7 | import bioframe
8 | 
9 | 
10 | 
11 | @cli.command()
12 | @click.argument(
13 |     "cool_path", metavar="COOL_PATH", type=str, nargs=1,
14 | )
15 | @click.option(
16 |     "--output",
17 |     "-o",
18 |     help="Specify output file name to store the coverage in a tsv format.",
19 |     type=str,
20 |     required=False,
21 | )
22 | @click.option(
23 |     "--ignore-diags",
24 |     help="The number of diagonals to ignore. By default, equals"
25 |     " the number of diagonals ignored during IC balancing.",
26 |     type=int,
27 |     default=None,
28 |     show_default=True,
29 | )
30 | @click.option(
31 |     "--store",
32 |     help="Append columns with coverage (cov_cis_raw, cov_tot_raw), or"
33 |     " (cov_cis_clr_weight_name, cov_tot_clr_weight_name) if calculating"
34 |     " balanced coverage, to the cooler bin table. If clr_weight_name=None,"
35 |     " also stores total cis counts in the cooler info",
36 |     is_flag=True,
37 | )
38 | @click.option(
39 |     "--chunksize",
40 |     help="Split the contact matrix pixel records into equally sized chunks to"
41 |     " save memory and/or parallelize. Default is 10^7",
42 |     type=int,
43 |     default=1e7,
44 |     show_default=True,
45 | )
46 | @click.option(
47 |     "--bigwig",
48 |     help="Also save output as bigWig files for cis and total coverage"
49 |     " with the names <output>.cis.bw and <output>.tot.bw",
50 |     is_flag=True,
51 |     default=False,
52 | )
53 | @click.option(
54 |     "--clr_weight_name",
55 |     help="Name of the weight column. Specify to calculate coverage of"
56 |     " balanced cooler.",
57 |     type=str,
58 |     default=None,
59 |     show_default=False,
60 | )
61 | @click.option(
62 |     "-p",
63 |     "--nproc",
64 |     help="Number of processes to split the work between."
65 |     " [default: 1, i.e. no process pool]",
66 |     default=1,
67 |     type=int,
68 | )
69 | def coverage(
70 |     cool_path, output, ignore_diags, store, chunksize, bigwig, clr_weight_name, nproc,
71 | ):
72 |     """
73 |     Calculate the sums of cis and genome-wide contacts (aka coverage aka marginals) for
74 |     a sparse Hi-C contact map in Cooler HDF5 format.
75 |     Note that the sum(tot_cov) from this function is two times the number of reads
76 |     contributing to the cooler, as each side contributes to the coverage.
77 | 
78 |     COOL_PATH : The path to a .cool file with a balanced Hi-C map.
79 | 
80 |     """
81 | 
82 |     clr = cooler.Cooler(cool_path)
83 | 
84 |     cis_cov, tot_cov = api.coverage.coverage(
85 |         clr, ignore_diags=ignore_diags, chunksize=chunksize, nproc=nproc, store=store, clr_weight_name=clr_weight_name
86 |     )
87 | 
88 | 
89 |     coverage_table = clr.bins()[:][["chrom", "start", "end"]]
90 |     if clr_weight_name is None:
91 |         store_names = ["cov_cis_raw", "cov_tot_raw"]
92 |         coverage_table[store_names[0]] = cis_cov.astype(int)
93 |         coverage_table[store_names[1]] = tot_cov.astype(int)
94 |     else:
95 |         store_names = [f"cov_cis_{clr_weight_name}", f"cov_tot_{clr_weight_name}"]
96 |         coverage_table[store_names[0]] = cis_cov.astype(float)
97 |         coverage_table[store_names[1]] = tot_cov.astype(float)
98 | 
99 |     # output to file if specified:
100 |     if output:
101 |         coverage_table.to_csv(output, sep="\t", index=False, na_rep="nan")
102 |     # or print into stdout otherwise:
103 |     else:
104 |         print(coverage_table.to_csv(sep="\t", index=False, na_rep="nan"))
105 | 
106 |     # Write the coverage tracks as bigWigs:
107 |     if bigwig:
108 |         bioframe.to_bigwig(
109 |             coverage_table,
110 |             clr.chromsizes,
111 |             f"{output}.cis.bw",
112 |             value_field=store_names[0],
113 |         )
114 |         bioframe.to_bigwig(
115 |             coverage_table,
116 |             clr.chromsizes,
117 |             f"{output}.tot.bw",
118 |             value_field=store_names[1],
119 |         )
120 | 
--------------------------------------------------------------------------------
/cooltools/cli/dots.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import cooler
3 | import logging
4 | 
5 | import click
6 | from . import cli
7 | from .. import api
8 | 
9 | from ..lib.common import make_cooler_view
10 | from ..lib.io import read_viewframe_from_file, read_expected_from_file
11 | 
12 | from .util import validate_csv
13 | 
14 | logging.basicConfig(level=logging.INFO)
15 | 
16 | 
17 | @cli.command()
18 | @click.argument(
19 |     "cool_path",
20 |     metavar="COOL_PATH",
21 |     type=str,
22 |     nargs=1,
23 | )
24 | @click.argument(
25 |     "expected_path",
26 |     metavar="EXPECTED_PATH",
27 |     type=str,
28 |     callback=partial(validate_csv, default_column="balanced.avg"),
29 | )
30 | @click.option(
31 |     "--view",
32 |     "--regions",
33 |     help="Path to a BED file with the definition of viewframe (regions)"
34 |     " used in the calculation of EXPECTED_PATH. Dot-calling will be"
35 |     " performed for these regions independently, e.g. chromosome arms."
36 |     " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ",
37 |     type=click.Path(exists=False, dir_okay=False),
38 |     default=None,
39 |     show_default=True,
40 | )
41 | @click.option(
42 |     "--clr-weight-name",
43 |     help="Use cooler balancing weight with this name.",
44 |     type=str,
45 |     default="weight",
46 |     show_default=True,
47 | )
48 | @click.option(
49 |     "-p",
50 |     "--nproc",
51 |     help="Number of processes to split the work between."
52 |     " [default: 1, i.e. no process pool]",
53 |     default=1,
54 |     type=int,
55 | )
56 | @click.option(
57 |     "--max-loci-separation",
58 |     help="Limit loci separation for dot-calling, i.e., do not call dots for"
59 |     " loci that are further than max_loci_separation base pairs apart."
60 |     " 2-20MB is reasonable and would capture most of CTCF-dots.",
61 |     type=int,
62 |     default=2000000,
63 |     show_default=True,
64 | )
65 | @click.option(
66 |     "--max-nans-tolerated",
67 |     help="Maximum number of NaNs tolerated in a footprint of every used filter."
68 |     " Must be controlled with caution, as a large max-nans-tolerated might lead to"
69 |     ' pixels scored in the padding area of the tiles to "penetrate" to the list'
70 |     " of scored pixels for the statistical testing. [max-nans-tolerated <= 2*w ]",
71 |     type=int,
72 |     default=1,
73 |     show_default=True,
74 | )
75 | @click.option(
76 |     "--tile-size",
77 |     help="Tile size for the Hi-C heatmap tiling."
78 |     " Typically on the order of several megabases, and <= max_loci_separation.",
79 |     type=int,
80 |     default=6000000,
81 |     show_default=True,
82 | )
83 | @click.option(
84 |     "--num-lambda-bins",
85 |     help="Number of log-spaced bins to divide your adjusted expected"
86 |     " between. Same as HiCCUPS_W1_MAX_INDX (40) in the original HiCCUPS.",
87 |     type=int,
88 |     default=45,
89 |     show_default=True,
90 | )
91 | @click.option(
92 |     "--fdr",
93 |     help="False discovery rate (FDR) to control in the multiple"
94 |     " hypothesis testing BH-FDR procedure.",
95 |     type=float,
96 |     default=0.02,
97 |     show_default=True,
98 | )
99 | @click.option(
100 |     "--clustering-radius",
101 |     help="Radius for clustering dots that have been called too close to each other."
102 |     " Typically on the order of 40 kilobases, and >= binsize.",
103 |     type=int,
104 |     default=39000,
105 |     show_default=True,
106 | )
107 | @click.option(
108 |     "-v", "--verbose", help="Enable verbose output", is_flag=True, default=False
109 | )
110 | @click.option(
111 |     "-o",
112 |     "--output",
113 |     help="Specify output file name to store called dots in a BEDPE-like format",
114 |     type=str,
115 |     required=True,
116 | )
117 | def dots(
118 |     cool_path,
119 |     expected_path,
120 |     view,
121 |     clr_weight_name,
122 |     nproc,
123 |     max_loci_separation,
124 |     max_nans_tolerated,
125 |     tile_size,
126 |     num_lambda_bins,
127 |     fdr,
128 |     clustering_radius,
129 |     verbose,
130 |     output,
131 | ):
132 |     """
133 |     Call dots on a Hi-C heatmap, at separations no larger than max_loci_separation.
134 | 
135 |     COOL_PATH : The path to a .cool file with a balanced Hi-C map.
136 | 
137 |     EXPECTED_PATH : The path to a tsv-like file with expected signal,
138 |     including a header. Use the '::' syntax to specify a column name.
139 | 
140 |     Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
141 |     therefore these chromosomes must be a subset of chromosomes referred to in
142 |     COOL_PATH. Also chromosomes referred to in EXPECTED_PATH must be non-trivial,
143 |     i.e., contain non-NaN signal. Thus, make sure to prune your EXPECTED_PATH
144 |     before applying this script.
145 | 
146 |     COOL_PATH and EXPECTED_PATH must be binned at the same resolution.
147 | 
148 |     EXPECTED_PATH must contain at least the following columns for cis contacts:
149 |     'region1/2', 'dist', 'n_valid', value_name. value_name is controlled using
150 |     options. A header must be present in the file.
151 | 
152 |     """
153 |     clr = cooler.Cooler(cool_path)
154 |     expected_path, expected_value_col = expected_path
155 | 
156 |     # Either use view from file or all chromosomes in the provided cooler
157 |     if view is None:
158 |         view_df = make_cooler_view(clr)
159 |     else:
160 |         view_df = read_viewframe_from_file(view, clr, check_sorting=True)
161 | 
162 |     expected = read_expected_from_file(
163 |         expected_path,
164 |         contact_type="cis",
165 |         expected_value_cols=[expected_value_col],
166 |         verify_view=view_df,
167 |         verify_cooler=clr,
168 |     )
169 | 
170 |     dot_calls_df = api.dotfinder.dots(
171 |         clr,
172 |         expected,
173 |         expected_value_col=expected_value_col,
174 |         clr_weight_name=clr_weight_name,
175 |         view_df=view_df,
176 |         kernels=None,  # engaging default HiCCUPS kernels
177 |         max_loci_separation=max_loci_separation,
178 |         max_nans_tolerated=max_nans_tolerated,  # test if this has desired behavior
179 |         n_lambda_bins=num_lambda_bins,  # update this eventually
180 |         lambda_bin_fdr=fdr,
181 |         clustering_radius=clustering_radius,
182 |         cluster_filtering=None,
183 |         tile_size=tile_size,
184 |         nproc=nproc,
185 |     )
186 | 
187 |     # output results in a file, when specified
188 |     if output:
189 |         dot_calls_df.to_csv(output, sep="\t", header=True, index=False, na_rep="nan")
190 |     # or print into stdout otherwise:
191 |     else:
192 |         print(
193 |             dot_calls_df.to_csv(
194 |                 output, sep="\t", header=True, index=False, na_rep="nan"
195 |             )
196 |         )
197 | 
--------------------------------------------------------------------------------
/cooltools/cli/eigs_cis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import cooler
4 | import bioframe
5 | from ..api import eigdecomp
6 | from ..lib.common import make_cooler_view
7 | from ..lib.io import read_viewframe_from_file
8 | 
9 | import click
10 | from .util import TabularFilePath, sniff_for_header
11 | from . import cli
12 | 
13 | 
14 | @cli.command()
15 | @click.argument("cool_path", metavar="COOL_PATH", type=str)
16 | @click.option(
17 |     "--phasing-track",
18 |     help="Phasing track for orienting and ranking eigenvectors,"
19 |     " provided as /path/to/track::track_value_column_name.",
20 |     type=TabularFilePath(exists=True, default_column_index=3),
21 |     metavar="TRACK_PATH",
22 | )
23 | @click.option(
24 |     "--view",
25 |     "--regions",
26 |     help="Path to a BED file which defines which regions of the chromosomes to use"
27 |     " (only implemented for cis contacts)."
28 |     " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ",
29 |     default=None,
30 |     type=str,
31 | )
32 | @click.option(
33 |     "--n-eigs",
34 |     help="Number of eigenvectors to compute.",
35 |     type=int,
36 |     default=3,
37 |     show_default=True,
38 | )
39 | @click.option(
40 |     "--clr-weight-name",
41 |     help="Use balancing weight with this name. "
42 |     "Using raw unbalanced data is not currently supported for eigenvectors.",
43 |     type=str,
44 |     default="weight",
45 |     show_default=True,
46 | )
47 | @click.option(
48 |     "--ignore-diags",
49 |     help="The number of diagonals to ignore. By default, equals"
50 |     " the number of diagonals ignored during IC balancing.",
51 |     type=int,
52 |     default=None,
53 |     show_default=True,
54 | )
55 | @click.option(
56 |     "-v", "--verbose", help="Enable verbose output", is_flag=True, default=False
57 | )
58 | @click.option(
59 |     "-o",
60 |     "--out-prefix",
61 |     help="Save compartment track as a BED-like file."
62 | " Eigenvectors and corresponding eigenvalues are stored in" 63 | " out_prefix.contact_type.vecs.tsv and out_prefix.contact_type.lam.txt", 64 | required=True, 65 | ) 66 | @click.option( 67 | "--bigwig", 68 | help="Also save compartment track (E1) as a bigWig file" 69 | " with the name out_prefix.contact_type.bw", 70 | is_flag=True, 71 | default=False, 72 | ) 73 | def eigs_cis( 74 | cool_path, 75 | phasing_track, 76 | view, 77 | n_eigs, 78 | clr_weight_name, 79 | ignore_diags, 80 | verbose, 81 | out_prefix, 82 | bigwig, 83 | ): 84 | """ 85 | Perform eigen value decomposition on a cooler matrix to calculate 86 | compartment signal by finding the eigenvector that correlates best with the 87 | phasing track. 88 | 89 | 90 | COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the 91 | '::' syntax to specify a group path in a multicooler file. 92 | 93 | TRACK_PATH : the path to a BedGraph-like file that stores phasing track as 94 | track-name named column. 95 | 96 | BedGraph-like format assumes tab-separated columns chrom, start, stop and 97 | track-name. 98 | 99 | """ 100 | clr = cooler.Cooler(cool_path) 101 | 102 | if phasing_track is not None: 103 | 104 | # TODO: This all needs to be refactored into a more generic tabular file parser 105 | # Needs to handle stdin case too. 106 | track_path, col = phasing_track 107 | buf, names = sniff_for_header(track_path) 108 | 109 | if names is None: 110 | if not isinstance(col, int): 111 | raise click.BadParameter( 112 | "No header found. " 113 | 'Cannot find "{}" column without a header.'.format(col) 114 | ) 115 | 116 | track_name = "ref" 117 | kwargs = dict( 118 | header=None, 119 | usecols=[0, 1, 2, col], 120 | names=["chrom", "start", "end", track_name], 121 | ) 122 | else: 123 | if isinstance(col, int): 124 | try: 125 | col = names[col] 126 | except IndexError: 127 | raise click.BadParameter( 128 | 'Column #{} not compatible with header "{}".'.format( 129 | col, ",".join(names) 130 | ) 131 | ) 132 | else: 133 | if col not in names: 134 | raise click.BadParameter( 135 | 'Column "{}" not found in header "{}"'.format( 136 | col, ",".join(names) 137 | ) 138 | ) 139 | 140 | track_name = col 141 | kwargs = dict(header="infer", usecols=["chrom", "start", "end", track_name]) 142 | 143 | track_df = pd.read_table( 144 | buf, 145 | dtype={ 146 | "chrom": str, 147 | "start": np.int64, 148 | "end": np.int64, 149 | track_name: np.float64, 150 | }, 151 | comment="#", 152 | verbose=verbose, 153 | **kwargs 154 | ) 155 | phasing_track = track_df 156 | 157 | # define view for cis compartment-calling 158 | # use input "view" BED file or all chromosomes mentioned in "track": 159 | if view is None: 160 | cooler_view_df = make_cooler_view(clr) 161 | view_df = cooler_view_df 162 | else: 163 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 164 | 165 | # TODO: Add check that view_df has the same bins as track 166 | eigvals, eigvec_table = eigdecomp.eigs_cis( 167 | clr=clr, 168 | phasing_track=phasing_track, 169 | view_df=view_df, 170 | n_eigs=n_eigs, 171 | clr_weight_name=clr_weight_name, 172 | ignore_diags=ignore_diags, 173 | clip_percentile=99.9, 174 | sort_metric=None, 175 | ) 176 | 177 | # Output 178 | eigvals.to_csv(out_prefix + ".cis" + ".lam.txt", sep="\t", index=False) 179 | eigvec_table.to_csv(out_prefix + ".cis" + ".vecs.tsv", sep="\t", index=False) 180 | if bigwig: 181 | bioframe.to_bigwig( 182 | eigvec_table, 183 | clr.chromsizes, 184 | out_prefix + ".cis" + ".bw", 185 | value_field="E1", 186 | ) 187 | 
--------------------------------------------------------------------------------
/cooltools/cli/eigs_trans.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import cooler
4 | import bioframe
5 | from ..api import eigdecomp
6 | from ..lib.common import make_cooler_view
7 | 
8 | import click
9 | from .util import TabularFilePath, sniff_for_header
10 | from . import cli
11 | 
12 | 
13 | @cli.command()
14 | @click.argument("cool_path", metavar="COOL_PATH", type=str)
15 | @click.option(
16 |     "--phasing-track",
17 |     help="Phasing track for orienting and ranking eigenvectors,"
18 |     " provided as /path/to/track::track_value_column_name.",
19 |     type=TabularFilePath(exists=True, default_column_index=3),
20 |     metavar="TRACK_PATH",
21 | )
22 | @click.option(
23 |     "--view",
24 |     "--regions",
25 |     help="Path to a BED file which defines which regions of the chromosomes to use"
26 |     " (only implemented for cis contacts). "
27 |     " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ",
28 |     default=None,
29 |     type=str,
30 | )
31 | @click.option(
32 |     "--n-eigs",
33 |     help="Number of eigenvectors to compute.",
34 |     type=int,
35 |     default=3,
36 |     show_default=True,
37 | )
38 | @click.option(
39 |     "--clr-weight-name",
40 |     help="Use balancing weight with this name. Using raw unbalanced data is not currently supported for eigenvectors.",
41 |     type=str,
42 |     default="weight",
43 |     show_default=True,
44 | )
45 | @click.option(
46 |     "-v", "--verbose", help="Enable verbose output", is_flag=True, default=False
47 | )
48 | @click.option(
49 |     "-o",
50 |     "--out-prefix",
51 |     help="Save compartment track as a BED-like file."
52 |     " Eigenvectors and corresponding eigenvalues are stored in"
53 |     " out_prefix.contact_type.vecs.tsv and out_prefix.contact_type.lam.txt",
54 |     required=True,
55 | )
56 | @click.option(
57 |     "--bigwig",
58 |     help="Also save compartment track (E1) as a bigWig file"
59 |     " with the name out_prefix.contact_type.bw",
60 |     is_flag=True,
61 |     default=False,
62 | )
63 | def eigs_trans(
64 |     cool_path,
65 |     phasing_track,
66 |     view,
67 |     n_eigs,
68 |     clr_weight_name,
69 |     verbose,
70 |     out_prefix,
71 |     bigwig,
72 | ):
73 |     """
74 |     Perform eigenvalue decomposition on a cooler matrix to calculate
75 |     compartment signal by finding the eigenvector that correlates best with the
76 |     phasing track.
77 | 
78 | 
79 |     COOL_PATH : the path to a .cool file with a balanced Hi-C map. Use the
80 |     '::' syntax to specify a group path in a multicooler file.
81 | 
82 |     TRACK_PATH : the path to a BedGraph-like file that stores the phasing
83 |     track as a column named track-name.
84 | 
85 |     BedGraph-like format assumes tab-separated columns chrom, start, stop and
86 |     track-name.
87 | 
88 |     """
89 |     clr = cooler.Cooler(cool_path)
90 |     # full chromosome view, based on cooler
91 |     cooler_view_df = make_cooler_view(clr)
92 | 
93 |     if phasing_track is not None:
94 | 
95 |         # TODO: This all needs to be refactored into a more generic tabular file parser
96 |         # Needs to handle stdin case too.
97 |         track_path, col = phasing_track
98 |         buf, names = sniff_for_header(track_path)
99 | 
100 |         if names is None:
101 |             if not isinstance(col, int):
102 |                 raise click.BadParameter(
103 |                     "No header found. 
" 104 | 'Cannot find "{}" column without a header.'.format(col) 105 | ) 106 | 107 | track_name = "ref" 108 | kwargs = dict( 109 | header=None, 110 | usecols=[0, 1, 2, col], 111 | names=["chrom", "start", "end", track_name], 112 | ) 113 | else: 114 | if isinstance(col, int): 115 | try: 116 | col = names[col] 117 | except IndexError: 118 | raise click.BadParameter( 119 | 'Column #{} not compatible with header "{}".'.format( 120 | col, ",".join(names) 121 | ) 122 | ) 123 | else: 124 | if col not in names: 125 | raise click.BadParameter( 126 | 'Column "{}" not found in header "{}"'.format( 127 | col, ",".join(names) 128 | ) 129 | ) 130 | 131 | track_name = col 132 | kwargs = dict(header="infer", usecols=["chrom", "start", "end", track_name]) 133 | 134 | track_df = pd.read_table( 135 | buf, 136 | dtype={ 137 | "chrom": str, 138 | "start": np.int64, 139 | "end": np.int64, 140 | track_name: np.float64, 141 | }, 142 | comment="#", 143 | verbose=verbose, 144 | **kwargs 145 | ) 146 | phasing_track = track_df 147 | 148 | # TODO: implement view for eigs-trans instead of current "partition" 149 | # use input "view" BED file or all chromosomes mentioned in "track": 150 | if view is None: 151 | view_df = cooler_view_df 152 | else: 153 | raise NotImplementedError( 154 | "views are not currently implemented for CLI eigs-trans" 155 | ) 156 | # view_df = read_viewframe_from_file(view, clr, check_sorting=True) 157 | 158 | # TODO: Add check that view_df has the same bins as track 159 | eigvals, eigvec_table = eigdecomp.eigs_trans( 160 | clr=clr, 161 | phasing_track=phasing_track, 162 | n_eigs=n_eigs, 163 | clr_weight_name=clr_weight_name, 164 | partition=None, 165 | sort_metric=None, 166 | ) 167 | 168 | # Output 169 | eigvals.to_csv(out_prefix + ".trans" + ".lam.txt", sep="\t", index=False) 170 | eigvec_table.to_csv(out_prefix + ".trans" + ".vecs.tsv", sep="\t", index=False) 171 | if bigwig: 172 | bioframe.to_bigwig( 173 | eigvec_table, 174 | clr.chromsizes, 175 | out_prefix + ".trans" + ".bw", 176 | value_field="E1", 177 | ) 178 | -------------------------------------------------------------------------------- /cooltools/cli/expected_cis.py: -------------------------------------------------------------------------------- 1 | import cooler 2 | from .. import api 3 | from ..lib.common import make_cooler_view 4 | from ..lib.io import read_viewframe_from_file 5 | 6 | import click 7 | from . import cli 8 | 9 | 10 | @cli.command() 11 | @click.argument("cool_path", metavar="COOL_PATH", type=str, nargs=1) 12 | @click.option( 13 | "--nproc", 14 | "-p", 15 | help="Number of processes to split the work between." 16 | "[default: 1, i.e. no process pool]", 17 | default=1, 18 | type=int, 19 | ) 20 | @click.option( 21 | "--chunksize", 22 | "-c", 23 | help="Control the number of pixels handled by each worker process at a time.", 24 | type=int, 25 | default=int(10e6), 26 | show_default=True, 27 | ) 28 | @click.option( 29 | "--output", 30 | "-o", 31 | help="Specify output file name to store the expected in a tsv format.", 32 | type=str, 33 | required=False, 34 | ) 35 | @click.option( 36 | "--view", 37 | "--regions", 38 | help="Path to a 3 or 4-column BED file with genomic regions" 39 | " to calculated cis-expected on. When region names are not provided" 40 | " (no 4th column), UCSC-style region names are generated." 41 | " Cis-expected is calculated for all chromosomes, when this is not specified." 42 | " Note that '--regions' is the deprecated name of the option. 
Use '--view' instead.", 43 | type=click.Path(exists=True), 44 | required=False, 45 | ) 46 | @click.option( 47 | "--smooth", 48 | help="If set, cis-expected is smoothed and result stored in an additional column" 49 | " e.g. balanced.avg.smoothed", 50 | is_flag=True, 51 | ) 52 | @click.option( 53 | "--aggregate-smoothed", 54 | help="If set, cis-expected is averaged over all regions and then smoothed." 55 | " Result is stored in an additional column, e.g. balanced.avg.smoothed.agg." 56 | " Ignored without smoothing", 57 | is_flag=True, 58 | ) 59 | @click.option( 60 | "--smooth-sigma", 61 | help="Control smoothing with the standard deviation of the smoothing Gaussian kernel," 62 | " ignored without smoothing.", 63 | type=float, 64 | default=0.1, 65 | show_default=True, 66 | ) 67 | @click.option( 68 | "--clr-weight-name", 69 | help="Use balancing weight with this name stored in cooler." 70 | "Provide empty argument to calculate cis-expected on raw data", 71 | type=str, 72 | default="weight", 73 | show_default=True, 74 | ) 75 | @click.option( 76 | "--ignore-diags", 77 | help="Number of diagonals to neglect for cis contact type", 78 | type=int, 79 | default=2, 80 | show_default=True, 81 | ) 82 | def expected_cis( 83 | cool_path, 84 | nproc, 85 | chunksize, 86 | output, 87 | view, 88 | smooth, 89 | aggregate_smoothed, 90 | smooth_sigma, 91 | clr_weight_name, 92 | ignore_diags, 93 | ): 94 | """ 95 | Calculate expected Hi-C signal for cis regions of chromosomal interaction map: 96 | average of interactions separated by the same genomic distance, i.e. 97 | are on the same diagonal on the cis-heatmap. 98 | 99 | When balancing weights are not applied to the data, there is no 100 | masking of bad bins performed. 101 | 102 | COOL_PATH : The paths to a .cool file with a balanced Hi-C map. 103 | 104 | """ 105 | 106 | clr = cooler.Cooler(cool_path) 107 | 108 | if view is None: 109 | # full chromosome case 110 | view_df = make_cooler_view(clr) 111 | else: 112 | # Read view_df dataframe, and verify against cooler 113 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 114 | 115 | result = api.expected.expected_cis( 116 | clr, 117 | view_df=view_df, 118 | intra_only=True, 119 | smooth=smooth, 120 | aggregate_smoothed=aggregate_smoothed, 121 | smooth_sigma=smooth_sigma, 122 | clr_weight_name=clr_weight_name if clr_weight_name else None, 123 | ignore_diags=ignore_diags, 124 | chunksize=chunksize, 125 | nproc=nproc, 126 | ) 127 | 128 | # output to file if specified: 129 | if output: 130 | result.to_csv(output, sep="\t", index=False, na_rep="nan") 131 | # or print into stdout otherwise: 132 | else: 133 | print(result.to_csv(sep="\t", index=False, na_rep="nan")) 134 | -------------------------------------------------------------------------------- /cooltools/cli/expected_trans.py: -------------------------------------------------------------------------------- 1 | import cooler 2 | from .. import api 3 | from ..lib.common import make_cooler_view 4 | from ..lib.io import read_viewframe_from_file 5 | 6 | 7 | import click 8 | from . import cli 9 | 10 | 11 | @cli.command() 12 | @click.argument("cool_path", metavar="COOL_PATH", type=str, nargs=1) 13 | @click.option( 14 | "--nproc", 15 | "-p", 16 | help="Number of processes to split the work between." 17 | "[default: 1, i.e. 
no process pool]", 18 | default=1, 19 | type=int, 20 | ) 21 | @click.option( 22 | "--chunksize", 23 | "-c", 24 | help="Control the number of pixels handled by each worker process at a time.", 25 | type=int, 26 | default=int(10e6), 27 | show_default=True, 28 | ) 29 | @click.option( 30 | "--output", 31 | "-o", 32 | help="Specify output file name to store the expected in a tsv format.", 33 | type=str, 34 | required=False, 35 | ) 36 | @click.option( 37 | "--view", 38 | "--regions", 39 | help="Path to a 3 or 4-column BED file with genomic regions. Trans-expected" 40 | " is calculated on all pairwise combinations of these regions." 41 | " When region names are not provided (no 4th column)," 42 | " UCSC-style region names are generated. Trans-expected is calculated " 43 | " for all inter-chromosomal pairs, when view is not specified." 44 | " Note that '--regions' is the deprecated name of the option. Use '--view' instead.", 45 | type=click.Path(exists=True), 46 | required=False, 47 | ) 48 | @click.option( 49 | "--clr-weight-name", 50 | help="Use balancing weight with this name stored in cooler." 51 | "Provide empty argument to calculate cis-expected on raw data", 52 | type=str, 53 | default="weight", 54 | show_default=True, 55 | ) 56 | def expected_trans( 57 | cool_path, 58 | nproc, 59 | chunksize, 60 | output, 61 | view, 62 | clr_weight_name, 63 | ): 64 | """ 65 | Calculate expected Hi-C signal for trans regions of chromosomal interaction map: 66 | average of interactions in a rectangular block defined by a pair of regions, e.g. 67 | inter-chromosomal blocks. 68 | 69 | When balancing weights are not applied to the data, there is no 70 | masking of bad bins performed. 71 | 72 | COOL_PATH : The paths to a .cool file with a balanced Hi-C map. 73 | 74 | """ 75 | 76 | clr = cooler.Cooler(cool_path) 77 | 78 | if view is None: 79 | # full chromosome case 80 | view_df = make_cooler_view(clr) 81 | else: 82 | # Read view_df dataframe, and verify against cooler 83 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 84 | 85 | result = api.expected.expected_trans( 86 | clr, 87 | view_df=view_df, 88 | clr_weight_name=clr_weight_name if clr_weight_name else None, 89 | chunksize=chunksize, 90 | nproc=nproc, 91 | ) 92 | 93 | # output to file if specified: 94 | if output: 95 | result.to_csv(output, sep="\t", index=False, na_rep="nan") 96 | # or print into stdout otherwise: 97 | else: 98 | print(result.to_csv(sep="\t", index=False, na_rep="nan")) 99 | -------------------------------------------------------------------------------- /cooltools/cli/genome.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import click 3 | from . import cli 4 | 5 | 6 | @cli.group() 7 | def genome(): 8 | """ 9 | Utilities for binned genome assemblies. 10 | 11 | """ 12 | 13 | 14 | @genome.command() 15 | @click.argument("db") 16 | def fetch_chromsizes(db): 17 | import bioframe 18 | 19 | chromsizes = bioframe.fetch_chromsizes(db) 20 | print(chromsizes.to_csv(sep="\t")) 21 | 22 | 23 | @genome.command() 24 | @click.argument("chromsizes_path") 25 | @click.argument("binsize", type=int) 26 | @click.option( 27 | "--all-names", 28 | help='Parse all chromosome names from file, not only default r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$". 
', 29 | is_flag=True, 30 | ) 31 | def binnify(chromsizes_path, binsize, all_names): 32 | import bioframe 33 | 34 | chromsizes = bioframe.read_chromsizes( 35 | chromsizes_path, filter_chroms=not (all_names) 36 | ) 37 | bins = bioframe.binnify(chromsizes, binsize) 38 | print(bins.to_csv(sep="\t", index=False)) 39 | 40 | 41 | @genome.command() 42 | @click.argument("chromsizes_path") 43 | @click.argument("fasta_path") 44 | @click.argument("enzyme_name") 45 | def digest(chromsizes_path, fasta_path, enzyme_name): 46 | import bioframe 47 | 48 | chromsizes = bioframe.read_chromsizes(chromsizes_path, filter_chroms=False) 49 | fasta_records = bioframe.load_fasta(fasta_path, engine="pyfaidx", as_raw=True) 50 | if not chromsizes.index.isin(fasta_records).all(): 51 | raise ValueError( 52 | "Some chromosomes mentioned in {}" 53 | " are not found in {}".format(chromsizes_path, fasta_path) 54 | ) 55 | frags = bioframe.digest(fasta_records, enzyme_name) 56 | print(frags.to_csv(sep="\t", index=False)) 57 | 58 | 59 | @genome.command() 60 | @click.argument("bins_path") 61 | @click.argument("fasta_path") 62 | @click.option("--mapped-only", is_flag=True, default=True) 63 | def gc(bins_path, fasta_path, mapped_only): 64 | import bioframe 65 | import pandas as pd 66 | 67 | if bins_path == "-": 68 | bins_path = sys.stdin 69 | bins = pd.read_table(bins_path) 70 | chromosomes = bins["chrom"].unique() 71 | fasta_records = bioframe.load_fasta(fasta_path, engine="pyfaidx", as_raw=True) 72 | if any(chrom not in fasta_records.keys() for chrom in chromosomes): 73 | raise ValueError( 74 | "Some chromosomes mentioned in {}" 75 | " are not found in {}".format(bins_path, fasta_path) 76 | ) 77 | bins = bioframe.frac_gc(bins, fasta_records, mapped_only) 78 | print(bins.to_csv(sep="\t", index=False)) 79 | 80 | 81 | @genome.command() 82 | @click.argument("bins_path") 83 | @click.argument("db") 84 | def genecov(bins_path, db): 85 | """ 86 | BINS_PATH is the path to bintable. 87 | 88 | DB is the name of the genome assembly. 89 | The gene locations will be automatically downloaded from teh UCSC goldenPath. 90 | """ 91 | import bioframe 92 | import pandas as pd 93 | 94 | bins = pd.read_table(bins_path) 95 | bins = bioframe.frac_gene_coverage(bins, db) 96 | print(bins.to_csv(sep="\t", index=False)) 97 | -------------------------------------------------------------------------------- /cooltools/cli/insulation.py: -------------------------------------------------------------------------------- 1 | import click 2 | import cooler 3 | 4 | from . import cli 5 | from .. import api 6 | from ..lib.common import make_cooler_view 7 | from ..lib.io import read_viewframe_from_file 8 | import bioframe 9 | 10 | 11 | @cli.command() 12 | @click.argument("in_path", metavar="IN_PATH", type=str, nargs=1) 13 | @click.argument("window", nargs=-1, metavar="WINDOW", type=int) 14 | @click.option( 15 | "--nproc", 16 | "-p", 17 | help="Number of processes to split the work between." 18 | "[default: 1, i.e. no process pool]", 19 | default=1, 20 | type=int, 21 | ) 22 | @click.option( 23 | "--output", 24 | "-o", 25 | help="Specify output file name to store the insulation in a tsv format.", 26 | type=str, 27 | required=False, 28 | ) 29 | @click.option( 30 | "--view", 31 | "--regions", 32 | help="Path to a BED file containing genomic regions " 33 | "for which insulation scores will be calculated. Region names can " 34 | "be provided in a 4th column and should match regions and " 35 | "their names in expected." 
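# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the two expected commands above,
# called through the API. The path is hypothetical and the remaining keyword
# arguments are assumed to keep their API defaults.
import cooler
from cooltools import api
from cooltools.lib.common import make_cooler_view

clr = cooler.Cooler("test.100000.cool")
view_df = make_cooler_view(clr)
cis_exp = api.expected.expected_cis(clr, view_df=view_df, nproc=4)
trans_exp = api.expected.expected_trans(clr, view_df=view_df, nproc=4)
cis_exp.to_csv("expected.tsv", sep="\t", index=False, na_rep="nan")
# ---------------------------------------------------------------------------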
36 | " Note that '--regions' is the deprecated name of the option. Use '--view' instead. ", 37 | type=click.Path(exists=True), 38 | required=False, 39 | ) 40 | @click.option( 41 | "--ignore-diags", 42 | help="The number of diagonals to ignore. By default, equals" 43 | " the number of diagonals ignored during IC balancing.", 44 | type=int, 45 | default=None, 46 | show_default=True, 47 | ) 48 | @click.option( 49 | "--clr-weight-name", 50 | help="Use balancing weight with this name. " 51 | "Provide empty argument to calculate insulation on raw data (no masking bad pixels).", 52 | type=str, 53 | default="weight", 54 | show_default=True, 55 | ) 56 | @click.option( 57 | "--min-frac-valid-pixels", 58 | help="The minimal fraction of valid pixels in a sliding diamond. " 59 | "Used to mask bins during boundary detection.", 60 | type=float, 61 | default=0.66, 62 | show_default=True, 63 | ) 64 | @click.option( 65 | "--min-dist-bad-bin", 66 | help="The minimal allowed distance to a bad bin. " 67 | "Use to mask bins after insulation calculation and during boundary detection.", 68 | type=int, 69 | default=0, 70 | show_default=True, 71 | ) 72 | @click.option( 73 | "--threshold", 74 | help="Rule used to threshold the histogram of boundary strengths to exclude weak" 75 | "boundaries. 'Li' or 'Otsu' use corresponding methods from skimage.thresholding." 76 | "Providing a float value will filter by a fixed threshold", 77 | type=str, 78 | default=0, 79 | show_default=True, 80 | ) 81 | @click.option( 82 | "--window-pixels", 83 | help="If set then the window sizes are provided in units of pixels.", 84 | is_flag=True, 85 | ) 86 | @click.option( 87 | "--append-raw-scores", 88 | help="Append columns with raw scores (sum_counts, sum_balanced, n_pixels) " 89 | "to the output table.", 90 | is_flag=True, 91 | ) 92 | @click.option("--chunksize", help="", type=int, default=20000000, show_default=True) 93 | @click.option("--verbose", help="Report real-time progress.", is_flag=True) 94 | @click.option( 95 | "--bigwig", 96 | help="Also save insulation tracks as a bigWig files for different window sizes" 97 | " with the names output..bw", 98 | is_flag=True, 99 | default=False, 100 | ) 101 | def insulation( 102 | in_path, 103 | window, 104 | output, 105 | view, 106 | ignore_diags, 107 | clr_weight_name, 108 | min_frac_valid_pixels, 109 | min_dist_bad_bin, 110 | threshold, 111 | window_pixels, 112 | append_raw_scores, 113 | chunksize, 114 | verbose, 115 | bigwig, 116 | nproc, 117 | ): 118 | """ 119 | Calculate the diamond insulation scores and call insulating boundaries. 120 | 121 | IN_PATH : The path to a .cool file with a balanced Hi-C map. 122 | 123 | WINDOW : The window size for the insulation score calculations. 124 | Multiple space-separated values can be provided. 125 | By default, the window size must be provided in units of bp. 126 | When the flag --window-pixels is set, the window sizes must 127 | be provided in units of pixels instead. 
128 | """ 129 | 130 | clr = cooler.Cooler(in_path) 131 | 132 | # Create view: 133 | cooler_view_df = make_cooler_view(clr) 134 | if view is None: 135 | # full chromosomes: 136 | view_df = cooler_view_df 137 | else: 138 | # read view_df dataframe, and verify against cooler 139 | view_df = read_viewframe_from_file(view, clr, check_sorting=True) 140 | 141 | # Read list with windows: 142 | if window_pixels: 143 | window = [win * clr.info["bin-size"] for win in window] 144 | 145 | ins_table = api.insulation.insulation( 146 | clr, 147 | view_df=view_df, 148 | window_bp=window, 149 | ignore_diags=ignore_diags, 150 | clr_weight_name=clr_weight_name if clr_weight_name else None, 151 | min_frac_valid_pixels=min_frac_valid_pixels, 152 | min_dist_bad_bin=min_dist_bad_bin, 153 | threshold=threshold, 154 | append_raw_scores=append_raw_scores, 155 | chunksize=chunksize, 156 | verbose=verbose, 157 | nproc=nproc, 158 | ) 159 | 160 | # output to file if specified: 161 | if output: 162 | ins_table.to_csv(output, sep="\t", index=False, na_rep="nan") 163 | # or print into stdout otherwise: 164 | else: 165 | print(ins_table.to_csv(sep="\t", index=False, na_rep="nan")) 166 | 167 | # Write the insulation track as a bigwig: 168 | if bigwig: 169 | for w in window: 170 | bioframe.to_bigwig( 171 | ins_table, 172 | clr.chromsizes, 173 | output + "." + str(w) + ".bw", 174 | value_field=f"log2_insulation_score_{w}", 175 | ) 176 | -------------------------------------------------------------------------------- /cooltools/cli/logbin_expected.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from ..api import expected 3 | from ..lib.io import read_expected_from_file 4 | 5 | import click 6 | from .util import validate_csv 7 | from . import cli 8 | 9 | 10 | @cli.command() 11 | @click.argument( 12 | "expected_path", 13 | metavar="EXPECTED_PATH", 14 | type=str, 15 | callback=partial(validate_csv, default_column="balanced.sum"), 16 | ) 17 | @click.argument("output_prefix", metavar="OUTPUT_PREFIX", type=str, nargs=1) 18 | @click.option( 19 | "--bins-per-order-magnitude", 20 | metavar="bins_per_order_magnitude", 21 | help="How many bins per order of magnitude. " 22 | "Default of 10 has a ratio of neighboring bins of about 1.25", 23 | type=int, 24 | nargs=1, 25 | default=10, 26 | show_default=True, 27 | ) 28 | @click.option( 29 | "--bin-layout", 30 | metavar="bin_layout", 31 | help="'fixed' means that bins are exactly the same for different datasets, " 32 | "and only depend on bins_per_order_magnitude " 33 | "'longest_regio' means that the last bin will end at size of the longest region. " 34 | "\nGOOD: the last bin will have as much data as possible. " 35 | "\nBAD: bin edges will end up different for different datasets, " 36 | "you can't divide them by each other", 37 | type=click.Choice(["fixed", "longest_region"]), 38 | nargs=1, 39 | default="fixed", 40 | show_default=True, 41 | ) 42 | @click.option( 43 | "--min-nvalid", 44 | metavar="min_nvalid", 45 | help="For each region, throw out bins (log-spaced) that have less than min_nvalid " 46 | "valid pixels. This will ensure that each entree in Pc by region has at least " 47 | "n_valid valid pixels. " 48 | "Don't set it to zero, or it will introduce bugs. 
Setting it to 1 is OK, but "
49 |     "not recommended.",
50 |     type=int,
51 |     nargs=1,
52 |     default=200,
53 |     show_default=True,
54 | )
55 | @click.option(
56 |     "--min-count",
57 |     metavar="min_count",
58 |     help="If counts are found in the data, then for each region, throw out bins "
59 |     "(log-spaced) that have fewer than min_count of counts.sum (raw Hi-C counts). "
60 |     "This will ensure that each entry in P(s) by region has at least min_count "
61 |     "raw Hi-C reads",
62 |     type=int,
63 |     nargs=1,
64 |     default=50,
65 |     show_default=True,
66 | )
67 | @click.option(
68 |     "--spread-funcs",
69 |     metavar="spread_funcs",
70 |     help="A way to estimate the spread of the P(s) curves between regions. "
71 |     "* 'minmax' - the minimum/maximum of by-region P(s)\n"
72 |     "* 'std' - weighted standard deviation of P(s) curves (may produce negative results)\n "
73 |     "* 'logstd' (recommended) weighted standard deviation in logspace",
74 |     type=click.Choice(["minmax", "std", "logstd"]),
75 |     default="logstd",
76 |     show_default=True,
77 |     nargs=1,
78 | )
79 | @click.option(
80 |     "--spread-funcs-slope",
81 |     metavar="spread_funcs_slope",
82 |     help="Same as spread-funcs, but for the slope (derivative) rather than P(s)",
83 |     type=click.Choice(["minmax", "std", "logstd"]),
84 |     default="std",
85 |     show_default=True,
86 |     nargs=1,
87 | )
88 | @click.option(
89 |     "--resolution",
90 |     metavar="resolution",
91 |     help="Data resolution in bp. If provided, an additional column of separation in bp "
92 |     "(s_bp) will be added to the outputs",
93 |     type=int,
94 |     nargs=1,
95 | )
96 | def logbin_expected(
97 |     expected_path,
98 |     output_prefix,
99 |     bins_per_order_magnitude,
100 |     bin_layout,
101 |     min_nvalid,
102 |     min_count,
103 |     spread_funcs,
104 |     spread_funcs_slope,
105 |     resolution,
106 | ):
107 |     """
108 |     Logarithmically bin expected values generated using compute_expected for cis data.
109 | 
110 |     This smoothes the data, resulting in clearer plots and more robust analysis results.
111 |     Also calculates the derivative after Gaussian smoothing.
112 |     For a very detailed description, see
113 |     https://github.com/open2c/cooltools/blob/51b95c3bed8d00a5f1f91370fc5192d9a7face7c/cooltools/expected.py#L988
114 | 
115 |     EXPECTED_PATH : The path to a .tsv file with the output of compute_expected.
116 |     Must include a header. Use the '::' syntax to specify a summary column name.
117 | 
118 |     OUTPUT_PREFIX: Output file name prefix to store the logbinned expected
119 |     (prefix.log.tsv) and derivative (prefix.der.tsv) in the tsv format.
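# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the same log-binning pipeline in
# Python, mirroring the body below. "balanced.sum" is this command's default
# summary column; the file name is hypothetical and the remaining keyword
# arguments are assumed to keep their API defaults.
from cooltools.api import expected
from cooltools.lib.io import read_expected_from_file

cvd = read_expected_from_file(
    "expected.tsv",
    contact_type="cis",
    expected_value_cols=["balanced.sum", "count.sum"],
)
lb_cvd, lb_slopes, lb_distbins = expected.logbin_expected(cvd, summary_name="balanced.sum")
lb_cvd_agg, lb_slopes_agg = expected.combine_binned_expected(
    lb_cvd, Pc_name="balanced.avg", binned_exp_slope=lb_slopes
)
# ---------------------------------------------------------------------------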
120 | """ 121 | 122 | # unpack expected path and name as generated by click's callback to validate_csv: 123 | expected_path, exp_summary_name = expected_path 124 | # make sure "count.sum" is present in the expected file: 125 | expected_summary_cols = [exp_summary_name] 126 | if "count.sum" not in expected_summary_cols: 127 | expected_summary_cols.append("count.sum") 128 | 129 | cvd = read_expected_from_file( 130 | expected_path, 131 | contact_type="cis", 132 | expected_value_cols=expected_summary_cols, 133 | ) 134 | 135 | # name of the column with Probability of contacts is 136 | # based on the name of the column with the diagonal-summary 137 | # stats in the input expected DataFrame: 138 | exp_summary_base, *_ = exp_summary_name.split(".") 139 | Pc_name = f"{exp_summary_base}.avg" 140 | 141 | lb_cvd, lb_slopes, lb_distbins = expected.logbin_expected( 142 | cvd, 143 | summary_name=exp_summary_name, 144 | bins_per_order_magnitude=bins_per_order_magnitude, 145 | bin_layout=bin_layout, 146 | min_nvalid=min_nvalid, 147 | min_count=min_count, 148 | ) 149 | # combine Probabilities of contact for the regions: 150 | lb_cvd_agg, lb_slopes_agg = expected.combine_binned_expected( 151 | lb_cvd, 152 | Pc_name=Pc_name, 153 | binned_exp_slope=lb_slopes, 154 | spread_funcs=spread_funcs, 155 | spread_funcs_slope=spread_funcs_slope, 156 | ) 157 | if resolution is not None: 158 | lb_cvd_agg["s_bp"] = lb_cvd_agg["dist.avg"] * resolution 159 | lb_slopes_agg["s_bp"] = lb_slopes_agg["dist.avg"] * resolution 160 | 161 | lb_cvd_agg.to_csv( 162 | f"{output_prefix}.log.tsv", 163 | sep="\t", 164 | index=False, 165 | na_rep="nan", 166 | ) 167 | lb_slopes_agg.to_csv( 168 | f"{output_prefix}.der.tsv", 169 | sep="\t", 170 | index=False, 171 | na_rep="nan", 172 | ) 173 | -------------------------------------------------------------------------------- /cooltools/cli/rearrange.py: -------------------------------------------------------------------------------- 1 | import click 2 | import cooler 3 | import pandas as pd 4 | 5 | from .. import api 6 | from . import cli 7 | from .util import sniff_for_header 8 | 9 | 10 | @cli.command() 11 | @click.argument("in_path", metavar="IN_PATH", type=str, nargs=1) 12 | @click.argument("out_path", metavar="OUT_PATH", type=str, nargs=1) 13 | @click.option( 14 | "--view", 15 | help="Path to a BED-like file which defines which regions of the chromosomes to use" 16 | " and in what order. Using --new-chrom-col and --orientation-col you can specify the" 17 | " new chromosome names and whether to invert each region (optional)", 18 | default=None, 19 | required=True, 20 | type=str, 21 | ) 22 | @click.option( 23 | "--new-chrom-col", 24 | help="Column name in the view with new chromosome names." 25 | " If not provided and there is no column named 'new_chrom' in the view file, uses" 26 | " original chromosome names", 27 | default=None, 28 | type=str, 29 | ) 30 | @click.option( 31 | "--orientation-col", 32 | help="Columns name in the view with orientations of each region (+ or -)." 33 | " If not providedand there is no column named 'strand' in the view file, assumes" 34 | " all are forward oriented", 35 | default=None, 36 | type=str, 37 | ) 38 | @click.option( 39 | "--assembly", 40 | help="The name of the assembly for the new cooler. 
If None, uses the same as in the"
41 |     " original cooler.",
42 |     default=None,
43 |     type=str,
44 | )
45 | @click.option(
46 |     "--chunksize",
47 |     help="The number of pixels loaded and processed per step of computation.",
48 |     type=int,
49 |     default=int(1e7),
50 |     show_default=True,
51 | )
52 | @click.option(
53 |     "--mode",
54 |     help="(w)rite or (a)ppend to the output file (default: w)",
55 |     default="w",
56 |     type=click.Choice(["w", "a"], case_sensitive=False),
57 | )
58 | def rearrange(
59 |     in_path, out_path, view, new_chrom_col, orientation_col, assembly, chunksize, mode
60 | ):
61 |     """Rearrange data from a cooler according to a new genomic view
62 | 
63 |     Parameters
64 |     ----------
65 |     IN_PATH : str
66 |         .cool file (or URI) with data to rearrange.
67 |     OUT_PATH : str
68 |         .cool file (or URI) to save the rearranged data.
69 |     view : str
70 |         Path to a BED-like file which defines which regions of the chromosomes to use
71 |         and in what order. Has to be a valid viewframe (columns corresponding to region
72 |         coordinates followed by the region name), with potential additional columns.
73 |         Using --new-chrom-col and --orientation-col you can specify the new chromosome
74 |         names and whether to invert each region (optional).
75 |         If it has no header with column names, assumes the `new-chrom-col` is the fifth
76 |         column and `--orientation-col` is the sixth, if they exist.
77 |     new_chrom_col : str
78 |         Column name in the view with new chromosome names.
79 |         If not provided and there is no column named 'new_chrom' in the view file, uses
80 |         original chromosome names.
81 |     orientation_col : str
82 |         Column name in the view with orientations of each region (+ or -). - means the
83 |         region will be inverted.
84 |         If not provided and there is no column named 'strand' in the view file, assumes
85 |         all are forward oriented.
86 |     assembly : str
87 |         The name of the assembly for the new cooler. If None, uses the same as in the
88 |         original cooler.
89 |     chunksize : int
90 |         The number of pixels loaded and processed per step of computation.
91 |     mode : str
92 |         (w)rite or (a)ppend to the output file (default: w)
93 |     """
94 |     clr = cooler.Cooler(in_path)
95 |     default_names = ["chrom", "start", "end", "name", "new_chrom", "strand"]
96 |     buf, names = sniff_for_header(view)
97 |     if names is not None:
98 |         # Simply take column names from the file
99 |         view_df = pd.read_table(buf, header=0, sep="\t")
100 |     else:
101 |         # Use default names
102 |         # If some are missing, pandas creates them with all NaNs
103 |         view_df = pd.read_csv(buf, names=default_names, sep="\t")
104 |         names = view_df.columns
105 |     # If additional column names are not provided, set them to defaults
106 |     # If additional columns are not in the view, raise
107 |     if new_chrom_col is None:
108 |         new_chrom_col = "new_chrom"
109 |     elif new_chrom_col not in view_df.columns:
110 |         raise ValueError(f"New chrom col {new_chrom_col} not found in view columns")
111 |     if orientation_col is None:
112 |         orientation_col = "strand"
113 |     elif orientation_col not in view_df.columns:
114 |         raise ValueError(f"Orientation col {orientation_col} not found in view columns")
115 | 
116 |     # Fill NaNs in additional columns: if they were created here, will be filled with
117 |     # default values. Allows not specifying default values in the file, i.e.
only 118 | # regions that need to be inverted need to have "-" in orientation_col 119 | view_df[new_chrom_col] = view_df[new_chrom_col].fillna(view_df["chrom"]) 120 | view_df[orientation_col] = view_df[orientation_col].fillna("+") 121 | api.rearrange.rearrange_cooler( 122 | clr, 123 | view_df, 124 | out_path, 125 | new_chrom_col=new_chrom_col, 126 | orientation_col=orientation_col, 127 | assembly=assembly, 128 | chunksize=chunksize, 129 | mode=mode, 130 | ) 131 | -------------------------------------------------------------------------------- /cooltools/cli/sample.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from . import cli 4 | from .. import api 5 | 6 | 7 | @cli.command() 8 | @click.argument("in_path", metavar="IN_PATH", type=str, nargs=1) 9 | @click.argument("out_path", metavar="OUT_PATH", type=str, nargs=1) 10 | @click.option( 11 | "-c", 12 | "--count", 13 | help="The target number of contacts in the sample. " 14 | "The resulting sample size will not match it precisely. " 15 | "Mutually exclusive with --frac and --cis-count", 16 | type=int, 17 | default=None, 18 | show_default=False, 19 | ) 20 | @click.option( 21 | "--cis-count", 22 | help="The target number of cis contacts in the sample. " 23 | "The resulting sample size will not match it precisely. " 24 | "Mutually exclusive with --count and --frac", 25 | type=int, 26 | default=None, 27 | show_default=False, 28 | ) 29 | @click.option( 30 | "-f", 31 | "--frac", 32 | help="The target sample size as a fraction of contacts in the original dataset. " 33 | "Mutually exclusive with --count and --cis-count", 34 | type=float, 35 | default=None, 36 | show_default=False, 37 | ) 38 | @click.option( 39 | "--exact", 40 | help="If specified, use exact sampling that guarantees the size of the output sample. " 41 | "Otherwise, binomial sampling will be used and the sample size will be distributed around the target value. ", 42 | is_flag=True, 43 | ) 44 | @click.option( 45 | "--nproc", 46 | "-p", 47 | help="Number of processes to split the work between." 48 | "[default: 1, i.e. no process pool]", 49 | default=1, 50 | type=int, 51 | ) 52 | @click.option( 53 | "--chunksize", 54 | help="The number of pixels loaded and processed per step of computation.", 55 | type=int, 56 | default=int(1e7), 57 | show_default=True, 58 | ) 59 | def random_sample(in_path, out_path, count, cis_count, frac, exact, nproc, chunksize): 60 | """ 61 | Pick a random sample of contacts from a Hi-C map. 62 | 63 | IN_PATH : Input cooler path or URI. 64 | 65 | OUT_PATH : Output cooler path or URI. 66 | 67 | Specify the target sample size with either --count or --frac. 68 | 69 | """ 70 | 71 | api.sample.sample( 72 | in_path, 73 | out_path, 74 | count=count, 75 | cis_count=cis_count, 76 | frac=frac, 77 | exact=exact, 78 | chunksize=chunksize, 79 | nproc=nproc 80 | ) 81 | -------------------------------------------------------------------------------- /cooltools/cli/util.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import csv 3 | import io 4 | import click 5 | 6 | 7 | class TabularFilePath(click.Path): 8 | def __init__( 9 | self, default_column_index, exists=False, resolve_path=False, allow_dash=False 10 | ): 11 | """ 12 | Parameters 13 | ---------- 14 | default_column : str or int 15 | Name of desired column or 0-based column index. 
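# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the random_sample command above,
# via the API; the paths are hypothetical. With exact=True the sample size is
# guaranteed, otherwise binomial sampling scatters it around the target.
from cooltools import api

api.sample.sample("test.cool", "test.50pct.cool", frac=0.5, exact=True, nproc=4)
# ---------------------------------------------------------------------------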
16 | exists : bool 17 | resolve_path : bool 18 | 19 | Returns 20 | ------- 21 | path to file, column name or index 22 | 23 | """ 24 | self.default_column_index = default_column_index 25 | super().__init__( 26 | exists=exists, resolve_path=resolve_path, allow_dash=allow_dash 27 | ) 28 | 29 | def convert(self, value, param, ctx): 30 | if value is None: 31 | return 32 | file_path, _, field = value.partition("::") 33 | file_path = super().convert(file_path, param, ctx) 34 | if not field: 35 | col = self.default_column_index 36 | elif field.isdigit(): 37 | col = int(field) - 1 # assume one-based from command line 38 | if col < 0: 39 | self.fail('Expected one-based column number, received "0".', param, ctx) 40 | else: 41 | col = field 42 | return file_path, col 43 | 44 | 45 | def sniff_for_header(file_path, sep="\t", comment="#"): 46 | """ 47 | Warning: reads the entire file into a StringIO buffer! 48 | 49 | """ 50 | with open(file_path, "r") as f: 51 | buf = io.StringIO(f.read()) 52 | 53 | sample_lines = [] 54 | for line in buf: 55 | if not line.startswith(comment): 56 | sample_lines.append(line) 57 | break 58 | for _ in range(10): 59 | sample_lines.append(buf.readline()) 60 | buf.seek(0) 61 | 62 | has_header = csv.Sniffer().has_header("\n".join(sample_lines)) 63 | if has_header: 64 | names = sample_lines[0].strip().split(sep) 65 | else: 66 | names = None 67 | 68 | return buf, names 69 | 70 | 71 | def validate_csv(ctx, param, value, default_column): 72 | if value is None: 73 | return 74 | file_path, _, field_name = value.partition("::") 75 | if not op.exists(file_path): 76 | raise click.BadParameter( 77 | 'Path not found: "{}"'.format(file_path), ctx=ctx, param=param 78 | ) 79 | if not field_name: 80 | field_name = default_column 81 | elif field_name.isdigit(): 82 | field_name = int(field_name) 83 | return file_path, field_name 84 | -------------------------------------------------------------------------------- /cooltools/cli/virtual4c.py: -------------------------------------------------------------------------------- 1 | import cooler 2 | import bioframe 3 | from .. import api 4 | 5 | 6 | import click 7 | from . import cli 8 | 9 | 10 | @cli.command() 11 | @click.argument("cool_path", metavar="COOL_PATH", type=str, nargs=1) 12 | @click.argument("viewpoint", metavar="VIEWPOINT", type=str, nargs=1) 13 | @click.option( 14 | "--clr-weight-name", 15 | help="Use balancing weight with this name. " 16 | "Provide empty argument to calculate insulation on raw data (no masking bad pixels).", 17 | type=str, 18 | default="weight", 19 | show_default=True, 20 | ) 21 | @click.option( 22 | "-o", 23 | "--out-prefix", 24 | help="Save virtual 4C track as a BED-like file." 25 | " Contact frequency is stored in out_prefix.v4C.tsv", 26 | required=True, 27 | ) 28 | @click.option( 29 | "--bigwig", 30 | help="Also save virtual 4C track as a bigWig file with the name out_prefix.v4C.bw", 31 | is_flag=True, 32 | default=False, 33 | ) 34 | @click.option( 35 | "-p", 36 | "--nproc", 37 | help="Number of processes to split the work between." 38 | " [default: 1, i.e. no process pool]", 39 | default=1, 40 | type=int, 41 | ) 42 | def virtual4c( 43 | cool_path, 44 | viewpoint, 45 | clr_weight_name, 46 | out_prefix, 47 | bigwig, 48 | nproc, 49 | ): 50 | """ 51 | Generate virtual 4C profile from a contact map by extracting all interactions of a 52 | given viewpoint with the rest of the genome. 53 | 54 | 55 | COOL_PATH : the paths to a .cool file with a Hi-C map. 
Use the '::' syntax to 56 | specify a group path in a multicooler file. 57 | 58 | VIEWPOINT : the viewpoint to use for the virtual 4C profile. Provide as a UCSC-string 59 | (e.g. chr1:1-1000) 60 | 61 | 62 | Note: this is a new (experimental) tool, the interface or output might change in a 63 | future version. 64 | """ 65 | clr = cooler.Cooler(cool_path) 66 | 67 | viewpoint = bioframe.core.stringops.parse_region_string(viewpoint) 68 | v4c = api.virtual4c.virtual4c( 69 | clr, 70 | viewpoint, 71 | clr_weight_name=clr_weight_name if clr_weight_name else None, 72 | nproc=nproc, 73 | ) 74 | # Output 75 | if out_prefix: 76 | v4c.to_csv(out_prefix + ".tsv", sep="\t", index=False, na_rep="nan") 77 | if bigwig: 78 | bioframe.to_bigwig( 79 | v4c.dropna(), 80 | clr.chromsizes, 81 | out_prefix + ".bw", 82 | value_field=v4c.columns[3], 83 | ) 84 | else: 85 | print(v4c.to_csv(sep="\t", index=False, na_rep="nan")) 86 | return 87 | -------------------------------------------------------------------------------- /cooltools/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import * 2 | from .io import * 3 | from .checks import * 4 | -------------------------------------------------------------------------------- /cooltools/lib/_query.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # from scipy.sparse import coo_matrix 6 | from cooler.core import _IndexingMixin 7 | 8 | 9 | def arg_prune_partition(seq, step): 10 | """ 11 | Take a monotonic sequence of integers and downsample it such that they 12 | are at least ``step`` apart (roughly), preserving the first and last 13 | elements. Returns indices, not values. 14 | 15 | """ 16 | lo, hi = seq[0], seq[-1] 17 | num = 2 + (hi - lo) // step 18 | cuts = np.linspace(lo, hi, num, dtype=int) 19 | return np.unique(np.searchsorted(seq, cuts)) 20 | 21 | 22 | class CSRSelector(_IndexingMixin): 23 | """ 24 | Instantiates 2D range queries. 25 | 26 | Example 27 | ------- 28 | >>> selector = CSRSelector(h5, (100, 100), 'count', 10000) 29 | >>> query = selector[lo1:hi1, lo2:hi2] 30 | 31 | """ 32 | 33 | def __init__(self, grp, shape, field, chunksize): 34 | self.grp = grp 35 | self.shape = shape 36 | self.field = field 37 | self.chunksize = chunksize 38 | self.offset_selector = grp["indexes"]["bin1_offset"] 39 | self.bin1_selector = grp["pixels"]["bin1_id"] 40 | self.bin2_selector = grp["pixels"]["bin2_id"] 41 | self.data_selector = grp["pixels"][field] 42 | 43 | def _make_getchunk(self, ispan, jspan): 44 | # Factory for function that executes any piece of a 2D range query by 45 | # index. 
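# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the virtual4c command above, via
# the API; the path and viewpoint are hypothetical, and clr_weight_name is
# assumed to keep its API default.
import cooler
import bioframe
from cooltools import api

clr = cooler.Cooler("test.10000.cool")
viewpoint = bioframe.core.stringops.parse_region_string("chr1:30000000-30100000")
v4c = api.virtual4c.virtual4c(clr, viewpoint, nproc=1)
# ---------------------------------------------------------------------------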
46 | 47 | bin1_selector = self.bin1_selector 48 | bin2_selector = self.bin2_selector 49 | data_selector = self.data_selector 50 | field = self.field 51 | i0, i1 = ispan 52 | j0, j1 = jspan 53 | 54 | # coarsegrain the offsets to extract a big chunk of rows at a time 55 | if (i1 - i0 < 1) or (j1 - j0 < 1): 56 | offsets = [] 57 | loc_pruned_offsets = [] 58 | else: 59 | offsets = self.offset_selector[i0 : i1 + 1] 60 | loc_pruned_offsets = arg_prune_partition(offsets, self.chunksize) 61 | 62 | self._loc_pruned_offsets = loc_pruned_offsets 63 | # i0 -- matrix row number offset 64 | # o0 -- corresponding pixel id offset = offsets[0] 65 | 66 | # let's take the downsampled subset of pixel id offsets [o0, ...., o1] 67 | # each successive pair corresponds to a "piece" of the query 68 | def getchunk(chunk_id, include_index=False): 69 | out = {"bin1_id": [], "bin2_id": [], field: []} 70 | if include_index: 71 | out["__index"] = [] 72 | 73 | # extract a chunk of on-disk rows 74 | oi, of = loc_pruned_offsets[chunk_id], loc_pruned_offsets[chunk_id + 1] 75 | p0, p1 = offsets[oi], offsets[of] 76 | slc = slice(p0, p1) 77 | 78 | bin2_extracted = bin2_selector[slc] 79 | data_extracted = data_selector[slc] 80 | if include_index: 81 | ind_extracted = np.arange(slc.start, slc.stop) 82 | 83 | # go row by row and filter 84 | for i in range(oi, of): 85 | # correct the offsets 86 | lo = offsets[i] - p0 87 | hi = offsets[i + 1] - p0 88 | 89 | # this row 90 | bin2 = bin2_extracted[lo:hi] 91 | 92 | # filter for the range of j values we want 93 | mask = (bin2 >= j0) & (bin2 < j1) 94 | cols = bin2[mask] 95 | 96 | # apply same mask for data 97 | data = data_extracted[lo:hi][mask] 98 | 99 | # shortcut for row data 100 | rows = np.full(len(cols), i0 + i, dtype=bin1_selector.dtype) 101 | 102 | out["bin1_id"].append(rows) 103 | out["bin2_id"].append(cols) 104 | out[field].append(data) 105 | if include_index: 106 | out["__index"].append(ind_extracted[lo:hi][mask]) 107 | 108 | if len(out): 109 | for k in out.keys(): 110 | out[k] = np.concatenate(out[k], axis=0) 111 | else: 112 | out["bin1_id"] = np.array([], dtype=bin1_selector.dtype) 113 | out["bin2_id"] = np.array([], dtype=bin2_selector.dtype) 114 | out[field] = np.array([], dtype=data_selector.dtype) 115 | if include_index: 116 | out["__index"] = np.array([], dtype=np.int64) 117 | 118 | return out 119 | 120 | return getchunk, loc_pruned_offsets 121 | 122 | def __getitem__(self, key): 123 | s1, s2 = self._unpack_index(key) 124 | ispan = self._process_slice(s1, self.shape[0]) 125 | jspan = self._process_slice(s2, self.shape[1]) 126 | getchunk, loc_pruned_offsets = self._make_getchunk(ispan, jspan) 127 | return RangeQuery(self, ispan, jspan, self.field, getchunk, loc_pruned_offsets) 128 | 129 | 130 | class RangeQuery(object): 131 | """ 132 | Executor that fulfills a partitioned 2D range query using a variety of outputs. 
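# ---------------------------------------------------------------------------
# Illustrative sketch: a chunked 2D range query over a cooler's pixel table,
# following the CSRSelector docstring above. The file path is hypothetical and
# chunk handling mirrors the RangeQuery methods defined below.
import h5py
import cooler
from cooltools.lib._query import CSRSelector

clr = cooler.Cooler("test.1000000.cool")
n_bins = clr.info["nbins"]
with h5py.File(clr.filename, "r") as h5:
    selector = CSRSelector(h5[clr.root], (n_bins, n_bins), "count", chunksize=10000)
    query = selector[0:100, 0:100]
    for chunk in query.read_chunked():
        print(chunk["bin1_id"].size, "pixels in this piece")
# ---------------------------------------------------------------------------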
133 | 134 | """ 135 | 136 | def __init__(self, selector, ispan, jspan, field, getchunk, loc_pruned_offsets): 137 | self.selector = selector 138 | self.ispan = ispan 139 | self.jspan = jspan 140 | self.field = field 141 | self.n_chunks = len(loc_pruned_offsets) - 1 142 | self._locs = loc_pruned_offsets 143 | self._getchunk = getchunk 144 | 145 | def read_chunk(self, i, include_index=False): 146 | """Read any chunk of the partitioned query as a dictionary.""" 147 | if not 0 <= i < self.n_chunks: 148 | raise IndexError(i) 149 | return self._getchunk(i, include_index) 150 | 151 | def read_chunked(self, include_index=False): 152 | """Iterator over chunks (as dictionaries).""" 153 | for i in range(self.n_chunks): 154 | yield self._getchunk(i, include_index) 155 | 156 | def read(self, include_index=False): 157 | """Read the complete range query as a dictionary""" 158 | result = list(self.read_chunked(include_index)) 159 | return { 160 | k: np.concatenate([d[k] for d in result], axis=0) 161 | for k in ["bin1_id", "bin2_id", self.field] 162 | } 163 | 164 | def __repr__(self): 165 | return ( 166 | "{self.__class__.__name__}" 167 | '({self.ispan}, {self.jspan}, "{self.field}", ...) ' 168 | "[{n} piece(s)]" 169 | ).format(self=self, n=self.n_chunks) 170 | -------------------------------------------------------------------------------- /cooltools/lib/plotting.py: -------------------------------------------------------------------------------- 1 | """ 2 | Migrated from :mod:`mirnylib.plotting`. 3 | 4 | """ 5 | try: 6 | from matplotlib.cm import register_cmap 7 | except ImportError: 8 | from matplotlib import colormaps 9 | register_cmap = colormaps.register 10 | 11 | import matplotlib as mpl 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | 15 | 16 | PALETTES = { 17 | "fall": np.array( 18 | ( 19 | (255, 255, 255), 20 | (255, 255, 204), 21 | (255, 237, 160), 22 | (254, 217, 118), 23 | (254, 178, 76), 24 | (253, 141, 60), 25 | (252, 78, 42), 26 | (227, 26, 28), 27 | (189, 0, 38), 28 | (128, 0, 38), 29 | (0, 0, 0), 30 | ) 31 | ) 32 | / 255, 33 | "blues": np.array( 34 | ( 35 | (255, 255, 255), 36 | (180, 204, 225), 37 | (116, 169, 207), 38 | (54, 144, 192), 39 | (5, 112, 176), 40 | (4, 87, 135), 41 | (3, 65, 100), 42 | (2, 40, 66), 43 | (1, 20, 30), 44 | (0, 0, 0), 45 | ) 46 | ) 47 | / 255, 48 | "acidblues": np.array( 49 | ( 50 | (255, 255, 255), 51 | (162, 192, 222), 52 | (140, 137, 187), 53 | (140, 87, 167), 54 | (140, 45, 143), 55 | (120, 20, 120), 56 | (90, 15, 90), 57 | (60, 10, 60), 58 | (30, 5, 30), 59 | (0, 0, 0), 60 | ) 61 | ) 62 | / 255, 63 | "nmeth": np.array( 64 | ( 65 | (236, 250, 255), 66 | (148, 189, 217), 67 | (118, 169, 68), 68 | (131, 111, 43), 69 | (122, 47, 25), 70 | (41, 0, 20), 71 | ) 72 | ) 73 | / 255, 74 | } 75 | 76 | 77 | def list_to_colormap(color_list, name=None): 78 | color_list = np.array(color_list) 79 | if color_list.min() < 0: 80 | raise ValueError("Colors should be 0 to 1, or 0 to 255") 81 | if color_list.max() > 1.0: 82 | if color_list.max() > 255: 83 | raise ValueError("Colors should be 0 to 1 or 0 to 255") 84 | else: 85 | color_list = color_list / 255.0 86 | return mpl.colors.LinearSegmentedColormap.from_list(name, color_list, 256) 87 | 88 | 89 | def get_cmap(name): 90 | is_reversed = name.endswith("_r") 91 | try: 92 | if is_reversed: 93 | pal = PALETTES[name[:-2]][::-1] 94 | else: 95 | pal = PALETTES[name] 96 | except KeyError: 97 | raise ValueError('Palette not found "{}"'.format(name)) 98 | return list_to_colormap(pal) 99 | 100 | 101 | def 
_register_cmaps(): 102 | for name, pal in PALETTES.items(): 103 | register_cmap(cmap=list_to_colormap(pal), name=name) 104 | register_cmap(cmap=list_to_colormap(pal[::-1]), name=name + "_r") 105 | 106 | 107 | _register_cmaps() 108 | 109 | 110 | def gridspec_inches(wcols, hrows, fig_kwargs={}): 111 | 112 | fig_height_inches = sum(hrows) 113 | 114 | fig_width_inches = sum(wcols) 115 | 116 | fig = plt.figure( 117 | figsize=(fig_width_inches, fig_height_inches), 118 | subplotpars=mpl.figure.SubplotParams( 119 | left=0, right=1, bottom=0, top=1, wspace=0, hspace=0.0 120 | ), 121 | # frameon=False, 122 | **fig_kwargs 123 | ) 124 | fig.set_size_inches(fig_width_inches, fig_height_inches, forward=True) 125 | 126 | gs = mpl.gridspec.GridSpec( 127 | len(hrows), 128 | len(wcols), 129 | left=0, 130 | right=1, 131 | top=1, 132 | bottom=0, 133 | wspace=0, 134 | hspace=0, 135 | width_ratios=wcols, 136 | height_ratios=hrows, 137 | ) 138 | 139 | return fig, gs 140 | -------------------------------------------------------------------------------- /cooltools/lib/runlength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def isrle(starts, lengths, values): 6 | if not (len(starts) == len(lengths) == len(values)): 7 | return False 8 | 9 | if np.any(np.diff(starts) < 0): 10 | return False 11 | 12 | ends = starts + lengths 13 | if np.any(ends[:-1] > starts[1:]): 14 | return False 15 | 16 | return True 17 | 18 | 19 | def rlencode(x, dropna=False): 20 | """ 21 | Run length encoding. 22 | Based on http://stackoverflow.com/a/32681075, which is based on the rle 23 | function from R. 24 | 25 | Parameters 26 | ---------- 27 | x : 1D array_like 28 | Input array to encode 29 | dropna: bool, optional 30 | Drop all runs of NaNs. 31 | 32 | Returns 33 | ------- 34 | start positions, run lengths, run values 35 | 36 | """ 37 | where = np.flatnonzero 38 | x = np.asarray(x) 39 | n = len(x) 40 | if n == 0: 41 | return ( 42 | np.array([], dtype=int), 43 | np.array([], dtype=int), 44 | np.array([], dtype=x.dtype), 45 | ) 46 | 47 | isnumeric = np.issubdtype(x.dtype, np.number) 48 | 49 | if isnumeric: 50 | starts = np.r_[0, where(~np.isclose(x[1:], x[:-1], equal_nan=True)) + 1] 51 | else: 52 | starts = np.r_[0, where(x[1:] != x[:-1]) + 1] 53 | lengths = np.diff(np.r_[starts, n]) 54 | values = x[starts] 55 | 56 | if isnumeric and dropna: 57 | mask = ~np.isnan(values) 58 | starts, lengths, values = starts[mask], lengths[mask], values[mask] 59 | 60 | return starts, lengths, values 61 | 62 | 63 | def rldecode(starts, lengths, values, minlength=None): 64 | """ 65 | Decode a run-length encoding of a 1D array. 66 | 67 | Parameters 68 | ---------- 69 | starts, lengths, values : 1D array_like 70 | The run-length encoding. 71 | minlength : int, optional 72 | Minimum length of the output array. 73 | 74 | Returns 75 | ------- 76 | 1D array. Missing data will be filled with NaNs. 
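# ---------------------------------------------------------------------------
# Illustrative sketch, not part of this file: the palettes registered above
# become available to matplotlib by name once the plotting module is imported.
import numpy as np
import matplotlib.pyplot as plt
import cooltools.lib.plotting  # noqa: F401 -- triggers colormap registration

plt.imshow(np.random.rand(10, 10), cmap="fall")
plt.colorbar()
plt.show()
# ---------------------------------------------------------------------------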
77 | 
78 |     """
79 |     starts, lengths, values = map(np.asarray, (starts, lengths, values))
80 |     # TODO: check validity of rle
81 |     ends = starts + lengths
82 |     n = ends[-1]
83 |     if minlength is not None:
84 |         n = max(minlength, n)
85 |     x = np.full(n, np.nan)
86 |     for lo, hi, val in zip(starts, ends, values):
87 |         x[lo:hi] = val
88 |     return x
89 | 
90 | 
91 | def iterruns(x, value=None, **kwargs):
92 |     starts, lengths, values = rlencode(x, **kwargs)
93 |     if value is None:
94 |         ends = starts + lengths
95 |         return zip(starts, ends, values)
96 |     else:
97 |         mask = values == value
98 |         starts, lengths = starts[mask], lengths[mask]
99 |         ends = starts + lengths
100 |         return zip(starts, ends)
101 | 
102 | 
103 | def fillgaps(starts, lengths, values, minlength=None, fill_value=np.nan):
104 |     """
105 |     Add additional runs to fill in spaces between runs. Defaults to runs of NaN.
106 |     """
107 |     where = np.flatnonzero
108 |     n = starts[-1] + lengths[-1]
109 |     if minlength is not None:
110 |         n = max(minlength, n)
111 | 
112 |     ends = starts + lengths
113 |     lo = np.r_[0, ends]
114 |     hi = np.r_[starts, n]
115 |     gap_locs = where((hi - lo) > 0)
116 |     if len(gap_locs):
117 |         starts = np.insert(starts, gap_locs, lo[gap_locs])
118 |         lengths = np.insert(lengths, gap_locs, hi[gap_locs] - lo[gap_locs])
119 |         values = np.insert(values, gap_locs, fill_value)
120 |     return starts, lengths, values
121 | 
122 | 
123 | def dropgaps(starts, lengths, values):
124 |     """
125 |     Discard runs of NaN.
126 |     """
127 |     mask = ~np.isnan(values)  # keep only non-NaN runs
128 |     starts, lengths, values = starts[mask], lengths[mask], values[mask]
129 |     return starts, lengths, values
130 | 
131 | 
132 | def align(slv1, slv2, minlength=None):
133 |     """
134 |     Overlay two run-length encodings on a common axis: gaps are filled with
135 |     NaN runs, then runs from both encodings are interleaved by start position.
136 | 
137 |     """
138 |     starts1, lengths1, values1 = fillgaps(*slv1)
139 |     starts2, lengths2, values2 = fillgaps(*slv2)
140 |     n1 = starts1[-1] + lengths1[-1]
141 |     n2 = starts2[-1] + lengths2[-1]
142 |     n = max(n1, n2)
143 |     if minlength is not None:
144 |         n = max(minlength, n)
145 |     starts = np.concatenate([starts1, starts2])
146 |     values = np.concatenate([values1, values2])
147 |     idx = np.argsort(starts)
148 |     starts = starts[idx]
149 |     values = values[idx]
150 |     lengths = np.diff(np.r_[starts, n])
151 |     return starts, lengths, values
152 | 
153 | 
154 | def simplify(starts, lengths, values, minlength=None):
155 |     """
156 |     Remove NaN runs and runs of length zero and stitch together consecutive runs
157 |     of the same value.
158 | 159 | """ 160 | starts, lengths, values = fillgaps(starts, lengths, values, minlength) 161 | n = starts[-1] + lengths[-1] 162 | 163 | is_nontrivial = lengths > 0 164 | starts = starts[is_nontrivial] 165 | values = values[is_nontrivial] 166 | 167 | is_new_run = np.r_[True, ~np.isclose(values[:-1], values[1:], equal_nan=True)] 168 | starts = starts[is_new_run] 169 | values = values[is_new_run] 170 | 171 | lengths = np.r_[starts[1:] - starts[:-1], n - starts[-1]] 172 | 173 | mask = ~np.isnan(values) 174 | return starts[mask], lengths[mask], values[mask] 175 | -------------------------------------------------------------------------------- /cooltools/lib/schemas.py: -------------------------------------------------------------------------------- 1 | # schemas of data structures commonly used in cooltools, 2 | # including descriptions of DataFrame dtypes and column definitions 3 | diag_expected_dtypes = { 4 | "region1": "string", 5 | "region2": "string", 6 | "dist": "Int64", 7 | "n_valid": "Int64", 8 | } 9 | 10 | block_expected_dtypes = { 11 | "region1": "string", 12 | "region2": "string", 13 | "n_valid": "Int64", 14 | } 15 | 16 | # cooler weight names that are potentially divisive 17 | # cooltools supports only multiplicative weights for now 18 | DIVISIVE_WEIGHTS_4DN = ["KR", "VC", "VC_SQRT"] 19 | -------------------------------------------------------------------------------- /cooltools/sandbox/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/cooltools/sandbox/__init__.py -------------------------------------------------------------------------------- /cooltools/sandbox/balance.py: -------------------------------------------------------------------------------- 1 | from functools import partial, reduce 2 | from multiprocess import Pool 3 | from operator import add 4 | 5 | import numpy as np 6 | import pandas 7 | 8 | import h5py 9 | 10 | from scipy.sparse import linalg 11 | from cooler.parallel import split, partition 12 | import cooler 13 | 14 | 15 | def bnewt(matvec, mask, tol=1e-6, x0=None, delta=0.1, Delta=3, fl=0): 16 | """ 17 | A balancing algorithm for symmetric matrices. 18 | 19 | X = BNEWT(A) attempts to find a vector X such that 20 | diag(X)*A*diag(X) is close to doubly stochastic. A must 21 | be symmetric and nonnegative. 22 | 23 | Parameters 24 | ---------- 25 | matvec : callable 26 | Linear operator that returns the matrix-vector product with x 27 | mask : 1D array of bool 28 | Mask of good bins 29 | tol : float 30 | Error tolerance 31 | x0 : 1D array 32 | Initial guess 33 | delta : float 34 | How close balancing vectors can get to the edge of the positive cone 35 | Delta : float 36 | How far balancing vectors can get from the edge of the positive cone. Both distances use a relative measure on the size of elements. 37 | fl : int 38 | Verbosity flag; if 1, print convergence info at each outer iteration. 39 | 40 | Returns 41 | ------- 42 | x : 1D array 43 | balancing weights 44 | res : 1D array 45 | residual error norms, measured by norm(diag(x)*A*x - e), one per outer iteration 46 | 47 | """ 48 | # Initialize 49 | n = mask.sum() 50 | 51 | e = np.ones(n) 52 | if x0 is None: 53 | x0 = e.copy() 54 | res = [] 55 | 56 | # Inner stopping criterion parameters.
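    # A sketch of the scheme, assuming this is a port of the bnewt() routine
    # of Knight & Ruiz (2013): the outer loop is a damped Newton iteration for
    # the balancing fixed point x * matvec(x) = e, and each Newton step is
    # solved approximately with conjugate gradient (the inner loop below).
    # `g` damps how quickly the inner CG tolerance `eta` is tightened between
    # outer iterations, and `etamax` caps that tolerance.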
57 | g = 0.9 58 | etamax = 0.1 59 | eta = etamax 60 | stop_tol = tol * 0.5 61 | x = x0 62 | rt = tol ** 2 63 | v = x * matvec(x, mask) 64 | 65 | rk = 1 - v 66 | rho_km1 = np.dot(rk, rk) 67 | rho_km2 = None # will be defined later 68 | rout = rho_km1 69 | rold = rout 70 | 71 | MVP = 0 # We’ll count matrix vector products. 72 | i = 0 # Outer iteration count. 73 | 74 | if fl == 1: 75 | print("it in. it res", flush=True) 76 | 77 | # Outer iteration 78 | while rout > rt: 79 | i += 1 80 | k = 0 81 | y = e.copy() 82 | innertol = max((eta ** 2) * rout, rt) 83 | 84 | # Inner iteration by Conjugate Gradient 85 | while rho_km1 > innertol: 86 | k += 1 87 | 88 | if k == 1: 89 | Z = rk / v 90 | p = Z.copy() 91 | rho_km1 = np.dot(rk, Z) 92 | else: 93 | beta = rho_km1 / rho_km2 94 | p = Z + beta * p 95 | 96 | # Update search direction efficiently. 97 | w = x * matvec(x * p, mask) + v * p 98 | 99 | alpha = rho_km1 / np.dot(p, w) 100 | ap = alpha * p 101 | 102 | # Test distance to boundary of cone. 103 | ynew = y + ap 104 | if min(ynew) <= delta: 105 | if delta == 0: 106 | break 107 | idx = ap < 0 108 | gamma = np.min((delta - y[idx]) / ap[idx]) 109 | y = y + gamma * ap 110 | break 111 | 112 | if max(ynew) >= Delta: 113 | idx = ynew > Delta 114 | gamma = np.min((Delta - y[idx]) / ap[idx]) 115 | y = y + gamma * ap 116 | break 117 | 118 | y = ynew.copy() 119 | rk = rk - alpha * w 120 | rho_km2 = rho_km1 121 | Z = rk / v 122 | rho_km1 = np.dot(rk, Z) 123 | 124 | x = x * y 125 | v = x * matvec(x, mask) 126 | 127 | rk = 1 - v 128 | rho_km1 = np.dot(rk, rk) 129 | rout = rho_km1 130 | MVP += k + 1 131 | 132 | # Update inner iteration stopping criterion. 133 | rat = rout / rold 134 | rold = rout 135 | res_norm = np.sqrt(rout) 136 | eta_o = eta 137 | eta = g * rat 138 | if g * (eta_o ** 2) > 0.1: 139 | eta = max(eta, g * (eta_o ** 2)) 140 | 141 | eta = max(min(eta, etamax), stop_tol / res_norm) 142 | if fl == 1: 143 | print("%3d\t%6d\t%.3e" % (i, k, res_norm), flush=True) 144 | res.append(res_norm) 145 | 146 | print("Matrix-vector products = %6d" % (MVP,), flush=True) 147 | 148 | x_full = np.zeros(len(mask)) 149 | x_full[mask] = x 150 | return x_full, np.array(res) 151 | -------------------------------------------------------------------------------- /cooltools/sandbox/cool2cworld.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import gzip 4 | import tarfile 5 | import tempfile 6 | 7 | from . import fastsavetxt 8 | 9 | import cooler 10 | 11 | 12 | def dump_cworld( 13 | in_cooler, 14 | out=None, 15 | region=None, 16 | iced=False, 17 | iced_unity=False, 18 | buffer_size=int(1e8), 19 | ): 20 | """ 21 | Dump a genome-wide contact matrix from cooler into a CWorld-format 22 | text matrix. 23 | 24 | Parameters 25 | ---------- 26 | in_cooler : str or cooler 27 | A cooler object or the path to the file. 28 | 29 | out : str or file object 30 | Either: 31 | -- a path to the output file. If ends with .gz the output is gzipped 32 | -- a file object 33 | -- a stdin of a Popen object 34 | -- None, in which case the data is dumped into a string and returned 35 | TIP: when using files/stdin do not forget to flush()/communicate(). 36 | 37 | region : str 38 | The region to dump. By default is None, dump the genome-wide matrix. 39 | 40 | iced : bool, optional 41 | If True, dump the balanced matrix. 42 | 43 | iced_unity : bool, optional 44 | If True and `iced` is True, dump the matrix balanced to a unity. 
45 | 46 | buffer_size : int 47 | The chunk size for iterating over the rows of the Hi-C matrix. 48 | """ 49 | 50 | # Prepare the out pipe and the clean-up function. 51 | if not (out): 52 | out = io.BytesIO(b"") 53 | if issubclass(type(out), str) or issubclass(type(out), bytearray): 54 | if out.endswith(".gz"): 55 | writer = fastsavetxt.gzipWriter(out) 56 | out_pipe = writer.stdin 57 | close_out_func = writer.communicate 58 | else: 59 | writer = open(out, "wb") 60 | out_pipe = writer 61 | close_out_func = writer.flush 62 | elif hasattr(out, "write"): 63 | out_pipe = out 64 | close_out_func = fastsavetxt.empty_func 65 | 66 | # Make headers 67 | if not issubclass(type(in_cooler), cooler.Cooler): 68 | c = cooler.Cooler(in_cooler) 69 | else: 70 | c = in_cooler 71 | 72 | res = c.info["bin-size"] 73 | gname = c.info["genome-assembly"] 74 | 75 | bins = c.bins()[:] if not (region) else c.bins().fetch(region) 76 | nbins = len(bins) 77 | 78 | col_headers = "\t".join( 79 | ["{}x{}".format(nbins, nbins)] 80 | + [ 81 | "{}|{}|{}:{}-{}".format(binidx, gname, b.chrom, b.start + 1, b.end) 82 | for binidx, b in bins.iterrows() 83 | ] 84 | ).encode() 85 | 86 | row_headers = [ 87 | "{}|{}|{}:{}-{}".format(binidx1, gname, b1.chrom, b1.start + 1, b1.end).encode() 88 | for binidx1, b1 in bins.iterrows() 89 | ] 90 | 91 | # Iterate over a matrix one block at a time. 92 | nrows_per_step = max(1, buffer_size // nbins) 93 | for i in range(nbins // nrows_per_step + 1): 94 | lo = min(nbins, i * nrows_per_step) 95 | hi = min(nbins, (i + 1) * nrows_per_step) 96 | if hi <= lo: 97 | break 98 | mat = ( 99 | c.matrix(balance=iced) 100 | if not (region) 101 | else c.matrix(balance=iced).fetch(region) 102 | )[lo:hi] 103 | if iced and (not iced_unity): 104 | mat *= c._load_attrs("/bins/weight")["scale"] 105 | 106 | fastsavetxt.array2txt( 107 | mat, 108 | out_pipe, 109 | format_string=b"%.8f" if iced_unity else b"%.4lf", 110 | header=col_headers if i == 0 else None, 111 | row_headers=row_headers[lo:hi], 112 | ) 113 | 114 | if issubclass(type(out), io.BytesIO): 115 | return out.getvalue() 116 | else: 117 | close_out_func() 118 | 119 | 120 | def dump_cworld_tar( 121 | cooler_paths, 122 | out_path, 123 | ): 124 | """ 125 | Makes a CWorld .tar archive with binned contact maps at multiple resolutions 126 | in .matrix.txt.gz format. 127 | 128 | Parameters 129 | ---------- 130 | cooler_paths : a list of str 131 | The paths to all coolers to dump into a single CWorld tar archive. 132 | Must correspond to the same dataset and have different resolutions. 133 | 134 | out_path : str 135 | The path to the output file. 
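    A minimal usage sketch (the cooler paths here are hypothetical):

    >>> dump_cworld_tar(
    ...     ["sample.1000.cool", "sample.10000.cool"],
    ...     "sample.cworld.tar",
    ... )  # doctest: +SKIP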
136 | 137 | """ 138 | 139 | dataset_name = os.path.splitext(os.path.split(out_path)[1])[0] 140 | 141 | with tempfile.TemporaryDirectory() as cworld_tmp_path: 142 | for cooler_path in cooler_paths: 143 | res = cooler.Cooler(cooler_path).info["bin-size"] 144 | os.mkdir(os.path.join(cworld_tmp_path, "C-" + str(res))) 145 | for iced, iced_label in [(True, "iced"), (False, "raw")]: 146 | folder_path = os.path.join(cworld_tmp_path, "C-" + str(res), iced_label) 147 | os.mkdir(folder_path) 148 | 149 | mat_path = os.path.join( 150 | folder_path, 151 | "{}__C-{}-{}.matrix.gz".format(dataset_name, res, iced_label), 152 | ) 153 | 154 | dump_cworld( 155 | in_cooler=cooler_path, out=mat_path, iced=iced, iced_unity=False 156 | ) 157 | 158 | with tarfile.open(out_path, mode="w") as archive: 159 | archive.add(cworld_tmp_path, arcname=dataset_name, recursive=True) 160 | -------------------------------------------------------------------------------- /cooltools/sandbox/cooler_filters/Example_usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pixel_filter_util\n", 10 | "import cooler" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "- cis_total_ratio_filter() allows partial evaluation. However, if you provide only the threshold, you need to pass it with the keyword 'threshold', as in the example below. This returns a function that can then be applied to a cooler object" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "test_filter = pixel_filter_util.cis_total_ratio_filter(threshold=0.5)\n", 27 | "clr = cooler.Cooler('test_data_util.cool')\n", 28 | "bin_mask = test_filter(clr)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "- If you have multiple filters, you can use generate_bin_mask() to apply them sequentially to the bin table and output a combined mask" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "test_filter2 = pixel_filter_util.cis_total_ratio_filter(threshold=0.7)\n", 45 | "bin_mask = pixel_filter_util.generate_bin_mask(clr, [test_filter, test_filter2])\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "- The bin_mask can then be used in create_filtered_cooler()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "output_path = \"./test_data_util_filtered.cool\"\n", 62 | "pixel_filter_util.create_filtered_cooler(\n", 63 | " output_path, clr, bin_mask, chunksize=10_000_000, nproc=1\n", 64 | " )" 65 | ] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "akita", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.9.18" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /cooltools/sandbox/cooler_filters/pixel_filter_util.py:
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from toolz import curry 3 | import cooltools 4 | import cooler 5 | import functools 6 | from multiprocessing import Pool 7 | import logging 8 | from cooltools.lib.common import pool_decorator 9 | 10 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s : %(message)s") 11 | logger = logging.getLogger("data_util") 12 | logger.propagate = False  # Disable propagation to the root logger 13 | 14 | ch = logging.StreamHandler() 15 | ch.setLevel(logging.INFO) 16 | ch.setFormatter(formatter) 17 | logger.addHandler(ch) 18 | 19 | @curry 20 | def cis_total_ratio_filter(clr, threshold=0.5): 21 | """ 22 | Filter out bins with low cis-to-total coverage ratio from a Cooler object. 23 | 24 | Parameters 25 | ---------- 26 | clr : cooler.Cooler 27 | A Cooler object containing Hi-C contact matrices. 28 | threshold : float 29 | The threshold cis-to-total coverage ratio below which bins are considered bad. 30 | 31 | Returns 32 | ------- 33 | numpy.ndarray 34 | A boolean bin mask (True for bins that pass the filter). 35 | 36 | Note 37 | ---- 38 | This function is curried: it can be partially evaluated by providing only the threshold, e.g. ``cis_total_ratio_filter(threshold=0.5)``. 39 | """ 40 | if isinstance(clr, float): 41 | raise TypeError( 42 | "If only threshold value is provided, please use 'threshold' keyword to set threshold value (e.g. threshold=0.2)" 43 | ) 44 | coverage = cooltools.coverage(clr) 45 | cis_total_cov = coverage[0] / coverage[1] 46 | bin_mask = cis_total_cov > threshold 47 | 48 | return bin_mask 49 | 50 | 51 | def generate_bin_mask( 52 | clr, filters=(), store=False, store_name="cis_total_ratio_>_0.5_thres" 53 | ): 54 | """ 55 | Generates a binary mask for a given `clr` object based on a list of filters and thresholds. 56 | 57 | Parameters 58 | ---------- 59 | clr : cooler.Cooler 60 | A cooler object containing Hi-C contact matrices. 61 | filters : list 62 | A list of filter functions to apply to the contact matrices. Defaults to no filters, i.e. all bins pass. 63 | store : bool, optional 64 | If True, store the results in the input cooler file when finished. Default is False. 65 | store_name : str, optional 66 | Name of the bin-table column used to store the bin mask. 67 | 68 | Returns 69 | ------- 70 | bin_mask : numpy.ndarray 71 | A binary mask indicating which genomic bins pass all filters. 72 | """ 73 | if not isinstance(filters, (list, tuple)): 74 | raise TypeError("the 'filters' parameter takes a list of filter functions") 75 | 76 | bin_mask = np.array([True] * clr.bins().shape[0]) 77 | for filter in filters: 78 | bin_mask *= filter(clr) 79 | 80 | if store: 81 | with clr.open("r+") as grp: 82 | if store_name in grp["bins"]: 83 | del grp["bins"][store_name] 84 | h5opts = dict(compression="gzip", compression_opts=6) 85 | grp["bins"].create_dataset(store_name, data=bin_mask, **h5opts, dtype=bool) 86 | 87 | return bin_mask 88 | 89 | 90 | def _pixel_filter(chunk_pixels, good_bins_index): 91 | """ 92 | Filters a chunk of pixels based on a list of good bin indices. 93 | 94 | Parameters 95 | ---------- 96 | chunk_pixels : pandas.DataFrame 97 | A DataFrame containing the pixels to be filtered. It must have columns 'bin1_id' and 'bin2_id'. 98 | good_bins_index : list of int 99 | A list of indices representing the good bins. 100 | 101 | Returns 102 | ------- 103 | pandas.DataFrame 104 | A DataFrame containing only the pixels whose bin1_id and bin2_id are in good_bins_index.
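Examples
--------
A minimal sketch with a hand-made chunk of pixels:

>>> import pandas as pd
>>> chunk = pd.DataFrame(
...     {"bin1_id": [0, 1, 2], "bin2_id": [1, 3, 2], "count": [5, 2, 7]}
... )
>>> _pixel_filter(chunk, [0, 1, 2]).index.tolist()
[0, 2]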
105 | """ 106 | 107 | pixels_mask = chunk_pixels["bin1_id"].isin(good_bins_index) * chunk_pixels[ 108 | "bin2_id" 109 | ].isin(good_bins_index) 110 | return chunk_pixels[pixels_mask] 111 | 112 | 113 | def pixel_iter_chunks(clr, chunksize): 114 | """ 115 | Iterate over the pixels of a cooler object in chunks of a given size. 116 | 117 | Parameters 118 | ---------- 119 | clr : cooler.Cooler 120 | A cooler object containing Hi-C data. 121 | chunksize : int 122 | The size of each chunk of pixels to iterate over. 123 | 124 | Yields 125 | ------ 126 | chunk : numpy.ndarray 127 | A chunk of pixels of size `chunksize`. 128 | """ 129 | selector = clr.pixels() 130 | for lo, hi in cooler.util.partition(0, len(selector), chunksize): 131 | chunk = selector[lo:hi] 132 | yield chunk 133 | 134 | @pool_decorator 135 | def create_filtered_cooler( 136 | output_uri, clr, bin_mask, chunksize=10_000_000, nproc=1, map=map 137 | ): 138 | """ 139 | Create a filtered cooler file from a given cooler object and a binary mask of good bins. 140 | 141 | Parameters 142 | ---------- 143 | output_uri : str 144 | The URI of the output cooler file to be created. 145 | clr : cooler.Cooler 146 | The cooler object to be filtered. 147 | bin_mask : numpy.ndarray 148 | A boolean array indicating which bins to keep (True) and which to discard (False). 149 | Must have the same length as the number of bins in the cooler object. 150 | nproc : int, optional 151 | The number of processes to use for parallelization. Default is 16. 152 | chunksize : int, optional 153 | The number of pixels to process per chunk. Default is 10,000,000. 154 | 155 | Returns 156 | ------- 157 | None 158 | """ 159 | if len(bin_mask) != clr.bins().shape[0]: 160 | raise ValueError( 161 | "bin_mask should have the same length as bin table in cool file" 162 | ) 163 | logger.debug("Start to create cooler file...") 164 | bin_table = clr.bins()[:][['chrom','start','end']].copy() 165 | good_bins_index = np.array(range(clr.bins().shape[0]))[bin_mask] 166 | pixels_filter = functools.partial(_pixel_filter, good_bins_index=good_bins_index) 167 | 168 | cooler.create_cooler( 169 | output_uri, 170 | bins=bin_table, 171 | pixels=map(pixels_filter, pixel_iter_chunks(clr, chunksize)), 172 | ordered=True, 173 | columns=["count"], 174 | ) 175 | 176 | logger.debug("done") 177 | -------------------------------------------------------------------------------- /cooltools/sandbox/cooler_filters/test_data_util.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/cooltools/sandbox/cooler_filters/test_data_util.cool -------------------------------------------------------------------------------- /cooltools/sandbox/expected_smoothing.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import numpy as np 4 | import numba 5 | 6 | 7 | def _log_interp(xs, xp, fp): 8 | """ 9 | Interpolate a function in the log-log space. 10 | Equivalent to np.exp(np.interp(np.log(xs), np.log(xp), np.log(fp))). 11 | 12 | Parameters 13 | ---------- 14 | xs : array-like 15 | The x-coordinates at which to evaluate the interpolated values. 16 | xp : 1-D sequence of floats 17 | The x-coordinates of the data points, must be increasing. 18 | fp : 1D array 19 | The y-coordinates of the data points, same length as xp. 20 | 21 | Returns 22 | ------- 23 | ys : 1D array 24 | The interpolated values, same shape as x. 
25 | """ 26 | with np.errstate(divide="ignore"): 27 | ys = np.exp( 28 | np.interp( 29 | np.log(xs), 30 | np.log(xp), 31 | np.log(fp), 32 | ) 33 | ) 34 | 35 | return ys 36 | 37 | 38 | @numba.njit 39 | def _log_thin(xs, min_log10_step=0.1): 40 | """ 41 | Thin out a sorted array, by selecting a subset of elements that are uniformly spaced in log-space. 42 | 43 | Parameters 44 | ---------- 45 | xs : array-like 46 | An array of elements to thin out. 47 | min_log10_step : float, optional 48 | The minimal log10 ratio between consecutive elements in the output, by default 0.1 49 | 50 | Returns 51 | ------- 52 | xs_thinned : array-like 53 | A subset of elements from xs, whose logs are approx. uniformly spaced. 54 | """ 55 | xs_thinned = [xs[0]] 56 | prev = xs[0] 57 | min_ratio = 10**min_log10_step 58 | for x in xs[1:]: 59 | if x > prev * min_ratio: 60 | xs_thinned.append(x) 61 | prev = x 62 | 63 | if xs_thinned[-1] != xs[-1]: 64 | xs_thinned.append(xs[-1]) 65 | return np.array(xs_thinned) 66 | 67 | 68 | @numba.njit 69 | def _log_smooth_numba( 70 | xs, 71 | ys, 72 | sigma_log10=0.1, 73 | window_sigma=5, 74 | points_per_sigma=10, 75 | ): 76 | xs_thinned = xs 77 | if points_per_sigma: 78 | xs_thinned = _log_thin(xs, sigma_log10 / points_per_sigma) 79 | 80 | N = xs_thinned.size 81 | N_FUNCS = ys.shape[0] 82 | 83 | log_xs = np.log10(xs) 84 | log_thinned_xs = np.log10(xs_thinned) 85 | 86 | ys_smoothed = np.zeros((N_FUNCS, N)) 87 | 88 | for i in range(N): 89 | cur_log_x = log_thinned_xs[i] 90 | lo = np.searchsorted(log_xs, cur_log_x - sigma_log10 * window_sigma) 91 | hi = np.searchsorted(log_xs, cur_log_x + sigma_log10 * window_sigma) 92 | smooth_weights = np.exp( 93 | -((cur_log_x - log_xs[lo:hi]) ** 2) / 2 / sigma_log10 / sigma_log10 94 | ) 95 | norm = smooth_weights.sum() 96 | 97 | if norm > 0: 98 | smooth_weights /= norm 99 | 100 | for k in range(N_FUNCS): 101 | ys_smoothed[k, i] = np.sum(ys[k, lo:hi] * smooth_weights) 102 | 103 | return xs_thinned, ys_smoothed 104 | 105 | 106 | def log_smooth( 107 | xs, 108 | ys, 109 | sigma_log10=0.1, 110 | window_sigma=5, 111 | points_per_sigma=10, 112 | ): 113 | """ 114 | Convolve a function or multiple functions with a gaussian kernel in the log space. 115 | 116 | Parameters 117 | ---------- 118 | xs : 1D array 119 | The x-coordinates (function arguments) of the data points, must be increasing. 120 | ys : 1D or 2D array 121 | The y-coordinates (function values) of the data points. 122 | If 2D, rows correspond to multiple functions, columns correspond to different points. 123 | sigma_log10 : float, optional 124 | The standard deviation of the smoothing Gaussian kernel, applied over log10(xs), by default 0.1 125 | window_sigma : int, optional 126 | Width of the smoothing window, expressed in sigmas, by default 5 127 | points_per_sigma : int, optional 128 | If provided, smoothing is done only for `points_per_sigma` points per sigma and the 129 | rest of the values are interpolated (this results in a major speed-up). By default 10 130 | 131 | Returns 132 | ------- 133 | xs_thinned : 1D array 134 | The subset of arguments, uniformly spaced in log-space. 135 | ys_smoothed : 1D or 2D array 136 | The gaussian-smoothed function values. 
137 | 138 | """ 139 | xs = np.asarray(xs) 140 | ys = np.asarray(ys) 141 | 142 | if xs.ndim != 1: 143 | raise ValueError("xs must be a 1D vector") 144 | if ys.ndim not in (1, 2): 145 | raise ValueError('ys must be either a 1D vector or a "tall" 2D matrix') 146 | if xs.shape[0] != ys.shape[-1]: 147 | raise ValueError("xs and ys must have the same number of observations") 148 | 149 | ys = ys[np.newaxis, :] if ys.ndim == 1 else ys 150 | 151 | xs_thinned, ys_smoothed = _log_smooth_numba( 152 | xs, ys, sigma_log10, window_sigma, points_per_sigma 153 | ) 154 | 155 | if points_per_sigma: 156 | ys_smoothed = np.asarray( 157 | [_log_interp(xs, xs_thinned, ys_row) for ys_row in ys_smoothed] 158 | ) 159 | 160 | ys_smoothed = ys_smoothed[0] if ys.shape[0] == 1 else ys_smoothed 161 | 162 | return ys_smoothed 163 | 164 | 165 | def _smooth_cvd_group( 166 | cvd, sigma_log10, window_sigma, points_per_sigma, cols=None, suffix="" 167 | ): 168 | cvd_smoothed = ( 169 | cvd.groupby(cols["dist"]) 170 | .agg( 171 | { 172 | cols["n_pixels"]: "sum", 173 | cols["n_contacts"]: "sum", 174 | } 175 | ) 176 | .reset_index() 177 | ) 178 | 179 | smoothed_balanced_sum, smoothed_n_valid = log_smooth( 180 | cvd_smoothed[cols["dist"]].values.astype(np.float64), 181 | [ 182 | cvd_smoothed[cols["n_contacts"]].values.astype(np.float64), 183 | cvd_smoothed[cols["n_pixels"]].values.astype(np.float64), 184 | ], 185 | sigma_log10=sigma_log10, 186 | window_sigma=window_sigma, 187 | points_per_sigma=points_per_sigma, 188 | ) 189 | 190 | cvd_smoothed[cols["n_pixels"] + suffix] = smoothed_n_valid 191 | cvd_smoothed[cols["n_contacts"] + suffix] = smoothed_balanced_sum 192 | cvd_smoothed[cols["output_prefix"] + suffix] = ( 193 | cvd_smoothed[cols["n_contacts"] + suffix] 194 | / cvd_smoothed[cols["n_pixels"] + suffix] 195 | ) 196 | 197 | return cvd_smoothed -------------------------------------------------------------------------------- /cooltools/sandbox/fastsavetxt.pyx: -------------------------------------------------------------------------------- 1 | ### Adaptation of Max Imakaev's fast txt matrix writer. 2 | 3 | cimport cython 4 | import os 5 | import subprocess 6 | 7 | from libc.stdlib cimport malloc, free 8 | from libc.string cimport strcpy, strlen 9 | 10 | import numpy as np 11 | cimport numpy as np 12 | 13 | cdef extern from "stdio.h": 14 | int sprintf(char *str, char *format, ...) 15 | 16 | def commandExists(command): 17 | """ 18 | Checks if the bash command exists. 19 | """ 20 | command = command.split()[0] 21 | if subprocess.call(['which', command]) != 0: 22 | return False 23 | return True 24 | 25 | def gzipWriter(filepath): 26 | """ 27 | Creates a writing process with gzip or parallel gzip (pigz) attached to it. 28 | """ 29 | filepath = os.path.abspath(filepath) 30 | with open(filepath, 'wb') as outFile: 31 | if commandExists("pigz"): 32 | writer = ["pigz", "-c", "-9"] 33 | else: 34 | writer = ["gzip", "-c", "-2"] 35 | 36 | pwrite = subprocess.Popen( 37 | writer, 38 | stdin=subprocess.PIPE, 39 | stdout=outFile, 40 | shell=False, 41 | bufsize=-1) 42 | return pwrite 43 | 44 | def empty_func(): 45 | return None 46 | 47 | @cython.boundscheck(False) 48 | @cython.nonecheck(False) 49 | @cython.wraparound(False) 50 | 51 | def array2txt( 52 | mat, 53 | out, 54 | format_string=b'%.4lf', 55 | sep=b'\t', 56 | newline=b'\n', 57 | header=None, 58 | row_headers=None, 59 | max_element_len=100): 60 | """ 61 | Dump a 2d array into a text file, optionally gzipped. 
62 | This implementation is faster than np.savetxt and allows the user 63 | to provide column/row headers. 64 | 65 | Parameters 66 | ---------- 67 | mat : a 2D numpy array or a list of lists of numbers (float/integer) 68 | 69 | out : str or file object 70 | Either: 71 | -- a path to the output file. If ends with .gz the output is gzipped 72 | -- a file object 73 | -- the stdin of a Popen object 74 | TIP: when using files/stdin do not forget to flush()/communicate(). 75 | 76 | format_string : bytes, optional 77 | A printf-style formatting string to specify the conversion of 78 | the elements of the matrix into strings. 79 | 80 | sep : bytes, optional 81 | The column separator. 82 | 83 | newline : bytes, optional 84 | The newline separator. 85 | 86 | header : bytes, optional 87 | A header to prepend to the output file; it is separated from the main table 88 | by a `newline`. 89 | 90 | row_headers : a list of bytes, optional 91 | Row headers to prepend to the output file, one for each row in `mat`. 92 | 93 | max_element_len : int 94 | The maximal length of the string representation of a matrix element, 95 | produced by sprintf(`format_string`). Used to preallocate memory. 96 | """ 97 | 98 | 99 | cdef int N = len(mat) 100 | cdef int M = len(mat[0]) 101 | 102 | if issubclass(type(out), str) or issubclass(type(out), bytearray): 103 | if out.endswith('.gz'): 104 | writer = gzipWriter(out) 105 | out_pipe = writer.stdin 106 | close_out_func = writer.communicate 107 | else: 108 | writer = open(out, 'wb') 109 | out_pipe = writer 110 | close_out_func = writer.flush 111 | elif hasattr(out, 'write'): 112 | out_pipe = out 113 | close_out_func = empty_func 114 | else: 115 | raise Exception('`out` must be either a file path or a file handle/stream') 116 | 117 | cdef np.ndarray[np.double_t, ndim=2] mat_ndarray = np.array(mat, dtype=np.double, order="C") 118 | 119 | cdef char* newline_cstr = newline 120 | cdef char* sep_cstr = sep 121 | cdef char* next_row_header 122 | cdef char* s_start 123 | cdef char* s_cur 124 | 125 | cdef int max_header_len = 0 126 | if row_headers is not None: 127 | max_header_len = max([len(row_header) for row_header in row_headers]) 128 | 129 | s_start = malloc((max_element_len * M + max_header_len) * sizeof(char)) 130 | 131 | cdef double element 132 | cdef char* curStringTemplate 133 | template = b''.join([format_string, sep]) 134 | curStringTemplate = template 135 | 136 | if header is not None: 137 | out_pipe.write(header) 138 | out_pipe.write(newline_cstr) 139 | 140 | cdef int i,j 141 | for i in xrange(N): 142 | s_cur = s_start 143 | if row_headers is not None: 144 | 145 | next_row_header = row_headers[i] 146 | s_cur = strcpy(s_cur, next_row_header) 147 | s_cur += sizeof(char) * strlen(next_row_header) 148 | 149 | s_cur = strcpy(s_cur, sep_cstr) 150 | s_cur += sizeof(char) * strlen(sep_cstr) 151 | 152 | for j in xrange(M): 153 | element = mat_ndarray[i,j] 154 | s_cur = s_cur + sprintf(s_cur, curStringTemplate, element) 155 | 156 | s_cur = strcpy(s_cur, newline_cstr) 157 | s_cur += sizeof(char) * strlen(newline_cstr) 158 | 159 | out_pipe.write(s_start) 160 | free(s_start) 161 | 162 | close_out_func() 163 | -------------------------------------------------------------------------------- /cooltools/sandbox/pairs_scaling_functions.py: -------------------------------------------------------------------------------- 1 | ################################### 2 | # 3 | # several functions for calculating scalings using pairs 4 | # they used to reside in the cooltools.expected module 5 | # 6 |
#################################### 7 | 8 | import numpy as np 9 | from ..lib import numutils 10 | 11 | def _contact_areas(distbins, scaffold_length): 12 | distbins = distbins.astype(float) 13 | scaffold_length = float(scaffold_length) 14 | outer_areas = np.maximum(scaffold_length - distbins[:-1], 0) ** 2 15 | inner_areas = np.maximum(scaffold_length - distbins[1:], 0) ** 2 16 | return 0.5 * (outer_areas - inner_areas) 17 | 18 | 19 | def contact_areas(distbins, region1, region2): 20 | if region1 == region2: 21 | start, end = region1 22 | areas = _contact_areas(distbins, end - start) 23 | else: 24 | start1, end1 = region1 25 | start2, end2 = region2 26 | if start2 <= start1: 27 | start1, start2 = start2, start1 28 | end1, end2 = end2, end1 29 | areas = ( 30 | _contact_areas(distbins, end2 - start1) 31 | - _contact_areas(distbins, start2 - start1) 32 | - _contact_areas(distbins, end2 - end1) 33 | ) 34 | if end1 < start2: 35 | areas += _contact_areas(distbins, start2 - end1) 36 | 37 | return areas 38 | 39 | 40 | def compute_scaling(df, region1, region2=None, dmin=int(1e1), dmax=int(1e7), n_bins=50): 41 | 42 | import dask.array as da 43 | 44 | if region2 is None: 45 | region2 = region1 46 | 47 | distbins = numutils.logbins(dmin, dmax, N=n_bins) 48 | areas = contact_areas(distbins, region1, region2) 49 | 50 | df = df[ 51 | (df["pos1"] >= region1[0]) 52 | & (df["pos1"] < region1[1]) 53 | & (df["pos2"] >= region2[0]) 54 | & (df["pos2"] < region2[1]) 55 | ] 56 | dists = (df["pos2"] - df["pos1"]).values 57 | 58 | if isinstance(dists, da.Array): 59 | obs, _ = da.histogram(dists[(dists >= dmin) & (dists < dmax)], bins=distbins) 60 | else: 61 | obs, _ = np.histogram(dists[(dists >= dmin) & (dists < dmax)], bins=distbins) 62 | 63 | return distbins, obs, areas 64 | -------------------------------------------------------------------------------- /datasets/external_test_files.tsv: -------------------------------------------------------------------------------- 1 | # key filename checksum link comment 2 | HFF_MicroC test.mcool e4a0fc25c8dc3d38e9065fd74c565dd1 https://osf.io/3h9js/download Micro-C data from HFF human cells for two chromosomes (hg38) in a multi-resolution mcool format. Krietenstein et al. 2021 data. 3 | hESC_MicroC test_hESC.mcool ac0e636605505fb76fac25fa08784d5b https://osf.io/3kdyj/download Micro-C data from human ES cells for two chromosomes (hg38) in a multi-resolution mcool format. Krietenstein et al. 2021 data. 4 | HFF_CTCF_fc test_CTCF.bigWig 62429de974b5b4a379578cc85adc65a3 https://osf.io/w92u3/download ChIP-Seq fold change over input with CTCF antibodies in HFF cells (hg38). Downloaded from ENCODE ENCSR000DWQ, ENCFF761RHS.bigWig file 5 | HFF_CTCF_binding test_CTCF.bed.gz 61ecfdfa821571a8e0ea362e8fd48f63 https://osf.io/c9pwe/download Binding sites called from CTCF ChIP-Seq peaks for HFF cells (hg38). Peaks are from ENCODE ENCSR000DWQ, ENCFF498QCT.bed file. The motifs are called with gimmemotifs (options --nreport 1 --cutoff 0), with JASPAR pwm MA0139. 6 | mESC_dRAD21_IAA dRAD21_IAA.mm10.mapq_30.mcool 40087388c443aae19110fdf099738c06 https://osf.io/5xaut/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dRad21 IAA treatment, degraded Rad21. 7 | mESC_dRAD21_UT dRAD21_UT.mm10.mapq_30.mcool 2ff91a7def1a9dd3e1f9b62d89d579a7 https://osf.io/u75pd/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dRad21 untreated (UT), control for Rad21 degradation. 
8 | mESC_dCTCF_IAA dCTCF_IAA.mm10.mapq_30.mcool 33ec02cafa9f1f31d2cbba227cf38cc6 https://osf.io/xwy9j/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dCTCF IAA treatment, degraded CTCF. 9 | mESC_dWAPL_IAA dWAPL_IAA.mm10.mapq_30.mcool 11088c9a6d10826a23a69807fc296005 https://osf.io/fk74t/download Micro-C data from mESC for three chromosomes (mm10) in a multi-resolution mcool format (Hsieh et al. 2022). dWapl IAA treatment, degraded Wapl. 10 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @cd ..; python setup.py build_ext --inplace; cd docs 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | CLI Reference 2 | ============= 3 | 4 | .. click:: cooltools.cli:cli 5 | :prog: cooltools 6 | :show-nested: 7 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | from unittest.mock import Mock 18 | MOCK_MODULES = [ 19 | 'cooltools.io.fastsavetxt', 20 | 'cooltools.lib._numutils', 21 | 'cooler', 22 | 'cooler.core', 23 | 'cooler.tools', 24 | 'cooler.parallel', 25 | 'cooler.util', 26 | 'cython', 27 | 'dask', 28 | 'h5py', 29 | 'matplotlib', 30 | 'matplotlib.cm', 31 | 'matplotlib.pyplot', 32 | 'matplotlib.colors', 33 | 'numba', 34 | # 'numpy', 35 | 'pandas', 36 | 'scipy', 37 | 'scipy.interpolate', 38 | 'scipy.linalg', 39 | 'scipy.sparse', 40 | 'scipy.sparse.linalg', 41 | 'scipy.ndimage', 42 | 'scipy.ndimage.filters', 43 | 'scipy.ndimage.interpolation', 44 | 'scipy.signal', 45 | 'scipy.stats', 46 | 'sklearn', 47 | 'sklearn.cluster', 48 | 'skimage', 49 | 'skimage.filters', 50 | ] 51 | for mod_name in MOCK_MODULES: 52 | sys.modules[mod_name] = Mock() 53 | 54 | 55 | # -- Project information ----------------------------------------------------- 56 | 57 | project = 'cooltools' 58 | copyright = '2020, cooltoolers' 59 | author = 'cooltoolers' 60 | 61 | 62 | # -- General configuration --------------------------------------------------- 63 | 64 | # Apparently readthedocs looks for contents.rst by default if this isn't set. 65 | master_doc = 'index' 66 | 67 | # Add any paths that contain templates here, relative to this directory. 68 | templates_path = ['_templates'] 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path. 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 74 | 75 | # Add any Sphinx extension module names here, as strings. They can be 76 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 77 | # ones. 78 | extensions = [ 79 | 'sphinx.ext.todo', 80 | 'sphinx.ext.coverage', 81 | 'sphinx.ext.autodoc', 82 | 'sphinx.ext.viewcode', 83 | 'sphinx.ext.autosummary', 84 | 'sphinx.ext.napoleon', 85 | 'sphinx.ext.mathjax', 86 | 'sphinx_click.ext', 87 | 'recommonmark', 88 | 'nbsphinx', 89 | 'sphinx_rtd_theme' 90 | ] 91 | 92 | # Extension configuration 93 | napoleon_google_docstring = False 94 | # napoleon_use_param = False 95 | # napoleon_use_ivar = True 96 | napoleon_use_rtype = False 97 | 98 | # Notebook prolog and epilog 99 | nbsphinx_prolog = """""" 100 | nbsphinx_epilog = r""" 101 | ---- 102 | {% set docname = env.doc2path(env.docname, base='docs') %} 103 | 104 | This page was generated with nbsphinx_ from `{{ docname }}`__ 105 | 106 | __ https://github.com/open2c/cooltools/blob/master{{ env.config.release }}/{{ docname }} 107 | 108 | .. _nbsphinx: https://nbsphinx.readthedocs.io/ 109 | 110 | """ 111 | 112 | # -- Options for HTML output ------------------------------------------------- 113 | 114 | # The theme to use for HTML and HTML Help pages. See the documentation for 115 | # a list of builtin themes. 116 | # 117 | html_theme = 'sphinx_rtd_theme' 118 | 119 | # Add any paths that contain custom static files (such as style sheets) here, 120 | # relative to this directory. They are copied after the builtin static files, 121 | # so a file named "default.css" will overwrite the builtin "default.css". 
122 | html_static_path = ['_static'] 123 | 124 | 125 | # -- Style overrides ---------------------------------------------------------- 126 | # Place CSS in _static directory 127 | # def setup(app): 128 | # app.add_stylesheet('theme_overrides.css') 129 | 130 | 131 | # Pull jupyter notebooks from the open2c_examples repo 132 | def setup(app): 133 | from subprocess import run 134 | 135 | if os.path.isdir('notebooks'): 136 | cmd = 'cd notebooks && git pull' 137 | else: 138 | cmd = 'git clone https://github.com/open2c/open2c_examples.git notebooks' 139 | 140 | print("Updating Open2C examples...") 141 | run(cmd, check=True, shell=True) 142 | -------------------------------------------------------------------------------- /docs/cooltools.lib.rst: -------------------------------------------------------------------------------- 1 | cooltools.lib package 2 | ===================== 3 | 4 | common 5 | -------- 6 | 7 | .. automodule:: cooltools.lib.common 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | numutils 13 | -------- 14 | 15 | .. automodule:: cooltools.lib.numutils 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | peaks 21 | ----- 22 | 23 | .. automodule:: cooltools.lib.peaks 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | plotting 29 | -------- 30 | 31 | .. automodule:: cooltools.lib.plotting 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | schemas 37 | -------- 38 | 39 | .. automodule:: cooltools.lib.schemas 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: -------------------------------------------------------------------------------- /docs/cooltools.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | cooltools.lib 10 | 11 | cooltools.api.coverage module 12 | ----------------------------- 13 | 14 | .. automodule:: cooltools.api.coverage 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | cooltools.api.directionality module 20 | ----------------------------------- 21 | 22 | .. automodule:: cooltools.api.directionality 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | cooltools.api.dotfinder module 28 | ------------------------------ 29 | 30 | .. automodule:: cooltools.api.dotfinder 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | cooltools.api.eigdecomp module 36 | ------------------------------ 37 | 38 | .. automodule:: cooltools.api.eigdecomp 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | 43 | cooltools.api.expected module 44 | ----------------------------- 45 | 46 | .. automodule:: cooltools.api.expected 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | 51 | cooltools.api.insulation module 52 | ------------------------------- 53 | 54 | .. automodule:: cooltools.api.insulation 55 | :members: 56 | :undoc-members: 57 | :show-inheritance: 58 | 59 | cooltools.api.saddle module 60 | --------------------------- 61 | 62 | .. automodule:: cooltools.api.saddle 63 | :members: 64 | :undoc-members: 65 | :show-inheritance: 66 | 67 | cooltools.api.sample module 68 | --------------------------- 69 | 70 | .. automodule:: cooltools.api.sample 71 | :members: 72 | :undoc-members: 73 | :show-inheritance: 74 | 75 | cooltools.api.snipping module 76 | ----------------------------- 77 | 78 | .. 
automodule:: cooltools.api.snipping 79 | :members: 80 | :undoc-members: 81 | :show-inheritance: 82 | 83 | cooltools.api.virtual4c module 84 | ----------------------------- 85 | 86 | .. automodule:: cooltools.api.virtual4c 87 | :members: 88 | :undoc-members: 89 | :show-inheritance: -------------------------------------------------------------------------------- /docs/figs/cooltools-logo-futura.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/docs/figs/cooltools-logo-futura.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. cooltools documentation master file, created by 2 | sphinx-quickstart on Wed Jun 12 16:42:43 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. toctree:: 7 | :caption: Overview 8 | :hidden: 9 | :maxdepth: 2 10 | 11 | self 12 | 13 | Getting started 14 | *************** 15 | 16 | The tools for your *.cool*\ s 17 | 18 | Chromosome conformation capture technologies reveal the incredible complexity of genome folding. A growing number of labs and multiple consortia, including the 4D Nucleome, the International Nucleome Consortium, and ENCODE, are generating higher-resolution datasets to probe genome architecture across cell states, types, and organisms. Larger datasets increase the challenges at each step of computational analysis, from storage, to memory, to researchers’ time. The recently-introduced `cooler `_ format readily handles storage of high-resolution datasets via a sparse data model. 19 | 20 | **cooltools** leverages this format to enable flexible and reproducible analysis of high-resolution data. **cooltools** provides a suite of computational tools with a paired python API and command line access, which facilitates workflows either on high-performance computing clusters or via custom analysis notebooks. As part of the `Open2C ecosystem `_, **cooltools** also provides detailed introductions to key concepts in Hi-C-data analysis with interactive notebook documentation. 21 | 22 | If you use **cooltools** in your work, please cite **cooltools**: https://doi.org/10.1101/2022.10.31.514564. 23 | 24 | Installation 25 | ============ 26 | 27 | Requirements 28 | ------------ 29 | 30 | - Python 3.7+ 31 | - Scientific Python packages 32 | 33 | Install using pip 34 | ----------------- 35 | 36 | Compile and install `cooltools` and its Python dependencies from 37 | PyPI using pip: 38 | 39 | .. code-block:: bash 40 | 41 | $ pip install cooltools 42 | 43 | or install the latest version directly from github: 44 | 45 | .. code-block:: bash 46 | 47 | $ pip install https://github.com/open2c/cooltools/archive/refs/heads/master.zip 48 | 49 | 50 | Install the development version 51 | ------------------------------- 52 | 53 | Finally, you can install the latest development version of `cooltools` from 54 | github. First, make a local clone of the github repository: 55 | 56 | .. code-block:: bash 57 | 58 | $ git clone https://github.com/open2c/cooltools 59 | 60 | Then, you can compile and install `cooltools` in 61 | `development mode `_, 62 | which installs the package without moving it to a system folder and thus allows 63 | immediate live-testing any changes in the python code. 64 | 65 | .. 
code-block:: bash 66 | 67 | $ cd cooltools 68 | $ pip install -e ./ 69 | 70 | 71 | .. toctree:: 72 | :maxdepth: 2 73 | :caption: Tutorials 74 | :titlesonly: 75 | 76 | ./notebooks/viz.ipynb 77 | ./notebooks/contacts_vs_distance.ipynb 78 | ./notebooks/compartments_and_saddles.ipynb 79 | ./notebooks/insulation_and_boundaries.ipynb 80 | ./notebooks/dots.ipynb 81 | ./notebooks/pileup_CTCF.ipynb 82 | ./notebooks/command_line_interface.ipynb 83 | 84 | Note that these notebooks currently focus on mammalian interphase Hi-C analysis, but are readily extendible to other organisms and cellular contexts. To clone and work interactively with these notebooks, visit: https://github.com/open2c/open2c_examples. 85 | 86 | 87 | .. toctree:: 88 | :maxdepth: 1 89 | :caption: Reference 90 | 91 | cli 92 | cooltools 93 | releases 94 | 95 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/notebooks_old/data/encode_motifs.hg38.ctcf_known1.liftover.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/docs/notebooks_old/data/encode_motifs.hg38.ctcf_known1.liftover.bed.gz -------------------------------------------------------------------------------- /docs/releases.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | bioframe 2 | click>=7.0 3 | ipython 4 | nbsphinx 5 | multiprocess 6 | numpy 7 | pygments<3,>=2.4.1 8 | recommonmark 9 | Sphinx 10 | sphinx-rtd-theme 11 | sphinx-click 12 | docutils<=0.16 13 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "cython", "numpy"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = 3 | --cov cooltools 4 | --cov-config .coveragerc 5 | --cov-report term-missing 6 | --cov-report html 7 | --cov-report xml 8 | filterwarnings = 9 | 
ignore::PendingDeprecationWarning 10 | testpaths = 11 | tests 12 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest 3 | pytest-flake8 4 | pytest-cov 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bioframe>=0.4.1 2 | click>=7 3 | cooler>=0.9.1 4 | cython 5 | joblib 6 | matplotlib 7 | multiprocess 8 | numba 9 | numpy 10 | pandas>=1.5.1 11 | scikit-learn>=1.1.2 12 | scipy 13 | scikit-image 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import io 4 | import os 5 | import re 6 | 7 | from setuptools import setup, find_packages 8 | from setuptools.extension import Extension 9 | from Cython.Build import cythonize 10 | import numpy as np 11 | 12 | 13 | classifiers = """\ 14 | Development Status :: 4 - Beta 15 | Programming Language :: Python 16 | Programming Language :: Python :: 3 17 | Programming Language :: Python :: 3.8 18 | Programming Language :: Python :: 3.9 19 | Programming Language :: Python :: 3.10 20 | """ 21 | 22 | 23 | def _read(*parts, **kwargs): 24 | filepath = os.path.join(os.path.dirname(__file__), *parts) 25 | encoding = kwargs.pop("encoding", "utf-8") 26 | with io.open(filepath, encoding=encoding) as fh: 27 | text = fh.read() 28 | return text 29 | 30 | 31 | def get_version(): 32 | version = re.search( 33 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 34 | _read("cooltools", "__init__.py"), 35 | re.MULTILINE, 36 | ).group(1) 37 | return version 38 | 39 | 40 | def get_long_description(): 41 | return _read("README.md") 42 | 43 | 44 | def get_requirements(path): 45 | content = _read(path) 46 | return [ 47 | req 48 | for req in content.split("\n") 49 | if req != "" and not (req.startswith("#") or req.startswith("-")) 50 | ] 51 | 52 | 53 | setup_requires = [ 54 | "cython", 55 | "numpy", 56 | ] 57 | 58 | 59 | install_requires = get_requirements("requirements.txt") 60 | 61 | 62 | extensions = [ 63 | Extension( 64 | "cooltools.lib._numutils", 65 | ["cooltools/lib/_numutils.pyx"], 66 | include_dirs=[np.get_include()], 67 | ), 68 | ] 69 | 70 | 71 | packages = find_packages() 72 | 73 | 74 | setup( 75 | name="cooltools", 76 | author="Open2C", 77 | author_email="open.chromosome.collective@gmail.com", 78 | version=get_version(), 79 | license="MIT", 80 | description="Analysis tools for genomic interaction data stored in .cool format", 81 | long_description=get_long_description(), 82 | long_description_content_type="text/markdown", 83 | keywords=["genomics", "bioinformatics", "Hi-C", "analysis", "cooler"], 84 | url="https://github.com/open2c/cooltools", 85 | zip_safe=False, 86 | classifiers=[s.strip() for s in classifiers.split("\n") if s], 87 | python_requires=">=3.7.1", # same as pandas 88 | packages=packages, 89 | ext_modules=cythonize(extensions), 90 | include_dirs=[np.get_include()], 91 | setup_requires=setup_requires, 92 | install_requires=install_requires, 93 | entry_points={ 94 | "console_scripts": [ 95 | "cooltools = cooltools.cli:cli", 96 | ] 97 | }, 98 | ) 99 | -------------------------------------------------------------------------------- /tests/data/CN.mm9.10000kb.cool: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/CN.mm9.10000kb.cool -------------------------------------------------------------------------------- /tests/data/CN.mm9.1000kb.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/CN.mm9.1000kb.cool -------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_expected.chromnamed.tsv: -------------------------------------------------------------------------------- 1 | region1 region2 dist n_valid count.sum balanced.sum balanced.avg 2 | chr1 chr1 0 50 3 | chr1 chr1 1 49 4 | chr1 chr1 2 48 448255.0 2.3205076553434987 0.04834390948632289 5 | chr1 chr1 3 47 271497.0 1.38339695992966 0.02943397787084383 6 | chr1 chr1 4 46 179491.0 0.900655795691491 0.01957947381938024 7 | chr1 chr1 5 45 135426.0 0.6826130105698165 0.015169178012662588 8 | chr1 chr1 6 44 96841.0 0.48167647260294866 0.010947192559157925 9 | chr1 chr1 7 43 74458.0 0.36747755422094075 0.008545989633045134 10 | chr1 chr1 8 42 56431.0 0.2767897183400133 0.0065902313890479364 11 | chr1 chr1 9 41 46579.0 0.23020444753273792 0.005614742622749705 12 | chr1 chr1 10 40 42800.0 0.21407619204942857 0.005351904801235714 13 | chr1 chr1 11 39 38893.0 0.1931769021342914 0.0049532539008792665 14 | chr1 chr1 12 38 35915.0 0.1760485882134026 0.004632857584563227 15 | chr1 chr1 13 37 31507.0 0.15432815796541483 0.0041710312963625625 16 | chr1 chr1 14 36 28275.0 0.13916825128679033 0.003865784757966398 17 | chr1 chr1 15 35 26582.0 0.13214461875460215 0.0037755605358457756 18 | chr1 chr1 16 34 24080.0 0.1200420079045525 0.0035306472913103674 19 | chr1 chr1 17 33 22554.0 0.1123809167677425 0.0034054823262952274 20 | chr1 chr1 18 32 21069.0 0.10519693902501005 0.003287404344531564 21 | chr1 chr1 19 31 19565.0 0.09730388315158268 0.003138834940373635 22 | chr1 chr1 20 30 18830.0 0.09344118037915836 0.003114706012638612 23 | chr1 chr1 21 29 18180.0 0.09181365603513099 0.003165988139142448 24 | chr1 chr1 22 28 16817.0 0.0857312761411997 0.003061831290757132 25 | chr1 chr1 23 27 15637.0 0.08088906104487427 0.0029958911498101583 26 | chr1 chr1 24 26 13554.0 0.0696931607808895 0.0026805061838803654 27 | chr1 chr1 25 25 12151.0 0.062133968853916574 0.002485358754156663 28 | chr1 chr1 26 24 10641.0 0.053908741063492124 0.002246197544312172 29 | chr1 chr1 27 23 9371.0 0.04780835937733471 0.002078624320753683 30 | chr1 chr1 28 22 8684.0 0.04565538936132342 0.0020752449709692464 31 | chr1 chr1 29 21 7883.0 0.04194264489363847 0.0019972688044589747 32 | chr1 chr1 30 20 7602.0 0.04117335917285604 0.002058667958642802 33 | chr1 chr1 31 19 6783.0 0.03642786791651601 0.0019172562061324217 34 | chr1 chr1 32 18 6220.0 0.033609930607101324 0.0018672183670611847 35 | chr1 chr1 33 17 5752.0 0.03126540105125592 0.0018391412383091717 36 | chr1 chr1 34 16 5236.0 0.02870993254323146 0.0017943707839519663 37 | chr1 chr1 35 15 4806.0 0.026732726358511393 0.0017821817572340928 38 | chr1 chr1 36 14 4562.0 0.025516336044875902 0.0018225954317768502 39 | chr1 chr1 37 13 4484.0 0.025173064987642168 0.001936389614434013 40 | chr1 chr1 38 12 4322.0 0.024324300745100825 0.0020270250620917354 41 | chr1 chr1 39 11 3797.0 0.02095540632794532 0.00190503693890412 42 | chr1 chr1 40 10 3403.0 0.018630663941423948 0.0018630663941423947 43 
| chr1 chr1 41 9 3044.0 0.016810995031025552 0.001867888336780617 44 | chr1 chr1 42 8 2716.0 0.015316241229781234 0.0019145301537226542 45 | chr1 chr1 43 7 2461.0 0.014124488058201323 0.002017784008314475 46 | chr1 chr1 44 6 2060.0 0.011782977088540664 0.0019638295147567774 47 | chr1 chr1 45 5 1629.0 0.009356770724295723 0.0018713541448591446 48 | chr1 chr1 46 4 1325.0 0.007777107004193509 0.0019442767510483773 49 | chr1 chr1 47 3 950.0 0.005574745304582236 0.0018582484348607453 50 | chr1 chr1 48 2 629.0 0.003669007156579109 0.0018345035782895544 51 | chr1 chr1 49 1 326.0 0.0020415942196967394 0.0020415942196967394 52 | chr2 chr2 0 49 53 | chr2 chr2 1 48 54 | chr2 chr2 2 47 450107.0 2.1180050802546933 0.04506393787775943 55 | chr2 chr2 3 46 238644.0 1.1182026520831783 0.02430875330615605 56 | chr2 chr2 4 45 151877.0 0.7065426657897472 0.01570094812866105 57 | chr2 chr2 5 44 105862.0 0.4889639900117408 0.01111281795481229 58 | chr2 chr2 6 43 84565.0 0.3886687958491317 0.00903880920579376 59 | chr2 chr2 7 42 67656.0 0.305587801420597 0.007275900033823738 60 | chr2 chr2 8 41 56605.0 0.2536802573536893 0.006187323350089984 61 | chr2 chr2 9 40 49125.0 0.21940452543596367 0.005485113135899092 62 | chr2 chr2 10 39 43256.0 0.19302073776471373 0.004949249686274711 63 | chr2 chr2 11 38 38908.0 0.17213966992023477 0.0045299913136903885 64 | chr2 chr2 12 37 33613.0 0.1494114335367291 0.00403814685234403 65 | chr2 chr2 13 36 29008.0 0.1286862020151156 0.0035746167226421 66 | chr2 chr2 14 35 28208.0 0.1257340707416353 0.0035924020211895802 67 | chr2 chr2 15 34 26130.0 0.11682046178278417 0.0034358959347877698 68 | chr2 chr2 16 33 24355.0 0.10848220502658447 0.0032873395462601354 69 | chr2 chr2 17 32 21902.0 0.09720413992092795 0.0030376293725289986 70 | chr2 chr2 18 31 19754.0 0.08921457365055102 0.00287788947259842 71 | chr2 chr2 19 30 17506.0 0.0798108423392565 0.00266036141130855 72 | chr2 chr2 20 29 16951.0 0.07831020324831016 0.002700351836148626 73 | chr2 chr2 21 28 16124.0 0.07470713314986098 0.0026681118982093206 74 | chr2 chr2 22 27 16237.0 0.07516147832181286 0.002783758456363439 75 | chr2 chr2 23 26 15583.0 0.07144738725071081 0.0027479764327196466 76 | chr2 chr2 24 25 14864.0 0.06801519019393452 0.0027206076077573808 77 | chr2 chr2 25 24 14174.0 0.06516873511627985 0.002715363963178327 78 | chr2 chr2 26 23 14169.0 0.06554949528961256 0.002849978056070111 79 | chr2 chr2 27 22 13561.0 0.06221042530718225 0.0028277466048719207 80 | chr2 chr2 28 21 12073.0 0.055813578961226296 0.0026577894743441094 81 | chr2 chr2 29 20 11032.0 0.05118868034313225 0.0025594340171566127 82 | chr2 chr2 30 19 10723.0 0.050269590871060296 0.0026457679405821207 83 | chr2 chr2 31 18 10646.0 0.04998712073522266 0.0027770622630679254 84 | chr2 chr2 32 17 10320.0 0.04943531274185869 0.002907959573050511 85 | chr2 chr2 33 16 9664.0 0.04604888783607321 0.0028780554897545755 86 | chr2 chr2 34 15 9227.0 0.04425307710295975 0.0029502051401973164 87 | chr2 chr2 35 14 9111.0 0.04421548066666439 0.0031582486190474567 88 | chr2 chr2 36 13 9923.0 0.04945120961837048 0.003803939201413114 89 | chr2 chr2 37 12 9219.0 0.04674824569212995 0.0038956871410108294 90 | chr2 chr2 38 11 8027.0 0.04077733321358686 0.0037070302921442602 91 | chr2 chr2 39 10 6756.0 0.03230495094148628 0.0032304950941486276 92 | chr2 chr2 40 9 5996.0 0.027699878189309274 0.003077764243256586 93 | chr2 chr2 41 8 5280.0 0.023833680900535406 0.0029792101125669258 94 | chr2 chr2 42 7 4560.0 0.019837282406156377 0.002833897486593768 95 | chr2 chr2 43 6 3911.0 
0.01627847374007839 0.0027130789566797314 96 | chr2 chr2 44 5 3155.0 0.012966661266117605 0.002593332253223521 97 | chr2 chr2 45 4 2335.0 0.008792759755829107 0.0021981899389572766 98 | chr2 chr2 46 3 1518.0 0.005519380548429014 0.0018397935161430046 99 | chr2 chr2 47 2 1142.0 0.003471630969823881 0.0017358154849119406 100 | chr2 chr2 48 1 756.0 0.0019403909506671992 0.0019403909506671992 101 | chr2 chr2 49 0 361.0 0.0 102 | -------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_expected.tsv: -------------------------------------------------------------------------------- 1 | region1 region2 dist n_valid count.sum balanced.sum balanced.avg 2 | foo foo 0 50 3 | foo foo 1 49 4 | foo foo 2 48 448255.0 2.3205076553434987 0.04834390948632289 5 | foo foo 3 47 271497.0 1.38339695992966 0.02943397787084383 6 | foo foo 4 46 179491.0 0.900655795691491 0.01957947381938024 7 | foo foo 5 45 135426.0 0.6826130105698165 0.015169178012662588 8 | foo foo 6 44 96841.0 0.48167647260294866 0.010947192559157925 9 | foo foo 7 43 74458.0 0.36747755422094075 0.008545989633045134 10 | foo foo 8 42 56431.0 0.2767897183400133 0.0065902313890479364 11 | foo foo 9 41 46579.0 0.23020444753273792 0.005614742622749705 12 | foo foo 10 40 42800.0 0.21407619204942857 0.005351904801235714 13 | foo foo 11 39 38893.0 0.1931769021342914 0.0049532539008792665 14 | foo foo 12 38 35915.0 0.1760485882134026 0.004632857584563227 15 | foo foo 13 37 31507.0 0.15432815796541483 0.0041710312963625625 16 | foo foo 14 36 28275.0 0.13916825128679033 0.003865784757966398 17 | foo foo 15 35 26582.0 0.13214461875460215 0.0037755605358457756 18 | foo foo 16 34 24080.0 0.1200420079045525 0.0035306472913103674 19 | foo foo 17 33 22554.0 0.1123809167677425 0.0034054823262952274 20 | foo foo 18 32 21069.0 0.10519693902501005 0.003287404344531564 21 | foo foo 19 31 19565.0 0.09730388315158268 0.003138834940373635 22 | foo foo 20 30 18830.0 0.09344118037915836 0.003114706012638612 23 | foo foo 21 29 18180.0 0.09181365603513099 0.003165988139142448 24 | foo foo 22 28 16817.0 0.0857312761411997 0.003061831290757132 25 | foo foo 23 27 15637.0 0.08088906104487427 0.0029958911498101583 26 | foo foo 24 26 13554.0 0.0696931607808895 0.0026805061838803654 27 | foo foo 25 25 12151.0 0.062133968853916574 0.002485358754156663 28 | foo foo 26 24 10641.0 0.053908741063492124 0.002246197544312172 29 | foo foo 27 23 9371.0 0.04780835937733471 0.002078624320753683 30 | foo foo 28 22 8684.0 0.04565538936132342 0.0020752449709692464 31 | foo foo 29 21 7883.0 0.04194264489363847 0.0019972688044589747 32 | foo foo 30 20 7602.0 0.04117335917285604 0.002058667958642802 33 | foo foo 31 19 6783.0 0.03642786791651601 0.0019172562061324217 34 | foo foo 32 18 6220.0 0.033609930607101324 0.0018672183670611847 35 | foo foo 33 17 5752.0 0.03126540105125592 0.0018391412383091717 36 | foo foo 34 16 5236.0 0.02870993254323146 0.0017943707839519663 37 | foo foo 35 15 4806.0 0.026732726358511393 0.0017821817572340928 38 | foo foo 36 14 4562.0 0.025516336044875902 0.0018225954317768502 39 | foo foo 37 13 4484.0 0.025173064987642168 0.001936389614434013 40 | foo foo 38 12 4322.0 0.024324300745100825 0.0020270250620917354 41 | foo foo 39 11 3797.0 0.02095540632794532 0.00190503693890412 42 | foo foo 40 10 3403.0 0.018630663941423948 0.0018630663941423947 43 | foo foo 41 9 3044.0 0.016810995031025552 0.001867888336780617 44 | foo foo 42 8 2716.0 0.015316241229781234 0.0019145301537226542 45 | foo foo 43 7 2461.0 0.014124488058201323 
0.002017784008314475 46 | foo foo 44 6 2060.0 0.011782977088540664 0.0019638295147567774 47 | foo foo 45 5 1629.0 0.009356770724295723 0.0018713541448591446 48 | foo foo 46 4 1325.0 0.007777107004193509 0.0019442767510483773 49 | foo foo 47 3 950.0 0.005574745304582236 0.0018582484348607453 50 | foo foo 48 2 629.0 0.003669007156579109 0.0018345035782895544 51 | foo foo 49 1 326.0 0.0020415942196967394 0.0020415942196967394 52 | bar bar 0 49 53 | bar bar 1 48 54 | bar bar 2 47 450107.0 2.1180050802546933 0.04506393787775943 55 | bar bar 3 46 238644.0 1.1182026520831783 0.02430875330615605 56 | bar bar 4 45 151877.0 0.7065426657897472 0.01570094812866105 57 | bar bar 5 44 105862.0 0.4889639900117408 0.01111281795481229 58 | bar bar 6 43 84565.0 0.3886687958491317 0.00903880920579376 59 | bar bar 7 42 67656.0 0.305587801420597 0.007275900033823738 60 | bar bar 8 41 56605.0 0.2536802573536893 0.006187323350089984 61 | bar bar 9 40 49125.0 0.21940452543596367 0.005485113135899092 62 | bar bar 10 39 43256.0 0.19302073776471373 0.004949249686274711 63 | bar bar 11 38 38908.0 0.17213966992023477 0.0045299913136903885 64 | bar bar 12 37 33613.0 0.1494114335367291 0.00403814685234403 65 | bar bar 13 36 29008.0 0.1286862020151156 0.0035746167226421 66 | bar bar 14 35 28208.0 0.1257340707416353 0.0035924020211895802 67 | bar bar 15 34 26130.0 0.11682046178278417 0.0034358959347877698 68 | bar bar 16 33 24355.0 0.10848220502658447 0.0032873395462601354 69 | bar bar 17 32 21902.0 0.09720413992092795 0.0030376293725289986 70 | bar bar 18 31 19754.0 0.08921457365055102 0.00287788947259842 71 | bar bar 19 30 17506.0 0.0798108423392565 0.00266036141130855 72 | bar bar 20 29 16951.0 0.07831020324831016 0.002700351836148626 73 | bar bar 21 28 16124.0 0.07470713314986098 0.0026681118982093206 74 | bar bar 22 27 16237.0 0.07516147832181286 0.002783758456363439 75 | bar bar 23 26 15583.0 0.07144738725071081 0.0027479764327196466 76 | bar bar 24 25 14864.0 0.06801519019393452 0.0027206076077573808 77 | bar bar 25 24 14174.0 0.06516873511627985 0.002715363963178327 78 | bar bar 26 23 14169.0 0.06554949528961256 0.002849978056070111 79 | bar bar 27 22 13561.0 0.06221042530718225 0.0028277466048719207 80 | bar bar 28 21 12073.0 0.055813578961226296 0.0026577894743441094 81 | bar bar 29 20 11032.0 0.05118868034313225 0.0025594340171566127 82 | bar bar 30 19 10723.0 0.050269590871060296 0.0026457679405821207 83 | bar bar 31 18 10646.0 0.04998712073522266 0.0027770622630679254 84 | bar bar 32 17 10320.0 0.04943531274185869 0.002907959573050511 85 | bar bar 33 16 9664.0 0.04604888783607321 0.0028780554897545755 86 | bar bar 34 15 9227.0 0.04425307710295975 0.0029502051401973164 87 | bar bar 35 14 9111.0 0.04421548066666439 0.0031582486190474567 88 | bar bar 36 13 9923.0 0.04945120961837048 0.003803939201413114 89 | bar bar 37 12 9219.0 0.04674824569212995 0.0038956871410108294 90 | bar bar 38 11 8027.0 0.04077733321358686 0.0037070302921442602 91 | bar bar 39 10 6756.0 0.03230495094148628 0.0032304950941486276 92 | bar bar 40 9 5996.0 0.027699878189309274 0.003077764243256586 93 | bar bar 41 8 5280.0 0.023833680900535406 0.0029792101125669258 94 | bar bar 42 7 4560.0 0.019837282406156377 0.002833897486593768 95 | bar bar 43 6 3911.0 0.01627847374007839 0.0027130789566797314 96 | bar bar 44 5 3155.0 0.012966661266117605 0.002593332253223521 97 | bar bar 45 4 2335.0 0.008792759755829107 0.0021981899389572766 98 | bar bar 46 3 1518.0 0.005519380548429014 0.0018397935161430046 99 | bar bar 47 2 1142.0 0.003471630969823881 
0.0017358154849119406 100 | bar bar 48 1 756.0 0.0019403909506671992 0.0019403909506671992 101 | bar bar 49 0 361.0 0.0 102 |
-------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_features.bed: --------------------------------------------------------------------------------
1 | chr1 100100000 100150000 2 | chr2 100200000 100250000 3 |
-------------------------------------------------------------------------------- /tests/data/CN.mm9.toy_regions.bed: --------------------------------------------------------------------------------
1 | chr1 100000000 150000000 foo 2 | chr2 100000000 150000000 bar 3 |
-------------------------------------------------------------------------------- /tests/data/dotfinder_mock_inputs.npz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/dotfinder_mock_inputs.npz
-------------------------------------------------------------------------------- /tests/data/dotfinder_mock_res.csv.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/dotfinder_mock_res.csv.gz
-------------------------------------------------------------------------------- /tests/data/make_test_compartments.py: --------------------------------------------------------------------------------
1 | import subprocess 2 | import pandas as pd 3 | import numpy as np 4 | import h5py 5 | 6 |
7 | # make chromsizes
8 | with open("./test.chrom.sizes", "w") as chromsizes: 9 | chromsizes.write("chr1\t1000\n") 10 | chromsizes.write("chr2\t2000\n") 11 | chromsizes.write("chr3\t3000")
12 | 13 | BIN_SIZE = 10 14 | # make bins
15 | subprocess.check_output( 16 | f"cooltools genome binnify ./test.chrom.sizes {BIN_SIZE} > ./test.10.bins", 17 | shell=True, 18 | )
19 | 20 | # make Hi-C data 21 | bins = pd.read_table("./test.10.bins", sep="\t")
22 | EIG_PERIOD_BP = 500 23 | EIG_AMPLITUDE = np.sqrt(0.5) 24 | SCALING = -2 25 | MAX_CIS_COUNTS = 1e8 26 | MAX_TRANS_COUNTS = 1e5
27 | 28 | bins["eig"] = EIG_AMPLITUDE * np.sin(bins.start * 2 * np.pi / EIG_PERIOD_BP) 29 | bins["key"] = 0
30 | pixels = pd.merge(bins, bins, on="key", how="outer", suffixes=("1", "2")) 31 | pixels.drop("key", axis="columns", inplace=True) 32 | pixels["count"] = np.nan
33 | 34 | cis = pixels.chrom1 == pixels.chrom2
35 | pixels.loc[cis, "count"] = pixels[cis].eval( 36 | "@MAX_CIS_COUNTS * ((abs(start1-start2)+@BIN_SIZE)**@SCALING) * (1.0+eig1*eig2)" 37 | )
38 | pixels.loc[~cis, "count"] = pixels[~cis].eval("@MAX_TRANS_COUNTS * (1.0+eig1*eig2)")
39 | 40 | pixels["count"] = pixels["count"].astype(int)
41 | pixels[["chrom1", "start1", "end1", "chrom2", "start2", "end2", "count"]].to_csv( 42 | "./sin_eigs_mat.bg2.gz", sep="\t", index=False, header=None, compression="gzip" 43 | )
44 | 45 | 46 | # make a cooler
47 | subprocess.check_output( 48 | "cooler load -f bg2 --count-as-float --tril-action drop " 49 | + f"./test.chrom.sizes:{BIN_SIZE} ./sin_eigs_mat.bg2.gz " 50 | + "./sin_eigs_mat.cool", 51 | shell=True, 52 | )
53 | 54 | # fake IC: write unit balancing weights, so that "balanced" values equal the raw counts
55 | f = h5py.File("./sin_eigs_mat.cool", "r+") 56 | f["bins/weight"] = np.ones_like(f["bins/start"], dtype=float) 57 | f["bins/weight"].attrs["ignore_diags"] = 2 58 | f.close() 59 |
-------------------------------------------------------------------------------- /tests/data/mm9.chrom.sizes.reduced:
-------------------------------------------------------------------------------- 1 | chr1 197195432 2 | chr2 181748087 3 | chr3 159599783 4 | chr4 155630120 5 | chr5 152537259 6 | chr6 149517037 7 | chr7 152524553 8 | chr8 131738871 9 | chr9 124076172 10 | chr10 129993255 11 | chr11 121843856 12 | chr12 121257530 13 | chr13 120284312 14 | chr14 125194864 15 | chr15 103494974 16 | chr16 98319150 17 | chr17 95272651 18 | chr18 90772031 19 | chr19 61342430 20 | chrX 166650296 21 | chrY 15902555 22 | chrM 16299 23 | -------------------------------------------------------------------------------- /tests/data/mm9.named_nonoverlap_regions.bed: -------------------------------------------------------------------------------- 1 | chr1 0 99000000 chr1_firsthalf 2 | chr1 100000000 197195432 chr1_secondhalf 3 | chr2 0 99000000 chr2_firsthalf 4 | chr2 100000000 181748087 chr2_secondhalf 5 | chr10 0 129993255 chr10_full 6 | -------------------------------------------------------------------------------- /tests/data/sin_eigs_mat.bg2.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/sin_eigs_mat.bg2.gz -------------------------------------------------------------------------------- /tests/data/sin_eigs_mat.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open2c/cooltools/aedd531e80e7940311f941918c6551d3229eaf21/tests/data/sin_eigs_mat.cool -------------------------------------------------------------------------------- /tests/data/test.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 1000 2 | chr2 2000 3 | chr3 3000 -------------------------------------------------------------------------------- /tests/test_call-dots.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | 3 | from click.testing import CliRunner 4 | from cooltools.cli import cli 5 | import cooler 6 | import numpy as np 7 | from cooltools import api 8 | from cooltools.lib.io import read_viewframe_from_file, read_expected_from_file 9 | 10 | 11 | # test user-facing API for calling dots 12 | def test_dots(request): 13 | # Note that call-dots requires ucsc named expected and view 14 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 15 | in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") 16 | in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") 17 | 18 | # read data for the test: 19 | clr = cooler.Cooler(in_cool) 20 | view_df = read_viewframe_from_file(in_regions, clr, check_sorting=True) 21 | expected_df = read_expected_from_file( 22 | in_exp, 23 | expected_value_cols=["balanced.avg"], 24 | verify_view=view_df, 25 | verify_cooler=clr, 26 | ) 27 | 28 | # generate dot-calls 29 | dot_calls_df = api.dotfinder.dots( 30 | clr, 31 | expected_df, 32 | view_df=view_df, 33 | kernels={ 34 | "d": np.array([[1, 0, 1], [0, 0, 0], [1, 0, 1]]), 35 | "v": np.array([[0, 1, 0], [0, 0, 0], [0, 1, 0]]), 36 | "h": np.array([[0, 0, 0], [1, 0, 1], [0, 0, 0]]), 37 | }, 38 | max_loci_separation=100_000_000, 39 | max_nans_tolerated=1, 40 | n_lambda_bins=50, 41 | lambda_bin_fdr=0.1, 42 | clustering_radius=False, 43 | cluster_filtering=None, 44 | tile_size=50_000_000, 45 | nproc=1, 46 | ) 47 | 48 | # no comparison with reference results yet 49 | # just checking if it runs without errors 50 | 
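# (the 3x3 "d"/"v"/"h" kernels above are toy stand-ins for the larger
# donut-style kernels used by default; here we only require that some
# candidate dots come back)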
assert not dot_calls_df.empty 51 |
52 | dot_calls_df_pooled = api.dotfinder.dots( 53 | clr, 54 | expected_df, 55 | view_df=view_df,
56 | kernels={ 57 | "d": np.array([[1, 0, 1], [0, 0, 0], [1, 0, 1]]), 58 | "v": np.array([[0, 1, 0], [0, 0, 0], [0, 1, 0]]), 59 | "h": np.array([[0, 0, 0], [1, 0, 1], [0, 0, 0]]), 60 | },
61 | max_loci_separation=100_000_000, 62 | max_nans_tolerated=1, 63 | n_lambda_bins=50, 64 | lambda_bin_fdr=0.1, 65 | clustering_radius=False, 66 | cluster_filtering=None, 67 | tile_size=50_000_000, 68 | nproc=3, 69 | )
70 | 71 | assert dot_calls_df.equals(dot_calls_df_pooled)
72 | 73 | 74 | def test_call_dots_cli(request, tmpdir):
75 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 76 | in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") 77 | out_dots = op.join(tmpdir, "test.dots")
78 | 79 | runner = CliRunner()
80 | result = runner.invoke( 81 | cli, 82 | [ 83 | "dots", 84 | "-p", 85 | 1, 86 | "--tile-size", 87 | 60_000_000, 88 | "--max-loci-separation", 89 | 100_000_000, 90 | "--output", 91 | out_dots, 92 | in_cool, 93 | in_exp, 94 | ], 95 | )
96 | # This command should fail because the viewframe interpreted from the cooler does not correspond to toy_expected:
97 | assert result.exit_code == 1
98 | 99 | 100 | # This test is commented out for now, until we swap out the input data and/or allow custom kernels
101 | 102 | # def test_call_dots_view_cli(request, tmpdir):
103 | # # Note that call-dots requires a UCSC-named expected and view
104 | # in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 105 | # in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") 106 | # in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") 107 | # out_dots = op.join(tmpdir, "test.dots")
108 | 109 | # runner = CliRunner()
110 | # cmd = [ 111 | # "dots", 112 | # "--view", 113 | # in_regions, 114 | # "-p", 115 | # 1, 116 | # "--tile-size", 117 | # 60_000_000, 118 | # "--max-loci-separation", 119 | # 100_000_000, 120 | # "--output", 121 | # out_dots, 122 | # in_cool, 123 | # in_exp, 124 | # ]
125 | # result = runner.invoke(cli, cmd) 126 | # assert result.exit_code == 0 127 | # # make sure output is generated: 128 | # assert op.isfile(out_dots) 129 | 130 |
-------------------------------------------------------------------------------- /tests/test_coverage.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | import cooler 3 | 4 | import cooltools 5 | import cooltools.api 6 | from numpy import testing 7 | import numpy as np 8 | import pandas as pd
9 | 10 | 11 | def test_coverage_symmetric_upper(request): 12 | # perform test:
13 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
14 | cis_cov, tot_cov = cooltools.api.coverage.coverage( 15 | clr, ignore_diags=2, chunksize=int(1e7) 16 | )
17 | 18 | # Test that the minimal non-zero total coverage is at least 1 (raw counts)
19 | assert tot_cov[tot_cov > 0].min() >= 1
20 | # Check multiprocessed result
21 | cis_cov_pooled, tot_cov_pooled = cooltools.api.coverage.coverage( 22 | clr, ignore_diags=2, chunksize=int(1e7), nproc=3 23 | )
24 | assert np.array_equal(cis_cov, cis_cov_pooled, equal_nan=True) 25 | assert np.array_equal(tot_cov, tot_cov_pooled, equal_nan=True)
26 | 27 | 28 | # Test that dense matrix marginal is the same:
29 | mtx = clr.matrix(balance=False, as_pixels=False)[:] 30 | np.fill_diagonal(mtx, 0) 31 | np.fill_diagonal(mtx[1:, :], 0) 32 | np.fill_diagonal(mtx[:, 1:], 0) 33 | cov_dense =
np.sum(mtx, axis=1) 34 | testing.assert_allclose( 35 | actual=tot_cov, 36 | desired=cov_dense, 37 | equal_nan=True, 38 | ) 39 | 40 | """ generate the following cooler to test coverage: 41 | array([[0, 1, 2], 42 | [1, 0, 0], 43 | [2, 0, 0]], dtype=int32) 44 | """ 45 | 46 | bins = pd.DataFrame( 47 | [["chr1", 0, 1], ["chr1", 1, 2], ["chrX", 1, 2]], 48 | columns=["chrom", "start", "end"], 49 | ) 50 | 51 | pixels = pd.DataFrame( 52 | [[0, 1, 1], [0, 2, 2]], columns=["bin1_id", "bin2_id", "count"] 53 | ) 54 | 55 | clr_file = op.join(request.fspath.dirname, "data/test_coverage.cool") 56 | cooler.create_cooler(clr_file, bins, pixels) 57 | clr = cooler.Cooler(clr_file) 58 | cis_cov, tot_cov = cooltools.coverage(clr, ignore_diags=0, store=True) 59 | assert (cis_cov == np.array([1, 1, 0])).all() 60 | assert (tot_cov == np.array([3, 1, 2])).all() 61 | assert clr.info["cis"] == 1 62 | assert clr.info["sum"] == 3 63 | 64 | def test_balanced_coverage(request): 65 | # perform test: 66 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")) 67 | cis_cov_weight, tot_cov_weight = cooltools.api.coverage.coverage( 68 | clr, ignore_diags=2, chunksize=int(1e7), clr_weight_name="weight" 69 | ) 70 | 71 | # Test that mean total balanced coverage is 1.0 72 | assert np.nanmean(tot_cov_weight) == 1.0 73 | 74 | cis_cov_weight_pooled, tot_cov_weight_pooled = cooltools.api.coverage.coverage( 75 | clr, ignore_diags=2, chunksize=int(1e7), clr_weight_name="weight", nproc=3 76 | ) 77 | assert np.array_equal(cis_cov_weight, cis_cov_weight_pooled, equal_nan=True) 78 | assert np.array_equal(tot_cov_weight, tot_cov_weight_pooled, equal_nan=True) 79 | 80 | # Generate test matrix with weights 81 | bins=pd.DataFrame( 82 | [["chr1", 0, 1, 0.5], 83 | ["chr1", 1, 2, 1], 84 | ["chrX", 1, 2, 0.2], 85 | ["chrX", 2, 3, np.nan]], 86 | columns=["chrom", "start", "end", "weight"], 87 | ) 88 | 89 | pixels = pd.DataFrame( 90 | [[0, 1, 1], [0, 2, 2], [1, 3, 2], [2, 3, 1]], 91 | columns=["bin1_id", "bin2_id", "count"] 92 | ) 93 | 94 | clr_file = op.join(request.fspath.dirname, "data/test_coverage.cool") 95 | cooler.create_cooler(clr_file, bins, pixels) 96 | clr = cooler.Cooler(clr_file) 97 | cis_cov_weight, tot_cov_weight = cooltools.coverage(clr, ignore_diags=0, store=True, clr_weight_name="weight") 98 | assert np.allclose(cis_cov_weight, np.array([0.5, 0.5, 0, np.nan]), 99 | equal_nan=True) 100 | assert np.allclose(tot_cov_weight, np.array([0.7, 0.5, 0.2, np.nan]), 101 | equal_nan=True) 102 | -------------------------------------------------------------------------------- /tests/test_insulation.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from click.testing import CliRunner 6 | from cooltools.cli import cli 7 | 8 | from cooltools.api.insulation import ( 9 | calculate_insulation_score, 10 | find_boundaries, 11 | insul_diamond, 12 | _find_insulating_boundaries_dense, 13 | ) 14 | import cooler 15 | 16 | def test_insulation_cli(request, tmpdir): 17 | 18 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 19 | window = 10_000_000 20 | out_prefix = op.join(tmpdir, "CN.insulation.tsv") 21 | runner = CliRunner() 22 | result = runner.invoke(cli, ["insulation", "-o", out_prefix, in_cool, window]) 23 | assert result.exit_code == 1 24 | 25 | 26 | def test_insulation_cli_nobalance(request, tmpdir): 27 | 28 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 29 | window = 
10_000_000 30 | out_prefix = op.join(tmpdir, "CN.insulation.tsv") 31 | runner = CliRunner() 32 | result = runner.invoke( 33 | cli, 34 | [ 35 | "insulation", 36 | "-o", 37 | out_prefix, 38 | "--clr-weight-name", 39 | "", 40 | "--ignore-diags", 41 | 1, 42 | in_cool, 43 | window, 44 | ], 45 | ) 46 | assert result.exit_code == 1 47 | 48 | 49 | def test_calculate_insulation_score(request): 50 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 51 | clr = cooler.Cooler(clr_path) 52 | windows = [10_000_000, 20_000_000] 53 | 54 | # I. Regular insulation, check presence of columns for each window: 55 | insulation = calculate_insulation_score(clr, windows) 56 | assert {f"log2_insulation_score_{window}" for window in windows}.issubset( 57 | insulation.columns 58 | ) 59 | assert {f"n_valid_pixels_{window}" for window in windows}.issubset( 60 | insulation.columns 61 | ) 62 | # check multiprocessed result 63 | insulation_pooled = calculate_insulation_score(clr, windows, nproc=3) 64 | assert insulation.equals(insulation_pooled) 65 | 66 | # II. Insulation with masking bad bins 67 | insulation = calculate_insulation_score(clr, 10_000_000, min_dist_bad_bin=1) 68 | # All bins closer than 1 to bad bins are filled with np.nans: 69 | assert np.all( 70 | np.isnan(insulation.query("dist_bad_bin==0")["log2_insulation_score_10000000"]) 71 | ) 72 | # Some of the bins at the distance 1 (above threshold) are not np.nans: 73 | assert np.any( 74 | ~np.isnan(insulation.query("dist_bad_bin==1")["log2_insulation_score_10000000"]) 75 | ) 76 | # check multiprocessed result 77 | insulation_pooled = calculate_insulation_score(clr, 10_000_000, min_dist_bad_bin=1, nproc=3) 78 | assert insulation.equals(insulation_pooled) 79 | 80 | # III. Insulation for separate view: 81 | region = pd.DataFrame( 82 | {"chrom": ["chr1"], "start": [0], "end": [10_000_000], "name": ["fragment01"]} 83 | ) 84 | insulation = calculate_insulation_score( 85 | clr, 10_000_000, min_dist_bad_bin=0, view_df=region 86 | ) 87 | assert len(insulation) == 10 88 | # check multiprocessed result 89 | insulation_pooled = calculate_insulation_score( 90 | clr, 10_000_000, min_dist_bad_bin=0, view_df=region, nproc=3 91 | ) 92 | assert insulation.equals(insulation_pooled) 93 | 94 | # IV. Insulation with string or float inputs for window sizes should work. 95 | calculate_insulation_score(clr, '10_000_000') 96 | calculate_insulation_score(clr, '10_000_000', nproc=3) 97 | 98 | 99 | def test_find_boundaries(request): 100 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 101 | clr = cooler.Cooler(clr_path) 102 | windows = [10_000_000, 20_000_000] 103 | 104 | # I. 
Regular boundaries, check presence of columns for each window: 105 | insulation = calculate_insulation_score(clr, windows) 106 | boundaries = find_boundaries(insulation) 107 | assert {f"boundary_strength_{window}" for window in windows}.issubset( 108 | boundaries.columns 109 | ) 110 | 111 | 112 | def test_insul_diamond(request): 113 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 114 | clr = cooler.Cooler(clr_path) 115 | 116 | # Pixel query 117 | from cooltools.lib._query import CSRSelector 118 | 119 | nbins = len(clr.bins()) 120 | chunksize = 10_000 121 | selector = CSRSelector( 122 | clr.open("r"), shape=(nbins, nbins), field="count", chunksize=chunksize 123 | ) 124 | c0 = 0 125 | c1 = 10 126 | pixel_query = selector[c0:c1, c0:c1] 127 | 128 | # Define bins with different weights: 129 | bins = pd.DataFrame( 130 | [ 131 | ["chr1", 0, 1000000, 1, 0.1, 0.01], 132 | ["chr1", 1000000, 2000000, 1, 0.1, 0.01], 133 | ["chr1", 2000000, 3000000, 1, 0.1, 0.01], 134 | ["chr1", 3000000, 4000000, 1, 0.1, 0.01], 135 | ["chr1", 4000000, 5000000, 1, 0.1, 0.01], 136 | ["chr1", 5000000, 6000000, 1, 0.1, 0.01], 137 | ["chr1", 6000000, 7000000, 1, 0.1, 0.01], 138 | ["chr1", 7000000, 8000000, 1, 0.1, 0.01], 139 | ["chr1", 8000000, 9000000, 1, 0.1, 0.01], 140 | ["chr1", 9000000, 10000000, 1, 0.1, 0.01], 141 | ], 142 | columns=["chrom", "start", "end", "weight", "weight_cis", "weight_trans"], 143 | ) 144 | 145 | # Run insul_diamond: 146 | score, n_pixels, sum_balanced, sum_counts = insul_diamond( 147 | pixel_query, 148 | bins, 149 | window=3, 150 | ignore_diags=2, 151 | norm_by_median=False, 152 | clr_weight_name="weight", 153 | ) 154 | 155 | assert np.allclose(sum_balanced, sum_counts) 156 | 157 | score, n_pixels, sum_balanced, sum_counts = insul_diamond( 158 | pixel_query, 159 | bins, 160 | window=3, 161 | ignore_diags=2, 162 | norm_by_median=False, 163 | clr_weight_name="weight_cis", 164 | ) 165 | 166 | assert np.allclose(sum_balanced, 0.01 * sum_counts) 167 | 168 | 169 | def test_insulation_sparse_vs_dense(request): 170 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 171 | clr = cooler.Cooler(clr_path) 172 | insul_dense = _find_insulating_boundaries_dense( 173 | clr, 174 | 10_000_000, 175 | clr_weight_name="weight", 176 | min_dist_bad_bin=0, 177 | ignore_diags=2, 178 | ) 179 | 180 | insulation_sparse = calculate_insulation_score( 181 | clr, 10_000_000, clr_weight_name="weight", min_dist_bad_bin=0, ignore_diags=2 182 | ) 183 | boundaries_sparse = find_boundaries(insulation_sparse) 184 | 185 | assert np.allclose( 186 | insul_dense["log2_insulation_score_10000000"], 187 | boundaries_sparse["log2_insulation_score_10000000"], 188 | equal_nan=True, 189 | ) -------------------------------------------------------------------------------- /tests/test_io.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import pandas as pd 3 | from cooltools.lib.io import read_expected_from_file, read_viewframe_from_file 4 | from cooltools.lib import is_valid_expected 5 | import bioframe 6 | import pytest 7 | 8 | 9 | def test_read_expected_from_file(request, tmpdir): 10 | 11 | expected_file = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") 12 | expected_df = read_expected_from_file(expected_file, expected_value_cols=["balanced.avg"]) 13 | 14 | assert is_valid_expected( 15 | expected_df, "cis", expected_value_cols=["balanced.avg"] 16 | ) 17 | 18 | # test for error when string in one row of "n_valid" 
column (supposed to be Int64 dtype): 19 | expected_df_wrongdtype = expected_df.copy()
20 | expected_df_wrongdtype["n_valid"] = expected_df_wrongdtype["n_valid"].astype(str) 21 | expected_df_wrongdtype.loc[0,"n_valid"] = "string"
22 | expected_df_wrongdtype.to_csv(op.join(tmpdir, "CN.mm9.toy_expected_wrongdtype.tsv"), 23 | sep="\t", index=False)
24 | with pytest.raises(ValueError): 25 | read_expected_from_file( 26 | op.join(tmpdir, "CN.mm9.toy_expected_wrongdtype.tsv"), 27 | expected_value_cols=["balanced.avg"], 28 | )
29 | 30 | # test that read_expected_from_file works if chroms are a mix of str and int
31 | expected_df_intchr = expected_df.copy() 32 | expected_df_intchr["region1"] = expected_df_intchr["region1"].str.replace('chr1','1') 33 | expected_df_intchr["region2"] = expected_df_intchr["region2"].str.replace('chr1','1')
34 | expected_df_intchr.to_csv(op.join(tmpdir, "CN.mm9.toy_expected_intchr.tsv"), 35 | sep="\t", index=False)
36 | expected_df_intchr = read_expected_from_file(op.join(tmpdir, "CN.mm9.toy_expected_intchr.tsv"), 37 | expected_value_cols=["balanced.avg"])
38 | assert is_valid_expected( 39 | expected_df_intchr, "cis", expected_value_cols=["balanced.avg"] 40 | )
41 | 42 | 43 | def test_read_viewframe_from_file(request, tmpdir):
44 | 45 | # test a viewframe with 4 columns - i.e. with unique names
46 | view_file_wnames = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") 47 | view_df = read_viewframe_from_file(view_file_wnames, verify_cooler=None, check_sorting=False) 48 | assert bioframe.is_viewframe(view_df)
49 | 50 | # test a viewframe with 3 columns - i.e. without unique names
51 | view_file_wonames = op.join(request.fspath.dirname, "data/CN.mm9.toy_features.bed") 52 | view_df = read_viewframe_from_file(view_file_wonames, verify_cooler=None, check_sorting=False) 53 | assert bioframe.is_viewframe(view_df)
54 | # for a 3-column viewframe, UCSC strings should be assigned as names
55 | assert view_df["name"].apply(bioframe.is_complete_ucsc_string).all() 56 |
-------------------------------------------------------------------------------- /tests/test_lazy_toeplitz.py: --------------------------------------------------------------------------------
1 | from scipy.linalg import toeplitz 2 | import numpy as np 3 | from cooltools.lib.numutils import LazyToeplitz
4 | 5 | 6 | n = 100 7 | m = 150 8 | c = np.arange(1, n + 1) 9 | r = np.r_[1, np.arange(-2, -m, -1)]
10 | 11 | L = LazyToeplitz(c, r) 12 | T = toeplitz(c, r)
13 | 14 | 15 | def test_symmetric(): 16 | for si in [ 17 | slice(10, 20), 18 | slice(0, 150), 19 | slice(0, 0), 20 | slice(150, 150), 21 | slice(10, 10), 22 | ]: 23 | assert np.allclose(L[si, si], T[si, si])
24 | 25 | 26 | def test_triu_no_overlap(): 27 | for si, sj in [ 28 | (slice(10, 20), slice(30, 40)), 29 | (slice(10, 15), slice(30, 40)), 30 | (slice(10, 20), slice(30, 45)), 31 | ]: 32 | assert np.allclose(L[si, sj], T[si, sj])
33 | 34 | 35 | def test_tril_no_overlap(): 36 | for si, sj in [ 37 | (slice(30, 40), slice(10, 20)), 38 | (slice(30, 40), slice(10, 15)), 39 | (slice(30, 45), slice(10, 20)), 40 | ]: 41 | assert np.allclose(L[si, sj], T[si, sj])
42 | 43 | 44 | def test_triu_with_overlap(): 45 | for si, sj in [ 46 | (slice(10, 20), slice(15, 25)), 47 | (slice(13, 22), slice(15, 25)), 48 | (slice(10, 20), slice(18, 22)), 49 | ]: 50 | assert np.allclose(L[si, sj], T[si, sj])
51 | 52 | 53 | def test_tril_with_overlap(): 54 | for si, sj in [ 55 | (slice(15, 25), slice(10, 20)), 56 | (slice(15, 22), slice(10, 20)), 57 | (slice(15, 25), slice(10, 18)), 58 |
]: 59 | assert np.allclose(L[si, sj], T[si, sj])
60 | 61 | 62 | def test_nested(): 63 | for si, sj in [ 64 | (slice(10, 40), slice(20, 30)), 65 | (slice(10, 35), slice(20, 30)), 66 | (slice(10, 40), slice(20, 25)), 67 | (slice(20, 30), slice(10, 40)), 68 | ]: 69 | assert np.allclose(L[si, sj], T[si, sj]) 70 |
-------------------------------------------------------------------------------- /tests/test_lib_common.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | import pandas as pd 3 | import cooler 4 | import cooltools 5 | import pytest
6 | 7 | # TODO: tests for 8 | # assign_supports, or assign_regions, or deprecate & remove both 9 | # assign_regions_to_bins 10 | # make_cooler_view 11 | # view_from_track
12 | 13 | 14 | def test_align_track_with_cooler(request, tmpdir):
15 | 16 | clr_file = op.join(request.fspath.dirname, "data/sin_eigs_mat.cool") 17 | clr = cooler.Cooler(clr_file)
18 | 19 | # valid track with three entries that can all be aligned
20 | track = pd.DataFrame( 21 | [ 22 | ["chr1", 990, 995, 22], 23 | ["chr2", 20, 30, -1], 24 | ["chr3", 0, 10, 0.1], 25 | ], 26 | columns=["chrom", "start", "end", "value"], 27 | )
28 | assert ( 29 | ~cooltools.lib.align_track_with_cooler(track, clr)["value"].isna() 30 | ).sum() == 3
31 | 32 | # not a track, is not sorted
33 | track = pd.DataFrame( 34 | [["chr3", 0, 10, 0.1], ["chr2", 20, 30, -1], ["chr2", 0, 10, 21]], 35 | columns=["chrom", "start", "end", "value"], 36 | )
37 | with pytest.raises(ValueError): 38 | cooltools.lib.align_track_with_cooler(track, clr)
39 | 40 | # not a track, is overlapping
41 | track = pd.DataFrame( 42 | [ 43 | ["chr1", 990, 1000, 22], 44 | ["chr2", 5, 15, 0.1], 45 | ["chr2", 20, 30, -1], 46 | ], 47 | columns=["chrom", "start", "end", "value"], 48 | )
49 | with pytest.raises(ValueError): 50 | cooltools.lib.align_track_with_cooler(track, clr)
51 | 52 | # bin size mismatch
53 | track = pd.DataFrame( 54 | [["chr1", 990, 995, 22], ["chr2", 20, 25, -1], ["chr3", 0, 5, 0.1]], 55 | columns=["chrom", "start", "end", "value"], 56 | )
57 | with pytest.raises(ValueError): 58 | cooltools.lib.align_track_with_cooler(track, clr)
59 | 60 | # clr_weight_name mismatch
61 | track = pd.DataFrame( 62 | [ 63 | ["chr1", 990, 995, 22], 64 | ["chr2", 20, 30, -1], 65 | ["chr3", 0, 10, 0.1], 66 | ], 67 | columns=["chrom", "start", "end", "value"], 68 | )
69 | with pytest.raises(ValueError): 70 | cooltools.lib.align_track_with_cooler( 71 | track, clr, clr_weight_name="invalid_weight_name" 72 | )
73 | 74 | # regions with no assigned values
75 | track = pd.DataFrame( 76 | [["chr1", 0, 10, 0.1], ["chr1", 20, 30, -1], ["chr1", 990, 995, 22]], 77 | columns=["chrom", "start", "end", "value"], 78 | )
79 | with pytest.raises(ValueError): 80 | cooltools.lib.align_track_with_cooler(track, clr)
81 | 82 | # using a restricted view only considers chr1, avoiding the ValueError from no assigned values
83 | view_df = cooltools.lib.make_cooler_view(clr)
84 | assert ( 85 | ~cooltools.lib.align_track_with_cooler(track, clr, view_df=view_df[:1])[ 86 | "value" 87 | ].isna() 88 | ).sum() == 3
89 | 90 | # testing the mask_clr_bad_bins option
91 | clr_file = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 92 | clr = cooler.Cooler(clr_file) 93 | view_df = cooltools.lib.make_cooler_view(clr)[:1]
94 | 95 | track = pd.DataFrame( 96 | [["chr1", 0, 1000000, 1], ["chr1", 3000000, 4000000, 10]], 97 | columns=["chrom", "start", "end", "value"], 98 | )
99 | # without masking, both get assigned
100 | assert ( 101 |
cooltools.lib.align_track_with_cooler( 102 | track, clr, view_df=view_df, mask_clr_bad_bins=False 103 | )["value"].sum() 104 | == 11 105 | )
106 | 107 | # with masking, only the second value from the track gets assigned
108 | assert ( 109 | cooltools.lib.align_track_with_cooler( 110 | track, clr, view_df=view_df, mask_clr_bad_bins=True 111 | )["value"].sum() 112 | == 10 113 | ) 114 |
-------------------------------------------------------------------------------- /tests/test_rearrange_cooler.py: --------------------------------------------------------------------------------
1 | import cooler 2 | import bioframe 3 | import os.path as op
4 | 5 | import numpy as np
6 | 7 | from cooltools.api.rearrange import rearrange_cooler 8 | from pandas.testing import assert_frame_equal
9 | 10 | 11 | def test_rearrange_cooler(request): 12 | # Read cool file and create view_df out of it:
13 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.10000kb.cool")) 14 | orig_view = bioframe.make_viewframe(clr.chromsizes)
15 | 16 | # I. 17 | # Check that with the same view, nothing changes
18 | rearrange_cooler(clr, orig_view, "test_not_reordered.cool") 19 | new_clr = cooler.Cooler("test_not_reordered.cool")
20 | assert_frame_equal(new_clr.pixels()[:], clr.pixels()[:]) 21 | assert_frame_equal(new_clr.bins()[:], clr.bins()[:]) 22 | assert_frame_equal(new_clr.chroms()[:], clr.chroms()[:])
23 | 24 | # II. 25 | # Check that when just getting one chrom, all is as expected
26 | new_view = orig_view.iloc[:1, :] 27 | rearrange_cooler(clr, new_view, "test_chrom1_reordered.cool") 28 | new_clr = cooler.Cooler("test_chrom1_reordered.cool")
29 | old_bins = clr.bins()[:].query('chrom=="chr1"') 30 | old_bins["chrom"] = old_bins["chrom"].astype(str) 31 | new_bins = new_clr.bins()[:] 32 | new_bins["chrom"] = new_bins["chrom"].astype(str) 33 | assert_frame_equal(old_bins, new_bins)
34 | 35 | old_pixels = clr.matrix(as_pixels=True).fetch("chr1").drop(columns=["balanced"]) 36 | new_pixels = new_clr.pixels()[:] 37 | assert_frame_equal(old_pixels, new_pixels) 38 | assert_frame_equal(clr.chroms()[:1], new_clr.chroms()[:])
39 | 40 | # III. 41 | # Check that when just getting one chrom and inverting it, all is as expected
42 | inverted_view = new_view.copy() 43 | inverted_view["strand"] = "-" 44 | rearrange_cooler(clr, inverted_view, "test_chrom1_reordered_inverted.cool") 45 | inverted_clr = cooler.Cooler("test_chrom1_reordered_inverted.cool")
46 | inverted_bins = inverted_clr.bins()[:] 47 | inverted_bins[["end", "start"]] = ( 48 | inverted_bins.iloc[-1]["end"] - inverted_bins[["start", "end"]] 49 | )
50 | inverted_bins = inverted_bins.iloc[::-1].reset_index(drop=True) 51 | inverted_bins["chrom"] = inverted_bins["chrom"].astype(str) 52 | assert_frame_equal(new_bins, inverted_bins)
53 | inverted_pixels = inverted_clr.pixels()[:] 54 | inverted_pixels[["bin1_id", "bin2_id"]] = np.sort( 55 | inverted_bins.index[-1] - inverted_pixels[["bin1_id", "bin2_id"]] 56 | )
57 | inverted_pixels = inverted_pixels.sort_values(["bin1_id", "bin2_id"]).reset_index( 58 | drop=True 59 | )
60 | assert_frame_equal(new_clr.pixels()[:], inverted_pixels) 61 | assert_frame_equal(new_clr.chroms()[:1], inverted_clr.chroms()[:])
62 | 63 | # IV.
64 | # Check that when taking two chromosomes in a different order and inverting one,
65 | # all is as expected
66 | 67 | reorder_invert_view = ( 68 | orig_view.iloc[1::-1].assign(strand=["+", "-"]).reset_index(drop=True) 69 | )
70 | rearrange_cooler(clr, reorder_invert_view, "test_chr2chr1_reordered_inverted.cool") 71 | reordered_inverted_clr = cooler.Cooler("test_chr2chr1_reordered_inverted.cool")
72 | 73 | # compare chr2 bins
74 | old_bins_chr2 = clr.bins().fetch("chr2").reset_index(drop=True) 75 | old_bins_chr2["chrom"] = old_bins_chr2["chrom"].astype(str)
76 | reordered_inverted_bins_chr2 = reordered_inverted_clr.bins().fetch("chr2") 77 | reordered_inverted_bins_chr2["chrom"] = reordered_inverted_bins_chr2[ 78 | "chrom" 79 | ].astype(str)
80 | assert_frame_equal(old_bins_chr2, reordered_inverted_bins_chr2)
81 | # compare chr2 pixels
82 | old_pixels_chr2 = ( 83 | clr.pixels() 84 | .fetch("chr2") 85 | .query(f'bin2_id<={clr.bins().fetch("chr2").index[-1]}') 86 | .reset_index(drop=True) 87 | )
88 | reordered_inverted_pixels_chr2 = ( 89 | reordered_inverted_clr.pixels() 90 | .fetch("chr2") 91 | .query(f'bin2_id<={reordered_inverted_clr.bins().fetch("chr2").index[-1]}') 92 | .reset_index(drop=True) 93 | )
94 | reordered_inverted_pixels_chr2[["bin1_id", "bin2_id"]] += ( 95 | clr.bins().fetch("chr1").index[-1] + 1 96 | )
97 | assert_frame_equal(old_pixels_chr2, reordered_inverted_pixels_chr2)
98 | # Compare chr1 bins
99 | old_bins_chr1 = clr.bins().fetch("chr1") 100 | old_bins_chr1["chrom"] = old_bins_chr1["chrom"].astype(str)
101 | 102 | reordered_inverted_bins_chr1 = reordered_inverted_clr.bins().fetch("chr1")
103 | reordered_inverted_bins_chr1[["end", "start"]] = ( 104 | reordered_inverted_bins_chr1.iloc[-1]["end"] 105 | - reordered_inverted_bins_chr1[["start", "end"]] 106 | )
107 | reordered_inverted_bins_chr1.index = ( 108 | reordered_inverted_bins_chr1.index[::-1] - old_bins_chr1.index[-1] 109 | )
110 | reordered_inverted_bins_chr1 = reordered_inverted_bins_chr1.iloc[::-1]
111 | reordered_inverted_bins_chr1["chrom"] = reordered_inverted_bins_chr1[ 112 | "chrom" 113 | ].astype(str)
114 | 115 | assert_frame_equal(old_bins_chr1, reordered_inverted_bins_chr1)
116 | # Compare chr1 pixels
117 | old_pixels_chr1 = ( 118 | clr.pixels() 119 | .fetch("chr1") 120 | .query(f'bin2_id<={clr.bins().fetch("chr1").index[-1]}') 121 | .reset_index(drop=True) 122 | )
123 | reordered_inverted_pixels_chr1 = reordered_inverted_clr.pixels().fetch("chr1")
124 | 125 | reordered_inverted_pixels_chr1[["bin1_id", "bin2_id"]] = np.sort( 126 | reordered_inverted_bins_chr1.index[-1] 127 | - reordered_inverted_pixels_chr1[["bin1_id", "bin2_id"]] 128 | + reordered_inverted_bins_chr2.index[-1] 129 | + 1 130 | )
131 | reordered_inverted_pixels_chr1 = reordered_inverted_pixels_chr1.sort_values(["bin1_id", "bin2_id"]).reset_index(drop=True)
132 | assert_frame_equal(old_pixels_chr1, reordered_inverted_pixels_chr1)
133 | # Compare trans matrix (easier than pixels)
134 | old_trans_m = clr.matrix().fetch("chr1", "chr2")
135 | reordered_inverted_trans_m = reordered_inverted_clr.matrix().fetch("chr1", "chr2")[ 136 | ::-1, : 137 | ]
138 | assert np.array_equal(old_trans_m, reordered_inverted_trans_m, equal_nan=True) 139 |
-------------------------------------------------------------------------------- /tests/test_sample.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | import cooler
3 | 4 | import cooltools 5 | import cooltools.api 6 | from numpy import testing
7 | 8 | 9 | def test_sample(request): 10 | #
perform test: 11 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
12 | 13 | cooltools.api.sample.sample( 14 | clr, 15 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"), 16 | frac=0.2, 17 | nproc=3 18 | )
19 | clr_result = cooler.Cooler( 20 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool") 21 | )
22 | # Test that the deviation from the expected total is very small
23 | testing.assert_allclose(clr_result.info["sum"], clr.info["sum"] / 5, rtol=1e-3)
24 | 25 | cooltools.api.sample.sample( 26 | clr, 27 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"), 28 | count=20000000, 29 | nproc=3 30 | )
31 | clr_result = cooler.Cooler( 32 | op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool") 33 | )
34 | # Test that the deviation from the expected total is very small
35 | testing.assert_allclose(clr_result.info["sum"], 20000000, rtol=1e-2)
36 | 37 | 38 | def test_sample_exact(request): 39 | # Exact sampling is very slow, so test it on the coarse 10000kb cooler
40 | clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.10000kb.cool"))
41 | 42 | cooltools.api.sample.sample( 43 | clr, 44 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"), 45 | frac=0.2, 46 | exact=True, 47 | nproc=3 48 | )
49 | clr_result = cooler.Cooler( 50 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool") 51 | )
52 | # Test that result matches expectation exactly
53 | testing.assert_equal(clr_result.info["sum"], round(clr.info["sum"] * 0.2))
54 | 55 | cooltools.api.sample.sample( 56 | clr, 57 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"), 58 | count=2000000, 59 | exact=True, 60 | nproc=3 61 | )
62 | clr_result = cooler.Cooler( 63 | op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool") 64 | )
65 | # Test that result matches expectation exactly
66 | testing.assert_equal(clr_result.info["sum"], 2000000) 67 |
-------------------------------------------------------------------------------- /tests/test_virtual4c.py: --------------------------------------------------------------------------------
1 | import os.path as op 2 | 3 | from click.testing import CliRunner 4 | from cooltools.cli import cli
5 | 6 | from cooltools.api import virtual4c 7 | import cooler
8 | 9 | 10 | def test_virtual4c(request):
11 | clr_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 12 | clr = cooler.Cooler(clr_path) 13 | viewpoint = "chr1:30000000-40000000"
14 | 15 | v4c = virtual4c.virtual4c(clr, viewpoint)
16 | 17 | assert v4c.shape[0] == clr.bins()[:].shape[0]
18 | 19 | # check multiprocessed result
20 | pooled_v4c = virtual4c.virtual4c(clr, viewpoint, nproc=3) 21 | assert v4c.equals(pooled_v4c)
22 | 23 | 24 | def test_virtual4c_cli(request, tmpdir):
25 | 26 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 27 | out_prefix = op.join(tmpdir, "CN.virtual4c") 28 | viewpoint = "chr1:30000000-40000000"
29 | 30 | runner = CliRunner() 31 | result = runner.invoke(cli, ["virtual4c", "-o", out_prefix, in_cool, viewpoint]) 32 | assert result.exit_code == 0
33 | 34 | 35 | def test_virtual4c_cli_nobalance(request, tmpdir):
36 | 37 | in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") 38 | out_prefix = op.join(tmpdir, "CN.virtual4c") 39 | viewpoint = "chr1:30000000-40000000"
40 | 41 | runner = CliRunner() 42 | result = runner.invoke( 43 | cli, 44 | ["virtual4c", "--clr-weight-name", "", "-o", out_prefix, in_cool, viewpoint], 45 | ) 46 |
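    # an empty --clr-weight-name should request raw (unbalanced) counts rather
    # than erroring out, so the command is still expected to succeed: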
assert result.exit_code == 0 47 | --------------------------------------------------------------------------------
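A minimal usage sketch (not a repository file) tying the bundled test data to the public API that the tests above exercise; the relative paths and the top-level cooltools.coverage / cooltools.insulation aliases are assumptions inferred from this section:

import cooler
import cooltools
from cooltools.api import virtual4c

# open the bundled 1 Mb test cooler (path relative to the repository root)
clr = cooler.Cooler("tests/data/CN.mm9.1000kb.cool")

# per-bin cis and total coverage, as exercised by tests/test_coverage.py
cis_cov, tot_cov = cooltools.coverage(clr, ignore_diags=2)

# log2 insulation scores for a 10 Mb diamond window, as in tests/test_insulation.py
insulation_df = cooltools.insulation(clr, [10_000_000])

# a virtual-4C profile anchored at a fixed viewpoint, as in tests/test_virtual4c.py
v4c_df = virtual4c.virtual4c(clr, "chr1:30000000-40000000")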