├── .github └── workflows │ ├── pylint.yml │ ├── python-publish.yml │ └── test.yml ├── .gitignore ├── .readthedocs.yml ├── HiCMatrix_env_ci.yml ├── LICENSE ├── README.rst ├── hicmatrix ├── HiCMatrix.py ├── __init__.py ├── lib │ ├── __init__.py │ ├── cool.py │ ├── ginteractions.py │ ├── h5.py │ ├── hicpro.py │ ├── homer.py │ ├── matrixFile.py │ ├── matrixFileHandler.py │ └── scool.py ├── test │ ├── test_HiCMatrix.py │ ├── test_data │ │ ├── GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool │ │ ├── GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool │ │ ├── Li_et_al_2015.cool │ │ ├── Li_et_al_2015.h5 │ │ ├── one_interaction_4chr.cool │ │ ├── one_interaction_diag_4chr.cool │ │ ├── small_test_matrix.h5 │ │ ├── test_matrix.bed │ │ ├── test_matrix.hicpro │ │ ├── test_matrix.homer │ │ └── test_matrix.homer.gz │ └── test_matrixFileHandler.py └── utilities.py ├── pyproject.toml └── pytest.ini /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: mamba-org/setup-micromamba@main 11 | with: 12 | environment-file: ./HiCMatrix_env_ci.yml 13 | cache-downloads: true 14 | environment-name: HiCMatrix_env_ci 15 | - name: Analysing the code with pylint 16 | run: | 17 | # Disable 18 | # C0103: Invalid name 19 | # C0114: Missing module docstring 20 | # C0115: Missing class docstring 21 | # C0116: Missing function or method docstring 22 | # C0301: Line too long 23 | # C0302: Too many lines in module 24 | # R0801: Similar lines 25 | # R0902: Too many instance attributes 26 | # R0904: Too many public methods 27 | # R0912: Too many branches 28 | # R0913: Too many arguments 29 | # R0914: Too many local variables 30 | # R0915: Too many statements 31 | # R1702: Too many nested blocks 32 | # R1728: Consider using a generator 33 | pylint --disable 
C0103,C0114,C0115,C0116,C0301,C0302,R0801,R0902,R0904,R0912,R0913,R0914,R0915,R1702,R1728 $(git ls-files '*.py') 34 | shell: micromamba-shell {0} 35 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | permissions: 23 | # IMPORTANT: this permission is mandatory for trusted publishing 24 | id-token: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python 28 | uses: actions/setup-python@v3 29 | with: 30 | python-version: '3.8' 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install build 35 | - name: Build package 36 | run: python -m build 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@v1.8.14 39 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: [push, pull_request] 3 | 4 | defaults: 5 | run: 6 | shell: bash -l {0} 7 | 8 | jobs: 9 | test-linux: 10 | name: Test on Linux 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: 15 | - "3.7" 16 | - "3.8" 17 | - "3.9" 18 | - "3.10" 19 | steps: 20 | - uses: 
actions/checkout@v4 21 | - name: Use python ${{ matrix.python-version }} 22 | run: echo -e "\n - python = ${{ matrix.python-version }}" >> ./HiCMatrix_env_ci.yml 23 | - uses: mamba-org/setup-micromamba@main 24 | with: 25 | environment-file: ./HiCMatrix_env_ci.yml 26 | cache-downloads: true 27 | environment-name: HiCMatrix_env_ci 28 | - name: pip install 29 | run: | 30 | python3 -m pip install . 31 | shell: micromamba-shell {0} 32 | - name: Test HiCMatrix 33 | run: | 34 | py.test hicmatrix/test/ --capture=sys 35 | shell: micromamba-shell {0} 36 | test-osx: 37 | name: Test on OSX 38 | runs-on: macos-12 # macos-12 is Intel-based and supported by bioconda; macOS-latest is Apple silicon. 39 | strategy: 40 | matrix: 41 | python-version: 42 | - "3.8" 43 | - "3.9" 44 | - "3.10" 45 | steps: 46 | - uses: actions/checkout@v4 47 | - name: Use python ${{ matrix.python-version }} 48 | run: echo -e "\n - python = ${{ matrix.python-version }}" >> ./HiCMatrix_env_ci.yml 49 | - uses: mamba-org/setup-micromamba@main 50 | with: 51 | environment-file: ./HiCMatrix_env_ci.yml 52 | cache-downloads: true 53 | environment-name: HiCMatrix_env_ci 54 | - name: pip install 55 | run: | 56 | python3 -m pip install . 
57 | shell: micromamba-shell {0} 58 | - name: Test HiCMatrix 59 | run: | 60 | py.test hicmatrix/test/ --capture=sys 61 | shell: micromamba-shell {0} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | db.sqlite3 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | 
.mypy_cache/ 104 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # yaml file to configure readthedocs build 2 | python: 3 | setup_py_install: true 4 | pip_install: False 5 | -------------------------------------------------------------------------------- /HiCMatrix_env_ci.yml: -------------------------------------------------------------------------------- 1 | name: HiCMatrix_env_ci 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - numpy >= 1.20 7 | - scipy >= 1.2 8 | - pandas >= 0.25 9 | - pytables >= 3.5 10 | - cooler >= 0.8.9 11 | - intervaltree >= 3.0 12 | - pytest 13 | - pylint 14 | - pytest-xdist 15 | - pytest-forked 16 | - nose 17 | - pathlib 18 | - configparser 19 | - build # For the upload 20 | - twine # For the upload -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. 
You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. 
The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 
91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 
122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 
155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 
186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. 
This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | HiCMatrix 2 | =========== 3 | 4 | This library implements the central class of HiCExplorer to manage Hi-C interaction matrices. It is separated from the main project to enable the use of Hi-C matrices 5 | in other projects without the dependency on HiCExplorer. Moreover, it enables us to use the already separated pyGenomeTracks (former hicPlotTADs) in HiCExplorer 6 | because mutual dependencies are resolved. 7 | 8 | With version 8, we dropped the support for Python 2. 
9 | 10 | Version 14 introduced the official support for scool file format, used by scHiCExplorer since version 5: https://github.com/joachimwolff/scHiCExplorer and https://schicexplorer.readthedocs.io/en/latest/. 11 | 12 | Read support 13 | ------------- 14 | 15 | - h5 16 | - cool / mcool / scool 17 | - hicpro 18 | - homer 19 | 20 | Write support 21 | -------------- 22 | 23 | - h5 24 | - cool / mcool 25 | - scool 26 | - homer 27 | - ginteractions 28 | - hicpro 29 | 30 | Citation: 31 | ^^^^^^^^^ 32 | 33 | Joachim Wolff, Leily Rabbani, Ralf Gilsbach, Gautier Richard, Thomas Manke, Rolf Backofen, Björn A Grüning. 34 | **Galaxy HiCExplorer 3: a web server for reproducible Hi-C, capture Hi-C and single-cell Hi-C data analysis, quality control and visualization, Nucleic Acids Research**, Volume 48, Issue W1, 02 July 2020, Pages W177–W184, https://doi.org/10.1093/nar/gkaa220 35 | -------------------------------------------------------------------------------- /hicmatrix/HiCMatrix.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import Counter 3 | from collections import OrderedDict 4 | 5 | import time 6 | 7 | import cooler 8 | import numpy as np 9 | from intervaltree import Interval, IntervalTree 10 | from scipy.sparse import csr_matrix, dia_matrix 11 | from scipy.sparse import hstack as sparse_hstack 12 | from scipy.sparse import tril, triu 13 | from scipy.sparse import vstack as sparse_vstack 14 | from scipy.sparse import diags 15 | from scipy.sparse import lil_matrix 16 | 17 | from .lib import MatrixFileHandler 18 | from .utilities import check_chrom_str_bytes, toBytes, toString 19 | 20 | log = logging.getLogger(__name__) 21 | 22 | class hiCMatrix: 23 | """ 24 | Class to handle Hi-C matrices 25 | contains routines to get intrachromosomal distances 26 | get sub matrices by chrname. 
27 | """ 28 | 29 | def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None, pNoIntervalTree=None, pUpperTriangleOnly=None, 30 | pMatrixFormat=None, pRestoreMaskedBins=None, pLoadMatrixOnly=None): 31 | self.non_homogeneous_warning_already_printed = False 32 | self.bin_size = None 33 | self.bin_size_homogeneous = None # track if the bins are equally spaced or not 34 | self.uncorrected_matrix = None 35 | 36 | self.matrix = None 37 | self.cut_intervals = None 38 | self.nan_bins = None 39 | self.correction_factors = None 40 | self.distance_counts = None 41 | # # when NaN bins are masked, this variable becomes contains the bin index 42 | # # needed to put the masked bins back into the matrix. 43 | self.orig_bin_ids = [] 44 | self.orig_cut_intervals = [] # similar to orig_bin_ids. Used to identify the position of masked nan bins 45 | self.matrixFileHandler = None 46 | start_time = time.time() 47 | if pMatrixFile is not None: 48 | log.debug('Load self.matrixFileHandler') 49 | fileType = 'cool' 50 | if pMatrixFile.endswith('.h5'): 51 | fileType = 'h5' 52 | self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList, pDistance=pDistance, pMatrixFormat=pMatrixFormat, pLoadMatrixOnly=pLoadMatrixOnly) 53 | log.debug('init time: %s', time.time() - start_time) 54 | matrixFileHandler_load = self.matrixFileHandler.load() 55 | # check if there was any exception thrown in the load function 56 | if len(matrixFileHandler_load) == 2: 57 | raise ValueError(f'Matrix failed to load: {matrixFileHandler_load[1]}') 58 | self.matrix, self.cut_intervals, self.nan_bins, \ 59 | self.correction_factors, self.distance_counts = matrixFileHandler_load 60 | if pLoadMatrixOnly is None or not pLoadMatrixOnly: 61 | if self.nan_bins is None: 62 | self.nan_bins = np.array([]) 63 | 64 | if pUpperTriangleOnly is None or not pUpperTriangleOnly: 65 | self.fillLowerTriangle() 66 | start_time = time.time() 67 | 68 | if pRestoreMaskedBins is 
None or pRestoreMaskedBins: 69 | self.restoreMaskedBins() 70 | start_time = time.time() 71 | 72 | if pNoIntervalTree is None or not pNoIntervalTree: 73 | self.interval_trees, self.chrBinBoundaries = \ 74 | self.intervalListToIntervalTree(self.cut_intervals) 75 | else: 76 | log.debug('no intervaltree') 77 | 78 | elif pMatrixFile is None: 79 | log.debug('Only init object, no matrix given.') 80 | else: 81 | raise ValueError('matrix file not given') 82 | log.debug('data loaded!') 83 | 84 | def save(self, pMatrixName, pSymmetric=True, pApplyCorrection=False, pHiCInfo=None): 85 | """ As an output format cooler and mcooler are supported. 86 | """ 87 | 88 | if self.matrixFileHandler is None: 89 | fileType = 'cool' 90 | if pMatrixName.endswith('h5'): 91 | fileType = 'h5' 92 | self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pHiCInfo=pHiCInfo) 93 | 94 | self.restoreMaskedBins() 95 | self.matrixFileHandler.set_matrix_variables(self.matrix, self.cut_intervals, self.nan_bins, 96 | self.correction_factors, self.distance_counts) 97 | if pMatrixName.endswith('cool'): 98 | self.matrixFileHandler.matrixFile.hic_metadata = pHiCInfo 99 | 100 | if pMatrixName.endswith('cool') or pMatrixName.endswith('h5'): 101 | self.matrixFileHandler.save(pMatrixName, pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection) 102 | 103 | def getInformationCoolerBinNames(self): 104 | log.info('The following columns are available: %s', self.matrixFileHandler.matrixFile.getInformationCoolerBinNames()) 105 | 106 | def fillLowerTriangle(self): 107 | """ 108 | checks if the matrix is complete or if only half of the matrix was saved. 109 | Returns a whole matrix. 110 | """ 111 | # log.debug('sum of tril: {}'.format(tril(self.matrix, k=-1).sum())) 112 | if tril(self.matrix, k=-1).sum() == 0: 113 | # this case means that the lower triangle of the 114 | # symmetric matrix (below the main diagonal) 115 | # is zero. 
In this case, replace the lower 116 | # triangle using the upper triangle 117 | self.matrix = self.matrix + triu(self.matrix, 1).T 118 | 119 | # return matrix 120 | 121 | def setCutIntervals(self, cut_intervals): 122 | """ 123 | Replace the cut_intervals of a matrix 124 | """ 125 | 126 | # check that the matrix is squared 127 | if len(cut_intervals) != self.matrix.shape[0]: 128 | raise ValueError(f"Length of cut_intervals {len(cut_intervals)} does not match the " 129 | f"matrix size {self.matrix.shape}") 130 | 131 | self.cut_intervals = cut_intervals 132 | self.interval_trees, self.chrBinBoundaries = \ 133 | self.intervalListToIntervalTree(self.cut_intervals) 134 | 135 | def setMatrix(self, matrix, cut_intervals): 136 | """ 137 | Initialize a matrix with a given matrix 138 | and cut_intervals. Mostly useful for 139 | testing. 140 | """ 141 | 142 | # check that the matrix is squared 143 | if matrix.shape[0] != matrix.shape[1]: 144 | raise ValueError(f"Matrix is not squared. Shape is {matrix.shape}") 145 | if len(cut_intervals) != matrix.shape[0]: 146 | raise ValueError(f"Length of cut_intervals {len(cut_intervals)} does not match the matrix size {matrix.shape}") 147 | 148 | self.matrix = matrix 149 | self.cut_intervals = cut_intervals 150 | self.interval_trees, self.chrBinBoundaries = \ 151 | self.intervalListToIntervalTree(self.cut_intervals) 152 | 153 | def getBinSize(self): 154 | """ 155 | estimates the bin size. In case the bin size 156 | is not equal for all bins (maybe except for the 157 | bin at the en of the chromosomes) a warning is issued. 158 | In case of uneven bins, the median is returned. 
159 | """ 160 | if self.bin_size is None: 161 | chrom, start, end, extra = zip(*self.cut_intervals) 162 | diff = np.array(end) - np.array(start) 163 | # If there is only one bin: 164 | if len(diff) == 1: 165 | self.bin_size = diff[0] 166 | return self.bin_size 167 | # If there are more bins, the diff will be compared 168 | # to the median of the differences between starts 169 | median = int(np.median(np.concatenate([np.diff([start for chro, start, end, extra in self.cut_intervals if chro == cur_chrom]) for cur_chrom, nb in Counter(chrom).items() if nb > 1]))) 170 | 171 | # check if the bin size is 172 | # homogeneous 173 | if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01): 174 | self.bin_size_homogeneous = False 175 | if self.non_homogeneous_warning_already_printed is False: 176 | log.warning('Bin size is not homogeneous. \ 177 | Median %f\n', median) 178 | self.non_homogeneous_warning_already_printed = True 179 | self.bin_size = median 180 | return self.bin_size 181 | 182 | def getMatrix(self): 183 | matrix = self.matrix.todense() 184 | if len(self.nan_bins): 185 | # to set NaN values the matrix type has to be 186 | # float. 
Corrected matrices are of float 187 | # type while uncorrected matrices are of 188 | # of int type 189 | if np.issubdtype(self.matrix, 'float') is False: 190 | matrix = matrix.astype(float) 191 | matrix[self.nan_bins, :] = np.nan 192 | matrix[:, self.nan_bins] = np.nan 193 | 194 | return matrix 195 | 196 | def getChrBinRange(self, chrName): 197 | """ 198 | Given a chromosome name, 199 | This functions return the start and end bin indices in the matrix 200 | """ 201 | 202 | if chrName in self.chrBinBoundaries: 203 | return self.chrBinBoundaries[chrName] 204 | raise ValueError(f"chrName: {chrName} not found in chrBinBoundaries" 205 | f"valid chromosomes are: {self.chrBinBoundaries.keys()}") 206 | 207 | def getChrNames(self): 208 | """ 209 | returns the names of the chromosomes 210 | present in the matrix 211 | """ 212 | return list(self.chrBinBoundaries) 213 | 214 | def getBinPos(self, binIndex): 215 | """ 216 | given a bin, it returns the chromosome name, 217 | start position and end position 218 | """ 219 | if binIndex < len(self.cut_intervals): 220 | return self.cut_intervals[binIndex] 221 | raise ValueError(f"binIndex: {binIndex} not found") 222 | 223 | def getRegionBinRange(self, chrname, startpos, endpos): 224 | """ 225 | Given a chromosome region, this function returns 226 | the bin indices that overlap with such region. 
227 | """ 228 | 229 | try: 230 | # chromosome_size = hic_matrix.get_chromosome_sizes() 231 | # chrname = check_chrom_str_bytes(self.interval_trees, chrname) 232 | if not isinstance(next(iter(self.interval_trees)), type(chrname)): 233 | if isinstance(next(iter(self.interval_trees)), str): 234 | chrname = toString(chrname) 235 | elif isinstance(next(iter(self.interval_trees)), bytes): 236 | chrname = toBytes(chrname) 237 | elif isinstance(next(iter(self.interval_trees)), np.bytes_): 238 | chrname = toBytes(chrname) 239 | # chr_end_pos = chromosome_size[chrname] 240 | # self.interval_trees[chrname] 241 | if chrname not in self.interval_trees: 242 | raise ValueError(f"chromosome: {chrname} name not found in matrix" 243 | f"valid names are: {self.interval_trees.keys()}" 244 | ) 245 | except KeyError as ke: 246 | log.exception("chromosome: %s name not found in matrix", chrname) 247 | log.exception("valid names are: ") 248 | log.exception(self.interval_trees.keys()) 249 | log.exception(str(ke)) 250 | 251 | try: 252 | startpos = int(startpos) 253 | endpos = int(endpos) 254 | except ValueError as ve: 255 | log.exception("%d or %d are not valid " 256 | "position values.", startpos, endpos) 257 | log.exception(str(ve)) 258 | 259 | try: 260 | 261 | startbin = sorted(self.interval_trees[chrname][startpos:startpos + 1])[0].data 262 | endbin = sorted(self.interval_trees[chrname][endpos:endpos + 1])[0].data 263 | except IndexError: 264 | # log.exception("chrname: " + chrname) 265 | # log.exception("len intervaltree: "+len(self.interval_trees[chrname])) 266 | # log.exception("start and end pos:" + startpos + ":::" + endpos ) 267 | log.exception("Index error") 268 | return None 269 | 270 | return startbin, endbin 271 | 272 | @staticmethod 273 | def getDistList(rows, cols, cut_intervals): 274 | """ 275 | Given a list of rows and cols 276 | an array is returned containing 277 | the genomic distance between 278 | each element of the row array 279 | with each element of the col array. 
280 | -1 is returned for inter-chromosomal 281 | interactions. 282 | 283 | A matching list containing the chromosome name 284 | is also returned 285 | """ 286 | chrnamelist, startlist, _, _ = zip(*cut_intervals) 287 | # now the distance between any two points 288 | # is computed and arranged such that for each 289 | # element of the data array, a corespondent distance is stored 290 | start_row = np.take(startlist, rows) 291 | start_col = np.take(startlist, cols) 292 | dist_list = start_col - start_row 293 | 294 | # now all distances that are between chromosomes are removed 295 | # to do this I convert the array of chromosomes to 296 | # a array of indices. Then, when subtracting the 297 | # values that correspond to matrix.row and matrix.col 298 | # using the array of indices, any value other 299 | # than 0 means inter-chromosomal row,col combination. 300 | 301 | # chr_id_list is based on a trick using np.unique 302 | # to get from a list of strings 303 | # a list of integers 304 | chr_id_list = np.unique(chrnamelist, return_inverse=True)[1] 305 | 306 | chr_row = np.take(chr_id_list, rows) 307 | chr_col = np.take(chr_id_list, cols) 308 | chr_diff = chr_row - chr_col 309 | # set in dist_list array '-1' for all interchromosomal values 310 | dist_list[chr_diff != 0] = -1 311 | 312 | # make a corresponding chromosome name list 313 | # if filtering per chromosome is required 314 | chrom_list = np.take(chrnamelist, rows) 315 | chrom_list[chr_diff != 0] = '' 316 | 317 | return dist_list, chrom_list 318 | 319 | @staticmethod 320 | def fit_cut_intervals(cut_intervals): 321 | # check that the matrix has bins of same size 322 | # otherwise try to adjust the bins to 323 | # to match a regular binning 324 | if len(cut_intervals) <= 1: 325 | # do nothing if there is only one interval 326 | return cut_intervals 327 | chrom, start, end, extra = zip(*cut_intervals) 328 | 329 | median = int(np.median(np.concatenate([np.diff([start for chro, start, end, extra in cut_intervals if chro 
== cur_chrom]) for cur_chrom, nb in Counter(chrom).items() if nb > 1]))) 330 | diff = np.array(end) - np.array(start) 331 | # check if the bin size is homogeneous 332 | if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01): 333 | # set the start position of a bin to the closest multiple 334 | # of the median 335 | def snap_nearest_multiple(start_x, m): 336 | resi = [-1 * (start_x % m), -start_x % m] 337 | return start_x + resi[np.argmin(np.abs(resi))] 338 | start = [snap_nearest_multiple(x, median) for x in start] 339 | end = [snap_nearest_multiple(x, median) for x in end] 340 | cut_intervals = list(zip(chrom, start, end, extra)) 341 | log.info('[getCountsByDistance] Bin size is not ' 342 | 'homogeneous, setting \n' 343 | 'the bin distance to the median: %f\n', median) 344 | return cut_intervals 345 | 346 | def convert_to_zscore_matrix(self, maxdepth=None, perchr=False): 347 | return self.convert_to_obs_exp_matrix(maxdepth=maxdepth, zscore=True, perchr=perchr) 348 | 349 | def convert_to_obs_exp_matrix(self, maxdepth=None, zscore=False, perchr=False, pSkipTriu=False): 350 | """ 351 | Converts a corrected counts matrix into a 352 | obs / expected matrix or z-scores fast. 353 | 354 | The caveat is that the obs/exp or z-score are only 355 | computed for non-zero values, although zero values that 356 | are not part of the sparse matrix are considered. 357 | 358 | For each diagonal the mean (and std when computing z-scores) are 359 | calculated and then each non-zero value of the sparse matrix is 360 | replaced by the obs/exp or z-score. 361 | 362 | Parameters 363 | ---------- 364 | maxdepth: maximum distance from the diagonal to consider. All contacts beyond this distance will not 365 | be considered. 
366 | zscore: if a zscore wants to be returned instead of obs/exp 367 | 368 | 369 | Returns 370 | ------- 371 | observed / expected sparse matrix 372 | 373 | 374 | nans occur where the standard deviation is zero 375 | """ 376 | 377 | binsize = self.getBinSize() 378 | max_depth_in_bins = None 379 | 380 | if maxdepth: 381 | if maxdepth < binsize: 382 | raise ValueError(f"Please specify a maxDepth larger than bin size ({binsize})") 383 | 384 | max_depth_in_bins = int(float(maxdepth * 1.5) / binsize) 385 | # work only with the upper matrix 386 | # and remove all pixels that are beyond 387 | # max_depth_in_bis 388 | # (this is done by subtracting a second sparse matrix 389 | # that contains only the upper matrix that wants to be removed. 390 | if not pSkipTriu: 391 | self.matrix = triu(self.matrix, k=0, format='csr') - \ 392 | triu(self.matrix, k=max_depth_in_bins, format='csr') 393 | else: 394 | if not pSkipTriu: 395 | self.matrix = triu(self.matrix, k=0, format='csr') 396 | 397 | self.matrix.eliminate_zeros() 398 | depth = None 399 | if zscore is True: 400 | m_size = self.matrix.shape[0] 401 | if max_depth_in_bins is not None: 402 | depth = max_depth_in_bins 403 | else: 404 | depth = m_size 405 | estimated_size_dense_matrix = m_size ** 2 * 8 406 | if estimated_size_dense_matrix > 100e6: 407 | log.info("To compute z-scores a dense matrix is required. This will use \n" 408 | "%f Mb of memory.\n To reduce memory use the maxdeph option." 409 | "", estimated_size_dense_matrix / 1e6) 410 | 411 | # to compute zscore the zero values need to be accounted and the matrix 412 | # need to become dense. This is only practical if only up to certain distance 413 | # wants to be evaluated, otherwise the dense matrix is too large. 
414 | # To make the matrix dense and keep the same computations as when 415 | # the matrix is sparse the following is done: 416 | # A sparse diagonal matrix of shape = matrix.shape is created with ones 417 | # (only upper triangle contains diagonals up to maxdeph) 418 | # This sparse matrix is then added to self.matrix 419 | # then, -1 is subtracted to the self.matrix.data, thus effectively 420 | # adding zeros. 421 | diag_mat_ones = diags(np.repeat([1], m_size * depth).reshape(depth, m_size), list(range(depth))) 422 | 423 | self.matrix += diag_mat_ones 424 | 425 | trasf_matrix = lil_matrix(self.matrix.shape) 426 | 427 | chr_submatrix = OrderedDict() 428 | cut_intervals = OrderedDict() 429 | chrom_sizes = OrderedDict() 430 | chrom_range = OrderedDict() 431 | if perchr: 432 | for chrname in self.getChrNames(): 433 | chr_range = self.getChrBinRange(chrname) 434 | chr_submatrix[chrname] = self.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]].tocoo() 435 | cut_intervals[chrname] = [self.cut_intervals[x] for x in range(chr_range[0], chr_range[1])] 436 | chrom_sizes[chrname] = [chr_submatrix[chrname].shape[0]] 437 | chrom_range[chrname] = (chr_range[0], chr_range[1]) 438 | 439 | else: 440 | chr_submatrix['all'] = self.matrix.tocoo() 441 | cut_intervals['all'] = self.cut_intervals 442 | # chrom_sizes['all'] = np.array([v[1] - v[0] for k, v in iteritems(self.chrBinBoundaries)]) 443 | chrom_sizes['all'] = np.array([v[1] - v[0] for k, v in self.chrBinBoundaries.items()]) 444 | 445 | chrom_range['all'] = (0, self.matrix.shape[0]) 446 | 447 | # for chrname, submatrix in iteritems(chr_submatrix): 448 | for chrname, submatrix in chr_submatrix.items(): 449 | 450 | log.info("processing chromosome %s\n", chrname) 451 | if zscore is True: 452 | # this step has to be done after tocoo() 453 | submatrix.data -= 1 454 | 455 | dist_list, _ = self.getDistList(submatrix.row, submatrix.col, 456 | hiCMatrix.fit_cut_intervals(cut_intervals[chrname])) 457 | 458 | # to get the sum 
of all values at a given distance I use np.bincount which 459 | # is quite fast. However, the input of bincount is positive integers. Moreover 460 | # it returns the sum for every consecutive integer, even if this is not on the list. 461 | # Thus, dist_list, which contains the distance in bp between any two bins is 462 | # converted to bin distance. 463 | 464 | # Because positive integers are needed we add +1 to all bin distances 465 | # such that the value of -1 (which means different chromosomes) can now be used 466 | 467 | dist_list[dist_list == -1] = -binsize # pylint: disable=E1130 468 | # divide by binsize to get a list of bin distances and add +1 to remove negative values 469 | dist_list = (np.array(dist_list).astype(float) / binsize).astype(int) + 1 470 | 471 | # for each distance, return the sum of all values 472 | sum_counts = np.bincount(dist_list, weights=submatrix.data) 473 | distance_len = np.bincount(dist_list) 474 | # compute the average for each distance 475 | mat_size = submatrix.shape[0] 476 | mu = {} 477 | std = {} 478 | # compute mean value for each distance 479 | 480 | for bin_dist_plus_one, sum_value in enumerate(sum_counts): 481 | if maxdepth and bin_dist_plus_one == 0: # this is for intra chromosomal counts 482 | # when max depth is set, the computation 483 | # of the total_intra is not accurate and is safer to 484 | # output np.nan 485 | mu[bin_dist_plus_one] = np.nan 486 | std[bin_dist_plus_one] = np.nan 487 | continue 488 | 489 | if bin_dist_plus_one == 0: 490 | total_intra = mat_size ** 2 - sum([size ** 2 for size in chrom_sizes[chrname]]) 491 | diagonal_length = int(total_intra / 2) 492 | else: 493 | # to compute the average counts per distance we take the sum_counts and divide 494 | # by the number of values on the respective diagonal 495 | # which is equal to the size of each chromosome - the diagonal offset (for those 496 | # chromosome larger than the offset) 497 | # In the following example with two chromosomes 498 | # the first 
(main) diagonal has a size equal to the matrix (6), 499 | # while the next has 1 value less for each chromosome (4) and the last one has only 2 values 500 | 501 | # 0 1 2 . . . 502 | # - 0 1 . . . 503 | # - - 0 . . . 504 | # . . . 0 1 2 505 | # . . . - 0 1 506 | # . . . - - 0 507 | 508 | # idx - 1 because earlier the values where 509 | # shifted. 510 | diagonal_length = sum([size - (bin_dist_plus_one - 1) for size in chrom_sizes[chrname] if size > (bin_dist_plus_one - 1)]) 511 | log.debug("Type of diagonal_length %s", type(diagonal_length)) 512 | 513 | # the diagonal length should contain the number of values at a certain distance. 514 | # If the matrix is dense, the distance_len[bin_dist_plus_one] correctly contains the number of values 515 | # If the matrix is equally spaced, then, the diagonal_length as computed before is accurate. 516 | # But, if the matrix is both sparse and with unequal bins, then none of the above methods is 517 | # accurate but the the diagonal_length as computed before will be closer. 
518 | diagonal_length = max(diagonal_length, distance_len[bin_dist_plus_one]) 519 | log.debug("Type of diagonal_length %s", type(diagonal_length)) 520 | 521 | if diagonal_length == 0: 522 | mu[bin_dist_plus_one] = np.nan 523 | else: 524 | mu[bin_dist_plus_one] = np.float64(sum_value) / diagonal_length 525 | 526 | if np.isnan(sum_value): 527 | log.info("nan value found for distance %f\n", (bin_dist_plus_one - 1) * binsize) 528 | 529 | # if zscore is needed, compute standard deviation: std = sqrt(mean(abs(x - x.mean())**2)) 530 | if zscore: 531 | values_sqrt_diff = \ 532 | np.abs((submatrix.data[dist_list == bin_dist_plus_one] - mu[bin_dist_plus_one]) ** 2) 533 | # the standard deviation is the sum of the differences with mu squared (value variable) 534 | # plus all zeros that are not included in the sparse matrix 535 | # for which the standard deviation is 536 | # (0 - mu)**2 = (mu)**2 537 | # The number of zeros is the diagonal length - the length of the non zero values 538 | zero_values_sqrt_diff_sum = (diagonal_length - len(values_sqrt_diff)) * mu[bin_dist_plus_one] ** 2 539 | 540 | _std = np.sqrt((values_sqrt_diff.sum() + zero_values_sqrt_diff_sum) / diagonal_length) 541 | std[bin_dist_plus_one] = _std 542 | 543 | # use the expected values to compute obs/exp 544 | transf_ma = np.zeros(len(submatrix.data)) 545 | for idx, value in enumerate(submatrix.data): 546 | if depth is not None and dist_list[idx] > depth + 1: 547 | continue 548 | if zscore: 549 | if std[dist_list[idx]] == 0: 550 | transf_ma[idx] = np.nan 551 | else: 552 | transf_ma[idx] = (value - mu[dist_list[idx]]) / std[dist_list[idx]] 553 | else: 554 | transf_ma[idx] = value / mu[dist_list[idx]] 555 | 556 | submatrix.data = transf_ma 557 | trasf_matrix[chrom_range[chrname][0]:chrom_range[chrname][1], chrom_range[chrname][0]:chrom_range[chrname][1]] = submatrix.tolil() 558 | 559 | self.matrix = trasf_matrix.tocsr() 560 | 561 | return self.matrix 562 | 563 | @staticmethod 564 | def dist_list_to_dict(data, 
dist_list): 565 | """ 566 | splits data, into numeric groups defined by dist_list 567 | Return a dictionary containing, for 568 | each unique distance a dictionary 569 | """ 570 | 571 | order = np.argsort(dist_list) 572 | dist_list = dist_list[order] 573 | data = data[order] 574 | 575 | # having the dist_list sorted, np.split 576 | # is used to divide the data into 577 | # groups that lie at the same distance, for this 578 | # np.diff together with np.flatnonzero is used to 579 | # find the indices where the distance changes. 580 | # the '+1' is needed because the np.diff array is 581 | # one element smaller than the original array, thus 582 | # the indices based no the np.diff array are off by 1 583 | # with respect to the original array 584 | groups = np.split(data, np.flatnonzero(np.diff(dist_list)) + 1) 585 | 586 | # because the dist_list is sorted 587 | # the order of the unique values 588 | # corresponds to that of the groups. 589 | # In other words, group[0] 590 | # has distance_unique[0] 591 | # np.sort after np.unique in theory 592 | # is not needed, but just in case... 593 | distance_unique = np.sort(np.unique(dist_list)) 594 | 595 | # convert to dictionary having as key 596 | # the distance 597 | distance = {} 598 | for index, d in enumerate(distance_unique): 599 | distance[d] = groups[index] 600 | 601 | return distance 602 | 603 | def keepOnlyTheseChr(self, chromosome_list): 604 | """ 605 | given a list of chromosome names, 606 | these are kept, while any other is removed 607 | from the matrix 608 | """ 609 | chromosome_list = check_chrom_str_bytes(self.interval_trees, chromosome_list) 610 | 611 | try: 612 | [self.chrBinBoundaries[x] for x in chromosome_list] 613 | except KeyError as e: 614 | raise ValueError(f"Chromosome name not in matrix. {str(e)}") from e 615 | 616 | self.restoreMaskedBins() 617 | size = self.matrix.shape 618 | # initialize a 1D array containing the columns (and rows) to 619 | # select. 
By default none are selected 620 | sel = np.empty(size[0], dtype=bool) 621 | sel[:] = False 622 | 623 | for chrName in list(self.interval_trees): 624 | if chrName not in chromosome_list: 625 | continue 626 | 627 | # identify start and end rows 628 | # of chromosomes that wants to be 629 | # kept 630 | index_start, index_end = self.getChrBinRange(chrName) 631 | sel[index_start:index_end] = True 632 | 633 | sel_id = np.flatnonzero(sel) 634 | mat = self.matrix[sel_id, :][:, sel_id] 635 | 636 | # update bin ids 637 | self.cut_intervals = [self.cut_intervals[x] for x in sel_id] 638 | 639 | # update correction factors 640 | if self.correction_factors is not None: 641 | self.correction_factors = [self.correction_factors[x] for x in sel_id] 642 | 643 | # keep track of nan bins 644 | if len(self.nan_bins): 645 | _temp = np.zeros(size[0]) 646 | _temp[self.nan_bins] = 1 647 | _temp = _temp[sel_id] 648 | self.nan_bins = np.flatnonzero(_temp == 1) 649 | else: 650 | self.nan_bins = [] 651 | 652 | self.numCols = len(sel_id) # pylint: disable=W0201 653 | 654 | self.interval_trees, self.chrBinBoundaries = \ 655 | self.intervalListToIntervalTree(self.cut_intervals) 656 | # remove distanceCounts 657 | try: 658 | self.distance_counts = None 659 | except AttributeError: 660 | pass 661 | self.matrix = mat 662 | return self.matrix 663 | 664 | def diagflat(self, value=np.nan): 665 | """ 666 | sets 667 | the matrix diagonal to np.nan 668 | """ 669 | M = self.matrix.shape[0] 670 | diagmatrix = dia_matrix((np.repeat(value, M), 0), shape=(M, M)) 671 | self_diag = dia_matrix(([self.matrix.diagonal()], [0]), shape=(M, M)) 672 | # take matrix, subtract the values of the diagonal such that 673 | # it becomes all zeros, replace with new values by adding them 674 | self.matrix = self.matrix - self_diag + diagmatrix 675 | return self.matrix 676 | 677 | def filterOutInterChrCounts(self): 678 | """ 679 | set all inter chromosomal counts to np.nan 680 | """ 681 | 682 | ma_coo = self.matrix.tocoo() 683 
| dist_list, _ = hiCMatrix.getDistList(ma_coo.row, ma_coo.col, 684 | self.cut_intervals) 685 | 686 | # set to zero all cases in which dist_list is zero 687 | ma_coo.data[dist_list == -1] = 0 688 | 689 | self.matrix = ma_coo.tocsr() 690 | self.matrix.eliminate_zeros() 691 | return self.matrix 692 | 693 | def setMatrixValues(self, newMatrix): 694 | """ 695 | replace the current matrix values 696 | by the given matrix values. The 697 | shapes have to coincide 698 | """ 699 | assert self.matrix.shape == newMatrix.shape, \ 700 | "Given matrix has different shape. New " \ 701 | "values need to have the same shape as previous matrix." 702 | 703 | self.matrix = csr_matrix(newMatrix) 704 | 705 | def setCorrectionFactors(self, correction_factors): 706 | assert len(correction_factors) == self.matrix.shape[0], \ 707 | "length of correction factors and length of matrix are different." 708 | self.correction_factors = correction_factors 709 | 710 | def reorderChromosomes(self, new_chr_order): 711 | new_order = [] 712 | new_chr_order = check_chrom_str_bytes(self.chrBinBoundaries, new_chr_order) 713 | 714 | for chrName in new_chr_order: 715 | # check that the chromosome names are valid 716 | if chrName not in self.chrBinBoundaries: 717 | raise ValueError(f"Chromosome name '{chrName}' not found. Please check the correct spelling " 718 | "of the chromosomes and try again") 719 | orig = self.chrBinBoundaries[chrName] 720 | new_order.extend(list(range(orig[0], orig[1]))) 721 | self.reorderBins(new_order) 722 | 723 | def reorderBins(self, new_order): 724 | """ 725 | reorders the rows and colums of the 726 | matrix according to the new order. 727 | The new order can be smaller 728 | than the original matrix. In that 729 | case, the ids not in the 730 | new order are removed. 
731 | """ 732 | orig_num_rows = self.matrix.shape[0] 733 | self.matrix = self.matrix[new_order, :][:, new_order] 734 | self.cut_intervals = [self.cut_intervals[x] for x in new_order] 735 | # reorder the masked bins 736 | # keep track of nan bins 737 | if len(self.nan_bins): 738 | _temp = np.zeros(orig_num_rows) 739 | _temp[self.nan_bins] = 1 740 | _temp = _temp[new_order] 741 | self.nan_bins = np.flatnonzero(_temp == 1) 742 | else: 743 | self.nan_bins = [] 744 | 745 | self.interval_trees, self.chrBinBoundaries = \ 746 | self.intervalListToIntervalTree(self.cut_intervals) 747 | 748 | def maskChromosomes(self, pChromosomeList): 749 | mask_ids = [] 750 | pChromosomeList = check_chrom_str_bytes(self.chrBinBoundaries, pChromosomeList) 751 | 752 | for chromosome in pChromosomeList: 753 | # check that the chromosome names are valid 754 | if chromosome not in self.chrBinBoundaries: 755 | raise ValueError(f"Chromosome name '{chromosome}' not found. Please check the correct spelling " 756 | "of the chromosomes and try again") 757 | orig = self.chrBinBoundaries[chromosome] 758 | mask_ids.extend(list(range(orig[0], orig[1]))) 759 | self.maskBins(mask_ids) 760 | 761 | def maskBins(self, bin_ids=None): 762 | """ 763 | Mask the list of bins given. 
Mask means 764 | to remove the bins from the matrix, 765 | and keep the information about the intervals 766 | as masked 767 | """ 768 | # print("self.cut_intervalsMASKBINS___START", self.cut_intervals) 769 | 770 | if bin_ids is None or len(bin_ids) == 0: 771 | return 772 | self.printchrtoremove(bin_ids, restore_masked_bins=False) 773 | try: 774 | # check if a masked bin already exists 775 | if len(self.orig_bin_ids) > 0: 776 | M = self.matrix.shape[0] 777 | previous_bin_ids = self.orig_bin_ids[M:] 778 | # merge new and old masked bins 779 | bin_ids = np.unique(np.concatenate([previous_bin_ids, self.orig_bin_ids[bin_ids]])) 780 | np.sort(bin_ids) 781 | self.restoreMaskedBins() 782 | except Exception: # pylint: disable=W0718 783 | pass 784 | 785 | # join with existing nan_bins 786 | if self.nan_bins is not None and len(self.nan_bins) > 0: 787 | log.info("found existing %d nan bins that will be " 788 | "included for masking ", len(self.nan_bins)) 789 | bin_ids = np.unique(np.concatenate([self.nan_bins, bin_ids])) 790 | self.nan_bins = [] 791 | rows = cols = np.delete(list(range(self.matrix.shape[1])), bin_ids) 792 | 793 | self.matrix = self.matrix[rows, :][:, cols] 794 | 795 | # to keep track of removed bins 796 | # I add their ids to the end of the rows vector 797 | # to reverse the changes, I just need to do an argsort 798 | # to put the removed bins in place 799 | # log.debug("bins_ids {}".format(bin_ids)) 800 | self.orig_bin_ids = np.concatenate([rows, bin_ids]) 801 | 802 | new_cut_intervals = [self.cut_intervals[x] for x in rows] 803 | 804 | self.orig_cut_intervals = new_cut_intervals + [self.cut_intervals[x] for x in bin_ids] 805 | 806 | self.cut_intervals = new_cut_intervals 807 | 808 | self.interval_trees, self.chrBinBoundaries = self.intervalListToIntervalTree(self.cut_intervals) 809 | 810 | if self.correction_factors is not None: 811 | self.correction_factors = self.correction_factors[rows] 812 | 813 | def update_matrix(self, new_matrix, new_cut_intervals): 
814 | """ 815 | give a new matrix and list of cut intervals, the matrix, cut intervals and 816 | the respective tree are updated 817 | :param new_matrix: now values for the sparse matrix 818 | :param new_cut_intervals: list of cut intervals, each entry being a tuple of the form 819 | (chrom, start, end, coverage) 820 | :return: 821 | """ 822 | if len(self.orig_bin_ids) > 0: 823 | raise ValueError("matrix contains masked bins. Restore masked bins first") 824 | 825 | assert len(new_cut_intervals) == new_matrix.shape[0], "matrix shape and len of cut intervals do not match" 826 | 827 | self.matrix = new_matrix 828 | self.cut_intervals = new_cut_intervals 829 | 830 | self.interval_trees, self.chrBinBoundaries = \ 831 | self.intervalListToIntervalTree(self.cut_intervals) 832 | 833 | self.nan_bins = np.flatnonzero(self.matrix.sum(0).A == 0) 834 | 835 | def restoreMaskedBins(self): 836 | """ 837 | Puts backs into the matrix the bins 838 | removed 839 | """ 840 | if len(self.orig_bin_ids) == 0: 841 | return 842 | # the rows to add are 843 | # as an empty sparse matrix 844 | M = self.matrix.shape[0] 845 | N = len(self.orig_bin_ids) - M 846 | rows_mat = csr_matrix((N, M)) 847 | # cols to add 848 | cols_mat = csr_matrix((M + N, N)) 849 | 850 | # add the rows and cols at the end of the 851 | # current matrix 852 | self.matrix = sparse_vstack([self.matrix, rows_mat]) 853 | self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr') 854 | 855 | # the new matrix has the right number of cols and rows, now 856 | # they need to be reordered to be back in their original places 857 | rows = cols = np.argsort(self.orig_bin_ids) 858 | self.matrix = self.matrix[rows, :][:, cols] 859 | self.cut_intervals = [self.orig_cut_intervals[x] for x in rows] 860 | self.interval_trees, self.chrBinBoundaries = \ 861 | self.intervalListToIntervalTree(self.cut_intervals) 862 | # set as nan_bins the masked bins that were restored 863 | self.nan_bins = self.orig_bin_ids[M:] 864 | 865 | if 
    def reorderMatrix(self, orig, dest):
        """
        Given a matrix, a region over the diagonal is moved from
        its origin to a new destination. With this method a
        new order of the chromosomes can be produced.
        :param orig: a tuple containing the indices of the region to be moved
        :param dest: the index of the region into which to insert
                     the section moved
        """
        # all bin indices except the region that is being moved
        rows = np.delete(list(range(self.matrix.shape[1])), range(orig[0], orig[1]))

        # after removing the region, indices past it shift left by the
        # length of the region; adjust the insertion point accordingly
        if dest > orig[1]:
            dest = dest - (orig[1] - orig[0])

        # insert the moved region at the destination; the same permutation
        # is applied to rows and columns so the layout stays symmetric
        rows = cols = np.insert(
            rows, np.repeat(dest, orig[1] - orig[0]), list(range(orig[0], orig[1])))
        self.matrix = self.matrix[rows, :][:, cols]
        self.cut_intervals = [self.cut_intervals[x] for x in rows]
        # rebuild the interval trees / chromosome boundaries for the new order
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)

        if self.correction_factors is not None:
            self.correction_factors = self.correction_factors[rows]
between 905 | chromosomes) to the max value found in the 1-high*100 906 | percentile 907 | 908 | :param: high : float, 0 0: 942 | log.info("Masked bins already present") 943 | self.restoreMaskedBins() 944 | except Exception: # pylint: disable=W0718 945 | pass 946 | for idx in to_remove: 947 | chrom = self.cut_intervals[idx][0] 948 | if chrom not in cnt: 949 | cnt[chrom] = 0 950 | cnt[chrom] += 1 951 | 952 | log.info('%s: %d %s', label, len(to_remove), cnt) 953 | self.prev_to_remove = to_remove # pylint: disable=W0201 954 | 955 | def get_chromosome_sizes_real(self): 956 | ''' 957 | Function returns the size of a chromosome as it is stored in the matrix. 958 | The size can differ if e.g. some area from the start or end of a chromosome is not present in the interaction matrix. 959 | ''' 960 | if self.chrBinBoundaries and len(self.chrBinBoundaries) > 0: 961 | chrom_sizes = OrderedDict() 962 | # for chrom, (start_bin, end_bin) in iteritems(self.chrBinBoundaries): 963 | for chrom, (start_bin, end_bin) in self.chrBinBoundaries.items(): 964 | chrom, start0, _, _ = self.cut_intervals[start_bin] 965 | chrom, _, end1, _ = self.cut_intervals[end_bin - 1] 966 | chrom_sizes[chrom] = end1 - start0 + 1 967 | 968 | return chrom_sizes 969 | return None 970 | 971 | def get_chromosome_sizes(self): 972 | ''' 973 | Function returns the size of a chromosome as it is stored in the matrix, assuming the chromosome starts is always at its genomic position 0. 
    def intervalListToIntervalTree(self, interval_list):
        """
        given an ordered list of (chromosome name, start, end)
        this is transformed to a number of interval trees,
        one for each chromosome

        :param interval_list: ordered list of tuples whose first three
            entries are (chromosome, start, end); entries for the same
            chromosome must be consecutive
        :return: tuple (cut_int_tree, chrbin_boundaries) where cut_int_tree
            maps chromosome -> IntervalTree of its bins (each Interval
            carries the global bin id as data) and chrbin_boundaries maps
            chromosome -> (first bin id, one-past-last bin id)
        """
        cut_int_tree = {}
        chrbin_boundaries = OrderedDict()
        if len(interval_list) == 0:
            log.warning("Interval list is empty")
            return cut_int_tree, chrbin_boundaries

        intval_id = 0
        chr_start_id = 0
        previous_chrom = None
        for intval in interval_list:
            chrom, start, end = intval[0:3]
            start = int(start)
            end = int(end)
            if previous_chrom != chrom:
                if previous_chrom is None:
                    # first interval seen: initialize previous_chrom so the
                    # boundary entry below is recorded under the right key
                    previous_chrom = chrom

                # close the boundary range of the chromosome just finished
                # (harmlessly overwritten later for the very first chromosome)
                chrbin_boundaries[previous_chrom] = \
                    (chr_start_id, intval_id)
                chr_start_id = intval_id
                cut_int_tree[chrom] = IntervalTree()
                previous_chrom = chrom

            cut_int_tree[chrom].add(Interval(start, end, intval_id))

            intval_id += 1
        # record the boundaries of the last chromosome seen
        chrbin_boundaries[chrom] = (chr_start_id, intval_id)

        return cut_int_tree, chrbin_boundaries
    def __init__(self, pMatrixFile=None):
        """
        Handler for matrices stored in cooler format.

        :param pMatrixFile: path to the cooler file (optionally with a
            '::' node suffix for multi-matrix files)
        """
        super().__init__(pMatrixFile)
        # restrict loading to these chromosome names (None -> whole matrix)
        self.chrnameList = None
        # name of the bins-table column holding the correction factors
        self.correctionFactorTable = 'weight'
        # '*' or '/': how correction factors are applied; None -> decided on load
        self.correctionOperator = None
        # round counts to integers when writing
        self.enforceInteger = False
        self.appendData = False
        # True when the data originally came from an h5 file — TODO confirm with callers
        self.fileWasH5 = False
        # apply the correction factors while loading
        self.applyCorrectionLoad = True
        self.hic_metadata = {}
        self.cool_info = None

        # versions of the producing tools, parsed from the cooler
        # 'generated-by' info field when present
        self.hic2cool_version = None
        self.hicmatrix_version = None
        # load only contacts up to this genomic distance (bp), if set
        self.distance = None
        # sparse format for the loaded matrix: None/'csr', 'lil' or 'dok'
        self.matrixFormat = None
        # return only the raw (instances, features, data, nbins) triplets
        self.matrixOnly = False
        # skip building the cut_intervals list
        self.noCutIntervals = False
initialized') 52 | try: 53 | cooler_file = cooler.Cooler(self.matrixFileName) 54 | # if 'metadata' in cooler_file.info: 55 | self.hic_metadata = cooler_file.info 56 | # else: 57 | # self.hic_metadata = None 58 | # self.cool_info = deepcopy(cooler_file.info) 59 | # log.debug('self.hic_metadata {}'.format(self.hic_metadata)) 60 | except Exception as e: # pylint: disable=W0718 61 | log.warning("Could not open cooler file. Maybe the path is wrong or the given node is not available.") 62 | log.warning('The following file was tried to open: %s', self.matrixFileName) 63 | log.warning("The following nodes are available: %s", cooler.fileops.list_coolers(self.matrixFileName.split("::")[0])) 64 | return None, e 65 | if self.chrnameList is None and (self.matrixFileName is None or not self.matrixOnly): 66 | matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True) 67 | used_dtype = np.int32 68 | if np.iinfo(np.int32).max < cooler_file.info['nbins']: 69 | used_dtype = np.int64 70 | count_dtype = matrixDataFrame[0]['count'].dtype 71 | data = np.empty(cooler_file.info['nnz'], dtype=count_dtype) 72 | instances = np.empty(cooler_file.info['nnz'], dtype=used_dtype) 73 | features = np.empty(cooler_file.info['nnz'], dtype=used_dtype) 74 | i = 0 75 | size = cooler_file.info['nbins'] // 32 76 | if size == 0: 77 | size = 1 78 | start_pos = 0 79 | while i < cooler_file.info['nbins']: 80 | matrixDataFrameChunk = matrixDataFrame[i:i + size] 81 | _data = matrixDataFrameChunk['count'].values.astype(count_dtype) 82 | _instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype) 83 | _features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype) 84 | 85 | data[start_pos:start_pos + len(_data)] = _data 86 | instances[start_pos:start_pos + len(_instances)] = _instances 87 | features[start_pos:start_pos + len(_features)] = _features 88 | start_pos += len(_features) 89 | i += size 90 | del _data 91 | del _instances 92 | del _features 93 | 94 | if 
self.matrixFormat is None or self.matrixFormat == 'csr': 95 | matrix = csr_matrix((data, (instances, features)), shape=(int(cooler_file.info['nbins']), int(cooler_file.info['nbins'])), dtype=count_dtype) 96 | elif self.matrixFormat == 'lil': 97 | matrix = lil_matrix((data, (instances, features)), shape=(int(cooler_file.info['nbins']), int(cooler_file.info['nbins'])), dtype=count_dtype) 98 | elif self.matrixFormat == 'dok': 99 | matrix = dok_matrix((data, (instances, features)), shape=(int(cooler_file.info['nbins']), int(cooler_file.info['nbins'])), dtype=count_dtype) 100 | # elif self.matrixFormat == 'raw': 101 | # matrix = [instances, features, data, int(cooler_file.info['nbins'])] 102 | del data 103 | del instances 104 | del features 105 | gc.collect() 106 | elif self.chrnameList is None and self.matrixOnly: 107 | log.debug('Load all at once') 108 | matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True) 109 | used_dtype = np.int64 110 | # if np.iinfo(np.int32).max < cooler_file.info['nbins']: 111 | # used_dtype = np.int64 112 | count_dtype = matrixDataFrame[0]['count'].dtype 113 | matrixDataFrameChunk = matrixDataFrame[:] 114 | data = matrixDataFrameChunk['count'].values.astype(count_dtype) 115 | instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype) 116 | features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype) 117 | # matrix = [_instances, _features, _data, int(cooler_file.info['nbins'])] 118 | # return matrix, None, None, None, None 119 | else: 120 | if len(self.chrnameList) == 1: 121 | try: 122 | if self.distance is None or cooler_file.binsize is None: 123 | # load the full chromosome 124 | matrix = cooler_file.matrix(balance=False, sparse=True, as_pixels=False).fetch(self.chrnameList[0]).tocsr() # pylint: disable=E1136 125 | else: 126 | # load only the values up to a specific distance 127 | lo, hi = cooler_file.extent(self.chrnameList[0]) # pylint: disable=E1136 128 | dist = self.distance // 
cooler_file.binsize 129 | step = (hi - lo) // 32 130 | if step < 1: # pylint: disable=R1731 131 | step = 1 132 | mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32) 133 | 134 | for i0, i1 in cooler.util.partition(lo, hi, step): 135 | # fetch stripe 136 | pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi] 137 | # filter 138 | pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist] 139 | # insert into sparse matrix 140 | mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'].astype(np.float32) 141 | del pixels 142 | 143 | matrix = mat.tocsr() 144 | del mat 145 | gc.collect() 146 | 147 | except ValueError as ve: 148 | log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.") 149 | log.exception('Error: %s', str(ve)) 150 | else: 151 | raise ValueError("Operation to load more as one region is not supported.") 152 | 153 | cut_intervals_data_frame = None 154 | correction_factors_data_frame = None 155 | 156 | if self.chrnameList is not None: 157 | if len(self.chrnameList) == 1: 158 | cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0]) 159 | log.debug('cut_intervals_data_frame %s', str(list(cut_intervals_data_frame.columns))) 160 | if self.correctionFactorTable in cut_intervals_data_frame: 161 | correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable] 162 | else: 163 | raise ValueError("Operation to load more than one chr from bins is not supported.") 164 | else: 165 | if self.applyCorrectionLoad and self.correctionFactorTable in cooler_file.bins(): 166 | correction_factors_data_frame = cooler_file.bins()[[self.correctionFactorTable]][:] 167 | 168 | cut_intervals_data_frame = cooler_file.bins()[['chrom', 'start', 'end']][:] 169 | 170 | correction_factors = None 171 | if correction_factors_data_frame is not None and self.applyCorrectionLoad: 172 | # apply correction factors to matrix 173 | # a_i,j = a_i,j * c_i *c_j 174 | if not self.matrixOnly: 175 | 
matrix.eliminate_zeros() 176 | data = matrix.data 177 | if len(data) > 1: 178 | 179 | if not self.matrixOnly: 180 | matrix.data = matrix.data.astype(float) 181 | else: 182 | data = np.array(data, dtype=float) 183 | 184 | correction_factors = np.array(correction_factors_data_frame.values).flatten() 185 | # Don't apply correction if weight were just 'nans' 186 | if np.sum(np.isnan(correction_factors)) != len(correction_factors): 187 | # correction_factors = convertNansToZeros(correction_factors) 188 | 189 | if not self.matrixOnly: 190 | # matrix.sort_indices() 191 | instances, features = matrix.nonzero() 192 | instances_factors = correction_factors[instances] 193 | features_factors = correction_factors[features] 194 | 195 | if self.correctionOperator is None: 196 | if self.correctionFactorTable in ['KR', 'VC', 'SQRT_VC']: 197 | self.correctionOperator = '/' 198 | else: 199 | self.correctionOperator = '*' 200 | if 'generated-by' in cooler_file.info: 201 | log.debug('cooler_file.info[\'generated-by\'] %s %s', cooler_file.info['generated-by'], type(cooler_file.info['generated-by'])) 202 | generated_by = toString(cooler_file.info['generated-by']) 203 | if 'hic2cool' in generated_by: 204 | self.hic2cool_version = generated_by.split('-')[1] 205 | elif 'hicmatrix' in generated_by: 206 | self.hicmatrix_version = generated_by.split('-')[1] 207 | 208 | instances_factors *= features_factors 209 | log.debug('hic2cool: %s', self.hic2cool_version) 210 | log.debug('self.correctionOperator: %s', self.correctionOperator) 211 | 212 | if self.matrixOnly: 213 | if self.correctionOperator == '*': 214 | log.debug('multi') 215 | data *= instances_factors 216 | elif self.correctionOperator == '/': 217 | log.debug('div') 218 | data /= instances_factors 219 | log.debug('non') 220 | return [instances, features, data, int(cooler_file.info['nbins'])], None, None, None, None 221 | 222 | if self.correctionOperator == '*': 223 | matrix.data *= instances_factors 224 | log.debug('foo') 225 | elif 
    def create_cooler_input(self, pSymmetric=True, pApplyCorrection=True):
        """
        Build the (bins, pixels, dtypes, info) inputs for cooler.create_cooler().

        Reverts any correction that was applied to the matrix so the stored
        counts are raw, and exposes the correction as a multiplicative
        'weight' column, which is what the cool format expects.

        :param pSymmetric: store only the upper triangle if True
        :param pApplyCorrection: add a 'weight' column and revert the
                                 correction on the pixel data
        :return: bins_data_frame, matrix_data_frame, dtype_pixel, info
        """
        log.debug('self.hic_metadata 34 %s', self.hic_metadata)

        self.matrix.eliminate_zeros()

        # Matrices loaded from h5 carry masked bins in nan_bins: zero every
        # pixel whose endpoints are BOTH nan bins so eliminate_zeros drops it.
        if self.nan_bins is not None and len(self.nan_bins) > 0 and self.fileWasH5:
            # remove nan_bins
            correction_factors = np.ones(self.matrix.shape[0])
            correction_factors[self.nan_bins] = 0
            self.matrix.sort_indices()
            _instances, _features = self.matrix.nonzero()

            instances_factors = correction_factors[_instances]
            features_factors = correction_factors[_features]

            # True only where both endpoint factors are 0 (both bins are nan)
            instances_factors = np.logical_not(np.logical_or(instances_factors, features_factors))
            self.matrix.data[instances_factors] = 0
            self.matrix.eliminate_zeros()

        # set possible nans in data to 0
        mask = np.isnan(self.matrix.data)

        self.matrix.data[mask] = 0
        self.matrix.eliminate_zeros()
        # save only the upper triangle of the
        if pSymmetric:
            # symmetric matrix
            self.matrix = triu(self.matrix, format='csr')
        else:
            self.matrix = self.matrix

        self.matrix.eliminate_zeros()

        # create data frame for bins
        # self.cut_intervals is having 4 tuples, bin_data_frame should have 3.correction_factors
        # it looks like it is faster to create it with 4, and drop the last one
        # instead of handling this before.
        bins_data_frame = pd.DataFrame(self.cut_intervals, columns=['chrom', 'start', 'end', 'interactions']).drop('interactions', axis=1)
        dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': np.int32}
        log.debug('foo')
        if self.correction_factors is not None and pApplyCorrection:
            dtype_pixel['weight'] = np.float32

            # if the correction was applied by a division, invert it because cool format expects multiplicative if table name is 'weight'
            # https://cooler.readthedocs.io/en/latest/api.html#cooler.Cooler.matrix
            if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5 or self.correctionOperator == '/':

                log.debug('h5 true')
                self.correction_factors = np.array(self.correction_factors).flatten()
                self.correction_factors = 1 / self.correction_factors
                # inverting can produce nan/inf for zero factors; store them as 0
                mask = np.isnan(self.correction_factors)
                self.correction_factors[mask] = 0
                mask = np.isinf(self.correction_factors)
                self.correction_factors[mask] = 0
                self.correctionOperator = '*'
                log.debug('inverted correction factors')
            weight = convertNansToOnes(np.array(self.correction_factors).flatten())
            log.debug('weight %s', weight)
            bins_data_frame = bins_data_frame.assign(weight=weight)

            log.debug("Reverting correction factors on matrix...")
            instances, features = self.matrix.nonzero()
            self.correction_factors = np.array(self.correction_factors)

            # do not apply if correction factors are just 1's
            instances_factors = self.correction_factors[instances]
            features_factors = self.correction_factors[features]

            # per-pixel factor is the product of its two bin factors
            instances_factors *= features_factors

            self.matrix.data = self.matrix.data.astype(float)

            # Apply the invert operation to get the original data
            if self.correctionOperator == '*' or self.correctionOperator is None:
                self.matrix.data /= instances_factors

            instances_factors = None
            features_factors = None

            self.matrix.eliminate_zeros()

        # Even when the correction is not applied to the pixel values, the
        # factors are still exported as the 'weight' column.
        if self.correction_factors is not None and pApplyCorrection is False:
            dtype_pixel['weight'] = np.float32
            weight = convertNansToOnes(np.array(self.correction_factors).flatten())
            bins_data_frame = bins_data_frame.assign(weight=weight)
            log.debug('weight 2: %s', weight)
        instances, features = self.matrix.nonzero()

        matrix_data_frame = pd.DataFrame(instances, columns=['bin1_id'], dtype=np.int32)
        del instances
        matrix_data_frame = matrix_data_frame.assign(bin2_id=features)
        del features

        if self.enforceInteger:
            dtype_pixel['count'] = np.int32
            # round to nearest integer; cooler stores 'count' as int32 then
            data = np.rint(self.matrix.data)
            matrix_data_frame = matrix_data_frame.assign(count=data)
        else:
            matrix_data_frame = matrix_data_frame.assign(count=self.matrix.data)

        if not self.enforceInteger and self.matrix.dtype not in [np.int32, int]:
            log.debug("Writing non-standard cooler matrix. Datatype of matrix['count'] is: %s", self.matrix.dtype)
            dtype_pixel['count'] = self.matrix.dtype
        # Very large pixel tables are handed to cooler as a list of chunks.
        split_factor = 1
        if len(self.matrix.data) > 1e7:
            split_factor = 1e4
            matrix_data_frame = np.array_split(matrix_data_frame, split_factor)

        # appendData arrives as a boolean and is translated here to the file
        # mode used by cooler ('a' append / 'w' write).
        if self.appendData:
            self.appendData = 'a'
        else:
            self.appendData = 'w'

        info = {}
        # these fields are created by cooler lib. Can cause errors if not deleted.
        if 'metadata' in info:
            if self.hic_metadata is None:
                self.hic_metadata = info['metadata']
            del info['metadata']
        if 'bin-size' in info:
            del info['bin-size']
        if 'bin-type' in info:
            del info['bin-type']

        info['format'] = str('HDF5::Cooler')
        info['format-url'] = str('https://github.com/mirnylab/cooler')
        info['generated-by'] = str('HiCMatrix-' + version('HiCMatrix'))
        info['generated-by-cooler-lib'] = str('cooler-' + version('cooler'))

        info['tool-url'] = str('https://github.com/deeptools/HiCMatrix')

        # Move selected entries from hic_metadata into the cooler info dict
        # (they are deleted from hic_metadata so they are not written twice).
        if self.hic_metadata is not None and 'matrix-generated-by' in self.hic_metadata:
            info['matrix-generated-by'] = str(self.hic_metadata['matrix-generated-by'])
            del self.hic_metadata['matrix-generated-by']
        if self.hic_metadata is not None and 'matrix-generated-by-url' in self.hic_metadata:
            info['matrix-generated-by-url'] = str(self.hic_metadata['matrix-generated-by-url'])
            del self.hic_metadata['matrix-generated-by-url']
        log.debug('self.hic_metadata %s', self.hic_metadata)
        if self.hic_metadata is not None and 'genome-assembly' in self.hic_metadata:
            info['genome-assembly'] = str(self.hic_metadata['genome-assembly'])
            del self.hic_metadata['genome-assembly']

        return bins_data_frame, matrix_data_frame, dtype_pixel, info
    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
        """
        Save the matrix to pFileName in cool format.

        :param pFileName: output path (may carry a '::/group' cooler suffix)
        :param pSymmetric: forwarded to create_cooler_input()
        :param pApplyCorrection: forwarded to create_cooler_input()
        :return: None
        """
        log.debug('Save in cool format11112323')

        bins_data_frame, matrix_data_frame, dtype_pixel, info = self.create_cooler_input(pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)
        # keep cooler's temporary files next to the output file, not in /tmp
        local_temp_dir = os.path.dirname(os.path.realpath(pFileName))
        cooler.create_cooler(cool_uri=pFileName,
                             bins=bins_data_frame,
                             pixels=matrix_data_frame,
                             mode=self.appendData,
                             dtypes=dtype_pixel,
                             ordered=True,
                             metadata=info,

                             temp_dir=local_temp_dir)

        log.debug('info %s', info)
        # When writing a fresh file (not appending), also store the info dict
        # as HDF5 attributes on the file root.
        if self.appendData == 'w':
            # strip a possible '::/group' suffix to get the plain file name
            fileName = pFileName.split('::')[0]
            with h5py.File(fileName, 'r+') as h5file:
                h5file.attrs.update(info)
                h5file.close()  # NOTE(review): redundant inside 'with'; harmless
class Ginteractions(MatrixFile):
    """Writer for the GInteractions (BEDPE-like) TSV format; loading is unsupported."""

    def __init__(self, pMatrixFile):
        super().__init__(pMatrixFile)

    def load(self):
        log.error('Not implemented')

    def save(self, pFileName, pSymmetric=None, pApplyCorrection=None):
        """Write the upper triangle of the matrix as '<pFileName>.tsv'.

        Each line holds both bin intervals followed by the contact count.
        """
        log.debug(self.matrix.shape)
        upper = triu(self.matrix, k=0, format='csr').tocoo()
        with open(f"{pFileName}.tsv", 'w', encoding='utf-8') as fileh:
            # walk the coo triplets in storage order
            for row_bin, col_bin, counts in zip(upper.row, upper.col, upper.data):
                chr_row, start_row, end_row, _ = self.cut_intervals[row_bin]
                chr_col, start_col, end_col, _ = self.cut_intervals[col_bin]
                fileh.write(f"{chr_row}\t{int(start_row)}\t{int(end_row)}\t{chr_col}\t{int(start_col)}\t{int(end_col)}\t{counts}\n")
class H5(MatrixFile):
    """Reader/writer for Hi-C matrices in the HiCExplorer h5 (PyTables) format."""

    def __init__(self, pMatrixFile):
        super().__init__(pMatrixFile)

    def load(self):
        """
        Loads a matrix stored in h5 format
        :param matrix_filename:
        :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
        """
        log.debug('Load in h5 format')

        with tables.open_file(self.matrixFileName, 'r') as f:
            parts = {}
            try:
                # the four components of the stored csr matrix
                for matrix_part in ('data', 'indices', 'indptr', 'shape'):
                    parts[matrix_part] = getattr(f.root.matrix, matrix_part).read()
            except Exception:
                # Without all four csr components the matrix cannot be built.
                # Previously this fell through and crashed with a KeyError
                # below; re-raise the real cause instead.
                log.info('No h5 file. Please check parameters concerning the file type!')
                raise
            matrix = csr_matrix(tuple([parts['data'], parts['indices'], parts['indptr']]),
                                shape=parts['shape'])
            # get intervals
            intvals = {}
            for interval_part in ('chr_list', 'start_list', 'end_list', 'extra_list'):
                if toString(interval_part) == toString('chr_list'):
                    # chromosome names are stored as bytes; decode to str
                    chrom_list = getattr(f.root.intervals, interval_part).read()
                    intvals[interval_part] = toString(chrom_list)
                else:
                    intvals[interval_part] = getattr(f.root.intervals, interval_part).read()

            cut_intervals = list(zip(intvals['chr_list'], intvals['start_list'], intvals['end_list'], intvals['extra_list']))
            assert len(cut_intervals) == matrix.shape[0], \
                f"Error loading matrix. Length of bin intervals ({len(cut_intervals)}) is different than the " \
                f"size of the matrix ({matrix.shape[0]})"

            # get nan_bins (optional dataset)
            try:
                if hasattr(f.root, 'nan_bins'):
                    nan_bins = f.root.nan_bins.read()
                else:
                    nan_bins = np.array([])
            except Exception:  # pylint: disable=W0718
                nan_bins = np.array([])

            # get correction factors (optional dataset)
            try:
                if hasattr(f.root, 'correction_factors'):
                    correction_factors = f.root.correction_factors.read()
                    assert len(correction_factors) == matrix.shape[0], \
                        "Error loading matrix. Length of correction factors does not" \
                        "match size of matrix"
                    correction_factors = np.array(correction_factors)
                    # replace nan/inf factors by 0
                    mask = np.isnan(correction_factors)
                    correction_factors[mask] = 0
                    mask = np.isinf(correction_factors)
                    correction_factors[mask] = 0
                else:
                    correction_factors = None
            except Exception:  # pylint: disable=W0718
                correction_factors = None

            # get distance counts (optional dataset)
            try:
                if hasattr(f.root, 'distance_counts'):
                    # BUG FIX: previously read f.root.correction_factors here,
                    # returning the wrong dataset as distance_counts.
                    distance_counts = f.root.distance_counts.read()
                else:
                    distance_counts = None
            except Exception:  # pylint: disable=W0718
                distance_counts = None
        return matrix, cut_intervals, nan_bins, distance_counts, correction_factors

    def save(self, pFileName, pSymmetric=True, pApplyCorrection=None):
        """
        Saves a matrix using hdf5 format
        :param pFileName: output file name; '.h5' is appended when missing
        :param pSymmetric: store only the upper triangle if True
        :param pApplyCorrection: unused, kept for interface compatibility
        :return: None
        """
        log.debug('Save in h5 format')

        # self.restoreMaskedBins()
        if not pFileName.endswith(".h5"):
            pFileName += ".h5"

        # if the file name already exists
        # try to find a new suitable name
        if os.path.isfile(pFileName):
            log.warning("*WARNING* File already exists %s\n "
                        "Overwriting ...\n", pFileName)

            unlink(pFileName)
        if self.nan_bins is None:
            self.nan_bins = np.array([])
        elif not isinstance(self.nan_bins, np.ndarray):
            self.nan_bins = np.array(self.nan_bins)

        # save only the upper triangle of the
        if pSymmetric:
            # symmetric matrix
            matrix = triu(self.matrix, k=0, format='csr')
        else:
            matrix = self.matrix
        matrix.eliminate_zeros()

        filters = tables.Filters(complevel=5, complib='blosc')
        with tables.open_file(pFileName, mode="w", title="HiCExplorer matrix") as h5file:
            matrix_group = h5file.create_group("/", "matrix", )
            # save the parts of the csr matrix
            for matrix_part in ('data', 'indices', 'indptr', 'shape'):
                arr = np.array(getattr(matrix, matrix_part))
                atom = tables.Atom.from_dtype(arr.dtype)
                ds = h5file.create_carray(matrix_group, matrix_part, atom,
                                          shape=arr.shape,
                                          filters=filters)
                ds[:] = arr

            # save the matrix intervals; an explicit mapping replaces the
            # former eval() lookup of same-named locals (pylint W0123)
            intervals_group = h5file.create_group("/", "intervals", )
            chr_list, start_list, end_list, extra_list = zip(*self.cut_intervals)
            interval_arrays = {'chr_list': chr_list, 'start_list': start_list,
                               'end_list': end_list, 'extra_list': extra_list}
            for interval_part, interval_values in interval_arrays.items():
                arr = np.array(interval_values)
                atom = tables.Atom.from_dtype(arr.dtype)
                ds = h5file.create_carray(intervals_group, interval_part, atom,
                                          shape=arr.shape,
                                          filters=filters)
                ds[:] = arr

            # save nan bins
            if len(self.nan_bins):
                atom = tables.Atom.from_dtype(self.nan_bins.dtype)
                ds = h5file.create_carray(h5file.root, 'nan_bins', atom,
                                          shape=self.nan_bins.shape,
                                          filters=filters)
                ds[:] = self.nan_bins

            # save corrections factors
            if self.correction_factors is not None and len(self.correction_factors):
                self.correction_factors = np.array(self.correction_factors)
                # nan factors are stored as 0
                mask = np.isnan(self.correction_factors)
                self.correction_factors[mask] = 0
                atom = tables.Atom.from_dtype(self.correction_factors.dtype)
                ds = h5file.create_carray(h5file.root, 'correction_factors', atom,
                                          shape=self.correction_factors.shape,
                                          filters=filters)
                ds[:] = np.array(self.correction_factors)

            # save distance counts
            if self.distance_counts is not None and len(self.distance_counts):
                atom = tables.Atom.from_dtype(self.distance_counts.dtype)
                ds = h5file.create_carray(h5file.root, 'distance_counts', atom,
                                          shape=self.distance_counts.shape,
                                          filters=filters)
                ds[:] = np.array(self.distance_counts)
class Hicpro(MatrixFile):
    """HiC-Pro format: a sparse triplet matrix file plus a BED file of bins."""

    def __init__(self, pMatrixFile, pBedFile):
        super().__init__(pMatrixFileName=pMatrixFile, pBedFile=pBedFile)

    def load(self):
        """Parse the triplet matrix and its BED companion.

        :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
        """
        row_indices = []
        col_indices = []
        counts = []
        with open(self.matrixFileName, 'r', encoding="utf-8") as matrix_file:
            for line in matrix_file:
                x, y, value = line.strip().split('\t')
                # HiC-Pro bin ids are 1-based; convert to 0-based indices
                row_indices.append(int(x) - 1)
                col_indices.append(int(y) - 1)
                counts.append(float(value))

        cut_intervals = []
        with open(self.bedFile, 'r', encoding="utf-8") as bed_file:
            for line in bed_file:
                chrom, start, end, value = line.strip().split('\t')
                cut_intervals.append((chrom, int(start), int(end), int(value)))

        n_bins = len(cut_intervals)
        matrix = csr_matrix((counts, (row_indices, col_indices)), shape=(n_bins, n_bins))

        nan_bins = None
        distance_counts = None
        correction_factors = None
        return matrix, cut_intervals, nan_bins, distance_counts, correction_factors

    def save(self, pFileName, pSymmetric=None, pApplyCorrection=None):
        """Write the matrix as 1-based triplets and rewrite the companion BED file."""
        self.matrix.eliminate_zeros()
        rows, cols = self.matrix.nonzero()

        with open(pFileName, 'w', encoding="utf-8") as matrix_file:
            for x, y, value in zip(rows, cols, self.matrix.data):
                # convert back to the 1-based ids HiC-Pro expects
                matrix_file.write('\t'.join((str(int(x + 1)), str(int(y + 1)), str(value))) + '\n')

        with open(self.bedFile, 'w', encoding="utf-8") as bed_file:
            for i, interval in enumerate(self.cut_intervals):
                bed_file.write('\t'.join(map(str, interval[:3])) + '\t' + str(i + 1) + '\n')
self.matrix.nonzero() 44 | data = self.matrix.data 45 | 46 | with open(pFileName, 'w', encoding="utf-8") as matrix_file: 47 | for x, y, value in zip(instances, features, data): 48 | matrix_file.write(str(int(x + 1)) + '\t' + str(int(y + 1)) + '\t' + str(value) + '\n') 49 | 50 | with open(self.bedFile, 'w', encoding="utf-8") as bed_file: 51 | for i, interval in enumerate(self.cut_intervals): 52 | bed_file.write('\t'.join(map(str, interval[:3])) + '\t' + str(i + 1) + '\n') 53 | -------------------------------------------------------------------------------- /hicmatrix/lib/homer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import gzip 4 | 5 | from scipy.sparse import csr_matrix 6 | 7 | from hicmatrix.utilities import opener 8 | 9 | from .matrixFile import MatrixFile 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class Homer(MatrixFile): 15 | 16 | def __init__(self, pMatrixFile): 17 | super().__init__(pMatrixFile) 18 | 19 | def load(self): 20 | cut_intervals = [] 21 | 22 | # matrix_file = opener(self.matrixFileName) 23 | with opener(self.matrixFileName) as matrix_file: 24 | values = matrix_file.readline() 25 | values = values.strip().split(b'\t') 26 | 27 | # get bin size 28 | start_first = int(values[2].strip().split(b'-')[1]) 29 | start_second = int(values[3].strip().split(b'-')[1]) 30 | bin_size = start_second - start_first 31 | for value in values[2:]: 32 | chrom, start = value.strip().split(b'-') 33 | cut_intervals.append((chrom.decode('ascii'), int(start), int(start) + bin_size, 1)) 34 | 35 | matrix_dense = [] 36 | for line in matrix_file: 37 | values = line.split(b'\t') 38 | data = [] 39 | for value in values[2:]: 40 | data.append(float(value)) 41 | matrix_dense.append(data) 42 | # matrix_file.close() 43 | matrix = csr_matrix(matrix_dense) 44 | nan_bins = None 45 | distance_counts = None 46 | correction_factors = None 47 | return matrix, cut_intervals, nan_bins, distance_counts, 
import logging

log = logging.getLogger(__name__)


class MatrixFile():
    """
    Base class for all matrix file-format handlers (cool, h5, homer, hicpro, ...).

    Subclasses override load/save/is_of_type; this class only stores the
    state shared by every format.
    """

    def __init__(self, pMatrixFileName=None, pBedFile=None):
        self.matrixFileName = pMatrixFileName
        log.debug('self.matrixFileName %s', self.matrixFileName)
        self.matrix = None
        self.cut_intervals = None
        self.nan_bins = None
        self.correction_factors = None
        self.distance_counts = None
        # only used by formats with a companion BED file (HiC-Pro)
        self.bedFile = pBedFile

    def load(self):
        log.error('Not implemented')

    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):  # pylint: disable=W0613
        log.error('Not implemented')

    def is_of_type(self):
        log.error('Not implemented')

    def set_matrix_variables(self, pMatrix, pCutIntervals, pNanBins, pCorrectionFactors, pDistanceCounts):
        """Store the five matrix components on this handler prior to save()."""
        # fixed typo in log message ('Seeting' -> 'Setting')
        log.debug('Setting matrix variables')
        self.matrix = pMatrix
        self.cut_intervals = pCutIntervals
        self.nan_bins = pNanBins
        self.correction_factors = pCorrectionFactors
        self.distance_counts = pDistanceCounts
import importlib
import logging

log = logging.getLogger(__name__)


class MatrixFileHandler():
    """
    This class handles the load and save of the different Hi-C contact matrix formats.
    """

    def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None,
                 pApplyCorrectionCoolerLoad=None, pBedFileHicPro=None, pCorrectionFactorTable=None,
                 pCorrectionOperator=None, pEnforceInteger=None, pAppend=None, pFileWasH5=None, pHiCInfo=None, pHic2CoolVersion=None,
                 pDistance=None, pMatrixFormat=None, pLoadMatrixOnly=None, pNoCutIntervals=None):
        # Resolve the concrete handler class by naming convention:
        # module hicmatrix.lib.<filetype>, class <Filetype> (e.g. 'cool' -> Cool).
        self.class_ = getattr(importlib.import_module('.' + pFileType.lower(), package='hicmatrix.lib'), pFileType.title())

        if pFileType == 'hicpro':
            # hicpro additionally needs the companion BED file of bins
            self.matrixFile = self.class_(pMatrixFile=pMatrixFile, pBedFile=pBedFileHicPro)
        else:
            self.matrixFile = self.class_(pMatrixFile=pMatrixFile)
        if pFileType == 'cool':
            self.matrixFile.chrnameList = pChrnameList
        # Forward each optional setting only when explicitly given so the
        # handler classes keep their own defaults. NOTE(review): most of the
        # attributes below are presumably cool-specific — confirm against the
        # individual handler classes.
        if pCorrectionFactorTable is not None:
            self.matrixFile.correctionFactorTable = pCorrectionFactorTable
        if pCorrectionOperator is not None:
            self.matrixFile.correctionOperator = pCorrectionOperator
        if pEnforceInteger is not None:
            self.matrixFile.enforceInteger = pEnforceInteger
        if pAppend is not None:
            self.matrixFile.appendData = pAppend
        if pFileWasH5 is not None:
            self.matrixFile.fileWasH5 = pFileWasH5
        if pApplyCorrectionCoolerLoad is not None:
            self.matrixFile.applyCorrectionLoad = pApplyCorrectionCoolerLoad
        if pHiCInfo is not None:
            self.matrixFile.hic_metadata = pHiCInfo
        if pHic2CoolVersion is not None:
            self.matrixFile.hic2cool_version = pHic2CoolVersion
        if pDistance is not None:
            self.matrixFile.distance = pDistance
        if pMatrixFormat is not None:
            self.matrixFile.matrixFormat = pMatrixFormat
        if pLoadMatrixOnly is not None:
            self.matrixFile.matrixOnly = pLoadMatrixOnly
        if pNoCutIntervals is not None:
            self.matrixFile.noCutIntervals = pNoCutIntervals

    def load(self):
        # Delegate to the format-specific handler; returns the 5-tuple
        # (matrix, cut_intervals, nan_bins, distance_counts, correction_factors).

        return self.matrixFile.load()

    def set_matrix_variables(self, pMatrix, pCutIntervals, pNanBins, pCorrectionFactors, pDistanceCounts):
        self.matrixFile.set_matrix_variables(pMatrix, pCutIntervals, pNanBins, pCorrectionFactors, pDistanceCounts)

    def save(self, pName, pSymmetric, pApplyCorrection):
        self.matrixFile.save(pName, pSymmetric, pApplyCorrection)

    def load_init(self):
        # kept for interface compatibility; nothing to initialise here
        pass
class Scool(MatrixFile):
    """
    Writer for single-cell cool (scool) files: a bins table and a pixel table
    per cell, stored in one file via cooler.create_scool().
    """

    def __init__(self, pMatrixFile=None):
        super().__init__(pMatrixFile)
        log.debug('scool object created')
        # Input option A: a list of loaded cool matrix objects.
        self.coolObjectsList = None
        # Input option B: one shared bins table plus per-cell pixel tables
        # and the matching cell names.
        self.bins = None
        self.pixel_list = None
        self.name_list = None

    def load(self):
        raise NotImplementedError('Please use the specific cell to load the individual cool file from the scool file')

    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
        """
        Write all cells to pFileName as one scool file.

        :param pSymmetric: forwarded to create_cooler_input() (option A only)
        :param pApplyCorrection: forwarded to create_cooler_input() (option A only)
        """
        pixel_dict = {}
        bins_dict = {}

        if self.coolObjectsList is not None:
            # Option A: derive bins/pixels from each cool object.
            for coolObject in self.coolObjectsList:
                bins_data_frame, matrix_data_frame, dtype_pixel, _ = coolObject.matrixFile.create_cooler_input(pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)
                bins_dict[coolObject.matrixFile.matrixFileName] = bins_data_frame
                pixel_dict[coolObject.matrixFile.matrixFileName] = matrix_data_frame

        else:
            # Option B: use the prepared bins / pixel tables directly.
            try:
                dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': self.pixel_list[0]['count'].dtype}
                # dtype_pixel = self.pixel_list[0]['count'].dtype

                for i, pixels in enumerate(self.pixel_list):
                    bins_dict[self.name_list[i]] = self.bins
                    pixel_dict[self.name_list[i]] = pixels
                    log.debug('self.name_list[i] %s', self.name_list[i])
            except Exception as exp:  # pylint: disable=W0718
                # NOTE(review): if this fails, dtype_pixel is unbound at the
                # create_scool call below — pre-existing behaviour, kept as-is.
                log.debug('Exception %s', str(exp))

        # keep cooler's temporary files next to the output file
        local_temp_dir = os.path.dirname(os.path.realpath(pFileName))

        cooler.create_scool(cool_uri=pFileName, bins=bins_dict, cell_name_pixels_dict=pixel_dict,
                            dtypes=dtype_pixel,
                            ordered=True,
                            temp_dir=local_temp_dir)
def test_load_h5_save_and_load_cool():
    """Round-trip: load h5, save as cool, reload, compare data and bins."""
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')

    outfile = NamedTemporaryFile(suffix='.cool', prefix='hicexplorer_test')  # pylint: disable=R1732
    hic.matrixFileHandler = None
    hic.save(pMatrixName=outfile.name)

    hic_cool = hm.hiCMatrix(outfile.name)

    nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUG FIX: the reference intervals must come from the original matrix;
    # the old code unpacked hic_cool.cut_intervals twice and compared the
    # reloaded matrix to itself, so the assertions could never fail.
    chrom, start, end, _ = list(zip(*hic.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)
def test_load_h5_load_cool_weight():
    """The same matrix stored as h5 and cool must load to (almost) equal data."""
    hic_h5 = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')
    hic_cool = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool')

    # there is always a small gap due to rounding errors and inaccurate floating operations
    # test if it is equal for up to 10 decimal positions
    nt.assert_almost_equal(hic_cool.matrix.data, hic_h5.matrix.data, decimal=10)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUG FIX: compare against the h5 matrix; the old code unpacked
    # hic_cool.cut_intervals twice and compared it to itself.
    chrom, start, end, _ = list(zip(*hic_h5.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)


def test_load_h5_save_and_load_cool_2():
    """Round-trip small_test_matrix: h5 -> cool -> reload, compare to source."""
    hic = hm.hiCMatrix(ROOT + 'small_test_matrix.h5')

    outfile = NamedTemporaryFile(suffix='.cool', prefix='hicexplorer_test')  # pylint: disable=R1732
    hic.matrixFileHandler = None
    hic.save(pMatrixName=outfile.name)

    hic_cool = hm.hiCMatrix(outfile.name)

    nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUG FIX: reference intervals come from the original matrix, not hic_cool.
    chrom, start, end, _ = list(zip(*hic.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)


def test_load_cool_save_and_load_h5():
    """Round-trip: cool -> h5 -> reload, compare to source."""
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool')

    outfile = NamedTemporaryFile(suffix='.h5', prefix='hicexplorer_test')  # pylint: disable=R1732
    hic.matrixFileHandler = None
    hic.save(pMatrixName=outfile.name)

    hic_cool = hm.hiCMatrix(outfile.name)

    nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUG FIX: reference intervals come from the original matrix, not hic_cool.
    chrom, start, end, _ = list(zip(*hic.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)
def test_save_load_cool():
    """Build a tiny matrix in memory, save as cool, reload, compare all parts."""
    # NOTE(review): fixed /tmp path; a NamedTemporaryFile would avoid clashes
    outfile = '/tmp/matrix.cool'
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    # make matrix symmetric
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()
    # hic.correction_factors = np.array([0.5, 1, 2, 3, 4])
    # hic.nan_bins = np.array([4])

    hic.save(outfile)

    cool_obj = hm.hiCMatrix(outfile)
    # nt.assert_equal(hic.correction_factors, cool_obj.correction_factors)
    nt.assert_equal(hic.matrix.data, cool_obj.matrix.data)
    nt.assert_equal(hic.matrix.indices, cool_obj.matrix.indices)
    nt.assert_equal(hic.matrix.indptr, cool_obj.matrix.indptr)
    nt.assert_equal(hic.nan_bins, cool_obj.nan_bins)

    nt.assert_equal(hic.cut_intervals, cool_obj.cut_intervals)
    unlink(outfile)


def test_save_load_h5():
    """Build a tiny matrix in memory, save as h5, reload, compare all parts."""
    # NOTE(review): fixed /tmp path; a NamedTemporaryFile would avoid clashes
    outfile = '/tmp/matrix.h5'
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    # make matrix symmetric
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()
    # hic.correction_factors = np.array([0.5, 1, 2, 3, 4])
    # hic.nan_bins = np.array([4])

    hic.save(outfile)

    h5_obj = hm.hiCMatrix(outfile)
    # nt.assert_equal(hic.correction_factors, h5_obj.correction_factors)
    nt.assert_equal(hic.matrix.data, h5_obj.matrix.data)
    nt.assert_equal(hic.matrix.indices, h5_obj.matrix.indices)
    nt.assert_equal(hic.matrix.indptr, h5_obj.matrix.indptr)
    nt.assert_equal(hic.nan_bins, h5_obj.nan_bins)

    nt.assert_equal(hic.cut_intervals, h5_obj.cut_intervals)
    unlink(outfile)
@pytest.mark.xfail
def test_save_load_other_formats_fail():
    """Loading unsupported formats through hiCMatrix is expected to fail."""
    pMatrixFile = ROOT + 'test_matrix.hicpro'
    # pBedFileHicPro = ROOT + 'test_matrix.bed'  # no parameter for this in hiCMatrix::__init__() anyway
    # hic_matrix = hm.hiCMatrix(pMatrixFile=pMatrixFile)
    # out, err = capsys.readouterr()
    # assert out == 'matrix file not given'
    pMatrixFile = ROOT + 'test_matrix.homer'
    hm.hiCMatrix(pMatrixFile=pMatrixFile)


def test_convert_to_zscore_matrix():
    """convert_to_zscore_matrix must match a manually computed per-diagonal z-score."""

    # make test matrix
    m_size = 100
    mat = np.triu(np.random.randint(0, 101, (m_size, m_size)))
    # add a number of zeros
    mat[mat < 90] = 0
    # import ipdb;ipdb.set_trace()
    # per-diagonal mean and standard deviation of the reference matrix
    mu = dict([(idx, mat.diagonal(idx).mean()) for idx in range(mat.shape[0])])  # pylint: disable=R1717
    std = dict([(idx, np.std(mat.diagonal(idx)))  # pylint: disable=R1717
                for idx in range(mat.shape[0])])

    # compute z-score for test matrix
    zscore_mat = np.zeros((m_size, m_size))
    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                diag = _j - _i
                if std[diag] == 0:
                    # constant diagonal: z-score undefined
                    zscore = np.nan
                else:
                    zscore = (mat[_i, _j] - mu[diag]) / std[diag]
                zscore_mat[_i, _j] = zscore

    # make Hi-C matrix based on test matrix
    hic = hm.hiCMatrix()
    hic.matrix = csr_matrix(mat)
    cut_intervals = [('chr', idx, idx + 10, 0) for idx in range(0, mat.shape[0] * 10, 10)]
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.convert_to_zscore_matrix()

    nt.assert_almost_equal(hic.matrix.todense(), zscore_mat)
def test_convert_to_zscore_matrix_2():
    """convert_to_zscore_matrix(maxdepth) must match a manual per-distance z-score."""

    # load test matrix
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')
    hic.maskBins(hic.nan_bins)

    mat = hic.matrix.todense()
    max_depth = 10000
    bin_size = hic.getBinSize()
    max_depth_in_bins = int(float(max_depth) / bin_size)

    m_size = mat.shape[0]
    # compute matrix values per distance
    _, start, _, _ = list(zip(
        *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals)))
    dist_values = {}
    sys.stderr.write("Computing values per distance for each matrix entry\n")

    # collect all upper-triangle values grouped by their bin distance
    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                # dist is translated to bins
                dist = int(float(start[_j] - start[_i]) / bin_size)
                if dist <= max_depth_in_bins:
                    if dist not in dist_values:
                        dist_values[dist] = []
                    dist_values[dist].append(mat[_i, _j])

    mu = {}
    std = {}
    for dist, values in iteritems(dist_values):
        mu[dist] = np.mean(values)
        std[dist] = np.std(values)

    # compute z-score for test matrix
    sys.stderr.write("Computing zscore for each matrix entry\n")
    zscore_mat = np.full((m_size, m_size), np.nan)
    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                dist = int(float(start[_j] - start[_i]) / bin_size)
                if dist <= max_depth_in_bins:
                    zscore = (mat[_i, _j] - mu[dist]) / std[dist]
                    zscore_mat[_i, _j] = zscore

    # compare with zscore from class
    hic.convert_to_zscore_matrix(maxdepth=max_depth)

    # from numpy.testing import assert_almost_equal
    # only the main diagonal is check. Other diagonals show minimal differences
    nt.assert_almost_equal(hic.matrix.todense().diagonal(
        0).A1, zscore_mat.diagonal(0))
def test_dist_list_to_dict():
    """dist_list_to_dict must group count values by their genomic distance.

    A distance of -1 marks inter-chromosomal pairs (see getDistList).
    """
    hic = hm.hiCMatrix()

    counts = np.array([1, 8, 5, 3, 0, 4, 15, 5, 1, 0, 0, 2, 0, 1, 0])
    distances = np.array(
        [0, 10, 20, 30, -1, 0, 10, 20, -1, 0, 10, -1, 0, -1, 0])

    grouped = hic.dist_list_to_dict(counts, distances)

    # every count must land in the bucket of its distance, in input order
    expected = {
        -1: [0, 1, 2, 1],
        0: [1, 4, 0, 0, 0],
        10: [8, 15, 0],
        20: [5, 5],
        30: [3],
    }
    for dist, values in expected.items():
        nt.assert_equal(grouped[dist], values)

    counts = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])
    distances = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])

    grouped = hic.dist_list_to_dict(counts, distances)

    nt.assert_equal(grouped[0], [0, 0, 0, 0])
    nt.assert_equal(grouped[100], [100, 100, 100])
    nt.assert_equal(grouped[200], [200, 200])


def test_keepOnlyTheseChr():
    """keepOnlyTheseChr must restrict the matrix to the requested chromosomes."""
    chromosome_list = ['chrX', 'chr2RHet']

    hic = hm.hiCMatrix(ROOT + 'small_test_matrix.h5')

    hic.keepOnlyTheseChr(chromosome_list)

    # BUGFIX: list.sort() returns None, so the previous
    # ``hic.getChrNames().sort() == chromosome_list.sort()`` comparison was
    # vacuously True (None == None).  Compare sorted copies instead.
    nt.assert_equal(sorted(hic.getChrNames()), sorted(chromosome_list))
def test_diagflat():
    """diagflat must overwrite the main diagonal with a constant, or NaN by default."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()

    # an explicit value must appear on every diagonal entry
    hic.diagflat(value=1000)
    nt.assert_equal(np.full(counts.shape[0], 1000), hic.matrix.diagonal())

    # without a value the diagonal becomes NaN
    hic.diagflat()
    nt.assert_equal(np.full(5, np.nan), hic.matrix.diagonal())
def test_setMatrixValues_success():
    """setMatrixValues must replace the stored data when shapes agree."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    original = np.array([[1, 8, 5, 3, 0],
                         [0, 4, 15, 5, 1],
                         [0, 0, 0, 0, 2],
                         [0, 0, 0, 0, 1],
                         [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(original)
    hic.setMatrix(hic.matrix, cut_intervals)

    # the replacement matrix is simply the original scaled by ten
    replacement = original * 10

    hic.setMatrixValues(replacement)

    nt.assert_equal(hic.getMatrix(), replacement)
def _five_bin_hic():
    """Build the small two-chromosome matrix shared by the factor tests."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    return hic


def test_setCorrectionFactors_success():
    """setCorrectionFactors must accept exactly one factor per bin."""
    hic = _five_bin_hic()

    assert hic.correction_factors is None

    hic.setCorrectionFactors([5, 5, 5, 5, 5])

    nt.assert_equal(hic.correction_factors, [5, 5, 5, 5, 5])


def test_setCorrectionFactors_fail():
    """A factor list shorter than the bin count must raise AssertionError."""
    hic = _five_bin_hic()

    assert hic.correction_factors is None
    with pytest.raises(AssertionError):
        hic.setCorrectionFactors([5, 5, 5, 5])
0]]) 524 | 525 | hic.matrix = csr_matrix(matrix) 526 | hic.setMatrix(hic.matrix, cut_intervals) 527 | 528 | new_chr_order = ['b', 'a'] 529 | hic.reorderChromosomes(new_chr_order) 530 | 531 | nt.assert_equal(hic.chrBinBoundaries, OrderedDict( 532 | [('b', (0, 2)), ('a', (2, 5))])) 533 | 534 | old_chr_order = ['a', 'b'] 535 | hic.reorderChromosomes(old_chr_order) 536 | 537 | nt.assert_equal(hic.chrBinBoundaries, OrderedDict( 538 | [('a', (0, 3)), ('b', (3, 5))])) 539 | 540 | 541 | def test_reorderChromosomes_fail(): 542 | hic = hm.hiCMatrix() 543 | cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), 544 | ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)] 545 | 546 | hic.nan_bins = [] 547 | 548 | matrix = np.array([[1, 8, 5, 3, 0], 549 | [0, 4, 15, 5, 1], 550 | [0, 0, 0, 0, 2], 551 | [0, 0, 0, 0, 1], 552 | [0, 0, 0, 0, 0]]) 553 | 554 | hic.matrix = csr_matrix(matrix) 555 | hic.setMatrix(hic.matrix, cut_intervals) 556 | 557 | # name 'c' not in chromosome names, thus fail 558 | false_chr_order = ['a', 'b', 'c'] 559 | with pytest.raises(Exception) as context: 560 | hic.reorderChromosomes(false_chr_order) 561 | assert "Chromosome name 'c' not found." 
def test_reorderBins():
    """reorderBins must permute rows/columns and drop ids absent from the order."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), counts)

    # swapping bins 2 and 3 swaps the corresponding rows and columns
    swapped_order = [0, 1, 3, 2, 4]
    swapped = np.array([[1, 8, 3, 5, 0],
                        [0, 4, 5, 15, 1],
                        [0, 0, 0, 0, 1],
                        [0, 0, 0, 0, 2],
                        [0, 0, 0, 0, 0]])

    hic.reorderBins(swapped_order)
    nt.assert_equal(hic.getMatrix(), swapped)

    # the permutation is its own inverse, so applying it again restores counts
    hic.reorderBins(swapped_order)
    nt.assert_equal(hic.getMatrix(), counts)

    # an order smaller than the matrix drops the unused bin ids entirely
    subset_order = [2, 3]
    hic.reorderBins(subset_order)

    nt.assert_equal(hic.getMatrix(), np.zeros((2, 2)))
    nt.assert_equal(hic.matrix.shape, (2, 2))
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 2))]))
    nt.assert_equal(hic.cut_intervals, [('a', 20, 30, 1), ('b', 30, 40, 1)])
    nt.assert_equal(hic.nan_bins, [])
np.array([[0, 0, 2], 632 | [0, 0, 1], 633 | [0, 0, 0]]) 634 | 635 | masking_ids = [0, 1] 636 | hic.maskBins(masking_ids) 637 | 638 | nt.assert_equal(hic.getMatrix(), new_matrix) 639 | nt.assert_equal(sorted(hic.orig_cut_intervals), sorted([('a', 0, 10, 1), ('a', 10, 20, 1), 640 | ('a', 20, 30, 641 | 1), ('b', 30, 40, 1), 642 | ('b', 40, 50, 1)])) 643 | nt.assert_equal(sorted(hic.cut_intervals), sorted([('a', 20, 30, 1), ('b', 30, 40, 1), 644 | ('b', 40, 50, 1)])) 645 | nt.assert_equal(hic.chrBinBoundaries, OrderedDict( 646 | [('a', (0, 1)), ('b', (1, 3))])) 647 | nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4])) 648 | 649 | # direct return if masking_ids is None or has len() == 0, thus no changes to matrix 650 | masking_ids = None 651 | hic.maskBins(masking_ids) 652 | 653 | nt.assert_equal(hic.getMatrix(), new_matrix) 654 | nt.assert_equal(sorted(hic.orig_cut_intervals), sorted([('a', 0, 10, 1), ('a', 10, 20, 1), 655 | ('a', 20, 30, 656 | 1), ('b', 30, 40, 1), 657 | ('b', 40, 50, 1)])) 658 | nt.assert_equal(sorted(hic.cut_intervals), sorted([('a', 20, 30, 1), ('b', 30, 40, 1), 659 | ('b', 40, 50, 1)])) 660 | nt.assert_equal(hic.chrBinBoundaries, OrderedDict( 661 | [('a', (0, 1)), ('b', (1, 3))])) 662 | 663 | masking_ids = [] 664 | 665 | hic.maskBins(masking_ids) 666 | 667 | nt.assert_equal(hic.getMatrix(), new_matrix) 668 | nt.assert_equal(sorted(hic.orig_cut_intervals), sorted([('a', 0, 10, 1), ('a', 10, 20, 1), 669 | ('a', 20, 30, 670 | 1), ('b', 30, 40, 1), 671 | ('b', 40, 50, 1)])) 672 | nt.assert_equal(sorted(hic.cut_intervals), sorted([('a', 20, 30, 1), ('b', 30, 40, 1), 673 | ('b', 40, 50, 1)])) 674 | nt.assert_equal(hic.chrBinBoundaries, OrderedDict( 675 | [('a', (0, 1)), ('b', (1, 3))])) 676 | 677 | nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4])) 678 | 679 | 680 | def test_update_matrix(): 681 | hic = hm.hiCMatrix() 682 | cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), 683 | ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 
40, 50, 1)] 684 | 685 | hic.nan_bins = [] 686 | 687 | matrix = np.array([[1, 8, 5, 3, 0], 688 | [0, 4, 15, 5, 1], 689 | [0, 0, 0, 0, 2], 690 | [0, 0, 0, 0, 1], 691 | [0, 0, 0, 0, 0]]) 692 | 693 | hic.matrix = csr_matrix(matrix) 694 | hic.setMatrix(hic.matrix, cut_intervals) 695 | 696 | nt.assert_equal(hic.getMatrix(), matrix) 697 | 698 | new_cut_intervals = [('c', 0, 10, 1), ('d', 10, 20, 1), ('d', 20, 30, 1)] 699 | 700 | new_matrix = np.array([[3, 6, 4], 701 | [np.nan, 0, 2], 702 | [1, 0, 0]]) 703 | try: 704 | hic.update_matrix(new_matrix, new_cut_intervals) 705 | except AttributeError: 706 | pass 707 | # if matrix.shape[0] not equal to length of cut_intervals assertionError is raised 708 | short_cut_intervals = [('c', 0, 10, 1), ('d', 10, 20, 1)] 709 | 710 | with pytest.raises(AssertionError): 711 | hic.update_matrix(new_matrix, short_cut_intervals) 712 | 713 | # if matrix contains masked bins exception is raised 714 | masking_ids = [0, 1] 715 | hic.maskBins(masking_ids) 716 | 717 | with pytest.raises(Exception): 718 | hic.update_matrix(new_matrix, new_cut_intervals) 719 | 720 | 721 | def test_restoreMaskedBins(): 722 | hic = hm.hiCMatrix() 723 | cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), 724 | ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)] 725 | 726 | hic.nan_bins = [] 727 | 728 | matrix = np.array([[1, 8, 5, 3, 0], 729 | [0, 4, 15, 5, 1], 730 | [0, 0, 0, 0, 2], 731 | [0, 0, 0, 0, 1], 732 | [0, 0, 0, 0, 0]]) 733 | 734 | hic.matrix = csr_matrix(matrix) 735 | hic.setMatrix(hic.matrix, cut_intervals) 736 | 737 | nt.assert_equal(hic.getMatrix(), matrix) 738 | nt.assert_equal(hic.orig_bin_ids, []) 739 | 740 | # function should directly return if there are no masked_bins 741 | hic.restoreMaskedBins() 742 | 743 | nt.assert_equal(hic.getMatrix(), matrix) 744 | nt.assert_equal(hic.orig_bin_ids, []) 745 | 746 | # test general use 747 | # first get some masked bins 748 | masking_ids = [0, 1] 749 | hic.maskBins(masking_ids) 750 | 751 | new_matrix = 
np.array([[0, 0, 2], 752 | [0, 0, 1], 753 | [0, 0, 0]]) 754 | 755 | nt.assert_equal(hic.getMatrix(), new_matrix) 756 | nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4])) 757 | 758 | # and now restore masked bins 759 | hic.restoreMaskedBins() 760 | 761 | result_matrix = np.array([[np.nan, np.nan, np.nan, np.nan, np.nan], 762 | [np.nan, np.nan, np.nan, np.nan, np.nan], 763 | [np.nan, np.nan, 0, 0, 2], 764 | [np.nan, np.nan, 0, 0, 1], 765 | [np.nan, np.nan, 0, 0, 0]]) 766 | 767 | nt.assert_equal(hic.getMatrix(), result_matrix) 768 | nt.assert_equal(hic.orig_bin_ids, []) 769 | 770 | cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), 771 | ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)] 772 | hic = hm.hiCMatrix() 773 | hic.nan_bins = [] 774 | matrix = np.array([[0, 10, 5, 3, 0], 775 | [0, 0, 15, 5, 1], 776 | [0, 0, 0, 7, 3], 777 | [0, 0, 0, 0, 1], 778 | [0, 0, 0, 0, 0]], dtype=np.int32) 779 | 780 | # make the matrix symmetric: 781 | hic.matrix = csr_matrix(matrix + matrix.T) 782 | hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals) 783 | 784 | # Add masked bins masked bins 785 | hic.maskBins([3]) 786 | 787 | matrix = hic.matrix.todense() 788 | test_matrix = np.array([[0, 10, 5, 0], 789 | [10, 0, 15, 1], 790 | [5, 15, 0, 3], 791 | [0, 1, 3, 0]], dtype=np.int32) 792 | 793 | nt.assert_equal(matrix, test_matrix) 794 | 795 | cut_int = hic.cut_intervals 796 | test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)] 797 | 798 | nt.assert_equal(cut_int, test_cut_int) 799 | 800 | hic.restoreMaskedBins() 801 | 802 | dense = hic.matrix.todense() 803 | test_dense = np.array([[0., 10., 5., 0., 0.], 804 | [10., 0., 15., 0., 1.], 805 | [5., 15., 0., 0., 3.], 806 | [0., 0., 0., 0., 0.], 807 | [0., 1., 3., 0., 0.]]) 808 | 809 | nt.assert_equal(dense, test_dense) 810 | 811 | cut_int = hic.cut_intervals 812 | test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), 813 | ('a', 30, 40, 1), ('b', 40, 50, 1)] 814 | 815 
def test_reorderMatrix():
    """reorderMatrix must move the bins in ``orig`` to position ``dest``."""
    source_bins = (1, 3)
    destination = 2

    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), counts)

    hic.reorderMatrix(source_bins, destination)

    # bins 1 and 3 now sit at position 2; intervals follow the same order
    expected_matrix = np.array([[1, 3, 8, 5, 0],
                                [0, 0, 0, 0, 1],
                                [0, 5, 4, 15, 1],
                                [0, 0, 0, 0, 2],
                                [0, 0, 0, 0, 0]])
    expected_intervals = [('a', 0, 10, 1), ('b', 30, 40, 1),
                          ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)]

    nt.assert_equal(hic.getMatrix(), expected_matrix)
    nt.assert_equal(hic.matrix.shape, expected_matrix.shape)
    nt.assert_equal(hic.cut_intervals, expected_intervals)
888 | nt.assert_equal(hic.getMatrix(), new_matrix) 889 | 890 | # reset matrix 891 | matrix = np.array([[-1, 8, 5, 3, 0], 892 | [np.nan, 4, 15, 5, 1], 893 | [0, 0, 0, 0, 2], 894 | [0, 0, 0, 0, 1], 895 | [0, 0, 0, 0, 0]]) 896 | hic.matrix = csr_matrix(matrix) 897 | hic.setMatrix(hic.matrix, cut_intervals) 898 | 899 | # method should directly return if nothing to do, matrix stays the same 900 | hic.truncTrans() 901 | nt.assert_equal(hic.getMatrix(), matrix) 902 | 903 | 904 | def test_printchrtoremove(capsys): 905 | # get matrix 906 | hic = hm.hiCMatrix() 907 | cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), 908 | ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)] 909 | 910 | hic.nan_bins = [] 911 | 912 | matrix = np.array([[1, 8, 5, 3, 0], 913 | [0, 4, 15, 5, 1], 914 | [0, 0, 0, 0, 2], 915 | [0, 0, 0, 0, 1], 916 | [0, 0, 0, 0, 0]]) 917 | 918 | hic.matrix = csr_matrix(matrix) 919 | hic.setMatrix(hic.matrix, cut_intervals) 920 | 921 | nt.assert_equal(hic.getMatrix(), matrix) 922 | 923 | # first test exception message for no self.prev_to_remove 924 | to_remove = [0, 1] 925 | 926 | with pytest.raises(Exception): 927 | hic.printchrtoremove(to_remove) 928 | 929 | captured = capsys.readouterr() 930 | assert captured.out == "No self.prev_to_remove defined, defining it now." 
def test_get_chromosome_sizes_real():
    """get_chromosome_sizes_real must track sizes derived from the cut intervals."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), counts)

    # 'a' covers 0..30 and 'b' covers 30..50 -> sizes 31 and 21
    nt.assert_equal(hic.get_chromosome_sizes_real(),
                    OrderedDict([('a', 31), ('b', 21)]))

    # changing the intervals must be reflected in the reported sizes
    new_cut_intervals = [('a', 0, 10, 1), ('b', 10, 20, 1),
                         ('b', 20, 30, 1), ('c', 30, 40, 1), ('c', 40, 90, 1)]
    hic.setMatrix(hic.matrix, new_cut_intervals)

    nt.assert_equal(hic.get_chromosome_sizes_real(),
                    OrderedDict([('a', 11), ('b', 21), ('c', 61)]))
| 1002 | nt.assert_equal(hic.getMatrix(), matrix) 1003 | 1004 | # define expected outcome 1005 | expected_sizes = OrderedDict([('a', 30), ('b', 50)]) 1006 | 1007 | chrom_sizes = hic.get_chromosome_sizes() 1008 | 1009 | nt.assert_equal(chrom_sizes, expected_sizes) 1010 | 1011 | # define new intervals and test again 1012 | new_cut_intervals = [('a', 0, 10, 1), ('b', 10, 20, 1), 1013 | ('b', 20, 30, 1), ('c', 30, 40, 1), ('c', 40, 90, 1)] 1014 | 1015 | expected_sizes = OrderedDict([('a', 10), ('b', 30), ('c', 90)]) 1016 | 1017 | hic.setMatrix(hic.matrix, new_cut_intervals) 1018 | 1019 | chrom_sizes = hic.get_chromosome_sizes() 1020 | 1021 | nt.assert_equal(chrom_sizes, expected_sizes) 1022 | 1023 | 1024 | def test_intervalListToIntervalTree(capsys): 1025 | # get matrix 1026 | hic = hm.hiCMatrix() 1027 | cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), 1028 | ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)] 1029 | 1030 | hic.nan_bins = [] 1031 | 1032 | matrix = np.array([[1, 8, 5, 3, 0], 1033 | [0, 4, 15, 5, 1], 1034 | [0, 0, 0, 0, 2], 1035 | [0, 0, 0, 0, 1], 1036 | [0, 0, 0, 0, 0]]) 1037 | 1038 | hic.matrix = csr_matrix(matrix) 1039 | hic.setMatrix(hic.matrix, cut_intervals) 1040 | 1041 | nt.assert_equal(hic.getMatrix(), matrix) 1042 | 1043 | # empty list should raise AssertionError 1044 | interval_list = [] 1045 | with pytest.raises(AssertionError): 1046 | hic.intervalListToIntervalTree(interval_list) 1047 | 1048 | captured = capsys.readouterr() 1049 | assert captured.out == "Interval list is empty" 1050 | 1051 | # test with correct interval_list 1052 | interval_list = [('a', 0, 10, 1), ('a', 10, 20, 1), ('b', 20, 30, 1), ('b', 30, 50, 1), 1053 | ('b', 50, 100, 1), ('c', 100, 200, 1), ('c', 200, 210, 1), 1054 | ('d', 210, 220, 1), ('e', 220, 250)] 1055 | 1056 | tree, boundaries = hic.intervalListToIntervalTree(interval_list) 1057 | 1058 | # test tree 1059 | nt.assert_equal(tree['a'], IntervalTree([Interval(0, 10, 0), Interval(10, 20, 1)])) 1060 | 
def test_fillLowerTriangle():
    """fillLowerTriangle must mirror the upper triangle below the diagonal."""
    upper = csr_matrix(np.array([[12, 5, 3, 2, 0],
                                 [0, 11, 4, 1, 1],
                                 [0, 0, 9, 6, 0],
                                 [0, 0, 0, 10, 0],
                                 [0, 0, 0, 0, 0]]), dtype=np.int32)
    hic = hm.hiCMatrix()
    hic.matrix = upper
    hic.fillLowerTriangle()

    symmetric = np.array([[12, 5, 3, 2, 0],
                          [5, 11, 4, 1, 1],
                          [3, 4, 9, 6, 0],
                          [2, 1, 6, 10, 0],
                          [0, 1, 0, 0, 0]], dtype='i4')

    nt.assert_equal(hic.matrix.todense(), symmetric)


def test_getDistList():
    """getDistList must return intra-chromosome distances and -1 otherwise."""
    row, col = np.triu_indices(5)
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    dist_list, chrom_list = hm.hiCMatrix.getDistList(row, col, cut_intervals)

    dense = coo_matrix((dist_list, (row, col)),
                       shape=(5, 5), dtype=np.int32).todense()
    expected = np.array([[0, 10, 20, 30, -1],
                         [0, 0, 10, 20, -1],
                         [0, 0, 0, 10, -1],
                         [0, 0, 0, 0, -1],
                         [0, 0, 0, 0, 0]], dtype='i4')
    nt.assert_equal(dense, expected)

    # chromosome name per pair: the empty string marks inter-chromosomal pairs
    expected_chroms = ['a', 'a', 'a', 'a', '', 'a', 'a', 'a', '', 'a', 'a',
                       '', 'a', '', 'b']
    nt.assert_equal(chrom_list.tolist(), expected_chroms)
def test_maskChromosomes():
    """maskChromosomes must accept chromosome names present in the matrix."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.maskChromosomes(['a'])


@pytest.mark.xfail
def test_maskChromosomes_fail():
    """Masking a chromosome name that is not in the matrix is expected to fail."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)

    # 'c' does not exist, so this call should raise
    hic.maskChromosomes(['c'])

    print(hic.matrix)


def test_create_from_cool():
    """Loading .cool files must yield the expected sparse entries and nan bins."""
    hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_4chr.cool')
    nt.assert_equal(sorted(hic_ma.matrix.indices), [0, 3])
    nt.assert_equal(sorted(hic_ma.matrix.data), [1, 1])
    nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 4, 5, 6])

    hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_diag_4chr.cool')
    nt.assert_equal(sorted(hic_ma.matrix.indices), [0])
    nt.assert_equal(sorted(hic_ma.matrix.data), [1])
    nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 3, 4, 5])

    # after masking the nan bins a single bin remains
    hic_ma.maskBins(hic_ma.nan_bins)
    assert hic_ma.matrix.shape == (1, 1)
    assert hic_ma.getBinSize() == 50000


def test_load_cool_matrix_only():
    """pLoadMatrixOnly must return raw (row, col, data) triplets matching a full load."""
    full = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool', pUpperTriangleOnly=True)

    raw = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool', pUpperTriangleOnly=True,
                       pLoadMatrixOnly=True)
    raw_rows = raw.matrix[0]
    raw_cols = raw.matrix[1]
    raw_data = raw.matrix[2]

    full_rows, full_cols = full.matrix.nonzero()
    nt.assert_equal(full.matrix.data, raw_data)
    nt.assert_equal(full_rows, raw_rows)
    nt.assert_equal(full_cols, raw_cols)
/hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool -------------------------------------------------------------------------------- /hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool -------------------------------------------------------------------------------- /hicmatrix/test/test_data/Li_et_al_2015.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/Li_et_al_2015.cool -------------------------------------------------------------------------------- /hicmatrix/test/test_data/Li_et_al_2015.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/Li_et_al_2015.h5 -------------------------------------------------------------------------------- /hicmatrix/test/test_data/one_interaction_4chr.cool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/one_interaction_4chr.cool -------------------------------------------------------------------------------- /hicmatrix/test/test_data/one_interaction_diag_4chr.cool: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/one_interaction_diag_4chr.cool -------------------------------------------------------------------------------- /hicmatrix/test/test_data/small_test_matrix.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/small_test_matrix.h5 -------------------------------------------------------------------------------- /hicmatrix/test/test_data/test_matrix.homer: -------------------------------------------------------------------------------- 1 | HiCMatrix (directory=wtTagDir/) Regions 3R-1000000 3R-1020000 3R-1040000 3R-1060000 3R-1080000 3R-1100000 3R-1120000 3R-1140000 3R-1160000 3R-1180000 3R-1200000 3R-1220000 3R-1240000 2 | 3R-1000000 3R-1000000 1.000e+00 1.896e-01 2.163e-01 8.288e-02 1.431e-01 2.569e-01 1.315e-01 1.488e-01 -3.120e-02 1.430e-01 6.091e-02 3.546e-02 1.168e-01 3 | 3R-1020000 3R-1020000 1.896e-01 1.000e+00 3.695e-01 3.666e-01 1.456e-01 1.940e-01 2.517e-01 1.511e-01 2.184e-01 1.727e-01 1.676e-01 -1.512e-02 -6.450e-02 4 | 3R-1040000 3R-1040000 2.163e-01 3.695e-01 1.000e+00 3.818e-01 2.833e-01 2.460e-01 2.430e-01 3.630e-01 1.483e-01 2.690e-01 2.176e-01 -6.305e-02 -1.125e-01 5 | 3R-1060000 3R-1060000 8.288e-02 3.666e-01 3.818e-01 1.000e+00 3.246e-01 2.644e-01 2.107e-01 3.149e-01 2.863e-01 2.273e-01 2.582e-01 -1.020e-02 2.029e-02 6 | 3R-1080000 3R-1080000 1.431e-01 1.456e-01 2.833e-01 3.246e-01 1.000e+00 2.488e-01 2.928e-01 2.152e-01 3.685e-01 2.373e-01 1.003e-01 1.003e-01 4.465e-02 7 | 3R-1100000 3R-1100000 2.569e-01 1.940e-01 2.460e-01 2.644e-01 2.488e-01 1.000e+00 3.083e-01 3.408e-01 3.025e-01 1.565e-01 1.917e-01 -6.210e-02 7.574e-02 8 | 3R-1120000 3R-1120000 1.315e-01 2.517e-01 2.430e-01 2.107e-01 2.928e-01 
3.083e-01 1.000e+00 2.484e-01 2.986e-01 2.647e-01 2.333e-01 7.504e-02 -4.602e-02 9 | 3R-1140000 3R-1140000 1.488e-01 1.511e-01 3.630e-01 3.149e-01 2.152e-01 3.408e-01 2.484e-01 1.000e+00 3.777e-01 1.729e-01 1.445e-01 -1.355e-02 6.834e-02 10 | 3R-1160000 3R-1160000 -3.120e-02 2.184e-01 1.483e-01 2.863e-01 3.685e-01 3.025e-01 2.986e-01 3.777e-01 1.000e+00 1.299e-01 4.142e-02 2.557e-02 8.888e-02 11 | 3R-1180000 3R-1180000 1.430e-01 1.727e-01 2.690e-01 2.273e-01 2.373e-01 1.565e-01 2.647e-01 1.729e-01 1.299e-01 1.000e+00 2.826e-01 7.371e-02 -1.322e-01 12 | 3R-1200000 3R-1200000 6.091e-02 1.676e-01 2.176e-01 2.582e-01 1.003e-01 1.917e-01 2.333e-01 1.445e-01 4.142e-02 2.826e-01 1.000e+00 3.217e-01 1.061e-01 13 | 3R-1220000 3R-1220000 3.546e-02 -1.512e-02 -6.305e-02 -1.020e-02 1.003e-01 -6.210e-02 7.504e-02 -1.355e-02 2.557e-02 7.371e-02 3.217e-01 1.000e+00 1.326e-01 14 | 3R-1240000 3R-1240000 1.168e-01 -6.450e-02 -1.125e-01 2.029e-02 4.465e-02 7.574e-02 -4.602e-02 6.834e-02 8.888e-02 -1.322e-01 1.061e-01 1.326e-01 1.000e+00 15 | -------------------------------------------------------------------------------- /hicmatrix/test/test_data/test_matrix.homer.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/test_matrix.homer.gz -------------------------------------------------------------------------------- /hicmatrix/test/test_matrixFileHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from tempfile import NamedTemporaryFile 4 | 5 | import cooler 6 | import numpy as np 7 | import numpy.testing as nt 8 | import pytest 9 | 10 | from hicmatrix.lib import MatrixFileHandler 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data/") 15 | outfile_basename = '/tmp/matrix' 16 | 17 | 
def test_load_homer():
    """Load a plain-text homer matrix and check its first row and bins."""
    # create matrixFileHandler instance with filetype 'homer'
    pMatrixFile = ROOT + 'test_matrix.homer'
    fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # create test matrix

    test_matrix = np.array([[1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315,
                             0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168]])

    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    test_cut_intervals = [('3R', 1000000, 1020000, 1), ('3R', 1020000, 1040000, 1), ('3R', 1040000, 1060000, 1), ('3R', 1060000, 1080000, 1), ('3R', 1080000, 1100000, 1), ('3R', 1100000, 1120000, 1), ('3R', 1120000, 1140000, 1), ('3R', 1140000, 1160000, 1), ('3R', 1160000, 1180000, 1), ('3R', 1180000, 1200000, 1), ('3R', 1200000, 1220000, 1), ('3R', 1220000, 1240000, 1), ('3R', 1240000, 1260000, 1)]  # noqa E501
    nt.assert_equal(cut_intervals, test_cut_intervals)

    # homer files carry no nan-bin, distance or correction information
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None


def test_load_homer_gzip():
    """Same as test_load_homer but from a gzip-compressed homer file."""
    # create matrixFileHandler instance with filetype 'homer'
    pMatrixFile = ROOT + 'test_matrix.homer.gz'
    fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # create test matrix

    test_matrix = np.array([[1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315,
                             0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168]])

    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    test_cut_intervals = [('3R', 1000000, 1020000, 1), ('3R', 1020000, 1040000, 1), ('3R', 1040000, 1060000, 1), ('3R', 1060000, 1080000, 1), ('3R', 1080000, 1100000, 1), ('3R', 1100000, 1120000, 1), ('3R', 1120000, 1140000, 1), ('3R', 1140000, 1160000, 1), ('3R', 1160000, 1180000, 1), ('3R', 1180000, 1200000, 1), ('3R', 1200000, 1220000, 1), ('3R', 1220000, 1240000, 1), ('3R', 1240000, 1260000, 1)]  # noqa E501
    nt.assert_equal(cut_intervals, test_cut_intervals)

    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None


def test_save_homer():
    """Load a homer matrix and write it back out (save path smoke test)."""
    homer_outfile = outfile_basename + '.homer'

    # create matrixFileHandler instance with filetype 'homer'
    pMatrixFile = ROOT + 'test_matrix.homer'
    fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)  # noqa E501
    # and save it.
    fh.save(pName=homer_outfile, pSymmetric=False, pApplyCorrection=False)  # not implemented
    os.unlink(homer_outfile)


def test_load_h5():
    """Load an h5 matrix and verify bins, nan bins and correction factors."""
    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # first row of this matrix is expected to be all zeros
    test_matrix = np.array([[0. for i in range(11104)]])
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    nt.assert_equal(cut_intervals[0], ('X', 0, 2200, 0.0))
    nt.assert_equal(cut_intervals[1], ('X', 2200, 4702, 0.0))
    nt.assert_equal(cut_intervals[2], ('X', 4702, 7060, 0.0))
    nt.assert_equal(cut_intervals[3], ('X', 7060, 8811, 0.4))

    test_nan_bins = np.array([0, 1, 2, 3, 4, 5, 6, 7, 30, 31, 32, 51, 52, 53, 54, 81, 82, 83, 84, 94])  # noqa E501
    nt.assert_equal(nan_bins[0:20], test_nan_bins)

    assert distance_counts is None

    test_correction_factors = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0.90720049, 1.25516028])  # noqa E501
    nt.assert_almost_equal(correction_factors[0:10], test_correction_factors)


def test_save_h5():
    """Load an h5 matrix and write it back out (save path smoke test)."""
    h5_outfile = outfile_basename + '.h5'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)  # noqa E501
    # and save it.
    fh.save(h5_outfile, True, None)

    os.unlink(h5_outfile)


def test_load_hicpro():
    """Load a HiC-Pro matrix (matrix + bed file pair) and verify content."""
    # create matrixFileHandler instance with filetype 'hicpro'
    pMatrixFile = ROOT + 'test_matrix.hicpro'
    pBedFileHicPro = ROOT + 'test_matrix.bed'
    fh = MatrixFileHandler(pFileType='hicpro', pMatrixFile=pMatrixFile, pBedFileHicPro=pBedFileHicPro)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # create test matrix: mostly zeros with three known non-zero entries
    test_list = [0. for i in range(3113)]
    test_list.insert(0, 41.345793)
    test_list[827] = 5.42079
    test_list[1263] = 5.122642

    test_matrix = np.array([test_list])

    # and check for shape and values
    assert matrix[0].todense().shape == test_matrix.shape
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    test_cut_intervals = np.array([('chr1', 0, 1000000, 1), ('chr1', 1000000, 2000000, 2), ('chr1', 2000000, 3000000, 3),
                                   ('chr1', 3000000, 4000000, 4), ('chr1', 4000000, 5000000, 5), ('chr1', 5000000, 6000000, 6),
                                   ('chr1', 6000000, 7000000, 7), ('chr1', 7000000, 8000000, 8), ('chr1', 8000000, 9000000, 9),
                                   ('chr1', 9000000, 10000000, 10), ('chr1', 10000000, 11000000, 11), ('chr1', 11000000, 12000000, 12),
                                   ('chr1', 12000000, 13000000, 13), ('chr1', 13000000, 14000000, 14), ('chr1', 14000000, 15000000, 15),
                                   ('chr1', 15000000, 16000000, 16), ('chr1', 16000000, 17000000, 17), ('chr1', 17000000, 18000000, 18),
                                   ('chr1', 18000000, 19000000, 19), ('chr1', 19000000, 20000000, 20)])
    nt.assert_equal(cut_intervals[0:20], test_cut_intervals)

    assert nan_bins is None
    assert correction_factors is None
    assert distance_counts is None


@pytest.mark.xfail
def test_save_hicpro():
    """Saving in hicpro format is not implemented; expected to fail."""
    hicpro_outfile = outfile_basename + '.hicpro'

    # create matrixFileHandler instance with filetype 'hicpro'
    pMatrixFile = ROOT + 'test_matrix.hicpro'
    pBedFileHicPro = ROOT + 'test_matrix.bed'
    fh = MatrixFileHandler(pFileType='hicpro', pMatrixFile=pMatrixFile, pBedFileHicPro=pBedFileHicPro)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    # and save it.
    fh.save(pName=hicpro_outfile, pSymmetric=False, pApplyCorrection=False)  # not implemented
    os.unlink(hicpro_outfile)


def test_load_cool():
    """Load a cool matrix and verify bins, nan bins and correction factors."""
    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'Li_et_al_2015.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # test matrix
    test_matrix = np.array([[0. for i in range(11104)]])
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    test_cut_intervals = [('X', 0, 2200, 1.0), ('X', 2200, 4702, 1.0), ('X', 4702, 7060, 1.0),
                          ('X', 7060, 8811, 1.0), ('X', 8811, 11048, 1.0), ('X', 11048, 14329, 1.0),
                          ('X', 14329, 16847, 1.0), ('X', 16847, 19537, 1.0), ('X', 19537, 20701, 1.0),
                          ('X', 20701, 22321, 1.0), ('X', 22321, 24083, 1.0), ('X', 24083, 25983, 1.0),
                          ('X', 25983, 27619, 1.0), ('X', 27619, 29733, 1.0), ('X', 29733, 30973, 1.0),
                          ('X', 30973, 32214, 1.0), ('X', 32214, 34179, 1.0), ('X', 34179, 35987, 1.0),
                          ('X', 35987, 37598, 1.0), ('X', 37598, 39009, 1.0)]
    # element-wise comparison of the first 20 cut intervals
    for index, tup in enumerate(cut_intervals[0:20]):
        for ind, element in enumerate(tup):
            assert element == test_cut_intervals[index][ind]

    test_nan_bins = [0, 1, 2, 3, 4, 5, 6, 7, 30, 31]
    nt.assert_almost_equal(nan_bins[0:10], test_nan_bins)

    test_correction_factors = [0., 0., 0., 0., 0., 0., 0., 0., 1.1022922, 0.796711]
    nt.assert_almost_equal(correction_factors[0:10], test_correction_factors)

    assert distance_counts is None


def test_load_cool2():
    """Load a cool file that contains exactly one interaction."""
    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'one_interaction_4chr.cool'
    # The interaction is:
    # chr1	10000	chr1	200000
    bin_size = 50000
    # So there should be a 1 between the bin 0 and the bin 3
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # test data
    nt.assert_almost_equal(matrix.data, np.array([1]))

    # test matrix
    test_matrix = np.array([[0 for i in range(9167)]])
    nt.assert_almost_equal(matrix[3].todense(), test_matrix)
    test_matrix[0][3] = 1
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # expected bins: regular 50 kb bins plus a shorter terminal bin per chromosome
    test_cut_intervals = sum([[('chr1', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3909)],
                              [('chr1', 195450000, 195471971, 1.0)],
                              [('chrX', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3420)],
                              [('chrX', 171000000, 171031299, 1.0)],
                              [('chrY', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(1834)],
                              [('chrY', 91700000, 91744698, 1.0)],
                              [('chrM', 0, 16299, 1.0)]], [])

    for index, tup in enumerate(cut_intervals):
        for ind, element in enumerate(tup):
            assert element == test_cut_intervals[index][ind]

    test_nan_bins = [1, 2, 4, 5]
    nt.assert_almost_equal(nan_bins[:4], test_nan_bins)

    assert distance_counts is None
    assert correction_factors is None


def test_save_cool():
    """Round-trip a cool matrix through save/load and compare everything."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'Li_et_al_2015.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    # and save it.
    fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load()

    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)


def test_load_distance_cool():
    """Loading with pDistance must drop all interactions beyond that distance."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pChrnameList=['1'], pDistance=2500000)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    # and save it.
    fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load()

    # check distance load works as expected
    instances, features = matrix.nonzero()
    distances = np.absolute(instances - features)
    # log.debug('max: {}'.format(np.max(distances)))
    mask = distances > 1  # 2.5 mb res --> all with 2.5 Mb distance
    assert np.sum(mask) == 0

    fh = MatrixFileHandler(pFileType='cool', pChrnameList=['1'], pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data (without the distance restriction this time)
    matrix2, _, _, _, _ = fh.load()
    instances, features = matrix2.nonzero()
    distances = np.absolute(instances - features)
    mask = distances > 1  # 2.5 mb res --> all with 2.5 Mb distance
    assert np.sum(mask) > 0

    # check if load and save matrix are equal
    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)


def test_load_h5_save_cool():
    """Convert h5 -> cool and verify correction factors are applied/inverted."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # set matrix variables
    fh_new = MatrixFileHandler(pFileType='cool')

    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    fh_new.matrixFile.fileWasH5 = True
    # and save it.

    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load()

    # apply the per-bin correction factors manually to the raw counts
    instances, features = matrix.nonzero()
    instances_factors = correction_factors[instances]
    features_factors = correction_factors[features]
    instances_factors *= features_factors

    matrix_applied_correction = matrix.data / instances_factors
    nt.assert_almost_equal(matrix_applied_correction, matrix_test.data, decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    # cool stores the inverse of the h5 correction factors; nan/inf become 0
    correction_factors = 1 / correction_factors
    mask = np.isnan(correction_factors)
    correction_factors[mask] = 0
    mask = np.isinf(correction_factors)
    correction_factors[mask] = 0
    nt.assert_equal(correction_factors, correction_factors_test)

    # os.unlink(cool_outfile)
    os.unlink(cool_outfile)


def test_save_cool_enforce_integer():
    """Saving with pEnforceInteger must round counts to integers."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # set matrix variables
    fh_new = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)

    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    fh_new.matrixFile.fileWasH5 = True
    # and save it.

    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile, pApplyCorrectionCoolerLoad=False)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, _ = fh_test.load()

    # pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    # fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    # assert fh is not None

    # load data
    # matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # instances, features = matrix.nonzero()
    # instances_factors = correction_factors[instances]
    # features_factors = correction_factors[features]
    # instances_factors *= features_factors

    # matrix_applied_correction = matrix.data / instances_factors
    # mask = matrix.data == 0
    matrix.data = np.rint(matrix.data)
    matrix.eliminate_zeros()
    # matrix_test.eliminate_zeros()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    # os.unlink(cool_outfile)
    os.unlink(cool_outfile)


def test_load_cool_hic2cool_versions():
    """Matrices written by hic2cool 0.4.2 and 0.5.1 must load equivalently."""
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool'
    hic2cool_042 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR', pCorrectionOperator='*')
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    hic2cool_051 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR')

    # hic2cool_051 = MatrixFileHandler(pFileType='h5', pMatrixFile=, pCorrectionFactorTable='KR')
    # hic2cool_042 = hm.hiCMatrix(ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool')
    # hic2cool_051 = hm.hiCMatrix(ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool')

    # hic2cool_041 = hm.hiCMatrix(outfile.name)
    matrix, cut_intervals, nan_bins, distance_counts, _ = hic2cool_042.load()
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, _ = hic2cool_051.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)


def test_save_cool_apply_division():
    """Round-trip a cool matrix with the '/' correction operator."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'Li_et_al_2015.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionOperator='/')
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh_new = MatrixFileHandler(pFileType='cool', pCorrectionOperator='/')

    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)

    # and save it.

    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, _ = fh_test.load()
    pMatrixFile = ROOT + 'Li_et_al_2015.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionOperator='/')
    assert fh is not None
    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(cool_outfile)


def test_save_scool_matrixHandlersCool():
    """Write an scool file from a list of cool MatrixFileHandler objects."""

    outfile = NamedTemporaryFile(suffix='.scool', prefix='hicmatrix_scool_test')  # pylint: disable=R1732

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    matrix, cut_intervals, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()
    # three 'cells', all backed by the same matrix content
    matrixFileHandlerOutput1 = MatrixFileHandler(pFileType='cool', pMatrixFile='cell1', pEnforceInteger=False, pFileWasH5=False, pHic2CoolVersion=None)
    matrixFileHandlerOutput1.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)

    matrixFileHandlerOutput2 = MatrixFileHandler(pFileType='cool', pMatrixFile='cell2', pEnforceInteger=False, pFileWasH5=False, pHic2CoolVersion=None)
    matrixFileHandlerOutput2.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)

    matrixFileHandlerOutput3 = MatrixFileHandler(pFileType='cool', pMatrixFile='cell3', pEnforceInteger=False, pFileWasH5=False, pHic2CoolVersion=None)
    matrixFileHandlerOutput3.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = [matrixFileHandlerOutput1, matrixFileHandlerOutput2, matrixFileHandlerOutput3]

    matrixFileHandler.save(outfile.name, pSymmetric=True, pApplyCorrection=False)

    content_of_scool = cooler.fileops.list_scool_cells(outfile.name)
    content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
    for content in content_expected:
        assert content in content_of_scool


def test_save_scool_pixeltables():
    """Write an scool file from raw bin/pixel tables instead of cool objects."""
    outfile = NamedTemporaryFile(suffix='.scool', prefix='hicmatrix_scool_test')  # pylint: disable=R1732

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    cooler_obj = cooler.Cooler(pMatrixFile)
    bins = cooler_obj.bins()[:]
    pixels = cooler_obj.pixels()[:]

    pixelsList = [pixels, pixels, pixels]
    matrices_list = ['cell1', 'cell2', 'cell3']
    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = None
    matrixFileHandler.matrixFile.bins = bins
    matrixFileHandler.matrixFile.pixel_list = pixelsList
    matrixFileHandler.matrixFile.name_list = matrices_list
    matrixFileHandler.save(outfile.name, pSymmetric=True, pApplyCorrection=False)

    content_of_scool = cooler.fileops.list_scool_cells(outfile.name)
    content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
    for content in content_expected:
        assert content in content_of_scool


def test_load_cool_matrix_only():

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
import gzip
import sys

import numpy as np


def toString(s):
    """
    Return *s* as ``str``, recursing into lists and numpy arrays.

    This takes care of python2/3 differences: bytes are decoded as ascii
    on python3; everything that is already text (or of an unhandled type)
    is returned unchanged.
    """
    if isinstance(s, str):
        return s
    if isinstance(s, bytes):  # or isinstance(s, np.bytes_):
        if sys.version_info[0] == 2:
            return str(s)
        return s.decode('ascii')
    if isinstance(s, list):
        return [toString(x) for x in s]
    if isinstance(s, np.ndarray):
        return s.astype(str)
    return s


def toBytes(s):
    """
    Like toString, but for functions requiring bytes in python3.

    Strings are encoded as ascii; lists are converted element-wise;
    anything else is returned unchanged.
    """
    if sys.version_info[0] == 2:
        return s
    if isinstance(s, bytes):
        return s
    # if isinstance(s, np.bytes_):
    #     return np.bytes_(s)
    if isinstance(s, str):
        return bytes(s, 'ascii')
    if isinstance(s, list):
        return [toBytes(x) for x in s]
    return s


def check_chrom_str_bytes(pIteratableObj, pObj):
    """
    Return *pObj* converted (str <-> bytes) so its element type matches the
    elements of *pIteratableObj*.

    *pObj* may be a plain value or a list; for a non-empty list the type of
    its first element is taken as representative.
    """
    # determine the type of pObj (or of its first element for a list)
    if isinstance(pObj, list) and len(pObj) > 0:
        type_ = type(pObj[0])
    else:
        type_ = type(pObj)
    reference = next(iter(pIteratableObj))
    # BUGFIX: the previous check was isinstance(type(reference), type_),
    # which tested the *type object* itself and was therefore effectively
    # always true, forcing a (redundant) conversion on every call.
    # Compare the element itself instead.
    if not isinstance(reference, type_):
        if isinstance(reference, str):
            pObj = toString(pObj)
        elif isinstance(reference, (bytes, np.bytes_)):
            pObj = toBytes(pObj)
    return pObj


def convertNansToZeros(ma):
    """
    Replace all NaN entries in the ``data`` array of *ma* (a scipy sparse
    matrix) with 0.0, in place, and return *ma*.
    """
    nan_elements = np.flatnonzero(np.isnan(ma.data))
    if len(nan_elements) > 0:
        ma.data[nan_elements] = 0.0
    return ma


def convertNansToOnes(pArray):
    """
    Replace all NaN entries of the numpy array *pArray* with 1.0, in place,
    and return *pArray*.
    """
    nan_elements = np.flatnonzero(np.isnan(pArray))
    if len(nan_elements) > 0:
        pArray[nan_elements] = 1.0
    return pArray


def enlarge_bins(bin_intervals):
    r"""
    takes a list of consecutive, but not
    directly touching, bin intervals
    and joins them such that the
    end and start of consecutive bins
    is the same.

    >>> bin_intervals = [('chr1', 10, 50, 1), ('chr1', 50, 80, 2),
    ... ('chr2', 10, 60, 3), ('chr2', 70, 90, 4)]
    >>> enlarge_bins(bin_intervals)
    [('chr1', 0, 50, 1), ('chr1', 50, 80, 2), ('chr2', 0, 65, 3), ('chr2', 65, 90, 4)]
    """
    # enlarge remaining bins
    chr_start = True
    for idx in range(len(bin_intervals) - 1):
        chrom, start, end, extra = bin_intervals[idx]
        chrom_next, start_next, end_next, extra_next = bin_intervals[idx + 1]

        if chr_start is True:
            # first bin of a chromosome is anchored at position 0
            start = 0
            chr_start = False
        bin_intervals[idx] = (chrom, start, end, extra)
        if chrom == chrom_next and end != start_next:
            # close the gap between touching bins by meeting in the middle
            middle = start_next - int((start_next - end) / 2)
            bin_intervals[idx] = (chrom, start, middle, extra)
            bin_intervals[idx + 1] = (chrom, middle, end_next, extra_next)
        if chrom != chrom_next:
            chr_start = True

    # BUGFIX: handle the last bin explicitly. Previously it was reassigned
    # unchanged (a no-op), so when the last bin opened a new chromosome its
    # start was never anchored at 0 like every other chromosome-start bin,
    # and an empty input list raised IndexError.
    if bin_intervals:
        chrom, start, end, extra = bin_intervals[-1]
        if chr_start:
            start = 0
        bin_intervals[-1] = (chrom, start, end, extra)

    return bin_intervals
a file is compressed or not 108 | """ 109 | f = open(filename, 'rb') # pylint: disable=R1732 110 | # print("gzip or not?", f.read(2)) 111 | 112 | if f.read(2) == b'\x1f\x8b': 113 | f.seek(0) 114 | return gzip.GzipFile(fileobj=f) 115 | 116 | f.seek(0) 117 | return f 118 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "HiCMatrix" 7 | version = "17.2" 8 | authors = [ 9 | { name = "Lucille Lopez-Delisle, Joachim Wolff, Leily Rabbani, Vivek Bhardwaj, Fidel Ramirez", email = "lucille.delisle@epfl.ch" }, 10 | ] 11 | description = "Helper package which implements HiCMatrix class for HiCExplorer, pyGenomeTracks and scHiCExplorer." 12 | readme = "README.rst" 13 | requires-python = ">=3.7" 14 | classifiers = [ 15 | 'Intended Audience :: Science/Research', 16 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 17 | ] 18 | dependencies = [ 19 | "numpy >= 1.20", 20 | "scipy >= 1.2", 21 | "tables >= 3.5", 22 | "pandas >= 0.25", 23 | "cooler >= 0.8.9", 24 | "intervaltree >= 3.0", 25 | "importlib_metadata; python_version<'3.8'" 26 | ] 27 | 28 | [project.urls] 29 | Homepage = "https://github.com/deeptools/HiCMatrix" 30 | Issues = "https://github.com/deeptools/HiCMatrix/issues" 31 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::UserWarning 4 | ignore::FutureWarning 5 | ignore::DeprecationWarning 6 | ignore::ImportWarning --------------------------------------------------------------------------------