├── .github
│   └── workflows
│       ├── pylint.yml
│       ├── python-publish.yml
│       └── test.yml
├── .gitignore
├── .readthedocs.yml
├── HiCMatrix_env_ci.yml
├── LICENSE
├── README.rst
├── hicmatrix
│   ├── HiCMatrix.py
│   ├── __init__.py
│   ├── lib
│   │   ├── __init__.py
│   │   ├── cool.py
│   │   ├── ginteractions.py
│   │   ├── h5.py
│   │   ├── hicpro.py
│   │   ├── homer.py
│   │   ├── matrixFile.py
│   │   ├── matrixFileHandler.py
│   │   └── scool.py
│   ├── test
│   │   ├── test_HiCMatrix.py
│   │   ├── test_data
│   │   │   ├── GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool
│   │   │   ├── GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool
│   │   │   ├── Li_et_al_2015.cool
│   │   │   ├── Li_et_al_2015.h5
│   │   │   ├── one_interaction_4chr.cool
│   │   │   ├── one_interaction_diag_4chr.cool
│   │   │   ├── small_test_matrix.h5
│   │   │   ├── test_matrix.bed
│   │   │   ├── test_matrix.hicpro
│   │   │   ├── test_matrix.homer
│   │   │   └── test_matrix.homer.gz
│   │   └── test_matrixFileHandler.py
│   └── utilities.py
├── pyproject.toml
└── pytest.ini
/.github/workflows/pylint.yml:
--------------------------------------------------------------------------------
1 | name: Pylint
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - uses: mamba-org/setup-micromamba@main
11 | with:
12 | environment-file: ./HiCMatrix_env_ci.yml
13 | cache-downloads: true
14 | environment-name: HiCMatrix_env_ci
15 | - name: Analysing the code with pylint
16 | run: |
17 | # Disable
18 | # C0103: Invalid name
19 | # C0114: Missing module docstring
20 | # C0115: Missing class docstring
21 | # C0116: Missing function or method docstring
22 | # C0301: Line too long
23 | # C0302: Too many lines in module
24 | # R0801: Similar lines
25 | # R0902: Too many instance attributes
26 | # R0904: Too many public methods
27 | # R0912: Too many branches
28 | # R0913: Too many arguments
29 | # R0914: Too many local variables
30 | # R0915: Too many statements
31 | # R1702: Too many nested blocks
32 | # R1728: Consider using a generator
33 | pylint --disable C0103,C0114,C0115,C0116,C0301,C0302,R0801,R0902,R0904,R0912,R0913,R0914,R0915,R1702,R1728 $(git ls-files '*.py')
34 | shell: micromamba-shell {0}
35 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 | permissions:
23 | # IMPORTANT: this permission is mandatory for trusted publishing
24 | id-token: write
25 | steps:
26 | - uses: actions/checkout@v4
27 | - name: Set up Python
28 | uses: actions/setup-python@v3
29 | with:
30 | python-version: '3.8'
31 | - name: Install dependencies
32 | run: |
33 | python -m pip install --upgrade pip
34 | pip install build
35 | - name: Build package
36 | run: python -m build
37 | - name: Publish package
38 | uses: pypa/gh-action-pypi-publish@v1.8.14
39 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 | on: [push, pull_request]
3 |
4 | defaults:
5 | run:
6 | shell: bash -l {0}
7 |
8 | jobs:
9 | test-linux:
10 | name: Test on Linux
11 | runs-on: ubuntu-latest
12 | strategy:
13 | matrix:
14 | python-version:
15 | - "3.7"
16 | - "3.8"
17 | - "3.9"
18 | - "3.10"
19 | steps:
20 | - uses: actions/checkout@v4
21 | - name: Use python ${{ matrix.python-version }}
22 | run: echo -e "\n - python = ${{ matrix.python-version }}" >> ./HiCMatrix_env_ci.yml
23 | - uses: mamba-org/setup-micromamba@main
24 | with:
25 | environment-file: ./HiCMatrix_env_ci.yml
26 | cache-downloads: true
27 | environment-name: HiCMatrix_env_ci
28 | - name: pip install
29 | run: |
30 | python3 -m pip install .
31 | shell: micromamba-shell {0}
32 | - name: Test HiCMatrix
33 | run: |
34 | py.test hicmatrix/test/ --capture=sys
35 | shell: micromamba-shell {0}
36 | test-osx:
37 | name: Test on OSX
38 | runs-on: macos-12 # which is Intel and supported by bioconda macOS-latest is Apple silicon.
39 | strategy:
40 | matrix:
41 | python-version:
42 | - "3.8"
43 | - "3.9"
44 | - "3.10"
45 | steps:
46 | - uses: actions/checkout@v4
47 | - name: Use python ${{ matrix.python-version }}
48 | run: echo -e "\n - python = ${{ matrix.python-version }}" >> ./HiCMatrix_env_ci.yml
49 | - uses: mamba-org/setup-micromamba@main
50 | with:
51 | environment-file: ./HiCMatrix_env_ci.yml
52 | cache-downloads: true
53 | environment-name: HiCMatrix_env_ci
54 | - name: pip install
55 | run: |
56 | python3 -m pip install .
57 | shell: micromamba-shell {0}
58 | - name: Test HiCMatrix
59 | run: |
60 | py.test hicmatrix/test/ --capture=sys
61 | shell: micromamba-shell {0}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | MANIFEST
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | .pytest_cache/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | db.sqlite3
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # Environments
84 | .env
85 | .venv
86 | env/
87 | venv/
88 | ENV/
89 | env.bak/
90 | venv.bak/
91 |
92 | # Spyder project settings
93 | .spyderproject
94 | .spyproject
95 |
96 | # Rope project settings
97 | .ropeproject
98 |
99 | # mkdocs documentation
100 | /site
101 |
102 | # mypy
103 | .mypy_cache/
104 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # yaml file to configure readthedocs build
2 | python:
3 | setup_py_install: true
4 | pip_install: false
5 |
--------------------------------------------------------------------------------
/HiCMatrix_env_ci.yml:
--------------------------------------------------------------------------------
1 | name: HiCMatrix_env_ci
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | dependencies:
6 | - numpy >= 1.20
7 | - scipy >= 1.2
8 | - pandas >= 0.25
9 | - pytables >= 3.5
10 | - cooler >= 0.8.9
11 | - intervaltree >= 3.0
12 | - pytest
13 | - pylint
14 | - pytest-xdist
15 | - pytest-forked
16 | - nose
17 | - pathlib
18 | - configparser
19 | - build # For the upload
20 | - twine # For the upload
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 |
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | HiCMatrix
2 | ===========
3 |
4 | This library implements the central class of HiCExplorer to manage Hi-C interaction matrices. It is separated from the main project to enable the use of Hi-C matrices
5 | in other projects without depending on HiCExplorer. Moreover, it enables the use of the already separated pyGenomeTracks (former hicPlotTADs) in HiCExplorer
6 | because mutual dependencies are resolved.
7 |
8 | With version 8, we dropped the support for Python 2.
9 |
10 | Version 14 introduced the official support for scool file format, used by scHiCExplorer since version 5: https://github.com/joachimwolff/scHiCExplorer and https://schicexplorer.readthedocs.io/en/latest/.
11 |
12 | Read support
13 | -------------
14 |
15 | - h5
16 | - cool / mcool / scool
17 | - hicpro
18 | - homer
19 |
20 | Write support
21 | --------------
22 |
23 | - h5
24 | - cool / mcool
25 | - scool
26 | - homer
27 | - ginteractions
28 | - hicpro
29 |
30 | Citation:
31 | ^^^^^^^^^
32 |
33 | Joachim Wolff, Leily Rabbani, Ralf Gilsbach, Gautier Richard, Thomas Manke, Rolf Backofen, Björn A Grüning.
34 | **Galaxy HiCExplorer 3: a web server for reproducible Hi-C, capture Hi-C and single-cell Hi-C data analysis, quality control and visualization, Nucleic Acids Research**, Volume 48, Issue W1, 02 July 2020, Pages W177–W184, https://doi.org/10.1093/nar/gkaa220
35 |
--------------------------------------------------------------------------------
/hicmatrix/HiCMatrix.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from collections import Counter
3 | from collections import OrderedDict
4 |
5 | import time
6 |
7 | import cooler
8 | import numpy as np
9 | from intervaltree import Interval, IntervalTree
10 | from scipy.sparse import csr_matrix, dia_matrix
11 | from scipy.sparse import hstack as sparse_hstack
12 | from scipy.sparse import tril, triu
13 | from scipy.sparse import vstack as sparse_vstack
14 | from scipy.sparse import diags
15 | from scipy.sparse import lil_matrix
16 |
17 | from .lib import MatrixFileHandler
18 | from .utilities import check_chrom_str_bytes, toBytes, toString
19 |
20 | log = logging.getLogger(__name__)
21 |
22 | class hiCMatrix:
23 | """
24 | Class to handle Hi-C matrices
25 | contains routines to get intrachromosomal distances
26 | get sub matrices by chrname.
27 | """
28 |
29 | def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None, pNoIntervalTree=None, pUpperTriangleOnly=None,
30 | pMatrixFormat=None, pRestoreMaskedBins=None, pLoadMatrixOnly=None):
31 | self.non_homogeneous_warning_already_printed = False
32 | self.bin_size = None
33 | self.bin_size_homogeneous = None # track if the bins are equally spaced or not
34 | self.uncorrected_matrix = None
35 |
36 | self.matrix = None
37 | self.cut_intervals = None
38 | self.nan_bins = None
39 | self.correction_factors = None
40 | self.distance_counts = None
41 | # # when NaN bins are masked, this variable becomes contains the bin index
42 | # # needed to put the masked bins back into the matrix.
43 | self.orig_bin_ids = []
44 | self.orig_cut_intervals = [] # similar to orig_bin_ids. Used to identify the position of masked nan bins
45 | self.matrixFileHandler = None
46 | start_time = time.time()
47 | if pMatrixFile is not None:
48 | log.debug('Load self.matrixFileHandler')
49 | fileType = 'cool'
50 | if pMatrixFile.endswith('.h5'):
51 | fileType = 'h5'
52 | self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList, pDistance=pDistance, pMatrixFormat=pMatrixFormat, pLoadMatrixOnly=pLoadMatrixOnly)
53 | log.debug('init time: %s', time.time() - start_time)
54 | matrixFileHandler_load = self.matrixFileHandler.load()
55 | # check if there was any exception thrown in the load function
56 | if len(matrixFileHandler_load) == 2:
57 | raise ValueError(f'Matrix failed to load: {matrixFileHandler_load[1]}')
58 | self.matrix, self.cut_intervals, self.nan_bins, \
59 | self.correction_factors, self.distance_counts = matrixFileHandler_load
60 | if pLoadMatrixOnly is None or not pLoadMatrixOnly:
61 | if self.nan_bins is None:
62 | self.nan_bins = np.array([])
63 |
64 | if pUpperTriangleOnly is None or not pUpperTriangleOnly:
65 | self.fillLowerTriangle()
66 | start_time = time.time()
67 |
68 | if pRestoreMaskedBins is None or pRestoreMaskedBins:
69 | self.restoreMaskedBins()
70 | start_time = time.time()
71 |
72 | if pNoIntervalTree is None or not pNoIntervalTree:
73 | self.interval_trees, self.chrBinBoundaries = \
74 | self.intervalListToIntervalTree(self.cut_intervals)
75 | else:
76 | log.debug('no intervaltree')
77 |
78 | elif pMatrixFile is None:
79 | log.debug('Only init object, no matrix given.')
80 | else:
81 | raise ValueError('matrix file not given')
82 | log.debug('data loaded!')
83 |
84 | def save(self, pMatrixName, pSymmetric=True, pApplyCorrection=False, pHiCInfo=None):
85 | """ As an output format cooler and mcooler are supported.
86 | """
87 |
88 | if self.matrixFileHandler is None:
89 | fileType = 'cool'
90 | if pMatrixName.endswith('h5'):
91 | fileType = 'h5'
92 | self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pHiCInfo=pHiCInfo)
93 |
94 | self.restoreMaskedBins()
95 | self.matrixFileHandler.set_matrix_variables(self.matrix, self.cut_intervals, self.nan_bins,
96 | self.correction_factors, self.distance_counts)
97 | if pMatrixName.endswith('cool'):
98 | self.matrixFileHandler.matrixFile.hic_metadata = pHiCInfo
99 |
100 | if pMatrixName.endswith('cool') or pMatrixName.endswith('h5'):
101 | self.matrixFileHandler.save(pMatrixName, pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)
102 |
    def getInformationCoolerBinNames(self):
        # Log the bin-annotation column names available in the underlying
        # matrix file (delegates to the file handler's matrixFile object).
        log.info('The following columns are available: %s', self.matrixFileHandler.matrixFile.getInformationCoolerBinNames())
105 |
106 | def fillLowerTriangle(self):
107 | """
108 | checks if the matrix is complete or if only half of the matrix was saved.
109 | Returns a whole matrix.
110 | """
111 | # log.debug('sum of tril: {}'.format(tril(self.matrix, k=-1).sum()))
112 | if tril(self.matrix, k=-1).sum() == 0:
113 | # this case means that the lower triangle of the
114 | # symmetric matrix (below the main diagonal)
115 | # is zero. In this case, replace the lower
116 | # triangle using the upper triangle
117 | self.matrix = self.matrix + triu(self.matrix, 1).T
118 |
119 | # return matrix
120 |
121 | def setCutIntervals(self, cut_intervals):
122 | """
123 | Replace the cut_intervals of a matrix
124 | """
125 |
126 | # check that the matrix is squared
127 | if len(cut_intervals) != self.matrix.shape[0]:
128 | raise ValueError(f"Length of cut_intervals {len(cut_intervals)} does not match the "
129 | f"matrix size {self.matrix.shape}")
130 |
131 | self.cut_intervals = cut_intervals
132 | self.interval_trees, self.chrBinBoundaries = \
133 | self.intervalListToIntervalTree(self.cut_intervals)
134 |
135 | def setMatrix(self, matrix, cut_intervals):
136 | """
137 | Initialize a matrix with a given matrix
138 | and cut_intervals. Mostly useful for
139 | testing.
140 | """
141 |
142 | # check that the matrix is squared
143 | if matrix.shape[0] != matrix.shape[1]:
144 | raise ValueError(f"Matrix is not squared. Shape is {matrix.shape}")
145 | if len(cut_intervals) != matrix.shape[0]:
146 | raise ValueError(f"Length of cut_intervals {len(cut_intervals)} does not match the matrix size {matrix.shape}")
147 |
148 | self.matrix = matrix
149 | self.cut_intervals = cut_intervals
150 | self.interval_trees, self.chrBinBoundaries = \
151 | self.intervalListToIntervalTree(self.cut_intervals)
152 |
153 | def getBinSize(self):
154 | """
155 | estimates the bin size. In case the bin size
156 | is not equal for all bins (maybe except for the
157 | bin at the en of the chromosomes) a warning is issued.
158 | In case of uneven bins, the median is returned.
159 | """
160 | if self.bin_size is None:
161 | chrom, start, end, extra = zip(*self.cut_intervals)
162 | diff = np.array(end) - np.array(start)
163 | # If there is only one bin:
164 | if len(diff) == 1:
165 | self.bin_size = diff[0]
166 | return self.bin_size
167 | # If there are more bins, the diff will be compared
168 | # to the median of the differences between starts
169 | median = int(np.median(np.concatenate([np.diff([start for chro, start, end, extra in self.cut_intervals if chro == cur_chrom]) for cur_chrom, nb in Counter(chrom).items() if nb > 1])))
170 |
171 | # check if the bin size is
172 | # homogeneous
173 | if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01):
174 | self.bin_size_homogeneous = False
175 | if self.non_homogeneous_warning_already_printed is False:
176 | log.warning('Bin size is not homogeneous. \
177 | Median %f\n', median)
178 | self.non_homogeneous_warning_already_printed = True
179 | self.bin_size = median
180 | return self.bin_size
181 |
182 | def getMatrix(self):
183 | matrix = self.matrix.todense()
184 | if len(self.nan_bins):
185 | # to set NaN values the matrix type has to be
186 | # float. Corrected matrices are of float
187 | # type while uncorrected matrices are of
188 | # of int type
189 | if np.issubdtype(self.matrix, 'float') is False:
190 | matrix = matrix.astype(float)
191 | matrix[self.nan_bins, :] = np.nan
192 | matrix[:, self.nan_bins] = np.nan
193 |
194 | return matrix
195 |
196 | def getChrBinRange(self, chrName):
197 | """
198 | Given a chromosome name,
199 | This functions return the start and end bin indices in the matrix
200 | """
201 |
202 | if chrName in self.chrBinBoundaries:
203 | return self.chrBinBoundaries[chrName]
204 | raise ValueError(f"chrName: {chrName} not found in chrBinBoundaries"
205 | f"valid chromosomes are: {self.chrBinBoundaries.keys()}")
206 |
207 | def getChrNames(self):
208 | """
209 | returns the names of the chromosomes
210 | present in the matrix
211 | """
212 | return list(self.chrBinBoundaries)
213 |
214 | def getBinPos(self, binIndex):
215 | """
216 | given a bin, it returns the chromosome name,
217 | start position and end position
218 | """
219 | if binIndex < len(self.cut_intervals):
220 | return self.cut_intervals[binIndex]
221 | raise ValueError(f"binIndex: {binIndex} not found")
222 |
    def getRegionBinRange(self, chrname, startpos, endpos):
        """
        Given a chromosome region, this function returns
        the bin indices that overlap with such region.

        Returns a (startbin, endbin) tuple, or None when no bin covers one
        of the positions (IndexError path below).
        """

        try:
            # chromosome_size = hic_matrix.get_chromosome_sizes()
            # chrname = check_chrom_str_bytes(self.interval_trees, chrname)
            # Harmonize the str/bytes type of chrname with the type of the
            # interval-tree keys (cooler files may yield bytes names).
            if not isinstance(next(iter(self.interval_trees)), type(chrname)):
                if isinstance(next(iter(self.interval_trees)), str):
                    chrname = toString(chrname)
                elif isinstance(next(iter(self.interval_trees)), bytes):
                    chrname = toBytes(chrname)
                elif isinstance(next(iter(self.interval_trees)), np.bytes_):
                    chrname = toBytes(chrname)
            # chr_end_pos = chromosome_size[chrname]
            # self.interval_trees[chrname]
            if chrname not in self.interval_trees:
                raise ValueError(f"chromosome: {chrname} name not found in matrix"
                                 f"valid names are: {self.interval_trees.keys()}"
                                 )
        except KeyError as ke:
            # NOTE(review): nothing in the try block above raises KeyError,
            # and the ValueError it raises is not caught here, so this
            # handler looks like dead code — confirm before relying on it.
            log.exception("chromosome: %s name not found in matrix", chrname)
            log.exception("valid names are: ")
            log.exception(self.interval_trees.keys())
            log.exception(str(ke))

        try:
            startpos = int(startpos)
            endpos = int(endpos)
        except ValueError as ve:
            # NOTE(review): the conversion error is only logged; execution
            # continues with the unconverted values, which will most likely
            # fail in the tree query below.
            log.exception("%d or %d are not valid "
                          "position values.", startpos, endpos)
            log.exception(str(ve))

        try:
            # Query the tree over the half-open range [pos, pos + 1) to get
            # the bin containing the position; .data holds the bin index.
            startbin = sorted(self.interval_trees[chrname][startpos:startpos + 1])[0].data
            endbin = sorted(self.interval_trees[chrname][endpos:endpos + 1])[0].data
        except IndexError:
            # log.exception("chrname: " + chrname)
            # log.exception("len intervaltree: "+len(self.interval_trees[chrname]))
            # log.exception("start and end pos:" + startpos + ":::" + endpos )
            log.exception("Index error")
            return None

        return startbin, endbin
271 |
272 | @staticmethod
273 | def getDistList(rows, cols, cut_intervals):
274 | """
275 | Given a list of rows and cols
276 | an array is returned containing
277 | the genomic distance between
278 | each element of the row array
279 | with each element of the col array.
280 | -1 is returned for inter-chromosomal
281 | interactions.
282 |
283 | A matching list containing the chromosome name
284 | is also returned
285 | """
286 | chrnamelist, startlist, _, _ = zip(*cut_intervals)
287 | # now the distance between any two points
288 | # is computed and arranged such that for each
289 | # element of the data array, a corespondent distance is stored
290 | start_row = np.take(startlist, rows)
291 | start_col = np.take(startlist, cols)
292 | dist_list = start_col - start_row
293 |
294 | # now all distances that are between chromosomes are removed
295 | # to do this I convert the array of chromosomes to
296 | # a array of indices. Then, when subtracting the
297 | # values that correspond to matrix.row and matrix.col
298 | # using the array of indices, any value other
299 | # than 0 means inter-chromosomal row,col combination.
300 |
301 | # chr_id_list is based on a trick using np.unique
302 | # to get from a list of strings
303 | # a list of integers
304 | chr_id_list = np.unique(chrnamelist, return_inverse=True)[1]
305 |
306 | chr_row = np.take(chr_id_list, rows)
307 | chr_col = np.take(chr_id_list, cols)
308 | chr_diff = chr_row - chr_col
309 | # set in dist_list array '-1' for all interchromosomal values
310 | dist_list[chr_diff != 0] = -1
311 |
312 | # make a corresponding chromosome name list
313 | # if filtering per chromosome is required
314 | chrom_list = np.take(chrnamelist, rows)
315 | chrom_list[chr_diff != 0] = ''
316 |
317 | return dist_list, chrom_list
318 |
319 | @staticmethod
320 | def fit_cut_intervals(cut_intervals):
321 | # check that the matrix has bins of same size
322 | # otherwise try to adjust the bins to
323 | # to match a regular binning
324 | if len(cut_intervals) <= 1:
325 | # do nothing if there is only one interval
326 | return cut_intervals
327 | chrom, start, end, extra = zip(*cut_intervals)
328 |
329 | median = int(np.median(np.concatenate([np.diff([start for chro, start, end, extra in cut_intervals if chro == cur_chrom]) for cur_chrom, nb in Counter(chrom).items() if nb > 1])))
330 | diff = np.array(end) - np.array(start)
331 | # check if the bin size is homogeneous
332 | if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01):
333 | # set the start position of a bin to the closest multiple
334 | # of the median
335 | def snap_nearest_multiple(start_x, m):
336 | resi = [-1 * (start_x % m), -start_x % m]
337 | return start_x + resi[np.argmin(np.abs(resi))]
338 | start = [snap_nearest_multiple(x, median) for x in start]
339 | end = [snap_nearest_multiple(x, median) for x in end]
340 | cut_intervals = list(zip(chrom, start, end, extra))
341 | log.info('[getCountsByDistance] Bin size is not '
342 | 'homogeneous, setting \n'
343 | 'the bin distance to the median: %f\n', median)
344 | return cut_intervals
345 |
    def convert_to_zscore_matrix(self, maxdepth=None, perchr=False):
        # Convenience wrapper: delegate to convert_to_obs_exp_matrix with
        # zscore=True so the result is per-distance z-scores.
        return self.convert_to_obs_exp_matrix(maxdepth=maxdepth, zscore=True, perchr=perchr)
348 |
349 | def convert_to_obs_exp_matrix(self, maxdepth=None, zscore=False, perchr=False, pSkipTriu=False):
350 | """
351 | Converts a corrected counts matrix into a
352 | obs / expected matrix or z-scores fast.
353 |
354 | The caveat is that the obs/exp or z-score are only
355 | computed for non-zero values, although zero values that
356 | are not part of the sparse matrix are considered.
357 |
358 | For each diagonal the mean (and std when computing z-scores) are
359 | calculated and then each non-zero value of the sparse matrix is
360 | replaced by the obs/exp or z-score.
361 |
362 | Parameters
363 | ----------
364 | maxdepth: maximum distance from the diagonal to consider. All contacts beyond this distance will not
365 | be considered.
366 | zscore: if a zscore wants to be returned instead of obs/exp
367 |
368 |
369 | Returns
370 | -------
371 | observed / expected sparse matrix
372 |
373 |
374 | nans occur where the standard deviation is zero
375 | """
376 |
377 | binsize = self.getBinSize()
378 | max_depth_in_bins = None
379 |
380 | if maxdepth:
381 | if maxdepth < binsize:
382 | raise ValueError(f"Please specify a maxDepth larger than bin size ({binsize})")
383 |
384 | max_depth_in_bins = int(float(maxdepth * 1.5) / binsize)
385 | # work only with the upper matrix
386 | # and remove all pixels that are beyond
387 | # max_depth_in_bis
388 | # (this is done by subtracting a second sparse matrix
389 | # that contains only the upper matrix that wants to be removed.
390 | if not pSkipTriu:
391 | self.matrix = triu(self.matrix, k=0, format='csr') - \
392 | triu(self.matrix, k=max_depth_in_bins, format='csr')
393 | else:
394 | if not pSkipTriu:
395 | self.matrix = triu(self.matrix, k=0, format='csr')
396 |
397 | self.matrix.eliminate_zeros()
398 | depth = None
399 | if zscore is True:
400 | m_size = self.matrix.shape[0]
401 | if max_depth_in_bins is not None:
402 | depth = max_depth_in_bins
403 | else:
404 | depth = m_size
405 | estimated_size_dense_matrix = m_size ** 2 * 8
406 | if estimated_size_dense_matrix > 100e6:
407 | log.info("To compute z-scores a dense matrix is required. This will use \n"
408 | "%f Mb of memory.\n To reduce memory use the maxdeph option."
409 | "", estimated_size_dense_matrix / 1e6)
410 |
411 | # to compute zscore the zero values need to be accounted and the matrix
412 | # need to become dense. This is only practical if only up to certain distance
413 | # wants to be evaluated, otherwise the dense matrix is too large.
414 | # To make the matrix dense and keep the same computations as when
415 | # the matrix is sparse the following is done:
416 | # A sparse diagonal matrix of shape = matrix.shape is created with ones
417 | # (only upper triangle contains diagonals up to maxdeph)
418 | # This sparse matrix is then added to self.matrix
419 | # then, -1 is subtracted to the self.matrix.data, thus effectively
420 | # adding zeros.
421 | diag_mat_ones = diags(np.repeat([1], m_size * depth).reshape(depth, m_size), list(range(depth)))
422 |
423 | self.matrix += diag_mat_ones
424 |
425 | trasf_matrix = lil_matrix(self.matrix.shape)
426 |
427 | chr_submatrix = OrderedDict()
428 | cut_intervals = OrderedDict()
429 | chrom_sizes = OrderedDict()
430 | chrom_range = OrderedDict()
431 | if perchr:
432 | for chrname in self.getChrNames():
433 | chr_range = self.getChrBinRange(chrname)
434 | chr_submatrix[chrname] = self.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]].tocoo()
435 | cut_intervals[chrname] = [self.cut_intervals[x] for x in range(chr_range[0], chr_range[1])]
436 | chrom_sizes[chrname] = [chr_submatrix[chrname].shape[0]]
437 | chrom_range[chrname] = (chr_range[0], chr_range[1])
438 |
439 | else:
440 | chr_submatrix['all'] = self.matrix.tocoo()
441 | cut_intervals['all'] = self.cut_intervals
442 | # chrom_sizes['all'] = np.array([v[1] - v[0] for k, v in iteritems(self.chrBinBoundaries)])
443 | chrom_sizes['all'] = np.array([v[1] - v[0] for k, v in self.chrBinBoundaries.items()])
444 |
445 | chrom_range['all'] = (0, self.matrix.shape[0])
446 |
447 | # for chrname, submatrix in iteritems(chr_submatrix):
448 | for chrname, submatrix in chr_submatrix.items():
449 |
450 | log.info("processing chromosome %s\n", chrname)
451 | if zscore is True:
452 | # this step has to be done after tocoo()
453 | submatrix.data -= 1
454 |
455 | dist_list, _ = self.getDistList(submatrix.row, submatrix.col,
456 | hiCMatrix.fit_cut_intervals(cut_intervals[chrname]))
457 |
458 | # to get the sum of all values at a given distance I use np.bincount which
459 | # is quite fast. However, the input of bincount is positive integers. Moreover
460 | # it returns the sum for every consecutive integer, even if this is not on the list.
461 | # Thus, dist_list, which contains the distance in bp between any two bins is
462 | # converted to bin distance.
463 |
464 | # Because positive integers are needed we add +1 to all bin distances
465 | # such that the value of -1 (which means different chromosomes) can now be used
466 |
467 | dist_list[dist_list == -1] = -binsize # pylint: disable=E1130
468 | # divide by binsize to get a list of bin distances and add +1 to remove negative values
469 | dist_list = (np.array(dist_list).astype(float) / binsize).astype(int) + 1
470 |
471 | # for each distance, return the sum of all values
472 | sum_counts = np.bincount(dist_list, weights=submatrix.data)
473 | distance_len = np.bincount(dist_list)
474 | # compute the average for each distance
475 | mat_size = submatrix.shape[0]
476 | mu = {}
477 | std = {}
478 | # compute mean value for each distance
479 |
480 | for bin_dist_plus_one, sum_value in enumerate(sum_counts):
481 | if maxdepth and bin_dist_plus_one == 0: # this is for intra chromosomal counts
482 | # when max depth is set, the computation
483 | # of the total_intra is not accurate and is safer to
484 | # output np.nan
485 | mu[bin_dist_plus_one] = np.nan
486 | std[bin_dist_plus_one] = np.nan
487 | continue
488 |
489 | if bin_dist_plus_one == 0:
490 | total_intra = mat_size ** 2 - sum([size ** 2 for size in chrom_sizes[chrname]])
491 | diagonal_length = int(total_intra / 2)
492 | else:
493 | # to compute the average counts per distance we take the sum_counts and divide
494 | # by the number of values on the respective diagonal
495 | # which is equal to the size of each chromosome - the diagonal offset (for those
496 | # chromosome larger than the offset)
497 | # In the following example with two chromosomes
498 | # the first (main) diagonal has a size equal to the matrix (6),
499 | # while the next has 1 value less for each chromosome (4) and the last one has only 2 values
500 |
501 | # 0 1 2 . . .
502 | # - 0 1 . . .
503 | # - - 0 . . .
504 | # . . . 0 1 2
505 | # . . . - 0 1
506 | # . . . - - 0
507 |
                # idx - 1 because earlier the values were
                # shifted.
510 | diagonal_length = sum([size - (bin_dist_plus_one - 1) for size in chrom_sizes[chrname] if size > (bin_dist_plus_one - 1)])
511 | log.debug("Type of diagonal_length %s", type(diagonal_length))
512 |
513 | # the diagonal length should contain the number of values at a certain distance.
514 | # If the matrix is dense, the distance_len[bin_dist_plus_one] correctly contains the number of values
515 | # If the matrix is equally spaced, then, the diagonal_length as computed before is accurate.
516 | # But, if the matrix is both sparse and with unequal bins, then none of the above methods is
                # accurate but the diagonal_length as computed before will be closer.
518 | diagonal_length = max(diagonal_length, distance_len[bin_dist_plus_one])
519 | log.debug("Type of diagonal_length %s", type(diagonal_length))
520 |
521 | if diagonal_length == 0:
522 | mu[bin_dist_plus_one] = np.nan
523 | else:
524 | mu[bin_dist_plus_one] = np.float64(sum_value) / diagonal_length
525 |
526 | if np.isnan(sum_value):
527 | log.info("nan value found for distance %f\n", (bin_dist_plus_one - 1) * binsize)
528 |
529 | # if zscore is needed, compute standard deviation: std = sqrt(mean(abs(x - x.mean())**2))
530 | if zscore:
531 | values_sqrt_diff = \
532 | np.abs((submatrix.data[dist_list == bin_dist_plus_one] - mu[bin_dist_plus_one]) ** 2)
533 | # the standard deviation is the sum of the differences with mu squared (value variable)
534 | # plus all zeros that are not included in the sparse matrix
535 | # for which the standard deviation is
536 | # (0 - mu)**2 = (mu)**2
537 | # The number of zeros is the diagonal length - the length of the non zero values
538 | zero_values_sqrt_diff_sum = (diagonal_length - len(values_sqrt_diff)) * mu[bin_dist_plus_one] ** 2
539 |
540 | _std = np.sqrt((values_sqrt_diff.sum() + zero_values_sqrt_diff_sum) / diagonal_length)
541 | std[bin_dist_plus_one] = _std
542 |
543 | # use the expected values to compute obs/exp
544 | transf_ma = np.zeros(len(submatrix.data))
545 | for idx, value in enumerate(submatrix.data):
546 | if depth is not None and dist_list[idx] > depth + 1:
547 | continue
548 | if zscore:
549 | if std[dist_list[idx]] == 0:
550 | transf_ma[idx] = np.nan
551 | else:
552 | transf_ma[idx] = (value - mu[dist_list[idx]]) / std[dist_list[idx]]
553 | else:
554 | transf_ma[idx] = value / mu[dist_list[idx]]
555 |
556 | submatrix.data = transf_ma
557 | trasf_matrix[chrom_range[chrname][0]:chrom_range[chrname][1], chrom_range[chrname][0]:chrom_range[chrname][1]] = submatrix.tolil()
558 |
559 | self.matrix = trasf_matrix.tocsr()
560 |
561 | return self.matrix
562 |
563 | @staticmethod
564 | def dist_list_to_dict(data, dist_list):
565 | """
566 | splits data, into numeric groups defined by dist_list
567 | Return a dictionary containing, for
568 | each unique distance a dictionary
569 | """
570 |
571 | order = np.argsort(dist_list)
572 | dist_list = dist_list[order]
573 | data = data[order]
574 |
575 | # having the dist_list sorted, np.split
576 | # is used to divide the data into
577 | # groups that lie at the same distance, for this
578 | # np.diff together with np.flatnonzero is used to
579 | # find the indices where the distance changes.
580 | # the '+1' is needed because the np.diff array is
581 | # one element smaller than the original array, thus
582 | # the indices based no the np.diff array are off by 1
583 | # with respect to the original array
584 | groups = np.split(data, np.flatnonzero(np.diff(dist_list)) + 1)
585 |
586 | # because the dist_list is sorted
587 | # the order of the unique values
588 | # corresponds to that of the groups.
589 | # In other words, group[0]
590 | # has distance_unique[0]
591 | # np.sort after np.unique in theory
592 | # is not needed, but just in case...
593 | distance_unique = np.sort(np.unique(dist_list))
594 |
595 | # convert to dictionary having as key
596 | # the distance
597 | distance = {}
598 | for index, d in enumerate(distance_unique):
599 | distance[d] = groups[index]
600 |
601 | return distance
602 |
    def keepOnlyTheseChr(self, chromosome_list):
        """
        Given a list of chromosome names,
        these are kept, while any other is removed
        from the matrix.

        :param chromosome_list: chromosome names to keep
        :raises ValueError: if a requested name is not present in the matrix
        :return: the reduced sparse matrix
        """
        chromosome_list = check_chrom_str_bytes(self.interval_trees, chromosome_list)

        try:
            # fail early if any requested chromosome is unknown
            [self.chrBinBoundaries[x] for x in chromosome_list]
        except KeyError as e:
            raise ValueError(f"Chromosome name not in matrix. {str(e)}") from e

        # operate on the full (unmasked) matrix so bin indices are stable
        self.restoreMaskedBins()
        size = self.matrix.shape
        # initialize a 1D array containing the columns (and rows) to
        # select. By default none are selected
        sel = np.empty(size[0], dtype=bool)
        sel[:] = False

        for chrName in list(self.interval_trees):
            if chrName not in chromosome_list:
                continue

            # identify start and end rows
            # of the chromosomes that are
            # kept
            index_start, index_end = self.getChrBinRange(chrName)
            sel[index_start:index_end] = True

        sel_id = np.flatnonzero(sel)
        mat = self.matrix[sel_id, :][:, sel_id]

        # update bin ids
        self.cut_intervals = [self.cut_intervals[x] for x in sel_id]

        # update correction factors
        if self.correction_factors is not None:
            self.correction_factors = [self.correction_factors[x] for x in sel_id]

        # keep track of nan bins: remap their indices to the reduced matrix
        if len(self.nan_bins):
            _temp = np.zeros(size[0])
            _temp[self.nan_bins] = 1
            _temp = _temp[sel_id]
            self.nan_bins = np.flatnonzero(_temp == 1)
        else:
            self.nan_bins = []

        self.numCols = len(sel_id)  # pylint: disable=W0201

        # rebuild interval trees and chromosome boundaries for the kept bins
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)
        # remove distanceCounts
        try:
            self.distance_counts = None
        except AttributeError:
            pass
        self.matrix = mat
        return self.matrix
663 |
664 | def diagflat(self, value=np.nan):
665 | """
666 | sets
667 | the matrix diagonal to np.nan
668 | """
669 | M = self.matrix.shape[0]
670 | diagmatrix = dia_matrix((np.repeat(value, M), 0), shape=(M, M))
671 | self_diag = dia_matrix(([self.matrix.diagonal()], [0]), shape=(M, M))
672 | # take matrix, subtract the values of the diagonal such that
673 | # it becomes all zeros, replace with new values by adding them
674 | self.matrix = self.matrix - self_diag + diagmatrix
675 | return self.matrix
676 |
677 | def filterOutInterChrCounts(self):
678 | """
679 | set all inter chromosomal counts to np.nan
680 | """
681 |
682 | ma_coo = self.matrix.tocoo()
683 | dist_list, _ = hiCMatrix.getDistList(ma_coo.row, ma_coo.col,
684 | self.cut_intervals)
685 |
686 | # set to zero all cases in which dist_list is zero
687 | ma_coo.data[dist_list == -1] = 0
688 |
689 | self.matrix = ma_coo.tocsr()
690 | self.matrix.eliminate_zeros()
691 | return self.matrix
692 |
693 | def setMatrixValues(self, newMatrix):
694 | """
695 | replace the current matrix values
696 | by the given matrix values. The
697 | shapes have to coincide
698 | """
699 | assert self.matrix.shape == newMatrix.shape, \
700 | "Given matrix has different shape. New " \
701 | "values need to have the same shape as previous matrix."
702 |
703 | self.matrix = csr_matrix(newMatrix)
704 |
705 | def setCorrectionFactors(self, correction_factors):
706 | assert len(correction_factors) == self.matrix.shape[0], \
707 | "length of correction factors and length of matrix are different."
708 | self.correction_factors = correction_factors
709 |
710 | def reorderChromosomes(self, new_chr_order):
711 | new_order = []
712 | new_chr_order = check_chrom_str_bytes(self.chrBinBoundaries, new_chr_order)
713 |
714 | for chrName in new_chr_order:
715 | # check that the chromosome names are valid
716 | if chrName not in self.chrBinBoundaries:
717 | raise ValueError(f"Chromosome name '{chrName}' not found. Please check the correct spelling "
718 | "of the chromosomes and try again")
719 | orig = self.chrBinBoundaries[chrName]
720 | new_order.extend(list(range(orig[0], orig[1])))
721 | self.reorderBins(new_order)
722 |
723 | def reorderBins(self, new_order):
724 | """
725 | reorders the rows and colums of the
726 | matrix according to the new order.
727 | The new order can be smaller
728 | than the original matrix. In that
729 | case, the ids not in the
730 | new order are removed.
731 | """
732 | orig_num_rows = self.matrix.shape[0]
733 | self.matrix = self.matrix[new_order, :][:, new_order]
734 | self.cut_intervals = [self.cut_intervals[x] for x in new_order]
735 | # reorder the masked bins
736 | # keep track of nan bins
737 | if len(self.nan_bins):
738 | _temp = np.zeros(orig_num_rows)
739 | _temp[self.nan_bins] = 1
740 | _temp = _temp[new_order]
741 | self.nan_bins = np.flatnonzero(_temp == 1)
742 | else:
743 | self.nan_bins = []
744 |
745 | self.interval_trees, self.chrBinBoundaries = \
746 | self.intervalListToIntervalTree(self.cut_intervals)
747 |
748 | def maskChromosomes(self, pChromosomeList):
749 | mask_ids = []
750 | pChromosomeList = check_chrom_str_bytes(self.chrBinBoundaries, pChromosomeList)
751 |
752 | for chromosome in pChromosomeList:
753 | # check that the chromosome names are valid
754 | if chromosome not in self.chrBinBoundaries:
755 | raise ValueError(f"Chromosome name '{chromosome}' not found. Please check the correct spelling "
756 | "of the chromosomes and try again")
757 | orig = self.chrBinBoundaries[chromosome]
758 | mask_ids.extend(list(range(orig[0], orig[1])))
759 | self.maskBins(mask_ids)
760 |
    def maskBins(self, bin_ids=None):
        """
        Mask the list of bins given. Mask means
        to remove the bins from the matrix,
        and keep the information about the intervals
        as masked

        :param bin_ids: iterable of bin indices to remove from the matrix
        """
        # print("self.cut_intervalsMASKBINS___START", self.cut_intervals)

        # nothing to do for an empty or missing list
        if bin_ids is None or len(bin_ids) == 0:
            return
        self.printchrtoremove(bin_ids, restore_masked_bins=False)
        try:
            # check if a masked bin already exists
            if len(self.orig_bin_ids) > 0:
                M = self.matrix.shape[0]
                previous_bin_ids = self.orig_bin_ids[M:]
                # merge new and old masked bins; the new ids are remapped via
                # orig_bin_ids so they refer to the original (unmasked) matrix
                bin_ids = np.unique(np.concatenate([previous_bin_ids, self.orig_bin_ids[bin_ids]]))
                # NOTE(review): np.sort returns a new array, so this call has
                # no effect; harmless since np.unique already returns sorted
                np.sort(bin_ids)
                self.restoreMaskedBins()
        except Exception:  # pylint: disable=W0718
            # self.orig_bin_ids not defined yet: no previous masking to merge
            pass

        # join with existing nan_bins
        if self.nan_bins is not None and len(self.nan_bins) > 0:
            log.info("found existing %d nan bins that will be "
                     "included for masking ", len(self.nan_bins))
            bin_ids = np.unique(np.concatenate([self.nan_bins, bin_ids]))
            self.nan_bins = []
        # indices of the rows/columns that survive the masking
        rows = cols = np.delete(list(range(self.matrix.shape[1])), bin_ids)

        self.matrix = self.matrix[rows, :][:, cols]

        # to keep track of removed bins
        # I add their ids to the end of the rows vector
        # to reverse the changes, I just need to do an argsort
        # to put the removed bins in place
        # log.debug("bins_ids {}".format(bin_ids))
        self.orig_bin_ids = np.concatenate([rows, bin_ids])

        new_cut_intervals = [self.cut_intervals[x] for x in rows]

        # masked intervals are kept at the tail so restoreMaskedBins can
        # put them back via an argsort of orig_bin_ids
        self.orig_cut_intervals = new_cut_intervals + [self.cut_intervals[x] for x in bin_ids]

        self.cut_intervals = new_cut_intervals

        self.interval_trees, self.chrBinBoundaries = self.intervalListToIntervalTree(self.cut_intervals)

        if self.correction_factors is not None:
            # keep only the correction factors of the surviving bins
            self.correction_factors = self.correction_factors[rows]
812 |
813 | def update_matrix(self, new_matrix, new_cut_intervals):
814 | """
815 | give a new matrix and list of cut intervals, the matrix, cut intervals and
816 | the respective tree are updated
817 | :param new_matrix: now values for the sparse matrix
818 | :param new_cut_intervals: list of cut intervals, each entry being a tuple of the form
819 | (chrom, start, end, coverage)
820 | :return:
821 | """
822 | if len(self.orig_bin_ids) > 0:
823 | raise ValueError("matrix contains masked bins. Restore masked bins first")
824 |
825 | assert len(new_cut_intervals) == new_matrix.shape[0], "matrix shape and len of cut intervals do not match"
826 |
827 | self.matrix = new_matrix
828 | self.cut_intervals = new_cut_intervals
829 |
830 | self.interval_trees, self.chrBinBoundaries = \
831 | self.intervalListToIntervalTree(self.cut_intervals)
832 |
833 | self.nan_bins = np.flatnonzero(self.matrix.sum(0).A == 0)
834 |
    def restoreMaskedBins(self):
        """
        Puts back into the matrix the bins that were
        removed by maskBins. No-op if nothing is masked.
        """
        if len(self.orig_bin_ids) == 0:
            return
        # the rows to add are
        # as an empty sparse matrix
        M = self.matrix.shape[0]
        # N = number of previously masked bins (stored past position M)
        N = len(self.orig_bin_ids) - M
        rows_mat = csr_matrix((N, M))
        # cols to add
        cols_mat = csr_matrix((M + N, N))

        # add the rows and cols at the end of the
        # current matrix
        self.matrix = sparse_vstack([self.matrix, rows_mat])
        self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr')

        # the new matrix has the right number of cols and rows, now
        # they need to be reordered to be back in their original places
        # (argsort of orig_bin_ids inverts the permutation created by maskBins)
        rows = cols = np.argsort(self.orig_bin_ids)
        self.matrix = self.matrix[rows, :][:, cols]
        self.cut_intervals = [self.orig_cut_intervals[x] for x in rows]
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)
        # set as nan_bins the masked bins that were restored
        self.nan_bins = self.orig_bin_ids[M:]

        if self.correction_factors is not None:
            # add missing values as nans at end of array
            self.correction_factors = np.concatenate([self.correction_factors,
                                                      np.repeat(np.nan, N)])
            # reorder array
            self.correction_factors = self.correction_factors[rows]

        # reset orig bins ids and cut intervals
        self.orig_bin_ids = []
        self.orig_cut_intervals = []
        log.info("masked bins were restored\n")
876 |
    def reorderMatrix(self, orig, dest):
        """
        Given a matrix, a region over the diagonal is moved from
        its origin to a new destination. With this method a
        new order of the chromosomes can be produced.
        :param orig: a tuple containing the indices of the region to be moved
        :param dest: the index of the region into which to insert
        the section moved
        """

        # all bin ids except the region being moved, in their current order
        rows = np.delete(list(range(self.matrix.shape[1])), range(orig[0], orig[1]))

        # if the destination lies after the removed region, shift it left
        # by the region's length to account for the deletion above
        if dest > orig[1]:
            dest = dest - (orig[1] - orig[0])

        # re-insert the moved region at the (adjusted) destination; repeating
        # the insertion index places all moved ids consecutively at 'dest'
        rows = cols = np.insert(
            rows, np.repeat(dest, orig[1] - orig[0]), list(range(orig[0], orig[1])))
        self.matrix = self.matrix[rows, :][:, cols]
        self.cut_intervals = [self.cut_intervals[x] for x in rows]
        # rebuild interval trees and chromosome boundaries for the new order
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)

        if self.correction_factors is not None:
            self.correction_factors = self.correction_factors[rows]
901 |
902 | def truncTrans(self, high=0.05):
903 | """Truncates trans contacts to remove blowouts
904 | Clip high counts in trans regions (i.e. between
905 | chromosomes) to the max value found in the 1-high*100
906 | percentile
907 |
908 | :param: high : float, 0= max_inter) & (dist_list == -1)] == max_inter # pylint: disable=W0104
918 |
919 | self.setMatrixValues(mat)
920 |
    def printchrtoremove(self, to_remove, label="Number of poor regions to remove", restore_masked_bins=True):
        """
        Logs, per chromosome, the number of bins
        that will be removed.

        :param to_remove: iterable of bin indices scheduled for removal
        :param label: text used in the log message
        :param restore_masked_bins: when True, previously masked bins are
            restored before counting
        """
        cnt = {}
        try:
            # bare attribute access: only probes whether prev_to_remove
            # exists yet (it is created lazily below)
            self.prev_to_remove
        except Exception:  # pylint: disable=W0718
            log.debug("No self.prev_to_remove defined, defining it now.")
            self.prev_to_remove = np.array([])  # pylint: disable=W0201

        # if the same information was already printed don't
        # show it again.
        if np.array_equal(self.prev_to_remove, to_remove):
            return

        if restore_masked_bins:
            try:
                # check if a masked bin already exists
                if len(self.orig_bin_ids) > 0:
                    log.info("Masked bins already present")
                    self.restoreMaskedBins()
            except Exception:  # pylint: disable=W0718
                # no masking bookkeeping yet; nothing to restore
                pass
        # tally the bins to remove per chromosome
        for idx in to_remove:
            chrom = self.cut_intervals[idx][0]
            if chrom not in cnt:
                cnt[chrom] = 0
            cnt[chrom] += 1

        log.info('%s: %d %s', label, len(to_remove), cnt)
        self.prev_to_remove = to_remove  # pylint: disable=W0201
954 |
955 | def get_chromosome_sizes_real(self):
956 | '''
957 | Function returns the size of a chromosome as it is stored in the matrix.
958 | The size can differ if e.g. some area from the start or end of a chromosome is not present in the interaction matrix.
959 | '''
960 | if self.chrBinBoundaries and len(self.chrBinBoundaries) > 0:
961 | chrom_sizes = OrderedDict()
962 | # for chrom, (start_bin, end_bin) in iteritems(self.chrBinBoundaries):
963 | for chrom, (start_bin, end_bin) in self.chrBinBoundaries.items():
964 | chrom, start0, _, _ = self.cut_intervals[start_bin]
965 | chrom, _, end1, _ = self.cut_intervals[end_bin - 1]
966 | chrom_sizes[chrom] = end1 - start0 + 1
967 |
968 | return chrom_sizes
969 | return None
970 |
971 | def get_chromosome_sizes(self):
972 | '''
973 | Function returns the size of a chromosome as it is stored in the matrix, assuming the chromosome starts is always at its genomic position 0.
974 | '''
975 | if self.chrBinBoundaries and len(self.chrBinBoundaries) > 0:
976 | chrom_sizes = OrderedDict()
977 | # for chrom, (start_bin, end_bin) in iteritems(self.chrBinBoundaries):
978 | for chrom, (_, end_bin) in self.chrBinBoundaries.items():
979 |
980 | chrom, _, end, _ = self.cut_intervals[end_bin - 1]
981 | chrom_sizes[chrom] = end
982 |
983 | return chrom_sizes
984 | return None
985 |
    def intervalListToIntervalTree(self, interval_list):
        """
        Given an ordered list of (chromosome name, start, end)
        this is transformed to a number of interval trees,
        one for each chromosome.

        :param interval_list: ordered intervals; entries belonging to the
            same chromosome are expected to be contiguous
        :return: tuple (cut_int_tree, chrbin_boundaries); cut_int_tree maps
            chromosome -> IntervalTree of its bins, chrbin_boundaries maps
            chromosome -> (first bin id, last bin id + 1)
        """
        cut_int_tree = {}
        chrbin_boundaries = OrderedDict()
        if len(interval_list) == 0:
            log.warning("Interval list is empty")
            return cut_int_tree, chrbin_boundaries

        intval_id = 0
        chr_start_id = 0
        previous_chrom = None
        for intval in interval_list:
            chrom, start, end = intval[0:3]
            start = int(start)
            end = int(end)
            if previous_chrom != chrom:
                if previous_chrom is None:
                    previous_chrom = chrom

                # close the bin range of the chromosome seen so far; for the
                # very first interval this records a provisional (0, 0) entry
                # that is overwritten after the loop
                chrbin_boundaries[previous_chrom] = \
                    (chr_start_id, intval_id)
                chr_start_id = intval_id
                cut_int_tree[chrom] = IntervalTree()
                previous_chrom = chrom

            # the interval carries the running bin id as its data payload
            cut_int_tree[chrom].add(Interval(start, end, intval_id))

            intval_id += 1
        # record (or overwrite) the bin range of the last chromosome
        chrbin_boundaries[chrom] = (chr_start_id, intval_id)

        return cut_int_tree, chrbin_boundaries
1021 |
1022 |
def check_cooler(pFileName):
    """
    Returns True when the given path looks like a cooler file (ends with
    '.cool' or contains '.mcool') and cooler confirms it is a valid
    cooler; False otherwise.
    """
    looks_like_cooler = pFileName.endswith('.cool') or '.mcool' in pFileName
    # short-circuit: cooler is only consulted for cooler-like names
    return bool(looks_like_cooler and cooler.fileops.is_cooler(pFileName))
1028 |
--------------------------------------------------------------------------------
/hicmatrix/__init__.py:
--------------------------------------------------------------------------------
import logging

# package-wide default logging configuration
logging.basicConfig(level=logging.INFO)
# logging.basicConfig(level=logging.DEBUG)

# cooler is verbose at INFO level; only surface its warnings and errors
logging.getLogger('cooler').setLevel(logging.WARNING)
7 |
--------------------------------------------------------------------------------
/hicmatrix/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from .matrixFileHandler import MatrixFileHandler # noqa: F401
2 |
--------------------------------------------------------------------------------
/hicmatrix/lib/cool.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | import gc
5 | try:
6 | from importlib.metadata import version
7 | except ImportError:
8 | from importlib_metadata import version
9 |
10 | import cooler
11 | import h5py
12 | import numpy as np
13 | import pandas as pd
14 | from scipy.sparse import csr_matrix, dok_matrix, lil_matrix, triu
15 |
16 | from hicmatrix.utilities import (convertNansToOnes, toString)
17 |
18 | from .matrixFile import MatrixFile
19 |
20 | log = logging.getLogger(__name__)
21 |
22 | class Cool(MatrixFile):
23 |
    def __init__(self, pMatrixFile=None):
        """
        Cooler-format matrix file handler.

        :param pMatrixFile: path to the cooler file (optionally with a
            '::node' suffix), forwarded to MatrixFile
        """
        super().__init__(pMatrixFile)
        # restrict loading to these chromosome names (None = whole matrix)
        self.chrnameList = None
        # name of the bins-table column holding correction factors
        self.correctionFactorTable = 'weight'
        # '*' or '/': how correction factors are applied; chosen during load
        self.correctionOperator = None
        # NOTE(review): presumably forces integer counts on save -- confirm
        self.enforceInteger = False
        # NOTE(review): presumably appends to an existing file on save -- confirm
        self.appendData = False
        # set when the data originally came from an h5 file
        self.fileWasH5 = False
        # apply correction factors while loading
        self.applyCorrectionLoad = True
        # metadata dictionary (filled from cooler_file.info on load)
        self.hic_metadata = {}
        self.cool_info = None

        # tool versions parsed from the cooler 'generated-by' info field
        self.hic2cool_version = None
        self.hicmatrix_version = None
        # maximum genomic distance to load (None = all contacts)
        self.distance = None
        # sparse format for the loaded matrix: None/'csr', 'lil' or 'dok'
        self.matrixFormat = None
        # when True, load() returns only [instances, features, data, nbins]
        self.matrixOnly = False
        # NOTE(review): presumably skips building cut intervals -- confirm
        self.noCutIntervals = False
42 |
43 | def getInformationCoolerBinNames(self):
44 | return cooler.Cooler(self.matrixFileName).bins().columns.values
45 |
46 | def load(self):
47 | log.debug('Load in cool format')
48 | self.minValue = None # pylint: disable=W0201
49 | self.maxValue = None # pylint: disable=W0201
50 | if self.matrixFileName is None:
51 | log.warning('No matrix is initialized')
52 | try:
53 | cooler_file = cooler.Cooler(self.matrixFileName)
54 | # if 'metadata' in cooler_file.info:
55 | self.hic_metadata = cooler_file.info
56 | # else:
57 | # self.hic_metadata = None
58 | # self.cool_info = deepcopy(cooler_file.info)
59 | # log.debug('self.hic_metadata {}'.format(self.hic_metadata))
60 | except Exception as e: # pylint: disable=W0718
61 | log.warning("Could not open cooler file. Maybe the path is wrong or the given node is not available.")
62 | log.warning('The following file was tried to open: %s', self.matrixFileName)
63 | log.warning("The following nodes are available: %s", cooler.fileops.list_coolers(self.matrixFileName.split("::")[0]))
64 | return None, e
65 | if self.chrnameList is None and (self.matrixFileName is None or not self.matrixOnly):
66 | matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
67 | used_dtype = np.int32
68 | if np.iinfo(np.int32).max < cooler_file.info['nbins']:
69 | used_dtype = np.int64
70 | count_dtype = matrixDataFrame[0]['count'].dtype
71 | data = np.empty(cooler_file.info['nnz'], dtype=count_dtype)
72 | instances = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
73 | features = np.empty(cooler_file.info['nnz'], dtype=used_dtype)
74 | i = 0
75 | size = cooler_file.info['nbins'] // 32
76 | if size == 0:
77 | size = 1
78 | start_pos = 0
79 | while i < cooler_file.info['nbins']:
80 | matrixDataFrameChunk = matrixDataFrame[i:i + size]
81 | _data = matrixDataFrameChunk['count'].values.astype(count_dtype)
82 | _instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype)
83 | _features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype)
84 |
85 | data[start_pos:start_pos + len(_data)] = _data
86 | instances[start_pos:start_pos + len(_instances)] = _instances
87 | features[start_pos:start_pos + len(_features)] = _features
88 | start_pos += len(_features)
89 | i += size
90 | del _data
91 | del _instances
92 | del _features
93 |
94 | if self.matrixFormat is None or self.matrixFormat == 'csr':
95 | matrix = csr_matrix((data, (instances, features)), shape=(int(cooler_file.info['nbins']), int(cooler_file.info['nbins'])), dtype=count_dtype)
96 | elif self.matrixFormat == 'lil':
97 | matrix = lil_matrix((data, (instances, features)), shape=(int(cooler_file.info['nbins']), int(cooler_file.info['nbins'])), dtype=count_dtype)
98 | elif self.matrixFormat == 'dok':
99 | matrix = dok_matrix((data, (instances, features)), shape=(int(cooler_file.info['nbins']), int(cooler_file.info['nbins'])), dtype=count_dtype)
100 | # elif self.matrixFormat == 'raw':
101 | # matrix = [instances, features, data, int(cooler_file.info['nbins'])]
102 | del data
103 | del instances
104 | del features
105 | gc.collect()
106 | elif self.chrnameList is None and self.matrixOnly:
107 | log.debug('Load all at once')
108 | matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
109 | used_dtype = np.int64
110 | # if np.iinfo(np.int32).max < cooler_file.info['nbins']:
111 | # used_dtype = np.int64
112 | count_dtype = matrixDataFrame[0]['count'].dtype
113 | matrixDataFrameChunk = matrixDataFrame[:]
114 | data = matrixDataFrameChunk['count'].values.astype(count_dtype)
115 | instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype)
116 | features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype)
117 | # matrix = [_instances, _features, _data, int(cooler_file.info['nbins'])]
118 | # return matrix, None, None, None, None
119 | else:
120 | if len(self.chrnameList) == 1:
121 | try:
122 | if self.distance is None or cooler_file.binsize is None:
123 | # load the full chromosome
124 | matrix = cooler_file.matrix(balance=False, sparse=True, as_pixels=False).fetch(self.chrnameList[0]).tocsr() # pylint: disable=E1136
125 | else:
126 | # load only the values up to a specific distance
127 | lo, hi = cooler_file.extent(self.chrnameList[0]) # pylint: disable=E1136
128 | dist = self.distance // cooler_file.binsize
129 | step = (hi - lo) // 32
130 | if step < 1: # pylint: disable=R1731
131 | step = 1
132 | mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32)
133 |
134 | for i0, i1 in cooler.util.partition(lo, hi, step):
135 | # fetch stripe
136 | pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi]
137 | # filter
138 | pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist]
139 | # insert into sparse matrix
140 | mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'].astype(np.float32)
141 | del pixels
142 |
143 | matrix = mat.tocsr()
144 | del mat
145 | gc.collect()
146 |
147 | except ValueError as ve:
148 | log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.")
149 | log.exception('Error: %s', str(ve))
150 | else:
151 | raise ValueError("Operation to load more as one region is not supported.")
152 |
153 | cut_intervals_data_frame = None
154 | correction_factors_data_frame = None
155 |
156 | if self.chrnameList is not None:
157 | if len(self.chrnameList) == 1:
158 | cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0])
159 | log.debug('cut_intervals_data_frame %s', str(list(cut_intervals_data_frame.columns)))
160 | if self.correctionFactorTable in cut_intervals_data_frame:
161 | correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable]
162 | else:
163 | raise ValueError("Operation to load more than one chr from bins is not supported.")
164 | else:
165 | if self.applyCorrectionLoad and self.correctionFactorTable in cooler_file.bins():
166 | correction_factors_data_frame = cooler_file.bins()[[self.correctionFactorTable]][:]
167 |
168 | cut_intervals_data_frame = cooler_file.bins()[['chrom', 'start', 'end']][:]
169 |
170 | correction_factors = None
171 | if correction_factors_data_frame is not None and self.applyCorrectionLoad:
172 | # apply correction factors to matrix
173 | # a_i,j = a_i,j * c_i *c_j
174 | if not self.matrixOnly:
175 | matrix.eliminate_zeros()
176 | data = matrix.data
177 | if len(data) > 1:
178 |
179 | if not self.matrixOnly:
180 | matrix.data = matrix.data.astype(float)
181 | else:
182 | data = np.array(data, dtype=float)
183 |
184 | correction_factors = np.array(correction_factors_data_frame.values).flatten()
185 | # Don't apply correction if weight were just 'nans'
186 | if np.sum(np.isnan(correction_factors)) != len(correction_factors):
187 | # correction_factors = convertNansToZeros(correction_factors)
188 |
189 | if not self.matrixOnly:
190 | # matrix.sort_indices()
191 | instances, features = matrix.nonzero()
192 | instances_factors = correction_factors[instances]
193 | features_factors = correction_factors[features]
194 |
195 | if self.correctionOperator is None:
196 | if self.correctionFactorTable in ['KR', 'VC', 'SQRT_VC']:
197 | self.correctionOperator = '/'
198 | else:
199 | self.correctionOperator = '*'
200 | if 'generated-by' in cooler_file.info:
201 | log.debug('cooler_file.info[\'generated-by\'] %s %s', cooler_file.info['generated-by'], type(cooler_file.info['generated-by']))
202 | generated_by = toString(cooler_file.info['generated-by'])
203 | if 'hic2cool' in generated_by:
204 | self.hic2cool_version = generated_by.split('-')[1]
205 | elif 'hicmatrix' in generated_by:
206 | self.hicmatrix_version = generated_by.split('-')[1]
207 |
208 | instances_factors *= features_factors
209 | log.debug('hic2cool: %s', self.hic2cool_version)
210 | log.debug('self.correctionOperator: %s', self.correctionOperator)
211 |
212 | if self.matrixOnly:
213 | if self.correctionOperator == '*':
214 | log.debug('multi')
215 | data *= instances_factors
216 | elif self.correctionOperator == '/':
217 | log.debug('div')
218 | data /= instances_factors
219 | log.debug('non')
220 | return [instances, features, data, int(cooler_file.info['nbins'])], None, None, None, None
221 |
222 | if self.correctionOperator == '*':
223 | matrix.data *= instances_factors
224 | log.debug('foo')
225 | elif self.correctionOperator == '/':
226 | matrix.data /= instances_factors
227 | log.debug('hu')
228 |
229 | elif self.matrixOnly:
230 | return [instances, features, data, int(cooler_file.info['nbins'])], None, None, None, None
231 |
232 | cut_intervals = []
233 | if not self.noCutIntervals:
234 | for values in cut_intervals_data_frame.values:
235 | cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0]))
236 | del cut_intervals_data_frame
237 | del correction_factors_data_frame
238 | # try to restore nan_bins.
239 | try:
240 | # remove possible nan bins introduced by the correction factors
241 | # to have them part of the nan_bins vector
242 | mask = np.isnan(matrix.data)
243 | matrix.data[mask] = 0
244 | matrix.eliminate_zeros()
245 | shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
246 | nan_bins_indices = np.arange(shape)
247 | nan_bins_indices = np.setdiff1d(nan_bins_indices, matrix.indices)
248 |
249 | nan_bins = []
250 | for bin_id in nan_bins_indices:
251 | if len(matrix[bin_id, :].data) == 0:
252 | nan_bins.append(bin_id)
253 | nan_bins = np.array(nan_bins)
254 | except Exception: # pylint: disable=W0718
255 | nan_bins = None
256 |
257 | distance_counts = None
258 | # log.debug('self.hic_metadata {}'.format(self.hic_metadata))
259 |
260 | return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
261 |
    def create_cooler_input(self, pSymmetric=True, pApplyCorrection=True):
        """Assemble the inputs needed by cooler.create_cooler() / create_scool().

        Mutates self.matrix (zero-elimination, optional triangularization,
        optional reversal of correction) and self.appendData (bool -> 'a'/'w').

        :param pSymmetric: if True, only the upper triangle of the matrix is kept
        :param pApplyCorrection: if True and correction factors are present, the
            correction applied at load time is reverted on the counts and the
            factors are stored as a multiplicative 'weight' bin column
        :return: tuple (bins_data_frame, matrix_data_frame, dtype_pixel, info)
        """
        log.debug('self.hic_metadata 34 %s', self.hic_metadata)

        self.matrix.eliminate_zeros()

        # Matrices originating from h5 files track masked bins via nan_bins
        # instead of correction weights: zero out every entry whose row AND
        # column bin are both flagged as nan bins.
        if self.nan_bins is not None and len(self.nan_bins) > 0 and self.fileWasH5:
            # remove nan_bins
            correction_factors = np.ones(self.matrix.shape[0])
            correction_factors[self.nan_bins] = 0
            self.matrix.sort_indices()
            _instances, _features = self.matrix.nonzero()

            instances_factors = correction_factors[_instances]
            features_factors = correction_factors[_features]

            # True only where both endpoint factors are 0 (both bins are nan bins)
            instances_factors = np.logical_not(np.logical_or(instances_factors, features_factors))
            self.matrix.data[instances_factors] = 0
            self.matrix.eliminate_zeros()

        # set possible nans in data to 0
        mask = np.isnan(self.matrix.data)

        self.matrix.data[mask] = 0
        self.matrix.eliminate_zeros()
        # save only the upper triangle of the
        if pSymmetric:
            # symmetric matrix
            self.matrix = triu(self.matrix, format='csr')
        else:
            self.matrix = self.matrix

        self.matrix.eliminate_zeros()

        # create data frame for bins
        # self.cut_intervals is having 4 tuples, bin_data_frame should have 3.correction_factors
        # it looks like it is faster to create it with 4, and drop the last one
        # instead of handling this before.
        bins_data_frame = pd.DataFrame(self.cut_intervals, columns=['chrom', 'start', 'end', 'interactions']).drop('interactions', axis=1)
        dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': np.int32}
        log.debug('foo')
        if self.correction_factors is not None and pApplyCorrection:
            dtype_pixel['weight'] = np.float32

            # if the correction was applied by a division, invert it because cool format expects multiplicative if table name is 'weight'
            # https://cooler.readthedocs.io/en/latest/api.html#cooler.Cooler.matrix
            # NOTE(review): hic2cool_version is compared lexicographically as a
            # string; fine for '0.5'-style versions — confirm for e.g. '0.10'
            if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5 or self.correctionOperator == '/':

                log.debug('h5 true')
                self.correction_factors = np.array(self.correction_factors).flatten()
                self.correction_factors = 1 / self.correction_factors
                # NaN / inf produced by dividing by 0 are mapped to weight 0
                mask = np.isnan(self.correction_factors)
                self.correction_factors[mask] = 0
                mask = np.isinf(self.correction_factors)
                self.correction_factors[mask] = 0
                self.correctionOperator = '*'
                log.debug('inverted correction factors')
            weight = convertNansToOnes(np.array(self.correction_factors).flatten())
            log.debug('weight %s', weight)
            bins_data_frame = bins_data_frame.assign(weight=weight)

            log.debug("Reverting correction factors on matrix...")
            instances, features = self.matrix.nonzero()
            self.correction_factors = np.array(self.correction_factors)

            # do not apply if correction factors are just 1's
            instances_factors = self.correction_factors[instances]
            features_factors = self.correction_factors[features]

            # combined per-entry factor c_i * c_j
            instances_factors *= features_factors

            self.matrix.data = self.matrix.data.astype(float)

            # Apply the invert operation to get the original data
            if self.correctionOperator == '*' or self.correctionOperator is None:
                self.matrix.data /= instances_factors

            instances_factors = None
            features_factors = None

            self.matrix.eliminate_zeros()

        # store the weight column even when the correction is NOT applied to the
        # counts, so the factors are preserved in the output file
        if self.correction_factors is not None and pApplyCorrection is False:
            dtype_pixel['weight'] = np.float32
            weight = convertNansToOnes(np.array(self.correction_factors).flatten())
            bins_data_frame = bins_data_frame.assign(weight=weight)
            log.debug('weight 2: %s', weight)
        instances, features = self.matrix.nonzero()

        matrix_data_frame = pd.DataFrame(instances, columns=['bin1_id'], dtype=np.int32)
        del instances
        matrix_data_frame = matrix_data_frame.assign(bin2_id=features)
        del features

        if self.enforceInteger:
            dtype_pixel['count'] = np.int32
            # round to nearest integer before the int cast done by cooler
            data = np.rint(self.matrix.data)
            matrix_data_frame = matrix_data_frame.assign(count=data)
        else:
            matrix_data_frame = matrix_data_frame.assign(count=self.matrix.data)

        if not self.enforceInteger and self.matrix.dtype not in [np.int32, int]:
            log.debug("Writing non-standard cooler matrix. Datatype of matrix['count'] is: %s", self.matrix.dtype)
            dtype_pixel['count'] = self.matrix.dtype
        # split very large pixel tables into chunks to limit peak memory
        split_factor = 1
        if len(self.matrix.data) > 1e7:
            split_factor = 1e4
        matrix_data_frame = np.array_split(matrix_data_frame, split_factor)

        # cooler 'mode': append to an existing file or write a new one
        if self.appendData:
            self.appendData = 'a'
        else:
            self.appendData = 'w'

        info = {}
        # these fields are created by cooler lib. Can cause errors if not deleted.
        # NOTE(review): info is a fresh empty dict here, so these three guards
        # can never trigger — presumably leftovers from reading an existing file
        if 'metadata' in info:
            if self.hic_metadata is None:
                self.hic_metadata = info['metadata']
            del info['metadata']
        if 'bin-size' in info:
            del info['bin-size']
        if 'bin-type' in info:
            del info['bin-type']

        info['format'] = str('HDF5::Cooler')
        info['format-url'] = str('https://github.com/mirnylab/cooler')
        info['generated-by'] = str('HiCMatrix-' + version('HiCMatrix'))
        info['generated-by-cooler-lib'] = str('cooler-' + version('cooler'))

        info['tool-url'] = str('https://github.com/deeptools/HiCMatrix')

        # move selected entries from hic_metadata into the top-level info dict
        if self.hic_metadata is not None and 'matrix-generated-by' in self.hic_metadata:
            info['matrix-generated-by'] = str(self.hic_metadata['matrix-generated-by'])
            del self.hic_metadata['matrix-generated-by']
        if self.hic_metadata is not None and 'matrix-generated-by-url' in self.hic_metadata:
            info['matrix-generated-by-url'] = str(self.hic_metadata['matrix-generated-by-url'])
            del self.hic_metadata['matrix-generated-by-url']
        log.debug('self.hic_metadata %s', self.hic_metadata)
        if self.hic_metadata is not None and 'genome-assembly' in self.hic_metadata:
            info['genome-assembly'] = str(self.hic_metadata['genome-assembly'])
            del self.hic_metadata['genome-assembly']

        return bins_data_frame, matrix_data_frame, dtype_pixel, info
405 |
406 | def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
407 | log.debug('Save in cool format11112323')
408 |
409 | bins_data_frame, matrix_data_frame, dtype_pixel, info = self.create_cooler_input(pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)
410 | local_temp_dir = os.path.dirname(os.path.realpath(pFileName))
411 | cooler.create_cooler(cool_uri=pFileName,
412 | bins=bins_data_frame,
413 | pixels=matrix_data_frame,
414 | mode=self.appendData,
415 | dtypes=dtype_pixel,
416 | ordered=True,
417 | metadata=info,
418 |
419 | temp_dir=local_temp_dir)
420 |
421 | log.debug('info %s', info)
422 | if self.appendData == 'w':
423 | fileName = pFileName.split('::')[0]
424 | with h5py.File(fileName, 'r+') as h5file:
425 | h5file.attrs.update(info)
426 | h5file.close()
427 |
--------------------------------------------------------------------------------
/hicmatrix/lib/ginteractions.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from scipy.sparse import triu
4 |
5 | from .matrixFile import MatrixFile
6 |
7 | log = logging.getLogger(__name__)
8 |
class Ginteractions(MatrixFile):
    """Writer for the GInteractions (BEDPE-like) tab-separated text format."""

    def __init__(self, pMatrixFile):
        super().__init__(pMatrixFile)

    def load(self):
        # reading GInteractions files is not supported
        log.error('Not implemented')

    def save(self, pFileName, pSymmetric=None, pApplyCorrection=None):
        """Write the upper triangle of the matrix, one interaction per line:
        chrom1, start1, end1, chrom2, start2, end2, count."""
        # self.restoreMaskedBins()
        log.debug(self.matrix.shape)
        upper_coo = triu(self.matrix, k=0, format='csr').tocoo()
        with open(f"{pFileName}.tsv", 'w', encoding='utf-8') as out_handle:
            for row_idx, col_idx, counts in zip(upper_coo.row, upper_coo.col, upper_coo.data):
                chr_row, start_row, end_row, _ = self.cut_intervals[row_idx]
                chr_col, start_col, end_col, _ = self.cut_intervals[col_idx]
                out_handle.write(f"{chr_row}\t{int(start_row)}\t{int(end_row)}\t{chr_col}\t{int(start_col)}\t{int(end_col)}\t{counts}\n")
27 |
--------------------------------------------------------------------------------
/hicmatrix/lib/h5.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from os import unlink
4 |
5 | import numpy as np
6 | import tables
7 | from scipy.sparse import csr_matrix, triu
8 |
9 | from hicmatrix.utilities import toString
10 |
11 | from .matrixFile import MatrixFile
12 |
13 | log = logging.getLogger(__name__)
14 |
15 |
class H5(MatrixFile):
    """Reader/writer for the HiCExplorer h5 (PyTables/HDF5) matrix format."""

    def __init__(self, pMatrixFile):
        super().__init__(pMatrixFile)

    def load(self):
        """
        Loads a matrix stored in h5 format
        :param matrix_filename:
        :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
        """
        log.debug('Load in h5 format')

        with tables.open_file(self.matrixFileName, 'r') as f:
            parts = {}
            try:
                # read the four arrays that make up the CSR matrix
                for matrix_part in ('data', 'indices', 'indptr', 'shape'):
                    parts[matrix_part] = getattr(f.root.matrix, matrix_part).read()
            except Exception as e:
                log.info('No h5 file. Please check parameters concerning the file type!')
                # BUGFIX: the exception used to be silently ignored (bare 'e'
                # statement), which made the csr_matrix construction below fail
                # with an unrelated KeyError; re-raise the real error instead
                raise e
            matrix = csr_matrix(tuple([parts['data'], parts['indices'], parts['indptr']]),
                                shape=parts['shape'])
            # matrix = hiCMatrix.fillLowerTriangle(matrix)
            # get intervals
            intvals = {}
            for interval_part in ('chr_list', 'start_list', 'end_list', 'extra_list'):
                if toString(interval_part) == toString('chr_list'):
                    # chromosome names are stored as bytes; decode to str
                    chrom_list = getattr(f.root.intervals, interval_part).read()
                    intvals[interval_part] = toString(chrom_list)
                else:
                    intvals[interval_part] = getattr(f.root.intervals, interval_part).read()

            cut_intervals = list(zip(intvals['chr_list'], intvals['start_list'], intvals['end_list'], intvals['extra_list']))
            assert len(cut_intervals) == matrix.shape[0], \
                f"Error loading matrix. Length of bin intervals ({len(cut_intervals)}) is different than the " \
                f"size of the matrix ({matrix.shape[0]})"

            # get nan_bins
            try:
                if hasattr(f.root, 'nan_bins'):
                    nan_bins = f.root.nan_bins.read()
                else:
                    nan_bins = np.array([])
            except Exception:  # pylint: disable=W0718
                nan_bins = np.array([])

            # get correction factors
            try:
                if hasattr(f.root, 'correction_factors'):
                    correction_factors = f.root.correction_factors.read()
                    assert len(correction_factors) == matrix.shape[0], \
                        "Error loading matrix. Length of correction factors does not" \
                        "match size of matrix"
                    correction_factors = np.array(correction_factors)
                    # NaN / inf factors act as masked bins: store them as 0
                    mask = np.isnan(correction_factors)
                    correction_factors[mask] = 0
                    mask = np.isinf(correction_factors)
                    correction_factors[mask] = 0
                else:
                    correction_factors = None
            except Exception:  # pylint: disable=W0718
                correction_factors = None

            # get distance counts
            try:
                if hasattr(f.root, 'distance_counts'):
                    # BUGFIX: read the distance_counts node; the old code read
                    # f.root.correction_factors here by mistake
                    distance_counts = f.root.distance_counts.read()
                else:
                    distance_counts = None
            except Exception:  # pylint: disable=W0718
                distance_counts = None
        return matrix, cut_intervals, nan_bins, distance_counts, correction_factors

    def save(self, pFileName, pSymmetric=True, pApplyCorrection=None):
        """
        Saves a matrix using hdf5 format
        :param pFileName:
        :return: None
        """
        log.debug('Save in h5 format')

        # self.restoreMaskedBins()
        if not pFileName.endswith(".h5"):
            pFileName += ".h5"

        # if the file name already exists
        # try to find a new suitable name
        if os.path.isfile(pFileName):
            log.warning("*WARNING* File already exists %s\n "
                        "Overwriting ...\n", pFileName)

            unlink(pFileName)
        if self.nan_bins is None:
            self.nan_bins = np.array([])
        elif not isinstance(self.nan_bins, np.ndarray):
            self.nan_bins = np.array(self.nan_bins)

        # save only the upper triangle of the
        if pSymmetric:
            # symmetric matrix
            matrix = triu(self.matrix, k=0, format='csr')
        else:
            matrix = self.matrix
        matrix.eliminate_zeros()

        filters = tables.Filters(complevel=5, complib='blosc')
        with tables.open_file(pFileName, mode="w", title="HiCExplorer matrix") as h5file:
            matrix_group = h5file.create_group("/", "matrix", )
            # save the parts of the csr matrix
            for matrix_part in ('data', 'indices', 'indptr', 'shape'):
                arr = np.array(getattr(matrix, matrix_part))
                atom = tables.Atom.from_dtype(arr.dtype)
                ds = h5file.create_carray(matrix_group, matrix_part, atom,
                                          shape=arr.shape,
                                          filters=filters)
                ds[:] = arr

            # save the matrix intervals
            intervals_group = h5file.create_group("/", "intervals", )
            # map each column name to its values; replaces the previous eval()
            # call on the loop variable (pylint W0123)
            interval_columns = dict(zip(('chr_list', 'start_list', 'end_list', 'extra_list'),
                                        zip(*self.cut_intervals)))
            for interval_part in ('chr_list', 'start_list', 'end_list', 'extra_list'):
                arr = np.array(interval_columns[interval_part])
                atom = tables.Atom.from_dtype(arr.dtype)
                ds = h5file.create_carray(intervals_group, interval_part, atom,
                                          shape=arr.shape,
                                          filters=filters)
                ds[:] = arr

            # save nan bins
            if len(self.nan_bins):
                atom = tables.Atom.from_dtype(self.nan_bins.dtype)
                ds = h5file.create_carray(h5file.root, 'nan_bins', atom,
                                          shape=self.nan_bins.shape,
                                          filters=filters)
                ds[:] = self.nan_bins

            # save corrections factors
            if self.correction_factors is not None and len(self.correction_factors):
                self.correction_factors = np.array(self.correction_factors)
                # NaN factors are stored as 0 (masked bins)
                mask = np.isnan(self.correction_factors)
                self.correction_factors[mask] = 0
                atom = tables.Atom.from_dtype(self.correction_factors.dtype)
                ds = h5file.create_carray(h5file.root, 'correction_factors', atom,
                                          shape=self.correction_factors.shape,
                                          filters=filters)
                ds[:] = np.array(self.correction_factors)

            # save distance counts
            if self.distance_counts is not None and len(self.distance_counts):
                atom = tables.Atom.from_dtype(self.distance_counts.dtype)
                ds = h5file.create_carray(h5file.root, 'distance_counts', atom,
                                          shape=self.distance_counts.shape,
                                          filters=filters)
                ds[:] = np.array(self.distance_counts)
172 |
--------------------------------------------------------------------------------
/hicmatrix/lib/hicpro.py:
--------------------------------------------------------------------------------
1 |
2 | import logging
3 |
4 | from scipy.sparse import csr_matrix
5 |
6 | from .matrixFile import MatrixFile
7 |
8 | log = logging.getLogger(__name__)
9 |
10 |
class Hicpro(MatrixFile):
    """Reader/writer for the HiC-Pro format: a 1-based COO triplet file plus a
    companion BED file describing the bins."""

    def __init__(self, pMatrixFile, pBedFile):
        super().__init__(pMatrixFileName=pMatrixFile, pBedFile=pBedFile)

    def load(self):
        """Parse the matrix and BED files.

        :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
            (the last three are always None for HiC-Pro input)
        """
        row_indices = []
        col_indices = []
        counts = []
        with open(self.matrixFileName, 'r', encoding="utf-8") as matrix_file:
            for line in matrix_file:
                row_str, col_str, count_str = line.strip().split('\t')
                # HiC-Pro indices are 1-based; convert to 0-based
                row_indices.append(int(row_str) - 1)
                col_indices.append(int(col_str) - 1)
                counts.append(float(count_str))

        cut_intervals = []
        with open(self.bedFile, 'r', encoding="utf-8") as bed_file:
            for line in bed_file:
                chrom, start, end, bin_id = line.strip().split('\t')
                cut_intervals.append((chrom, int(start), int(end), int(bin_id)))

        bin_count = len(cut_intervals)
        matrix = csr_matrix((counts, (row_indices, col_indices)), shape=(bin_count, bin_count))

        return matrix, cut_intervals, None, None, None

    def save(self, pFileName, pSymmetric=None, pApplyCorrection=None):
        """Write the matrix as 1-based COO triplets and the bins as BED."""
        self.matrix.eliminate_zeros()
        row_indices, col_indices = self.matrix.nonzero()
        counts = self.matrix.data

        with open(pFileName, 'w', encoding="utf-8") as matrix_file:
            for row_idx, col_idx, count in zip(row_indices, col_indices, counts):
                # convert back to HiC-Pro's 1-based indexing
                matrix_file.write(f"{int(row_idx + 1)}\t{int(col_idx + 1)}\t{count}\n")

        with open(self.bedFile, 'w', encoding="utf-8") as bed_file:
            for bin_idx, interval in enumerate(self.cut_intervals):
                chrom, start, end = interval[:3]
                bed_file.write(f"{chrom}\t{start}\t{end}\t{bin_idx + 1}\n")
53 |
--------------------------------------------------------------------------------
/hicmatrix/lib/homer.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import gzip
4 |
5 | from scipy.sparse import csr_matrix
6 |
7 | from hicmatrix.utilities import opener
8 |
9 | from .matrixFile import MatrixFile
10 |
11 | log = logging.getLogger(__name__)
12 |
13 |
class Homer(MatrixFile):
    """Reader/writer for the Homer dense text matrix format (optionally gzipped)."""

    def __init__(self, pMatrixFile):
        super().__init__(pMatrixFile)

    def load(self):
        """Parse a Homer dense matrix into a CSR matrix.

        The bin size is inferred from the first two 'chrom-start' header labels,
        so bins are assumed to be uniform.
        """
        cut_intervals = []

        with opener(self.matrixFileName) as matrix_file:
            header_fields = matrix_file.readline().strip().split(b'\t')

            # infer the (uniform) bin size from the first two region labels
            first_start = int(header_fields[2].strip().split(b'-')[1])
            second_start = int(header_fields[3].strip().split(b'-')[1])
            bin_size = second_start - first_start
            for field in header_fields[2:]:
                chrom, start = field.strip().split(b'-')
                cut_intervals.append((chrom.decode('ascii'), int(start), int(start) + bin_size, 1))

            # remaining lines: two label columns followed by the dense row values
            dense_rows = []
            for line in matrix_file:
                row_fields = line.split(b'\t')
                dense_rows.append([float(field) for field in row_fields[2:]])
            matrix = csr_matrix(dense_rows)
        return matrix, cut_intervals, None, None, None

    def save(self, pFileName, pSymmetric=None, pApplyCorrection=None):
        """Write the matrix as a gzip-compressed Homer dense text matrix."""
        region_labels = [f'{interval[0]}-{interval[1]}' for interval in self.cut_intervals]
        with gzip.open(pFileName, 'wt') as homer_file:
            # header row: two fixed columns, then one label per bin
            homer_file.write('HiCMatrix (directory=.)\tRegions\t')
            for label in region_labels:
                homer_file.write(f'{label}\t')
            homer_file.write('\n')

            last_row = self.matrix.shape[0] - 1
            for row_idx in range(self.matrix.shape[0]):
                row_values = '\t'.join(map(str, self.matrix[row_idx, :].toarray().flatten()))
                homer_file.write(f'{region_labels[row_idx]}\t{region_labels[row_idx]}\t')
                homer_file.write(f'{row_values}')
                # no trailing newline after the last data row
                if row_idx < last_row:
                    homer_file.write('\n')
63 |
--------------------------------------------------------------------------------
/hicmatrix/lib/matrixFile.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | log = logging.getLogger(__name__)
4 |
5 |
class MatrixFile():
    """Common base class for all matrix file format readers/writers.

    Subclasses override load()/save(); this base only stores the shared state
    (matrix, bin intervals, nan bins, correction and distance data).
    """

    def __init__(self, pMatrixFileName=None, pBedFile=None):
        self.matrixFileName = pMatrixFileName
        log.debug('self.matrixFileName %s', self.matrixFileName)
        self.matrix = None              # sparse matrix with the contact counts
        self.cut_intervals = None       # list of (chrom, start, end, extra) per bin
        self.nan_bins = None            # indices of bins without data
        self.correction_factors = None  # per-bin normalization vector
        self.distance_counts = None
        self.bedFile = pBedFile        # only used by formats with a separate bin file (HiC-Pro)

    def load(self):
        log.error('Not implemented')

    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):  # pylint: disable=W0613
        log.error('Not implemented')

    def is_of_type(self):
        log.error('Not implemented')

    def set_matrix_variables(self, pMatrix, pCutIntervals, pNanBins, pCorrectionFactors, pDistanceCounts):
        """Populate all matrix state in one call (used before saving)."""
        # BUGFIX: corrected the 'Seeting' typo in the log message
        log.debug('Setting matrix variables')
        self.matrix = pMatrix
        self.cut_intervals = pCutIntervals
        self.nan_bins = pNanBins
        self.correction_factors = pCorrectionFactors
        self.distance_counts = pDistanceCounts
34 |
--------------------------------------------------------------------------------
/hicmatrix/lib/matrixFileHandler.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import logging
3 |
4 | log = logging.getLogger(__name__)
5 |
6 |
class MatrixFileHandler():
    """
    This class handles the load and save of the different Hi-C contact matrix formats.
    """

    def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None,
                 pApplyCorrectionCoolerLoad=None, pBedFileHicPro=None, pCorrectionFactorTable=None,
                 pCorrectionOperator=None, pEnforceInteger=None, pAppend=None, pFileWasH5=None, pHiCInfo=None, pHic2CoolVersion=None,
                 pDistance=None, pMatrixFormat=None, pLoadMatrixOnly=None, pNoCutIntervals=None):

        # resolve e.g. 'cool' -> hicmatrix.lib.cool.Cool via the module name
        self.class_ = getattr(importlib.import_module('.' + pFileType.lower(), package='hicmatrix.lib'), pFileType.title())

        if pFileType == 'hicpro':
            self.matrixFile = self.class_(pMatrixFile=pMatrixFile, pBedFile=pBedFileHicPro)
        else:
            self.matrixFile = self.class_(pMatrixFile=pMatrixFile)
            if pFileType == 'cool':
                # chrnameList is always forwarded (even when None); the
                # remaining options are only set when explicitly given
                self.matrixFile.chrnameList = pChrnameList
                cool_options = (
                    ('correctionFactorTable', pCorrectionFactorTable),
                    ('correctionOperator', pCorrectionOperator),
                    ('enforceInteger', pEnforceInteger),
                    ('appendData', pAppend),
                    ('fileWasH5', pFileWasH5),
                    ('applyCorrectionLoad', pApplyCorrectionCoolerLoad),
                    ('hic_metadata', pHiCInfo),
                    ('hic2cool_version', pHic2CoolVersion),
                    ('distance', pDistance),
                    ('matrixFormat', pMatrixFormat),
                    ('matrixOnly', pLoadMatrixOnly),
                    ('noCutIntervals', pNoCutIntervals),
                )
                for attribute_name, value in cool_options:
                    if value is not None:
                        setattr(self.matrixFile, attribute_name, value)

    def load(self):
        """Delegate loading to the format-specific handler."""
        return self.matrixFile.load()

    def set_matrix_variables(self, pMatrix, pCutIntervals, pNanBins, pCorrectionFactors, pDistanceCounts):
        """Delegate state population to the format-specific handler."""
        self.matrixFile.set_matrix_variables(pMatrix, pCutIntervals, pNanBins, pCorrectionFactors, pDistanceCounts)

    def save(self, pName, pSymmetric, pApplyCorrection):
        """Delegate saving to the format-specific handler."""
        self.matrixFile.save(pName, pSymmetric, pApplyCorrection)

    def load_init(self):
        pass
62 |
--------------------------------------------------------------------------------
/hicmatrix/lib/scool.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | import cooler
5 | import numpy as np
6 |
7 | from .matrixFile import MatrixFile
8 |
9 | log = logging.getLogger(__name__)
10 |
class Scool(MatrixFile):
    """Writer for single-cell cooler (.scool) files: many cells in one container."""

    def __init__(self, pMatrixFile=None):
        super().__init__(pMatrixFile)
        log.debug('scool object created')
        # either coolObjectsList is set (one MatrixFileHandler per cell) ...
        self.coolObjectsList = None
        # ... or bins / pixel_list / name_list are set directly
        self.bins = None
        self.pixel_list = None
        self.name_list = None

    def load(self):
        raise NotImplementedError('Please use the specific cell to load the individual cool file from the scool file')

    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
        """Write all cells into one scool file via cooler.create_scool().

        :param pFileName: output scool path
        :param pSymmetric: forwarded to each cell's create_cooler_input
        :param pApplyCorrection: forwarded to each cell's create_cooler_input
        """
        pixel_dict = {}
        bins_dict = {}

        if self.coolObjectsList is not None:
            # per-cell path: let each cool handler build its own bins/pixels
            for coolObject in self.coolObjectsList:
                bins_data_frame, matrix_data_frame, dtype_pixel, _ = coolObject.matrixFile.create_cooler_input(pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)
                bins_dict[coolObject.matrixFile.matrixFileName] = bins_data_frame
                pixel_dict[coolObject.matrixFile.matrixFileName] = matrix_data_frame

        else:
            # shared-bins path: all cells reuse self.bins
            try:
                dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': self.pixel_list[0]['count'].dtype}
                # dtype_pixel = self.pixel_list[0]['count'].dtype

                for i, pixels in enumerate(self.pixel_list):
                    bins_dict[self.name_list[i]] = self.bins
                    pixel_dict[self.name_list[i]] = pixels
                    log.debug('self.name_list[i] %s', self.name_list[i])
            except Exception as exp:  # pylint: disable=W0718
                # NOTE(review): if this fires (e.g. pixel_list is None/empty),
                # dtype_pixel stays unbound and create_scool below raises a
                # NameError; likewise an empty coolObjectsList — TODO confirm
                # whether callers guarantee non-empty input
                log.debug('Exception %s', str(exp))

        # use the output folder as temp dir for cooler's temporary chunks
        local_temp_dir = os.path.dirname(os.path.realpath(pFileName))

        cooler.create_scool(cool_uri=pFileName, bins=bins_dict, cell_name_pixels_dict=pixel_dict,
                            dtypes=dtype_pixel,
                            ordered=True,
                            temp_dir=local_temp_dir)
53 |
--------------------------------------------------------------------------------
/hicmatrix/test/test_HiCMatrix.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os.path
3 | import sys
4 | import warnings
5 | # from past.builtins import zip
6 | from collections import OrderedDict
7 | from os import unlink
8 | from tempfile import NamedTemporaryFile
9 |
10 | import numpy as np
11 | import numpy.testing as nt
12 | import pytest
13 | from intervaltree import Interval, IntervalTree
14 | from scipy.sparse import coo_matrix, csr_matrix
15 | from six import iteritems
16 |
17 | from hicmatrix import HiCMatrix as hm
18 |
19 | log = logging.getLogger(__name__)
20 |
21 | warnings.filterwarnings("ignore")
22 |
23 | ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data/")
24 |
25 |
def test_load_h5_save_and_load_cool():
    """Round-trip: load h5, save as cool, reload, compare counts and bins."""
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')

    outfile = NamedTemporaryFile(suffix='.cool', prefix='hicexplorer_test')  # pylint: disable=R1732
    hic.matrixFileHandler = None
    hic.save(pMatrixName=outfile.name)

    hic_cool = hm.hiCMatrix(outfile.name)

    nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUGFIX: unpack the ORIGINAL h5 intervals; previously both sides used
    # hic_cool.cut_intervals, so the assertions below could never fail
    chrom, start, end, _ = list(zip(*hic.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)
42 |
43 |
def test_load_h5_load_cool_weight():
    """Load the same matrix from h5 and cool and compare counts and bins."""
    hic_h5 = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')
    hic_cool = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool')

    # there is always a small gap due to rounding errors and inaccurate floating operations
    # test if it is equal for up to 10 decimal positions
    nt.assert_almost_equal(hic_cool.matrix.data, hic_h5.matrix.data, decimal=10)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUGFIX: unpack the h5 intervals; previously both sides used
    # hic_cool.cut_intervals, so the assertions below could never fail
    chrom, start, end, _ = list(zip(*hic_h5.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)
57 |
58 |
def test_load_h5_save_and_load_cool_2():
    """Round-trip the small test matrix: h5 -> cool -> reload, compare."""
    hic = hm.hiCMatrix(ROOT + 'small_test_matrix.h5')

    outfile = NamedTemporaryFile(suffix='.cool', prefix='hicexplorer_test')  # pylint: disable=R1732
    hic.matrixFileHandler = None
    hic.save(pMatrixName=outfile.name)

    hic_cool = hm.hiCMatrix(outfile.name)

    nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUGFIX: unpack the ORIGINAL h5 intervals; previously both sides used
    # hic_cool.cut_intervals, so the assertions below could never fail
    chrom, start, end, _ = list(zip(*hic.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)
75 |
76 |
def test_load_cool_save_and_load_h5():
    """Round-trip: load cool, save as h5, reload, compare counts and bins."""
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool')

    outfile = NamedTemporaryFile(suffix='.h5', prefix='hicexplorer_test')  # pylint: disable=R1732
    hic.matrixFileHandler = None
    hic.save(pMatrixName=outfile.name)

    hic_cool = hm.hiCMatrix(outfile.name)

    nt.assert_equal(hic_cool.matrix.data, hic.matrix.data)
    chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals))
    # BUGFIX: unpack the ORIGINAL cool intervals; previously both sides used
    # hic_cool.cut_intervals, so the assertions below could never fail
    chrom, start, end, _ = list(zip(*hic.cut_intervals))

    nt.assert_equal(chrom_cool, chrom)
    nt.assert_equal(start_cool, start)
    nt.assert_equal(end_cool, end)
93 |
94 |
def test_save_load_cool():
    """Build a small matrix in memory, save as cool, reload and compare."""
    # robustness: unique temp file instead of the hard-coded '/tmp/matrix.cool',
    # which collides when tests run in parallel; matches the other tests here
    outfile = NamedTemporaryFile(suffix='.cool', prefix='hicexplorer_test')  # pylint: disable=R1732
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    # make matrix symmetric
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()
    # hic.correction_factors = np.array([0.5, 1, 2, 3, 4])
    # hic.nan_bins = np.array([4])

    hic.save(outfile.name)

    cool_obj = hm.hiCMatrix(outfile.name)
    # nt.assert_equal(hic.correction_factors, cool_obj.correction_factors)
    nt.assert_equal(hic.matrix.data, cool_obj.matrix.data)
    nt.assert_equal(hic.matrix.indices, cool_obj.matrix.indices)
    nt.assert_equal(hic.matrix.indptr, cool_obj.matrix.indptr)
    nt.assert_equal(hic.nan_bins, cool_obj.nan_bins)

    nt.assert_equal(hic.cut_intervals, cool_obj.cut_intervals)
    # NamedTemporaryFile removes the file on close; no manual unlink needed
125 |
126 |
def test_save_load_h5():
    """Save a small hand-built matrix to .h5 and verify it reloads unchanged."""
    # use a unique temporary file instead of a hard-coded /tmp path so that
    # concurrent test runs cannot clobber each other's output
    tmp = NamedTemporaryFile(suffix='.h5', prefix='hicmatrix_test_', delete=False)  # pylint: disable=R1732
    tmp.close()
    outfile = tmp.name
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    # make matrix symmetric
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()

    hic.save(outfile)

    h5_obj = hm.hiCMatrix(outfile)
    # sparse structure, values and nan bins must survive the round trip
    nt.assert_equal(hic.matrix.data, h5_obj.matrix.data)
    nt.assert_equal(hic.matrix.indices, h5_obj.matrix.indices)
    nt.assert_equal(hic.matrix.indptr, h5_obj.matrix.indptr)
    nt.assert_equal(hic.nan_bins, h5_obj.nan_bins)

    nt.assert_equal(hic.cut_intervals, h5_obj.cut_intervals)
    unlink(outfile)
157 |
158 |
@pytest.mark.xfail
def test_save_load_other_formats_fail():
    """Loading a homer matrix through hiCMatrix() is expected to fail (xfail)."""
    # A dead assignment of ROOT + 'test_matrix.hicpro' was removed here: it
    # was immediately overwritten below and the hicpro loader cannot be
    # exercised this way (no bed-file parameter in hiCMatrix::__init__()).
    # hic_matrix = hm.hiCMatrix(pMatrixFile=pMatrixFile)
    # out, err = capsys.readouterr()
    # assert out == 'matrix file not given'
    pMatrixFile = ROOT + 'test_matrix.homer'
    hm.hiCMatrix(pMatrixFile=pMatrixFile)
168 |
169 |
def test_convert_to_zscore_matrix():
    """Compare convert_to_zscore_matrix() against a naive per-diagonal z-score.

    Builds a random upper-triangular count matrix, computes mean/std per
    diagonal by hand, and checks the class method reproduces the result.
    """

    # make test matrix
    m_size = 100
    mat = np.triu(np.random.randint(0, 101, (m_size, m_size)))
    # add a number of zeros
    mat[mat < 90] = 0
    # per-diagonal mean and std; dict comprehensions replace the old
    # dict([...]) calls and their R1717 pylint disables
    mu = {idx: mat.diagonal(idx).mean() for idx in range(mat.shape[0])}
    std = {idx: np.std(mat.diagonal(idx)) for idx in range(mat.shape[0])}

    # compute z-score for test matrix (upper triangle only)
    zscore_mat = np.zeros((m_size, m_size))
    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                diag = _j - _i
                if std[diag] == 0:
                    # constant diagonal: z-score is undefined
                    zscore = np.nan
                else:
                    zscore = (mat[_i, _j] - mu[diag]) / std[diag]
                zscore_mat[_i, _j] = zscore

    # make Hi-C matrix based on test matrix
    hic = hm.hiCMatrix()
    hic.matrix = csr_matrix(mat)
    cut_intervals = [('chr', idx, idx + 10, 0) for idx in range(0, mat.shape[0] * 10, 10)]
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.convert_to_zscore_matrix()

    nt.assert_almost_equal(hic.matrix.todense(), zscore_mat)
202 |
203 |
def test_convert_to_zscore_matrix_2():
    """Check convert_to_zscore_matrix(maxdepth=...) on a real matrix.

    Recomputes per-distance mean/std and z-scores directly from the dense
    matrix and compares the main diagonal with the class method's result.
    """

    # load test matrix
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')
    hic.maskBins(hic.nan_bins)

    mat = hic.matrix.todense()
    max_depth = 10000
    bin_size = hic.getBinSize()
    # distances beyond this many bins are excluded from the statistics
    max_depth_in_bins = int(float(max_depth) / bin_size)

    m_size = mat.shape[0]
    # compute matrix values per distance
    _, start, _, _ = list(zip(
        *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals)))
    dist_values = {}
    sys.stderr.write("Computing values per distance for each matrix entry\n")

    # collect, for every bin distance, all upper-triangle matrix values
    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                # dist is translated to bins
                dist = int(float(start[_j] - start[_i]) / bin_size)
                if dist <= max_depth_in_bins:
                    if dist not in dist_values:
                        dist_values[dist] = []
                    dist_values[dist].append(mat[_i, _j])

    # per-distance mean and standard deviation
    mu = {}
    std = {}
    for dist, values in iteritems(dist_values):
        mu[dist] = np.mean(values)
        std[dist] = np.std(values)

    # compute z-score for test matrix
    sys.stderr.write("Computing zscore for each matrix entry\n")
    zscore_mat = np.full((m_size, m_size), np.nan)
    for _i in range(mat.shape[0]):
        for _j in range(mat.shape[0]):
            if _j >= _i:
                dist = int(float(start[_j] - start[_i]) / bin_size)
                if dist <= max_depth_in_bins:
                    zscore = (mat[_i, _j] - mu[dist]) / std[dist]
                    zscore_mat[_i, _j] = zscore

    # compare with zscore from class
    hic.convert_to_zscore_matrix(maxdepth=max_depth)

    # from numpy.testing import assert_almost_equal
    # only the main diagonal is check. Other diagonals show minimal differences
    nt.assert_almost_equal(hic.matrix.todense().diagonal(
        0).A1, zscore_mat.diagonal(0))
256 |
257 |
def test_dist_list_to_dict():
    """dist_list_to_dict() groups data values by their distance key."""
    hic = hm.hiCMatrix()

    values = np.array([1, 8, 5, 3, 0, 4, 15, 5, 1, 0, 0, 2, 0, 1, 0])
    distances = np.array(
        [0, 10, 20, 30, -1, 0, 10, 20, -1, 0, 10, -1, 0, -1, 0])

    grouped = hic.dist_list_to_dict(values, distances)

    nt.assert_equal(grouped[-1], [0, 1, 2, 1])
    nt.assert_equal(grouped[0], [1, 4, 0, 0, 0])
    nt.assert_equal(grouped[10], [8, 15, 0])
    nt.assert_equal(grouped[20], [5, 5])
    nt.assert_equal(grouped[30], [3])

    # when data equals the distance, each bucket holds copies of its own key
    values = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])
    distances = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])

    grouped = hic.dist_list_to_dict(values, distances)

    nt.assert_equal(grouped[0], [0, 0, 0, 0])
    nt.assert_equal(grouped[100], [100, 100, 100])
    nt.assert_equal(grouped[200], [200, 200])
281 |
282 |
def test_keepOnlyTheseChr():
    """keepOnlyTheseChr() must restrict the matrix to the requested chromosomes."""
    chromosome_list = ['chrX', 'chr2RHet']

    hic = hm.hiCMatrix(ROOT + 'small_test_matrix.h5')

    hic.keepOnlyTheseChr(chromosome_list)

    # BUG FIX: list.sort() returns None, so the previous assertion compared
    # None with None and could never fail; use sorted() to compare the values
    nt.assert_equal(sorted(hic.getChrNames()), sorted(chromosome_list))
291 |
292 |
def test_save():
    """
    Save a small symmetric matrix to .h5 and .cool and check that both
    files reload to the same dense matrix.

    The formats ren, lieberman and GInteractions are not covered because
    hiCMatrix.__init__ does not support loading them; see the dedicated
    tests for those formats (marked as xfail).
    """

    cool_file = NamedTemporaryFile(suffix='.cool', delete=False)  # pylint: disable=R1732
    cool_file.close()

    h5_file = NamedTemporaryFile(suffix='.h5', delete=False)  # pylint: disable=R1732
    h5_file.close()

    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    upper = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(upper)
    hic.setMatrix(hic.matrix, bins)
    hic.fillLowerTriangle()

    # write and re-read the .h5 version
    hic.save(h5_file.name)
    h5_test = hm.hiCMatrix(h5_file.name)

    # reset the cached file handler before saving in a different format,
    # then write and re-read the .cool version
    hic.matrixFileHandler = None
    hic.save(cool_file.name)
    cool_test = hm.hiCMatrix(cool_file.name)

    nt.assert_equal(hic.getMatrix(), h5_test.getMatrix())
    nt.assert_equal(hic.getMatrix(), cool_test.getMatrix())
337 |
338 |
def test_diagflat():
    """diagflat() must overwrite the main diagonal (NaN when no value is given)."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    upper = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(upper)
    hic.setMatrix(hic.matrix, bins)
    hic.fillLowerTriangle()

    # explicit value on the diagonal
    hic.diagflat(value=1000)
    nt.assert_equal(np.full(upper.shape[0], 1000), hic.matrix.diagonal())

    # default value is NaN
    hic.diagflat()
    nt.assert_equal(np.full(5, np.nan), hic.matrix.diagonal())
363 |
364 |
def test_filterOutInterChrCounts():
    """filterOutInterChrCounts() must zero all counts between different chromosomes."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()
    hic.filterOutInterChrCounts()

    # only the a-a (3x3) and b-b (2x2) blocks survive; a-b entries become 0
    filtered_matrix = np.array([[1, 8, 5, 0, 0],
                                [8, 4, 15, 0, 0],
                                [5, 15, 0, 0, 0],
                                [0, 0, 0, 0, 1],
                                [0, 0, 0, 1, 0]])

    nt.assert_equal(hic.getMatrix(), filtered_matrix)

    # second round: build a symmetric matrix explicitly and check the value
    # returned by filterOutInterChrCounts() directly
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[0, 10, 5, 3, 0],
                       [0, 0, 15, 5, 1],
                       [0, 0, 0, 7, 3],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    # make the matrix symmetric:
    hic.matrix = csr_matrix(matrix + matrix.T)
    hic.setMatrix(csr_matrix(matrix + matrix.T, dtype=np.int32), cut_intervals)

    filtered = hic.filterOutInterChrCounts().todense()
    test_matrix = np.array([[0, 10, 5, 0, 0],
                            [10, 0, 15, 0, 0],
                            [5, 15, 0, 0, 0],
                            [0, 0, 0, 0, 1],
                            [0, 0, 0, 1, 0]], dtype='i4')

    nt.assert_equal(filtered, test_matrix)
413 |
414 |
def test_setMatrixValues_success():
    """setMatrixValues() replaces the matrix contents when the shape matches."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # every entry scaled by ten, same 5x5 shape as the current matrix
    replacement = counts * 10

    hic.setMatrixValues(replacement)

    nt.assert_equal(hic.getMatrix(), replacement)
440 |
441 |
def test_setMatrixValues_fail():
    """setMatrixValues() must reject a matrix whose shape differs from the current one."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # a 4x4 replacement does not match the 5x5 matrix and must be rejected
    replacement = np.array([[10, 80, 50, 30],
                            [0, 40, 150, 50],
                            [0, 0, 0, 0],
                            [0, 0, 0, 0]])

    with pytest.raises(AssertionError):
        hic.setMatrixValues(replacement)
466 |
467 |
def test_setCorrectionFactors_success():
    """setCorrectionFactors() stores factors when their length matches the bin count."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # no correction factors are set initially
    assert hic.correction_factors is None

    factors = [5, 5, 5, 5, 5]
    hic.setCorrectionFactors(factors)

    nt.assert_equal(hic.correction_factors, factors)
489 |
490 |
def test_setCorrectionFactors_fail():
    """setCorrectionFactors() must reject a factor list shorter than the bin count."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # no correction factors are set initially
    assert hic.correction_factors is None

    # four factors for five bins must raise
    with pytest.raises(AssertionError):
        hic.setCorrectionFactors([5, 5, 5, 5])
510 |
511 |
def test_reorderChromosomes():
    """reorderChromosomes() must rearrange the per-chromosome bin boundaries."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # swap the two chromosomes: b now occupies the first two bins
    hic.reorderChromosomes(['b', 'a'])
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('b', (0, 2)), ('a', (2, 5))]))

    # and swap them back to the original layout
    hic.reorderChromosomes(['a', 'b'])
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 3)), ('b', (3, 5))]))
539 |
540 |
def test_reorderChromosomes_fail():
    """reorderChromosomes() must raise for a chromosome name not in the matrix."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # 'c' is not a chromosome of this matrix, so the reorder must fail
    with pytest.raises(Exception) as context:
        hic.reorderChromosomes(['a', 'b', 'c'])
    assert "Chromosome name 'c' not found." in str(context.value)
562 |
563 |
def test_reorderBins():
    """reorderBins() must permute rows/columns; a shorter order drops unused bins."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)

    # swap bins 2 and 3
    new_order = [0, 1, 3, 2, 4]
    new_matrix = np.array([[1, 8, 3, 5, 0],
                           [0, 4, 5, 15, 1],
                           [0, 0, 0, 0, 1],
                           [0, 0, 0, 0, 2],
                           [0, 0, 0, 0, 0]])

    hic.reorderBins(new_order)

    nt.assert_equal(hic.getMatrix(), new_matrix)

    # the swap is its own inverse: applying it again restores the original
    hic.reorderBins(new_order)

    nt.assert_equal(hic.getMatrix(), matrix)

    # order smaller than original matrix should delete unused ids
    small_order = [2, 3]
    small_matrix = np.array([[0, 0],
                             [0, 0]])

    hic.reorderBins(small_order)

    # matrix, boundaries, cut intervals and nan bins all shrink consistently
    nt.assert_equal(hic.getMatrix(), small_matrix)
    nt.assert_equal(hic.matrix.shape, small_matrix.shape)
    nt.assert_equal(hic.chrBinBoundaries, OrderedDict(
        [('a', (0, 1)), ('b', (1, 2))]))
    nt.assert_equal(hic.cut_intervals, [('a', 20, 30, 1), ('b', 30, 40, 1)])
    nt.assert_equal(hic.nan_bins, [])
610 |
611 |
def test_maskBins():
    """maskBins() must drop the given bins; None or [] must leave the matrix untouched."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # expected 3x3 matrix after removing the first two bins
    new_matrix = np.array([[0, 0, 2],
                           [0, 0, 1],
                           [0, 0, 0]])

    masking_ids = [0, 1]
    hic.maskBins(masking_ids)

    # originals are preserved for restoreMaskedBins(), current state shrinks
    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(sorted(hic.orig_cut_intervals), sorted([('a', 0, 10, 1), ('a', 10, 20, 1),
                                                            ('a', 20, 30,
                                                             1), ('b', 30, 40, 1),
                                                            ('b', 40, 50, 1)]))
    nt.assert_equal(sorted(hic.cut_intervals), sorted([('a', 20, 30, 1), ('b', 30, 40, 1),
                                                       ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries, OrderedDict(
        [('a', (0, 1)), ('b', (1, 3))]))
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # direct return if masking_ids is None or has len() == 0, thus no changes to matrix
    masking_ids = None
    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(sorted(hic.orig_cut_intervals), sorted([('a', 0, 10, 1), ('a', 10, 20, 1),
                                                            ('a', 20, 30,
                                                             1), ('b', 30, 40, 1),
                                                            ('b', 40, 50, 1)]))
    nt.assert_equal(sorted(hic.cut_intervals), sorted([('a', 20, 30, 1), ('b', 30, 40, 1),
                                                       ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries, OrderedDict(
        [('a', (0, 1)), ('b', (1, 3))]))

    # an empty list must also be a no-op
    masking_ids = []

    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(sorted(hic.orig_cut_intervals), sorted([('a', 0, 10, 1), ('a', 10, 20, 1),
                                                            ('a', 20, 30,
                                                             1), ('b', 30, 40, 1),
                                                            ('b', 40, 50, 1)]))
    nt.assert_equal(sorted(hic.cut_intervals), sorted([('a', 20, 30, 1), ('b', 30, 40, 1),
                                                       ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries, OrderedDict(
        [('a', (0, 1)), ('b', (1, 3))]))

    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))
678 |
679 |
def test_update_matrix():
    """update_matrix() must validate shapes and refuse to run with masked bins."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)

    new_cut_intervals = [('c', 0, 10, 1), ('d', 10, 20, 1), ('d', 20, 30, 1)]

    new_matrix = np.array([[3, 6, 4],
                           [np.nan, 0, 2],
                           [1, 0, 0]])
    # NOTE(review): an AttributeError from this call is silently tolerated —
    # confirm whether update_matrix() is expected to succeed for valid input
    try:
        hic.update_matrix(new_matrix, new_cut_intervals)
    except AttributeError:
        pass
    # if matrix.shape[0] not equal to length of cut_intervals assertionError is raised
    short_cut_intervals = [('c', 0, 10, 1), ('d', 10, 20, 1)]

    with pytest.raises(AssertionError):
        hic.update_matrix(new_matrix, short_cut_intervals)

    # if matrix contains masked bins exception is raised
    masking_ids = [0, 1]
    hic.maskBins(masking_ids)

    with pytest.raises(Exception):
        hic.update_matrix(new_matrix, new_cut_intervals)
719 |
720 |
def test_restoreMaskedBins():
    """restoreMaskedBins() must re-insert previously masked bins (values as NaN/0)."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # function should directly return if there are no masked_bins
    hic.restoreMaskedBins()

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # test general use
    # first get some masked bins
    masking_ids = [0, 1]
    hic.maskBins(masking_ids)

    new_matrix = np.array([[0, 0, 2],
                           [0, 0, 1],
                           [0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # and now restore masked bins
    hic.restoreMaskedBins()

    # restored rows/columns come back as NaN, the kept block is unchanged
    result_matrix = np.array([[np.nan, np.nan, np.nan, np.nan, np.nan],
                              [np.nan, np.nan, np.nan, np.nan, np.nan],
                              [np.nan, np.nan, 0, 0, 2],
                              [np.nan, np.nan, 0, 0, 1],
                              [np.nan, np.nan, 0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), result_matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # second round: symmetric integer matrix, mask a middle bin and restore it
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[0, 10, 5, 3, 0],
                       [0, 0, 15, 5, 1],
                       [0, 0, 0, 7, 3],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]], dtype=np.int32)

    # make the matrix symmetric:
    hic.matrix = csr_matrix(matrix + matrix.T)
    hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals)

    # Add masked bins masked bins
    hic.maskBins([3])

    matrix = hic.matrix.todense()
    test_matrix = np.array([[0, 10, 5, 0],
                            [10, 0, 15, 1],
                            [5, 15, 0, 3],
                            [0, 1, 3, 0]], dtype=np.int32)

    nt.assert_equal(matrix, test_matrix)

    cut_int = hic.cut_intervals
    test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)]

    nt.assert_equal(cut_int, test_cut_int)

    hic.restoreMaskedBins()

    # restored bin 3 reappears with zero counts (float matrix after restore)
    dense = hic.matrix.todense()
    test_dense = np.array([[0., 10., 5., 0., 0.],
                           [10., 0., 15., 0., 1.],
                           [5., 15., 0., 0., 3.],
                           [0., 0., 0., 0., 0.],
                           [0., 1., 3., 0., 0.]])

    nt.assert_equal(dense, test_dense)

    cut_int = hic.cut_intervals
    test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                    ('a', 30, 40, 1), ('b', 40, 50, 1)]

    nt.assert_equal(cut_int, test_cut_int)
816 |
817 |
def test_reorderMatrix():
    """reorderMatrix() moves the bin range `orig` so it starts at position `dest`."""
    orig = (1, 3)
    dest = 2

    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), counts)

    hic.reorderMatrix(orig, dest)

    # rows/columns and cut intervals follow the new bin order
    expected_matrix = np.array([[1, 3, 8, 5, 0],
                                [0, 0, 0, 0, 1],
                                [0, 5, 4, 15, 1],
                                [0, 0, 0, 0, 2],
                                [0, 0, 0, 0, 0]])
    expected_intervals = [('a', 0, 10, 1), ('b', 30, 40, 1),
                          ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)]

    nt.assert_equal(hic.getMatrix(), expected_matrix)
    nt.assert_equal(hic.matrix.shape, expected_matrix.shape)
    nt.assert_equal(hic.cut_intervals, expected_intervals)
856 |
857 |
def test_truncTrans():
    """truncTrans() must cap extreme inter-chromosomal counts and be a no-op otherwise."""
    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    # inter-chromosomal entries include the outliers 100 and 2000
    matrix = np.array([[-1, 8, 5, 3, 0],
                       [np.nan, 4, 15, 5, 100],
                       [0, 0, 0, 0, 2000],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)

    # define expected outcome
    new_matrix = np.array([[-1., 8., 5., 3., 0.],
                           [np.nan, 4., 15., 5., 1.e+2],
                           [0., 0., 0., 0., 2.e+3],
                           [0., 0., 0., 0., 1.],
                           [0., 0., 0., 0., 0.]])

    # truncTrans of matrix
    hic.truncTrans()

    # test against expected outcome
    nt.assert_equal(hic.getMatrix(), new_matrix)

    # reset matrix with no extreme inter-chromosomal values
    matrix = np.array([[-1, 8, 5, 3, 0],
                       [np.nan, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    # method should directly return if nothing to do, matrix stays the same
    hic.truncTrans()
    nt.assert_equal(hic.getMatrix(), matrix)
902 |
903 |
def test_printchrtoremove(capsys):
    """printchrtoremove() records prev_to_remove and works with masked bins."""
    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)

    # first test exception message for no self.prev_to_remove
    to_remove = [0, 1]

    # NOTE(review): the stdout comparison below follows a pytest.raises block;
    # confirm the message is actually printed to stdout (not logged) and that
    # the call raises as expected
    with pytest.raises(Exception):
        hic.printchrtoremove(to_remove)

    captured = capsys.readouterr()
    assert captured.out == "No self.prev_to_remove defined, defining it now."

    nt.assert_equal(hic.prev_to_remove, np.array(to_remove))

    nt.assert_equal(hic.orig_bin_ids, [])

    # also test with masked_bins
    hic.maskBins(to_remove)

    assert len(hic.orig_bin_ids) > 0

    hic.printchrtoremove(to_remove)

    nt.assert_equal(hic.prev_to_remove, np.array(to_remove))
944 |
945 |
def test_get_chromosome_sizes_real():
    """get_chromosome_sizes_real() derives per-chromosome sizes from the cut intervals."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), counts)

    # a spans 0-30, b spans 30-50
    nt.assert_equal(hic.get_chromosome_sizes_real(),
                    OrderedDict([('a', 31), ('b', 21)]))

    # new intervals with three chromosomes
    hic.setMatrix(hic.matrix, [('a', 0, 10, 1), ('b', 10, 20, 1),
                               ('b', 20, 30, 1), ('c', 30, 40, 1), ('c', 40, 90, 1)])

    nt.assert_equal(hic.get_chromosome_sizes_real(),
                    OrderedDict([('a', 11), ('b', 21), ('c', 61)]))
983 |
984 |
def test_get_chromosome_sizes():
    """get_chromosome_sizes() reports per-chromosome sizes based on the last bin end."""
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), counts)

    # last bin of a ends at 30, last bin of b at 50
    nt.assert_equal(hic.get_chromosome_sizes(),
                    OrderedDict([('a', 30), ('b', 50)]))

    # new intervals with three chromosomes
    hic.setMatrix(hic.matrix, [('a', 0, 10, 1), ('b', 10, 20, 1),
                               ('b', 20, 30, 1), ('c', 30, 40, 1), ('c', 40, 90, 1)])

    nt.assert_equal(hic.get_chromosome_sizes(),
                    OrderedDict([('a', 10), ('b', 30), ('c', 90)]))
1022 |
1023 |
def test_intervalListToIntervalTree(capsys):
    """intervalListToIntervalTree() builds per-chromosome trees and bin boundaries."""
    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)

    # empty list should raise AssertionError
    interval_list = []
    with pytest.raises(AssertionError):
        hic.intervalListToIntervalTree(interval_list)

    # NOTE(review): this checks stdout right after an exception was raised;
    # confirm the message is printed to stdout rather than logged
    captured = capsys.readouterr()
    assert captured.out == "Interval list is empty"

    # test with correct interval_list
    # NOTE(review): the last entry ('e', 220, 250) has only three fields while
    # all others have four — confirm this is intentional
    interval_list = [('a', 0, 10, 1), ('a', 10, 20, 1), ('b', 20, 30, 1), ('b', 30, 50, 1),
                     ('b', 50, 100, 1), ('c', 100, 200, 1), ('c', 200, 210, 1),
                     ('d', 210, 220, 1), ('e', 220, 250)]

    tree, boundaries = hic.intervalListToIntervalTree(interval_list)

    # test tree: each interval carries its global bin index as payload
    nt.assert_equal(tree['a'], IntervalTree([Interval(0, 10, 0), Interval(10, 20, 1)]))
    nt.assert_equal(tree['b'], IntervalTree([Interval(20, 30, 2), Interval(30, 50, 3),
                                             Interval(50, 100, 4)]))
    nt.assert_equal(tree['c'], IntervalTree([Interval(100, 200, 5), Interval(200, 210, 6)]))
    nt.assert_equal(tree['d'], IntervalTree([Interval(210, 220, 7)]))
    nt.assert_equal(tree['e'], IntervalTree([Interval(220, 250, 8)]))

    # test boundaries: half-open bin index ranges per chromosome
    nt.assert_equal(boundaries, OrderedDict([('a', (0, 2)), ('b', (2, 5)), ('c', (5, 7)),
                                             ('d', (7, 8)), ('e', (8, 9))]))
1069 |
1070 |
def test_fillLowerTriangle():
    """fillLowerTriangle() mirrors the upper triangle into the lower one."""
    upper = np.array([[12, 5, 3, 2, 0],
                      [0, 11, 4, 1, 1],
                      [0, 0, 9, 6, 0],
                      [0, 0, 0, 10, 0],
                      [0, 0, 0, 0, 0]])
    hic = hm.hiCMatrix()
    hic.matrix = csr_matrix(upper, dtype=np.int32)

    hic.fillLowerTriangle()

    expected = np.array([[12, 5, 3, 2, 0],
                         [5, 11, 4, 1, 1],
                         [3, 4, 9, 6, 0],
                         [2, 1, 6, 10, 0],
                         [0, 1, 0, 0, 0]], dtype='i4')

    nt.assert_equal(hic.matrix.todense(), expected)
1085 |
1086 |
def test_getDistList():
    """getDistList() returns per-entry genomic distances (-1 for inter-chromosomal pairs)."""
    row, col = np.triu_indices(5)
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1),
            ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    dist_list, chrom_list = hm.hiCMatrix.getDistList(row, col, bins)

    # place the distances back into a dense matrix for easy comparison
    dist_matrix = coo_matrix((dist_list, (row, col)), shape=(5, 5), dtype=np.int32).todense()
    expected_dist = np.array([[0, 10, 20, 30, -1],
                              [0, 0, 10, 20, -1],
                              [0, 0, 0, 10, -1],
                              [0, 0, 0, 0, -1],
                              [0, 0, 0, 0, 0]], dtype='i4')
    nt.assert_equal(dist_matrix, expected_dist)

    # intra-chromosomal pairs carry the chromosome name, inter-chromosomal ''
    expected_chroms = ['a', 'a', 'a', 'a', '', 'a', 'a', 'a', '', 'a', 'a', '', 'a',
                       '', 'b']
    nt.assert_equal(chrom_list.tolist(), expected_chroms)
1106 |
1107 |
def test_convert_to_obs_exp_matrix():
    """convert_to_obs_exp_matrix() with defaults, with maxdepth, and with zscore=True."""
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 7, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    # default conversion: each entry divided by its per-distance expectation
    obs_exp_matrix = hic.convert_to_obs_exp_matrix().todense()
    test_matrix = np.array([[1., 0.8, 1., 1., 0.],
                            [0., 4., 1.5, 1., 1.],
                            [0., 0., 0., 0.7, 2.],
                            [0., 0., 0., 0., 1.],
                            [0., 0., 0., 0., 0.]])

    nt.assert_almost_equal(obs_exp_matrix, test_matrix)

    # restricting to maxdepth=20 zeroes/NaNs entries beyond that distance
    hic.matrix = csr_matrix(matrix)
    obs_exp_matrix = hic.convert_to_obs_exp_matrix(maxdepth=20).todense()
    test_matrix = np.array([[1., 0.8, 1., 0., 0.],
                            [0., 4., 1.5, 1., 0.],
                            [0., 0., 0., 0.7, np.nan],
                            [0., 0., 0., 0., np.nan],
                            [0., 0., 0., 0., 0.]])

    nt.assert_almost_equal(obs_exp_matrix, test_matrix)

    # zscore=True returns per-distance z-scores instead of ratios
    hic.matrix = csr_matrix(matrix)

    obs_exp_matrix = hic.convert_to_obs_exp_matrix(zscore=True).todense()
    test_matrix = np.array([[0., -0.56195149, np.nan, np.nan, -1.41421356],
                            [0., 1.93649167, 1.40487872, np.nan, 0.],
                            [0., 0., -0.64549722, -0.84292723, 1.41421356],
                            [0., 0., 0., -0.64549722, 0.],
                            [0., 0., 0., 0., -0.64549722]])

    nt.assert_almost_equal(obs_exp_matrix, test_matrix)
1151 |
1152 |
def test_maskChromosomes():
    """Smoke test: maskChromosomes on a chromosome that exists must not raise.

    NOTE(review): this test has no assertions — it only verifies that the
    call completes without an exception (the xfail companion below covers
    the missing-chromosome case).
    """
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.maskChromosomes(['a'])
1170 |
1171 |
@pytest.mark.xfail
def test_maskChromosomes_fail():
    """Masking a chromosome absent from the matrix is expected to fail."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []

    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)

    # 'c' is not present in cut_intervals, so this call should raise.
    hic.maskChromosomes(['c'])

    print(hic.matrix)
1193 |
1194 |
def test_create_from_cool():
    # Construct hiCMatrix straight from cool fixtures that contain exactly
    # one interaction each; all other bins must be reported as nan bins.
    # Off-diagonal contact: one count between bin 0 and bin 3.
    hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_4chr.cool')
    nt.assert_equal(sorted(hic_ma.matrix.indices), [0, 3])
    nt.assert_equal(sorted(hic_ma.matrix.data), [1, 1])
    nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 4, 5, 6])
    # Diagonal contact: a single count on bin 0 only.
    hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_diag_4chr.cool')
    nt.assert_equal(sorted(hic_ma.matrix.indices), [0])
    nt.assert_equal(sorted(hic_ma.matrix.data), [1])
    nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 3, 4, 5])
    # Dropping every nan bin leaves only the single populated bin.
    hic_ma.maskBins(hic_ma.nan_bins)
    assert hic_ma.matrix.shape == (1, 1)
    assert hic_ma.getBinSize() == 50000
1207 |
1208 |
def test_load_cool_matrix_only():
    """pLoadMatrixOnly must yield the same (instances, features, data) triple
    as the fully constructed sparse matrix."""
    hic_cool = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool', pUpperTriangleOnly=True)

    hic_cool_matrix_only = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool', pUpperTriangleOnly=True, pLoadMatrixOnly=True)
    # With pLoadMatrixOnly, .matrix is a tuple-like of raw COO components.
    instances, features, data = (hic_cool_matrix_only.matrix[0],
                                 hic_cool_matrix_only.matrix[1],
                                 hic_cool_matrix_only.matrix[2])

    ref_instances, ref_features = hic_cool.matrix.nonzero()
    nt.assert_equal(hic_cool.matrix.data, data)
    nt.assert_equal(ref_instances, instances)
    nt.assert_equal(ref_features, features)
1221 |
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/Li_et_al_2015.cool:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/Li_et_al_2015.cool
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/Li_et_al_2015.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/Li_et_al_2015.h5
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/one_interaction_4chr.cool:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/one_interaction_4chr.cool
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/one_interaction_diag_4chr.cool:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/one_interaction_diag_4chr.cool
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/small_test_matrix.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/small_test_matrix.h5
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/test_matrix.homer:
--------------------------------------------------------------------------------
1 | HiCMatrix (directory=wtTagDir/) Regions 3R-1000000 3R-1020000 3R-1040000 3R-1060000 3R-1080000 3R-1100000 3R-1120000 3R-1140000 3R-1160000 3R-1180000 3R-1200000 3R-1220000 3R-1240000
2 | 3R-1000000 3R-1000000 1.000e+00 1.896e-01 2.163e-01 8.288e-02 1.431e-01 2.569e-01 1.315e-01 1.488e-01 -3.120e-02 1.430e-01 6.091e-02 3.546e-02 1.168e-01
3 | 3R-1020000 3R-1020000 1.896e-01 1.000e+00 3.695e-01 3.666e-01 1.456e-01 1.940e-01 2.517e-01 1.511e-01 2.184e-01 1.727e-01 1.676e-01 -1.512e-02 -6.450e-02
4 | 3R-1040000 3R-1040000 2.163e-01 3.695e-01 1.000e+00 3.818e-01 2.833e-01 2.460e-01 2.430e-01 3.630e-01 1.483e-01 2.690e-01 2.176e-01 -6.305e-02 -1.125e-01
5 | 3R-1060000 3R-1060000 8.288e-02 3.666e-01 3.818e-01 1.000e+00 3.246e-01 2.644e-01 2.107e-01 3.149e-01 2.863e-01 2.273e-01 2.582e-01 -1.020e-02 2.029e-02
6 | 3R-1080000 3R-1080000 1.431e-01 1.456e-01 2.833e-01 3.246e-01 1.000e+00 2.488e-01 2.928e-01 2.152e-01 3.685e-01 2.373e-01 1.003e-01 1.003e-01 4.465e-02
7 | 3R-1100000 3R-1100000 2.569e-01 1.940e-01 2.460e-01 2.644e-01 2.488e-01 1.000e+00 3.083e-01 3.408e-01 3.025e-01 1.565e-01 1.917e-01 -6.210e-02 7.574e-02
8 | 3R-1120000 3R-1120000 1.315e-01 2.517e-01 2.430e-01 2.107e-01 2.928e-01 3.083e-01 1.000e+00 2.484e-01 2.986e-01 2.647e-01 2.333e-01 7.504e-02 -4.602e-02
9 | 3R-1140000 3R-1140000 1.488e-01 1.511e-01 3.630e-01 3.149e-01 2.152e-01 3.408e-01 2.484e-01 1.000e+00 3.777e-01 1.729e-01 1.445e-01 -1.355e-02 6.834e-02
10 | 3R-1160000 3R-1160000 -3.120e-02 2.184e-01 1.483e-01 2.863e-01 3.685e-01 3.025e-01 2.986e-01 3.777e-01 1.000e+00 1.299e-01 4.142e-02 2.557e-02 8.888e-02
11 | 3R-1180000 3R-1180000 1.430e-01 1.727e-01 2.690e-01 2.273e-01 2.373e-01 1.565e-01 2.647e-01 1.729e-01 1.299e-01 1.000e+00 2.826e-01 7.371e-02 -1.322e-01
12 | 3R-1200000 3R-1200000 6.091e-02 1.676e-01 2.176e-01 2.582e-01 1.003e-01 1.917e-01 2.333e-01 1.445e-01 4.142e-02 2.826e-01 1.000e+00 3.217e-01 1.061e-01
13 | 3R-1220000 3R-1220000 3.546e-02 -1.512e-02 -6.305e-02 -1.020e-02 1.003e-01 -6.210e-02 7.504e-02 -1.355e-02 2.557e-02 7.371e-02 3.217e-01 1.000e+00 1.326e-01
14 | 3R-1240000 3R-1240000 1.168e-01 -6.450e-02 -1.125e-01 2.029e-02 4.465e-02 7.574e-02 -4.602e-02 6.834e-02 8.888e-02 -1.322e-01 1.061e-01 1.326e-01 1.000e+00
15 |
--------------------------------------------------------------------------------
/hicmatrix/test/test_data/test_matrix.homer.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeptools/HiCMatrix/f36927fd387aa7201a0a006a9c57ecccb29cab09/hicmatrix/test/test_data/test_matrix.homer.gz
--------------------------------------------------------------------------------
/hicmatrix/test/test_matrixFileHandler.py:
--------------------------------------------------------------------------------
import logging
import os
import tempfile
from tempfile import NamedTemporaryFile

import cooler
import numpy as np
import numpy.testing as nt
import pytest

from hicmatrix.lib import MatrixFileHandler

log = logging.getLogger(__name__)

# Directory holding the fixture files shipped with the package.
ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data/")
# Basename for scratch output files written by the round-trip tests.
# Use the platform temp directory instead of a hard-coded '/tmp' so the
# tests also run on systems where '/tmp' does not exist (e.g. Windows).
outfile_basename = os.path.join(tempfile.gettempdir(), 'matrix')
16 |
17 |
def test_load_homer():
    # create matrixFileHandler instance with filetype 'homer'
    pMatrixFile = ROOT + 'test_matrix.homer'
    fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # create test matrix: first row of the homer fixture (correlation values)

    test_matrix = np.array([[1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315,
                             0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168]])

    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # 20 kb bins on chromosome 3R, each with a coverage value of 1
    test_cut_intervals = [('3R', 1000000, 1020000, 1), ('3R', 1020000, 1040000, 1), ('3R', 1040000, 1060000, 1), ('3R', 1060000, 1080000, 1), ('3R', 1080000, 1100000, 1), ('3R', 1100000, 1120000, 1), ('3R', 1120000, 1140000, 1), ('3R', 1140000, 1160000, 1), ('3R', 1160000, 1180000, 1), ('3R', 1180000, 1200000, 1), ('3R', 1200000, 1220000, 1), ('3R', 1220000, 1240000, 1), ('3R', 1240000, 1260000, 1)]  # noqa E501
    nt.assert_equal(cut_intervals, test_cut_intervals)

    # homer files carry no nan-bin, distance-count or correction information
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None
40 |
41 |
def test_load_homer_gzip():
    # create matrixFileHandler instance with filetype 'homer'; the handler
    # must transparently read the gzip-compressed variant of the fixture
    pMatrixFile = ROOT + 'test_matrix.homer.gz'
    fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # create test matrix: first row of the homer fixture (correlation values)

    test_matrix = np.array([[1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315,
                             0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168]])

    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # same expectations as the uncompressed test above: 20 kb bins on 3R
    test_cut_intervals = [('3R', 1000000, 1020000, 1), ('3R', 1020000, 1040000, 1), ('3R', 1040000, 1060000, 1), ('3R', 1060000, 1080000, 1), ('3R', 1080000, 1100000, 1), ('3R', 1100000, 1120000, 1), ('3R', 1120000, 1140000, 1), ('3R', 1140000, 1160000, 1), ('3R', 1160000, 1180000, 1), ('3R', 1180000, 1200000, 1), ('3R', 1200000, 1220000, 1), ('3R', 1220000, 1240000, 1), ('3R', 1240000, 1260000, 1)]  # noqa E501
    nt.assert_equal(cut_intervals, test_cut_intervals)

    # homer files carry no nan-bin, distance-count or correction information
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None
64 |
65 |
def test_save_homer():
    """Round-trip smoke test: load a homer matrix and write it back out."""
    homer_outfile = outfile_basename + '.homer'

    # create matrixFileHandler instance with filetype 'homer'
    handler = MatrixFileHandler(pFileType='homer', pMatrixFile=ROOT + 'test_matrix.homer')
    assert handler is not None

    # load, transfer the components back into the handler, and save
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = handler.load()
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)  # noqa E501
    handler.save(pName=homer_outfile, pSymmetric=False, pApplyCorrection=False)  # not implemented
    os.unlink(homer_outfile)
81 |
82 |
def test_load_h5():
    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # the first bin has no contacts at all
    test_matrix = np.array([[0. for i in range(11104)]])
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # variable-width bins on chromosome X; the fourth element is the
    # per-bin coverage stored in the h5 file
    nt.assert_equal(cut_intervals[0], ('X', 0, 2200, 0.0))
    nt.assert_equal(cut_intervals[1], ('X', 2200, 4702, 0.0))
    nt.assert_equal(cut_intervals[2], ('X', 4702, 7060, 0.0))
    nt.assert_equal(cut_intervals[3], ('X', 7060, 8811, 0.4))

    test_nan_bins = np.array([0, 1, 2, 3, 4, 5, 6, 7, 30, 31, 32, 51, 52, 53, 54, 81, 82, 83, 84, 94])  # noqa E501
    nt.assert_equal(nan_bins[0:20], test_nan_bins)

    assert distance_counts is None

    # empty bins carry a correction factor of 0
    test_correction_factors = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0.90720049, 1.25516028])  # noqa E501
    nt.assert_almost_equal(correction_factors[0:10], test_correction_factors)
107 |
108 |
def test_save_h5():
    """Round-trip smoke test: load an h5 matrix and write it back out."""
    h5_outfile = outfile_basename + '.h5'

    # create matrixFileHandler instance with filetype 'h5'
    handler = MatrixFileHandler(pFileType='h5', pMatrixFile=ROOT + 'Li_et_al_2015.h5')
    assert handler is not None

    # load, transfer the components back into the handler, and save
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = handler.load()
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)  # noqa E501
    handler.save(pName=h5_outfile, pSymmetric=True, pApplyCorrection=None)

    os.unlink(h5_outfile)
125 |
126 |
def test_load_hicpro():
    # create matrixFileHandler instance with filetype 'hicpro'; HiC-Pro input
    # needs both the matrix file and the accompanying bed file with the bins
    pMatrixFile = ROOT + 'test_matrix.hicpro'
    pBedFileHicPro = ROOT + 'test_matrix.bed'
    fh = MatrixFileHandler(pFileType='hicpro', pMatrixFile=pMatrixFile, pBedFileHicPro=pBedFileHicPro)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # create test matrix: row 0 holds exactly three non-zero contacts
    test_list = [0. for i in range(3113)]
    test_list.insert(0, 41.345793)
    test_list[827] = 5.42079
    test_list[1263] = 5.122642

    test_matrix = np.array([test_list])

    # and check for shape and values
    assert matrix[0].todense().shape == test_matrix.shape
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # 1 Mb bins on chr1; the fourth element runs 1..20 here (presumably the
    # bin index taken from the bed file — verify against hicpro.py)
    test_cut_intervals = np.array([('chr1', 0, 1000000, 1), ('chr1', 1000000, 2000000, 2), ('chr1', 2000000, 3000000, 3),
                                   ('chr1', 3000000, 4000000, 4), ('chr1', 4000000, 5000000, 5), ('chr1', 5000000, 6000000, 6),
                                   ('chr1', 6000000, 7000000, 7), ('chr1', 7000000, 8000000, 8), ('chr1', 8000000, 9000000, 9),
                                   ('chr1', 9000000, 10000000, 10), ('chr1', 10000000, 11000000, 11), ('chr1', 11000000, 12000000, 12),
                                   ('chr1', 12000000, 13000000, 13), ('chr1', 13000000, 14000000, 14), ('chr1', 14000000, 15000000, 15),
                                   ('chr1', 15000000, 16000000, 16), ('chr1', 16000000, 17000000, 17), ('chr1', 17000000, 18000000, 18),
                                   ('chr1', 18000000, 19000000, 19), ('chr1', 19000000, 20000000, 20)])
    nt.assert_equal(cut_intervals[0:20], test_cut_intervals)

    # HiC-Pro input provides no nan bins, correction factors or distance counts
    assert nan_bins is None
    assert correction_factors is None
    assert distance_counts is None
161 |
162 |
@pytest.mark.xfail
def test_save_hicpro():
    """Saving in hicpro format is not implemented, hence the xfail marker."""
    hicpro_outfile = outfile_basename + '.hicpro'

    # create matrixFileHandler instance with filetype 'hicpro'
    handler = MatrixFileHandler(pFileType='hicpro',
                                pMatrixFile=ROOT + 'test_matrix.hicpro',
                                pBedFileHicPro=ROOT + 'test_matrix.bed')
    assert handler is not None

    # load, transfer the components back into the handler, and try to save
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = handler.load()
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    handler.save(pName=hicpro_outfile, pSymmetric=False, pApplyCorrection=False)  # not implemented
    os.unlink(hicpro_outfile)
180 |
181 |
def test_load_cool():
    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'Li_et_al_2015.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # test matrix: the first bin has no contacts at all
    test_matrix = np.array([[0. for i in range(11104)]])
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # variable-width bins on chromosome X, each with weight 1.0
    test_cut_intervals = [('X', 0, 2200, 1.0), ('X', 2200, 4702, 1.0), ('X', 4702, 7060, 1.0),
                          ('X', 7060, 8811, 1.0), ('X', 8811, 11048, 1.0), ('X', 11048, 14329, 1.0),
                          ('X', 14329, 16847, 1.0), ('X', 16847, 19537, 1.0), ('X', 19537, 20701, 1.0),
                          ('X', 20701, 22321, 1.0), ('X', 22321, 24083, 1.0), ('X', 24083, 25983, 1.0),
                          ('X', 25983, 27619, 1.0), ('X', 27619, 29733, 1.0), ('X', 29733, 30973, 1.0),
                          ('X', 30973, 32214, 1.0), ('X', 32214, 34179, 1.0), ('X', 34179, 35987, 1.0),
                          ('X', 35987, 37598, 1.0), ('X', 37598, 39009, 1.0)]
    # compare tuple by tuple, element by element
    for index, tup in enumerate(cut_intervals[0:20]):
        for ind, element in enumerate(tup):
            assert element == test_cut_intervals[index][ind]

    test_nan_bins = [0, 1, 2, 3, 4, 5, 6, 7, 30, 31]
    nt.assert_almost_equal(nan_bins[0:10], test_nan_bins)

    # empty bins carry a correction factor of 0
    test_correction_factors = [0., 0., 0., 0., 0., 0., 0., 0., 1.1022922, 0.796711]
    nt.assert_almost_equal(correction_factors[0:10], test_correction_factors)

    assert distance_counts is None
213 |
214 |
def test_load_cool2():
    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'one_interaction_4chr.cool'
    # The interaction is:
    # chr1 10000 chr1 200000
    bin_size = 50000
    # So there should be a 1 between the bin 0 and the bin 3
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # test data: the single interaction is the only non-zero entry
    nt.assert_almost_equal(matrix.data, np.array([1]))

    # test matrix: row 3 is all zero (upper triangle), row 0 has the 1 at column 3
    test_matrix = np.array([[0 for i in range(9167)]])
    nt.assert_almost_equal(matrix[3].todense(), test_matrix)
    test_matrix[0][3] = 1
    nt.assert_almost_equal(matrix[0].todense(), test_matrix)

    # rebuild the full genome-wide bin table: full 50 kb bins per chromosome
    # plus one shorter terminal bin each, then the single chrM bin
    test_cut_intervals = sum([[('chr1', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3909)],
                              [('chr1', 195450000, 195471971, 1.0)],
                              [('chrX', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3420)],
                              [('chrX', 171000000, 171031299, 1.0)],
                              [('chrY', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(1834)],
                              [('chrY', 91700000, 91744698, 1.0)],
                              [('chrM', 0, 16299, 1.0)]], [])

    # compare tuple by tuple, element by element
    for index, tup in enumerate(cut_intervals):
        for ind, element in enumerate(tup):
            assert element == test_cut_intervals[index][ind]

    test_nan_bins = [1, 2, 4, 5]
    nt.assert_almost_equal(nan_bins[:4], test_nan_bins)

    assert distance_counts is None
    assert correction_factors is None
254 |
255 |
def test_save_cool():
    """Saving a cool matrix and reloading it must reproduce every component."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    handler = MatrixFileHandler(pFileType='cool', pMatrixFile=ROOT + 'Li_et_al_2015.cool')
    assert handler is not None

    # load, transfer the components back into the handler, and save
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = handler.load()
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    handler.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True)

    # reload the freshly written file and compare piece by piece
    reloaded = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert reloaded is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = reloaded.load()

    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
282 |
283 |
def test_load_distance_cool():
    # pDistance must restrict loading to contacts within the given genomic
    # distance; the round trip through save/load must stay lossless.
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pChrnameList=['1'], pDistance=2500000)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    # and save it.
    fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load()

    # check distance load works as expected: with 2.5 Mb bins, a 2.5 Mb
    # distance cutoff means no entry may be more than one bin off-diagonal
    instances, features = matrix.nonzero()
    distances = np.absolute(instances - features)
    # log.debug('max: {}'.format(np.max(distances)))
    mask = distances > 1  # 2.5 mb res --> all with 2.5 Mb distance
    assert np.sum(mask) == 0

    # control: loading without pDistance must keep longer-range contacts
    fh = MatrixFileHandler(pFileType='cool', pChrnameList=['1'], pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix2, _, _, _, _ = fh.load()
    instances, features = matrix2.nonzero()
    distances = np.absolute(instances - features)
    mask = distances > 1  # 2.5 mb res --> all with 2.5 Mb distance
    assert np.sum(mask) > 0

    # check if load and save matrix are equal
    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
328 |
329 |
def test_load_h5_save_cool():
    """Load an h5 matrix, save it as cool, and verify the written file holds
    the correction-applied counts and the inverted correction factors."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # transfer the data to a cool writer; flag the h5 origin so the writer
    # converts the correction-factor convention (fileWasH5)
    fh_new = MatrixFileHandler(pFileType='cool')
    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    fh_new.matrixFile.fileWasH5 = True
    # and save it.
    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load()

    # apply the h5 correction manually: each count is divided by the product
    # of the factors of its two interacting bins
    instances, features = matrix.nonzero()
    instances_factors = correction_factors[instances]
    features_factors = correction_factors[features]
    instances_factors *= features_factors

    matrix_applied_correction = matrix.data / instances_factors
    nt.assert_almost_equal(matrix_applied_correction, matrix_test.data, decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    # the cool file stores the reciprocal factors; zero factors invert to
    # inf/nan, which are stored as 0, so mirror that before comparing
    correction_factors = 1 / correction_factors
    mask = np.isnan(correction_factors)
    correction_factors[mask] = 0
    mask = np.isinf(correction_factors)
    correction_factors[mask] = 0
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
373 |
374 |
def test_save_cool_enforce_integer():
    """pEnforceInteger must round the stored counts to integers on save."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    # write a cool file with integer enforcement; flag the h5 origin so the
    # writer converts the correction-factor convention (fileWasH5)
    fh_new = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)
    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    fh_new.matrixFile.fileWasH5 = True
    # and save it.
    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    # reload WITHOUT applying corrections so the raw integer counts are read
    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile, pApplyCorrectionCoolerLoad=False)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, _ = fh_test.load()

    # mirror the writer's rounding on the reference data; rounding can
    # produce explicit zeros, so drop them before comparing
    matrix.data = np.rint(matrix.data)
    matrix.eliminate_zeros()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(cool_outfile)
423 |
424 |
def test_load_cool_hic2cool_versions():
    """Files written by hic2cool 0.4.2 and 0.5.1 must load to the same data
    when the KR correction table is applied.

    NOTE(review): the 0.4.2 file is loaded with an explicit multiplicative
    correction operator — presumably because hic2cool changed the stored
    factor convention between these versions; confirm against cool.py.
    """
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool'
    hic2cool_042 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR', pCorrectionOperator='*')
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    hic2cool_051 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR')

    matrix, cut_intervals, nan_bins, distance_counts, _ = hic2cool_042.load()
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, _ = hic2cool_051.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
443 |
444 |
def test_save_cool_apply_division():
    """Round trip with a divisive correction operator: the saved file must
    match a fresh divisive load of the source file."""
    cool_outfile = outfile_basename + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    source_file = ROOT + 'Li_et_al_2015.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=source_file, pCorrectionOperator='/')
    assert fh is not None

    # load, transfer the components to a new divisive writer, and save
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    fh_new = MatrixFileHandler(pFileType='cool', pCorrectionOperator='/')
    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, _ = fh_test.load()

    # reload the source with the same divisive operator to get reference values
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=source_file, pCorrectionOperator='/')
    assert fh is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(cool_outfile)
479 |
480 |
def test_save_scool_matrixHandlersCool():
    """Saving several cool MatrixFileHandlers into one scool file must create
    one cell group per handler."""
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    matrix, cut_intervals, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    # reuse the same matrix for three pseudo-cells
    coolObjectsList = []
    for cell_name in ('cell1', 'cell2', 'cell3'):
        handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cell_name, pEnforceInteger=False, pFileWasH5=False, pHic2CoolVersion=None)
        handler.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)
        coolObjectsList.append(handler)

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = coolObjectsList

    # the context manager guarantees the temp file is closed and removed,
    # fixing the resource leak previously silenced with pylint R1732
    with NamedTemporaryFile(suffix='.scool', prefix='hicmatrix_scool_test') as outfile:
        matrixFileHandler.save(outfile.name, pSymmetric=True, pApplyCorrection=False)

        content_of_scool = cooler.fileops.list_scool_cells(outfile.name)
        content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
        for content in content_expected:
            assert content in content_of_scool
508 |
509 |
def test_save_scool_pixeltables():
    """Build an scool file directly from bin/pixel tables (no cool handler
    objects) and verify every cell group is present in the output."""
    outfile = NamedTemporaryFile(suffix='.scool', prefix='hicmatrix_scool_test')  # pylint: disable=R1732

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    source_cooler = cooler.Cooler(pMatrixFile)
    bin_table = source_cooler.bins()[:]
    pixel_table = source_cooler.pixels()[:]

    # coolObjectsList is explicitly None: the pixel-table path must be used.
    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = None
    matrixFileHandler.matrixFile.bins = bin_table
    matrixFileHandler.matrixFile.pixel_list = [pixel_table, pixel_table, pixel_table]
    matrixFileHandler.matrixFile.name_list = ['cell1', 'cell2', 'cell3']
    matrixFileHandler.save(outfile.name, pSymmetric=True, pApplyCorrection=False)

    stored_cells = cooler.fileops.list_scool_cells(outfile.name)
    for expected_cell in ('/cells/cell1', '/cells/cell2', '/cells/cell3'):
        assert expected_cell in stored_cells
532 |
533 |
def test_load_cool_matrix_only():
    """With pLoadMatrixOnly=True, load() returns raw COO components
    (instances, features, data, shape) and None for all metadata;
    compare them against a full load of the same file."""
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    handler_matrix_only = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pLoadMatrixOnly=True)
    matrix, cut_intervals, nan_bins, \
        distance_counts, correction_factors = handler_matrix_only.load()

    # Matrix-only mode: four raw components, no metadata.
    assert len(matrix) == 4
    assert cut_intervals is None
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None

    handler_full = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    full_matrix = handler_full.load()[0]

    instances, features = full_matrix.nonzero()
    nt.assert_almost_equal(matrix[0], instances, decimal=1)
    nt.assert_almost_equal(matrix[1], features, decimal=1)
    nt.assert_almost_equal(matrix[2], full_matrix.data, decimal=1)
    assert matrix[3] == full_matrix.shape[0]
557 |
--------------------------------------------------------------------------------
/hicmatrix/utilities.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import sys
3 |
4 | import numpy as np
5 |
6 |
def toString(s):
    """
    Recursively coerce bytes-like values to ``str``.

    Parameters
    ----------
    s : str, bytes, list or numpy.ndarray
        Value to convert. Lists are converted element-wise; ndarrays
        are converted via ``astype(str)``.

    Returns
    -------
    The ``str`` form of ``s``. Values of any other type are returned
    unchanged.
    """
    if isinstance(s, str):
        return s
    if isinstance(s, bytes):
        # The Python-2 fallback was removed: pyproject.toml declares
        # requires-python >= 3.7, so bytes always need decoding here.
        return s.decode('ascii')
    if isinstance(s, list):
        return [toString(x) for x in s]
    if isinstance(s, np.ndarray):
        return s.astype(str)
    return s
22 |
23 |
def toBytes(s):
    """
    Like toString, but for functions requiring bytes.

    Parameters
    ----------
    s : str, bytes or list
        Value to convert. Lists are converted element-wise.

    Returns
    -------
    The ``bytes`` form of ``s``; strings are ASCII-encoded. Values of
    any other type are returned unchanged.
    """
    # The Python-2 early return and the commented-out np.bytes_ branch
    # were removed: pyproject.toml declares requires-python >= 3.7.
    if isinstance(s, bytes):
        return s
    if isinstance(s, str):
        return bytes(s, 'ascii')
    if isinstance(s, list):
        return [toBytes(x) for x in s]
    return s
39 |
40 |
def check_chrom_str_bytes(pIteratableObj, pObj):
    """
    Make the string type of ``pObj`` (str vs bytes) match the elements
    of ``pIteratableObj``.

    Parameters
    ----------
    pIteratableObj : iterable
        Object whose first element determines the target type.
    pObj : str, bytes or list
        Value to convert; for a non-empty list, its first element
        determines the current type.

    Returns
    -------
    ``pObj`` converted with toString/toBytes when the types differ,
    otherwise unchanged.
    """
    # Determine the current type of pObj.
    if isinstance(pObj, list) and len(pObj) > 0:
        type_ = type(pObj[0])
    else:
        type_ = type(pObj)
    # Sample a single element exactly once. The original called
    # next(iter(...)) up to three times, which consumes several items
    # when pIteratableObj is a one-shot iterator. It also tested
    # isinstance(type(sample), type_) -- comparing the *type object*
    # against type_, which is effectively always False -- so the
    # conversion branch ran unconditionally.
    sample = next(iter(pIteratableObj))
    if not isinstance(sample, type_):
        if isinstance(sample, str):
            pObj = toString(pObj)
        elif type(sample) in [bytes, np.bytes_]:
            pObj = toBytes(pObj)
    return pObj
53 |
54 |
def convertNansToZeros(ma):
    """
    Replace NaN entries in the ``data`` array of a sparse matrix with 0.0.

    The matrix is modified in place and the same object is returned.
    """
    nan_mask = np.isnan(ma.data)
    if nan_mask.any():
        ma.data[nan_mask] = 0.0
    return ma
60 |
61 |
def convertNansToOnes(pArray):
    """
    Replace NaN entries of a numpy array with 1.0.

    The array is modified in place and the same object is returned.
    """
    nan_mask = np.isnan(pArray)
    if nan_mask.any():
        pArray[nan_mask] = 1.0
    return pArray
67 |
68 |
def enlarge_bins(bin_intervals):
    r"""
    takes a list of consecutive, but not
    directly touching, bin intervals
    and joins them such that the
    end and start of consecutive bins
    is the same.

    The list is modified in place and also returned. Each interval is a
    tuple ``(chrom, start, end, extra)``.

    >>> bin_intervals = [('chr1', 10, 50, 1), ('chr1', 50, 80, 2),
    ... ('chr2', 10, 60, 3), ('chr2', 70, 90, 4)]
    >>> enlarge_bins(bin_intervals)
    [('chr1', 0, 50, 1), ('chr1', 50, 80, 2), ('chr2', 0, 65, 3), ('chr2', 65, 90, 4)]
    """
    # enlarge remaining bins
    # chr_start flags that the bin at the current index is the first bin
    # of a chromosome, so its start must be pulled back to 0.
    chr_start = True
    for idx in range(len(bin_intervals) - 1):
        chrom, start, end, extra = bin_intervals[idx]
        chrom_next, start_next, end_next, extra_next = bin_intervals[idx + 1]

        if chr_start is True:
            # First bin of a chromosome: anchor it at position 0.
            start = 0
            chr_start = False
        bin_intervals[idx] = (chrom, start, end, extra)
        if chrom == chrom_next and end != start_next:
            # Gap between consecutive bins on the same chromosome: split
            # it at the integer midpoint (the earlier bin absorbs the
            # extra base when the gap size is odd).
            middle = start_next - int((start_next - end) / 2)
            bin_intervals[idx] = (chrom, start, middle, extra)
            bin_intervals[idx + 1] = (chrom, middle, end_next, extra_next)
        if chrom != chrom_next:
            # Next bin opens a new chromosome; reset the flag for it.
            chr_start = True

    # NOTE(review): the final interval is unpacked and repacked
    # unchanged; if the last bin is the only/first bin of its chromosome
    # its start is NOT reset to 0 here -- confirm this is intended.
    chrom, start, end, extra = bin_intervals[-1]
    bin_intervals[-1] = (chrom, start, end, extra)

    return bin_intervals
103 |
104 |
def opener(filename):
    """
    Open ``filename`` for binary reading, transparently handling gzip.

    Returns a ``gzip.GzipFile`` wrapping the open handle when the file
    starts with the gzip magic number (``\\x1f\\x8b``); otherwise the
    plain binary handle, rewound to the start in both cases.
    """
    handle = open(filename, 'rb')  # pylint: disable=R1732
    magic = handle.read(2)
    handle.seek(0)
    if magic == b'\x1f\x8b':
        return gzip.GzipFile(fileobj=handle)
    return handle
118 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools >= 61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "HiCMatrix"
7 | version = "17.2"
8 | authors = [
9 | { name = "Lucille Lopez-Delisle, Joachim Wolff, Leily Rabbani, Vivek Bhardwaj, Fidel Ramirez", email = "lucille.delisle@epfl.ch" },
10 | ]
11 | description = "Helper package which implements HiCMatrix class for HiCExplorer, pyGenomeTracks and scHiCExplorer."
12 | readme = "README.rst"
13 | requires-python = ">=3.7"
14 | classifiers = [
15 | 'Intended Audience :: Science/Research',
16 | 'Topic :: Scientific/Engineering :: Bio-Informatics',
17 | ]
18 | dependencies = [
19 | "numpy >= 1.20",
20 | "scipy >= 1.2",
21 | "tables >= 3.5",
22 | "pandas >= 0.25",
23 | "cooler >= 0.8.9",
24 | "intervaltree >= 3.0",
25 | "importlib_metadata; python_version<'3.8'"
26 | ]
27 |
28 | [project.urls]
29 | Homepage = "https://github.com/deeptools/HiCMatrix"
30 | Issues = "https://github.com/deeptools/HiCMatrix/issues"
31 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | filterwarnings =
3 | ignore::UserWarning
4 | ignore::FutureWarning
5 | ignore::DeprecationWarning
6 | ignore::ImportWarning
--------------------------------------------------------------------------------