├── .gitattributes
├── .github
│   └── workflows
│       ├── pythonpackage.yml
│       ├── pythonpublish.yml
│       └── ruff.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── conftest.py
├── notebooks
│   ├── clustering.ipynb
│   └── output
│       └── cluster_three_diseases.png
├── phenopy
│   ├── __init__.py
│   ├── __main__.py
│   ├── build_hpo.py
│   ├── config.py
│   ├── d2p.py
│   ├── data
│   │   ├── lgb.model.pkl
│   │   ├── oa_phenotype_age.tsv
│   │   ├── phenopy.wv.model.txt.gz
│   │   └── phenotype_groups.txt
│   ├── ic.py
│   ├── network.py
│   ├── score.py
│   ├── util.py
│   └── weights.py
├── phenoseries
│   ├── __init__.py
│   ├── experiment.py
│   └── phenoseries.requirements.txt
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── data
    │   ├── hp.obo
    │   ├── phenotype.hpoa
    │   ├── phenotype_age.tsv
    │   ├── test.score-long.txt
    │   ├── test.score-one-patient.txt
    │   └── test.score-short.txt
    ├── fixtures
    │   ├── __init__.py
    │   └── get_data_dictionary.py
    ├── test_ic.py
    ├── test_network.py
    ├── test_score.py
    ├── test_util.py
    └── test_weights.py

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.py linguist-language=python
*.ipynb linguist-documentation

--------------------------------------------------------------------------------
/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
name: Python package

on: [push]
jobs:
  build:
    timeout-minutes: 60
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 4
      matrix:
        python-version: [3.9]
    steps:
    - uses: actions/checkout@v1
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install and configure Poetry
      uses: snok/install-poetry@v1
      with:
        version: 1.4.2
        virtualenvs-create: true
        virtualenvs-in-project: false
        virtualenvs-path: ~/my-custom-path
        installer-parallel: true
    - name: Install dependencies
      run: |
        poetry install
    - name: Test with pytest
      run: |
        poetry run pytest --cov=phenopy --cov-report=xml
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v1.0.2
      with:
        token: ${{secrets.CODECOV_TOKEN}}
        file: ./coverage.xml
        flags: unittests

--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
name: Upload Python Package

on:
  release:
    types: [created]

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v1
    - name: Set up Python
      uses: actions/setup-python@v1
      with:
        python-version: '3.9'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install pipenv
        pipenv install --dev
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        pipenv run python setup.py sdist bdist_wheel
        pipenv run twine upload dist/*

--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
name: Ruff
on: [push, pull_request]
jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: chartboost/ruff-action@v1
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | 117 | # PyCharm 118 | .idea/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.12.0 10 | hooks: 11 | - id: black 12 | args: [--line-length=88] 13 | - repo: https://github.com/charliermarsh/ruff-pre-commit 14 | rev: "v0.0.264" 15 | hooks: 16 | - id: ruff 17 | args: [--line-length=88] 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # source image 2 | FROM python:3.9 3 | 4 | # set noninterative mode 5 | ENV DEBIAN_FRONTEND noninteractive 6 | 7 | # apt update and install global requirements 8 | RUN apt-get clean all && \ 9 | apt-get update && \ 10 | apt-get upgrade -y && \ 11 | apt-get install -y \ 12 | build-essential 13 | 14 | # apt clean and remove cached source lists 15 | RUN apt-get clean && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | # install pipenv 19 | RUN pip install pipenv --upgrade 20 | 21 | # copy app code 22 | COPY . 
/app 23 | WORKDIR /app 24 | 25 | # install python requirements 26 | RUN pipenv install --dev --deploy --system 27 | 28 | # install phenopy 29 | RUN pip install . 30 | 31 | # default command 32 | CMD ["phenopy"] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. 
More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. 
Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. 
Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include phenopy/data/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![python-version](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/) 2 | [![github-actions](https://github.com/GeneDx/phenopy/workflows/Python%20package/badge.svg)](https://github.com/GeneDx/phenopy/actions) 3 | [![codecov](https://codecov.io/gh/GeneDx/phenopy/branch/develop/graph/badge.svg)](https://codecov.io/gh/GeneDx/phenopy) 4 | [![DOI](https://zenodo.org/badge/207335538.svg)](https://zenodo.org/badge/latestdoi/207335538) 5 | 6 | # phenopy 7 | `phenopy` was developed using Python 3.9 and functions to perform phenotype similarity scoring by semantic similarity. `phenopy` is a 8 | lightweight but highly optimized command line tool and library to efficiently perform semantic similarity scoring on 9 | generic entities with phenotype annotations from the [Human Phenotype Ontology (HPO)](https://hpo.jax.org/app/). 10 | 11 | ![Phenotype Similarity Clustering](https://raw.githubusercontent.com/GeneDx/phenopy/develop/notebooks/output/cluster_three_diseases.png) 12 | 13 | ## Installation 14 | Install using pip: 15 | ```bash 16 | pip install phenopy 17 | ``` 18 | 19 | Install from GitHub: 20 | ```bash 21 | git clone https://github.com/GeneDx/phenopy.git 22 | cd phenopy 23 | pipx install poetry 24 | poetry install 25 | ``` 26 | 27 | ## Command Line Usage 28 | ### score 29 | `phenopy` is primarily used as a command line tool. An entity, as described here, is presented as a sample, gene, or 30 | disease, but could be any concept that warrants annotation of phenotype terms. 31 | 32 | Use `phenopy score` to perform semantic similarity scoring in various formats. 
Write the results of any command to a file using `--output-file=/path/to/output_file.txt`.

1. Score similarity of entities defined by the HPO terms from an input file against all the OMIM diseases in
   `.phenopy/data/phenotype.hpoa`. We provide a test input file in the repo. The default summarization method is
   `--summarization-method=BMWA`, which weighs each disease's phenotypes by the frequency with which a phenotype is
   seen in that particular disease.
    ```bash
    phenopy score tests/data/test.score-short.txt
    ```
    Output:
    ```
    #query	entity_id	score
    118200	210100	0.0
    118200	615779	0.0
    118200	613266	0.0052
    ...
    ```

2. Score similarity of entities defined by the HPO terms from an input file against all the OMIM diseases in
   `.phenopy/data/phenotype.hpoa`. To use the non-weighted summarization method, pass `--summarization-method=BMA`,
   which uses a traditional *best-match average* summarization of semantic similarity scores when comparing terms
   from record *a* with terms from record *b*.
    ```bash
    phenopy score tests/data/test.score-short.txt --summarization-method=BMA
    ```
    Output:
    ```
    #query	entity_id	score
    118200	210100	0.0
    118200	615779	0.0
    118200	613266	0.0052
    ...
    ```

3. Score similarity of entities defined by the HPO terms from an input file against a custom list of entities with
   HPO annotations, referred to as the `--records-file`. Both files are in the same format.
    ```bash
    phenopy score tests/data/test.score-short.txt --records-file tests/data/test.score-long.txt
    ```
    Output:
    ```
    #query	entity_id	score
    118200	118200	0.0169
    118200	300905	0.0156
    118200	601098	0.0171
    ...
    ```

4. Score pairwise similarity of entities defined by the HPO terms from an input file using `--self`.
    ```bash
    phenopy score tests/data/test.score-long.txt --threads 4 --self
    ```
    Output:
    ```
    #query	entity_id	score
    118200	118200	0.2284
    118200	118210	0.1302
    118200	118211	0.1302
    118210	118210	0.2048
    118210	118211	0.2048
    118211	118211	0.2048
    ```

5. Score age-adjusted pairwise similarity of entities defined in the input file, using the phenotype mean ages and
   standard deviations defined in the `--ages_distribution_file`, and select best-match weighted average as the
   summarization method with `--summarization-method BMWA`.
    ```bash
    phenopy score tests/data/test.score-short.txt --ages_distribution_file tests/data/phenotype_age.tsv --summarization-method BMWA --threads 4 --self
    ```
    Output:
    ```
    #query	entity_id	score
    118200	210100	0.0
    118200	177650	0.0127
    118200	241520	0.0
    ...
    ```

   The phenotype age file contains the HPO id, mean age, and standard deviation as tab-separated text, as follows:

   | HPO id     | mean | std |
   |------------|------|-----|
   | HP:0001251 | 6.0  | 3.0 |
   | HP:0001263 | 1.0  | 1.0 |
   | HP:0001290 | 1.0  | 1.0 |
   | HP:0004322 | 10.0 | 3.0 |
   | HP:0001249 | 6.0  | 3.0 |

   If no phenotype ages file is provided, `--summarization-method=BMWA` can still be selected to use default,
   open-access literature-derived phenotype ages (~1,400 age-weighted phenotypes).
    ```bash
    phenopy score tests/data/test.score-short.txt --summarization-method BMWA --threads 4
    ```
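For intuition about the two summarization methods: both start from the pairwise HRSS matrix between the two term sets. BMA averages each term's single best match, while BMWA averages those same best matches under weights (phenotype frequencies, or age-derived weights when an ages file is in play). The sketch below illustrates the idea only; it is not phenopy's internal implementation, and how phenopy assigns weights to each side of the comparison may differ.

```python
import numpy as np

def best_match_weighted_average(sim, weights_a, weights_b):
    """Best-match weighted average over a pairwise similarity matrix."""
    best_a = sim.max(axis=1)  # best match for each of record a's terms
    best_b = sim.max(axis=0)  # best match for each of record b's terms
    scores = np.concatenate([best_a, best_b])
    weights = np.concatenate([weights_a, weights_b])
    return float(np.average(scores, weights=weights))

# toy 2x3 similarity matrix between two records' term sets
sim = np.array([[0.8, 0.1, 0.3],
                [0.2, 0.6, 0.4]])

# with all-ones weights this reduces to plain best-match average (BMA)
print(best_match_weighted_average(sim, np.ones(2), np.ones(3)))
# down-weighting rarely observed phenotypes changes the summary score
print(best_match_weighted_average(sim, np.array([1.0, 0.5]), np.array([1.0, 1.0, 0.2])))
```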
#### Parameters
For a full list of command arguments use `phenopy [subcommand] --help`:
```bash
phenopy score --help
```
Output:
```
--output_file=OUTPUT_FILE
    File path where to store the results. [default: - (stdout)]
--records_file=RECORDS_FILE
    An entity-to-phenotype annotation file in the same format as "input_file". This file, if provided, is used to score entries in the "input_file" against entries here. [default: None]
--annotations_file=ANNOTATIONS_FILE
    An entity-to-phenotype annotation file in the same format as "input_file". This file, if provided, is used to add information content to the network. [default: None]
--ages_distribution_file=AGES_DISTRIBUTION_FILE
    Phenotypes age summary stats file containing phenotype HPO id, mean_age, and std. [default: None]
--self=SELF
    Score entries in the "input_file" against itself.
--summarization_method=SUMMARIZATION_METHOD
    The method used to summarize the HRSS matrix. Supported values are best match average (BMA), best match weighted average (BMWA), and maximum (maximum). [default: BMWA]
--threads=THREADS
    Number of parallel processes to use. [default: 1]
```

## Library Usage

The `phenopy` library can be used as a `Python` module, allowing more control for advanced users.

### score

**Generate the HPO network and supporting objects**:

```python
import os
from phenopy.build_hpo import generate_annotated_hpo_network
from phenopy.score import Scorer

# data directory
phenopy_data_directory = os.path.join(os.getenv('HOME'), '.phenopy/data')

# files used in building the annotated HPO network
obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

# if you have a custom ages_distribution_file, you can set it here.
ages_distribution_file = os.path.join(phenopy_data_directory, 'xa_age_stats_oct052019.tsv')

hpo_network, alt2prim, disease_records = \
    generate_annotated_hpo_network(obo_file,
                                   disease_to_phenotype_file,
                                   ages_distribution_file=ages_distribution_file
                                   )
```

**Then, instantiate the `Scorer` class and score HPO term lists.**

```python
scorer = Scorer(hpo_network)

terms_a = ['HP:0001263', 'HP:0011839']
terms_b = ['HP:0001263', 'HP:0000252']

print(scorer.score_term_sets_basic(terms_a, terms_b))
```

Output:

```
0.11213185474495047
```
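The same building blocks the command line tool uses are importable, so record-level pairwise scoring (what `phenopy score --self` does) can be reproduced directly. A sketch that continues from the objects built above (`hpo_network`, `alt2prim`) and uses a test file shipped in the repo; each result row holds the query id, the entity id, and the score:

```python
from phenopy.score import Scorer
from phenopy.util import parse_input, half_product

scorer = Scorer(hpo_network)

# parse an entity-to-phenotype input file (alternate HPO ids are mapped to primary ids)
input_records = parse_input('tests/data/test.score-long.txt', hpo_network, alt2prim)

# generate each unordered pair of record indices once, then score with a single process
scoring_pairs = half_product(len(input_records), len(input_records))
results = scorer.score_records(input_records, input_records, scoring_pairs, 1)

for query, entity_id, score in results:
    print(f'{query}\t{entity_id}\t{score}')
```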
### miscellaneous

The library can be used to prune parent phenotypes from the `phenotype.hpoa` file and store the pruned annotations as a new file:

```python
from phenopy.util import export_phenotype_hpoa_with_no_parents

# saves a new file of phenotype disease annotations with parent HPO terms removed from phenotype lists.
disease_to_phenotype_no_parents_file = os.path.join(phenopy_data_directory, 'phenotype.noparents.hpoa')
export_phenotype_hpoa_with_no_parents(disease_to_phenotype_file, disease_to_phenotype_no_parents_file, hpo_network)
```


## Initial setup
phenopy is designed to run with minimal setup from the user. To run phenopy with default parameters (recommended), skip ahead
to [Command Line Usage](#command-line-usage).

This section provides details about where phenopy stores data resources and config files. The following occurs when
you run phenopy for the first time.
1. phenopy creates a `.phenopy/` directory in your home folder and downloads external resources from HPO into the
`$HOME/.phenopy/data/` directory.
2. phenopy creates a `$HOME/.phenopy/phenopy.ini` config file where users can set variables for phenopy to use
at runtime.

## Config
While we recommend using the default settings for most users, the config file *can be* modified: `$HOME/.phenopy/phenopy.ini`.

To run phenopy with a different version of `hp.obo`, set the path of `obo_file` in `$HOME/.phenopy/phenopy.ini`.

## Contributing
We welcome contributions from the community. Please follow these steps to set up a local development environment.
```bash
pipenv install --dev
```

To run tests locally:
```bash
pipenv shell
coverage run --source=. -m unittest discover --start-directory tests/
coverage report -m
```

## References
The underlying algorithm which determines the semantic similarity for any two HPO terms is based on an implementation of HRSS, [published here](https://www.ncbi.nlm.nih.gov/pubmed/23741529).
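HRSS is driven by each term's information content. In `phenopy/ic.py` (shown later in this repository), a term's information content is the negative natural log of the fraction of corpus diseases annotated to that term or its descendants, with add-one smoothing, so rarer phenotypes carry more information. A worked instance of that formula with illustrative counts:

```python
import numpy as np

SMOOTH = 1
n_unique_diseases = 10          # diseases annotated to the term or its descendants
num_diseases_annotated = 8000   # diseases in the whole annotation corpus

ic = -np.log((n_unique_diseases + SMOOTH) / float(num_diseases_annotated + SMOOTH))
print(round(ic, 2))  # 6.59
```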
## Citing Phenopy
Please use the following BibTeX to cite this software.
```
@software{arvai_phenopy_2019,
  title = {Phenopy},
  rights = {Attribution-NonCommercial-ShareAlike 4.0 International},
  url = {https://github.com/GeneDx/phenopy},
  abstract = {Phenopy is a Python package to perform phenotype similarity scoring by semantic similarity.
  Phenopy is a lightweight but highly optimized command line tool and library to efficiently perform semantic
  similarity scoring on generic entities with phenotype annotations from the Human Phenotype Ontology (HPO).},
  version = {0.3.0},
  author = {Arvai, Kevin and Borroto, Carlos and Gainullin, Vladimir and Retterer, Kyle},
  date = {2019-11-05},
  year = {2019},
  doi = {10.5281/zenodo.3529569}
}
```

--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
from tests.fixtures.get_data_dictionary import test_data as test_data

--------------------------------------------------------------------------------
/notebooks/output/cluster_three_diseases.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/notebooks/output/cluster_three_diseases.png

--------------------------------------------------------------------------------
/phenopy/__init__.py:
--------------------------------------------------------------------------------
__project__ = 'phenopy'
__version__ = '0.6.0'

--------------------------------------------------------------------------------
/phenopy/__main__.py:
--------------------------------------------------------------------------------
import fire
import itertools
import sys
from configparser import NoOptionError, NoSectionError
from phenopy.util import open_or_stdout
from phenopy.build_hpo import generate_annotated_hpo_network
from phenopy.config import config, logger
from phenopy.score import Scorer
from phenopy.util import parse_input, half_product
from phenoseries.experiment import run_phenoseries_experiment


def score(
    input_file,
    output_file="-",
    records_file=None,
    annotations_file=None,
    custom_disease_file=None,
    ages_distribution_file=None,
    self=False,
    summarization_method="BMWA",
    scoring_method="HRSS",
    threads=1,
):
    """
    Scores similarity of provided HPO annotated entries (see format below) against a
    set of HPO annotated records. By default, scoring happens against diseases
    annotated by the HPO group. See https://hpo.jax.org/app/download/annotation.

    Phenopy also supports scoring all pairwise combinations of the provided entries
    (see "--self") or scoring against a custom records dataset
    (see "--records-file").

    :param input_file: File with HPO annotated entries, one per line (see format below).
    :param output_file: File path where to store the results. [default: - (stdout)]
    :param records_file: An entity-to-phenotype annotation file in the same format as
        "input_file". This file, if provided, is used to score entries in the
        "input_file" against entries here. [default: None]
    :param annotations_file: An entity-to-phenotype annotation file in the same format
        as "input_file". This file, if provided, is used to add information content
        to the network. [default: None]
    :param custom_disease_file: entity annotation file for ranking diseases/genes
    :param ages_distribution_file: Phenotypes age summary stats file containing
        phenotype HPO id, mean_age, and std. [default: None]
    :param self: Score entries in the "input_file" against itself.
46 | :param summarization_method: The method used to summarize the HRSS matrix. 47 | Supported Values are best match average 48 | (BMA), best match weighted average (BMWA), and maximum (maximum). [default: BMWA] 49 | :param scoring_method: Either HRSS or Resnik 50 | :param threads: Number of parallel processes to use. [default: 1] 51 | """ 52 | 53 | try: 54 | obo_file = config.get("hpo", "obo_file") 55 | except (NoSectionError, NoOptionError): 56 | logger.critical( 57 | 'No HPO OBO file found in the configuration file. See "hpo:obo_file" ' 58 | "parameter." 59 | ) 60 | sys.exit(1) 61 | if custom_disease_file is None: 62 | try: 63 | disease_to_phenotype_file = config.get("hpo", "disease_to_phenotype_file") 64 | except (NoSectionError, NoOptionError): 65 | logger.critical( 66 | "No HPO annotated dataset file found in the configuration file." 67 | ' See "hpo:disease_to_phenotype_file" parameter.' 68 | ) 69 | sys.exit(1) 70 | else: 71 | logger.info(f"using custom disease annotation file: {custom_disease_file}") 72 | disease_to_phenotype_file = custom_disease_file 73 | 74 | logger.info(f"Loading HPO OBO file: {obo_file}") 75 | hpo_network, alt2prim, disease_records = generate_annotated_hpo_network( 76 | obo_file, 77 | disease_to_phenotype_file, 78 | annotations_file=annotations_file, 79 | ages_distribution_file=ages_distribution_file, 80 | ) 81 | 82 | # parse input records 83 | input_records = parse_input(input_file, hpo_network, alt2prim) 84 | 85 | # create instance the scorer class 86 | try: 87 | scorer = Scorer( 88 | hpo_network, 89 | summarization_method=summarization_method, 90 | scoring_method=scoring_method, 91 | ) 92 | except ValueError as e: 93 | logger.critical(f"Failed to initialize scoring class: {e}") 94 | sys.exit(1) 95 | 96 | if self: 97 | score_records = input_records 98 | 99 | scoring_pairs = half_product(len(score_records), len(score_records)) 100 | else: 101 | if records_file: 102 | score_records = parse_input(records_file, hpo_network, alt2prim) 103 | else: 104 | score_records = disease_records 105 | 106 | scoring_pairs = itertools.product( 107 | range(len(input_records)), 108 | range(len(score_records)), 109 | ) 110 | 111 | results = scorer.score_records(input_records, score_records, scoring_pairs, threads) 112 | 113 | with open_or_stdout(output_file) as output_fh: 114 | output_fh.write("\t".join(["#query", "entity_id", "score"])) 115 | output_fh.write("\n") 116 | for result in results: 117 | output_fh.write("\t".join(str(column) for column in result)) 118 | output_fh.write("\n") 119 | 120 | 121 | def validate_phenoseries( 122 | phenotypic_series_filepath, 123 | outdir=None, 124 | min_hpos=4, 125 | min_entities=2, 126 | phenoseries_fraction=1.0, 127 | scoring_method="HRSS", 128 | threads=1, 129 | omim_phenotypes_file="", 130 | pairwise_mim_scores_file="", 131 | ): 132 | """ 133 | This runs the phenoseries experiment for a fraction of the OMIM phenoseries 134 | (PSid's). It Outputs a file with each row containing: PSid, MIMid, Python list 135 | of integers (ranks), and the length of the list. 136 | 137 | :param phenotypic_series_filepath: The phenotypicSeries.txt file from OMIM API. 138 | This is required to run validation. 139 | :param outdir: Directory where output files will be written. 140 | :param min_hpos: The minimum number of HPO ids annotated to a MIM id for the 141 | MIM id to be included in the experiment. 142 | :param min_entities: The minimum number of MIM ids for a phenoseries id to be 143 | included in the experiment. 
144 | :param phenoseries_fraction: The fraction of total phenoseries to evaluate. 145 | :param scoring_method: Either HRSS, Resnik, Jaccard, or word2vec 146 | :param threads: Number of parallel processes to use. [default: 1] 147 | :param omim_phenotypes_file: Path to the file containing OMIM id in 148 | the first column and a Python 149 | list of hpo ids in the second column. 150 | :param pairwise_mim_scores_file: Path to the file containing similarity 151 | scores for each of the 152 | """ 153 | run_phenoseries_experiment( 154 | outdir=outdir, 155 | phenotypic_series_filepath=phenotypic_series_filepath, 156 | min_hpos=min_hpos, 157 | min_entities=min_entities, 158 | phenoseries_fraction=phenoseries_fraction, 159 | scoring_method=scoring_method, 160 | threads=threads, 161 | omim_phenotypes_file=omim_phenotypes_file, 162 | pairwise_mim_scores_file=pairwise_mim_scores_file, 163 | ) 164 | 165 | 166 | def main(): 167 | fire.Fire( 168 | { 169 | "score": score, 170 | "validate-phenoseries": validate_phenoseries, 171 | } 172 | ) 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /phenopy/build_hpo.py: -------------------------------------------------------------------------------- 1 | from phenopy.util import generate_alternate_ids 2 | from phenopy.d2p import load as load_d2p 3 | from phenopy.network import load as load_network 4 | from phenopy.network import annotate 5 | from typing import Tuple 6 | 7 | 8 | def generate_annotated_hpo_network( 9 | obo_file: str, disease_to_phenotype_file: str, 10 | annotations_file=None, ages_distribution_file=None) -> Tuple: 11 | """ 12 | Generate an annotated HPO network, alternate ids to primary ids and disease records 13 | """ 14 | hpo_network = load_network(obo_file) 15 | 16 | alt2prim = generate_alternate_ids(hpo_network) 17 | 18 | # load phenotypes to diseases associations 19 | ( 20 | disease_records, 21 | phenotype_to_diseases, 22 | ) = load_d2p(disease_to_phenotype_file, hpo_network, alt2prim) 23 | 24 | # load hpo network 25 | hpo_network = annotate( 26 | hpo_network, 27 | phenotype_to_diseases, 28 | len(disease_records), 29 | alt2prim, 30 | annotations_file=annotations_file, 31 | ages_distribution_file=ages_distribution_file, 32 | ) 33 | 34 | return hpo_network, alt2prim, disease_records 35 | -------------------------------------------------------------------------------- /phenopy/config.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import logging 3 | import os 4 | import urllib.request 5 | import shutil 6 | from pathlib import Path 7 | from gensim.models import KeyedVectors 8 | from phenopy import __project__, __version__ 9 | 10 | 11 | def download_resource_files(): 12 | """ 13 | Check if HPO files exist, if not download them 14 | :return: None 15 | """ 16 | 17 | def download(url, file_path): 18 | """ 19 | Download and save a file 20 | :param url: where to get it from 21 | :param file_path: where to put it 22 | :return: None 23 | """ 24 | try: 25 | response = urllib.request.urlopen(url) 26 | 27 | except ValueError: 28 | logger.info(f"Incorrect url specified for HPO files: {url}") 29 | raise 30 | 31 | except urllib.error.URLError as e: 32 | if hasattr(e, "reason"): 33 | logger.info(f"Incorrect url specified for HPO files: {url}") 34 | logger.info("Reason: ", e.reason) 35 | raise 36 | elif hasattr(e, "code"): 37 | logger.info("The server could not fulfill the request") 38 | logger.info("Reason: 
", e.code) 39 | raise 40 | 41 | try: 42 | with open(file_path, "wb") as out_file: 43 | shutil.copyfileobj(response, out_file) 44 | 45 | except PermissionError: 46 | logger.info(f"No permission accessing data directory: {file_path}") 47 | raise 48 | 49 | # read the config file to get file paths and urls 50 | obo_path = config.get("hpo", "obo_file") 51 | obo_url = config.get("hpo", "obo_file_url") 52 | 53 | hpoa_path = config.get("hpo", "disease_to_phenotype_file") 54 | hpoa_url = config.get("hpo", "disease_to_phenotype_file_url") 55 | 56 | if not os.path.isfile(obo_path): 57 | logger.info(f"Downloading HPO obo file to: {obo_path}") 58 | download(obo_url, obo_path) 59 | 60 | if not os.path.isfile(hpoa_path): 61 | logger.info(f"Downloading phenotype to disease annotations to {hpoa_path}") 62 | download(hpoa_url, hpoa_path) 63 | 64 | 65 | # create logger 66 | logger = logging.getLogger(__project__) 67 | logger.setLevel(logging.DEBUG) 68 | 69 | # create console handler 70 | ch = logging.StreamHandler() 71 | ch.setLevel(logging.DEBUG) 72 | 73 | # create formatter and add it to the handler 74 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 75 | ch.setFormatter(formatter) 76 | 77 | # add the handler to the logger 78 | logger.addHandler(ch) 79 | 80 | # create config 81 | config = configparser.ConfigParser() 82 | 83 | # create config directory if it doesn't exist 84 | config_directory = os.path.join(os.environ.get("HOME"), f".{__project__}") 85 | project_directory = Path(__file__).parent 86 | project_data_dir = os.path.join(project_directory, "data") 87 | try: 88 | os.makedirs(config_directory) 89 | except FileExistsError: 90 | pass 91 | 92 | # create data directory if it doesn't exist 93 | data_directory = os.path.join(config_directory, "data") 94 | try: 95 | os.makedirs(data_directory) 96 | except FileExistsError: 97 | pass 98 | 99 | # if phenopy.ini doesnt exist make one 100 | logger.info(f"checking if config file exists: {config_directory}") 101 | if not os.path.isfile(os.path.join(config_directory, "phenopy.ini")): 102 | config = configparser.ConfigParser() 103 | w2v_path = os.path.join(os.path.dirname(__file__), "data/phenopy.wv.model.txt.gz") 104 | 105 | w2v_vw_path = os.path.join(data_directory, "phenopy.w2v.model") 106 | 107 | wv = KeyedVectors.load_word2vec_format(w2v_path) 108 | # save model in faster to load format in users directory 109 | wv.save(w2v_vw_path) 110 | 111 | # copy the lmd model to the data directory 112 | lmd_path = os.path.join(os.path.dirname(__file__), "data/lgb.model.pkl") 113 | lmd_data_path = os.path.join(data_directory, "lgb.model.pkl") 114 | shutil.copyfile(lmd_path, lmd_data_path) 115 | 116 | config["hpo"] = { 117 | "obo_file": os.path.join( 118 | data_directory, 119 | "hp.obo", 120 | ), 121 | "obo_file_url": "http://purl.obolibrary.org/obo/hp.obo", 122 | "hpo_network_file": os.path.join( 123 | data_directory, 124 | "hpo_network.pickle", 125 | ), 126 | "disease_to_phenotype_file_url": "http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa", 127 | "disease_to_phenotype_file": os.path.join( 128 | data_directory, 129 | "phenotype.hpoa", 130 | ), 131 | } 132 | 133 | config["models"] = { 134 | "phenopy.wv.model": w2v_vw_path, 135 | } 136 | config["age"] = { 137 | "open_access_phenotype_age": os.path.join( 138 | project_data_dir, 139 | "oa_phenotype_age.tsv", 140 | ) 141 | } 142 | config["omim"] = { 143 | "omim_api_key": "", 144 | } 145 | config["phenotype_groups"] = { 146 | "phenotype_groups_file": os.path.join(project_data_dir, 
"phenotype_groups.txt") 147 | } 148 | 149 | with open(os.path.join(config_directory, "phenopy.ini"), "w") as configfile: 150 | logger.info("writing config file to: %s " % config_directory) 151 | config.write(configfile) 152 | 153 | # log project and version 154 | logger.info(f"{__project__} {__version__}") 155 | 156 | # read config 157 | config_file = os.environ.get( 158 | f"{__project__.upper()}_CONFIG", 159 | os.path.join( 160 | config_directory, 161 | f"{__project__}.ini", 162 | ), 163 | ) 164 | config.read(config_file) 165 | logger.info(f"Using configuration file: {config_file}") 166 | 167 | # download resource files if necessary 168 | download_resource_files() 169 | -------------------------------------------------------------------------------- /phenopy/d2p.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | import sys 4 | import networkx as nx 5 | from typing import ( 6 | List, 7 | Tuple, 8 | ) 9 | 10 | hpo_id_to_float = { 11 | "HP:0040280": 1.0, 12 | "HP:0040281": np.mean([0.80, 0.99]), 13 | "HP:0040282": np.mean([0.30, 0.79]), 14 | "HP:0040283": np.mean([0.05, 0.29]), 15 | "HP:0040284": np.mean([0.01, 0.04]), 16 | "HP:0040285": 0.0, 17 | } 18 | 19 | 20 | def read_hpo_annotation_file( 21 | phenotype_annotations_file: str, hpo_network: nx.MultiDiGraph, logger=None 22 | ) -> List: 23 | """ 24 | Reads the annotation files from the HPO website 25 | """ 26 | try: 27 | with open(phenotype_annotations_file, "r") as tsv_fh: 28 | [next(tsv_fh) for _ in range(4)] 29 | reader = csv.DictReader(tsv_fh, delimiter="\t") 30 | # this removes the leading hash 31 | reader.fieldnames[0] = reader.fieldnames[0].lstrip("#") 32 | 33 | records = [] 34 | 35 | for row in reader: 36 | 37 | # phenotype term id 38 | term_id = row.get("HPO_ID") if "HPO_ID" in row else row.get("hpo_id") 39 | if term_id not in hpo_network.nodes(): 40 | continue 41 | 42 | # parse disease id, currently only supports omim entries 43 | database_id = ( 44 | row.get("DatabaseID") 45 | if "DatabaseID" in row 46 | else row.get("database_id") 47 | ) 48 | db, disease_accession = database_id.split(":") 49 | if db not in ["OMIM"]: 50 | continue 51 | 52 | # For now, skip negative phenotype annotations 53 | qualifier = ( 54 | row.get("Qualifier") if "Qualifier" in row else row.get("qualifier") 55 | ) 56 | if qualifier == "NOT": 57 | continue 58 | 59 | frequency = ( 60 | row.get("Frequency") if "Frequency" in row else row.get("frequency") 61 | ) 62 | records.append( 63 | (term_id, disease_accession, frequency_converter(frequency)) 64 | ) 65 | 66 | return records 67 | 68 | except (FileNotFoundError, PermissionError): 69 | hpoa_file_error_msg = ( 70 | f"{phenotype_annotations_file} " f"not found or incorrect permissions" 71 | ) 72 | if logger is not None: 73 | logger.critical(hpoa_file_error_msg) 74 | else: 75 | sys.stderr.write(hpoa_file_error_msg) 76 | sys.exit(1) 77 | 78 | 79 | def read_custom_annotation_file( 80 | custom_annotation_file_path: str, hpo_network: nx.MultiDiGraph, logger: None = None 81 | ) -> List: 82 | try: 83 | with open(custom_annotation_file_path, "r") as tsv_fh: 84 | reader = csv.reader(tsv_fh, delimiter="\t") 85 | 86 | records = [] 87 | for row in reader: 88 | # phenotype term id 89 | # convert alternate phenotype id to primary 90 | term_id, disease_accession, freq = row 91 | if term_id not in hpo_network.nodes(): 92 | continue 93 | 94 | records.append((term_id, disease_accession, float(freq))) 95 | 96 | return records 97 | 98 | except 
(FileNotFoundError, PermissionError): 99 | hpoa_file_error_msg = ( 100 | f"{custom_annotation_file_path} " f"not found or incorrect permissions" 101 | ) 102 | if logger is not None: 103 | logger.critical(hpoa_file_error_msg) 104 | else: 105 | sys.stderr.write(hpoa_file_error_msg) 106 | sys.exit(1) 107 | 108 | 109 | def load( 110 | phenotype_annotations_file: str, 111 | hpo_network: nx.MultiDiGraph, 112 | alt2prim, 113 | default_frequency: float = 0.5, 114 | ) -> Tuple: 115 | """ 116 | Parse the hpoa file 117 | """ 118 | if phenotype_annotations_file.endswith("hpoa"): 119 | records = read_hpo_annotation_file(phenotype_annotations_file, hpo_network) 120 | else: 121 | records = read_custom_annotation_file(phenotype_annotations_file, hpo_network) 122 | 123 | disease_to_phenotypes = dict() 124 | phenotype_to_diseases = dict() 125 | 126 | for r in records: 127 | term_id, disease_accession, freq = r 128 | if term_id not in phenotype_to_diseases: 129 | phenotype_to_diseases[term_id] = { 130 | disease_accession: {"frequency": default_frequency} 131 | } 132 | else: 133 | if disease_accession not in phenotype_to_diseases[term_id]: 134 | phenotype_to_diseases[term_id].update( 135 | {disease_accession: {"frequency": default_frequency}} 136 | ) 137 | 138 | phenotype_to_diseases[term_id][disease_accession]["frequency"] = freq 139 | 140 | # add the phenotype to the disease in the disease_records dictionary 141 | if disease_accession not in disease_to_phenotypes: 142 | disease_to_phenotypes[disease_accession] = { 143 | "record_id": disease_accession, 144 | "terms": [], 145 | "weights": { 146 | "disease_frequency": [], 147 | }, 148 | } 149 | disease_to_phenotypes[disease_accession]["terms"].append(term_id) 150 | 151 | # going from dict to a list of disease records and setting weights 152 | disease_records = list() 153 | for disease_accession, disease in disease_to_phenotypes.items(): 154 | disease["terms"] = sorted(set(disease["terms"])) 155 | for term_id in disease["terms"]: 156 | # convert alternate phenotype id to primary 157 | term_id = term_id if term_id not in alt2prim else alt2prim[term_id] 158 | if term_id not in hpo_network.nodes(): 159 | continue 160 | 161 | frequency_weight = phenotype_to_diseases[term_id][disease_accession][ 162 | "frequency" 163 | ] 164 | # 165 | disease["weights"]["disease_frequency"].append(frequency_weight) 166 | 167 | disease_records.append(disease) 168 | 169 | # TODO: do we need phenotype_to_diseases? 
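    # Illustrative shapes of the two return values (accession numbers made up):
    #   disease_records -> [{"record_id": "123456", "terms": ["HP:0001249", ...],
    #                        "weights": {"disease_frequency": [0.5, ...]}}, ...]
    #   phenotype_to_diseases -> {"HP:0001249": {"123456": {"frequency": 0.5}}, ...}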
170 | return disease_records, phenotype_to_diseases 171 | 172 | 173 | def frequency_converter(hpoa_frequency: str, default_frequency: float = 0.5) -> float: 174 | """ 175 | convert the frequency column from the hpoa file to a float 176 | """ 177 | if "HP:" in hpoa_frequency: 178 | # TODO discuss the best default 179 | return hpo_id_to_float.get(hpoa_frequency, default_frequency) 180 | 181 | elif "/" in hpoa_frequency: 182 | n, d = hpoa_frequency.split("/") 183 | return float(n) / float(d) 184 | 185 | elif "%" in hpoa_frequency: 186 | return float(hpoa_frequency.strip("%")) / 100 187 | 188 | # TODO discuss the best default 189 | return default_frequency 190 | -------------------------------------------------------------------------------- /phenopy/data/lgb.model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/phenopy/data/lgb.model.pkl -------------------------------------------------------------------------------- /phenopy/data/phenopy.wv.model.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/phenopy/data/phenopy.wv.model.txt.gz -------------------------------------------------------------------------------- /phenopy/ic.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | from typing import Dict 4 | 5 | SMOOTH = 1 6 | 7 | 8 | def calculate_information_content( 9 | hpo_id: str, 10 | hpo_network: nx.MultiDiGraph, 11 | phenotype_to_diseases: Dict, 12 | num_diseases_annotated: int, 13 | custom_annotations: Dict = None) -> float: 14 | """ 15 | Calculates information content for an HPO term. 16 | """ 17 | # compile list of HPO terms to include in the calculation, term plus children 18 | hpo_id_plus_children = [hpo_id] + list(nx.ancestors(hpo_network, hpo_id)) 19 | # num_diseases_annotated is the total number of diseases in the annotation corpus. 20 | 21 | def get_ic(hpo_ids, annotations): 22 | # count the number of unique diseases annotated to the hpo term and its children 23 | n_unique_diseases = len( 24 | {g for h in hpo_ids if h in annotations for g in annotations[h]} 25 | ) 26 | # negative log of the number of unique diseases annotated to the term 27 | # (and its children) divided by the 28 | # total number of annotated diseases 29 | information_content = -np.log((n_unique_diseases + SMOOTH) / 30 | float(num_diseases_annotated + SMOOTH)) 31 | 32 | return information_content 33 | 34 | annotations_list = [phenotype_to_diseases] 35 | if custom_annotations is not None: 36 | annotations_list.append(custom_annotations) 37 | output_mean = np.mean([get_ic(hpo_id_plus_children, annotations=annotations) 38 | for annotations in annotations_list]) 39 | return output_mean 40 | -------------------------------------------------------------------------------- /phenopy/network.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import obonet 3 | import re 4 | import sys 5 | 6 | from phenopy.config import logger 7 | from phenopy.ic import calculate_information_content 8 | from phenopy.weights import make_age_distributions 9 | from phenopy.util import parse_input 10 | from typing import ( 11 | Dict, 12 | List, 13 | ) 14 | 15 | 16 | def load(obo_file: str) -> nx.MultiDiGraph: 17 | """ 18 | Load OBO file into a networkx graph.
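
    A minimal usage sketch (the path is illustrative):

        hpo_network = load("hp.obo")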
19 | """ 20 | try: 21 | hpo_network = obonet.read_obo(obo_file) 22 | except (FileNotFoundError, PermissionError) as e: 23 | if logger is not None: 24 | logger.critical(e) 25 | else: 26 | sys.stderr.write(str(e)) 27 | exit(1) 28 | 29 | # roots for non-phenotype nodes 30 | non_phenotypes = { 31 | "mortality_aging": "HP:0040006", 32 | "mode_of_inheritance": "HP:0000005", 33 | "clinical_modifier": "HP:0012823", 34 | "frequency": "HP:0040279", 35 | "clinical_course": "HP:0031797", 36 | } 37 | 38 | # remove non-phenotype branches 39 | for _, hpo_id in non_phenotypes.items(): 40 | if hpo_id in hpo_network.nodes: 41 | children = nx.ancestors(hpo_network, hpo_id) 42 | hpo_network.remove_nodes_from([hpo_id] + list(children)) 43 | 44 | return hpo_network 45 | 46 | 47 | def annotate( 48 | hpo_network: nx.MultiDiGraph, 49 | phenotype_to_diseases: Dict, 50 | num_diseases_annotated: int, 51 | alt2prim: Dict, 52 | annotations_file: List = None, 53 | ages_distribution_file: str = None, 54 | phenotype_disease_frequencies: Dict = None, 55 | ) -> nx.MultiDiGraph: 56 | """ 57 | Cleans the HPO network. 58 | 59 | Removes non-phenotype branches of the network, and merges all synonyms into one tag. 60 | 61 | :param hpo_network: `networkx.MultiDiGraph` to clean. 62 | :param phenotype_to_diseases: Dictionary mapping HPO terms to diseases. 63 | :param num_diseases_annotated: Number of diseases with HPO annotations. 64 | :param alt2prim: The dict of alternate terms to canonical terms. 65 | :param annotations_file: A list of custom annotation files, in the same format 66 | as tests/data/test.score-long.txt 67 | :param phenotype_disease_frequencies: dictionary of phenotype to disease frequencies 68 | :param ages_distribution_file: Path to phenotypes ages distribution file. 69 | :return: `networkx.MultiDiGraph` 70 | """ 71 | 72 | # Before calculating information content, check for custom_annotations_file and load 73 | custom_annos = None 74 | if annotations_file is not None: 75 | custom_annos = {} 76 | for record in parse_input(annotations_file, hpo_network, alt2prim): 77 | for term_id in record["terms"]: 78 | if term_id not in custom_annos: 79 | custom_annos[term_id] = [] 80 | custom_annos[term_id].append(record["record_id"]) 81 | 82 | # make ages distributions 83 | ages = None 84 | if ages_distribution_file is not None: 85 | try: 86 | ages = make_age_distributions(ages_distribution_file) 87 | logger.info( 88 | f"Adding custom phenotype age distributions to HPO nodes " 89 | f"from file: {ages_distribution_file}" 90 | ) 91 | except (FileNotFoundError, PermissionError) as e: 92 | logger.critical(e) 93 | logger.critical( 94 | f"Specified phenotype ages file could not be loaded or " 95 | f"does not exist: {e}" 96 | ) 97 | exit(1) 98 | 99 | for node_id, data in hpo_network.nodes(data=True): 100 | # annotate with information content value 101 | hpo_network.nodes[node_id]["ic"] = calculate_information_content( 102 | node_id, 103 | hpo_network, 104 | phenotype_to_diseases, 105 | num_diseases_annotated, 106 | custom_annos, 107 | ) 108 | # annotate with phenotype age distribution 109 | hpo_network.nodes[node_id]["disease_weights"] = {} 110 | 111 | if ages is not None and node_id in ages.index: 112 | hpo_network.nodes[node_id]["age_dist"] = ages.loc[node_id]["age_dist"] 113 | 114 | # add the disease_frequency weights as attributes to the node 115 | if phenotype_disease_frequencies is not None: 116 | if node_id in phenotype_disease_frequencies: 117 | for dis_id, freq in phenotype_disease_frequencies[node_id].items(): 118 | 
hpo_network.nodes[node_id]["weights"]["disease_frequency"][ 119 | dis_id 120 | ] = freq 121 | 122 | # annotate with depth value 123 | # hard-coding origin node for now 124 | origin = "HP:0000001" 125 | hpo_network.nodes[node_id]["depth"] = nx.shortest_path_length( 126 | hpo_network, node_id, origin 127 | ) 128 | 129 | # clean synonyms 130 | synonyms = [] 131 | try: 132 | for synonym in data["synonym"]: 133 | synonyms.append(synonym) 134 | hpo_network.nodes[node_id]["synonyms"] = re.findall( 135 | r'"(.*?)"', ",".join(synonyms) 136 | ) 137 | except KeyError: 138 | # pass if no synonym tags in the node 139 | pass 140 | 141 | return hpo_network 142 | -------------------------------------------------------------------------------- /phenopy/score.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import gensim 3 | import networkx as nx 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from functools import lru_cache 8 | from multiprocessing import Pool 9 | from phenopy.weights import calculate_age_weights 10 | from phenopy.config import config 11 | from typing import ( 12 | Tuple, 13 | Dict, 14 | List, 15 | Set, 16 | ) 17 | 18 | 19 | class Scorer: 20 | def __init__( 21 | self, 22 | hpo_network: nx.MultiDiGraph, 23 | summarization_method: str = "BMWA", 24 | min_score_mask: float = 0.05, 25 | scoring_method: str = "HRSS", 26 | ) -> None: 27 | 28 | # Establish hpo_network 29 | self.hpo_network = hpo_network 30 | 31 | # Establish summarization method 32 | if summarization_method not in ["BMA", "BMWA", "maximum"]: 33 | raise ValueError( 34 | "Unsupported summarization method, please choose from " 35 | "BMA, BMWA, or maximum." 36 | ) 37 | self.summarization_method = summarization_method 38 | 39 | # Assign min_score_mask 40 | self.min_score_mask = min_score_mask 41 | 42 | # Assign scoring method 43 | if scoring_method not in ["HRSS", "Resnik", "Jaccard", "word2vec"]: 44 | raise ValueError( 45 | "Unsupported semantic similarity scoring method, please " 46 | "choose from HRSS, Resnik, Jaccard, or word2vec." 47 | ) 48 | self.scoring_method = scoring_method 49 | 50 | # Load the word vectors if using word2vec 51 | if scoring_method == "word2vec": 52 | try: 53 | self.word_vectors = gensim.models.KeyedVectors.load( 54 | config.get("models", "phenopy.wv.model") 55 | ) 56 | except FileNotFoundError: 57 | raise ValueError( 58 | "Please make sure that a word2vec model is in " 59 | "your project data directory." 
60 | ) 61 | 62 | def find_lca(self, term_a: str, term_b: str) -> str: 63 | """ 64 | Determine the lowest common ancestor for two HPO terms 65 | """ 66 | 67 | # if either term is HP:0000001 return it 68 | if any(term == "HP:0000001" for term in [term_a, term_b]): 69 | return "HP:0000001" 70 | 71 | # if one of the terms is a child of the other return the parent 72 | if self.hpo_network.has_edge(term_a, term_b): 73 | return term_b 74 | if self.hpo_network.has_edge(term_b, term_a): 75 | return term_a 76 | 77 | # find common breadth-first-search predecessors 78 | parents = [] 79 | for i, term in enumerate([term_a, term_b]): 80 | parents.append({p[0] for p in nx.bfs_predecessors(self.hpo_network, term)}) 81 | parents[i].add(term) 82 | 83 | # Find the intersection between the two sets of parents 84 | common_parents = parents[0].intersection(parents[1]) 85 | 86 | # lca node - find the ancestor with the highest IC 87 | # break ties by choosing the node with the greatest depth 88 | return max( 89 | common_parents, 90 | key=lambda n: ( 91 | self.hpo_network.nodes[n]["ic"], 92 | self.hpo_network.nodes[n]["depth"], 93 | ), 94 | ) 95 | 96 | def calculate_beta(self, term_a: str, term_b: str) -> float: 97 | """ 98 | calculates the beta term in HRSS equation 99 | """ 100 | # find information content for the most informative leaf for each term 101 | mil_ic = [] 102 | 103 | # For each term, if it has children, find the most informative leaf 104 | for term in [term_a, term_b]: 105 | if self.hpo_network.in_edges(term): 106 | 107 | # children terms generator 108 | children = nx.ancestors(self.hpo_network, term) 109 | 110 | # Establish the leaf nodes 111 | leaves = { 112 | p 113 | for p in children 114 | if self.hpo_network.out_degree(p) >= 1 115 | and self.hpo_network.in_degree(p) == 0 116 | } 117 | 118 | # append the max IC leaf (choose the one with the max depth) 119 | mil = max( 120 | leaves, 121 | key=lambda n: ( 122 | self.hpo_network.nodes[n]["ic"], 123 | self.hpo_network.nodes[n]["depth"], 124 | ), 125 | ) 126 | mil_ic.append(self.hpo_network.nodes[mil]["ic"]) 127 | 128 | # the node is a leaf 129 | else: 130 | mil_ic.append(self.hpo_network.nodes[term]["ic"]) 131 | 132 | # calculate beta_ic 133 | beta_ic = ( 134 | (mil_ic[0] - self.hpo_network.nodes[term_a]["ic"]) 135 | + (mil_ic[1] - self.hpo_network.nodes[term_b]["ic"]) 136 | ) / 2.0 137 | 138 | return beta_ic 139 | 140 | def calculate_gamma(self, term_a: str, term_b: str, term_lca: str) -> int: 141 | """ 142 | Calculate gamma term for the HRSS algorithm. 143 | """ 144 | # calculate gamma 145 | # "such that the value equals zero if the two terms are the same" 146 | if term_a == term_b: 147 | return 0 148 | 149 | # If one of the terms is a child of the other return 1 150 | term_a_child = self.hpo_network.has_edge(term_a, term_b) 151 | term_b_child = self.hpo_network.has_edge(term_b, term_a) 152 | if term_a_child or term_b_child: 153 | return 1 154 | 155 | # Otherwise calculate the shortest-path length to the LCA 156 | a_to_lca = nx.shortest_path_length(self.hpo_network, term_a, term_lca) 157 | b_to_lca = nx.shortest_path_length(self.hpo_network, term_b, term_lca) 158 | 159 | return a_to_lca + b_to_lca 160 | 161 | @lru_cache(maxsize=72000000) 162 | def score_hpo_pair_hrss(self, term_a: str, term_b: str) -> float: 163 | """ 164 | Scores the comparison of a pair of terms, using Hybrid Relative Specificity 165 | Similarity (HRSS) algorithm. 
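
    In the notation of the code below, the returned score is

        HRSS = I * D,  where I = alpha_ic / (alpha_ic + beta_ic)
                       and   D = 1.0 / (1.0 + gamma)

    with alpha_ic the IC of the lowest common ancestor, beta_ic the mean
    leaf-to-term IC distance, and gamma the path length between the two
    terms through the LCA.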
166 | """ 167 | 168 | # calculate beta_ic 169 | beta_ic = self.calculate_beta(term_a, term_b) 170 | 171 | # find lowest common ancestors for the two terms 172 | lca_node = self.find_lca(term_a, term_b) 173 | 174 | # calculate alpha_ic 175 | alpha_ic = self.hpo_network.nodes[lca_node]["ic"] 176 | if self.scoring_method == "Resnik": 177 | return alpha_ic 178 | 179 | # Return 0 if alpha_ic and beta_ic are both 0 180 | if (alpha_ic == 0.0) and (beta_ic == 0.0): 181 | return 0.0 182 | 183 | # calculate gamma 184 | gamma = self.calculate_gamma(term_a, term_b, lca_node) 185 | 186 | # Assign the I and D variables in the HRSS equation 187 | i_variable = alpha_ic / (alpha_ic + beta_ic) 188 | d_variable = 1.0 / (1.0 + gamma) 189 | 190 | return i_variable * d_variable 191 | 192 | def score(self, record_a: Dict, record_b: Dict) -> Tuple[str, str, float]: 193 | """ 194 | Scores the comparison of terms listed in record A to terms listed in record B. 195 | """ 196 | if self.summarization_method not in ["BMA", "BMWA", "maximum"]: 197 | raise ValueError( 198 | "Unsupported summarization method, please choose from " 199 | "BMA, BMWA, or maximum." 200 | ) 201 | 202 | # if either set is empty return 0.0 203 | terms_a = record_a["terms"] 204 | terms_b = record_b["terms"] 205 | if not terms_a or not terms_b: 206 | return record_a["record_id"], record_b["record_id"], 0.0 207 | 208 | # If specified, calculate the Jaccard similarity 209 | if self.scoring_method == "Jaccard": 210 | intersection = len(list(set(terms_a).intersection(terms_b))) 211 | union = (len(terms_a) + len(terms_b)) - intersection 212 | comparison_score = float(intersection) / union 213 | return record_a["record_id"], record_b["record_id"], comparison_score 214 | 215 | # If specified, calculate the word2vec similarity 216 | elif self.scoring_method == "word2vec": 217 | 218 | # Ensure that all HPO terms are in the vocab 219 | in_vocab_terms_a = [ 220 | x for x in terms_a if x in self.word_vectors.key_to_index 221 | ] 222 | in_vocab_terms_b = [ 223 | x for x in terms_b if x in self.word_vectors.key_to_index 224 | ] 225 | 226 | # If both records have terms in the vocab (both are non-empty lists) 227 | if in_vocab_terms_a and in_vocab_terms_b: 228 | return self.word_vectors.n_similarity( 229 | in_vocab_terms_a, in_vocab_terms_b 230 | ) 231 | 232 | # One record or the other has no terms in the word2vec vocab 233 | else: 234 | return record_a["record_id"], record_b["record_id"], 0.0 235 | 236 | # calculate weights for record_a and record_b 237 | if record_a["weights"] is not None: 238 | weights_a = record_a["weights"].copy() 239 | else: 240 | weights_a = [] 241 | if record_b["weights"] is not None: 242 | weights_b = record_b["weights"].copy() 243 | else: 244 | weights_b = [] 245 | 246 | # set weights 247 | # if we have age of record_a use it to set age weights for record_b 248 | if "age" in record_a: 249 | weights_b["age"] = calculate_age_weights( 250 | record_b["terms"], record_a["age"], self.hpo_network 251 | ) 252 | 253 | # if we have age of record_b use it to set age weights for record_a 254 | if "age" in record_b: 255 | weights_a["age"] = calculate_age_weights( 256 | record_a["terms"], record_b["age"], self.hpo_network 257 | ) 258 | 259 | # Creates a dataframe that houses the HRSS for each term pair 260 | df = self.get_term_pair_dataframe(terms_a, terms_b) 261 | 262 | # Return maximum if specified 263 | if self.summarization_method == "maximum": 264 | return record_a["record_id"], record_b["record_id"], self.maximum(df) 265 | 266 | # Retrun BMWA if 
specified 267 | elif self.summarization_method == "BMWA" and any([weights_a, weights_b]): 268 | score_output = self.best_match_weighted_average( 269 | df, weights_a=weights_a, weights_b=weights_b 270 | ) 271 | 272 | return record_a["record_id"], record_b["record_id"], score_output 273 | 274 | # Otherwise return the best-match-average 275 | else: 276 | score_output = self.best_match_average(df) 277 | return record_a["record_id"], record_b["record_id"], score_output 278 | 279 | def score_term_sets_basic(self, terms_a: List, terms_b: List) -> float: 280 | """ 281 | Calculate the semantic similarity of two lists of terms. 282 | This is intended to be used as a library function. It is not used by the CLI. 283 | """ 284 | # Instantiate the two lists of HPO identifiers 285 | terms_a = set(terms_a) 286 | terms_b = set(terms_b) 287 | 288 | # Calculate the Jaccard similarity if specified 289 | if self.scoring_method == "Jaccard": 290 | intersection = len(list(set(terms_a).intersection(terms_b))) 291 | union = (len(terms_a) + len(terms_b)) - intersection 292 | return float(intersection) / union 293 | 294 | # Calculate the word vector similarity if word2vec is specified 295 | elif self.scoring_method == "word2vec": 296 | 297 | # Instantiate a list to house all HPO terms that are within the vocab 298 | in_vocab_terms_a = [ 299 | x for x in terms_a if x in self.word_vectors.key_to_index 300 | ] 301 | in_vocab_terms_b = [ 302 | x for x in terms_b if x in self.word_vectors.key_to_index 303 | ] 304 | 305 | # If both lists exist (both are non-empty lists) return their similarity 306 | if in_vocab_terms_a and in_vocab_terms_b: 307 | return self.word_vectors.n_similarity( 308 | in_vocab_terms_a, in_vocab_terms_b 309 | ) 310 | 311 | # Otherwise return 0.0 312 | else: 313 | return 0.0 314 | 315 | # Creates a dataframe that houses the HRSS for each term pair 316 | df = self.get_term_pair_dataframe(terms_a, terms_b) 317 | 318 | # If set to maximum, return the maximum, otherwise best-match-average 319 | if self.summarization_method == "maximum": 320 | return self.maximum(df) 321 | else: 322 | return self.best_match_average(df) 323 | 324 | def score_records( 325 | self, a_records: Dict, b_records: Dict, record_pairs: List, threads: int = 1 326 | ) -> List: 327 | """ 328 | Scores each record pair, distributing the comparisons across the specified number of threads 329 | """ 330 | with Pool(processes=threads) as p: 331 | results = p.starmap( 332 | self.score, 333 | [ 334 | ( 335 | a_records[record_a], # a records 336 | b_records[record_b], # b records 337 | ) 338 | for (record_a, record_b) in record_pairs 339 | ], 340 | ) 341 | 342 | return results 343 | 344 | @staticmethod 345 | def best_match_average(df: pd.DataFrame) -> float: 346 | """ 347 | Returns the Best-Match average of a termlist to termlist similarity matrix. 348 | """ 349 | # Determine the max values of the rows and columns 350 | max_column_values = df.max(axis=1).values 351 | max_row_values = df.max(axis=0).values 352 | return np.average(np.append(max_column_values, max_row_values)) 353 | 354 | @staticmethod 355 | def maximum(dataframe: pd.DataFrame) -> float: 356 | """Returns the maximum similarity value between two term lists""" 357 | return dataframe.values.max() 358 | 359 | def best_match_weighted_average( 360 | self, df: pd.DataFrame, weights_a: Dict, weights_b: Dict 361 | ) -> float: 362 | """ 363 | Returns Best-Match Weighted Average of a termlist to termlist similarity matrix.
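
    Sketch of the computation implemented below: the row-wise and column-wise
    maxima of the similarity matrix are concatenated; each maximum is paired
    with the minimum of its available weights (weight columns missing for one
    side are padded with 1s); maxima above min_score_mask are exempted from
    down-weighting; the result is the weighted average of the maxima.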
364 | """ 365 | max_a = df.max(axis=1).values 366 | max_b = df.max(axis=0).values 367 | scores = np.append(max_a, max_b) 368 | 369 | weights_matrix = {} 370 | for w in weights_a: 371 | # init weight list if necessary 372 | if w not in weights_matrix: 373 | weights_matrix[w] = [] 374 | 375 | # extend weight with the values of a 376 | weights_matrix[w].extend(weights_a[w]) 377 | 378 | # for columns not in b, fill in with 1s for each b row 379 | if w not in weights_b: 380 | weights_matrix[w].extend([1 for _ in range(max_b.shape[0])]) 381 | 382 | for w in weights_b: 383 | # for columns not in a fill in with 1s for each a row 384 | if w not in weights_matrix: 385 | weights_matrix[w] = [1 for _ in range(max_a.shape[0])] 386 | 387 | # extend weight with the values of b 388 | weights_matrix[w].extend(weights_b[w]) 389 | 390 | weights_df = pd.DataFrame.from_dict(weights_matrix) 391 | weights = weights_df.min(axis=1) 392 | 393 | # mask good matches from weighting 394 | # mask threshold based on >75% of pairwise scores of all hpo terms 395 | # TODO: expose min_score cutoff value to be set in config 396 | if self.min_score_mask is not None: 397 | masked_weights = np.where(scores > self.min_score_mask, 1.0, weights) 398 | weights = masked_weights 399 | 400 | # if weights add up to zero, calculate unweighted average 401 | if np.sum(weights) == 0.0: 402 | weights = np.ones(len(weights)) 403 | 404 | return np.average(scores, weights=weights) 405 | 406 | def get_term_pair_dataframe(self, terms_a: Set, terms_b: Set) -> pd.DataFrame: 407 | """ 408 | Creates a dataframes of pairwise HRSS scores between them 409 | """ 410 | # Create the list of term pairs 411 | # e.g., ['a', 'b']['c', 'd'] -> [('a', 'c'), ('a', 'd'), ('b', 'c), ('b','d')] 412 | term_pairs = itertools.product(terms_a, terms_b) 413 | 414 | # Apply the HRSS score to each pair within the dataframe 415 | dataframe = ( 416 | pd.DataFrame( 417 | [ 418 | (pair[0], pair[1], self.score_hpo_pair_hrss(pair[0], pair[1])) 419 | for pair in term_pairs 420 | ], 421 | columns=["a", "b", "score"], 422 | ) 423 | .set_index(["a", "b"]) 424 | .unstack() 425 | ) 426 | 427 | return dataframe 428 | -------------------------------------------------------------------------------- /phenopy/util.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import networkx as nx 4 | import numpy as np 5 | import pandas as pd 6 | import logging 7 | 8 | from collections import Counter 9 | from contextlib import contextmanager 10 | 11 | from phenopy.config import config, logger 12 | from typing import ( 13 | Tuple, 14 | List, 15 | Dict, 16 | Union, 17 | Generator, 18 | ) 19 | 20 | 21 | def half_product(num_rows: int, num_columns: int) -> Tuple[int, int]: 22 | """yield combinations and the diagonal""" 23 | for m in range(0, num_rows): 24 | for n in range(m, num_columns): 25 | yield m, n 26 | 27 | 28 | def export_phenotype_hpoa_with_no_parents( 29 | phenotype_hpoa_file: str, 30 | phenotype_hpoa_no_parents_file: str, 31 | hpo_network: nx.MultiDiGraph, 32 | logger: logging.Logger = None, 33 | ) -> None: 34 | """ 35 | Load HPO terms associated to genes as annotated in 36 | https://hpo.jax.org/app/download/annotation. 37 | Filter the parent terms for each gene. 38 | Dump pheno2genes_no_parents_file 39 | 40 | :param phenotype_hpoa_file: Phenotypes to diseases file. 41 | :param phenotype_hpoa_no_parents_file: Phenotypes to diseases file 42 | with parents removed. 43 | :param hpo_network: The HPO networkx object. 
44 | :param logger: Python `logging` logger instance. 45 | :return: None 46 | """ 47 | try: 48 | with open(phenotype_hpoa_file, "r") as tsv_fh: 49 | # skip the comment lines 50 | [next(tsv_fh) for _ in range(4)] 51 | df = pd.read_csv( 52 | tsv_fh, 53 | sep="\t", 54 | ) 55 | except (FileNotFoundError, PermissionError) as e: 56 | if logger is not None: 57 | logger.critical(e) 58 | else: 59 | sys.stderr.write(str(e)) 60 | exit(1) 61 | 62 | no_parents_df = df.copy() 63 | 64 | # Establish the proper column headers (different for various versions) 65 | database_id = "#DatabaseID" if "#DatabaseID" in df.columns else "database_id" 66 | hpo_id = "HPO_ID" if "HPO_ID" in df.columns else "hpo_id" 67 | 68 | for gene, annotations in df.groupby(database_id): 69 | termlist = [ 70 | node for node in annotations[hpo_id].tolist() if node in hpo_network.nodes() 71 | ] 72 | termlist = remove_parents(termlist, hpo_network) 73 | parent_idx = annotations.loc[~annotations[hpo_id].isin(termlist)].index 74 | no_parents_df.drop(parent_idx, inplace=True) 75 | 76 | try: 77 | no_parents_df.to_csv(phenotype_hpoa_no_parents_file, sep="\t", index=False) 78 | except PermissionError as e: 79 | if logger is not None: 80 | logger.critical(e) 81 | else: 82 | sys.stderr.write(str(e)) 83 | exit(1) 84 | 85 | 86 | def parse(string: str, what: str = "HPO") -> Union[None, int, str, list]: 87 | """ 88 | Parse patient parameters in the records file 89 | :param string: string to parse 90 | :param what: (HP,age,sex) terms to parse 91 | :return: parsed object, int for age, string for gender, list for terms 92 | """ 93 | string = string.strip() 94 | if string == ".": 95 | return None 96 | if what == "HPO": 97 | result = [x for x in string.split("|") if x.startswith("HP:")] 98 | return result 99 | elif f"{what}=" in string: 100 | result = [x.split(f"{what}=")[1] for x in string.split(";") if what in x] 101 | if result: 102 | result = result[0] 103 | if what == "age": 104 | try: 105 | result = round(float(result), 1) 106 | except ValueError: 107 | result = None 108 | 109 | if what == "sex": 110 | if result.lower().startswith("f"): 111 | result = "Female" 112 | elif result.lower().startswith("m"): 113 | result = "Male" 114 | else: 115 | result = None 116 | return result 117 | else: 118 | return None 119 | 120 | 121 | def read_records_file( 122 | records_file: str, 123 | no_parents: bool = False, 124 | hpo_network: nx.MultiDiGraph = None, 125 | logger: logging.Logger = None, 126 | ) -> List: 127 | """ 128 | Parse input file for patient descriptions into an array of dictionaries 129 | """ 130 | try: 131 | with open(records_file) as records_fh: 132 | reader = csv.reader(records_fh, delimiter="\t") 133 | records = [] 134 | for line in reader: 135 | if line[0].startswith("#"): 136 | continue 137 | dict_ = { 138 | "sample": line[0], 139 | "age": parse(line[1], what="age"), 140 | "gender": parse(line[1], what="sex"), 141 | "terms": parse(line[2], what="HPO"), 142 | } 143 | 144 | if no_parents is True and hpo_network is not None: 145 | dict_["terms"] = remove_parents(dict_["terms"], hpo_network) 146 | else: 147 | pass 148 | records.append(dict_) 149 | return records 150 | except (FileNotFoundError, PermissionError) as e: 151 | if logger is not None: 152 | logger.critical(e) 153 | else: 154 | sys.stderr.write(str(e)) 155 | exit(1) 156 | 157 | 158 | def remove_parents(termlist: List[str], hpo_network: nx.MultiDiGraph) -> List[str]: 159 | """ 160 | remove parents from termlist 161 | """ 162 | terms_to_remove = set() 163 | for source_term in termlist: 
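        # Ancestry is decided via graph reachability: obonet edges point from
        # child to parent, so a path from source to target means target is an
        # ancestor (parent) of source and is marked for removal.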
164 | if source_term not in hpo_network.nodes: 165 | terms_to_remove.add(source_term) 166 | continue 167 | for target_term in termlist: 168 | if target_term not in hpo_network.nodes: 169 | terms_to_remove.add(target_term) 170 | continue 171 | # has_path will evaluate True for a term to itself, include additional check 172 | same_terms = source_term == target_term 173 | source_to_target = nx.has_path(hpo_network, source_term, target_term) 174 | target_to_source = nx.has_path(hpo_network, target_term, source_term) 175 | if source_to_target is True and not same_terms: 176 | terms_to_remove.add(target_term) 177 | if target_to_source is True and not same_terms: 178 | terms_to_remove.add(source_term) 179 | return sorted(set(termlist) - terms_to_remove) 180 | 181 | 182 | def generate_alternate_ids(hpo_network: nx.MultiDiGraph) -> Dict[str, str]: 183 | """ 184 | Create a key, value store of alternate terms to canonical terms. 185 | """ 186 | alt2prim = {} 187 | for n in hpo_network.nodes(data=True): 188 | n = n[0] 189 | try: 190 | for alt in hpo_network.nodes[n]["alt_id"]: 191 | alt2prim[alt] = n 192 | except KeyError: 193 | # no alternate HPO ids for this term 194 | continue 195 | return alt2prim 196 | 197 | 198 | def parse_input( 199 | input_file: str, hpo_network: nx.MultiDiGraph, alt2prim: Dict[str, str] 200 | ) -> List: 201 | """ 202 | Parse input file. 203 | """ 204 | try: 205 | with open(input_file, "r") as input_fh: 206 | reader = csv.reader( 207 | filter(lambda x: not x.startswith("#"), input_fh), delimiter="\t" 208 | ) 209 | records = [] 210 | for line in reader: 211 | # process terms: convert alternates and filter first 212 | terms = [] 213 | for term_id in line[2].split("|"): 214 | # convert alternate ids to primary 215 | if term_id in alt2prim: 216 | term_id = alt2prim[term_id] 217 | # filtering terms not in the hpo network 218 | if term_id not in hpo_network.nodes(): 219 | continue 220 | terms.append(term_id) 221 | 222 | record = { 223 | "record_id": line[0], 224 | "terms": remove_parents(terms, hpo_network), 225 | "weights": {}, 226 | **dict( 227 | item.split("=") for item in line[1].split(";") if line[1] != "." 228 | ), 229 | } 230 | 231 | # new weights, e.g. sex weights (similar to the age weights), could be assigned here 232 | records.append(record) 233 | 234 | except (FileNotFoundError, PermissionError) as e: 235 | logger.critical(f"Input file could not be loaded or does not exist: {e}") 236 | exit(1) 237 | except ValueError: 238 | logger.critical( 239 | f"Unable to parse input file, invalid line number: " 240 | f"{reader.line_num}:{input_file}" 241 | ) 242 | exit(1) 243 | 244 | return records 245 | 246 | 247 | def read_phenotype_groups( 248 | phenotype_group_file: str = None, 249 | ) -> Dict[str, Dict[str, int]]: 250 | """ 251 | Reads the phenotype group mapping file into a dictionary.
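
    Returned shape (group indices illustrative):

        {"HP:0012759": {"k1000": 87, "k1500": 112}, ...}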
252 | """ 253 | if phenotype_group_file is None: 254 | phenotype_group_file = config["phenotype_groups"]["phenotype_groups_file"] 255 | 256 | hp_to_pg = {} 257 | with open(phenotype_group_file, "r") as f: 258 | f.readline() 259 | for line in f: 260 | hpid, phenotype_group_1000, phenotype_group_1500 = line.strip("\n").split( 261 | "\t" 262 | ) 263 | hp_to_pg[hpid] = { 264 | "k1000": int(phenotype_group_1000), 265 | "k1500": int(phenotype_group_1500), 266 | } 267 | return hp_to_pg 268 | 269 | 270 | def standardize_phenotypes( 271 | terms: List[str], hpo_network: nx.MultiDiGraph, alt2prim: Dict[str, str] 272 | ) -> List[str]: 273 | """ 274 | Given a list of HPO ids, first try to convert synonyms to primary ids, 275 | then filter if terms are not in the ontology 276 | """ 277 | terms = [alt2prim[term] if term in alt2prim else term for term in terms] 278 | terms = list(filter(lambda term: term in hpo_network.nodes, terms)) 279 | terms = remove_parents(terms, hpo_network) 280 | return terms 281 | 282 | 283 | def encode_phenotypes( 284 | phenotypes: List, 285 | phenotype_groups: Dict, 286 | hpo_network: nx.MultiDiGraph, 287 | alt2prim: Dict[str, str], 288 | k: int = 1000, 289 | ) -> np.ndarray: 290 | """ 291 | Encode phenotypes into a feature array. 292 | """ 293 | 294 | def build_feature_array(cntr: Counter, n_features: int = k) -> np.ndarray: 295 | a = [0] * n_features 296 | for feature_index, count in cntr.items(): 297 | a[feature_index] = count 298 | return a 299 | 300 | def encode(hpo_ids: List) -> Counter: 301 | return Counter(hpo_ids) 302 | 303 | nested = all(isinstance(element, list) for element in phenotypes) 304 | 305 | if nested: 306 | return [ 307 | build_feature_array( 308 | encode( 309 | [ 310 | phenotype_groups[hpoid][f"k{k}"] 311 | for hpoid in standardize_phenotypes( 312 | phenotypes_, hpo_network, alt2prim 313 | ) 314 | ] 315 | ) 316 | ) 317 | for phenotypes_ in phenotypes 318 | ] 319 | 320 | return build_feature_array( 321 | encode( 322 | [ 323 | phenotype_groups[hpoid][f"k{k}"] 324 | for hpoid in standardize_phenotypes(phenotypes, hpo_network, alt2prim) 325 | ] 326 | ) 327 | ) 328 | 329 | 330 | @contextmanager 331 | def open_or_stdout(filename: str) -> Generator: 332 | if filename != "-": 333 | with open(filename, "w") as f: 334 | yield f 335 | else: 336 | yield sys.stdout 337 | -------------------------------------------------------------------------------- /phenopy/weights.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import networkx as nx 4 | import pandas as pd 5 | import sys 6 | 7 | from functools import lru_cache 8 | from typing import List 9 | import numpy as np 10 | 11 | 12 | @lru_cache(maxsize=1300000) 13 | def hpo_age_to_weight(hpo_network: nx.MultiGraph, term_id: str, age: int) -> float: 14 | """ 15 | calculate weight based on truncated normal distribution CDF 16 | """ 17 | if term_id not in hpo_network.nodes or age is None: 18 | return 1.0 19 | elif "age_dist" in hpo_network.nodes[term_id]: 20 | return get_empirical_cdf(float(age), hpo_network.nodes[term_id]["age_dist"]) 21 | else: 22 | return 1.0 23 | 24 | 25 | def calculate_age_weights( 26 | terms: List, age: int, hpo_network: nx.MultiGraph 27 | ) -> List[float]: 28 | """ 29 | Calculates an age-based weight vector given an iterable of terms. 
30 | """ 31 | weights = [] 32 | for term_id in terms: 33 | weights.append(hpo_age_to_weight(hpo_network, term_id, age)) 34 | 35 | return weights 36 | 37 | 38 | def get_truncated_normal( 39 | mean: float, sd: float, lower: float, upper: float, instances: int = 1000000 40 | ) -> np.ndarray: 41 | """ 42 | Simulates a truncated normal distribution 43 | """ 44 | # Create the normal distribution 45 | distribution = np.random.normal(mean, sd, instances) 46 | 47 | # Truncate all values outside of the range 48 | distribution = np.array([i for i in distribution if lower <= i <= upper]) 49 | 50 | return distribution 51 | 52 | 53 | def get_empirical_cdf(value: float, distribution: np.ndarray) -> float: 54 | """ 55 | Calculates the empirical cumulative distribution function for a given value within 56 | a given distribution. 57 | """ 58 | # Sort the distribution 59 | data_sorted = np.sort(distribution) 60 | 61 | # Determine the CDF for the values within the distribution 62 | cdf = np.linspace(0, 1, len(distribution)) 63 | 64 | # Establish as a dataframe 65 | df = pd.DataFrame(list(zip(data_sorted, cdf)), columns=["value", "cdf"]) 66 | 67 | # Return the maximum CDF value for the given value 68 | return df[df["value"] <= value]["cdf"].max() 69 | 70 | 71 | def make_age_distributions( 72 | phenotype_age_file: str, logger: logging.Logger = None 73 | ) -> pd.DataFrame: 74 | """ 75 | Read in phenotype ages file and convert to pandas object with modeled distributions 76 | """ 77 | 78 | try: 79 | df = pd.read_csv(phenotype_age_file, sep="\t", names=["hpid", "mean", "std"]) 80 | 81 | except (FileNotFoundError, PermissionError) as e: 82 | 83 | if logger is not None: 84 | logger.critical(e) 85 | else: 86 | sys.stderr.write(str(e)) 87 | exit(1) 88 | 89 | distributions = [] 90 | for rec in df.to_dict("records"): 91 | 92 | try: 93 | # model truncated normal 94 | dist = get_truncated_normal( 95 | mean=rec["mean"], sd=rec["std"], lower=0, upper=rec["mean"] 96 | ) 97 | distributions.append({"hpid": rec["hpid"], "age_dist": dist}) 98 | 99 | except ValueError as e: 100 | if logger is not None: 101 | logger.critical(e) 102 | else: 103 | sys.stderr.write(str(e)) 104 | exit(1) 105 | 106 | return pd.DataFrame.from_dict(distributions).set_index("hpid") 107 | -------------------------------------------------------------------------------- /phenoseries/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/phenoseries/__init__.py -------------------------------------------------------------------------------- /phenoseries/experiment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import networkx as nx 5 | import numpy as np 6 | import pandas as pd 7 | import requests 8 | import sys 9 | 10 | from ast import literal_eval 11 | from phenopy.build_hpo import generate_annotated_hpo_network 12 | from phenopy.config import ( 13 | config, 14 | logger 15 | ) 16 | from phenopy.score import Scorer 17 | from phenopy.util import remove_parents, half_product 18 | from typing import ( 19 | List, 20 | Dict, 21 | ) 22 | 23 | try: 24 | from txt2hpo.extract import Extractor 25 | except ModuleNotFoundError: 26 | logger.warning("txt2hpo is not installed. 
This is only used in the " 27 | "validate-phenoseries command.\nTo use this command, please " 28 | "install txt2hpo: pip install txt2hpo") 29 | 30 | # TODO: fix the bug in this script before merging to master. 31 | 32 | OMIM_API_URL = "https://api.omim.org/api/" 33 | OMIM_DOWNLOADS_URL = "https://data.omim.org/downloads/" 34 | 35 | 36 | def request_mimid_info(mimid: str) -> requests.Response: 37 | """ 38 | request mimid description from OMIM 39 | """ 40 | access = "entry?" 41 | api_key = os.getenv("OMIM_API_KEY") 42 | if api_key is None: 43 | api_key = config.get("omim", "omim_api_key") 44 | payload = { 45 | "mimNumber": mimid, 46 | "include": "text", 47 | "format": "json", 48 | "apiKey": api_key, 49 | } 50 | 51 | r = requests.get(OMIM_API_URL + access, params=payload) 52 | if r.status_code == 200: 53 | return r 54 | else: 55 | logger.critical("Please set the omim_api_key in your phenopy.ini config file") 56 | 57 | 58 | def convert_and_filter_hpoids( 59 | terms: List, 60 | hpo: nx.MultiDiGraph, 61 | alt2prim: Dict[str, str]) -> List: 62 | """ 63 | Given a list of HPO ids, first try to convert synonyms to primary ids, 64 | then filter if terms are not in the ontology 65 | """ 66 | terms = [alt2prim[term] if term in alt2prim else term for term in terms] 67 | terms = list(filter(lambda term: term in hpo.nodes, terms)) 68 | terms = remove_parents(terms, hpo) 69 | return terms 70 | 71 | 72 | def make_rank_dataframe( 73 | pairwise_sim_matrix: np.ndarray, 74 | mimdf: pd.DataFrame, 75 | ps2mimids: Dict[str, List[str]]) -> pd.DataFrame: 76 | relevant_ranks_results = [] 77 | for psid, ps_mim_ids in ps2mimids.items(): 78 | # Grab the index of the "relevant" mim ids 79 | # Helps identify index in pairwise distance matrix 80 | ps_mim_idxs = mimdf[mimdf["omim_id"].isin(ps_mim_ids)].index.tolist() 81 | for query_mim_idx in ps_mim_idxs: 82 | ranks = return_relevant_ranks( 83 | pairwise_sim_matrix, query_mim_idx, ps_mim_idxs 84 | ) 85 | query_mim = mimdf.iloc[query_mim_idx]["omim_id"] 86 | relevant_ranks_results.append([psid, query_mim, ranks]) 87 | 88 | rankdf = pd.DataFrame( 89 | relevant_ranks_results, columns=["psid", "query_mim_id", "relevant_ranks"] 90 | ) 91 | rankdf["total_relevant"] = rankdf.apply( 92 | lambda row: len(row["relevant_ranks"]), axis=1 93 | ) 94 | 95 | return rankdf 96 | 97 | 98 | def return_relevant_ranks( 99 | pairwise_sim: np.ndarray, 100 | query_idx: int, 101 | other_mim_indices: List[int]) -> List[int]: 102 | """ 103 | Given a pairwise similarity matrix, compute the rank of the similarity between 104 | a query mim and another mim disease from the same PS. 
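
    Worked toy example: if the query's similarities to three other mims are
    [0.9, 0.2, 0.7], the double argsort below assigns 1-based ranks [1, 3, 2],
    and the sorted ranks of the mims sharing the query's PS are returned.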
105 | """ 106 | other_idxs = other_mim_indices.copy() 107 | other_idxs.remove(query_idx) 108 | other_idxs = [idx-1 for idx in other_idxs] 109 | mim_sims = pairwise_sim[query_idx].copy() 110 | mim_sims_noself = np.delete(mim_sims, [query_idx]) 111 | order = mim_sims_noself.argsort() 112 | ranks = order.argsort() 113 | ranks = max(ranks) - ranks 114 | # convert the ranks to 1-based 115 | ranks = np.array([r+1 for r in ranks]) 116 | return sorted(ranks[other_idxs]) 117 | 118 | 119 | def run_phenoseries_experiment( 120 | outdir=None, phenotypic_series_filepath=None, 121 | min_hpos=2, min_entities=4, phenoseries_fraction=1.0, 122 | scoring_method="HRSS", threads=1, 123 | omim_phenotypes_file=None, pairwise_mim_scores_file=None): 124 | 125 | if outdir is None: 126 | outdir = os.getcwd 127 | 128 | # load HPO network 129 | # data directory 130 | phenopy_data_directory = os.path.join(os.getenv("HOME"), ".phenopy/data") 131 | 132 | # files used in building the annotated HPO network 133 | obo_file = os.path.join(phenopy_data_directory, "hp.obo") 134 | disease_to_phenotype_file = os.path.join(phenopy_data_directory, "phenotype.hpoa") 135 | 136 | hpo_network, alt2prim, _ = generate_annotated_hpo_network( 137 | obo_file, disease_to_phenotype_file, ages_distribution_file=None 138 | ) 139 | 140 | # read the phenotypic series file as a DataFrame 141 | psdf = pd.read_csv( 142 | phenotypic_series_filepath, 143 | sep="\t", 144 | comment="#", 145 | names=["PS", "MIM", "Phenotype"], 146 | ) 147 | # null phenotypes are actually null MIM id fields, so just drop these 148 | psdf = psdf.dropna().sample(frac=phenoseries_fraction, random_state=42) 149 | psdf.reset_index(inplace=True, drop=True) 150 | 151 | # create a dictionary for phenotypic series to list of omim ids mapping 152 | ps2mimids = {} 153 | for ps, mim_ids in psdf.groupby(["PS"])["MIM"]: 154 | # more than two mims in a ps 155 | if len(mim_ids) >= 2: 156 | ps2mimids[ps] = list(set([int(mid) for mid in mim_ids.tolist()])) 157 | 158 | # invert the ps2mimid dictionary for easy lookup of which ps a mim belongs to 159 | mim2psids = {} 160 | for mim_id, ps in psdf.groupby(["MIM"])["PS"]: 161 | mim2psids[int(mim_id)] = ps.tolist() 162 | 163 | fields_to_use = [ 164 | "text", 165 | "description", 166 | "otherFeatures", 167 | "biochemicalFeatures", 168 | "diagnosis", 169 | "clinicalFeatures", 170 | ] 171 | 172 | if omim_phenotypes_file == "": 173 | logger.info("Scraping OMIM Diseases text") 174 | mim_texts = {} 175 | for mim_id in mim2psids: 176 | mim_response = request_mimid_info(mim_id) 177 | try: 178 | mim_info = mim_response.json() 179 | except AttributeError: 180 | break 181 | mim_text = mim_info["omim"]["entryList"][0]["entry"]["textSectionList"] 182 | 183 | all_mim_text = "" 184 | for text_section in mim_text: 185 | section_name = text_section["textSection"]["textSectionName"] 186 | if section_name in fields_to_use: 187 | # unique_section_names.add(section_name) 188 | added_text = text_section["textSection"]["textSectionContent"] 189 | all_mim_text += f" {added_text}" 190 | 191 | mim_texts[mim_id] = all_mim_text 192 | # instantiate txt2hpo's Exctractor class to perform named entity recognition 193 | extractor = Extractor( 194 | remove_negated=True, 195 | max_neighbors=3, 196 | correct_spelling=False) 197 | 198 | # loop over the MIM ids and extract hpo ids from each MIM's text fields 199 | mim_hpos = {} 200 | for mim_id in mim2psids: 201 | mim_hpos[mim_id] = extractor.hpo(mim_texts[mim_id]).hpids 202 | 203 | mimdf = pd.DataFrame() 204 | mimdf["omim_id"] = 
list(mim2psids.keys()) 205 | mimdf["hpo_terms"] = mimdf["omim_id"].apply(lambda mim_id: mim_hpos[mim_id]) 206 | mimdf.to_csv(os.path.join(outdir, "omim_phenotypes.txt"), index=False, sep='\t') 207 | 208 | else: 209 | logger.info("You passed an OMIM disease to phenotype file") 210 | try: 211 | mimdf = pd.read_csv(omim_phenotypes_file, sep="\t") 212 | mimdf["omim_id"] = mimdf["omim_id"].astype(int) 213 | mimdf["hpo_terms"] = mimdf["hpo_terms"].apply(literal_eval) 214 | mim_hpos = dict(zip(mimdf["omim_id"], mimdf["hpo_terms"])) 215 | except FileNotFoundError: 216 | sys.exit("Please provide a valid file path") 217 | 218 | # clean up HPO ids in lists 219 | for mim_id, hpo_ids in mim_hpos.items(): 220 | mim_hpos[mim_id] = convert_and_filter_hpoids(hpo_ids, hpo_network, alt2prim) 221 | 222 | # remove entities (mims) that have less than min_hpos 223 | mims_to_remove = [] 224 | for mim_id, hpo_ids in mim_hpos.copy().items(): 225 | if len(hpo_ids) <= min_hpos: 226 | mims_to_remove.append(mim_id) 227 | 228 | # Now remove the entities (mim ids) with less than min_hpos 229 | experiment_ps2mimids = {} 230 | # remove these mims from ps 231 | for ps, mimids in ps2mimids.copy().items(): 232 | experiment_ps2mimids[ps] = [] 233 | for ps_mim_id in mimids: 234 | if ps_mim_id not in mims_to_remove: 235 | experiment_ps2mimids[ps].append(ps_mim_id) 236 | 237 | # After removing entities, make sure the series has min number of entities 238 | # get lists of mims and their PS 239 | remove_these_ps = [] 240 | for ps, mimids in experiment_ps2mimids.items(): 241 | if len(mimids) < min_entities: 242 | remove_these_ps.append(ps) 243 | 244 | for psid in remove_these_ps: 245 | del experiment_ps2mimids[psid] 246 | 247 | # Create a unique list of entity ids, for scoring later 248 | experiment_omims = set() 249 | for psid, mim_ids in experiment_ps2mimids.items(): 250 | for mim in mim_ids: 251 | experiment_omims.add(mim) 252 | experiment_omims = list(experiment_omims) 253 | 254 | # make a DataFrame for entity ids 255 | mimdf = pd.DataFrame() 256 | mimdf["omim_id"] = experiment_omims 257 | mimdf["hpo_terms"] = mimdf["omim_id"].apply(lambda mim_id: mim_hpos[mim_id]) 258 | 259 | if pairwise_mim_scores_file == "": 260 | scorer = Scorer(hpo_network, scoring_method=scoring_method) 261 | records = [ 262 | { 263 | "record_id": mim_id, 264 | "terms": convert_and_filter_hpoids(hpo_terms, hpo_network, alt2prim), 265 | "weights": {}, 266 | } 267 | for mim_id, hpo_terms in dict( 268 | zip(mimdf["omim_id"], mimdf["hpo_terms"]) 269 | ).items() 270 | ] 271 | 272 | results = scorer.score_records( 273 | records, records, half_product(len(records), len(records)), threads=threads 274 | ) 275 | 276 | pairwise_scores = pd.DataFrame( 277 | results, columns=["mimid1", "mimid2", "phenopy-score"] 278 | ) 279 | # convert to square form 280 | pairwise_scores = pairwise_scores.set_index(["mimid1", "mimid2"]).unstack() 281 | # This pandas method chain fills in the missing scores of the square matrix 282 | # with the values from the transpose of df. 
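        # (Illustrative: half_product yields only pairs with i <= j, so each
        # missing cell (j, i) is filled from its mirror (i, j) via fillna on
        # the transposed frame.)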
283 | pairwise_scores = ( 284 | pairwise_scores["phenopy-score"] 285 | .reset_index(drop=True) 286 | .fillna(pairwise_scores.T.droplevel(0).reset_index(drop=True)) 287 | .set_index(pairwise_scores.index, drop=True) 288 | ) 289 | # reindex with the mimdf index 290 | pairwise_scores = pairwise_scores.reindex(mimdf["omim_id"].tolist()) 291 | pairwise_scores = pairwise_scores[mimdf["omim_id"].tolist()] 292 | pd.DataFrame(pairwise_scores).to_csv( 293 | os.path.join(outdir, 'phenoseries.psim_matrix.txt'), 294 | sep='\t' 295 | ) 296 | else: 297 | pairwise_scores = pd.read_csv(pairwise_mim_scores_file, sep='\t') 298 | 299 | ranksdf = make_rank_dataframe( 300 | pairwise_scores.astype(float).values, mimdf, experiment_ps2mimids 301 | ) 302 | ranksdf.to_csv(os.path.join(outdir, "phenoseries.rankdf.txt"), sep="\t") 303 | 304 | 305 | if __name__ == "__main__": 306 | parser = argparse.ArgumentParser() 307 | parser.add_argument( 308 | "--outdir", "-o", default=os.getcwd(), help="Path where to store the results." 309 | ) 310 | parser.add_argument( 311 | "--phenotypic-series-filepath", 312 | "-p", 313 | help="path to the omim text file defining phenotypic series to omim id", 314 | ) 315 | parser.add_argument( 316 | "--min-hpos", 317 | "-n", 318 | default=4, 319 | type=int, 320 | help="The minimum number of hpo ids per entity (mim id, for example) to " 321 | "be considered for the experiment", 322 | ) 323 | parser.add_argument( 324 | "--min-entities", 325 | "-m", 326 | default=2, 327 | type=int, 328 | help="The minimum number of entities (mim id, for example) per series to " 329 | "be considered for the experiment", 330 | ) 331 | parser.add_argument( 332 | "--phenoseries-fraction", 333 | "-f", 334 | default=1.0, 335 | help="The fraction of phenoseries to use", 336 | type=float, 337 | ) 338 | parser.add_argument( 339 | "--scoring-method", 340 | "-s", 341 | default="HRSS", 342 | help="The scoring method to use", 343 | type=str, 344 | ) 345 | parser.add_argument( 346 | "--threads", "-t", default=4, help="The number of threads to use", type=int, 347 | ) 348 | parser.add_argument( 349 | "--omim-phenotypes-file", 350 | "-a", 351 | default="", 352 | help="The full path to a pre-generated omim id to list of phenotypes file", 353 | type=str, 354 | ) 355 | parser.add_argument( 356 | "--pairwise-mim-scores-file", 357 | "-b", 358 | default="", 359 | help="The full path to a pre-generated file with all the pairwise scores for " 360 | "each omim id in the experiment.", 361 | type=str, 362 | ) 363 | 364 | args = parser.parse_args() 365 | 366 | outdir = args.outdir 367 | phenotypic_series_filepath = args.phenotypic_series_filepath 368 | min_hpos = args.min_hpos 369 | min_entities = args.min_entities 370 | phenoseries_fraction = args.phenoseries_fraction 371 | scoring_method = args.scoring_method 372 | threads = args.threads 373 | omim_phenotypes_file = args.omim_phenotypes_file 374 | pairwise_mim_scores_file = args.pairwise_mim_scores_file 375 | 376 | run_phenoseries_experiment( 377 | outdir=outdir, 378 | phenotypic_series_filepath=phenotypic_series_filepath, 379 | min_hpos=min_hpos, 380 | min_entities=min_entities, 381 | phenoseries_fraction=phenoseries_fraction, 382 | scoring_method=scoring_method, 383 | threads=threads, 384 | omim_phenotypes_file=omim_phenotypes_file, 385 | pairwise_mim_scores_file=pairwise_mim_scores_file, 386 | ) 387 | -------------------------------------------------------------------------------- /phenoseries/phenoseries.requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | matplotlib 4 | requests 5 | scikit-learn 6 | txt2hpo -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "phenopy" 7 | version = "0.6.0" 8 | description = "Phenotype comparison scoring by semantic similarity." 9 | authors = [ 10 | "Kevin Arvai", 11 | "Kyle Retterer", 12 | "Carlos Borroto ", 13 | "Vlad Gainullin", 14 | "Vincent Ustach ", 15 | "Stephen McGee " 16 | ] 17 | readme = "README.md" 18 | license = "" 19 | 20 | [tool.poetry.scripts] 21 | phenopy = "phenopy.__main__:main" 22 | 23 | [tool.poetry.urls] 24 | homepage = "https://github.com/GeneDx/phenopy" 25 | "Bug Tracker" = "https://github.com/GeneDx/phenopy/issues" 26 | 27 | [tool.poetry.dependencies] 28 | python = "^3.9" 29 | fire = "^0.5.0" 30 | gensim = "^4.3.0" 31 | networkx = "2.6.3" 32 | numpy = "^1.21.1" 33 | obonet = "^1.0.0" 34 | pandas = "^1.0.0" 35 | scipy = "^1.6.1" 36 | requests = "^2.31.0" 37 | pytest = "^7.3.1" 38 | 39 | [tool.poetry.dev-dependencies] 40 | pre-commit = "^2.21.0" 41 | pytest = "^7.3.1" 42 | pytest-cov = "^4.0.0" 43 | ruff = "^0.0.264" 44 | 45 | [tool.ruff] 46 | line-length = 88 47 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/phenotype_age.tsv: -------------------------------------------------------------------------------- 1 | HP:0001251 6.0 3.0 2 | HP:0001263 1.0 1.0 3 | HP:0001290 1.0 1.0 4 | HP:0004322 10.0 3.0 5 | HP:0001249 6.0 3.0 6 | -------------------------------------------------------------------------------- /tests/data/test.score-long.txt: -------------------------------------------------------------------------------- 1 | 118200 . HP:0000006|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001765|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003380|HP:0003382|HP:0003383|HP:0003431|HP:0003449|HP:0003587|HP:0003621|HP:0003677|HP:0003690|HP:0003693|HP:0003693|HP:0003828|HP:0004336|HP:0009027|HP:0009830|HP:0011096|HP:0012074 2 | 118210 . HP:0000006|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003383|HP:0003384|HP:0003431|HP:0003674|HP:0003677|HP:0003690|HP:0003693|HP:0003693|HP:0009027|HP:0009830 3 | 118220 . HP:0000006|HP:0000365|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001765|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003380|HP:0003382|HP:0003383|HP:0003431|HP:0003449|HP:0003481|HP:0003587|HP:0003621|HP:0003677|HP:0003690|HP:0003693|HP:0003828|HP:0004336|HP:0009027|HP:0009830 4 | 118230 . HP:0000006|HP:0000470|HP:0000774|HP:0000975|HP:0001026|HP:0001270|HP:0001301|HP:0001315|HP:0001417|HP:0001600|HP:0001678|HP:0001761|HP:0001999|HP:0002013|HP:0002018|HP:0002028|HP:0003009|HP:0003091|HP:0003447|HP:0003474|HP:0003593|HP:0003693|HP:0004875|HP:0005853|HP:0009049|HP:0011727 5 | 118300 . 
HP:0000006|HP:0000407|HP:0001171|HP:0001265|HP:0001284|HP:0001761|HP:0001765|HP:0001884|HP:0002460|HP:0002936|HP:0003376|HP:0003431|HP:0003621|HP:0003690|HP:0003693|HP:0003693|HP:0009027|HP:0011463 6 | 118301 . HP:0000006|HP:0000007|HP:0000508|HP:0000726|HP:0000762|HP:0000763|HP:0000975|HP:0001026|HP:0001278|HP:0001300|HP:0001301|HP:0001315|HP:0001347|HP:0001417|HP:0001678|HP:0001761|HP:0002013|HP:0002018|HP:0002028|HP:0002171|HP:0002398|HP:0003009|HP:0003091|HP:0003447|HP:0003693|HP:0005150|HP:0007110|HP:0009049|HP:0011727 7 | 148360 . HP:0000006|HP:0000982|HP:0001425|HP:0001761|HP:0002164|HP:0003390|HP:0007002|HP:0008404 8 | 214400 . HP:0000007|HP:0000764|HP:0001171|HP:0001178|HP:0001265|HP:0001270|HP:0001284|HP:0001425|HP:0001765|HP:0002460|HP:0002460|HP:0002751|HP:0002936|HP:0003380|HP:0003382|HP:0003400|HP:0003429|HP:0003431|HP:0003593|HP:0003678|HP:0003693|HP:0006915|HP:0007182|HP:0040078 9 | 300905 . HP:0000407|HP:0001265|HP:0001271|HP:0001423|HP:0001761|HP:0002378|HP:0003376|HP:0003474|HP:0003677|HP:0030237 10 | 302800 . HP:0000407|HP:0000639|HP:0000763|HP:0000764|HP:0001260|HP:0001265|HP:0001270|HP:0001272|HP:0001310|HP:0001337|HP:0001419|HP:0001423|HP:0001761|HP:0001771|HP:0002015|HP:0002311|HP:0002355|HP:0002385|HP:0002395|HP:0002427|HP:0002460|HP:0002460|HP:0002500|HP:0002936|HP:0003380|HP:0003383|HP:0003431|HP:0003487|HP:0003677|HP:0003693|HP:0003829|HP:0009830|HP:0040078|HP:0040083 11 | 302801 . HP:0001249|HP:0001284|HP:0001419|HP:0001761|HP:0002460|HP:0002936|HP:0003376|HP:0003431|HP:0003482|HP:0003484|HP:0003593|HP:0003693|HP:0009027 12 | 302802 . HP:0000762|HP:0001284|HP:0001385|HP:0001419|HP:0001761|HP:0002385|HP:0002460|HP:0002650|HP:0002936|HP:0003376|HP:0003482|HP:0003484|HP:0003693|HP:0009027 13 | 302803 . HP:0001362|HP:0001419|HP:0003390|HP:0007002|HP:0007385 14 | 302900 . HP:0000639|HP:0000819|HP:0001251|HP:0001260|HP:0001284|HP:0001417|HP:0001635|HP:0001691|HP:0001761|HP:0001765|HP:0001953|HP:0002062|HP:0002495|HP:0002650|HP:0002936|HP:0003115|HP:0003116|HP:0003133|HP:0003134|HP:0003209|HP:0003232|HP:0003376|HP:0003487|HP:0003621|HP:0005157|HP:0008954|HP:0008963|HP:0009005|HP:0009027|HP:0010831|HP:0011397|HP:0011399|HP:0011441 15 | 311070 . HP:0000365|HP:0000407|HP:0000510|HP:0000529|HP:0000648|HP:0001270|HP:0001271|HP:0001288|HP:0001419|HP:0001761|HP:0002460|HP:0002522|HP:0002936|HP:0003383|HP:0003481|HP:0003693|HP:0003828|HP:0011463|HP:0032460 16 | 600882 . HP:0000006|HP:0000763|HP:0001265|HP:0001284|HP:0001761|HP:0001763|HP:0001765|HP:0001810|HP:0001868|HP:0001886|HP:0002460|HP:0002460|HP:0003376|HP:0003378|HP:0003380|HP:0003384|HP:0003431|HP:0003474|HP:0003693|HP:0009027 17 | 601098 . HP:0000006|HP:0001265|HP:0001425|HP:0001761|HP:0002460|HP:0002936|HP:0003382|HP:0003383|HP:0003431|HP:0003481|HP:0003621|HP:0003693 18 | 601152 . HP:0000006|HP:0000007|HP:0000360|HP:0000458|HP:0000543|HP:0000551|HP:0000603|HP:0000641|HP:0000648|HP:0000649|HP:0001265|HP:0001284|HP:0001604|HP:0001761|HP:0002403|HP:0002460|HP:0002650|HP:0002936|HP:0002938|HP:0003376|HP:0003378|HP:0003378|HP:0003409|HP:0003431|HP:0003593|HP:0003690|HP:0003693|HP:0003693|HP:0003701|HP:0007924|HP:0008587 19 | 601382 . HP:0000007|HP:0001270|HP:0001425|HP:0001762|HP:0002460|HP:0002650|HP:0002936|HP:0003431|HP:0003693|HP:0003693|HP:0003701|HP:0006958|HP:0007208|HP:0010628 20 | 601455 . HP:0000007|HP:0000365|HP:0000649|HP:0000762|HP:0001155|HP:0001265|HP:0001284|HP:0001288|HP:0002460|HP:0002936|HP:0003383|HP:0003447|HP:0003481|HP:0003621|HP:0003693|HP:0004696|HP:0006916|HP:0006958 21 | 601472 . 
HP:0000006|HP:0001265|HP:0001761|HP:0001765|HP:0002172|HP:0002650|HP:0002936|HP:0003392|HP:0003393|HP:0003426|HP:0003427|HP:0003435|HP:0003484|HP:0003674|HP:0003677|HP:0003693|HP:0009129 22 | 601596 . HP:0000007|HP:0000365|HP:0000639|HP:0000764|HP:0001270|HP:0001291|HP:0001308|HP:0001425|HP:0001761|HP:0002355|HP:0002460|HP:0002650|HP:0002936|HP:0003387|HP:0003400|HP:0003431|HP:0003484|HP:0003693|HP:0004466|HP:0007107|HP:0007695|HP:0010628|HP:0012473|HP:0040078 23 | 604563 . HP:0000007|HP:0000407|HP:0000501|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001762|HP:0001765|HP:0002355|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003380|HP:0003383|HP:0003431|HP:0003481|HP:0003621|HP:0003693|HP:0009027 24 | 605588 . HP:0000007|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003383|HP:0003384|HP:0003431|HP:0003484|HP:0003674|HP:0003693|HP:0003701|HP:0009027 25 | 605589 . HP:0000007|HP:0001265|HP:0001284|HP:0002460|HP:0002936|HP:0003431|HP:0003581|HP:0003693 26 | 606482 . HP:0000006|HP:0000764|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002460|HP:0002936|HP:0003380|HP:0003383|HP:0003481|HP:0003621|HP:0003693|HP:0003693|HP:0007107|HP:0040078 27 | 606483 . HP:0000006|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003383|HP:0003394|HP:0003481|HP:0003674|HP:0003693|HP:0003693|HP:0007107|HP:0009027 28 | 606595 . HP:0000006|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002380|HP:0002460|HP:0002936|HP:0003376|HP:0003394|HP:0003431|HP:0003693|HP:0007267|HP:0009027 29 | 607677 . HP:0000006|HP:0001265|HP:0001284|HP:0001761|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003484|HP:0003693 30 | 607678 . HP:0000006|HP:0001425|HP:0002460|HP:0003376|HP:0003431|HP:0003484|HP:0003621|HP:0003693|HP:0003828|HP:0009027|HP:0009830 31 | 607684 . HP:0000006|HP:0000218|HP:0000508|HP:0001171|HP:0001178|HP:0001265|HP:0001270|HP:0001284|HP:0001371|HP:0001761|HP:0001765|HP:0002460|HP:0002650|HP:0002936|HP:0003376|HP:0003431|HP:0003693|HP:0003798|HP:0003828|HP:0006006|HP:0009025|HP:0009027|HP:0010628 32 | 607706 . HP:0000007|HP:0001171|HP:0001284|HP:0001371|HP:0001604|HP:0001761|HP:0002460|HP:0002936|HP:0003378|HP:0003380|HP:0003383|HP:0003431|HP:0003623|HP:0003693|HP:0008443 33 | 607731 . HP:0000007|HP:0001761|HP:0002460|HP:0002460|HP:0002936|HP:0003376|HP:0003380|HP:0003438|HP:0003450|HP:0003621|HP:0003693|HP:0007083|HP:0007350|HP:0009027 34 | 607734 . HP:0000006|HP:0000007|HP:0001265|HP:0001270|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002936|HP:0003380|HP:0003383|HP:0003431|HP:0003481|HP:0003621|HP:0003693|HP:0003828|HP:0004336|HP:0007233 35 | 607736 . HP:0000006|HP:0000407|HP:0000408|HP:0000478|HP:0001265|HP:0001284|HP:0001761|HP:0002015|HP:0002086|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003693|HP:0003693|HP:0009027|HP:0011096 36 | 607791 . HP:0000006|HP:0001265|HP:0001284|HP:0002460|HP:0002936|HP:0003378|HP:0003481|HP:0003484|HP:0003693 37 | 607831 . HP:0000006|HP:0000007|HP:0001171|HP:0001284|HP:0001425|HP:0001762|HP:0002460|HP:0002751|HP:0002936|HP:0003380|HP:0003431|HP:0003450|HP:0003593|HP:0003693|HP:0003701 38 | 608323 . HP:0000006|HP:0001425|HP:0001760|HP:0002460|HP:0002936|HP:0003450|HP:0003484|HP:0003693 39 | 608340 . 
HP:0000007|HP:0001178|HP:0001265|HP:0001284|HP:0001761|HP:0001762|HP:0002650|HP:0002936|HP:0003376|HP:0003383|HP:0003387|HP:0003445|HP:0003690|HP:0003693|HP:0009027|HP:0009830|HP:0011096|HP:0011463 40 | 608673 . HP:0000006|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002650|HP:0002936|HP:0003387|HP:0003444|HP:0003477|HP:0003693|HP:0007078 41 | 609260 . HP:0000006|HP:0000007|HP:0000365|HP:0000648|HP:0001257|HP:0001265|HP:0001268|HP:0001276|HP:0001284|HP:0001337|HP:0001347|HP:0001371|HP:0001761|HP:0001765|HP:0002460|HP:0002650|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003383|HP:0003384|HP:0003431|HP:0003487|HP:0003677|HP:0003690|HP:0003693|HP:0003693|HP:0003828|HP:0003829|HP:0009027|HP:0012531 42 | 609311 . HP:0000007|HP:0001265|HP:0001270|HP:0001284|HP:0001425|HP:0001761|HP:0001762|HP:0002515|HP:0002650|HP:0002936|HP:0003380|HP:0003383|HP:0003431|HP:0003484|HP:0003593|HP:0008944|HP:0009053|HP:0011096 43 | 611228 . HP:0000007|HP:0000762|HP:0001265|HP:0001270|HP:0001284|HP:0001288|HP:0002359|HP:0002460|HP:0002936|HP:0003383|HP:0003431|HP:0003447|HP:0003676|HP:0003828|HP:0005684|HP:0006466|HP:0007182 44 | 613287 . HP:0000006|HP:0000407|HP:0001284|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003202|HP:0003431|HP:0003477|HP:0003828|HP:0009027 45 | 613641 . HP:0000007|HP:0001263|HP:0001265|HP:0001284|HP:0001761|HP:0002936|HP:0003376|HP:0009027|HP:0009588 46 | 614228 . HP:0000006|HP:0001265|HP:0001270|HP:0001761|HP:0002359|HP:0002460|HP:0002527|HP:0002936|HP:0003431|HP:0003677|HP:0003690|HP:0009046 47 | 614436 . HP:0000006|HP:0000007|HP:0000007|HP:0000764|HP:0001265|HP:0001284|HP:0001761|HP:0001765|HP:0002380|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003431|HP:0003677|HP:0003693|HP:0003829|HP:0006886|HP:0009027|HP:0040078|HP:0040083 48 | 614455 . HP:0000006|HP:0000093|HP:0000097|HP:0001171|HP:0001265|HP:0001284|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003376|HP:0003383|HP:0003447|HP:0003676|HP:0003774|HP:0007149|HP:0008944|HP:0009027 49 | 614895 . HP:0000007|HP:0001270|HP:0001284|HP:0001604|HP:0001761|HP:0002355|HP:0002650|HP:0002936|HP:0003202|HP:0003387|HP:0003400|HP:0003431|HP:0003677|HP:0003690|HP:0010871|HP:0011096 50 | 615025 . HP:0000006|HP:0002355|HP:0003202|HP:0003474 51 | 615185 . HP:0000006|HP:0001265|HP:0001761|HP:0001765|HP:0002936|HP:0003376|HP:0003383|HP:0003450|HP:0003677 52 | 615284 . HP:0000007|HP:0000020|HP:0000252|HP:0000486|HP:0000602|HP:0000762|HP:0001159|HP:0001249|HP:0001284|HP:0001288|HP:0001763|HP:0002650|HP:0002936|HP:0003383|HP:0003676|HP:0003690|HP:0012444 53 | 615376 . HP:0000007|HP:0001284|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003387|HP:0003431|HP:0008180 54 | 615490 . HP:0000007|HP:0001284|HP:0001290|HP:0001324|HP:0001605|HP:0001761|HP:0001762|HP:0002093|HP:0002136|HP:0002540|HP:0002779|HP:0003199|HP:0003380|HP:0003477|HP:0006380|HP:0031936|HP:0040078 55 | 616039 . HP:0000007|HP:0001265|HP:0001284|HP:0001761|HP:0002936|HP:0003376|HP:0003383|HP:0003677|HP:0009027 56 | 616155 . HP:0000007|HP:0001265|HP:0001284|HP:0001762|HP:0002650|HP:0002936|HP:0003376|HP:0003677|HP:0003701|HP:0007141|HP:0009027|HP:0040078 57 | 616280 . HP:0000006|HP:0001284|HP:0002936|HP:0003376|HP:0003477|HP:0003677|HP:0009027 58 | 616491 . HP:0000006|HP:0001265|HP:0002936|HP:0003401|HP:0003676|HP:0010871 59 | 616625 . HP:0000006|HP:0001761|HP:0001765|HP:0002936|HP:0003376|HP:0003438|HP:0003828 60 | 616668 . 
HP:0000007|HP:0001155|HP:0001284|HP:0001337|HP:0001761|HP:0002079|HP:0002751|HP:0002936|HP:0003477|HP:0003677|HP:0006466|HP:0009027|HP:0100543 61 | 616684 . HP:0000007|HP:0000407|HP:0000666|HP:0001251|HP:0001284|HP:0001332|HP:0002151|HP:0002355|HP:0002751|HP:0003202|HP:0003388|HP:0003447|HP:0003677|HP:0003828|HP:0009830|HP:0011096 62 | 616687 . HP:0000006|HP:0001284|HP:0001761|HP:0001765|HP:0002936|HP:0003236|HP:0003676|HP:0003828 63 | 616688 . HP:0000006|HP:0000020|HP:0000365|HP:0001171|HP:0001263|HP:0001270|HP:0001276|HP:0001284|HP:0001290|HP:0001620|HP:0001761|HP:0002355|HP:0002380|HP:0002411|HP:0002650|HP:0002936|HP:0003394|HP:0003677|HP:0003701|HP:0007256|HP:0009027 64 | 616924 . HP:0000006|HP:0001265|HP:0001761|HP:0002021|HP:0002359|HP:0002495|HP:0002515|HP:0002936|HP:0003198|HP:0003200|HP:0003236|HP:0003390|HP:0003445|HP:0003484|HP:0003487|HP:0003555|HP:0003557|HP:0003676|HP:0003701|HP:0003805|HP:0003828|HP:0007141|HP:0007210|HP:0007340|HP:0009129 65 | 617017 . HP:0000006|HP:0000007|HP:0001265|HP:0001284|HP:0002317|HP:0002936|HP:0003581|HP:0003677|HP:0007141|HP:0009027 66 | 617087 . HP:0000007|HP:0000365|HP:0000543|HP:0000648|HP:0001265|HP:0001761|HP:0002194|HP:0002355|HP:0002650|HP:0002747|HP:0002808|HP:0002936|HP:0003477|HP:0003701|HP:0003828|HP:0009027 67 | 617882 . HP:0000006|HP:0000407|HP:0000639|HP:0001171|HP:0001251|HP:0001257|HP:0001265|HP:0001270|HP:0001284|HP:0001761|HP:0002460|HP:0002515|HP:0002936|HP:0003236|HP:0003376|HP:0003391|HP:0003487|HP:0003677|HP:0007141 68 | 618036 . HP:0000006|HP:0001265|HP:0001284|HP:0001761|HP:0003376|HP:0003394|HP:0003677|HP:0007141|HP:0009027 69 | 618279 . HP:0000006|HP:0001761|HP:0002355|HP:0002359|HP:0003376|HP:0003383|HP:0003677 70 | 117210 . HP:0000006|HP:0000407|HP:0001251|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0003584|HP:0007979 71 | 117360 . HP:0000006|HP:0000639|HP:0001260|HP:0001270|HP:0001310|HP:0002070|HP:0002075|HP:0002080|HP:0002136|HP:0002335|HP:0002470|HP:0003577|HP:0006855|HP:0006855|HP:0100543 72 | 133190 . HP:0000006|HP:0000605|HP:0000639|HP:0000951|HP:0000962|HP:0001257|HP:0001260|HP:0001265|HP:0001272|HP:0001347|HP:0002066|HP:0002070|HP:0002075|HP:0002080|HP:0002380|HP:0003477|HP:0003829|HP:0007256|HP:0007772 73 | 164400 . HP:0000006|HP:0000514|HP:0000543|HP:0000623|HP:0000639|HP:0000640|HP:0000641|HP:0000648|HP:0001151|HP:0001252|HP:0001257|HP:0001260|HP:0001283|HP:0001284|HP:0001290|HP:0001310|HP:0001347|HP:0002015|HP:0002070|HP:0002071|HP:0002072|HP:0002073|HP:0002075|HP:0002078|HP:0002168|HP:0002198|HP:0002495|HP:0002503|HP:0002542|HP:0002839|HP:0003202|HP:0003431|HP:0003448|HP:0003487|HP:0003581|HP:0003693|HP:0003744|HP:0007006|HP:0007078|HP:0007263|HP:0100543 74 | 164500 . HP:0000006|HP:0000514|HP:0000529|HP:0000580|HP:0000608|HP:0000623|HP:0000639|HP:0000648|HP:0001257|HP:0001260|HP:0001268|HP:0001310|HP:0001337|HP:0001347|HP:0002015|HP:0002071|HP:0002072|HP:0002073|HP:0002310|HP:0002542|HP:0003487|HP:0003744 75 | 183050 . HP:0000006|HP:0000762|HP:0001251|HP:0001257|HP:0001260|HP:0001271|HP:0002063|HP:0002067|HP:0002380|HP:0003202 76 | 183086 . HP:0000006|HP:0000640|HP:0000763|HP:0001260|HP:0001272|HP:0002015|HP:0002073|HP:0002076|HP:0003676|HP:0003743|HP:0007670|HP:0007772 77 | 183090 . 
HP:0000006|HP:0000510|HP:0000514|HP:0000602|HP:0000640|HP:0000641|HP:0000657|HP:0000726|HP:0001151|HP:0001252|HP:0001257|HP:0001260|HP:0001265|HP:0001290|HP:0001300|HP:0001310|HP:0001336|HP:0002015|HP:0002063|HP:0002067|HP:0002070|HP:0002073|HP:0002075|HP:0002172|HP:0002174|HP:0002198|HP:0002380|HP:0002495|HP:0002503|HP:0002542|HP:0002839|HP:0003693|HP:0003743 78 | 213200 . HP:0000007|HP:0000639|HP:0000750|HP:0001152|HP:0001249|HP:0001257|HP:0001260|HP:0001263|HP:0001265|HP:0001290|HP:0001310|HP:0001321|HP:0001337|HP:0001347|HP:0001761|HP:0002066|HP:0002070|HP:0002171|HP:0002311|HP:0002317|HP:0003593|HP:0003680|HP:0004322 79 | 271250 . HP:0000007|HP:0000365|HP:0000618|HP:0001251|HP:0005102 80 | 271270 . HP:0000007|HP:0000179|HP:0000280|HP:0000337|HP:0000463|HP:0000508|HP:0001251|HP:0001252|HP:0001260|HP:0001263|HP:0001265|HP:0001272|HP:0001290|HP:0001760|HP:0002208|HP:0002650|HP:0002714|HP:0003196|HP:0003487 81 | 300703 . HP:0000639|HP:0001251|HP:0001260|HP:0001270|HP:0001319|HP:0001419|HP:0002345|HP:0003593|HP:0003680 82 | 301310 . HP:0001260|HP:0001310|HP:0001419|HP:0001924|HP:0001939|HP:0002075|HP:0002080|HP:0002169|HP:0002470|HP:0003487|HP:0003621|HP:0004840 83 | 301790 . HP:0000407|HP:0000543|HP:0000565|HP:0000648|HP:0000726|HP:0001250|HP:0001252|HP:0001254|HP:0001257|HP:0001263|HP:0001265|HP:0001272|HP:0001284|HP:0001290|HP:0001310|HP:0001324|HP:0001419|HP:0001522|HP:0002013|HP:0002015|HP:0002020|HP:0002080|HP:0002171|HP:0002205|HP:0002311|HP:0002529|HP:0002599|HP:0003593|HP:0004881|HP:0004885|HP:0008757 84 | 301840 . HP:0000726|HP:0001251|HP:0001337|HP:0001417|HP:0002062|HP:0007256|HP:0031936 85 | 302500 . HP:0000486|HP:0000514|HP:0000639|HP:0001251|HP:0001260|HP:0001270|HP:0001272|HP:0001319|HP:0001417|HP:0001419|HP:0002080|HP:0003577|HP:0003621|HP:0003680|HP:0003698 86 | 302600 . HP:0001251|HP:0001417|HP:0002071 87 | 600223 . HP:0000006|HP:0000763|HP:0001260|HP:0001265|HP:0001272|HP:0001284|HP:0002073|HP:0002406|HP:0002936|HP:0003487|HP:0007772 88 | 600224 . HP:0000006|HP:0000317|HP:0000640|HP:0001260|HP:0001263|HP:0001272|HP:0001290|HP:0001310|HP:0001347|HP:0002066|HP:0002070|HP:0002075|HP:0002080|HP:0002311|HP:0002493|HP:0002495|HP:0003593|HP:0003674|HP:0003677|HP:0007772|HP:0100543 89 | 603516 . HP:0000006|HP:0000012|HP:0000020|HP:0000639|HP:0000716|HP:0000726|HP:0000762|HP:0001250|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0002015|HP:0002062|HP:0002066|HP:0002070|HP:0002071|HP:0002073|HP:0002075|HP:0002168|HP:0002311|HP:0003743|HP:0003829|HP:0007256 90 | 604326 . HP:0000006|HP:0000317|HP:0000496|HP:0000716|HP:0000726|HP:0000739|HP:0000746|HP:0001260|HP:0001272|HP:0001300|HP:0001310|HP:0001347|HP:0002073|HP:0002075|HP:0002120|HP:0002345|HP:0002346|HP:0002530|HP:0007141 91 | 604432 . HP:0000006|HP:0000639|HP:0001260|HP:0001272|HP:0001347|HP:0002073|HP:0003581 92 | 605259 . HP:0000006|HP:0000365|HP:0000639|HP:0000648|HP:0001249|HP:0001252|HP:0001260|HP:0001270|HP:0001272|HP:0001290|HP:0001347|HP:0002062|HP:0002066|HP:0002070|HP:0002073|HP:0002406|HP:0003677|HP:0007256|HP:0008003 93 | 605361 . HP:0000006|HP:0000317|HP:0000639|HP:0000716|HP:0001260|HP:0001268|HP:0001272|HP:0001310|HP:0001347|HP:0002015|HP:0002066|HP:0002073|HP:0002354|HP:0003677|HP:0003829|HP:0004373|HP:0006938|HP:0007018 94 | 606002 . 
HP:0000007|HP:0000486|HP:0000524|HP:0000639|HP:0000640|HP:0000657|HP:0001152|HP:0001260|HP:0001265|HP:0001271|HP:0001272|HP:0001284|HP:0001332|HP:0001337|HP:0001761|HP:0002015|HP:0002066|HP:0002070|HP:0002072|HP:0002346|HP:0002460|HP:0002650|HP:0003236|HP:0003431|HP:0003477|HP:0003676|HP:0003693|HP:0003828|HP:0006254|HP:0006879|HP:0006886|HP:0006937|HP:0007240|HP:0007256|HP:0007267|HP:0010702|HP:0010831 95 | 606658 . HP:0000006|HP:0000641|HP:0001260|HP:0001272|HP:0001347|HP:0002066|HP:0002070|HP:0002078|HP:0002168|HP:0002174|HP:0003581|HP:0003621|HP:0003677|HP:0007772|HP:0007979 96 | 607136 . HP:0000006|HP:0000020|HP:0000640|HP:0000716|HP:0000718|HP:0000727|HP:0000738|HP:0000743|HP:0000757|HP:0001250|HP:0001260|HP:0001272|HP:0001289|HP:0001300|HP:0001310|HP:0001332|HP:0001336|HP:0002015|HP:0002063|HP:0002066|HP:0002067|HP:0002070|HP:0002072|HP:0002080|HP:0002136|HP:0002171|HP:0002186|HP:0002300|HP:0002403|HP:0002506|HP:0002529|HP:0003676|HP:0007668|HP:0011999 97 | 607250 . HP:0000007|HP:0001251|HP:0001761|HP:0003376|HP:0003477|HP:0003693 98 | 607317 . HP:0000007|HP:0000252|HP:0000639|HP:0001251|HP:0001256|HP:0001257|HP:0001260|HP:0001270|HP:0001332|HP:0001336|HP:0001337|HP:0001347|HP:0001761|HP:0002066|HP:0002359|HP:0002380|HP:0002460|HP:0002500|HP:0003477|HP:0003487|HP:0003581|HP:0003693|HP:0003828|HP:0007338|HP:0008936|HP:0032105 99 | 607346 . HP:0000006|HP:0001260|HP:0001265|HP:0001272|HP:0001336|HP:0001347|HP:0002015|HP:0002066|HP:0002070|HP:0002073|HP:0002078|HP:0002174|HP:0002396|HP:0003677|HP:0007944|HP:0007979|HP:0100543 100 | 607454 . HP:0000006|HP:0000514|HP:0000639|HP:0000718|HP:0000741|HP:0001249|HP:0001260|HP:0001263|HP:0001265|HP:0001272|HP:0001300|HP:0002066|HP:0002070|HP:0002071|HP:0002073|HP:0002168|HP:0002174|HP:0002304|HP:0002396|HP:0003677|HP:0007792|HP:0010526|HP:0100543|HP:0100710 101 | 607458 . HP:0000006|HP:0000639|HP:0001265|HP:0001272|HP:0001284|HP:0001310|HP:0001337|HP:0001761|HP:0002075|HP:0003202|HP:0003390|HP:0003487|HP:0003674|HP:0003690|HP:0007240 102 | 608029 . HP:0000007|HP:0000750|HP:0001251|HP:0001252|HP:0001257|HP:0001270|HP:0001272|HP:0001290|HP:0001310|HP:0001347|HP:0001763|HP:0002066|HP:0002080|HP:0002312|HP:0003577|HP:0003680|HP:0004322 103 | 608687 . HP:0000006|HP:0000639|HP:0001260|HP:0001618|HP:0001620|HP:0002066|HP:0002070|HP:0002174|HP:0003581|HP:0003677|HP:0007256|HP:0007338|HP:0010530 104 | 608703 . HP:0000006|HP:0000012|HP:0000317|HP:0000486|HP:0000505|HP:0000639|HP:0000763|HP:0001251|HP:0001260|HP:0001272|HP:0001761|HP:0002013|HP:0002522|HP:0002650|HP:0003380|HP:0003487|HP:0006944|HP:0007328|HP:0007663|HP:0011468 105 | 608768 . HP:0000006|HP:0000514|HP:0000639|HP:0000641|HP:0000763|HP:0001257|HP:0001260|HP:0001272|HP:0001337|HP:0002015|HP:0002062|HP:0002073|HP:0002311|HP:0007256|HP:0007772|HP:0009830 106 | 609270 . HP:0000007|HP:0000639|HP:0000651|HP:0001152|HP:0001251|HP:0001260|HP:0001272|HP:0001347|HP:0002066|HP:0002070|HP:0002174|HP:0002312|HP:0002495|HP:0003487|HP:0003621|HP:0003677|HP:0007338 107 | 609306 . HP:0000006|HP:0000639|HP:0000641|HP:0001151|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0002078|HP:0002311|HP:0003581|HP:0003677 108 | 609307 . HP:0000006|HP:0000486|HP:0000640|HP:0000641|HP:0000716|HP:0001256|HP:0001260|HP:0001272|HP:0001425|HP:0001761|HP:0002066|HP:0002070|HP:0002078|HP:0002174|HP:0002310|HP:0002346|HP:0002354|HP:0002495|HP:0003390|HP:0003677|HP:0007772 109 | 610245 . 
HP:0000006|HP:0000514|HP:0001260|HP:0001271|HP:0001272|HP:0001274|HP:0001310|HP:0001337|HP:0001347|HP:0002066|HP:0002070|HP:0002166|HP:0002529|HP:0003487|HP:0003677|HP:0007141|HP:0007305 110 | 610246 . HP:0000006|HP:0000508|HP:0000514|HP:0000597|HP:0000640|HP:0000641|HP:0001257|HP:0001260|HP:0001272|HP:0001276|HP:0001300|HP:0001332|HP:0002066|HP:0002070|HP:0002395|HP:0003487|HP:0003677 111 | 610743 . HP:0000007|HP:0000639|HP:0001260|HP:0001272|HP:0001310|HP:0002066|HP:0002070|HP:0003581|HP:0003677 112 | 613371 . HP:0000006|HP:0000640|HP:0001251|HP:0001260|HP:0001272|HP:0003581|HP:0003587|HP:0003677|HP:0007338 113 | 613728 . HP:0000007|HP:0000639|HP:0001249|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0001761|HP:0002066|HP:0002070|HP:0002078|HP:0002080|HP:0002380|HP:0007338 114 | 613908 . HP:0000006|HP:0000315|HP:0000467|HP:0000473|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0002080|HP:0002311|HP:0002355|HP:0003487|HP:0003581|HP:0003677 115 | 613909 . HP:0000006|HP:0000027|HP:0000029|HP:0000789|HP:0001251|HP:0001272|HP:0100543 116 | 614153 . HP:0000006|HP:0000365|HP:0000508|HP:0000511|HP:0000514|HP:0000639|HP:0001252|HP:0001260|HP:0001272|HP:0001276|HP:0001308|HP:0001324|HP:0001347|HP:0002015|HP:0002066|HP:0002070|HP:0002078|HP:0002311|HP:0003202|HP:0003445|HP:0003487|HP:0003676|HP:0007772|HP:0012473 117 | 614229 . HP:0000007|HP:0000639|HP:0001260|HP:0001263|HP:0001272|HP:0002015|HP:0002070|HP:0002078|HP:0003677|HP:0007772|HP:0025356 118 | 614322 . HP:0000007|HP:0000252|HP:0000546|HP:0000640|HP:0000648|HP:0001249|HP:0001250|HP:0001257|HP:0001260|HP:0001263|HP:0001265|HP:0001272|HP:0001510|HP:0002066|HP:0002070|HP:0003487 119 | 614831 . HP:0000007|HP:0000508|HP:0000565|HP:0000571|HP:0000666|HP:0001249|HP:0001250|HP:0001260|HP:0001263|HP:0001272|HP:0001290|HP:0001310|HP:0001337|HP:0001347|HP:0001763|HP:0002075|HP:0002119|HP:0003593|HP:0003677|HP:0004322|HP:0006951|HP:0007068|HP:0007256 120 | 615386 . HP:0000007|HP:0000571|HP:0000639|HP:0000750|HP:0001257|HP:0001263|HP:0001272|HP:0001310|HP:0001347|HP:0002066|HP:0002075|HP:0002080|HP:0003593|HP:0003677|HP:0008003|HP:0100543 121 | 615705 . HP:0000007|HP:0000639|HP:0001249|HP:0001250|HP:0001251|HP:0001260|HP:0001265|HP:0001270|HP:0001272|HP:0001347|HP:0002317|HP:0003676 122 | 615768 . HP:0000007|HP:0000135|HP:0000544|HP:0000639|HP:0001257|HP:0001260|HP:0001272|HP:0001321|HP:0001337|HP:0002070|HP:0002078|HP:0002317|HP:0003676|HP:0011448|HP:0100543 123 | 615945 . HP:0000006|HP:0000639|HP:0001251|HP:0001260|HP:0001272|HP:0001337|HP:0002015|HP:0002317|HP:0002359|HP:0003677 124 | 615957 . HP:0000006|HP:0000514|HP:0000639|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0003477|HP:0003677 125 | 616053 . HP:0000006|HP:0001260|HP:0001310|HP:0001347|HP:0002075|HP:0002080|HP:0002136|HP:0002313|HP:0002317|HP:0003581|HP:0003677|HP:0006879 126 | 616127 . HP:0000007|HP:0001249|HP:0001260|HP:0001263|HP:0001290|HP:0001310|HP:0001321|HP:0001337|HP:0002078|HP:0002317|HP:0003593|HP:0003677|HP:0003680 127 | 616204 . HP:0000007|HP:0000543|HP:0000565|HP:0000639|HP:0000657|HP:0001260|HP:0001263|HP:0001272|HP:0001290|HP:0001310|HP:0001347|HP:0001371|HP:0002066|HP:0002075|HP:0002078|HP:0002311|HP:0002465|HP:0003487|HP:0003593|HP:0100543 128 | 616354 . 
HP:0000007|HP:0000158|HP:0000218|HP:0000280|HP:0000283|HP:0000286|HP:0000289|HP:0000343|HP:0000407|HP:0000463|HP:0000639|HP:0000678|HP:0000684|HP:0000729|HP:0000998|HP:0001156|HP:0001250|HP:0001251|HP:0001257|HP:0001263|HP:0001265|HP:0001272|HP:0001290|HP:0001321|HP:0001344|HP:0001762|HP:0002120|HP:0002186|HP:0002540|HP:0002650|HP:0003487|HP:0003593|HP:0004482|HP:0011220|HP:0012385|HP:0012471|HP:0012745|HP:0012810|HP:0030084 129 | 616410 . HP:0000006|HP:0001251|HP:0001272|HP:0002172|HP:0002317|HP:0003581|HP:0003676 130 | 616719 . HP:0000007|HP:0001256|HP:0001257|HP:0001265|HP:0001270|HP:0001272|HP:0001337|HP:0001347|HP:0001395|HP:0001399|HP:0001744|HP:0001762|HP:0002066|HP:0002240|HP:0002359|HP:0002936|HP:0007141 131 | 616795 . HP:0000006|HP:0000012|HP:0000020|HP:0000651|HP:0000666|HP:0000716|HP:0000716|HP:0001152|HP:0001260|HP:0001272|HP:0001337|HP:0001347|HP:0002015|HP:0002064|HP:0002317|HP:0002317|HP:0002497|HP:0003487|HP:0003677|HP:0006938|HP:0007001|HP:0031166|HP:0100543 132 | 616948 . HP:0000007|HP:0000639|HP:0001249|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0002061|HP:0002078|HP:0002079|HP:0002080|HP:0002317|HP:0003677|HP:0007256 133 | 616949 . HP:0000007|HP:0000252|HP:0001249|HP:0001250|HP:0001251|HP:0001290|HP:0001875|HP:0002902|HP:0003388|HP:0011675|HP:0100786 134 | 617018 . HP:0000006|HP:0000571|HP:0000639|HP:0000768|HP:0001260|HP:0001265|HP:0001272|HP:0001337|HP:0001761|HP:0002063|HP:0002066|HP:0002070|HP:0002936|HP:0003581|HP:0003677|HP:0003693|HP:0009763 135 | 617133 . HP:0000007|HP:0000518|HP:0000639|HP:0001260|HP:0001272|HP:0002064|HP:0002066|HP:0002070|HP:0003676 136 | 617584 . HP:0000007|HP:0000639|HP:0001263|HP:0001310|HP:0001321|HP:0002078|HP:0003680|HP:0031936|HP:0100543 137 | 617633 . HP:0000007|HP:0000639|HP:0000657|HP:0001260|HP:0001272|HP:0001284|HP:0001310|HP:0002015|HP:0002070|HP:0002075|HP:0002317|HP:0002403|HP:0002460|HP:0002936|HP:0003676|HP:0007141|HP:0007338 138 | 617691 . HP:0000006|HP:0000750|HP:0001257|HP:0001260|HP:0001270|HP:0001272|HP:0001310|HP:0002015|HP:0002066|HP:0002075|HP:0002359|HP:0003677|HP:0007338 139 | 617769 . HP:0000006|HP:0000639|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0003581|HP:0003677 140 | 617770 . HP:0000006|HP:0000514|HP:0000639|HP:0001260|HP:0001272|HP:0001310|HP:0002066|HP:0002070|HP:0002403|HP:0003390|HP:0003581|HP:0003677 141 | 617931 . HP:0000006|HP:0000218|HP:0000341|HP:0000369|HP:0000431|HP:0000508|HP:0000651|HP:0000750|HP:0001182|HP:0001249|HP:0001250|HP:0001257|HP:0001260|HP:0001270|HP:0001290|HP:0001310|HP:0001999|HP:0002072|HP:0003676|HP:0003829|HP:0004322|HP:0006855|HP:0030084|HP:0200055 142 | 618087 . HP:0000006|HP:0000252|HP:0000303|HP:0000316|HP:0000486|HP:0000490|HP:0000540|HP:0000582|HP:0000657|HP:0001007|HP:0001159|HP:0001249|HP:0001257|HP:0001263|HP:0001310|HP:0001321|HP:0001332|HP:0001347|HP:0002421|HP:0002540|HP:0003196|HP:0008070|HP:0008936|HP:0030084|HP:0040080|HP:0045025 143 | 618093 . HP:0000006|HP:0000020|HP:0000739|HP:0001260|HP:0001272|HP:0002015|HP:0002066|HP:0003676 144 | 209900 . 
HP:0000007|HP:0000054|HP:0000077|HP:0000135|HP:0000137|HP:0000148|HP:0000218|HP:0000256|HP:0000365|HP:0000483|HP:0000486|HP:0000501|HP:0000510|HP:0000518|HP:0000545|HP:0000546|HP:0000556|HP:0000639|HP:0000668|HP:0000678|HP:0000750|HP:0000819|HP:0000822|HP:0001007|HP:0001080|HP:0001156|HP:0001159|HP:0001162|HP:0001249|HP:0001251|HP:0001263|HP:0001328|HP:0001395|HP:0001513|HP:0001712|HP:0001769|HP:0001773|HP:0001829|HP:0002099|HP:0002141|HP:0002167|HP:0002251|HP:0002370|HP:0002705|HP:0008734|HP:0009466|HP:0009806|HP:0012393 145 | 600151 . HP:0000007|HP:0000089|HP:0000510|HP:0001156|HP:0001249|HP:0001263|HP:0001513|HP:0003241|HP:0005180|HP:0010442 146 | 605231 . HP:0000007|HP:0000047|HP:0000107|HP:0000510|HP:0000819|HP:0001159|HP:0001249|HP:0001513|HP:0003241|HP:0010442 147 | 615981 . HP:0000007|HP:0000135|HP:0000510|HP:0000546|HP:0000819|HP:0001162|HP:0001249|HP:0001263|HP:0001513|HP:0001631|HP:0001644|HP:0001647|HP:0001830|HP:0003241 148 | 615982 . HP:0000007|HP:0000028|HP:0000107|HP:0000135|HP:0000164|HP:0000510|HP:0000546|HP:0000662|HP:0001156|HP:0001159|HP:0001249|HP:0001513|HP:0003241|HP:0010442 149 | 615983 . HP:0000007|HP:0000135|HP:0000510|HP:0001156|HP:0001159|HP:0001513|HP:0003241|HP:0007754|HP:0010442|HP:0100543 150 | 615984 . HP:0000007|HP:0000510|HP:0001249|HP:0001513|HP:0003241|HP:0010442 151 | 615985 . HP:0000007|HP:0000047|HP:0000110|HP:0000135|HP:0000248|HP:0000510|HP:0001249|HP:0001263|HP:0001513|HP:0001696|HP:0010442|HP:0100543 152 | 615986 . HP:0000007|HP:0000510|HP:0001249|HP:0001513|HP:0003828|HP:0010442 153 | 615987 . HP:0000007|HP:0000083|HP:0000107|HP:0000135|HP:0000510|HP:0001513|HP:0010442|HP:0100543 154 | 615988 . HP:0000007|HP:0000077|HP:0000135|HP:0000488|HP:0001513|HP:0010442 155 | 615989 . HP:0000007|HP:0000077|HP:0000135|HP:0000510|HP:0001513|HP:0010442|HP:0100543 156 | 615990 . HP:0000007|HP:0000510|HP:0001249|HP:0001263|HP:0001513|HP:0010442 157 | 615991 . HP:0000007|HP:0000510|HP:0001249|HP:0001263|HP:0001513 158 | 615992 . HP:0000007 159 | 615993 . HP:0000007|HP:0000083|HP:0000104|HP:0000107|HP:0000110|HP:0000135|HP:0000365|HP:0000403|HP:0000510|HP:0000546|HP:0001249|HP:0001263|HP:0001513|HP:0002098|HP:0003241|HP:0011950|HP:0100543 160 | 615994 . HP:0000007|HP:0000107|HP:0000135|HP:0000546|HP:0000548|HP:0001156|HP:0001263|HP:0001513|HP:0001696|HP:0003241|HP:0003774|HP:0100260|HP:0100543 161 | 615995 . HP:0000007|HP:0000083|HP:0000510|HP:0000518|HP:0001156|HP:0001513|HP:0100543 162 | 615996 . HP:0000007|HP:0000083|HP:0000135|HP:0000510|HP:0001249|HP:0001513|HP:0003241|HP:0004409|HP:0010442 163 | 617119 . HP:0000007|HP:0000135|HP:0000252|HP:0000510|HP:0001249|HP:0001513|HP:0010442 164 | 617406 . HP:0000007|HP:0000085|HP:0000510|HP:0000545|HP:0000548|HP:0000618|HP:0000668|HP:0000750|HP:0001105|HP:0001133|HP:0001162|HP:0001513|HP:0002910|HP:0007750|HP:0030329|HP:0030483|HP:0030631 165 | -------------------------------------------------------------------------------- /tests/data/test.score-one-patient.txt: -------------------------------------------------------------------------------- 1 | 601382 . 
HP:0000007|HP:0001270|HP:0001425|HP:0001762|HP:0002460|HP:0002650|HP:0002936|HP:0003431|HP:0003693|HP:0003693|HP:0003701|HP:0006958|HP:0007208|HP:0010628 2 | -------------------------------------------------------------------------------- /tests/data/test.score-short.txt: -------------------------------------------------------------------------------- 1 | 118200 age=9.0;sex=female HP:0001263|HP:0001251|HP:0001290|HP:0004322 2 | 118210 age=4.0 HP:0001249|HP:0001263|HP:0001290 3 | 118211 . HP:0001249|HP:0001263|HP:0001290 4 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/tests/fixtures/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/get_data_dictionary.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.d2p import load as load_d2p 5 | from phenopy.network import annotate 6 | from phenopy.network import load as load_network 7 | from phenopy.score import Scorer 8 | from phenopy.util import generate_alternate_ids, read_phenotype_groups 9 | 10 | 11 | @pytest.fixture() 12 | def test_data(): 13 | data = {} 14 | data["parent_dir"] = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 15 | data["obo_file"] = os.path.join(data["parent_dir"], "data/hp.obo") 16 | data["hpo_network"] = load_network(data["obo_file"]) 17 | data["alt2prim"] = generate_alternate_ids(data["hpo_network"]) 18 | data["ages_distribution_file"] = os.path.join( 19 | data["parent_dir"], "data/phenotype_age.tsv" 20 | ) 21 | 22 | data["disease_to_phenotype_file"] = os.path.join( 23 | data["parent_dir"], "data/phenotype.hpoa" 24 | ) 25 | data["disease_records"], data["phenotype_to_diseases"] = load_d2p( 26 | data["disease_to_phenotype_file"], data["hpo_network"], data["alt2prim"] 27 | ) 28 | 29 | data["num_diseases_annotated"] = len(data["disease_records"]) 30 | data["hpo_network"] = annotate( 31 | data["hpo_network"], 32 | data["phenotype_to_diseases"], 33 | data["num_diseases_annotated"], 34 | data["alt2prim"], 35 | ) 36 | 37 | data["scorer"] = Scorer(data["hpo_network"], min_score_mask=None) 38 | data["disease_to_phenotype_output_file"] = os.path.join( 39 | data["parent_dir"], "data/phenotype.noparents.hpoa" 40 | ) 41 | 42 | data["phenotype_groups"] = read_phenotype_groups() 43 | 44 | return data 45 | -------------------------------------------------------------------------------- /tests/test_ic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.network import annotate 5 | from phenopy.network import load as load_network 6 | from phenopy.util import export_phenotype_hpoa_with_no_parents 7 | 8 | 9 | def test_ic_d2p(test_data): 10 | """Calculate the information content of a phenotype""" 11 | assert ( 12 | pytest.approx(test_data["hpo_network"].nodes["HP:0010863"]["ic"], 0.01) == 7.21 13 | ) 14 | 15 | 16 | def test_ic_custom(test_data): 17 | """ 18 | Calculate the information content of a phenotype when multiple 19 | annotations are present 20 | """ 21 | custom_annotation_file = os.path.join( 22 | test_data["parent_dir"], "data/test.score-long.txt" 23 | ) 24 | hpo_network = load_network(test_data["obo_file"]) 25 | hpo_network = annotate( 26 | hpo_network, 27 | 
test_data["phenotype_to_diseases"], 28 | test_data["num_diseases_annotated"], 29 | test_data["alt2prim"], 30 | annotations_file=custom_annotation_file, 31 | ) 32 | 33 | assert pytest.approx(hpo_network.nodes["HP:0010863"]["ic"], 0.01) == 8.11 34 | 35 | 36 | def test_ic_d2p_no_parents(test_data): 37 | export_phenotype_hpoa_with_no_parents( 38 | test_data["disease_to_phenotype_file"], 39 | test_data["disease_to_phenotype_output_file"], 40 | test_data["hpo_network"], 41 | ) 42 | assert os.path.exists(test_data["disease_to_phenotype_output_file"]) 43 | os.remove(test_data["disease_to_phenotype_output_file"]) 44 | -------------------------------------------------------------------------------- /tests/test_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.d2p import load as load_d2p 5 | from phenopy.network import load as load_network 6 | from phenopy.network import annotate 7 | from phenopy.util import generate_alternate_ids 8 | 9 | 10 | def test_load_network(test_data): 11 | hpo_network = load_network(test_data["obo_file"]) 12 | assert len(hpo_network) == 16861 13 | 14 | 15 | def test_annotate_network(test_data): 16 | hpo_network = load_network(test_data["obo_file"]) 17 | alt2prim = generate_alternate_ids(hpo_network) 18 | 19 | # load phenotypes to diseases associations 20 | disease_to_phenotype_file = os.path.join( 21 | test_data["parent_dir"], "data/phenotype.hpoa" 22 | ) 23 | disease_records, phenotype_to_diseases = load_d2p( 24 | disease_to_phenotype_file, hpo_network, alt2prim 25 | ) 26 | 27 | num_diseases_annotated = len(disease_records) 28 | hpo_network = annotate( 29 | hpo_network, phenotype_to_diseases, num_diseases_annotated, alt2prim 30 | ) 31 | 32 | assert pytest.approx(hpo_network.nodes["HP:0010863"]["ic"], 0.01) == 7.21 33 | assert pytest.approx(hpo_network.nodes["HP:0001263"]["ic"], 0.01) == 1.55 34 | -------------------------------------------------------------------------------- /tests/test_score.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import os 4 | import pandas as pd 5 | import pytest 6 | 7 | from phenopy.network import annotate 8 | from phenopy.score import Scorer 9 | from phenopy.util import ( 10 | remove_parents, 11 | parse_input, 12 | half_product, 13 | ) 14 | from phenopy.weights import calculate_age_weights 15 | 16 | 17 | def test_find_lca(test_data): 18 | lca = test_data["scorer"].find_lca("HP:0001249", "HP:0012434") 19 | assert lca == "HP:0012759" 20 | 21 | root_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0000001") 22 | assert root_lca == "HP:0000001" 23 | 24 | parent_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0012758") 25 | assert parent_lca == "HP:0012759" 26 | 27 | parent_lca = test_data["scorer"].find_lca("HP:0012758", "HP:0012759") 28 | assert parent_lca == "HP:0012759" 29 | 30 | parent_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0012759") 31 | assert parent_lca == "HP:0012759" 32 | 33 | parent_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0000750") 34 | assert parent_lca == "HP:0012759" 35 | 36 | 37 | def test_calculate_gamma(test_data): 38 | t1 = "HP:0012758" 39 | t2 = "HP:0012759" 40 | 41 | gamma0 = test_data["scorer"].calculate_gamma(t1, t1, t2) 42 | assert gamma0 == 0 43 | 44 | gamma1a = test_data["scorer"].calculate_gamma(t1, t2, t2) 45 | assert gamma1a == 1 46 | 47 | gamma1b = test_data["scorer"].calculate_gamma(t2, t1, t2) 48 | assert 
gamma1b == 1 49 | 50 | gamma2 = test_data["scorer"].calculate_gamma("HP:0000750", "HP:0012434", t1) 51 | assert gamma2 == 2 52 | 53 | 54 | def test_calculate_beta(test_data): 55 | t1 = "HP:0001344" 56 | t2 = "HP:0012759" 57 | beta = test_data["scorer"].calculate_beta(t1, t2) 58 | assert round(beta, 2) == 3.99 59 | 60 | 61 | def test_score_hpo_pair_hrss(test_data): 62 | t1 = "HP:0011351" 63 | t2 = "HP:0012434" 64 | 65 | score = test_data["scorer"].score_hpo_pair_hrss(t1, t2) 66 | assert round(score, 2) == 0.14 67 | 68 | score = test_data["scorer"].score_hpo_pair_hrss(t1, t2) 69 | assert round(score, 2) == 0.14 70 | 71 | score = test_data["scorer"].score_hpo_pair_hrss(t2, t1) 72 | assert round(score, 2) == 0.14 73 | 74 | 75 | def test_score(test_data): 76 | record_a = { 77 | "record_id": "sample_1", 78 | "terms": ["HP:0012433", "HP:0012434"], 79 | "weights": {}, 80 | } 81 | record_b = {"record_id": "sample_2", "terms": [], "weights": {}} 82 | 83 | score0 = test_data["scorer"].score(record_a, record_b) 84 | assert score0[2] == 0.0 85 | record_b["terms"] = ["HP:0001249", "HP:0012758"] 86 | 87 | score_bma = test_data["scorer"].score(record_a, record_b) 88 | assert round(score_bma[2], 2) == 0.09 89 | test_data["scorer"].summarization_method = "maximum" 90 | score_max = test_data["scorer"].score(record_a, record_b) 91 | assert round(score_max[2], 4) == 0.1251 92 | 93 | test_data["scorer"].summarization_method = "not_a_method" 94 | with pytest.raises(ValueError): 95 | test_data["scorer"].score(record_a, record_b) 96 | 97 | record_a.update( 98 | { 99 | "terms": [ 100 | "HP:0001251", 101 | "HP:0001263", 102 | "HP:0001290", 103 | "HP:0004322", 104 | "HP:0012433", 105 | ], 106 | "weights": {"age": [0.67, 1.0, 1.0, 0.4, 0.4]}, 107 | } 108 | ) 109 | record_b.update( 110 | { 111 | "terms": ["HP:0001249", "HP:0001263", "HP:0001290"], 112 | "weights": {"age": [1.0, 1.0, 1.0]}, 113 | } 114 | ) 115 | 116 | test_data["scorer"].summarization_method = "BMWA" 117 | test_data["scorer"].min_score_mask = 0.05 118 | score_bmwa = test_data["scorer"].score(record_a, record_b) 119 | assert round(score_bmwa[2], 4) == 0.1822 120 | 121 | record_a.update( 122 | { 123 | "terms": ["HP:0001251", "HP:0001263", "HP:0001290", "HP:0004322"], 124 | "weights": {"age": [0.67, 1.0, 1.0, 0.4]}, 125 | } 126 | ) 127 | record_b.update( 128 | { 129 | "terms": ["HP:0001263", "HP:0001249", "HP:0001290"], 130 | "weights": {"age": [1.0, 1.0, 0.5]}, 131 | } 132 | ) 133 | 134 | scorer = test_data["scorer"] 135 | scorer.summarization_method = "BMWA" 136 | 137 | score_bwma_both_weights = scorer.score(record_a, record_b) 138 | assert round(score_bwma_both_weights[2], 4) == 0.1918 139 | 140 | scorer.min_score_mask = None 141 | record_a["weights"].pop("age", None) 142 | score_bwma_one_weights = scorer.score(record_a, record_b) 143 | assert round(score_bwma_one_weights[2], 4) == 0.155 144 | 145 | 146 | def test_score_records(test_data): 147 | query_name = "SAMPLE" 148 | query_terms = [ 149 | "HP:0000750", 150 | "HP:0010863", 151 | ] 152 | input_records = [{"record_id": query_name, "terms": query_terms, "weights": {}}] 153 | score_records = test_data["disease_records"] 154 | 155 | results = test_data["scorer"].score_records( 156 | input_records, 157 | score_records, 158 | itertools.product(range(len(input_records)), range(len(score_records))), 159 | threads=1, 160 | ) 161 | assert len(results) == 8118 162 | assert round(float(results[0][2]), 2) == 0.04 163 | 164 | [record["weights"].pop("disease_frequency") for record in score_records] 165 | results = 
test_data["scorer"].score_records( 166 | input_records, 167 | score_records, 168 | itertools.product(range(len(input_records)), range(len(score_records))), 169 | threads=1, 170 | ) 171 | assert len(results) == 8118 172 | 173 | 174 | def test_no_parents(test_data): 175 | terms_a = ["HP:0012433", "HP:0000708"] 176 | terms_b = ["HP:0001249", "HP:0012758"] 177 | 178 | assert ( 179 | list(remove_parents(terms_a, test_data["scorer"].hpo_network))[0] 180 | == "HP:0012433" 181 | ) 182 | assert len(remove_parents(terms_b, test_data["scorer"].hpo_network)) == 2 183 | 184 | 185 | def test_score_self(test_data): 186 | records = parse_input( 187 | os.path.join(test_data["parent_dir"], "data/test.score-long.txt"), 188 | test_data["hpo_network"], 189 | test_data["alt2prim"], 190 | ) 191 | 192 | input_records = [x for x in records if x["record_id"] in ["213200", "302801"]] 193 | 194 | results = test_data["scorer"].score_records( 195 | input_records, 196 | input_records, 197 | half_product(len(input_records), len(input_records)), 198 | ) 199 | assert len(results) == 3 200 | 201 | assert round(float(results[1][2]), 2) == 0.1 202 | 203 | 204 | def test_bmwa(test_data): 205 | terms_a = ["HP:0001251", "HP:0001263", "HP:0001290", "HP:0004322"] 206 | 207 | terms_b = ["HP:0001263", "HP:0001249", "HP:0001290"] 208 | weights_a = {"age": [0.67, 1.0, 1.0, 0.4]} 209 | weights_b = {"age": [1.0, 1.0, 1.0]} 210 | 211 | df = pd.DataFrame( 212 | [ 213 | [4.22595743e-02, 3.92122308e-02, 3.04851573e-04], 214 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 215 | [3.69780479e-04, 3.78305515e-04, 4.64651944e-01], 216 | [4.17139800e-04, 4.12232546e-04, 3.67984322e-04], 217 | ], 218 | index=pd.Index(terms_a, name="a"), 219 | columns=pd.MultiIndex.from_arrays( 220 | [["score"] * len(terms_b), terms_b], names=[None, "b"] 221 | ), 222 | ) 223 | 224 | score_bmwa = test_data["scorer"].best_match_weighted_average( 225 | df, weights_a, weights_b 226 | ) 227 | 228 | assert round(score_bmwa, 4) == 0.3419 229 | 230 | weights_a = {"age": [1.0] * len(terms_a)} 231 | score_bmwa = test_data["scorer"].best_match_weighted_average( 232 | df, weights_a, weights_b 233 | ) 234 | assert round(score_bmwa, 4) == 0.2985 235 | 236 | weights_a = {"age": [1.0] * len(terms_a)} 237 | weights_b = {"age": [1.0] * len(terms_b)} 238 | test_data["scorer"].min_score_mask = None 239 | score_bmwa = test_data["scorer"].best_match_weighted_average( 240 | df, weights_a, weights_b 241 | ) 242 | assert round(score_bmwa, 4) == 0.2985 243 | 244 | terms_a = ["HP:0001251", "HP:0001249", "HP:0001263", "HP:0001290", "HP:0004322"] 245 | terms_b = ["HP:0001263", "HP:0001249", "HP:0001290"] 246 | 247 | df = pd.DataFrame( 248 | [ 249 | [4.22595743e-02, 3.92122308e-02, 3.04851573e-04], 250 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 251 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 252 | [3.69780479e-04, 3.78305515e-04, 4.64651944e-01], 253 | [4.17139800e-04, 4.12232546e-04, 3.67984322e-04], 254 | ], 255 | index=pd.Index(terms_a, name="a"), 256 | columns=pd.MultiIndex.from_arrays( 257 | [["score"] * len(terms_b), terms_b], names=[None, "b"] 258 | ), 259 | ) 260 | 261 | weights_a = {"age": [0.67, 0.4, 1.0, 1.0, 0.4]} 262 | weights_b = {"age": [1.0, 1.0, 1.0]} 263 | 264 | # compute pairwise best match weighted average 265 | test_data["scorer"].min_score_mask = None 266 | score_bmwa = test_data["scorer"].best_match_weighted_average( 267 | df, weights_a, weights_b 268 | ) 269 | 270 | assert round(score_bmwa, 3) == 0.352 271 | 272 | # because both patients were 
described to have ID, but only patient a 273 | # has ataxia, we mask good phenotype matches so they are not weighted 274 | # down; we therefore expect a better similarity score 275 | test_data["scorer"].min_score_mask = 0.05 276 | score_bmwa = test_data["scorer"].best_match_weighted_average( 277 | df, weights_a, weights_b 278 | ) 279 | 280 | assert round(score_bmwa, 3) == 0.365 281 | 282 | 283 | def test_age_weight(test_data): 284 | # Test age-based weight distribution and best_match_weighted_average calculation 285 | 286 | terms_a = [ 287 | "HP:0001251", 288 | "HP:0001263", 289 | "HP:0001290", 290 | "HP:0004322", 291 | ] # ATAX, DD, HYP, SS 292 | terms_b = ["HP:0001263", "HP:0001249", "HP:0001290"] # DD, ID, HYP 293 | 294 | test_data["hpo_network"] = annotate( 295 | test_data["hpo_network"], 296 | test_data["phenotype_to_diseases"], 297 | test_data["num_diseases_annotated"], 298 | test_data["alt2prim"], 299 | ages_distribution_file=test_data["ages_distribution_file"], 300 | ) 301 | 302 | age_a = 9.0 303 | age_b = 4.0 304 | 305 | # calculate weights based on each patient's age 306 | weights_a = {"age": calculate_age_weights(terms_a, age_b, test_data["hpo_network"])} 307 | weights_b = {"age": calculate_age_weights(terms_b, age_a, test_data["hpo_network"])} 308 | 309 | # make pairwise scores matrix 310 | df = pd.DataFrame( 311 | [ 312 | [4.22595743e-02, 3.92122308e-02, 3.04851573e-04], 313 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 314 | [3.69780479e-04, 3.78305515e-04, 4.64651944e-01], 315 | [4.17139800e-04, 4.12232546e-04, 3.67984322e-04], 316 | ], 317 | index=pd.Index(terms_a, name="a"), 318 | columns=pd.MultiIndex.from_arrays( 319 | [["score"] * len(terms_b), terms_b], names=[None, "b"] 320 | ), 321 | ) 322 | # compute pairwise best match weighted average 323 | score_bmwa = test_data["scorer"].best_match_weighted_average( 324 | df, weights_a, weights_b 325 | ) 326 | 327 | assert pytest.approx(float(score_bmwa), 0.01) == 0.3742 328 | 329 | # set all weights to 1.0, result should be the same as BMA without weights 330 | weights_a = {"disease_frequency": [1.0] * len(terms_a)} 331 | weights_b = {"disease_frequency": [1.0] * len(terms_b)} 332 | score_bmwa = test_data["scorer"].best_match_weighted_average( 333 | df, weights_a, weights_b 334 | ) 335 | 336 | assert pytest.approx(float(score_bmwa), 0.01) == 0.2985 337 | 338 | # test a term not in the network 339 | terms_a = ["HP:Not_a_term"] 340 | weights_a = calculate_age_weights(terms_a, age_b, test_data["hpo_network"]) 341 | assert weights_a == [1.0] 342 | 343 | # a term in the network, but with no age data 344 | terms_a = ["HP:0000001"] 345 | weights_a = calculate_age_weights(terms_a, age_b, test_data["hpo_network"]) 346 | assert weights_a == [1.0] 347 | 348 | 349 | def test_score_pairs_age(test_data): 350 | # Test reading in a records file and calculating pairwise scores 351 | # read in records 352 | test_data["hpo_network"] = annotate( 353 | test_data["hpo_network"], 354 | test_data["phenotype_to_diseases"], 355 | test_data["num_diseases_annotated"], 356 | test_data["alt2prim"], 357 | ages_distribution_file=test_data["ages_distribution_file"], 358 | ) 359 | 360 | records = parse_input( 361 | os.path.join(test_data["parent_dir"], "data/test.score-short.txt"), 362 | test_data["hpo_network"], 363 | test_data["alt2prim"], 364 | ) 365 | 366 | # create an instance of the Scorer class 367 | scorer = Scorer( 368 | test_data["hpo_network"], summarization_method="BMWA", min_score_mask=None 369 | ) 370 | 371 | # select which patients to test in pairwise 
best_match_weighted_average 372 | input_records = [x for x in records if x["record_id"] in ["118200", "118210"]] 373 | 374 | results = scorer.score_records( 375 | input_records, 376 | input_records, 377 | [ 378 | (0, 1), 379 | ], 380 | ) 381 | assert len(results) == 1 382 | 383 | # the expected answer 384 | answer = np.average( 385 | [0.017, 0.231, 0.325, 0.0, 0.042, 0.231, 0.325], 386 | weights=[0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0], 387 | ) 388 | 389 | assert pytest.approx(float(results[0][2]), 0.01) == answer 390 | 391 | # Test identical records for which one age exists and one doesn't 392 | input_records = [x for x in records if x["record_id"] in ["118210", "118211"]] 393 | 394 | results = scorer.score_records( 395 | input_records, 396 | input_records, 397 | [ 398 | (0, 1), 399 | ], 400 | ) 401 | assert len(results) == 1 402 | 403 | # the expected answer 404 | answer = np.average([0.226, 0.231, 0.325], weights=[0.481, 1.0, 1.0]) 405 | 406 | assert pytest.approx(float(results[0][2]), 0.1) == answer 407 | 408 | 409 | def test_alpha_zero(test_data): 410 | """the root term should contain all diseases, therefore its IC should be zero""" 411 | 412 | root_term_ic = test_data["hpo_network"].nodes["HP:0000118"]["ic"] 413 | assert root_term_ic == 0.0 414 | 415 | 416 | def test_leaves_diff_branches_score_zero(test_data): 417 | """two leaves in different branches: 418 | both are leaves, therefore beta is zero; 419 | different branches, therefore alpha is zero; 420 | define I = (0.0 / (0.0 + 0.0)) as zero and not nan""" 421 | term_a = "HP:0001290" 422 | term_b = "HP:0011351" 423 | 424 | score_two_leaves_diff_branches = test_data["scorer"].score_hpo_pair_hrss( 425 | term_a, term_b 426 | ) 427 | assert score_two_leaves_diff_branches == 0.0 428 | 429 | 430 | def test_score_hrss_basic(test_data): 431 | test_data["scorer"].scoring_method = "HRSS" 432 | terms_a = ["HP:0001290", "HP:0000118"] 433 | terms_b = ["HP:0001290", "HP:0011351"] 434 | 435 | assert pytest.approx(0.162, 0.01) == test_data["scorer"].score_term_sets_basic( 436 | terms_a, terms_b 437 | ) 438 | 439 | 440 | def test_score_resnik_basic(test_data): 441 | test_data["scorer"].scoring_method = "Resnik" 442 | terms_a = ["HP:0001290", "HP:0000118"] 443 | terms_b = ["HP:0001290", "HP:0011351"] 444 | assert pytest.approx(1.283, 0.01) == test_data["scorer"].score_term_sets_basic( 445 | terms_a, terms_b 446 | ) 447 | 448 | 449 | def test_score_jaccard_basic(test_data): 450 | test_data["scorer"].scoring_method = "Jaccard" 451 | terms_a = ["HP:0001290", "HP:0000118"] 452 | terms_b = ["HP:0001290", "HP:0011351"] 453 | 454 | assert pytest.approx(0.333, 0.01) == test_data["scorer"].score_term_sets_basic( 455 | terms_a, terms_b 456 | ) 457 | 458 | 459 | def test_score_word2vec_basic(test_data): 460 | test_data["scorer"] = Scorer(test_data["hpo_network"], scoring_method="word2vec") 461 | terms_a = ["HP:0001290", "HP:0000118"] 462 | terms_b = ["HP:0001290", "HP:0011351"] 463 | 464 | assert pytest.approx( 465 | test_data["scorer"].score_term_sets_basic(terms_a, terms_b), 0.01 466 | ) == pytest.approx(0.156, 0.01) 467 | 468 | 469 | def test_score_word2vec_out_of_vocab(test_data): 470 | test_data["scorer"] = Scorer(test_data["hpo_network"], scoring_method="word2vec") 471 | terms_a = ["HP:NOT_A_TERM", "HP:0000118"] 472 | terms_b = ["HP:0001290", "NOT_A_TERM"] 473 | 474 | assert pytest.approx( 475 | test_data["scorer"].score_term_sets_basic(terms_a, terms_b), 0.01 476 | ) == pytest.approx(0.063, 0.01) 477 | 478 | 479 | def test_score_word2vec_empty(test_data): 
test_data["scorer"] = Scorer(test_data["hpo_network"], scoring_method="word2vec") 481 | terms_a = [] 482 | terms_b = ["HP:0001290", "HP:0011351"] 483 | 484 | assert test_data["scorer"].score_term_sets_basic(terms_a, terms_b) == 0.0 485 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.util import parse, read_records_file, encode_phenotypes, parse_input 5 | 6 | 7 | def test_parse(test_data): 8 | string = "age=13;sex=Male" 9 | assert parse(string, what="sex") == "Male" 10 | assert parse(string, what="age") == 13.0 11 | 12 | string = "age=13.64;sex=male" 13 | assert parse(string, what="sex") == "Male" 14 | assert parse(string, what="age") == 13.6 15 | 16 | string = "age=12.9;sex=female" 17 | assert parse(string, what="sex") == "Female" 18 | assert parse(string, what="age") == 12.9 19 | 20 | string = "sex=Female" 21 | assert parse(string, what="sex") == "Female" 22 | 23 | string = "sex=FEMALE" 24 | assert parse(string, what="sex") == "Female" 25 | 26 | string = "sex=F" 27 | assert parse(string, what="sex") == "Female" 28 | 29 | string = "age=1" 30 | assert parse(string, what="age") == 1.0 31 | 32 | string = "." 33 | assert not parse(string, what="age") 34 | 35 | string = ". " 36 | assert not parse(string, what="age") 37 | 38 | string = " . " 39 | assert not parse(string, what="age") 40 | 41 | string = "13?" 42 | assert not parse(string, what="age") 43 | 44 | string = "sex=NA" 45 | assert not parse(string, what="sex") 46 | 47 | string = "sex=Unknown" 48 | assert not parse(string, what="sex") 49 | 50 | 51 | def test_encode_phenotypes_file(test_data): 52 | input_file = os.path.join(test_data["parent_dir"], "data/test.score-short.txt") 53 | records = parse_input(input_file, test_data["hpo_network"], test_data["alt2prim"]) 54 | encoded_phenotypes = encode_phenotypes( 55 | [record["terms"] for record in records], 56 | test_data["phenotype_groups"], 57 | test_data["hpo_network"], 58 | test_data["alt2prim"], 59 | ) 60 | assert sum(encoded_phenotypes[0]) == 4 61 | 62 | 63 | def test_encode_1d_phenotypes(test_data): 64 | phenotypes = ["HP:0012759", "HP:0003011", "HP:0011442"] 65 | encoded_phenotypes = encode_phenotypes( 66 | phenotypes, 67 | test_data["phenotype_groups"], 68 | test_data["hpo_network"], 69 | test_data["alt2prim"], 70 | k=1000, 71 | ) 72 | assert sum(encoded_phenotypes) == 3 73 | 74 | 75 | def test_encode_2d_phenotypes(test_data): 76 | phenotypes = [ 77 | ["HP:0012759", "HP:0003011", "HP:0011442"], 78 | ["HP:0012759", "HP:0003011"], 79 | ] 80 | encoded_phenotypes = encode_phenotypes( 81 | phenotypes, 82 | test_data["phenotype_groups"], 83 | test_data["hpo_network"], 84 | test_data["alt2prim"], 85 | k=1000, 86 | ) 87 | assert sum(encoded_phenotypes[1]) == 2 88 | 89 | 90 | def test_read_records_file(test_data): 91 | with pytest.raises(SystemExit) as se: 92 | read_records_file("notafilepath/notafile") 93 | 94 | assert se.type == SystemExit 95 | assert se.value.code == 1 96 | 97 | records_truth = [ 98 | { 99 | "sample": "118200", 100 | "age": 9.0, 101 | "gender": "Female", 102 | "terms": "HP:0001263|HP:0001251|HP:0001290|HP:0004322".split("|"), 103 | }, 104 | { 105 | "sample": "118210", 106 | "age": 4.0, 107 | "gender": None, 108 | "terms": "HP:0001249|HP:0001263|HP:0001290".split("|"), 109 | }, 110 | { 111 | "sample": "118211", 112 | "age": None, 113 | "gender": None, 114 | "terms": 
"HP:0001249|HP:0001263|HP:0001290".split("|"), 115 | }, 116 | ] 117 | records_path = os.path.join( 118 | os.path.dirname(os.path.realpath(__file__)), "data/test.score-short.txt" 119 | ) 120 | records = read_records_file(records_path, no_parents=False) 121 | assert records == records_truth 122 | -------------------------------------------------------------------------------- /tests/test_weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import pandas as pd 4 | from phenopy.config import logger 5 | from phenopy.weights import ( 6 | get_truncated_normal, 7 | hpo_age_to_weight, 8 | make_age_distributions, 9 | get_empirical_cdf, 10 | ) 11 | 12 | 13 | def test_age_weights(test_data): 14 | assert hpo_age_to_weight(test_data["hpo_network"], "HP:0001251", 9.0) == 1.0 15 | assert ( 16 | pytest.approx( 17 | hpo_age_to_weight(test_data["hpo_network"], "HP:0001251", 5.0), 0.01 18 | ) 19 | == 1.0 20 | ) 21 | 22 | 23 | def test_make_age_distributions(test_data): 24 | with pytest.raises(SystemExit) as se: 25 | make_age_distributions("notafilepath/notafile") 26 | 27 | assert se.type == SystemExit 28 | assert se.value.code == 1 29 | 30 | with pytest.raises(SystemExit) as se: 31 | make_age_distributions("notafilepath/notafile", logger=logger) 32 | 33 | assert se.type == SystemExit 34 | assert se.value.code == 1 35 | 36 | ages_truth = pd.DataFrame( 37 | [ 38 | { 39 | "hpid": "HP:0001251", 40 | "age_dist": get_truncated_normal(6.0, 3.0, 0.0, 6.0), 41 | }, 42 | { 43 | "hpid": "HP:0001263", 44 | "age_dist": get_truncated_normal(1.0, 1.0, 0.0, 1.0), 45 | }, 46 | { 47 | "hpid": "HP:0001290", 48 | "age_dist": get_truncated_normal(1.0, 1.0, 0.0, 1.0), 49 | }, 50 | { 51 | "hpid": "HP:0004322", 52 | "age_dist": get_truncated_normal(10.0, 3.0, 0.0, 10.0), 53 | }, 54 | { 55 | "hpid": "HP:0001249", 56 | "age_dist": get_truncated_normal(6.0, 3.0, 0.0, 6.0), 57 | }, 58 | ] 59 | ).set_index("hpid") 60 | 61 | phenotype_ages_file = os.path.join( 62 | os.path.dirname(os.path.realpath(__file__)), "data/phenotype_age.tsv" 63 | ) 64 | df = make_age_distributions(phenotype_ages_file) 65 | assert set(ages_truth.index) == set(df.index) 66 | 67 | for hpid in ages_truth.index: 68 | assert pytest.approx( 69 | ages_truth.loc[hpid]["age_dist"].mean(), 0.1 70 | ) == pytest.approx(df.loc[hpid]["age_dist"].mean(), 0.1) 71 | 72 | 73 | def test_get_truncated_normal(test_data): 74 | distribution = get_truncated_normal(mean=6.0, sd=1.0, lower=0.0, upper=6.0) 75 | 76 | assert pytest.approx(distribution.mean(), 0.01) == 5.20 77 | assert pytest.approx(get_empirical_cdf(3, distribution), 0.1) == 0.0027 78 | assert pytest.approx(get_empirical_cdf(12, distribution), 0.01) == 1.0 79 | --------------------------------------------------------------------------------