├── .gitattributes
├── .github
│   └── workflows
│       ├── pythonpackage.yml
│       ├── pythonpublish.yml
│       └── ruff.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── conftest.py
├── notebooks
│   ├── clustering.ipynb
│   └── output
│       └── cluster_three_diseases.png
├── phenopy
│   ├── __init__.py
│   ├── __main__.py
│   ├── build_hpo.py
│   ├── config.py
│   ├── d2p.py
│   ├── data
│   │   ├── lgb.model.pkl
│   │   ├── oa_phenotype_age.tsv
│   │   ├── phenopy.wv.model.txt.gz
│   │   └── phenotype_groups.txt
│   ├── ic.py
│   ├── network.py
│   ├── score.py
│   ├── util.py
│   └── weights.py
├── phenoseries
│   ├── __init__.py
│   ├── experiment.py
│   └── phenoseries.requirements.txt
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── data
    │   ├── hp.obo
    │   ├── phenotype.hpoa
    │   ├── phenotype_age.tsv
    │   ├── test.score-long.txt
    │   ├── test.score-one-patient.txt
    │   └── test.score-short.txt
    ├── fixtures
    │   ├── __init__.py
    │   └── get_data_dictionary.py
    ├── test_ic.py
    ├── test_network.py
    ├── test_score.py
    ├── test_util.py
    └── test_weights.py

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.py linguist-language=python
*.ipynb linguist-documentation

--------------------------------------------------------------------------------
/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
name: Python package

on: [push]
jobs:
  build:
    timeout-minutes: 60
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 4
      matrix:
        python-version: [3.9]
    steps:
    - uses: actions/checkout@v1
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install and configure Poetry
      uses: snok/install-poetry@v1
      with:
        version: 1.4.2
        virtualenvs-create: true
        virtualenvs-in-project: false
        virtualenvs-path: ~/my-custom-path
        installer-parallel: true
    - name: Install dependencies
      run: |
        poetry install
    - name: Test with pytest
      run: |
        poetry run pytest --cov=phenopy --cov-report=xml
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v1.0.2
      with:
        token: ${{secrets.CODECOV_TOKEN}}
        file: ./coverage.xml
        flags: unittests

--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
name: Upload Python Package

on:
  release:
    types: [created]

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v1
    - name: Set up Python
      uses: actions/setup-python@v1
      with:
        python-version: '3.9'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install pipenv
        pipenv install --dev
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        pipenv run python setup.py sdist bdist_wheel
        pipenv run twine upload dist/*

--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
name: Ruff
on: [push, pull_request]
jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: chartboost/ruff-action@v1
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | 117 | # PyCharm 118 | .idea/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.12.0 10 | hooks: 11 | - id: black 12 | args: [--line-length=88] 13 | - repo: https://github.com/charliermarsh/ruff-pre-commit 14 | rev: "v0.0.264" 15 | hooks: 16 | - id: ruff 17 | args: [--line-length=88] 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # source image 2 | FROM python:3.9 3 | 4 | # set noninterative mode 5 | ENV DEBIAN_FRONTEND noninteractive 6 | 7 | # apt update and install global requirements 8 | RUN apt-get clean all && \ 9 | apt-get update && \ 10 | apt-get upgrade -y && \ 11 | apt-get install -y \ 12 | build-essential 13 | 14 | # apt clean and remove cached source lists 15 | RUN apt-get clean && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | # install pipenv 19 | RUN pip install pipenv --upgrade 20 | 21 | # copy app code 22 | COPY . 
/app 23 | WORKDIR /app 24 | 25 | # install python requirements 26 | RUN pipenv install --dev --deploy --system 27 | 28 | # install phenopy 29 | RUN pip install . 30 | 31 | # default command 32 | CMD ["phenopy"] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. 
More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. 
Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. 
Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include phenopy/data/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![python-version](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/) 2 | [![github-actions](https://github.com/GeneDx/phenopy/workflows/Python%20package/badge.svg)](https://github.com/GeneDx/phenopy/actions) 3 | [![codecov](https://codecov.io/gh/GeneDx/phenopy/branch/develop/graph/badge.svg)](https://codecov.io/gh/GeneDx/phenopy) 4 | [![DOI](https://zenodo.org/badge/207335538.svg)](https://zenodo.org/badge/latestdoi/207335538) 5 | 6 | # phenopy 7 | `phenopy` was developed using Python 3.9 and functions to perform phenotype similarity scoring by semantic similarity. `phenopy` is a 8 | lightweight but highly optimized command line tool and library to efficiently perform semantic similarity scoring on 9 | generic entities with phenotype annotations from the [Human Phenotype Ontology (HPO)](https://hpo.jax.org/app/). 10 | 11 | ![Phenotype Similarity Clustering](https://raw.githubusercontent.com/GeneDx/phenopy/develop/notebooks/output/cluster_three_diseases.png) 12 | 13 | ## Installation 14 | Install using pip: 15 | ```bash 16 | pip install phenopy 17 | ``` 18 | 19 | Install from GitHub: 20 | ```bash 21 | git clone https://github.com/GeneDx/phenopy.git 22 | cd phenopy 23 | pipx install poetry 24 | poetry install 25 | ``` 26 | 27 | ## Command Line Usage 28 | ### score 29 | `phenopy` is primarily used as a command line tool. An entity, as described here, is presented as a sample, gene, or 30 | disease, but could be any concept that warrants annotation of phenotype terms. 31 | 32 | Use `phenopy score` to perform semantic similarity scoring in various formats. 
Write the results of any command to a file using `--output-file=/path/to/output_file.txt`.

1. Score similarity of entities defined by the HPO terms from an input file against all the OMIM diseases in
   `.phenopy/data/phenotype.hpoa`. We provide a test input file in the repo. The default summarization method is
   `--summarization-method=BMWA`, which weighs each disease's phenotypes by the frequency with which a phenotype is
   seen in that particular disease.
    ```bash
    phenopy score tests/data/test.score-short.txt
    ```
    Output:
    ```
    #query	entity_id	score
    118200	210100	0.0
    118200	615779	0.0
    118200	613266	0.0052
    ...
    ```

2. Score similarity of entities defined by the HPO terms from an input file against all the OMIM diseases in
   `.phenopy/data/phenotype.hpoa`. To use the non-weighted summarization method, pass `--summarization-method=BMA`,
   which uses a traditional *best-match average* summarization of semantic similarity scores when comparing terms
   from record *a* with terms from record *b*.
    ```bash
    phenopy score tests/data/test.score-short.txt --summarization-method=BMA
    ```
    Output:
    ```
    #query	entity_id	score
    118200	210100	0.0
    118200	615779	0.0
    118200	613266	0.0052
    ...
    ```

3. Score similarity of entities defined by the HPO terms from an input file against a custom list of entities with
   HPO annotations, referred to as the `--records-file`. Both files are in the same format.
    ```bash
    phenopy score tests/data/test.score-short.txt --records-file tests/data/test.score-long.txt
    ```
    Output:
    ```
    #query	entity_id	score
    118200	118200	0.0169
    118200	300905	0.0156
    118200	601098	0.0171
    ...
    ```

4. Score pairwise similarity of entities defined by the HPO terms from an input file using `--self`.
    ```bash
    phenopy score tests/data/test.score-long.txt --threads 4 --self
    ```
    Output:
    ```
    #query	entity_id	score
    118200	118200	0.2284
    118200	118210	0.1302
    118200	118211	0.1302
    118210	118210	0.2048
    118210	118211	0.2048
    118211	118211	0.2048
    ```

5. Score age-adjusted pairwise similarity of entities defined in the input file, using the phenotype mean ages and
   standard deviations defined in the `--ages_distribution_file`, and select best-match weighted average as the
   summarization method with `--summarization-method BMWA`.
    ```bash
    phenopy score tests/data/test.score-short.txt --ages_distribution_file tests/data/phenotype_age.tsv --summarization-method BMWA --threads 4 --self
    ```
    Output:
    ```
    #query	entity_id	score
    118200	210100	0.0
    118200	177650	0.0127
    118200	241520	0.0
    ...
    ```

   The phenotype age file contains the HPO id, mean age, and standard deviation as tab-separated text, as follows:

   | HPO id     | mean | std |
   |------------|------|-----|
   | HP:0001251 | 6.0  | 3.0 |
   | HP:0001263 | 1.0  | 1.0 |
   | HP:0001290 | 1.0  | 1.0 |
   | HP:0004322 | 10.0 | 3.0 |
   | HP:0001249 | 6.0  | 3.0 |

   If no phenotype ages file is provided, `--summarization-method=BMWA` can still be selected to use default,
   open-access literature-derived phenotype ages (~1,400 age-weighted phenotypes).
    ```bash
    phenopy score tests/data/test.score-short.txt --summarization-method BMWA --threads 4
    ```
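For intuition about the two summarization methods: both start from the pairwise HRSS matrix between the two term sets. BMA averages each term's single best match, while BMWA averages those same best matches under weights (phenotype frequencies, or age-derived weights when an ages file is in play). The sketch below illustrates the idea only; it is not phenopy's internal implementation, and how phenopy assigns weights to each side of the comparison may differ.

```python
import numpy as np

def best_match_weighted_average(sim, weights_a, weights_b):
    """Best-match weighted average over a pairwise similarity matrix."""
    best_a = sim.max(axis=1)  # best match for each of record a's terms
    best_b = sim.max(axis=0)  # best match for each of record b's terms
    scores = np.concatenate([best_a, best_b])
    weights = np.concatenate([weights_a, weights_b])
    return float(np.average(scores, weights=weights))

# toy 2x3 similarity matrix between two records' term sets
sim = np.array([[0.8, 0.1, 0.3],
                [0.2, 0.6, 0.4]])

# with all-ones weights this reduces to plain best-match average (BMA)
print(best_match_weighted_average(sim, np.ones(2), np.ones(3)))
# down-weighting rarely observed phenotypes changes the summary score
print(best_match_weighted_average(sim, np.array([1.0, 0.5]), np.array([1.0, 1.0, 0.2])))
```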
#### Parameters
For a full list of command arguments use `phenopy [subcommand] --help`:
```bash
phenopy score --help
```
Output:
```
--output_file=OUTPUT_FILE
    File path where to store the results. [default: - (stdout)]
--records_file=RECORDS_FILE
    An entity-to-phenotype annotation file in the same format as "input_file". This file, if provided, is used to score entries in the "input_file" against entries here. [default: None]
--annotations_file=ANNOTATIONS_FILE
    An entity-to-phenotype annotation file in the same format as "input_file". This file, if provided, is used to add information content to the network. [default: None]
--ages_distribution_file=AGES_DISTRIBUTION_FILE
    Phenotypes age summary stats file containing phenotype HPO id, mean_age, and std. [default: None]
--self=SELF
    Score entries in the "input_file" against itself.
--summarization_method=SUMMARIZATION_METHOD
    The method used to summarize the HRSS matrix. Supported values are best match average (BMA), best match weighted average (BMWA), and maximum (maximum). [default: BMWA]
--threads=THREADS
    Number of parallel processes to use. [default: 1]
```

## Library Usage

The `phenopy` library can be used as a `Python` module, allowing more control for advanced users.

### score

**Generate the HPO network and supporting objects**:

```python
import os
from phenopy.build_hpo import generate_annotated_hpo_network
from phenopy.score import Scorer

# data directory
phenopy_data_directory = os.path.join(os.getenv('HOME'), '.phenopy/data')

# files used in building the annotated HPO network
obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

# if you have a custom ages_distribution_file, you can set it here.
ages_distribution_file = os.path.join(phenopy_data_directory, 'xa_age_stats_oct052019.tsv')

hpo_network, alt2prim, disease_records = \
    generate_annotated_hpo_network(obo_file,
                                   disease_to_phenotype_file,
                                   ages_distribution_file=ages_distribution_file
                                   )
```

**Then, instantiate the `Scorer` class and score HPO term lists.**

```python
scorer = Scorer(hpo_network)

terms_a = ['HP:0001263', 'HP:0011839']
terms_b = ['HP:0001263', 'HP:0000252']

print(scorer.score_term_sets_basic(terms_a, terms_b))
```

Output:

```
0.11213185474495047
```
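The same building blocks the command line tool uses are importable, so record-level pairwise scoring (what `phenopy score --self` does) can be reproduced directly. A sketch that continues from the objects built above (`hpo_network`, `alt2prim`) and uses a test file shipped in the repo; each result row holds the query id, the entity id, and the score:

```python
from phenopy.score import Scorer
from phenopy.util import parse_input, half_product

scorer = Scorer(hpo_network)

# parse an entity-to-phenotype input file (alternate HPO ids are mapped to primary ids)
input_records = parse_input('tests/data/test.score-long.txt', hpo_network, alt2prim)

# generate each unordered pair of record indices once, then score with a single process
scoring_pairs = half_product(len(input_records), len(input_records))
results = scorer.score_records(input_records, input_records, scoring_pairs, 1)

for query, entity_id, score in results:
    print(f'{query}\t{entity_id}\t{score}')
```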
### miscellaneous

The library can be used to prune parent phenotypes from the `phenotype.hpoa` file and store the pruned annotations as a new file:

```python
from phenopy.util import export_phenotype_hpoa_with_no_parents

# saves a new file of phenotype disease annotations with parent HPO terms removed from phenotype lists.
disease_to_phenotype_no_parents_file = os.path.join(phenopy_data_directory, 'phenotype.noparents.hpoa')
export_phenotype_hpoa_with_no_parents(disease_to_phenotype_file, disease_to_phenotype_no_parents_file, hpo_network)
```


## Initial setup
phenopy is designed to run with minimal setup from the user. To run phenopy with default parameters (recommended), skip ahead
to [Command Line Usage](#command-line-usage).

This section provides details about where phenopy stores data resources and config files. The following occurs when
you run phenopy for the first time.
1. phenopy creates a `.phenopy/` directory in your home folder and downloads external resources from HPO into the
`$HOME/.phenopy/data/` directory.
2. phenopy creates a `$HOME/.phenopy/phenopy.ini` config file where users can set variables for phenopy to use
at runtime.

## Config
While we recommend using the default settings for most users, the config file *can be* modified: `$HOME/.phenopy/phenopy.ini`.

To run phenopy with a different version of `hp.obo`, set the path of `obo_file` in `$HOME/.phenopy/phenopy.ini`.

## Contributing
We welcome contributions from the community. Please follow these steps to set up a local development environment.
```bash
pipenv install --dev
```

To run tests locally:
```bash
pipenv shell
coverage run --source=. -m unittest discover --start-directory tests/
coverage report -m
```

## References
The underlying algorithm which determines the semantic similarity for any two HPO terms is based on an implementation of HRSS, [published here](https://www.ncbi.nlm.nih.gov/pubmed/23741529).
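HRSS is driven by each term's information content. In `phenopy/ic.py` (shown later in this repository), a term's information content is the negative natural log of the fraction of corpus diseases annotated to that term or its descendants, with add-one smoothing, so rarer phenotypes carry more information. A worked instance of that formula with illustrative counts:

```python
import numpy as np

SMOOTH = 1
n_unique_diseases = 10          # diseases annotated to the term or its descendants
num_diseases_annotated = 8000   # diseases in the whole annotation corpus

ic = -np.log((n_unique_diseases + SMOOTH) / float(num_diseases_annotated + SMOOTH))
print(round(ic, 2))  # 6.59
```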
## Citing Phenopy
Please use the following BibTeX to cite this software.
```
@software{arvai_phenopy_2019,
  title = {Phenopy},
  rights = {Attribution-NonCommercial-ShareAlike 4.0 International},
  url = {https://github.com/GeneDx/phenopy},
  abstract = {Phenopy is a Python package to perform phenotype similarity scoring by semantic similarity.
  Phenopy is a lightweight but highly optimized command line tool and library to efficiently perform semantic
  similarity scoring on generic entities with phenotype annotations from the Human Phenotype Ontology (HPO).},
  version = {0.3.0},
  author = {Arvai, Kevin and Borroto, Carlos and Gainullin, Vladimir and Retterer, Kyle},
  date = {2019-11-05},
  year = {2019},
  doi = {10.5281/zenodo.3529569}
}
```

--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
from tests.fixtures.get_data_dictionary import test_data as test_data

--------------------------------------------------------------------------------
/notebooks/output/cluster_three_diseases.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/notebooks/output/cluster_three_diseases.png

--------------------------------------------------------------------------------
/phenopy/__init__.py:
--------------------------------------------------------------------------------
__project__ = 'phenopy'
__version__ = '0.6.0'

--------------------------------------------------------------------------------
/phenopy/__main__.py:
--------------------------------------------------------------------------------
import fire
import itertools
import sys
from configparser import NoOptionError, NoSectionError
from phenopy.util import open_or_stdout
from phenopy.build_hpo import generate_annotated_hpo_network
from phenopy.config import config, logger
from phenopy.score import Scorer
from phenopy.util import parse_input, half_product
from phenoseries.experiment import run_phenoseries_experiment


def score(
    input_file,
    output_file="-",
    records_file=None,
    annotations_file=None,
    custom_disease_file=None,
    ages_distribution_file=None,
    self=False,
    summarization_method="BMWA",
    scoring_method="HRSS",
    threads=1,
):
    """
    Scores similarity of provided HPO annotated entries (see format below) against a
    set of HPO annotated records. By default, scoring happens against diseases
    annotated by the HPO group. See https://hpo.jax.org/app/download/annotation.

    Phenopy also supports scoring all pairwise combinations of the provided entries
    (see "--self") or scoring against a custom records dataset
    (see "--records-file").

    :param input_file: File with HPO annotated entries, one per line (see format below).
    :param output_file: File path where to store the results. [default: - (stdout)]
    :param records_file: An entity-to-phenotype annotation file in the same format as
        "input_file". This file, if provided, is used to score entries in the
        "input_file" against entries here. [default: None]
    :param annotations_file: An entity-to-phenotype annotation file in the same format
        as "input_file". This file, if provided, is used to add information content
        to the network. [default: None]
    :param custom_disease_file: entity annotation file for ranking diseases/genes
    :param ages_distribution_file: Phenotypes age summary stats file containing
        phenotype HPO id, mean_age, and std. [default: None]
    :param self: Score entries in the "input_file" against itself.
46 | :param summarization_method: The method used to summarize the HRSS matrix. 47 | Supported Values are best match average 48 | (BMA), best match weighted average (BMWA), and maximum (maximum). [default: BMWA] 49 | :param scoring_method: Either HRSS or Resnik 50 | :param threads: Number of parallel processes to use. [default: 1] 51 | """ 52 | 53 | try: 54 | obo_file = config.get("hpo", "obo_file") 55 | except (NoSectionError, NoOptionError): 56 | logger.critical( 57 | 'No HPO OBO file found in the configuration file. See "hpo:obo_file" ' 58 | "parameter." 59 | ) 60 | sys.exit(1) 61 | if custom_disease_file is None: 62 | try: 63 | disease_to_phenotype_file = config.get("hpo", "disease_to_phenotype_file") 64 | except (NoSectionError, NoOptionError): 65 | logger.critical( 66 | "No HPO annotated dataset file found in the configuration file." 67 | ' See "hpo:disease_to_phenotype_file" parameter.' 68 | ) 69 | sys.exit(1) 70 | else: 71 | logger.info(f"using custom disease annotation file: {custom_disease_file}") 72 | disease_to_phenotype_file = custom_disease_file 73 | 74 | logger.info(f"Loading HPO OBO file: {obo_file}") 75 | hpo_network, alt2prim, disease_records = generate_annotated_hpo_network( 76 | obo_file, 77 | disease_to_phenotype_file, 78 | annotations_file=annotations_file, 79 | ages_distribution_file=ages_distribution_file, 80 | ) 81 | 82 | # parse input records 83 | input_records = parse_input(input_file, hpo_network, alt2prim) 84 | 85 | # create instance the scorer class 86 | try: 87 | scorer = Scorer( 88 | hpo_network, 89 | summarization_method=summarization_method, 90 | scoring_method=scoring_method, 91 | ) 92 | except ValueError as e: 93 | logger.critical(f"Failed to initialize scoring class: {e}") 94 | sys.exit(1) 95 | 96 | if self: 97 | score_records = input_records 98 | 99 | scoring_pairs = half_product(len(score_records), len(score_records)) 100 | else: 101 | if records_file: 102 | score_records = parse_input(records_file, hpo_network, alt2prim) 103 | else: 104 | score_records = disease_records 105 | 106 | scoring_pairs = itertools.product( 107 | range(len(input_records)), 108 | range(len(score_records)), 109 | ) 110 | 111 | results = scorer.score_records(input_records, score_records, scoring_pairs, threads) 112 | 113 | with open_or_stdout(output_file) as output_fh: 114 | output_fh.write("\t".join(["#query", "entity_id", "score"])) 115 | output_fh.write("\n") 116 | for result in results: 117 | output_fh.write("\t".join(str(column) for column in result)) 118 | output_fh.write("\n") 119 | 120 | 121 | def validate_phenoseries( 122 | phenotypic_series_filepath, 123 | outdir=None, 124 | min_hpos=4, 125 | min_entities=2, 126 | phenoseries_fraction=1.0, 127 | scoring_method="HRSS", 128 | threads=1, 129 | omim_phenotypes_file="", 130 | pairwise_mim_scores_file="", 131 | ): 132 | """ 133 | This runs the phenoseries experiment for a fraction of the OMIM phenoseries 134 | (PSid's). It Outputs a file with each row containing: PSid, MIMid, Python list 135 | of integers (ranks), and the length of the list. 136 | 137 | :param phenotypic_series_filepath: The phenotypicSeries.txt file from OMIM API. 138 | This is required to run validation. 139 | :param outdir: Directory where output files will be written. 140 | :param min_hpos: The minimum number of HPO ids annotated to a MIM id for the 141 | MIM id to be included in the experiment. 142 | :param min_entities: The minimum number of MIM ids for a phenoseries id to be 143 | included in the experiment. 
144 | :param phenoseries_fraction: The fraction of total phenoseries to evaluate. 145 | :param scoring_method: Either HRSS, Resnik, Jaccard, or word2vec 146 | :param threads: Number of parallel processes to use. [default: 1] 147 | :param omim_phenotypes_file: Path to the file containing OMIM id in 148 | the first column and a Python 149 | list of hpo ids in the second column. 150 | :param pairwise_mim_scores_file: Path to the file containing similarity 151 | scores for each of the 152 | """ 153 | run_phenoseries_experiment( 154 | outdir=outdir, 155 | phenotypic_series_filepath=phenotypic_series_filepath, 156 | min_hpos=min_hpos, 157 | min_entities=min_entities, 158 | phenoseries_fraction=phenoseries_fraction, 159 | scoring_method=scoring_method, 160 | threads=threads, 161 | omim_phenotypes_file=omim_phenotypes_file, 162 | pairwise_mim_scores_file=pairwise_mim_scores_file, 163 | ) 164 | 165 | 166 | def main(): 167 | fire.Fire( 168 | { 169 | "score": score, 170 | "validate-phenoseries": validate_phenoseries, 171 | } 172 | ) 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /phenopy/build_hpo.py: -------------------------------------------------------------------------------- 1 | from phenopy.util import generate_alternate_ids 2 | from phenopy.d2p import load as load_d2p 3 | from phenopy.network import load as load_network 4 | from phenopy.network import annotate 5 | from typing import Tuple 6 | 7 | 8 | def generate_annotated_hpo_network( 9 | obo_file: str, disease_to_phenotype_file: str, 10 | annotations_file=None, ages_distribution_file=None) -> Tuple: 11 | """ 12 | Generate an annotated HPO network, alternate ids to primary ids and disease records 13 | """ 14 | hpo_network = load_network(obo_file) 15 | 16 | alt2prim = generate_alternate_ids(hpo_network) 17 | 18 | # load phenotypes to diseases associations 19 | ( 20 | disease_records, 21 | phenotype_to_diseases, 22 | ) = load_d2p(disease_to_phenotype_file, hpo_network, alt2prim) 23 | 24 | # load hpo network 25 | hpo_network = annotate( 26 | hpo_network, 27 | phenotype_to_diseases, 28 | len(disease_records), 29 | alt2prim, 30 | annotations_file=annotations_file, 31 | ages_distribution_file=ages_distribution_file, 32 | ) 33 | 34 | return hpo_network, alt2prim, disease_records 35 | -------------------------------------------------------------------------------- /phenopy/config.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import logging 3 | import os 4 | import urllib.request 5 | import shutil 6 | from pathlib import Path 7 | from gensim.models import KeyedVectors 8 | from phenopy import __project__, __version__ 9 | 10 | 11 | def download_resource_files(): 12 | """ 13 | Check if HPO files exist, if not download them 14 | :return: None 15 | """ 16 | 17 | def download(url, file_path): 18 | """ 19 | Download and save a file 20 | :param url: where to get it from 21 | :param file_path: where to put it 22 | :return: None 23 | """ 24 | try: 25 | response = urllib.request.urlopen(url) 26 | 27 | except ValueError: 28 | logger.info(f"Incorrect url specified for HPO files: {url}") 29 | raise 30 | 31 | except urllib.error.URLError as e: 32 | if hasattr(e, "reason"): 33 | logger.info(f"Incorrect url specified for HPO files: {url}") 34 | logger.info("Reason: ", e.reason) 35 | raise 36 | elif hasattr(e, "code"): 37 | logger.info("The server could not fulfill the request") 38 | logger.info("Reason: 
", e.code) 39 | raise 40 | 41 | try: 42 | with open(file_path, "wb") as out_file: 43 | shutil.copyfileobj(response, out_file) 44 | 45 | except PermissionError: 46 | logger.info(f"No permission accessing data directory: {file_path}") 47 | raise 48 | 49 | # read the config file to get file paths and urls 50 | obo_path = config.get("hpo", "obo_file") 51 | obo_url = config.get("hpo", "obo_file_url") 52 | 53 | hpoa_path = config.get("hpo", "disease_to_phenotype_file") 54 | hpoa_url = config.get("hpo", "disease_to_phenotype_file_url") 55 | 56 | if not os.path.isfile(obo_path): 57 | logger.info(f"Downloading HPO obo file to: {obo_path}") 58 | download(obo_url, obo_path) 59 | 60 | if not os.path.isfile(hpoa_path): 61 | logger.info(f"Downloading phenotype to disease annotations to {hpoa_path}") 62 | download(hpoa_url, hpoa_path) 63 | 64 | 65 | # create logger 66 | logger = logging.getLogger(__project__) 67 | logger.setLevel(logging.DEBUG) 68 | 69 | # create console handler 70 | ch = logging.StreamHandler() 71 | ch.setLevel(logging.DEBUG) 72 | 73 | # create formatter and add it to the handler 74 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 75 | ch.setFormatter(formatter) 76 | 77 | # add the handler to the logger 78 | logger.addHandler(ch) 79 | 80 | # create config 81 | config = configparser.ConfigParser() 82 | 83 | # create config directory if it doesn't exist 84 | config_directory = os.path.join(os.environ.get("HOME"), f".{__project__}") 85 | project_directory = Path(__file__).parent 86 | project_data_dir = os.path.join(project_directory, "data") 87 | try: 88 | os.makedirs(config_directory) 89 | except FileExistsError: 90 | pass 91 | 92 | # create data directory if it doesn't exist 93 | data_directory = os.path.join(config_directory, "data") 94 | try: 95 | os.makedirs(data_directory) 96 | except FileExistsError: 97 | pass 98 | 99 | # if phenopy.ini doesnt exist make one 100 | logger.info(f"checking if config file exists: {config_directory}") 101 | if not os.path.isfile(os.path.join(config_directory, "phenopy.ini")): 102 | config = configparser.ConfigParser() 103 | w2v_path = os.path.join(os.path.dirname(__file__), "data/phenopy.wv.model.txt.gz") 104 | 105 | w2v_vw_path = os.path.join(data_directory, "phenopy.w2v.model") 106 | 107 | wv = KeyedVectors.load_word2vec_format(w2v_path) 108 | # save model in faster to load format in users directory 109 | wv.save(w2v_vw_path) 110 | 111 | # copy the lmd model to the data directory 112 | lmd_path = os.path.join(os.path.dirname(__file__), "data/lgb.model.pkl") 113 | lmd_data_path = os.path.join(data_directory, "lgb.model.pkl") 114 | shutil.copyfile(lmd_path, lmd_data_path) 115 | 116 | config["hpo"] = { 117 | "obo_file": os.path.join( 118 | data_directory, 119 | "hp.obo", 120 | ), 121 | "obo_file_url": "http://purl.obolibrary.org/obo/hp.obo", 122 | "hpo_network_file": os.path.join( 123 | data_directory, 124 | "hpo_network.pickle", 125 | ), 126 | "disease_to_phenotype_file_url": "http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa", 127 | "disease_to_phenotype_file": os.path.join( 128 | data_directory, 129 | "phenotype.hpoa", 130 | ), 131 | } 132 | 133 | config["models"] = { 134 | "phenopy.wv.model": w2v_vw_path, 135 | } 136 | config["age"] = { 137 | "open_access_phenotype_age": os.path.join( 138 | project_data_dir, 139 | "oa_phenotype_age.tsv", 140 | ) 141 | } 142 | config["omim"] = { 143 | "omim_api_key": "", 144 | } 145 | config["phenotype_groups"] = { 146 | "phenotype_groups_file": os.path.join(project_data_dir, 
"phenotype_groups.txt") 147 | } 148 | 149 | with open(os.path.join(config_directory, "phenopy.ini"), "w") as configfile: 150 | logger.info("writing config file to: %s " % config_directory) 151 | config.write(configfile) 152 | 153 | # log project and version 154 | logger.info(f"{__project__} {__version__}") 155 | 156 | # read config 157 | config_file = os.environ.get( 158 | f"{__project__.upper()}_CONFIG", 159 | os.path.join( 160 | config_directory, 161 | f"{__project__}.ini", 162 | ), 163 | ) 164 | config.read(config_file) 165 | logger.info(f"Using configuration file: {config_file}") 166 | 167 | # download resource files if necessary 168 | download_resource_files() 169 | -------------------------------------------------------------------------------- /phenopy/d2p.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | import sys 4 | import networkx as nx 5 | from typing import ( 6 | List, 7 | Tuple, 8 | ) 9 | 10 | hpo_id_to_float = { 11 | "HP:0040280": 1.0, 12 | "HP:0040281": np.mean([0.80, 0.99]), 13 | "HP:0040282": np.mean([0.30, 0.79]), 14 | "HP:0040283": np.mean([0.05, 0.29]), 15 | "HP:0040284": np.mean([0.01, 0.04]), 16 | "HP:0040285": 0.0, 17 | } 18 | 19 | 20 | def read_hpo_annotation_file( 21 | phenotype_annotations_file: str, hpo_network: nx.MultiDiGraph, logger=None 22 | ) -> List: 23 | """ 24 | Reads the annotation files from the HPO website 25 | """ 26 | try: 27 | with open(phenotype_annotations_file, "r") as tsv_fh: 28 | [next(tsv_fh) for _ in range(4)] 29 | reader = csv.DictReader(tsv_fh, delimiter="\t") 30 | # this removes the leading hash 31 | reader.fieldnames[0] = reader.fieldnames[0].lstrip("#") 32 | 33 | records = [] 34 | 35 | for row in reader: 36 | 37 | # phenotype term id 38 | term_id = row.get("HPO_ID") if "HPO_ID" in row else row.get("hpo_id") 39 | if term_id not in hpo_network.nodes(): 40 | continue 41 | 42 | # parse disease id, currently only supports omim entries 43 | database_id = ( 44 | row.get("DatabaseID") 45 | if "DatabaseID" in row 46 | else row.get("database_id") 47 | ) 48 | db, disease_accession = database_id.split(":") 49 | if db not in ["OMIM"]: 50 | continue 51 | 52 | # For now, skip negative phenotype annotations 53 | qualifier = ( 54 | row.get("Qualifier") if "Qualifier" in row else row.get("qualifier") 55 | ) 56 | if qualifier == "NOT": 57 | continue 58 | 59 | frequency = ( 60 | row.get("Frequency") if "Frequency" in row else row.get("frequency") 61 | ) 62 | records.append( 63 | (term_id, disease_accession, frequency_converter(frequency)) 64 | ) 65 | 66 | return records 67 | 68 | except (FileNotFoundError, PermissionError): 69 | hpoa_file_error_msg = ( 70 | f"{phenotype_annotations_file} " f"not found or incorrect permissions" 71 | ) 72 | if logger is not None: 73 | logger.critical(hpoa_file_error_msg) 74 | else: 75 | sys.stderr.write(hpoa_file_error_msg) 76 | sys.exit(1) 77 | 78 | 79 | def read_custom_annotation_file( 80 | custom_annotation_file_path: str, hpo_network: nx.MultiDiGraph, logger: None = None 81 | ) -> List: 82 | try: 83 | with open(custom_annotation_file_path, "r") as tsv_fh: 84 | reader = csv.reader(tsv_fh, delimiter="\t") 85 | 86 | records = [] 87 | for row in reader: 88 | # phenotype term id 89 | # convert alternate phenotype id to primary 90 | term_id, disease_accession, freq = row 91 | if term_id not in hpo_network.nodes(): 92 | continue 93 | 94 | records.append((term_id, disease_accession, float(freq))) 95 | 96 | return records 97 | 98 | except 
(FileNotFoundError, PermissionError): 99 | hpoa_file_error_msg = ( 100 | f"{custom_annotation_file_path} " f"not found or incorrect permissions" 101 | ) 102 | if logger is not None: 103 | logger.critical(hpoa_file_error_msg) 104 | else: 105 | sys.stderr.write(hpoa_file_error_msg) 106 | sys.exit(1) 107 | 108 | 109 | def load( 110 | phenotype_annotations_file: str, 111 | hpo_network: nx.MultiDiGraph, 112 | alt2prim, 113 | default_frequency: float = 0.5, 114 | ) -> Tuple: 115 | """ 116 | Parse the hpoa file 117 | """ 118 | if phenotype_annotations_file.endswith("hpoa"): 119 | records = read_hpo_annotation_file(phenotype_annotations_file, hpo_network) 120 | else: 121 | records = read_custom_annotation_file(phenotype_annotations_file, hpo_network) 122 | 123 | disease_to_phenotypes = dict() 124 | phenotype_to_diseases = dict() 125 | 126 | for r in records: 127 | term_id, disease_accession, freq = r 128 | if term_id not in phenotype_to_diseases: 129 | phenotype_to_diseases[term_id] = { 130 | disease_accession: {"frequency": default_frequency} 131 | } 132 | else: 133 | if disease_accession not in phenotype_to_diseases[term_id]: 134 | phenotype_to_diseases[term_id].update( 135 | {disease_accession: {"frequency": default_frequency}} 136 | ) 137 | 138 | phenotype_to_diseases[term_id][disease_accession]["frequency"] = freq 139 | 140 | # add the phenotype to the disease in the disease_records dictionary 141 | if disease_accession not in disease_to_phenotypes: 142 | disease_to_phenotypes[disease_accession] = { 143 | "record_id": disease_accession, 144 | "terms": [], 145 | "weights": { 146 | "disease_frequency": [], 147 | }, 148 | } 149 | disease_to_phenotypes[disease_accession]["terms"].append(term_id) 150 | 151 | # going from dict to a list of disease records and setting weights 152 | disease_records = list() 153 | for disease_accession, disease in disease_to_phenotypes.items(): 154 | disease["terms"] = sorted(set(disease["terms"])) 155 | for term_id in disease["terms"]: 156 | # convert alternate phenotype id to primary 157 | term_id = term_id if term_id not in alt2prim else alt2prim[term_id] 158 | if term_id not in hpo_network.nodes(): 159 | continue 160 | 161 | frequency_weight = phenotype_to_diseases[term_id][disease_accession][ 162 | "frequency" 163 | ] 164 | # 165 | disease["weights"]["disease_frequency"].append(frequency_weight) 166 | 167 | disease_records.append(disease) 168 | 169 | # TODO: do we need phenotype_to_diseases? 
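    # Illustrative shapes of the two return values (accession numbers made up):
    #   disease_records -> [{"record_id": "123456", "terms": ["HP:0001249", ...],
    #                        "weights": {"disease_frequency": [0.5, ...]}}, ...]
    #   phenotype_to_diseases -> {"HP:0001249": {"123456": {"frequency": 0.5}}, ...}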
170 | return disease_records, phenotype_to_diseases 171 | 172 | 173 | def frequency_converter(hpoa_frequency: str, default_frequency: float = 0.5) -> float: 174 | """ 175 | convert the frequency column from the hpoa file to a float 176 | """ 177 | if "HP:" in hpoa_frequency: 178 | # TODO discuss the best default 179 | return hpo_id_to_float.get(hpoa_frequency, default_frequency) 180 | 181 | elif "/" in hpoa_frequency: 182 | n, d = hpoa_frequency.split("/") 183 | return float(n) / float(d) 184 | 185 | elif "%" in hpoa_frequency: 186 | return float(hpoa_frequency.strip("%")) / 100 187 | 188 | # TODO discuss the best default 189 | return default_frequency 190 | -------------------------------------------------------------------------------- /phenopy/data/lgb.model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/phenopy/data/lgb.model.pkl -------------------------------------------------------------------------------- /phenopy/data/phenopy.wv.model.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/phenopy/data/phenopy.wv.model.txt.gz -------------------------------------------------------------------------------- /phenopy/ic.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | from typing import Dict 4 | 5 | SMOOTH = 1 6 | 7 | 8 | def calculate_information_content( 9 | hpo_id: str, 10 | hpo_network: nx.MultiDiGraph, 11 | phenotype_to_diseases: Dict, 12 | num_diseases_annotated: int, 13 | custom_annotations: Dict = None) -> float: 14 | """ 15 | Calculates information content for an HPO term. 16 | """ 17 | # compile list of HPO terms to include in the calculation, term plus children 18 | hpo_id_plus_children = [hpo_id] + list(nx.ancestors(hpo_network, hpo_id)) 19 | # num_diseases_annotated is the total number of diseases in the annotation corpus. 20 | 21 | def get_ic(hpo_ids, annotations): 22 | # count the number of unique diseases annotated to the hpo term and its children 23 | n_unique_diseases = len( 24 | {g for h in hpo_ids if h in annotations for g in annotations[h]} 25 | ) 26 | # negative log of the number of unique diseases annotated to the term 27 | # (and its children) divided by the 28 | # total number of annotated diseases 29 | information_content = -np.log((n_unique_diseases + SMOOTH) / 30 | float(num_diseases_annotated + SMOOTH)) 31 | 32 | return information_content 33 | 34 | annotations_list = [phenotype_to_diseases] 35 | if custom_annotations is not None: 36 | annotations_list.append(custom_annotations) 37 | output_mean = np.mean([get_ic(hpo_id_plus_children, annotations=annotations) 38 | for annotations in annotations_list]) 39 | return output_mean 40 | -------------------------------------------------------------------------------- /phenopy/network.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import obonet 3 | import re 4 | import sys 5 | 6 | from phenopy.config import logger 7 | from phenopy.ic import calculate_information_content 8 | from phenopy.weights import make_age_distributions 9 | from phenopy.util import parse_input 10 | from typing import ( 11 | Dict, 12 | List, 13 | ) 14 | 15 | 16 | def load(obo_file: str) -> nx.MultiDiGraph: 17 | """ 18 | Load OBO file into a networkx graph.
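
    A minimal usage sketch (the path is illustrative):

        hpo_network = load("hp.obo")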
19 | """ 20 | try: 21 | hpo_network = obonet.read_obo(obo_file) 22 | except (FileNotFoundError, PermissionError) as e: 23 | if logger is not None: 24 | logger.critical(e) 25 | else: 26 | sys.stderr.write(str(e)) 27 | exit(1) 28 | 29 | # roots for non-phenotype nodes 30 | non_phenotypes = { 31 | "mortality_aging": "HP:0040006", 32 | "mode_of_inheritance": "HP:0000005", 33 | "clinical_modifier": "HP:0012823", 34 | "frequency": "HP:0040279", 35 | "clinical_course": "HP:0031797", 36 | } 37 | 38 | # remove non-phenotype branches 39 | for _, hpo_id in non_phenotypes.items(): 40 | if hpo_id in hpo_network.nodes: 41 | children = nx.ancestors(hpo_network, hpo_id) 42 | hpo_network.remove_nodes_from([hpo_id] + list(children)) 43 | 44 | return hpo_network 45 | 46 | 47 | def annotate( 48 | hpo_network: nx.MultiDiGraph, 49 | phenotype_to_diseases: Dict, 50 | num_diseases_annotated: int, 51 | alt2prim: Dict, 52 | annotations_file: List = None, 53 | ages_distribution_file: str = None, 54 | phenotype_disease_frequencies: Dict = None, 55 | ) -> nx.MultiDiGraph: 56 | """ 57 | Cleans the HPO network. 58 | 59 | Removes non-phenotype branches of the network, and merges all synonyms into one tag. 60 | 61 | :param hpo_network: `networkx.MultiDiGraph` to clean. 62 | :param phenotype_to_diseases: Dictionary mapping HPO terms to diseases. 63 | :param num_diseases_annotated: Number of diseases with HPO annotations. 64 | :param alt2prim: The dict of alternate terms to canonical terms. 65 | :param annotations_file: A list of custom annotation files, in the same format 66 | as tests/data/test.score-long.txt 67 | :param phenotype_disease_frequencies: dictionary of phenotype to disease frequencies 68 | :param ages_distribution_file: Path to phenotypes ages distribution file. 69 | :return: `networkx.MultiDiGraph` 70 | """ 71 | 72 | # Before calculating information content, check for custom_annotations_file and load 73 | custom_annos = None 74 | if annotations_file is not None: 75 | custom_annos = {} 76 | for record in parse_input(annotations_file, hpo_network, alt2prim): 77 | for term_id in record["terms"]: 78 | if term_id not in custom_annos: 79 | custom_annos[term_id] = [] 80 | custom_annos[term_id].append(record["record_id"]) 81 | 82 | # make ages distributions 83 | ages = None 84 | if ages_distribution_file is not None: 85 | try: 86 | ages = make_age_distributions(ages_distribution_file) 87 | logger.info( 88 | f"Adding custom phenotype age distributions to HPO nodes " 89 | f"from file: {ages_distribution_file}" 90 | ) 91 | except (FileNotFoundError, PermissionError) as e: 92 | logger.critical(e) 93 | logger.critical( 94 | f"Specified phenotype ages file could not be loaded or " 95 | f"does not exist: {e}" 96 | ) 97 | exit(1) 98 | 99 | for node_id, data in hpo_network.nodes(data=True): 100 | # annotate with information content value 101 | hpo_network.nodes[node_id]["ic"] = calculate_information_content( 102 | node_id, 103 | hpo_network, 104 | phenotype_to_diseases, 105 | num_diseases_annotated, 106 | custom_annos, 107 | ) 108 | # annotate with phenotype age distribution 109 | hpo_network.nodes[node_id]["disease_weights"] = {} 110 | 111 | if ages is not None and node_id in ages.index: 112 | hpo_network.nodes[node_id]["age_dist"] = ages.loc[node_id]["age_dist"] 113 | 114 | # add the disease_frequency weights as attributes to the node 115 | if phenotype_disease_frequencies is not None: 116 | if node_id in phenotype_disease_frequencies: 117 | for dis_id, freq in phenotype_disease_frequencies[node_id].items(): 118 | 
hpo_network.nodes[node_id]["weights"]["disease_frequency"][ 119 | dis_id 120 | ] = freq 121 | 122 | # annotate with depth value 123 | # hard-coding origin node for now 124 | origin = "HP:0000001" 125 | hpo_network.nodes[node_id]["depth"] = nx.shortest_path_length( 126 | hpo_network, node_id, origin 127 | ) 128 | 129 | # clean synonyms 130 | synonyms = [] 131 | try: 132 | for synonym in data["synonym"]: 133 | synonyms.append(synonym) 134 | hpo_network.nodes[node_id]["synonyms"] = re.findall( 135 | r'"(.*?)"', ",".join(synonyms) 136 | ) 137 | except KeyError: 138 | # pass if no synonym tags in the node 139 | pass 140 | 141 | return hpo_network 142 | -------------------------------------------------------------------------------- /phenopy/score.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import gensim 3 | import networkx as nx 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from functools import lru_cache 8 | from multiprocessing import Pool 9 | from phenopy.weights import calculate_age_weights 10 | from phenopy.config import config 11 | from typing import ( 12 | Tuple, 13 | Dict, 14 | List, 15 | Set, 16 | ) 17 | 18 | 19 | class Scorer: 20 | def __init__( 21 | self, 22 | hpo_network: nx.MultiDiGraph, 23 | summarization_method: str = "BMWA", 24 | min_score_mask: float = 0.05, 25 | scoring_method: str = "HRSS", 26 | ) -> None: 27 | 28 | # Establish hpo_network 29 | self.hpo_network = hpo_network 30 | 31 | # Establish summarization method 32 | if summarization_method not in ["BMA", "BMWA", "maximum"]: 33 | raise ValueError( 34 | "Unsupported summarization method, please choose from " 35 | "BMA, BMWA, or maximum." 36 | ) 37 | self.summarization_method = summarization_method 38 | 39 | # Assign min_score_mask 40 | self.min_score_mask = min_score_mask 41 | 42 | # Assign scoring method 43 | if scoring_method not in ["HRSS", "Resnik", "Jaccard", "word2vec"]: 44 | raise ValueError( 45 | "Unsupported semantic similarity scoring method, please " 46 | "choose from HRSS, Resnik, Jaccard, or word2vec." 47 | ) 48 | self.scoring_method = scoring_method 49 | 50 | # Load the word vectors if using word2vec 51 | if scoring_method == "word2vec": 52 | try: 53 | self.word_vectors = gensim.models.KeyedVectors.load( 54 | config.get("models", "phenopy.wv.model") 55 | ) 56 | except FileNotFoundError: 57 | raise ValueError( 58 | "Please make sure that a word2vec model is in " 59 | "your project data directory." 
60 | ) 61 | 62 | def find_lca(self, term_a: str, term_b: str) -> str: 63 | """ 64 | Determine the lowest common ancestor for two HPO terms 65 | """ 66 | 67 | # if either term is HP:0000001 return it 68 | if any(term == "HP:0000001" for term in [term_a, term_b]): 69 | return "HP:0000001" 70 | 71 | # if one of the terms is a child of the other return the parent 72 | if self.hpo_network.has_edge(term_a, term_b): 73 | return term_b 74 | if self.hpo_network.has_edge(term_b, term_a): 75 | return term_a 76 | 77 | # find common breadth-first-search predecessors 78 | parents = [] 79 | for i, term in enumerate([term_a, term_b]): 80 | parents.append({p[0] for p in nx.bfs_predecessors(self.hpo_network, term)}) 81 | parents[i].add(term) 82 | 83 | # Find the intersection between the two sets of parents 84 | common_parents = parents[0].intersection(parents[1]) 85 | 86 | # lca node - find the ancestor with the highest IC 87 | # break ties by choosing the node with the greatest depth 88 | return max( 89 | common_parents, 90 | key=lambda n: ( 91 | self.hpo_network.nodes[n]["ic"], 92 | self.hpo_network.nodes[n]["depth"], 93 | ), 94 | ) 95 | 96 | def calculate_beta(self, term_a: str, term_b: str) -> float: 97 | """ 98 | calculates the beta term in HRSS equation 99 | """ 100 | # find information content for the most informative leaf for each term 101 | mil_ic = [] 102 | 103 | # For each term, if it has children, find the most informative leaf 104 | for term in [term_a, term_b]: 105 | if self.hpo_network.in_edges(term): 106 | 107 | # children terms generator 108 | children = nx.ancestors(self.hpo_network, term) 109 | 110 | # Establish the leaf nodes 111 | leaves = { 112 | p 113 | for p in children 114 | if self.hpo_network.out_degree(p) >= 1 115 | and self.hpo_network.in_degree(p) == 0 116 | } 117 | 118 | # append the max IC leaf (choose the one with the max depth) 119 | mil = max( 120 | leaves, 121 | key=lambda n: ( 122 | self.hpo_network.nodes[n]["ic"], 123 | self.hpo_network.nodes[n]["depth"], 124 | ), 125 | ) 126 | mil_ic.append(self.hpo_network.nodes[mil]["ic"]) 127 | 128 | # the node is a leaf 129 | else: 130 | mil_ic.append(self.hpo_network.nodes[term]["ic"]) 131 | 132 | # calculate beta_ic 133 | beta_ic = ( 134 | (mil_ic[0] - self.hpo_network.nodes[term_a]["ic"]) 135 | + (mil_ic[1] - self.hpo_network.nodes[term_b]["ic"]) 136 | ) / 2.0 137 | 138 | return beta_ic 139 | 140 | def calculate_gamma(self, term_a: str, term_b: str, term_lca: str) -> int: 141 | """ 142 | Calculate gamma term for the HRSS algorithm. 143 | """ 144 | # calculate gamma 145 | # "such that the value equals zero if the two terms are the same" 146 | if term_a == term_b: 147 | return 0 148 | 149 | # If one of the terms is a child of the other return 1 150 | term_a_child = self.hpo_network.has_edge(term_a, term_b) 151 | term_b_child = self.hpo_network.has_edge(term_b, term_a) 152 | if term_a_child or term_b_child: 153 | return 1 154 | 155 | # Otherwise calculate the shortest-path length to the LCA 156 | a_to_lca = nx.shortest_path_length(self.hpo_network, term_a, term_lca) 157 | b_to_lca = nx.shortest_path_length(self.hpo_network, term_b, term_lca) 158 | 159 | return a_to_lca + b_to_lca 160 | 161 | @lru_cache(maxsize=72000000) 162 | def score_hpo_pair_hrss(self, term_a: str, term_b: str) -> float: 163 | """ 164 | Scores the comparison of a pair of terms, using Hybrid Relative Specificity 165 | Similarity (HRSS) algorithm. 
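
    In the notation of the code below, the returned score is

        HRSS = I * D,  where I = alpha_ic / (alpha_ic + beta_ic)
                       and   D = 1.0 / (1.0 + gamma)

    with alpha_ic the IC of the lowest common ancestor, beta_ic the mean
    leaf-to-term IC distance, and gamma the path length between the two
    terms through the LCA.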
166 | """ 167 | 168 | # calculate beta_ic 169 | beta_ic = self.calculate_beta(term_a, term_b) 170 | 171 | # find lowest common ancestors for the two terms 172 | lca_node = self.find_lca(term_a, term_b) 173 | 174 | # calculate alpha_ic 175 | alpha_ic = self.hpo_network.nodes[lca_node]["ic"] 176 | if self.scoring_method == "Resnik": 177 | return alpha_ic 178 | 179 | # Return 0 if alpha_ic and beta_ic are both 0 180 | if (alpha_ic == 0.0) and (beta_ic == 0.0): 181 | return 0.0 182 | 183 | # calculate gamma 184 | gamma = self.calculate_gamma(term_a, term_b, lca_node) 185 | 186 | # Assign the I and D variables in the HRSS equation 187 | i_variable = alpha_ic / (alpha_ic + beta_ic) 188 | d_variable = 1.0 / (1.0 + gamma) 189 | 190 | return i_variable * d_variable 191 | 192 | def score(self, record_a: Dict, record_b: Dict) -> Tuple[str, str, float]: 193 | """ 194 | Scores the comparison of terms listed in record A to terms listed in record B. 195 | """ 196 | if self.summarization_method not in ["BMA", "BMWA", "maximum"]: 197 | raise ValueError( 198 | "Unsupported summarization method, please choose from " 199 | "BMA, BMWA, or maximum." 200 | ) 201 | 202 | # if either set is empty return 0.0 203 | terms_a = record_a["terms"] 204 | terms_b = record_b["terms"] 205 | if not terms_a or not terms_b: 206 | return record_a["record_id"], record_b["record_id"], 0.0 207 | 208 | # If specified, calculate the Jaccard similarity 209 | if self.scoring_method == "Jaccard": 210 | intersection = len(list(set(terms_a).intersection(terms_b))) 211 | union = (len(terms_a) + len(terms_b)) - intersection 212 | comparison_score = float(intersection) / union 213 | return record_a["record_id"], record_b["record_id"], comparison_score 214 | 215 | # If specified, calculate the word2vec similarity 216 | elif self.scoring_method == "word2vec": 217 | 218 | # Ensure that all HPO terms are in the vocab 219 | in_vocab_terms_a = [ 220 | x for x in terms_a if x in self.word_vectors.key_to_index 221 | ] 222 | in_vocab_terms_b = [ 223 | x for x in terms_b if x in self.word_vectors.key_to_index 224 | ] 225 | 226 | # If both records have terms in the vocab (both are non-empty lists) 227 | if in_vocab_terms_a and in_vocab_terms_b: 228 | return self.word_vectors.n_similarity( 229 | in_vocab_terms_a, in_vocab_terms_b 230 | ) 231 | 232 | # One record or the other has no terms in the word2vec vocab 233 | else: 234 | return record_a["record_id"], record_b["record_id"], 0.0 235 | 236 | # calculate weights for record_a and record_b 237 | if record_a["weights"] is not None: 238 | weights_a = record_a["weights"].copy() 239 | else: 240 | weights_a = [] 241 | if record_b["weights"] is not None: 242 | weights_b = record_b["weights"].copy() 243 | else: 244 | weights_b = [] 245 | 246 | # set weights 247 | # if we have age of record_a use it to set age weights for record_b 248 | if "age" in record_a: 249 | weights_b["age"] = calculate_age_weights( 250 | record_b["terms"], record_a["age"], self.hpo_network 251 | ) 252 | 253 | # if we have age of record_b use it to set age weights for record_a 254 | if "age" in record_b: 255 | weights_a["age"] = calculate_age_weights( 256 | record_a["terms"], record_b["age"], self.hpo_network 257 | ) 258 | 259 | # Creates a dataframe that houses the HRSS for each term pair 260 | df = self.get_term_pair_dataframe(terms_a, terms_b) 261 | 262 | # Return maximum if specified 263 | if self.summarization_method == "maximum": 264 | return record_a["record_id"], record_b["record_id"], self.maximum(df) 265 | 266 | # Retrun BMWA if 
specified 267 | elif self.summarization_method == "BMWA" and any([weights_a, weights_b]): 268 | score_output = self.best_match_weighted_average( 269 | df, weights_a=weights_a, weights_b=weights_b 270 | ) 271 | 272 | return record_a["record_id"], record_b["record_id"], score_output 273 | 274 | # Otherwise return the best-match-average 275 | else: 276 | score_output = self.best_match_average(df) 277 | return record_a["record_id"], record_b["record_id"], score_output 278 | 279 | def score_term_sets_basic(self, terms_a: List, terms_b: List) -> float: 280 | """ 281 | Calculate the semantic similarity of two lists of terms. 282 | This is intended to be used as a library function. It is not used by the CLI. 283 | """ 284 | # Instantiate the two lists of HPO identifiers 285 | terms_a = set(terms_a) 286 | terms_b = set(terms_b) 287 | 288 | # Calculate the Jaccard similarity if specified 289 | if self.scoring_method == "Jaccard": 290 | intersection = len(list(set(terms_a).intersection(terms_b))) 291 | union = (len(terms_a) + len(terms_b)) - intersection 292 | return float(intersection) / union 293 | 294 | # Calculate the word vector similarity if word2vec is specified 295 | elif self.scoring_method == "word2vec": 296 | 297 | # Instantiate a list to house all HPO terms that are within the vocab 298 | in_vocab_terms_a = [ 299 | x for x in terms_a if x in self.word_vectors.key_to_index 300 | ] 301 | in_vocab_terms_b = [ 302 | x for x in terms_b if x in self.word_vectors.key_to_index 303 | ] 304 | 305 | # If both lists exist (both are non-empty lists) return their similarity 306 | if in_vocab_terms_a and in_vocab_terms_b: 307 | return self.word_vectors.n_similarity( 308 | in_vocab_terms_a, in_vocab_terms_b 309 | ) 310 | 311 | # Otherwise return 0.0 312 | else: 313 | return 0.0 314 | 315 | # Creates a dataframe that houses the HRSS for each term pair 316 | df = self.get_term_pair_dataframe(terms_a, terms_b) 317 | 318 | # If set to maximum, return the maximum, otherwise best-match-average 319 | if self.summarization_method == "maximum": 320 | return self.maximum(df) 321 | else: 322 | return self.best_match_average(df) 323 | 324 | def score_records( 325 | self, a_records: Dict, b_records: Dict, record_pairs: List, threads: int = 1 326 | ) -> List: 327 | """ 328 | Scores each record pair, distributing the comparisons across the specified number of threads 329 | """ 330 | with Pool(processes=threads) as p: 331 | results = p.starmap( 332 | self.score, 333 | [ 334 | ( 335 | a_records[record_a], # a records 336 | b_records[record_b], # b records 337 | ) 338 | for (record_a, record_b) in record_pairs 339 | ], 340 | ) 341 | 342 | return results 343 | 344 | @staticmethod 345 | def best_match_average(df: pd.DataFrame) -> float: 346 | """ 347 | Returns the Best-Match average of a termlist to termlist similarity matrix. 348 | """ 349 | # Determine the max values of the rows and columns 350 | max_column_values = df.max(axis=1).values 351 | max_row_values = df.max(axis=0).values 352 | return np.average(np.append(max_column_values, max_row_values)) 353 | 354 | @staticmethod 355 | def maximum(dataframe: pd.DataFrame) -> float: 356 | """Returns the maximum similarity value between two term lists""" 357 | return dataframe.values.max() 358 | 359 | def best_match_weighted_average( 360 | self, df: pd.DataFrame, weights_a: Dict, weights_b: Dict 361 | ) -> float: 362 | """ 363 | Returns Best-Match Weighted Average of a termlist to termlist similarity matrix.
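
    Sketch of the computation implemented below: the row-wise and column-wise
    maxima of the similarity matrix are concatenated; each maximum is paired
    with the minimum of its available weights (weight columns missing for one
    side are padded with 1s); maxima above min_score_mask are exempted from
    down-weighting; the result is the weighted average of the maxima.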
364 | """ 365 | max_a = df.max(axis=1).values 366 | max_b = df.max(axis=0).values 367 | scores = np.append(max_a, max_b) 368 | 369 | weights_matrix = {} 370 | for w in weights_a: 371 | # init weight list if necessary 372 | if w not in weights_matrix: 373 | weights_matrix[w] = [] 374 | 375 | # extend weight with the values of a 376 | weights_matrix[w].extend(weights_a[w]) 377 | 378 | # for columns not in b, fill in with 1s for each b row 379 | if w not in weights_b: 380 | weights_matrix[w].extend([1 for _ in range(max_b.shape[0])]) 381 | 382 | for w in weights_b: 383 | # for columns not in a fill in with 1s for each a row 384 | if w not in weights_matrix: 385 | weights_matrix[w] = [1 for _ in range(max_a.shape[0])] 386 | 387 | # extend weight with the values of b 388 | weights_matrix[w].extend(weights_b[w]) 389 | 390 | weights_df = pd.DataFrame.from_dict(weights_matrix) 391 | weights = weights_df.min(axis=1) 392 | 393 | # mask good matches from weighting 394 | # mask threshold based on >75% of pairwise scores of all hpo terms 395 | # TODO: expose min_score cutoff value to be set in config 396 | if self.min_score_mask is not None: 397 | masked_weights = np.where(scores > self.min_score_mask, 1.0, weights) 398 | weights = masked_weights 399 | 400 | # if weights add up to zero, calculate unweighted average 401 | if np.sum(weights) == 0.0: 402 | weights = np.ones(len(weights)) 403 | 404 | return np.average(scores, weights=weights) 405 | 406 | def get_term_pair_dataframe(self, terms_a: Set, terms_b: Set) -> pd.DataFrame: 407 | """ 408 | Creates a dataframes of pairwise HRSS scores between them 409 | """ 410 | # Create the list of term pairs 411 | # e.g., ['a', 'b']['c', 'd'] -> [('a', 'c'), ('a', 'd'), ('b', 'c), ('b','d')] 412 | term_pairs = itertools.product(terms_a, terms_b) 413 | 414 | # Apply the HRSS score to each pair within the dataframe 415 | dataframe = ( 416 | pd.DataFrame( 417 | [ 418 | (pair[0], pair[1], self.score_hpo_pair_hrss(pair[0], pair[1])) 419 | for pair in term_pairs 420 | ], 421 | columns=["a", "b", "score"], 422 | ) 423 | .set_index(["a", "b"]) 424 | .unstack() 425 | ) 426 | 427 | return dataframe 428 | -------------------------------------------------------------------------------- /phenopy/util.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import networkx as nx 4 | import numpy as np 5 | import pandas as pd 6 | import logging 7 | 8 | from collections import Counter 9 | from contextlib import contextmanager 10 | 11 | from phenopy.config import config, logger 12 | from typing import ( 13 | Tuple, 14 | List, 15 | Dict, 16 | Union, 17 | Generator, 18 | ) 19 | 20 | 21 | def half_product(num_rows: int, num_columns: int) -> Tuple[int, int]: 22 | """yield combinations and the diagonal""" 23 | for m in range(0, num_rows): 24 | for n in range(m, num_columns): 25 | yield m, n 26 | 27 | 28 | def export_phenotype_hpoa_with_no_parents( 29 | phenotype_hpoa_file: str, 30 | phenotype_hpoa_no_parents_file: str, 31 | hpo_network: nx.MultiDiGraph, 32 | logger: logging.Logger = None, 33 | ) -> None: 34 | """ 35 | Load HPO terms associated to genes as annotated in 36 | https://hpo.jax.org/app/download/annotation. 37 | Filter the parent terms for each gene. 38 | Dump pheno2genes_no_parents_file 39 | 40 | :param phenotype_hpoa_file: Phenotypes to diseases file. 41 | :param phenotype_hpoa_no_parents_file: Phenotypes to diseases file 42 | with parents removed. 43 | :param hpo_network: The HPO networkx object. 
44 | :param logger: Python `logging` logger instance. 45 | :return: None 46 | """ 47 | try: 48 | with open(phenotype_hpoa_file, "r") as tsv_fh: 49 | # skip the comment lines 50 | [next(tsv_fh) for _ in range(4)] 51 | df = pd.read_csv( 52 | tsv_fh, 53 | sep="\t", 54 | ) 55 | except (FileNotFoundError, PermissionError) as e: 56 | if logger is not None: 57 | logger.critical(e) 58 | else: 59 | sys.stderr.write(str(e)) 60 | exit(1) 61 | 62 | no_parents_df = df.copy() 63 | 64 | # Establish the proper column headers (different for various versions) 65 | database_id = "#DatabaseID" if "#DatabaseID" in df.columns else "database_id" 66 | hpo_id = "HPO_ID" if "HPO_ID" in df.columns else "hpo_id" 67 | 68 | for gene, annotations in df.groupby(database_id): 69 | termlist = [ 70 | node for node in annotations[hpo_id].tolist() if node in hpo_network.nodes() 71 | ] 72 | termlist = remove_parents(termlist, hpo_network) 73 | parent_idx = annotations.loc[~annotations[hpo_id].isin(termlist)].index 74 | no_parents_df.drop(parent_idx, inplace=True) 75 | 76 | try: 77 | no_parents_df.to_csv(phenotype_hpoa_no_parents_file, sep="\t", index=False) 78 | except PermissionError as e: 79 | if logger is not None: 80 | logger.critical(e) 81 | else: 82 | sys.stderr.write(str(e)) 83 | exit(1) 84 | 85 | 86 | def parse(string: str, what: str = "HPO") -> Union[None, int, str, list]: 87 | """ 88 | Parse patient parameters in the records file 89 | :param string: string to parse 90 | :param what: (HP,age,sex) terms to parse 91 | :return: parsed object, int for age, string for gender, list for terms 92 | """ 93 | string = string.strip() 94 | if string == ".": 95 | return None 96 | if what == "HPO": 97 | result = [x for x in string.split("|") if x.startswith("HP:")] 98 | return result 99 | elif f"{what}=" in string: 100 | result = [x.split(f"{what}=")[1] for x in string.split(";") if what in x] 101 | if result: 102 | result = result[0] 103 | if what == "age": 104 | try: 105 | result = round(float(result), 1) 106 | except ValueError: 107 | result = None 108 | 109 | if what == "sex": 110 | if result.lower().startswith("f"): 111 | result = "Female" 112 | elif result.lower().startswith("m"): 113 | result = "Male" 114 | else: 115 | result = None 116 | return result 117 | else: 118 | return None 119 | 120 | 121 | def read_records_file( 122 | records_file: str, 123 | no_parents: bool = False, 124 | hpo_network: nx.MultiDiGraph = None, 125 | logger: logging.Logger = None, 126 | ) -> List: 127 | """ 128 | Parse input file for patient descriptions into an array of dictionaries 129 | """ 130 | try: 131 | with open(records_file) as records_fh: 132 | reader = csv.reader(records_fh, delimiter="\t") 133 | records = [] 134 | for line in reader: 135 | if line[0].startswith("#"): 136 | continue 137 | dict_ = { 138 | "sample": line[0], 139 | "age": parse(line[1], what="age"), 140 | "gender": parse(line[1], what="sex"), 141 | "terms": parse(line[2], what="HPO"), 142 | } 143 | 144 | if no_parents is True and hpo_network is not None: 145 | dict_["terms"] = remove_parents(dict_["terms"], hpo_network) 146 | else: 147 | pass 148 | records.append(dict_) 149 | return records 150 | except (FileNotFoundError, PermissionError) as e: 151 | if logger is not None: 152 | logger.critical(e) 153 | else: 154 | sys.stderr.write(str(e)) 155 | exit(1) 156 | 157 | 158 | def remove_parents(termlist: List[str], hpo_network: nx.MultiDiGraph) -> List[str]: 159 | """ 160 | remove parents from termlist 161 | """ 162 | terms_to_remove = set() 163 | for source_term in termlist: 
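        # Ancestry is decided via graph reachability: obonet edges point from
        # child to parent, so a path from source to target means target is an
        # ancestor (parent) of source and is marked for removal.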
164 | if source_term not in hpo_network.nodes: 165 | terms_to_remove.add(source_term) 166 | continue 167 | for target_term in termlist: 168 | if target_term not in hpo_network.nodes: 169 | terms_to_remove.add(target_term) 170 | continue 171 | # has_path will evaluate True for a term to itself, include additional check 172 | same_terms = source_term == target_term 173 | source_to_target = nx.has_path(hpo_network, source_term, target_term) 174 | target_to_source = nx.has_path(hpo_network, target_term, source_term) 175 | if source_to_target is True and not same_terms: 176 | terms_to_remove.add(target_term) 177 | if target_to_source is True and not same_terms: 178 | terms_to_remove.add(source_term) 179 | return sorted(set(termlist) - terms_to_remove) 180 | 181 | 182 | def generate_alternate_ids(hpo_network: nx.MultiDiGraph) -> Dict[str, str]: 183 | """ 184 | Create a key, value store of alternate terms to canonical terms. 185 | """ 186 | alt2prim = {} 187 | for n in hpo_network.nodes(data=True): 188 | n = n[0] 189 | try: 190 | for alt in hpo_network.nodes[n]["alt_id"]: 191 | alt2prim[alt] = n 192 | except KeyError: 193 | # no alternate HPO ids for this term 194 | continue 195 | return alt2prim 196 | 197 | 198 | def parse_input( 199 | input_file: str, hpo_network: nx.MultiDiGraph, alt2prim: Dict[str, str] 200 | ) -> List: 201 | """ 202 | Parse input file. 203 | """ 204 | try: 205 | with open(input_file, "r") as input_fh: 206 | reader = csv.reader( 207 | filter(lambda x: not x.startswith("#"), input_fh), delimiter="\t" 208 | ) 209 | records = [] 210 | for line in reader: 211 | # process terms: convert alternates and filter first 212 | terms = [] 213 | for term_id in line[2].split("|"): 214 | # convert alternate ids to primary 215 | if term_id in alt2prim: 216 | term_id = alt2prim[term_id] 217 | # filtering terms not in the hpo network 218 | if term_id not in hpo_network.nodes(): 219 | continue 220 | terms.append(term_id) 221 | 222 | record = { 223 | "record_id": line[0], 224 | "terms": remove_parents(terms, hpo_network), 225 | "weights": {}, 226 | **dict( 227 | item.split("=") for item in line[1].split(";") if line[1] != "." 228 | ), 229 | } 230 | 231 | # new weights, e.g. sex weights (similar to the age weights), could be assigned here 232 | records.append(record) 233 | 234 | except (FileNotFoundError, PermissionError) as e: 235 | logger.critical(f"Input file could not be loaded or does not exist: {e}") 236 | exit(1) 237 | except ValueError: 238 | logger.critical( 239 | f"Unable to parse input file, invalid line number: " 240 | f"{reader.line_num}:{input_file}" 241 | ) 242 | exit(1) 243 | 244 | return records 245 | 246 | 247 | def read_phenotype_groups( 248 | phenotype_group_file: str = None, 249 | ) -> Dict[str, Dict[str, int]]: 250 | """ 251 | Reads the phenotype group mapping file into a dictionary.
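
    Returned shape (group indices illustrative):

        {"HP:0012759": {"k1000": 87, "k1500": 112}, ...}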
252 | """ 253 | if phenotype_group_file is None: 254 | phenotype_group_file = config["phenotype_groups"]["phenotype_groups_file"] 255 | 256 | hp_to_pg = {} 257 | with open(phenotype_group_file, "r") as f: 258 | f.readline() 259 | for line in f: 260 | hpid, phenotype_group_1000, phenotype_group_1500 = line.strip("\n").split( 261 | "\t" 262 | ) 263 | hp_to_pg[hpid] = { 264 | "k1000": int(phenotype_group_1000), 265 | "k1500": int(phenotype_group_1500), 266 | } 267 | return hp_to_pg 268 | 269 | 270 | def standardize_phenotypes( 271 | terms: List[str], hpo_network: nx.MultiDiGraph, alt2prim: Dict[str, str] 272 | ) -> List[str]: 273 | """ 274 | Given a list of HPO ids, first try to convert synonyms to primary ids, 275 | then filter if terms are not in the ontology 276 | """ 277 | terms = [alt2prim[term] if term in alt2prim else term for term in terms] 278 | terms = list(filter(lambda term: term in hpo_network.nodes, terms)) 279 | terms = remove_parents(terms, hpo_network) 280 | return terms 281 | 282 | 283 | def encode_phenotypes( 284 | phenotypes: List, 285 | phenotype_groups: Dict, 286 | hpo_network: nx.MultiDiGraph, 287 | alt2prim: Dict[str, str], 288 | k: int = 1000, 289 | ) -> np.ndarray: 290 | """ 291 | Encode phenotypes into a feature array. 292 | """ 293 | 294 | def build_feature_array(cntr: Counter, n_features: int = k) -> np.ndarray: 295 | a = [0] * n_features 296 | for feature_index, count in cntr.items(): 297 | a[feature_index] = count 298 | return a 299 | 300 | def encode(hpo_ids: List) -> Counter: 301 | return Counter(hpo_ids) 302 | 303 | nested = all(isinstance(element, list) for element in phenotypes) 304 | 305 | if nested: 306 | return [ 307 | build_feature_array( 308 | encode( 309 | [ 310 | phenotype_groups[hpoid][f"k{k}"] 311 | for hpoid in standardize_phenotypes( 312 | phenotypes_, hpo_network, alt2prim 313 | ) 314 | ] 315 | ) 316 | ) 317 | for phenotypes_ in phenotypes 318 | ] 319 | 320 | return build_feature_array( 321 | encode( 322 | [ 323 | phenotype_groups[hpoid][f"k{k}"] 324 | for hpoid in standardize_phenotypes(phenotypes, hpo_network, alt2prim) 325 | ] 326 | ) 327 | ) 328 | 329 | 330 | @contextmanager 331 | def open_or_stdout(filename: str) -> Generator: 332 | if filename != "-": 333 | with open(filename, "w") as f: 334 | yield f 335 | else: 336 | yield sys.stdout 337 | -------------------------------------------------------------------------------- /phenopy/weights.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import networkx as nx 4 | import pandas as pd 5 | import sys 6 | 7 | from functools import lru_cache 8 | from typing import List 9 | import numpy as np 10 | 11 | 12 | @lru_cache(maxsize=1300000) 13 | def hpo_age_to_weight(hpo_network: nx.MultiGraph, term_id: str, age: int) -> float: 14 | """ 15 | calculate weight based on truncated normal distribution CDF 16 | """ 17 | if term_id not in hpo_network.nodes or age is None: 18 | return 1.0 19 | elif "age_dist" in hpo_network.nodes[term_id]: 20 | return get_empirical_cdf(float(age), hpo_network.nodes[term_id]["age_dist"]) 21 | else: 22 | return 1.0 23 | 24 | 25 | def calculate_age_weights( 26 | terms: List, age: int, hpo_network: nx.MultiGraph 27 | ) -> List[float]: 28 | """ 29 | Calculates an age-based weight vector given an iterable of terms. 
30 | """ 31 | weights = [] 32 | for term_id in terms: 33 | weights.append(hpo_age_to_weight(hpo_network, term_id, age)) 34 | 35 | return weights 36 | 37 | 38 | def get_truncated_normal( 39 | mean: float, sd: float, lower: float, upper: float, instances: int = 1000000 40 | ) -> np.ndarray: 41 | """ 42 | Simulates a truncated normal distribution 43 | """ 44 | # Create the normal distribution 45 | distribution = np.random.normal(mean, sd, instances) 46 | 47 | # Truncate all values outside of the range 48 | distribution = np.array([i for i in distribution if lower <= i <= upper]) 49 | 50 | return distribution 51 | 52 | 53 | def get_empirical_cdf(value: float, distribution: np.ndarray) -> float: 54 | """ 55 | Calculates the empirical cumulative distribution function for a given value within 56 | a given distribution. 57 | """ 58 | # Sort the distribution 59 | data_sorted = np.sort(distribution) 60 | 61 | # Determine the CDF for the values within the distribution 62 | cdf = np.linspace(0, 1, len(distribution)) 63 | 64 | # Establish as a dataframe 65 | df = pd.DataFrame(list(zip(data_sorted, cdf)), columns=["value", "cdf"]) 66 | 67 | # Return the maximum CDF value for the given value 68 | return df[df["value"] <= value]["cdf"].max() 69 | 70 | 71 | def make_age_distributions( 72 | phenotype_age_file: str, logger: logging.Logger = None 73 | ) -> pd.DataFrame: 74 | """ 75 | Read in phenotype ages file and convert to pandas object with modeled distributions 76 | """ 77 | 78 | try: 79 | df = pd.read_csv(phenotype_age_file, sep="\t", names=["hpid", "mean", "std"]) 80 | 81 | except (FileNotFoundError, PermissionError) as e: 82 | 83 | if logger is not None: 84 | logger.critical(e) 85 | else: 86 | sys.stderr.write(str(e)) 87 | exit(1) 88 | 89 | distributions = [] 90 | for rec in df.to_dict("records"): 91 | 92 | try: 93 | # model truncated normal 94 | dist = get_truncated_normal( 95 | mean=rec["mean"], sd=rec["std"], lower=0, upper=rec["mean"] 96 | ) 97 | distributions.append({"hpid": rec["hpid"], "age_dist": dist}) 98 | 99 | except ValueError as e: 100 | if logger is not None: 101 | logger.critical(e) 102 | else: 103 | sys.stderr.write(str(e)) 104 | exit(1) 105 | 106 | return pd.DataFrame.from_dict(distributions).set_index("hpid") 107 | -------------------------------------------------------------------------------- /phenoseries/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/phenoseries/__init__.py -------------------------------------------------------------------------------- /phenoseries/experiment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import networkx as nx 5 | import numpy as np 6 | import pandas as pd 7 | import requests 8 | import sys 9 | 10 | from ast import literal_eval 11 | from phenopy.build_hpo import generate_annotated_hpo_network 12 | from phenopy.config import ( 13 | config, 14 | logger 15 | ) 16 | from phenopy.score import Scorer 17 | from phenopy.util import remove_parents, half_product 18 | from typing import ( 19 | List, 20 | Dict, 21 | ) 22 | 23 | try: 24 | from txt2hpo.extract import Extractor 25 | except ModuleNotFoundError: 26 | logger.warning("txt2hpo is not installed. 
This is only used in the " 27 | "validate-phenoseries command.\nTo use this command, please " 28 | "install txt2hpo: pip install txt2hpo") 29 | 30 | # TODO: fix the bug in this script before merging to master. 31 | 32 | OMIM_API_URL = "https://api.omim.org/api/" 33 | OMIM_DOWNLOADS_URL = "https://data.omim.org/downloads/" 34 | 35 | 36 | def request_mimid_info(mimid: str) -> requests.Response: 37 | """ 38 | request mimid description from OMIM 39 | """ 40 | access = "entry?" 41 | api_key = os.getenv("OMIM_API_KEY") 42 | if api_key is None: 43 | api_key = config.get("omim", "omim_api_key") 44 | payload = { 45 | "mimNumber": mimid, 46 | "include": "text", 47 | "format": "json", 48 | "apiKey": api_key, 49 | } 50 | 51 | r = requests.get(OMIM_API_URL + access, params=payload) 52 | if r.status_code == 200: 53 | return r 54 | else: 55 | logger.critical("Please set the omim_api_key in your phenopy.ini config file") 56 | 57 | 58 | def convert_and_filter_hpoids( 59 | terms: List, 60 | hpo: nx.MultiDiGraph, 61 | alt2prim: Dict[str, str]) -> List: 62 | """ 63 | Given a list of HPO ids, first try to convert synonyms to primary ids, 64 | then filter if terms are not in the ontology 65 | """ 66 | terms = [alt2prim[term] if term in alt2prim else term for term in terms] 67 | terms = list(filter(lambda term: term in hpo.nodes, terms)) 68 | terms = remove_parents(terms, hpo) 69 | return terms 70 | 71 | 72 | def make_rank_dataframe( 73 | pairwise_sim_matrix: np.ndarray, 74 | mimdf: pd.DataFrame, 75 | ps2mimids: Dict[str, List[str]]) -> pd.DataFrame: 76 | relevant_ranks_results = [] 77 | for psid, ps_mim_ids in ps2mimids.items(): 78 | # Grab the index of the "relevant" mim ids 79 | # Helps identify index in pairwise distance matrix 80 | ps_mim_idxs = mimdf[mimdf["omim_id"].isin(ps_mim_ids)].index.tolist() 81 | for query_mim_idx in ps_mim_idxs: 82 | ranks = return_relevant_ranks( 83 | pairwise_sim_matrix, query_mim_idx, ps_mim_idxs 84 | ) 85 | query_mim = mimdf.iloc[query_mim_idx]["omim_id"] 86 | relevant_ranks_results.append([psid, query_mim, ranks]) 87 | 88 | rankdf = pd.DataFrame( 89 | relevant_ranks_results, columns=["psid", "query_mim_id", "relevant_ranks"] 90 | ) 91 | rankdf["total_relevant"] = rankdf.apply( 92 | lambda row: len(row["relevant_ranks"]), axis=1 93 | ) 94 | 95 | return rankdf 96 | 97 | 98 | def return_relevant_ranks( 99 | pairwise_sim: np.ndarray, 100 | query_idx: int, 101 | other_mim_indices: List[int]) -> List[int]: 102 | """ 103 | Given a pairwise similarity matrix, compute the rank of the similarity between 104 | a query mim and another mim disease from the same PS. 
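
    Worked toy example: if the query's similarities to three other mims are
    [0.9, 0.2, 0.7], the double argsort below assigns 1-based ranks [1, 3, 2],
    and the sorted ranks of the mims sharing the query's PS are returned.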
105 | """ 106 | other_idxs = other_mim_indices.copy() 107 | other_idxs.remove(query_idx) 108 | other_idxs = [idx-1 for idx in other_idxs] 109 | mim_sims = pairwise_sim[query_idx].copy() 110 | mim_sims_noself = np.delete(mim_sims, [query_idx]) 111 | order = mim_sims_noself.argsort() 112 | ranks = order.argsort() 113 | ranks = max(ranks) - ranks 114 | # convert the ranks to 1-based 115 | ranks = np.array([r+1 for r in ranks]) 116 | return sorted(ranks[other_idxs]) 117 | 118 | 119 | def run_phenoseries_experiment( 120 | outdir=None, phenotypic_series_filepath=None, 121 | min_hpos=2, min_entities=4, phenoseries_fraction=1.0, 122 | scoring_method="HRSS", threads=1, 123 | omim_phenotypes_file=None, pairwise_mim_scores_file=None): 124 | 125 | if outdir is None: 126 | outdir = os.getcwd 127 | 128 | # load HPO network 129 | # data directory 130 | phenopy_data_directory = os.path.join(os.getenv("HOME"), ".phenopy/data") 131 | 132 | # files used in building the annotated HPO network 133 | obo_file = os.path.join(phenopy_data_directory, "hp.obo") 134 | disease_to_phenotype_file = os.path.join(phenopy_data_directory, "phenotype.hpoa") 135 | 136 | hpo_network, alt2prim, _ = generate_annotated_hpo_network( 137 | obo_file, disease_to_phenotype_file, ages_distribution_file=None 138 | ) 139 | 140 | # read the phenotypic series file as a DataFrame 141 | psdf = pd.read_csv( 142 | phenotypic_series_filepath, 143 | sep="\t", 144 | comment="#", 145 | names=["PS", "MIM", "Phenotype"], 146 | ) 147 | # null phenotypes are actually null MIM id fields, so just drop these 148 | psdf = psdf.dropna().sample(frac=phenoseries_fraction, random_state=42) 149 | psdf.reset_index(inplace=True, drop=True) 150 | 151 | # create a dictionary for phenotypic series to list of omim ids mapping 152 | ps2mimids = {} 153 | for ps, mim_ids in psdf.groupby(["PS"])["MIM"]: 154 | # more than two mims in a ps 155 | if len(mim_ids) >= 2: 156 | ps2mimids[ps] = list(set([int(mid) for mid in mim_ids.tolist()])) 157 | 158 | # invert the ps2mimid dictionary for easy lookup of which ps a mim belongs to 159 | mim2psids = {} 160 | for mim_id, ps in psdf.groupby(["MIM"])["PS"]: 161 | mim2psids[int(mim_id)] = ps.tolist() 162 | 163 | fields_to_use = [ 164 | "text", 165 | "description", 166 | "otherFeatures", 167 | "biochemicalFeatures", 168 | "diagnosis", 169 | "clinicalFeatures", 170 | ] 171 | 172 | if omim_phenotypes_file == "": 173 | logger.info("Scraping OMIM Diseases text") 174 | mim_texts = {} 175 | for mim_id in mim2psids: 176 | mim_response = request_mimid_info(mim_id) 177 | try: 178 | mim_info = mim_response.json() 179 | except AttributeError: 180 | break 181 | mim_text = mim_info["omim"]["entryList"][0]["entry"]["textSectionList"] 182 | 183 | all_mim_text = "" 184 | for text_section in mim_text: 185 | section_name = text_section["textSection"]["textSectionName"] 186 | if section_name in fields_to_use: 187 | # unique_section_names.add(section_name) 188 | added_text = text_section["textSection"]["textSectionContent"] 189 | all_mim_text += f" {added_text}" 190 | 191 | mim_texts[mim_id] = all_mim_text 192 | # instantiate txt2hpo's Exctractor class to perform named entity recognition 193 | extractor = Extractor( 194 | remove_negated=True, 195 | max_neighbors=3, 196 | correct_spelling=False) 197 | 198 | # loop over the MIM ids and extract hpo ids from each MIM's text fields 199 | mim_hpos = {} 200 | for mim_id in mim2psids: 201 | mim_hpos[mim_id] = extractor.hpo(mim_texts[mim_id]).hpids 202 | 203 | mimdf = pd.DataFrame() 204 | mimdf["omim_id"] = 
list(mim2psids.keys()) 205 | mimdf["hpo_terms"] = mimdf["omim_id"].apply(lambda mim_id: mim_hpos[mim_id]) 206 | mimdf.to_csv(os.path.join(outdir, "omim_phenotypes.txt"), index=False, sep='\t') 207 | 208 | else: 209 | logger.info("You passed an OMIM disease to phenotype file") 210 | try: 211 | mimdf = pd.read_csv(omim_phenotypes_file, sep="\t") 212 | mimdf["omim_id"] = mimdf["omim_id"].astype(int) 213 | mimdf["hpo_terms"] = mimdf["hpo_terms"].apply(literal_eval) 214 | mim_hpos = dict(zip(mimdf["omim_id"], mimdf["hpo_terms"])) 215 | except FileNotFoundError: 216 | sys.exit("Please provide a valid file path") 217 | 218 | # clean up HPO ids in lists 219 | for mim_id, hpo_ids in mim_hpos.items(): 220 | mim_hpos[mim_id] = convert_and_filter_hpoids(hpo_ids, hpo_network, alt2prim) 221 | 222 | # remove entities (mims) that have less than min_hpos 223 | mims_to_remove = [] 224 | for mim_id, hpo_ids in mim_hpos.copy().items(): 225 | if len(hpo_ids) <= min_hpos: 226 | mims_to_remove.append(mim_id) 227 | 228 | # Now remove the entities (mim ids) with less than min_hpos 229 | experiment_ps2mimids = {} 230 | # remove these mims from ps 231 | for ps, mimids in ps2mimids.copy().items(): 232 | experiment_ps2mimids[ps] = [] 233 | for ps_mim_id in mimids: 234 | if ps_mim_id not in mims_to_remove: 235 | experiment_ps2mimids[ps].append(ps_mim_id) 236 | 237 | # After removing entities, make sure the series has min number of entities 238 | # get lists of mims and their PS 239 | remove_these_ps = [] 240 | for ps, mimids in experiment_ps2mimids.items(): 241 | if len(mimids) < min_entities: 242 | remove_these_ps.append(ps) 243 | 244 | for psid in remove_these_ps: 245 | del experiment_ps2mimids[psid] 246 | 247 | # Create a unique list of entity ids, for scoring later 248 | experiment_omims = set() 249 | for psid, mim_ids in experiment_ps2mimids.items(): 250 | for mim in mim_ids: 251 | experiment_omims.add(mim) 252 | experiment_omims = list(experiment_omims) 253 | 254 | # make a DataFrame for entity ids 255 | mimdf = pd.DataFrame() 256 | mimdf["omim_id"] = experiment_omims 257 | mimdf["hpo_terms"] = mimdf["omim_id"].apply(lambda mim_id: mim_hpos[mim_id]) 258 | 259 | if pairwise_mim_scores_file == "": 260 | scorer = Scorer(hpo_network, scoring_method=scoring_method) 261 | records = [ 262 | { 263 | "record_id": mim_id, 264 | "terms": convert_and_filter_hpoids(hpo_terms, hpo_network, alt2prim), 265 | "weights": {}, 266 | } 267 | for mim_id, hpo_terms in dict( 268 | zip(mimdf["omim_id"], mimdf["hpo_terms"]) 269 | ).items() 270 | ] 271 | 272 | results = scorer.score_records( 273 | records, records, half_product(len(records), len(records)), threads=threads 274 | ) 275 | 276 | pairwise_scores = pd.DataFrame( 277 | results, columns=["mimid1", "mimid2", "phenopy-score"] 278 | ) 279 | # convert to square form 280 | pairwise_scores = pairwise_scores.set_index(["mimid1", "mimid2"]).unstack() 281 | # This pandas method chain fills in the missing scores of the square matrix 282 | # with the values from the transpose of df. 
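        # (Illustrative: half_product yields only pairs with i <= j, so each
        # missing cell (j, i) is filled from its mirror (i, j) via fillna on
        # the transposed frame.)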
283 | pairwise_scores = ( 284 | pairwise_scores["phenopy-score"] 285 | .reset_index(drop=True) 286 | .fillna(pairwise_scores.T.droplevel(0).reset_index(drop=True)) 287 | .set_index(pairwise_scores.index, drop=True) 288 | ) 289 | # reindex with the mimdf index 290 | pairwise_scores = pairwise_scores.reindex(mimdf["omim_id"].tolist()) 291 | pairwise_scores = pairwise_scores[mimdf["omim_id"].tolist()] 292 | pd.DataFrame(pairwise_scores).to_csv( 293 | os.path.join(outdir, 'phenoseries.psim_matrix.txt'), 294 | sep='\t' 295 | ) 296 | else: 297 | pairwise_scores = pd.read_csv(pairwise_mim_scores_file, sep='\t') 298 | 299 | ranksdf = make_rank_dataframe( 300 | pairwise_scores.astype(float).values, mimdf, experiment_ps2mimids 301 | ) 302 | ranksdf.to_csv(os.path.join(outdir, "phenoseries.rankdf.txt"), sep="\t") 303 | 304 | 305 | if __name__ == "__main__": 306 | parser = argparse.ArgumentParser() 307 | parser.add_argument( 308 | "--outdir", "-o", default=os.getcwd(), help="Path where to store the results." 309 | ) 310 | parser.add_argument( 311 | "--phenotypic-series-filepath", 312 | "-p", 313 | help="path to the omim text file defining phenotypic series to omim id", 314 | ) 315 | parser.add_argument( 316 | "--min-hpos", 317 | "-n", 318 | default=4, 319 | type=int, 320 | help="The minimum number of hpo ids per entity (mim id, for example) to " 321 | "be considered for the experiment", 322 | ) 323 | parser.add_argument( 324 | "--min-entities", 325 | "-m", 326 | default=2, 327 | type=int, 328 | help="The minimum number of entities (mim id, for example) per series to " 329 | "be considered for the experiment", 330 | ) 331 | parser.add_argument( 332 | "--phenoseries-fraction", 333 | "-f", 334 | default=1.0, 335 | help="The fraction of phenoseries to use", 336 | type=float, 337 | ) 338 | parser.add_argument( 339 | "--scoring-method", 340 | "-s", 341 | default="HRSS", 342 | help="The scoring method to use", 343 | type=str, 344 | ) 345 | parser.add_argument( 346 | "--threads", "-t", default=4, help="The number of threads to use", type=int, 347 | ) 348 | parser.add_argument( 349 | "--omim-phenotypes-file", 350 | "-a", 351 | default="", 352 | help="The full path to a pre-generated omim id to list of phenotypes file", 353 | type=str, 354 | ) 355 | parser.add_argument( 356 | "--pairwise-mim-scores-file", 357 | "-b", 358 | default="", 359 | help="The full path to a pre-generated file with all the pairwise scores for " 360 | "each omim id in the experiment.", 361 | type=str, 362 | ) 363 | 364 | args = parser.parse_args() 365 | 366 | outdir = args.outdir 367 | phenotypic_series_filepath = args.phenotypic_series_filepath 368 | min_hpos = args.min_hpos 369 | min_entities = args.min_entities 370 | phenoseries_fraction = args.phenoseries_fraction 371 | scoring_method = args.scoring_method 372 | threads = args.threads 373 | omim_phenotypes_file = args.omim_phenotypes_file 374 | pairwise_mim_scores_file = args.pairwise_mim_scores_file 375 | 376 | run_phenoseries_experiment( 377 | outdir=outdir, 378 | phenotypic_series_filepath=phenotypic_series_filepath, 379 | min_hpos=min_hpos, 380 | min_entities=min_entities, 381 | phenoseries_fraction=phenoseries_fraction, 382 | scoring_method=scoring_method, 383 | threads=threads, 384 | omim_phenotypes_file=omim_phenotypes_file, 385 | pairwise_mim_scores_file=pairwise_mim_scores_file, 386 | ) 387 | -------------------------------------------------------------------------------- /phenoseries/phenoseries.requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | matplotlib 4 | requests 5 | scikit-learn 6 | txt2hpo -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "phenopy" 7 | version = "0.6.0" 8 | description = "Phenotype comparison scoring by semantic similarity." 9 | authors = [ 10 | "Kevin Arvai", 11 | "Kyle Retterer", 12 | "Carlos Borroto ", 13 | "Vlad Gainullin", 14 | "Vincent Ustach ", 15 | "Stephen McGee " 16 | ] 17 | readme = "README.md" 18 | license = "" 19 | 20 | [tool.poetry.scripts] 21 | phenopy = "phenopy.__main__:main" 22 | 23 | [tool.poetry.urls] 24 | homepage = "https://github.com/GeneDx/phenopy" 25 | "Bug Tracker" = "https://github.com/GeneDx/phenopy/issues" 26 | 27 | [tool.poetry.dependencies] 28 | python = "^3.9" 29 | fire = "^0.5.0" 30 | gensim = "^4.3.0" 31 | networkx = "2.6.3" 32 | numpy = "^1.21.1" 33 | obonet = "^1.0.0" 34 | pandas = "^1.0.0" 35 | scipy = "^1.6.1" 36 | requests = "^2.31.0" 37 | pytest = "^7.3.1" 38 | 39 | [tool.poetry.dev-dependencies] 40 | pre-commit = "^2.21.0" 41 | pytest = "^7.3.1" 42 | pytest-cov = "^4.0.0" 43 | ruff = "^0.0.264" 44 | 45 | [tool.ruff] 46 | line-length = 88 47 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/phenotype_age.tsv: -------------------------------------------------------------------------------- 1 | HP:0001251 6.0 3.0 2 | HP:0001263 1.0 1.0 3 | HP:0001290 1.0 1.0 4 | HP:0004322 10.0 3.0 5 | HP:0001249 6.0 3.0 6 | -------------------------------------------------------------------------------- /tests/data/test.score-long.txt: -------------------------------------------------------------------------------- 1 | 118200 . HP:0000006|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001765|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003380|HP:0003382|HP:0003383|HP:0003431|HP:0003449|HP:0003587|HP:0003621|HP:0003677|HP:0003690|HP:0003693|HP:0003693|HP:0003828|HP:0004336|HP:0009027|HP:0009830|HP:0011096|HP:0012074 2 | 118210 . HP:0000006|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003383|HP:0003384|HP:0003431|HP:0003674|HP:0003677|HP:0003690|HP:0003693|HP:0003693|HP:0009027|HP:0009830 3 | 118220 . HP:0000006|HP:0000365|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001765|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003380|HP:0003382|HP:0003383|HP:0003431|HP:0003449|HP:0003481|HP:0003587|HP:0003621|HP:0003677|HP:0003690|HP:0003693|HP:0003828|HP:0004336|HP:0009027|HP:0009830 4 | 118230 . HP:0000006|HP:0000470|HP:0000774|HP:0000975|HP:0001026|HP:0001270|HP:0001301|HP:0001315|HP:0001417|HP:0001600|HP:0001678|HP:0001761|HP:0001999|HP:0002013|HP:0002018|HP:0002028|HP:0003009|HP:0003091|HP:0003447|HP:0003474|HP:0003593|HP:0003693|HP:0004875|HP:0005853|HP:0009049|HP:0011727 5 | 118300 . 
HP:0000006|HP:0000407|HP:0001171|HP:0001265|HP:0001284|HP:0001761|HP:0001765|HP:0001884|HP:0002460|HP:0002936|HP:0003376|HP:0003431|HP:0003621|HP:0003690|HP:0003693|HP:0003693|HP:0009027|HP:0011463 6 | 118301 . HP:0000006|HP:0000007|HP:0000508|HP:0000726|HP:0000762|HP:0000763|HP:0000975|HP:0001026|HP:0001278|HP:0001300|HP:0001301|HP:0001315|HP:0001347|HP:0001417|HP:0001678|HP:0001761|HP:0002013|HP:0002018|HP:0002028|HP:0002171|HP:0002398|HP:0003009|HP:0003091|HP:0003447|HP:0003693|HP:0005150|HP:0007110|HP:0009049|HP:0011727 7 | 148360 . HP:0000006|HP:0000982|HP:0001425|HP:0001761|HP:0002164|HP:0003390|HP:0007002|HP:0008404 8 | 214400 . HP:0000007|HP:0000764|HP:0001171|HP:0001178|HP:0001265|HP:0001270|HP:0001284|HP:0001425|HP:0001765|HP:0002460|HP:0002460|HP:0002751|HP:0002936|HP:0003380|HP:0003382|HP:0003400|HP:0003429|HP:0003431|HP:0003593|HP:0003678|HP:0003693|HP:0006915|HP:0007182|HP:0040078 9 | 300905 . HP:0000407|HP:0001265|HP:0001271|HP:0001423|HP:0001761|HP:0002378|HP:0003376|HP:0003474|HP:0003677|HP:0030237 10 | 302800 . HP:0000407|HP:0000639|HP:0000763|HP:0000764|HP:0001260|HP:0001265|HP:0001270|HP:0001272|HP:0001310|HP:0001337|HP:0001419|HP:0001423|HP:0001761|HP:0001771|HP:0002015|HP:0002311|HP:0002355|HP:0002385|HP:0002395|HP:0002427|HP:0002460|HP:0002460|HP:0002500|HP:0002936|HP:0003380|HP:0003383|HP:0003431|HP:0003487|HP:0003677|HP:0003693|HP:0003829|HP:0009830|HP:0040078|HP:0040083 11 | 302801 . HP:0001249|HP:0001284|HP:0001419|HP:0001761|HP:0002460|HP:0002936|HP:0003376|HP:0003431|HP:0003482|HP:0003484|HP:0003593|HP:0003693|HP:0009027 12 | 302802 . HP:0000762|HP:0001284|HP:0001385|HP:0001419|HP:0001761|HP:0002385|HP:0002460|HP:0002650|HP:0002936|HP:0003376|HP:0003482|HP:0003484|HP:0003693|HP:0009027 13 | 302803 . HP:0001362|HP:0001419|HP:0003390|HP:0007002|HP:0007385 14 | 302900 . HP:0000639|HP:0000819|HP:0001251|HP:0001260|HP:0001284|HP:0001417|HP:0001635|HP:0001691|HP:0001761|HP:0001765|HP:0001953|HP:0002062|HP:0002495|HP:0002650|HP:0002936|HP:0003115|HP:0003116|HP:0003133|HP:0003134|HP:0003209|HP:0003232|HP:0003376|HP:0003487|HP:0003621|HP:0005157|HP:0008954|HP:0008963|HP:0009005|HP:0009027|HP:0010831|HP:0011397|HP:0011399|HP:0011441 15 | 311070 . HP:0000365|HP:0000407|HP:0000510|HP:0000529|HP:0000648|HP:0001270|HP:0001271|HP:0001288|HP:0001419|HP:0001761|HP:0002460|HP:0002522|HP:0002936|HP:0003383|HP:0003481|HP:0003693|HP:0003828|HP:0011463|HP:0032460 16 | 600882 . HP:0000006|HP:0000763|HP:0001265|HP:0001284|HP:0001761|HP:0001763|HP:0001765|HP:0001810|HP:0001868|HP:0001886|HP:0002460|HP:0002460|HP:0003376|HP:0003378|HP:0003380|HP:0003384|HP:0003431|HP:0003474|HP:0003693|HP:0009027 17 | 601098 . HP:0000006|HP:0001265|HP:0001425|HP:0001761|HP:0002460|HP:0002936|HP:0003382|HP:0003383|HP:0003431|HP:0003481|HP:0003621|HP:0003693 18 | 601152 . HP:0000006|HP:0000007|HP:0000360|HP:0000458|HP:0000543|HP:0000551|HP:0000603|HP:0000641|HP:0000648|HP:0000649|HP:0001265|HP:0001284|HP:0001604|HP:0001761|HP:0002403|HP:0002460|HP:0002650|HP:0002936|HP:0002938|HP:0003376|HP:0003378|HP:0003378|HP:0003409|HP:0003431|HP:0003593|HP:0003690|HP:0003693|HP:0003693|HP:0003701|HP:0007924|HP:0008587 19 | 601382 . HP:0000007|HP:0001270|HP:0001425|HP:0001762|HP:0002460|HP:0002650|HP:0002936|HP:0003431|HP:0003693|HP:0003693|HP:0003701|HP:0006958|HP:0007208|HP:0010628 20 | 601455 . HP:0000007|HP:0000365|HP:0000649|HP:0000762|HP:0001155|HP:0001265|HP:0001284|HP:0001288|HP:0002460|HP:0002936|HP:0003383|HP:0003447|HP:0003481|HP:0003621|HP:0003693|HP:0004696|HP:0006916|HP:0006958 21 | 601472 . 
HP:0000006|HP:0001265|HP:0001761|HP:0001765|HP:0002172|HP:0002650|HP:0002936|HP:0003392|HP:0003393|HP:0003426|HP:0003427|HP:0003435|HP:0003484|HP:0003674|HP:0003677|HP:0003693|HP:0009129 22 | 601596 . HP:0000007|HP:0000365|HP:0000639|HP:0000764|HP:0001270|HP:0001291|HP:0001308|HP:0001425|HP:0001761|HP:0002355|HP:0002460|HP:0002650|HP:0002936|HP:0003387|HP:0003400|HP:0003431|HP:0003484|HP:0003693|HP:0004466|HP:0007107|HP:0007695|HP:0010628|HP:0012473|HP:0040078 23 | 604563 . HP:0000007|HP:0000407|HP:0000501|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0001762|HP:0001765|HP:0002355|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003380|HP:0003383|HP:0003431|HP:0003481|HP:0003621|HP:0003693|HP:0009027 24 | 605588 . HP:0000007|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002751|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003383|HP:0003384|HP:0003431|HP:0003484|HP:0003674|HP:0003693|HP:0003701|HP:0009027 25 | 605589 . HP:0000007|HP:0001265|HP:0001284|HP:0002460|HP:0002936|HP:0003431|HP:0003581|HP:0003693 26 | 606482 . HP:0000006|HP:0000764|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002460|HP:0002936|HP:0003380|HP:0003383|HP:0003481|HP:0003621|HP:0003693|HP:0003693|HP:0007107|HP:0040078 27 | 606483 . HP:0000006|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003383|HP:0003394|HP:0003481|HP:0003674|HP:0003693|HP:0003693|HP:0007107|HP:0009027 28 | 606595 . HP:0000006|HP:0001171|HP:0001178|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002380|HP:0002460|HP:0002936|HP:0003376|HP:0003394|HP:0003431|HP:0003693|HP:0007267|HP:0009027 29 | 607677 . HP:0000006|HP:0001265|HP:0001284|HP:0001761|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003484|HP:0003693 30 | 607678 . HP:0000006|HP:0001425|HP:0002460|HP:0003376|HP:0003431|HP:0003484|HP:0003621|HP:0003693|HP:0003828|HP:0009027|HP:0009830 31 | 607684 . HP:0000006|HP:0000218|HP:0000508|HP:0001171|HP:0001178|HP:0001265|HP:0001270|HP:0001284|HP:0001371|HP:0001761|HP:0001765|HP:0002460|HP:0002650|HP:0002936|HP:0003376|HP:0003431|HP:0003693|HP:0003798|HP:0003828|HP:0006006|HP:0009025|HP:0009027|HP:0010628 32 | 607706 . HP:0000007|HP:0001171|HP:0001284|HP:0001371|HP:0001604|HP:0001761|HP:0002460|HP:0002936|HP:0003378|HP:0003380|HP:0003383|HP:0003431|HP:0003623|HP:0003693|HP:0008443 33 | 607731 . HP:0000007|HP:0001761|HP:0002460|HP:0002460|HP:0002936|HP:0003376|HP:0003380|HP:0003438|HP:0003450|HP:0003621|HP:0003693|HP:0007083|HP:0007350|HP:0009027 34 | 607734 . HP:0000006|HP:0000007|HP:0001265|HP:0001270|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002936|HP:0003380|HP:0003383|HP:0003431|HP:0003481|HP:0003621|HP:0003693|HP:0003828|HP:0004336|HP:0007233 35 | 607736 . HP:0000006|HP:0000407|HP:0000408|HP:0000478|HP:0001265|HP:0001284|HP:0001761|HP:0002015|HP:0002086|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003693|HP:0003693|HP:0009027|HP:0011096 36 | 607791 . HP:0000006|HP:0001265|HP:0001284|HP:0002460|HP:0002936|HP:0003378|HP:0003481|HP:0003484|HP:0003693 37 | 607831 . HP:0000006|HP:0000007|HP:0001171|HP:0001284|HP:0001425|HP:0001762|HP:0002460|HP:0002751|HP:0002936|HP:0003380|HP:0003431|HP:0003450|HP:0003593|HP:0003693|HP:0003701 38 | 608323 . HP:0000006|HP:0001425|HP:0001760|HP:0002460|HP:0002936|HP:0003450|HP:0003484|HP:0003693 39 | 608340 . 
HP:0000007|HP:0001178|HP:0001265|HP:0001284|HP:0001761|HP:0001762|HP:0002650|HP:0002936|HP:0003376|HP:0003383|HP:0003387|HP:0003445|HP:0003690|HP:0003693|HP:0009027|HP:0009830|HP:0011096|HP:0011463 40 | 608673 . HP:0000006|HP:0001265|HP:0001284|HP:0001425|HP:0001761|HP:0002460|HP:0002650|HP:0002936|HP:0003387|HP:0003444|HP:0003477|HP:0003693|HP:0007078 41 | 609260 . HP:0000006|HP:0000007|HP:0000365|HP:0000648|HP:0001257|HP:0001265|HP:0001268|HP:0001276|HP:0001284|HP:0001337|HP:0001347|HP:0001371|HP:0001761|HP:0001765|HP:0002460|HP:0002650|HP:0002936|HP:0003376|HP:0003378|HP:0003380|HP:0003383|HP:0003384|HP:0003431|HP:0003487|HP:0003677|HP:0003690|HP:0003693|HP:0003693|HP:0003828|HP:0003829|HP:0009027|HP:0012531 42 | 609311 . HP:0000007|HP:0001265|HP:0001270|HP:0001284|HP:0001425|HP:0001761|HP:0001762|HP:0002515|HP:0002650|HP:0002936|HP:0003380|HP:0003383|HP:0003431|HP:0003484|HP:0003593|HP:0008944|HP:0009053|HP:0011096 43 | 611228 . HP:0000007|HP:0000762|HP:0001265|HP:0001270|HP:0001284|HP:0001288|HP:0002359|HP:0002460|HP:0002936|HP:0003383|HP:0003431|HP:0003447|HP:0003676|HP:0003828|HP:0005684|HP:0006466|HP:0007182 44 | 613287 . HP:0000006|HP:0000407|HP:0001284|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003202|HP:0003431|HP:0003477|HP:0003828|HP:0009027 45 | 613641 . HP:0000007|HP:0001263|HP:0001265|HP:0001284|HP:0001761|HP:0002936|HP:0003376|HP:0009027|HP:0009588 46 | 614228 . HP:0000006|HP:0001265|HP:0001270|HP:0001761|HP:0002359|HP:0002460|HP:0002527|HP:0002936|HP:0003431|HP:0003677|HP:0003690|HP:0009046 47 | 614436 . HP:0000006|HP:0000007|HP:0000007|HP:0000764|HP:0001265|HP:0001284|HP:0001761|HP:0001765|HP:0002380|HP:0002460|HP:0002936|HP:0003376|HP:0003378|HP:0003431|HP:0003677|HP:0003693|HP:0003829|HP:0006886|HP:0009027|HP:0040078|HP:0040083 48 | 614455 . HP:0000006|HP:0000093|HP:0000097|HP:0001171|HP:0001265|HP:0001284|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003376|HP:0003383|HP:0003447|HP:0003676|HP:0003774|HP:0007149|HP:0008944|HP:0009027 49 | 614895 . HP:0000007|HP:0001270|HP:0001284|HP:0001604|HP:0001761|HP:0002355|HP:0002650|HP:0002936|HP:0003202|HP:0003387|HP:0003400|HP:0003431|HP:0003677|HP:0003690|HP:0010871|HP:0011096 50 | 615025 . HP:0000006|HP:0002355|HP:0003202|HP:0003474 51 | 615185 . HP:0000006|HP:0001265|HP:0001761|HP:0001765|HP:0002936|HP:0003376|HP:0003383|HP:0003450|HP:0003677 52 | 615284 . HP:0000007|HP:0000020|HP:0000252|HP:0000486|HP:0000602|HP:0000762|HP:0001159|HP:0001249|HP:0001284|HP:0001288|HP:0001763|HP:0002650|HP:0002936|HP:0003383|HP:0003676|HP:0003690|HP:0012444 53 | 615376 . HP:0000007|HP:0001284|HP:0001761|HP:0001765|HP:0002460|HP:0002936|HP:0003387|HP:0003431|HP:0008180 54 | 615490 . HP:0000007|HP:0001284|HP:0001290|HP:0001324|HP:0001605|HP:0001761|HP:0001762|HP:0002093|HP:0002136|HP:0002540|HP:0002779|HP:0003199|HP:0003380|HP:0003477|HP:0006380|HP:0031936|HP:0040078 55 | 616039 . HP:0000007|HP:0001265|HP:0001284|HP:0001761|HP:0002936|HP:0003376|HP:0003383|HP:0003677|HP:0009027 56 | 616155 . HP:0000007|HP:0001265|HP:0001284|HP:0001762|HP:0002650|HP:0002936|HP:0003376|HP:0003677|HP:0003701|HP:0007141|HP:0009027|HP:0040078 57 | 616280 . HP:0000006|HP:0001284|HP:0002936|HP:0003376|HP:0003477|HP:0003677|HP:0009027 58 | 616491 . HP:0000006|HP:0001265|HP:0002936|HP:0003401|HP:0003676|HP:0010871 59 | 616625 . HP:0000006|HP:0001761|HP:0001765|HP:0002936|HP:0003376|HP:0003438|HP:0003828 60 | 616668 . 
HP:0000007|HP:0001155|HP:0001284|HP:0001337|HP:0001761|HP:0002079|HP:0002751|HP:0002936|HP:0003477|HP:0003677|HP:0006466|HP:0009027|HP:0100543 61 | 616684 . HP:0000007|HP:0000407|HP:0000666|HP:0001251|HP:0001284|HP:0001332|HP:0002151|HP:0002355|HP:0002751|HP:0003202|HP:0003388|HP:0003447|HP:0003677|HP:0003828|HP:0009830|HP:0011096 62 | 616687 . HP:0000006|HP:0001284|HP:0001761|HP:0001765|HP:0002936|HP:0003236|HP:0003676|HP:0003828 63 | 616688 . HP:0000006|HP:0000020|HP:0000365|HP:0001171|HP:0001263|HP:0001270|HP:0001276|HP:0001284|HP:0001290|HP:0001620|HP:0001761|HP:0002355|HP:0002380|HP:0002411|HP:0002650|HP:0002936|HP:0003394|HP:0003677|HP:0003701|HP:0007256|HP:0009027 64 | 616924 . HP:0000006|HP:0001265|HP:0001761|HP:0002021|HP:0002359|HP:0002495|HP:0002515|HP:0002936|HP:0003198|HP:0003200|HP:0003236|HP:0003390|HP:0003445|HP:0003484|HP:0003487|HP:0003555|HP:0003557|HP:0003676|HP:0003701|HP:0003805|HP:0003828|HP:0007141|HP:0007210|HP:0007340|HP:0009129 65 | 617017 . HP:0000006|HP:0000007|HP:0001265|HP:0001284|HP:0002317|HP:0002936|HP:0003581|HP:0003677|HP:0007141|HP:0009027 66 | 617087 . HP:0000007|HP:0000365|HP:0000543|HP:0000648|HP:0001265|HP:0001761|HP:0002194|HP:0002355|HP:0002650|HP:0002747|HP:0002808|HP:0002936|HP:0003477|HP:0003701|HP:0003828|HP:0009027 67 | 617882 . HP:0000006|HP:0000407|HP:0000639|HP:0001171|HP:0001251|HP:0001257|HP:0001265|HP:0001270|HP:0001284|HP:0001761|HP:0002460|HP:0002515|HP:0002936|HP:0003236|HP:0003376|HP:0003391|HP:0003487|HP:0003677|HP:0007141 68 | 618036 . HP:0000006|HP:0001265|HP:0001284|HP:0001761|HP:0003376|HP:0003394|HP:0003677|HP:0007141|HP:0009027 69 | 618279 . HP:0000006|HP:0001761|HP:0002355|HP:0002359|HP:0003376|HP:0003383|HP:0003677 70 | 117210 . HP:0000006|HP:0000407|HP:0001251|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0003584|HP:0007979 71 | 117360 . HP:0000006|HP:0000639|HP:0001260|HP:0001270|HP:0001310|HP:0002070|HP:0002075|HP:0002080|HP:0002136|HP:0002335|HP:0002470|HP:0003577|HP:0006855|HP:0006855|HP:0100543 72 | 133190 . HP:0000006|HP:0000605|HP:0000639|HP:0000951|HP:0000962|HP:0001257|HP:0001260|HP:0001265|HP:0001272|HP:0001347|HP:0002066|HP:0002070|HP:0002075|HP:0002080|HP:0002380|HP:0003477|HP:0003829|HP:0007256|HP:0007772 73 | 164400 . HP:0000006|HP:0000514|HP:0000543|HP:0000623|HP:0000639|HP:0000640|HP:0000641|HP:0000648|HP:0001151|HP:0001252|HP:0001257|HP:0001260|HP:0001283|HP:0001284|HP:0001290|HP:0001310|HP:0001347|HP:0002015|HP:0002070|HP:0002071|HP:0002072|HP:0002073|HP:0002075|HP:0002078|HP:0002168|HP:0002198|HP:0002495|HP:0002503|HP:0002542|HP:0002839|HP:0003202|HP:0003431|HP:0003448|HP:0003487|HP:0003581|HP:0003693|HP:0003744|HP:0007006|HP:0007078|HP:0007263|HP:0100543 74 | 164500 . HP:0000006|HP:0000514|HP:0000529|HP:0000580|HP:0000608|HP:0000623|HP:0000639|HP:0000648|HP:0001257|HP:0001260|HP:0001268|HP:0001310|HP:0001337|HP:0001347|HP:0002015|HP:0002071|HP:0002072|HP:0002073|HP:0002310|HP:0002542|HP:0003487|HP:0003744 75 | 183050 . HP:0000006|HP:0000762|HP:0001251|HP:0001257|HP:0001260|HP:0001271|HP:0002063|HP:0002067|HP:0002380|HP:0003202 76 | 183086 . HP:0000006|HP:0000640|HP:0000763|HP:0001260|HP:0001272|HP:0002015|HP:0002073|HP:0002076|HP:0003676|HP:0003743|HP:0007670|HP:0007772 77 | 183090 . 
HP:0000006|HP:0000510|HP:0000514|HP:0000602|HP:0000640|HP:0000641|HP:0000657|HP:0000726|HP:0001151|HP:0001252|HP:0001257|HP:0001260|HP:0001265|HP:0001290|HP:0001300|HP:0001310|HP:0001336|HP:0002015|HP:0002063|HP:0002067|HP:0002070|HP:0002073|HP:0002075|HP:0002172|HP:0002174|HP:0002198|HP:0002380|HP:0002495|HP:0002503|HP:0002542|HP:0002839|HP:0003693|HP:0003743 78 | 213200 . HP:0000007|HP:0000639|HP:0000750|HP:0001152|HP:0001249|HP:0001257|HP:0001260|HP:0001263|HP:0001265|HP:0001290|HP:0001310|HP:0001321|HP:0001337|HP:0001347|HP:0001761|HP:0002066|HP:0002070|HP:0002171|HP:0002311|HP:0002317|HP:0003593|HP:0003680|HP:0004322 79 | 271250 . HP:0000007|HP:0000365|HP:0000618|HP:0001251|HP:0005102 80 | 271270 . HP:0000007|HP:0000179|HP:0000280|HP:0000337|HP:0000463|HP:0000508|HP:0001251|HP:0001252|HP:0001260|HP:0001263|HP:0001265|HP:0001272|HP:0001290|HP:0001760|HP:0002208|HP:0002650|HP:0002714|HP:0003196|HP:0003487 81 | 300703 . HP:0000639|HP:0001251|HP:0001260|HP:0001270|HP:0001319|HP:0001419|HP:0002345|HP:0003593|HP:0003680 82 | 301310 . HP:0001260|HP:0001310|HP:0001419|HP:0001924|HP:0001939|HP:0002075|HP:0002080|HP:0002169|HP:0002470|HP:0003487|HP:0003621|HP:0004840 83 | 301790 . HP:0000407|HP:0000543|HP:0000565|HP:0000648|HP:0000726|HP:0001250|HP:0001252|HP:0001254|HP:0001257|HP:0001263|HP:0001265|HP:0001272|HP:0001284|HP:0001290|HP:0001310|HP:0001324|HP:0001419|HP:0001522|HP:0002013|HP:0002015|HP:0002020|HP:0002080|HP:0002171|HP:0002205|HP:0002311|HP:0002529|HP:0002599|HP:0003593|HP:0004881|HP:0004885|HP:0008757 84 | 301840 . HP:0000726|HP:0001251|HP:0001337|HP:0001417|HP:0002062|HP:0007256|HP:0031936 85 | 302500 . HP:0000486|HP:0000514|HP:0000639|HP:0001251|HP:0001260|HP:0001270|HP:0001272|HP:0001319|HP:0001417|HP:0001419|HP:0002080|HP:0003577|HP:0003621|HP:0003680|HP:0003698 86 | 302600 . HP:0001251|HP:0001417|HP:0002071 87 | 600223 . HP:0000006|HP:0000763|HP:0001260|HP:0001265|HP:0001272|HP:0001284|HP:0002073|HP:0002406|HP:0002936|HP:0003487|HP:0007772 88 | 600224 . HP:0000006|HP:0000317|HP:0000640|HP:0001260|HP:0001263|HP:0001272|HP:0001290|HP:0001310|HP:0001347|HP:0002066|HP:0002070|HP:0002075|HP:0002080|HP:0002311|HP:0002493|HP:0002495|HP:0003593|HP:0003674|HP:0003677|HP:0007772|HP:0100543 89 | 603516 . HP:0000006|HP:0000012|HP:0000020|HP:0000639|HP:0000716|HP:0000726|HP:0000762|HP:0001250|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0002015|HP:0002062|HP:0002066|HP:0002070|HP:0002071|HP:0002073|HP:0002075|HP:0002168|HP:0002311|HP:0003743|HP:0003829|HP:0007256 90 | 604326 . HP:0000006|HP:0000317|HP:0000496|HP:0000716|HP:0000726|HP:0000739|HP:0000746|HP:0001260|HP:0001272|HP:0001300|HP:0001310|HP:0001347|HP:0002073|HP:0002075|HP:0002120|HP:0002345|HP:0002346|HP:0002530|HP:0007141 91 | 604432 . HP:0000006|HP:0000639|HP:0001260|HP:0001272|HP:0001347|HP:0002073|HP:0003581 92 | 605259 . HP:0000006|HP:0000365|HP:0000639|HP:0000648|HP:0001249|HP:0001252|HP:0001260|HP:0001270|HP:0001272|HP:0001290|HP:0001347|HP:0002062|HP:0002066|HP:0002070|HP:0002073|HP:0002406|HP:0003677|HP:0007256|HP:0008003 93 | 605361 . HP:0000006|HP:0000317|HP:0000639|HP:0000716|HP:0001260|HP:0001268|HP:0001272|HP:0001310|HP:0001347|HP:0002015|HP:0002066|HP:0002073|HP:0002354|HP:0003677|HP:0003829|HP:0004373|HP:0006938|HP:0007018 94 | 606002 . 
HP:0000007|HP:0000486|HP:0000524|HP:0000639|HP:0000640|HP:0000657|HP:0001152|HP:0001260|HP:0001265|HP:0001271|HP:0001272|HP:0001284|HP:0001332|HP:0001337|HP:0001761|HP:0002015|HP:0002066|HP:0002070|HP:0002072|HP:0002346|HP:0002460|HP:0002650|HP:0003236|HP:0003431|HP:0003477|HP:0003676|HP:0003693|HP:0003828|HP:0006254|HP:0006879|HP:0006886|HP:0006937|HP:0007240|HP:0007256|HP:0007267|HP:0010702|HP:0010831 95 | 606658 . HP:0000006|HP:0000641|HP:0001260|HP:0001272|HP:0001347|HP:0002066|HP:0002070|HP:0002078|HP:0002168|HP:0002174|HP:0003581|HP:0003621|HP:0003677|HP:0007772|HP:0007979 96 | 607136 . HP:0000006|HP:0000020|HP:0000640|HP:0000716|HP:0000718|HP:0000727|HP:0000738|HP:0000743|HP:0000757|HP:0001250|HP:0001260|HP:0001272|HP:0001289|HP:0001300|HP:0001310|HP:0001332|HP:0001336|HP:0002015|HP:0002063|HP:0002066|HP:0002067|HP:0002070|HP:0002072|HP:0002080|HP:0002136|HP:0002171|HP:0002186|HP:0002300|HP:0002403|HP:0002506|HP:0002529|HP:0003676|HP:0007668|HP:0011999 97 | 607250 . HP:0000007|HP:0001251|HP:0001761|HP:0003376|HP:0003477|HP:0003693 98 | 607317 . HP:0000007|HP:0000252|HP:0000639|HP:0001251|HP:0001256|HP:0001257|HP:0001260|HP:0001270|HP:0001332|HP:0001336|HP:0001337|HP:0001347|HP:0001761|HP:0002066|HP:0002359|HP:0002380|HP:0002460|HP:0002500|HP:0003477|HP:0003487|HP:0003581|HP:0003693|HP:0003828|HP:0007338|HP:0008936|HP:0032105 99 | 607346 . HP:0000006|HP:0001260|HP:0001265|HP:0001272|HP:0001336|HP:0001347|HP:0002015|HP:0002066|HP:0002070|HP:0002073|HP:0002078|HP:0002174|HP:0002396|HP:0003677|HP:0007944|HP:0007979|HP:0100543 100 | 607454 . HP:0000006|HP:0000514|HP:0000639|HP:0000718|HP:0000741|HP:0001249|HP:0001260|HP:0001263|HP:0001265|HP:0001272|HP:0001300|HP:0002066|HP:0002070|HP:0002071|HP:0002073|HP:0002168|HP:0002174|HP:0002304|HP:0002396|HP:0003677|HP:0007792|HP:0010526|HP:0100543|HP:0100710 101 | 607458 . HP:0000006|HP:0000639|HP:0001265|HP:0001272|HP:0001284|HP:0001310|HP:0001337|HP:0001761|HP:0002075|HP:0003202|HP:0003390|HP:0003487|HP:0003674|HP:0003690|HP:0007240 102 | 608029 . HP:0000007|HP:0000750|HP:0001251|HP:0001252|HP:0001257|HP:0001270|HP:0001272|HP:0001290|HP:0001310|HP:0001347|HP:0001763|HP:0002066|HP:0002080|HP:0002312|HP:0003577|HP:0003680|HP:0004322 103 | 608687 . HP:0000006|HP:0000639|HP:0001260|HP:0001618|HP:0001620|HP:0002066|HP:0002070|HP:0002174|HP:0003581|HP:0003677|HP:0007256|HP:0007338|HP:0010530 104 | 608703 . HP:0000006|HP:0000012|HP:0000317|HP:0000486|HP:0000505|HP:0000639|HP:0000763|HP:0001251|HP:0001260|HP:0001272|HP:0001761|HP:0002013|HP:0002522|HP:0002650|HP:0003380|HP:0003487|HP:0006944|HP:0007328|HP:0007663|HP:0011468 105 | 608768 . HP:0000006|HP:0000514|HP:0000639|HP:0000641|HP:0000763|HP:0001257|HP:0001260|HP:0001272|HP:0001337|HP:0002015|HP:0002062|HP:0002073|HP:0002311|HP:0007256|HP:0007772|HP:0009830 106 | 609270 . HP:0000007|HP:0000639|HP:0000651|HP:0001152|HP:0001251|HP:0001260|HP:0001272|HP:0001347|HP:0002066|HP:0002070|HP:0002174|HP:0002312|HP:0002495|HP:0003487|HP:0003621|HP:0003677|HP:0007338 107 | 609306 . HP:0000006|HP:0000639|HP:0000641|HP:0001151|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0002078|HP:0002311|HP:0003581|HP:0003677 108 | 609307 . HP:0000006|HP:0000486|HP:0000640|HP:0000641|HP:0000716|HP:0001256|HP:0001260|HP:0001272|HP:0001425|HP:0001761|HP:0002066|HP:0002070|HP:0002078|HP:0002174|HP:0002310|HP:0002346|HP:0002354|HP:0002495|HP:0003390|HP:0003677|HP:0007772 109 | 610245 . 
HP:0000006|HP:0000514|HP:0001260|HP:0001271|HP:0001272|HP:0001274|HP:0001310|HP:0001337|HP:0001347|HP:0002066|HP:0002070|HP:0002166|HP:0002529|HP:0003487|HP:0003677|HP:0007141|HP:0007305 110 | 610246 . HP:0000006|HP:0000508|HP:0000514|HP:0000597|HP:0000640|HP:0000641|HP:0001257|HP:0001260|HP:0001272|HP:0001276|HP:0001300|HP:0001332|HP:0002066|HP:0002070|HP:0002395|HP:0003487|HP:0003677 111 | 610743 . HP:0000007|HP:0000639|HP:0001260|HP:0001272|HP:0001310|HP:0002066|HP:0002070|HP:0003581|HP:0003677 112 | 613371 . HP:0000006|HP:0000640|HP:0001251|HP:0001260|HP:0001272|HP:0003581|HP:0003587|HP:0003677|HP:0007338 113 | 613728 . HP:0000007|HP:0000639|HP:0001249|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0001761|HP:0002066|HP:0002070|HP:0002078|HP:0002080|HP:0002380|HP:0007338 114 | 613908 . HP:0000006|HP:0000315|HP:0000467|HP:0000473|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0002080|HP:0002311|HP:0002355|HP:0003487|HP:0003581|HP:0003677 115 | 613909 . HP:0000006|HP:0000027|HP:0000029|HP:0000789|HP:0001251|HP:0001272|HP:0100543 116 | 614153 . HP:0000006|HP:0000365|HP:0000508|HP:0000511|HP:0000514|HP:0000639|HP:0001252|HP:0001260|HP:0001272|HP:0001276|HP:0001308|HP:0001324|HP:0001347|HP:0002015|HP:0002066|HP:0002070|HP:0002078|HP:0002311|HP:0003202|HP:0003445|HP:0003487|HP:0003676|HP:0007772|HP:0012473 117 | 614229 . HP:0000007|HP:0000639|HP:0001260|HP:0001263|HP:0001272|HP:0002015|HP:0002070|HP:0002078|HP:0003677|HP:0007772|HP:0025356 118 | 614322 . HP:0000007|HP:0000252|HP:0000546|HP:0000640|HP:0000648|HP:0001249|HP:0001250|HP:0001257|HP:0001260|HP:0001263|HP:0001265|HP:0001272|HP:0001510|HP:0002066|HP:0002070|HP:0003487 119 | 614831 . HP:0000007|HP:0000508|HP:0000565|HP:0000571|HP:0000666|HP:0001249|HP:0001250|HP:0001260|HP:0001263|HP:0001272|HP:0001290|HP:0001310|HP:0001337|HP:0001347|HP:0001763|HP:0002075|HP:0002119|HP:0003593|HP:0003677|HP:0004322|HP:0006951|HP:0007068|HP:0007256 120 | 615386 . HP:0000007|HP:0000571|HP:0000639|HP:0000750|HP:0001257|HP:0001263|HP:0001272|HP:0001310|HP:0001347|HP:0002066|HP:0002075|HP:0002080|HP:0003593|HP:0003677|HP:0008003|HP:0100543 121 | 615705 . HP:0000007|HP:0000639|HP:0001249|HP:0001250|HP:0001251|HP:0001260|HP:0001265|HP:0001270|HP:0001272|HP:0001347|HP:0002317|HP:0003676 122 | 615768 . HP:0000007|HP:0000135|HP:0000544|HP:0000639|HP:0001257|HP:0001260|HP:0001272|HP:0001321|HP:0001337|HP:0002070|HP:0002078|HP:0002317|HP:0003676|HP:0011448|HP:0100543 123 | 615945 . HP:0000006|HP:0000639|HP:0001251|HP:0001260|HP:0001272|HP:0001337|HP:0002015|HP:0002317|HP:0002359|HP:0003677 124 | 615957 . HP:0000006|HP:0000514|HP:0000639|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0003477|HP:0003677 125 | 616053 . HP:0000006|HP:0001260|HP:0001310|HP:0001347|HP:0002075|HP:0002080|HP:0002136|HP:0002313|HP:0002317|HP:0003581|HP:0003677|HP:0006879 126 | 616127 . HP:0000007|HP:0001249|HP:0001260|HP:0001263|HP:0001290|HP:0001310|HP:0001321|HP:0001337|HP:0002078|HP:0002317|HP:0003593|HP:0003677|HP:0003680 127 | 616204 . HP:0000007|HP:0000543|HP:0000565|HP:0000639|HP:0000657|HP:0001260|HP:0001263|HP:0001272|HP:0001290|HP:0001310|HP:0001347|HP:0001371|HP:0002066|HP:0002075|HP:0002078|HP:0002311|HP:0002465|HP:0003487|HP:0003593|HP:0100543 128 | 616354 . 
HP:0000007|HP:0000158|HP:0000218|HP:0000280|HP:0000283|HP:0000286|HP:0000289|HP:0000343|HP:0000407|HP:0000463|HP:0000639|HP:0000678|HP:0000684|HP:0000729|HP:0000998|HP:0001156|HP:0001250|HP:0001251|HP:0001257|HP:0001263|HP:0001265|HP:0001272|HP:0001290|HP:0001321|HP:0001344|HP:0001762|HP:0002120|HP:0002186|HP:0002540|HP:0002650|HP:0003487|HP:0003593|HP:0004482|HP:0011220|HP:0012385|HP:0012471|HP:0012745|HP:0012810|HP:0030084 129 | 616410 . HP:0000006|HP:0001251|HP:0001272|HP:0002172|HP:0002317|HP:0003581|HP:0003676 130 | 616719 . HP:0000007|HP:0001256|HP:0001257|HP:0001265|HP:0001270|HP:0001272|HP:0001337|HP:0001347|HP:0001395|HP:0001399|HP:0001744|HP:0001762|HP:0002066|HP:0002240|HP:0002359|HP:0002936|HP:0007141 131 | 616795 . HP:0000006|HP:0000012|HP:0000020|HP:0000651|HP:0000666|HP:0000716|HP:0000716|HP:0001152|HP:0001260|HP:0001272|HP:0001337|HP:0001347|HP:0002015|HP:0002064|HP:0002317|HP:0002317|HP:0002497|HP:0003487|HP:0003677|HP:0006938|HP:0007001|HP:0031166|HP:0100543 132 | 616948 . HP:0000007|HP:0000639|HP:0001249|HP:0001260|HP:0001272|HP:0001310|HP:0001347|HP:0002061|HP:0002078|HP:0002079|HP:0002080|HP:0002317|HP:0003677|HP:0007256 133 | 616949 . HP:0000007|HP:0000252|HP:0001249|HP:0001250|HP:0001251|HP:0001290|HP:0001875|HP:0002902|HP:0003388|HP:0011675|HP:0100786 134 | 617018 . HP:0000006|HP:0000571|HP:0000639|HP:0000768|HP:0001260|HP:0001265|HP:0001272|HP:0001337|HP:0001761|HP:0002063|HP:0002066|HP:0002070|HP:0002936|HP:0003581|HP:0003677|HP:0003693|HP:0009763 135 | 617133 . HP:0000007|HP:0000518|HP:0000639|HP:0001260|HP:0001272|HP:0002064|HP:0002066|HP:0002070|HP:0003676 136 | 617584 . HP:0000007|HP:0000639|HP:0001263|HP:0001310|HP:0001321|HP:0002078|HP:0003680|HP:0031936|HP:0100543 137 | 617633 . HP:0000007|HP:0000639|HP:0000657|HP:0001260|HP:0001272|HP:0001284|HP:0001310|HP:0002015|HP:0002070|HP:0002075|HP:0002317|HP:0002403|HP:0002460|HP:0002936|HP:0003676|HP:0007141|HP:0007338 138 | 617691 . HP:0000006|HP:0000750|HP:0001257|HP:0001260|HP:0001270|HP:0001272|HP:0001310|HP:0002015|HP:0002066|HP:0002075|HP:0002359|HP:0003677|HP:0007338 139 | 617769 . HP:0000006|HP:0000639|HP:0001260|HP:0001272|HP:0002066|HP:0002070|HP:0003581|HP:0003677 140 | 617770 . HP:0000006|HP:0000514|HP:0000639|HP:0001260|HP:0001272|HP:0001310|HP:0002066|HP:0002070|HP:0002403|HP:0003390|HP:0003581|HP:0003677 141 | 617931 . HP:0000006|HP:0000218|HP:0000341|HP:0000369|HP:0000431|HP:0000508|HP:0000651|HP:0000750|HP:0001182|HP:0001249|HP:0001250|HP:0001257|HP:0001260|HP:0001270|HP:0001290|HP:0001310|HP:0001999|HP:0002072|HP:0003676|HP:0003829|HP:0004322|HP:0006855|HP:0030084|HP:0200055 142 | 618087 . HP:0000006|HP:0000252|HP:0000303|HP:0000316|HP:0000486|HP:0000490|HP:0000540|HP:0000582|HP:0000657|HP:0001007|HP:0001159|HP:0001249|HP:0001257|HP:0001263|HP:0001310|HP:0001321|HP:0001332|HP:0001347|HP:0002421|HP:0002540|HP:0003196|HP:0008070|HP:0008936|HP:0030084|HP:0040080|HP:0045025 143 | 618093 . HP:0000006|HP:0000020|HP:0000739|HP:0001260|HP:0001272|HP:0002015|HP:0002066|HP:0003676 144 | 209900 . 
HP:0000007|HP:0000054|HP:0000077|HP:0000135|HP:0000137|HP:0000148|HP:0000218|HP:0000256|HP:0000365|HP:0000483|HP:0000486|HP:0000501|HP:0000510|HP:0000518|HP:0000545|HP:0000546|HP:0000556|HP:0000639|HP:0000668|HP:0000678|HP:0000750|HP:0000819|HP:0000822|HP:0001007|HP:0001080|HP:0001156|HP:0001159|HP:0001162|HP:0001249|HP:0001251|HP:0001263|HP:0001328|HP:0001395|HP:0001513|HP:0001712|HP:0001769|HP:0001773|HP:0001829|HP:0002099|HP:0002141|HP:0002167|HP:0002251|HP:0002370|HP:0002705|HP:0008734|HP:0009466|HP:0009806|HP:0012393 145 | 600151 . HP:0000007|HP:0000089|HP:0000510|HP:0001156|HP:0001249|HP:0001263|HP:0001513|HP:0003241|HP:0005180|HP:0010442 146 | 605231 . HP:0000007|HP:0000047|HP:0000107|HP:0000510|HP:0000819|HP:0001159|HP:0001249|HP:0001513|HP:0003241|HP:0010442 147 | 615981 . HP:0000007|HP:0000135|HP:0000510|HP:0000546|HP:0000819|HP:0001162|HP:0001249|HP:0001263|HP:0001513|HP:0001631|HP:0001644|HP:0001647|HP:0001830|HP:0003241 148 | 615982 . HP:0000007|HP:0000028|HP:0000107|HP:0000135|HP:0000164|HP:0000510|HP:0000546|HP:0000662|HP:0001156|HP:0001159|HP:0001249|HP:0001513|HP:0003241|HP:0010442 149 | 615983 . HP:0000007|HP:0000135|HP:0000510|HP:0001156|HP:0001159|HP:0001513|HP:0003241|HP:0007754|HP:0010442|HP:0100543 150 | 615984 . HP:0000007|HP:0000510|HP:0001249|HP:0001513|HP:0003241|HP:0010442 151 | 615985 . HP:0000007|HP:0000047|HP:0000110|HP:0000135|HP:0000248|HP:0000510|HP:0001249|HP:0001263|HP:0001513|HP:0001696|HP:0010442|HP:0100543 152 | 615986 . HP:0000007|HP:0000510|HP:0001249|HP:0001513|HP:0003828|HP:0010442 153 | 615987 . HP:0000007|HP:0000083|HP:0000107|HP:0000135|HP:0000510|HP:0001513|HP:0010442|HP:0100543 154 | 615988 . HP:0000007|HP:0000077|HP:0000135|HP:0000488|HP:0001513|HP:0010442 155 | 615989 . HP:0000007|HP:0000077|HP:0000135|HP:0000510|HP:0001513|HP:0010442|HP:0100543 156 | 615990 . HP:0000007|HP:0000510|HP:0001249|HP:0001263|HP:0001513|HP:0010442 157 | 615991 . HP:0000007|HP:0000510|HP:0001249|HP:0001263|HP:0001513 158 | 615992 . HP:0000007 159 | 615993 . HP:0000007|HP:0000083|HP:0000104|HP:0000107|HP:0000110|HP:0000135|HP:0000365|HP:0000403|HP:0000510|HP:0000546|HP:0001249|HP:0001263|HP:0001513|HP:0002098|HP:0003241|HP:0011950|HP:0100543 160 | 615994 . HP:0000007|HP:0000107|HP:0000135|HP:0000546|HP:0000548|HP:0001156|HP:0001263|HP:0001513|HP:0001696|HP:0003241|HP:0003774|HP:0100260|HP:0100543 161 | 615995 . HP:0000007|HP:0000083|HP:0000510|HP:0000518|HP:0001156|HP:0001513|HP:0100543 162 | 615996 . HP:0000007|HP:0000083|HP:0000135|HP:0000510|HP:0001249|HP:0001513|HP:0003241|HP:0004409|HP:0010442 163 | 617119 . HP:0000007|HP:0000135|HP:0000252|HP:0000510|HP:0001249|HP:0001513|HP:0010442 164 | 617406 . HP:0000007|HP:0000085|HP:0000510|HP:0000545|HP:0000548|HP:0000618|HP:0000668|HP:0000750|HP:0001105|HP:0001133|HP:0001162|HP:0001513|HP:0002910|HP:0007750|HP:0030329|HP:0030483|HP:0030631 165 | -------------------------------------------------------------------------------- /tests/data/test.score-one-patient.txt: -------------------------------------------------------------------------------- 1 | 601382 . 
HP:0000007|HP:0001270|HP:0001425|HP:0001762|HP:0002460|HP:0002650|HP:0002936|HP:0003431|HP:0003693|HP:0003693|HP:0003701|HP:0006958|HP:0007208|HP:0010628 2 | -------------------------------------------------------------------------------- /tests/data/test.score-short.txt: -------------------------------------------------------------------------------- 1 | 118200 age=9.0;sex=female HP:0001263|HP:0001251|HP:0001290|HP:0004322 2 | 118210 age=4.0 HP:0001249|HP:0001263|HP:0001290 3 | 118211 . HP:0001249|HP:0001263|HP:0001290 4 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeneDx/phenopy/3cc03ea1a60334155141f8b261c77471195b62f3/tests/fixtures/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/get_data_dictionary.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.d2p import load as load_d2p 5 | from phenopy.network import annotate 6 | from phenopy.network import load as load_network 7 | from phenopy.score import Scorer 8 | from phenopy.util import generate_alternate_ids, read_phenotype_groups 9 | 10 | 11 | @pytest.fixture() 12 | def test_data(): 13 | data = {} 14 | data["parent_dir"] = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 15 | data["obo_file"] = os.path.join(data["parent_dir"], "data/hp.obo") 16 | data["hpo_network"] = load_network(data["obo_file"]) 17 | data["alt2prim"] = generate_alternate_ids(data["hpo_network"]) 18 | data["ages_distribution_file"] = os.path.join( 19 | data["parent_dir"], "data/phenotype_age.tsv" 20 | ) 21 | 22 | data["disease_to_phenotype_file"] = os.path.join( 23 | data["parent_dir"], "data/phenotype.hpoa" 24 | ) 25 | data["disease_records"], data["phenotype_to_diseases"] = load_d2p( 26 | data["disease_to_phenotype_file"], data["hpo_network"], data["alt2prim"] 27 | ) 28 | 29 | data["num_diseases_annotated"] = len(data["disease_records"]) 30 | data["hpo_network"] = annotate( 31 | data["hpo_network"], 32 | data["phenotype_to_diseases"], 33 | data["num_diseases_annotated"], 34 | data["alt2prim"], 35 | ) 36 | 37 | data["scorer"] = Scorer(data["hpo_network"], min_score_mask=None) 38 | data["disease_to_phenotype_output_file"] = os.path.join( 39 | data["parent_dir"], "data/phenotype.noparents.hpoa" 40 | ) 41 | 42 | data["phenotype_groups"] = read_phenotype_groups() 43 | 44 | return data 45 | -------------------------------------------------------------------------------- /tests/test_ic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.network import annotate 5 | from phenopy.network import load as load_network 6 | from phenopy.util import export_phenotype_hpoa_with_no_parents 7 | 8 | 9 | def test_ic_d2p(test_data): 10 | """Calculate the information content of a phenotype""" 11 | assert ( 12 | pytest.approx(test_data["hpo_network"].nodes["HP:0010863"]["ic"], 0.01) == 7.21 13 | ) 14 | 15 | 16 | def test_ic_custom(test_data): 17 | """ 18 | Calculate the information content of a phenotype when multiple 19 | annotations are present 20 | """ 21 | custom_annotation_file = os.path.join( 22 | test_data["parent_dir"], "data/test.score-long.txt" 23 | ) 24 | hpo_network = load_network(test_data["obo_file"]) 25 | hpo_network = annotate( 26 | hpo_network, 27 | 
test_data["phenotype_to_diseases"], 28 | test_data["num_diseases_annotated"], 29 | test_data["alt2prim"], 30 | annotations_file=custom_annotation_file, 31 | ) 32 | 33 | assert pytest.approx(hpo_network.nodes["HP:0010863"]["ic"], 0.01) == 8.11 34 | 35 | 36 | def test_ic_d2p_no_parents(test_data): 37 | export_phenotype_hpoa_with_no_parents( 38 | test_data["disease_to_phenotype_file"], 39 | test_data["disease_to_phenotype_output_file"], 40 | test_data["hpo_network"], 41 | ) 42 | assert os.path.exists(test_data["disease_to_phenotype_output_file"]) 43 | os.remove(test_data["disease_to_phenotype_output_file"]) 44 | -------------------------------------------------------------------------------- /tests/test_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.d2p import load as load_d2p 5 | from phenopy.network import load as load_network 6 | from phenopy.network import annotate 7 | from phenopy.util import generate_alternate_ids 8 | 9 | 10 | def test_load_network(test_data): 11 | hpo_network = load_network(test_data["obo_file"]) 12 | assert len(hpo_network) == 16861 13 | 14 | 15 | def test_annotate_network(test_data): 16 | hpo_network = load_network(test_data["obo_file"]) 17 | alt2prim = generate_alternate_ids(hpo_network) 18 | 19 | # load phenotypes to diseases associations 20 | disease_to_phenotype_file = os.path.join( 21 | test_data["parent_dir"], "data/phenotype.hpoa" 22 | ) 23 | disease_records, phenotype_to_diseases = load_d2p( 24 | disease_to_phenotype_file, hpo_network, alt2prim 25 | ) 26 | 27 | num_diseases_annotated = len(disease_records) 28 | hpo_network = annotate( 29 | hpo_network, phenotype_to_diseases, num_diseases_annotated, alt2prim 30 | ) 31 | 32 | assert pytest.approx(hpo_network.nodes["HP:0010863"]["ic"], 0.01) == 7.21 33 | assert pytest.approx(hpo_network.nodes["HP:0001263"]["ic"], 0.01) == 1.55 34 | -------------------------------------------------------------------------------- /tests/test_score.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import os 4 | import pandas as pd 5 | import pytest 6 | 7 | from phenopy.network import annotate 8 | from phenopy.score import Scorer 9 | from phenopy.util import ( 10 | remove_parents, 11 | parse_input, 12 | half_product, 13 | ) 14 | from phenopy.weights import calculate_age_weights 15 | 16 | 17 | def test_find_lca(test_data): 18 | lca = test_data["scorer"].find_lca("HP:0001249", "HP:0012434") 19 | assert lca == "HP:0012759" 20 | 21 | root_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0000001") 22 | assert root_lca == "HP:0000001" 23 | 24 | parent_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0012758") 25 | assert parent_lca == "HP:0012759" 26 | 27 | parent_lca = test_data["scorer"].find_lca("HP:0012758", "HP:0012759") 28 | assert parent_lca == "HP:0012759" 29 | 30 | parent_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0012759") 31 | assert parent_lca == "HP:0012759" 32 | 33 | parent_lca = test_data["scorer"].find_lca("HP:0012759", "HP:0000750") 34 | assert parent_lca == "HP:0012759" 35 | 36 | 37 | def test_calculate_gamma(test_data): 38 | t1 = "HP:0012758" 39 | t2 = "HP:0012759" 40 | 41 | gamma0 = test_data["scorer"].calculate_gamma(t1, t1, t2) 42 | assert gamma0 == 0 43 | 44 | gamma1a = test_data["scorer"].calculate_gamma(t1, t2, t2) 45 | assert gamma1a == 1 46 | 47 | gamma1b = test_data["scorer"].calculate_gamma(t2, t1, t2) 48 | assert 
gamma1b == 1 49 | 50 | gamma2 = test_data["scorer"].calculate_gamma("HP:0000750", "HP:0012434", t1) 51 | assert gamma2 == 2 52 | 53 | 54 | def test_calculate_beta(test_data): 55 | t1 = "HP:0001344" 56 | t2 = "HP:0012759" 57 | beta = test_data["scorer"].calculate_beta(t1, t2) 58 | assert round(beta, 2) == 3.99 59 | 60 | 61 | def test_score_hpo_pair_hrss(test_data): 62 | t1 = "HP:0011351" 63 | t2 = "HP:0012434" 64 | 65 | score = test_data["scorer"].score_hpo_pair_hrss(t1, t2) 66 | assert round(score, 2) == 0.14 67 | 68 | score = test_data["scorer"].score_hpo_pair_hrss(t1, t2) 69 | assert round(score, 2) == 0.14 70 | 71 | score = test_data["scorer"].score_hpo_pair_hrss(t2, t1) 72 | assert round(score, 2) == 0.14 73 | 74 | 75 | def test_score(test_data): 76 | record_a = { 77 | "record_id": "sample_1", 78 | "terms": ["HP:0012433", "HP:0012434"], 79 | "weights": {}, 80 | } 81 | record_b = {"record_id": "sample_2", "terms": [], "weights": {}} 82 | 83 | score0 = test_data["scorer"].score(record_a, record_b) 84 | assert score0[2] == 0.0 85 | record_b["terms"] = ["HP:0001249", "HP:0012758"] 86 | 87 | score_bma = test_data["scorer"].score(record_a, record_b) 88 | assert round(score_bma[2], 2) == 0.09 89 | test_data["scorer"].summarization_method = "maximum" 90 | score_max = test_data["scorer"].score(record_a, record_b) 91 | assert round(score_max[2], 4) == 0.1251 92 | 93 | test_data["scorer"].summarization_method = "not_a_method" 94 | with pytest.raises(ValueError): 95 | test_data["scorer"].score(record_a, record_b) 96 | 97 | record_a.update( 98 | { 99 | "terms": [ 100 | "HP:0001251", 101 | "HP:0001263", 102 | "HP:0001290", 103 | "HP:0004322", 104 | "HP:0012433", 105 | ], 106 | "weights": {"age": [0.67, 1.0, 1.0, 0.4, 0.4]}, 107 | } 108 | ) 109 | record_b.update( 110 | { 111 | "terms": ["HP:0001249", "HP:0001263", "HP:0001290"], 112 | "weights": {"age": [1.0, 1.0, 1.0]}, 113 | } 114 | ) 115 | 116 | test_data["scorer"].summarization_method = "BMWA" 117 | test_data["scorer"].min_score_mask = 0.05 118 | score_bmwa = test_data["scorer"].score(record_a, record_b) 119 | assert round(score_bmwa[2], 4) == 0.1822 120 | 121 | record_a.update( 122 | { 123 | "terms": ["HP:0001251", "HP:0001263", "HP:0001290", "HP:0004322"], 124 | "weights": {"age": [0.67, 1.0, 1.0, 0.4]}, 125 | } 126 | ) 127 | record_b.update( 128 | { 129 | "terms": ["HP:0001263", "HP:0001249", "HP:0001290"], 130 | "weights": {"age": [1.0, 1.0, 0.5]}, 131 | } 132 | ) 133 | 134 | scorer = test_data["scorer"] 135 | scorer.summarization_method = "BMWA" 136 | 137 | score_bwma_both_weights = scorer.score(record_a, record_b) 138 | assert round(score_bwma_both_weights[2], 4) == 0.1918 139 | 140 | scorer.min_score_mask = None 141 | record_a["weights"].pop("age", None) 142 | score_bwma_one_weights = scorer.score(record_a, record_b) 143 | assert round(score_bwma_one_weights[2], 4) == 0.155 144 | 145 | 146 | def test_score_records(test_data): 147 | query_name = "SAMPLE" 148 | query_terms = [ 149 | "HP:0000750", 150 | "HP:0010863", 151 | ] 152 | input_records = [{"record_id": query_name, "terms": query_terms, "weights": {}}] 153 | score_records = test_data["disease_records"] 154 | 155 | results = test_data["scorer"].score_records( 156 | input_records, 157 | score_records, 158 | itertools.product(range(len(input_records)), range(len(score_records))), 159 | threads=1, 160 | ) 161 | assert len(results) == 8118 162 | assert round(float(results[0][2]), 2) == 0.04 163 | 164 | [record["weights"].pop("disease_frequency") for record in score_records] 165 | results = 
test_data["scorer"].score_records( 166 | input_records, 167 | score_records, 168 | itertools.product(range(len(input_records)), range(len(score_records))), 169 | threads=1, 170 | ) 171 | assert len(results) == 8118 172 | 173 | 174 | def test_no_parents(test_data): 175 | terms_a = ["HP:0012433", "HP:0000708"] 176 | terms_b = ["HP:0001249", "HP:0012758"] 177 | 178 | assert ( 179 | list(remove_parents(terms_a, test_data["scorer"].hpo_network))[0] 180 | == "HP:0012433" 181 | ) 182 | assert len(remove_parents(terms_b, test_data["scorer"].hpo_network)) == 2 183 | 184 | 185 | def test_score_self(test_data): 186 | records = parse_input( 187 | os.path.join(test_data["parent_dir"], "data/test.score-long.txt"), 188 | test_data["hpo_network"], 189 | test_data["alt2prim"], 190 | ) 191 | 192 | input_records = [x for x in records if x["record_id"] in ["213200", "302801"]] 193 | 194 | results = test_data["scorer"].score_records( 195 | input_records, 196 | input_records, 197 | half_product(len(input_records), len(input_records)), 198 | ) 199 | assert len(results) == 3 200 | 201 | assert round(float(results[1][2]), 2) == 0.1 202 | 203 | 204 | def test_bmwa(test_data): 205 | terms_a = ["HP:0001251", "HP:0001263", "HP:0001290", "HP:0004322"] 206 | 207 | terms_b = ["HP:0001263", "HP:0001249", "HP:0001290"] 208 | weights_a = {"age": [0.67, 1.0, 1.0, 0.4]} 209 | weights_b = {"age": [1.0, 1.0, 1.0]} 210 | 211 | df = pd.DataFrame( 212 | [ 213 | [4.22595743e-02, 3.92122308e-02, 3.04851573e-04], 214 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 215 | [3.69780479e-04, 3.78305515e-04, 4.64651944e-01], 216 | [4.17139800e-04, 4.12232546e-04, 3.67984322e-04], 217 | ], 218 | index=pd.Index(terms_a, name="a"), 219 | columns=pd.MultiIndex.from_arrays( 220 | [["score"] * len(terms_b), terms_b], names=[None, "b"] 221 | ), 222 | ) 223 | 224 | score_bmwa = test_data["scorer"].best_match_weighted_average( 225 | df, weights_a, weights_b 226 | ) 227 | 228 | assert round(score_bmwa, 4) == 0.3419 229 | 230 | weights_a = {"age": [1.0] * len(terms_a)} 231 | score_bmwa = test_data["scorer"].best_match_weighted_average( 232 | df, weights_a, weights_b 233 | ) 234 | assert round(score_bmwa, 4) == 0.2985 235 | 236 | weights_a = {"age": [1.0] * len(terms_a)} 237 | weights_b = {"age": [1.0] * len(terms_b)} 238 | test_data["scorer"].min_score_mask = None 239 | score_bmwa = test_data["scorer"].best_match_weighted_average( 240 | df, weights_a, weights_b 241 | ) 242 | assert round(score_bmwa, 4) == 0.2985 243 | 244 | terms_a = ["HP:0001251", "HP:0001249", "HP:0001263", "HP:0001290", "HP:0004322"] 245 | terms_b = ["HP:0001263", "HP:0001249", "HP:0001290"] 246 | 247 | df = pd.DataFrame( 248 | [ 249 | [4.22595743e-02, 3.92122308e-02, 3.04851573e-04], 250 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 251 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 252 | [3.69780479e-04, 3.78305515e-04, 4.64651944e-01], 253 | [4.17139800e-04, 4.12232546e-04, 3.67984322e-04], 254 | ], 255 | index=pd.Index(terms_a, name="a"), 256 | columns=pd.MultiIndex.from_arrays( 257 | [["score"] * len(terms_b), terms_b], names=[None, "b"] 258 | ), 259 | ) 260 | 261 | weights_a = {"age": [0.67, 0.4, 1.0, 1.0, 0.4]} 262 | weights_b = {"age": [1.0, 1.0, 1.0]} 263 | 264 | # compute pairwise best match weighted average 265 | test_data["scorer"].min_score_mask = None 266 | score_bmwa = test_data["scorer"].best_match_weighted_average( 267 | df, weights_a, weights_b 268 | ) 269 | 270 | assert round(score_bmwa, 3) == 0.352 271 | 272 | # because both patients were 
described to have ID, but only patient a 273 | # has ataxia, we mask good phenotype matches so they are not weighted 274 | # down; we therefore expect a better similarity score 275 | test_data["scorer"].min_score_mask = 0.05 276 | score_bmwa = test_data["scorer"].best_match_weighted_average( 277 | df, weights_a, weights_b 278 | ) 279 | 280 | assert round(score_bmwa, 3) == 0.365 281 | 282 | 283 | def test_age_weight(test_data): 284 | # Test age-based weight distribution and best_match_weighted_average calculation 285 | 286 | terms_a = [ 287 | "HP:0001251", 288 | "HP:0001263", 289 | "HP:0001290", 290 | "HP:0004322", 291 | ] # ATAX, DD, HYP, SS 292 | terms_b = ["HP:0001263", "HP:0001249", "HP:0001290"] # DD, ID, HYP 293 | 294 | test_data["hpo_network"] = annotate( 295 | test_data["hpo_network"], 296 | test_data["phenotype_to_diseases"], 297 | test_data["num_diseases_annotated"], 298 | test_data["alt2prim"], 299 | ages_distribution_file=test_data["ages_distribution_file"], 300 | ) 301 | 302 | age_a = 9.0 303 | age_b = 4.0 304 | 305 | # calculate weights based on each patient's age 306 | weights_a = {"age": calculate_age_weights(terms_a, age_b, test_data["hpo_network"])} 307 | weights_b = {"age": calculate_age_weights(terms_b, age_a, test_data["hpo_network"])} 308 | 309 | # make pairwise scores matrix 310 | df = pd.DataFrame( 311 | [ 312 | [4.22595743e-02, 3.92122308e-02, 3.04851573e-04], 313 | [1.07473687e-01, 5.05101655e-01, 3.78305515e-04], 314 | [3.69780479e-04, 3.78305515e-04, 4.64651944e-01], 315 | [4.17139800e-04, 4.12232546e-04, 3.67984322e-04], 316 | ], 317 | index=pd.Index(terms_a, name="a"), 318 | columns=pd.MultiIndex.from_arrays( 319 | [["score"] * len(terms_b), terms_b], names=[None, "b"] 320 | ), 321 | ) 322 | # compute pairwise best match weighted average 323 | score_bmwa = test_data["scorer"].best_match_weighted_average( 324 | df, weights_a, weights_b 325 | ) 326 | 327 | assert pytest.approx(float(score_bmwa), 0.01) == 0.3742 328 | 329 | # set all weights to 1.0, result should be the same as BMA without weights 330 | weights_a = {"disease_frequency": [1.0] * len(terms_a)} 331 | weights_b = {"disease_frequency": [1.0] * len(terms_b)} 332 | score_bmwa = test_data["scorer"].best_match_weighted_average( 333 | df, weights_a, weights_b 334 | ) 335 | 336 | assert pytest.approx(float(score_bmwa), 0.01) == 0.2985 337 | 338 | # test a term not in the network 339 | terms_a = ["HP:Not_a_term"] 340 | weights_a = calculate_age_weights(terms_a, age_b, test_data["hpo_network"]) 341 | assert weights_a == [1.0] 342 | 343 | # a term in the network, but with no age data 344 | terms_a = ["HP:0000001"] 345 | weights_a = calculate_age_weights(terms_a, age_b, test_data["hpo_network"]) 346 | assert weights_a == [1.0] 347 | 348 | 349 | def test_score_pairs_age(test_data): 350 | # Test reading in a records file and calculating pairwise scores 351 | # read in records 352 | test_data["hpo_network"] = annotate( 353 | test_data["hpo_network"], 354 | test_data["phenotype_to_diseases"], 355 | test_data["num_diseases_annotated"], 356 | test_data["alt2prim"], 357 | ages_distribution_file=test_data["ages_distribution_file"], 358 | ) 359 | 360 | records = parse_input( 361 | os.path.join(test_data["parent_dir"], "data/test.score-short.txt"), 362 | test_data["hpo_network"], 363 | test_data["alt2prim"], 364 | ) 365 | 366 | # create an instance of the Scorer class 367 | scorer = Scorer( 368 | test_data["hpo_network"], summarization_method="BMWA", min_score_mask=None 369 | ) 370 | 371 | # select which patients to test in pairwise 
best_match_weighted_average 372 | input_records = [x for x in records if x["record_id"] in ["118200", "118210"]] 373 | 374 | results = scorer.score_records( 375 | input_records, 376 | input_records, 377 | [ 378 | (0, 1), 379 | ], 380 | ) 381 | assert len(results) == 1 382 | 383 | # the expected answer 384 | answer = np.average( 385 | [0.017, 0.231, 0.325, 0.0, 0.042, 0.231, 0.325], 386 | weights=[0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0], 387 | ) 388 | 389 | assert pytest.approx(float(results[0][2]), 0.01) == answer 390 | 391 | # Test identical records for which one age exists and one doesn't 392 | input_records = [x for x in records if x["record_id"] in ["118210", "118211"]] 393 | 394 | results = scorer.score_records( 395 | input_records, 396 | input_records, 397 | [ 398 | (0, 1), 399 | ], 400 | ) 401 | assert len(results) == 1 402 | 403 | # the expected answer 404 | answer = np.average([0.226, 0.231, 0.325], weights=[0.481, 1.0, 1.0]) 405 | 406 | assert pytest.approx(float(results[0][2]), 0.1) == answer 407 | 408 | 409 | def test_alpha_zero(test_data): 410 | """the root term should contain all diseases, therefore its IC should be zero""" 411 | 412 | root_term_ic = test_data["hpo_network"].nodes["HP:0000118"]["ic"] 413 | assert root_term_ic == 0.0 414 | 415 | 416 | def test_leaves_diff_branches_score_zero(test_data): 417 | """two leaves in different branches: 418 | both are leaves, therefore beta is zero; 419 | different branches, therefore alpha is zero; 420 | define I = (0.0 / (0.0 + 0.0)) as zero and not nan""" 421 | term_a = "HP:0001290" 422 | term_b = "HP:0011351" 423 | 424 | score_two_leaves_diff_branches = test_data["scorer"].score_hpo_pair_hrss( 425 | term_a, term_b 426 | ) 427 | assert score_two_leaves_diff_branches == 0.0 428 | 429 | 430 | def test_score_hrss_basic(test_data): 431 | test_data["scorer"].scoring_method = "HRSS" 432 | terms_a = ["HP:0001290", "HP:0000118"] 433 | terms_b = ["HP:0001290", "HP:0011351"] 434 | 435 | assert pytest.approx(0.162, 0.01) == test_data["scorer"].score_term_sets_basic( 436 | terms_a, terms_b 437 | ) 438 | 439 | 440 | def test_score_resnik_basic(test_data): 441 | test_data["scorer"].scoring_method = "Resnik" 442 | terms_a = ["HP:0001290", "HP:0000118"] 443 | terms_b = ["HP:0001290", "HP:0011351"] 444 | assert pytest.approx(1.283, 0.01) == test_data["scorer"].score_term_sets_basic( 445 | terms_a, terms_b 446 | ) 447 | 448 | 449 | def test_score_jaccard_basic(test_data): 450 | test_data["scorer"].scoring_method = "Jaccard" 451 | terms_a = ["HP:0001290", "HP:0000118"] 452 | terms_b = ["HP:0001290", "HP:0011351"] 453 | 454 | assert pytest.approx(0.333, 0.01) == test_data["scorer"].score_term_sets_basic( 455 | terms_a, terms_b 456 | ) 457 | 458 | 459 | def test_score_word2vec_basic(test_data): 460 | test_data["scorer"] = Scorer(test_data["hpo_network"], scoring_method="word2vec") 461 | terms_a = ["HP:0001290", "HP:0000118"] 462 | terms_b = ["HP:0001290", "HP:0011351"] 463 | 464 | assert pytest.approx( 465 | test_data["scorer"].score_term_sets_basic(terms_a, terms_b), 0.01 466 | ) == pytest.approx(0.156, 0.01) 467 | 468 | 469 | def test_score_word2vec_out_of_vocab(test_data): 470 | test_data["scorer"] = Scorer(test_data["hpo_network"], scoring_method="word2vec") 471 | terms_a = ["HP:NOT_A_TERM", "HP:0000118"] 472 | terms_b = ["HP:0001290", "NOT_A_TERM"] 473 | 474 | assert pytest.approx( 475 | test_data["scorer"].score_term_sets_basic(terms_a, terms_b), 0.01 476 | ) == pytest.approx(0.063, 0.01) 477 | 478 | 479 | def test_score_word2vec_empty(test_data): 
test_data["scorer"] = Scorer(test_data["hpo_network"], scoring_method="word2vec") 481 | terms_a = [] 482 | terms_b = ["HP:0001290", "HP:0011351"] 483 | 484 | assert test_data["scorer"].score_term_sets_basic(terms_a, terms_b) == 0.0 485 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from phenopy.util import parse, read_records_file, encode_phenotypes, parse_input 5 | 6 | 7 | def test_parse(test_data): 8 | string = "age=13;sex=Male" 9 | assert parse(string, what="sex") == "Male" 10 | assert parse(string, what="age") == 13.0 11 | 12 | string = "age=13.64;sex=male" 13 | assert parse(string, what="sex") == "Male" 14 | assert parse(string, what="age") == 13.6 15 | 16 | string = "age=12.9;sex=female" 17 | assert parse(string, what="sex") == "Female" 18 | assert parse(string, what="age") == 12.9 19 | 20 | string = "sex=Female" 21 | assert parse(string, what="sex") == "Female" 22 | 23 | string = "sex=FEMALE" 24 | assert parse(string, what="sex") == "Female" 25 | 26 | string = "sex=F" 27 | assert parse(string, what="sex") == "Female" 28 | 29 | string = "age=1" 30 | assert parse(string, what="age") == 1.0 31 | 32 | string = "." 33 | assert not parse(string, what="age") 34 | 35 | string = ". " 36 | assert not parse(string, what="age") 37 | 38 | string = " . " 39 | assert not parse(string, what="age") 40 | 41 | string = "13?" 42 | assert not parse(string, what="age") 43 | 44 | string = "sex=NA" 45 | assert not parse(string, what="sex") 46 | 47 | string = "sex=Unknown" 48 | assert not parse(string, what="sex") 49 | 50 | 51 | def test_encode_phenotypes_file(test_data): 52 | input_file = os.path.join(test_data["parent_dir"], "data/test.score-short.txt") 53 | records = parse_input(input_file, test_data["hpo_network"], test_data["alt2prim"]) 54 | encoded_phenotypes = encode_phenotypes( 55 | [record["terms"] for record in records], 56 | test_data["phenotype_groups"], 57 | test_data["hpo_network"], 58 | test_data["alt2prim"], 59 | ) 60 | assert sum(encoded_phenotypes[0]) == 4 61 | 62 | 63 | def test_encode_1d_phenotypes(test_data): 64 | phenotypes = ["HP:0012759", "HP:0003011", "HP:0011442"] 65 | encoded_phenotypes = encode_phenotypes( 66 | phenotypes, 67 | test_data["phenotype_groups"], 68 | test_data["hpo_network"], 69 | test_data["alt2prim"], 70 | k=1000, 71 | ) 72 | assert sum(encoded_phenotypes) == 3 73 | 74 | 75 | def test_encode_2d_phenotypes(test_data): 76 | phenotypes = [ 77 | ["HP:0012759", "HP:0003011", "HP:0011442"], 78 | ["HP:0012759", "HP:0003011"], 79 | ] 80 | encoded_phenotypes = encode_phenotypes( 81 | phenotypes, 82 | test_data["phenotype_groups"], 83 | test_data["hpo_network"], 84 | test_data["alt2prim"], 85 | k=1000, 86 | ) 87 | assert sum(encoded_phenotypes[1]) == 2 88 | 89 | 90 | def test_read_records_file(test_data): 91 | with pytest.raises(SystemExit) as se: 92 | read_records_file("notafilepath/notafile") 93 | 94 | assert se.type == SystemExit 95 | assert se.value.code == 1 96 | 97 | records_truth = [ 98 | { 99 | "sample": "118200", 100 | "age": 9.0, 101 | "gender": "Female", 102 | "terms": "HP:0001263|HP:0001251|HP:0001290|HP:0004322".split("|"), 103 | }, 104 | { 105 | "sample": "118210", 106 | "age": 4.0, 107 | "gender": None, 108 | "terms": "HP:0001249|HP:0001263|HP:0001290".split("|"), 109 | }, 110 | { 111 | "sample": "118211", 112 | "age": None, 113 | "gender": None, 114 | "terms": 
"HP:0001249|HP:0001263|HP:0001290".split("|"), 115 | }, 116 | ] 117 | records_path = os.path.join( 118 | os.path.dirname(os.path.realpath(__file__)), "data/test.score-short.txt" 119 | ) 120 | records = read_records_file(records_path, no_parents=False) 121 | assert records == records_truth 122 | -------------------------------------------------------------------------------- /tests/test_weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import pandas as pd 4 | from phenopy.config import logger 5 | from phenopy.weights import ( 6 | get_truncated_normal, 7 | hpo_age_to_weight, 8 | make_age_distributions, 9 | get_empirical_cdf, 10 | ) 11 | 12 | 13 | def test_age_weights(test_data): 14 | assert hpo_age_to_weight(test_data["hpo_network"], "HP:0001251", 9.0) == 1.0 15 | assert ( 16 | pytest.approx( 17 | hpo_age_to_weight(test_data["hpo_network"], "HP:0001251", 5.0), 0.01 18 | ) 19 | == 1.0 20 | ) 21 | 22 | 23 | def test_make_age_distributions(test_data): 24 | with pytest.raises(SystemExit) as se: 25 | make_age_distributions("notafilepath/notafile") 26 | 27 | assert se.type == SystemExit 28 | assert se.value.code == 1 29 | 30 | with pytest.raises(SystemExit) as se: 31 | make_age_distributions("notafilepath/notafile", logger=logger) 32 | 33 | assert se.type == SystemExit 34 | assert se.value.code == 1 35 | 36 | ages_truth = pd.DataFrame( 37 | [ 38 | { 39 | "hpid": "HP:0001251", 40 | "age_dist": get_truncated_normal(6.0, 3.0, 0.0, 6.0), 41 | }, 42 | { 43 | "hpid": "HP:0001263", 44 | "age_dist": get_truncated_normal(1.0, 1.0, 0.0, 1.0), 45 | }, 46 | { 47 | "hpid": "HP:0001290", 48 | "age_dist": get_truncated_normal(1.0, 1.0, 0.0, 1.0), 49 | }, 50 | { 51 | "hpid": "HP:0004322", 52 | "age_dist": get_truncated_normal(10.0, 3.0, 0.0, 10.0), 53 | }, 54 | { 55 | "hpid": "HP:0001249", 56 | "age_dist": get_truncated_normal(6.0, 3.0, 0.0, 6.0), 57 | }, 58 | ] 59 | ).set_index("hpid") 60 | 61 | phenotype_ages_file = os.path.join( 62 | os.path.dirname(os.path.realpath(__file__)), "data/phenotype_age.tsv" 63 | ) 64 | df = make_age_distributions(phenotype_ages_file) 65 | assert set(ages_truth.index) == set(df.index) 66 | 67 | for hpid in ages_truth.index: 68 | assert pytest.approx( 69 | ages_truth.loc[hpid]["age_dist"].mean(), 0.1 70 | ) == pytest.approx(df.loc[hpid]["age_dist"].mean(), 0.1) 71 | 72 | 73 | def test_get_truncated_normal(test_data): 74 | distribution = get_truncated_normal(mean=6.0, sd=1.0, lower=0.0, upper=6.0) 75 | 76 | assert pytest.approx(distribution.mean(), 0.01) == 5.20 77 | assert pytest.approx(get_empirical_cdf(3, distribution), 0.1) == 0.0027 78 | assert pytest.approx(get_empirical_cdf(12, distribution), 0.01) == 1.0 79 | --------------------------------------------------------------------------------