├── .binder
│   └── environment.yml
├── .github
│   └── workflows
│       └── draft_pdf.yml
├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── paper
│   ├── figure1.png
│   ├── paper.bib
│   └── paper.md
├── pynm
│   ├── __init__.py
│   ├── cli.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── approx.py
│   │   ├── centiles.py
│   │   ├── gamlss.py
│   │   └── loess.py
│   ├── pynm.py
│   └── util.py
├── pynm_logo.png
├── pynm_models.png
├── requirements.txt
├── setup.py
├── test
│   ├── __init__.py
│   └── test_pynm.py
└── tutorials
    ├── 0-why_normative_modeling.ipynb
    ├── 1-getting_started.ipynb
    ├── 2-multivariate_confounds.ipynb
    ├── 3-big_data.ipynb
    ├── 4-complex_data.ipynb
    ├── 5-model_selection.ipynb
    ├── 6-downstream_analyses.ipynb
    ├── image1.jpg
    ├── image2.jpg
    ├── image3.jpg
    ├── image4.jpg
    └── image5.jpg
/.binder/environment.yml:
--------------------------------------------------------------------------------
1 | name: python3.9
2 |
3 | channels:
4 | - conda-forge
5 |
6 | dependencies:
7 | - python=3.9
8 | - r-base=4.2
9 | - r-tidyverse
10 | - r-gamlss
11 | - r-gamlss.dist
12 | - r-gamlss.data
13 | - pip:
14 | - pynm
15 |
--------------------------------------------------------------------------------
/.github/workflows/draft_pdf.yml:
--------------------------------------------------------------------------------
1 | on: [push]
2 |
3 | jobs:
4 | paper:
5 | runs-on: ubuntu-latest
6 | name: Paper Draft
7 | steps:
8 | - name: Checkout
9 | uses: actions/checkout@v2
10 | - name: Build draft PDF
11 | uses: openjournals/openjournals-draft-action@master
12 | with:
13 | journal: joss
14 | # This should be the path to the paper within your repo.
15 | paper-path: paper/paper.md
16 | - name: Upload
17 | uses: actions/upload-artifact@v1
18 | with:
19 | name: paper
20 | # This is the output path where Pandoc will write the compiled
21 | # PDF. Note, this should be the same directory as the input
22 | # paper.md
23 | path: paper/paper.pdf
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | #vscode
107 | .vscode/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 | - "3.7"
5 | before_install:
6 | - sudo apt-get update
7 | # command to install dependencies
8 | install:
9 | # - pip install -r requirements.txt #(should work from install_requires)
10 | - pip install .
11 | - pip install pytest
12 | # command to run tests
13 | script:
14 | - pytest test/test_pynm.py::TestBasic
15 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to PyNM
2 | Your input is very welcome! Please do not hesitate to contribute to this project, whether it's:
3 |
4 | - Reporting a bug
5 | - Discussing the current state of the code
6 | - Submitting a fix
7 | - Proposing new features
8 | - Becoming a maintainer
9 |
10 | ## We Develop with GitHub
11 | We use GitHub to host code, track issues and feature requests, and accept pull requests.
12 |
13 | ## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
14 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests:
15 |
16 | 1. Fork the repo and create your branch from `main`.
17 | 2. If you've added code that should be tested, add tests.
18 | 3. If you've changed APIs, update the documentation.
19 | 4. Ensure the test suite passes (Travis).
20 | 5. Make sure your code lints (PEP8).
21 | 6. Issue the pull request!
22 |
23 | ## Any contributions you make will be under the 3-clause BSD License
24 | In short, when you submit code changes, your submissions are understood to be under the same [3-clause BSD License](https://opensource.org/licenses/BSD-3-Clause) that covers the project. Feel free to contact the maintainers if that's a concern.
25 |
26 | ## Report bugs using GitHub's [issues](https://github.com/ppsp-team/PyNM/issues)
27 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/ppsp-team/PyNM/issues/new/choose); it's that easy!
28 |
29 | ## Write bug reports with detail, background, and sample code
30 |
31 | **Great Bug Reports** tend to have:
32 |
33 | - A quick summary and/or background
34 | - Specific steps to reproduce, with sample code if you can.
35 | - What you expected would happen
36 | - What actually happens
37 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
38 |
39 | Thanks in advance for taking the time to write **Great** Bug Reports!
40 |
41 | ## Use a Consistent Coding Style
42 |
43 | * 4 spaces for indentation rather than tabs
44 | * [PEP 8 Style Guide for Python Code](https://pep8.org/)
45 | * [Numpydoc](https://numpydoc.readthedocs.io/en/latest/) for docstrings (see the example below)
46 |
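A minimal sketch of a Numpydoc-style docstring (the function itself is purely illustrative):

```python
def zscore(x, mean, std):
    """Compute a standard score.

    Parameters
    ----------
    x : float
        Observed value.
    mean : float
        Mean of the reference distribution.
    std : float
        Standard deviation of the reference distribution.

    Returns
    -------
    float
        The standardized deviation of x from the mean.
    """
    return (x - mean) / std
```
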
47 | ## License
48 | By contributing, you agree that your contributions will be licensed under the project's [3-clause BSD License](https://opensource.org/licenses/BSD-3-Clause).
49 |
50 | ## References
51 | This document was adapted from [this open-source contribution guidelines template](https://gist.github.com/briandk/3d2e8b3ec8daf5a27a62).
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Guillaume Dumas
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | [](https://pypi.org/project/pynm/) [](https://opensource.org/licenses/BSD-3-Clause)
4 |
5 | PyNM is a lightweight Python implementation of Normative Modeling, making it approachable and easy to adopt. The package provides:
6 |
7 | - Python API and a command-line interface for wide accessibility
8 | - Automatic dataset splitting and cross-validation
9 | - Five models from various back-ends in a unified interface that cover a broad range of common use cases
10 | - Centiles
11 | - LOESS
12 | - Gaussian Process (GP)
13 | - Stochastic Variational Gaussian Process (SVGP)
14 | - Generalized Additive Models of Location Shape and Scale (GAMLSS)
15 | - Solutions for very large datasets and heteroskedastic data
16 | - Integrated plotting and evaluation functions to quickly check the validity of the model fit and results
17 | - Comprehensive and interactive tutorials
18 |
19 | The tutorials can be accessed without any local installation via binder: [](https://mybinder.org/v2/gh/ppsp-team/PyNM/HEAD)
20 |
21 | For a more advanced implementation, see the Python library [PCNtoolkit](https://github.com/amarquand/PCNtoolkit).
22 |
23 | ## Installation
24 | **Note**: a functional installation requires Python 3.9
25 |
26 | **Minimal Installation (without R)**
27 |
28 | If you aren't using the GAMLSS model, you don't need to install R:
29 |
30 | ```bash
31 | $ pip install pynm
32 | ```
33 |
34 | **Installation with R**
35 |
36 | If you are using the GAMLSS model:
37 | - You must first have R (v4.2.2) installed, along with the R packages:
38 | - gamlss
39 | - gamlss.dist
40 | - gamlss.data
41 |
42 | Instructions for installing R can be found at [r-project](https://www.r-project.org/). Once R and the `gamlss` packages are installed, install pynm:
43 | ```bash
44 | $ pip install pynm
45 | ```
46 | **Bleeding-edge Installation**
47 |
48 | If you want to be up to date with the most recent (not necessarily stable) changes to PyNM, replace `pip install pynm` in the options above with:
49 | ```bash
50 | $ git clone https://github.com/ppsp-team/PyNM.git
51 | $ cd PyNM
52 | $ pip install .
53 | ```
54 |
55 | ## Command Line Usage
56 | ```
57 | usage: pynm [-h] --pheno_p PHENO_P --out_p OUT_P --confounds CONFOUNDS --score
58 | SCORE --group GROUP [--train_sample TRAIN_SAMPLE] [--LOESS]
59 | [--centiles] [--bin_spacing BIN_SPACING] [--bin_width BIN_WIDTH]
60 | [--GP] [--gp_method GP_METHOD] [--gp_num_epochs GP_NUM_EPOCHS]
61 | [--gp_n_inducing GP_N_INDUCING] [--gp_batch_size GP_BATCH_SIZE]
62 | [--gp_length_scale GP_LENGTH_SCALE]
63 | [--gp_length_scale_bounds [GP_LENGTH_SCALE_BOUNDS [GP_LENGTH_SCALE_BOUNDS ...]]]
64 | [--gp_nu NU] [--GAMLSS] [--gamlss_mu GAMLSS_MU]
65 | [--gamlss_sigma GAMLSS_SIGMA] [--gamlss_nu GAMLSS_NU]
66 | [--gamlss_tau GAMLSS_TAU] [--gamlss_family GAMLSS_FAMILY]
67 |
68 | optional arguments:
69 | -h, --help show this help message and exit
70 | --pheno_p PHENO_P Path to phenotype data. Data must be in a .csv file.
71 | --out_p OUT_P Path to output directory.
72 | --confounds CONFOUNDS
73 | List of confounds to use in the GP model. The list must
74 | be formatted as a string with commas between confounds,
75 | each confound must be a column name from the phenotype
76 | .csv file. For GP model all confounds will be used,
77 | for LOESS and Centiles models only the first is used.
78 | For GAMLSS all confounds are used unless formulas are
79 | specified. Categorical values must be denoted by
80 | c(var) ('c' must be lower case), e.g. 'c(SEX)' for
81 | column name 'SEX'.
82 | --score SCORE Response variable for all models. Must be a column
83 | title from phenotype .csv file.
84 | --group GROUP Column name from the phenotype .csv file that
85 | distinguishes probands from controls. The column must
86 | be encoded with str labels using 'PROB' for probands
87 | and 'CTR' for controls or with int labels using 1 for
88 | probands and 0 for controls.
89 | --train_sample TRAIN_SAMPLE
90 | Which method to use for a training sample, can be a
91 | float in (0,1] for a percentage of controls or
92 | 'manual' to be manually set using a column of the
93 | DataFrame labelled 'train_sample'.
94 | --LOESS Flag to run LOESS model.
95 | --centiles Flag to run Centiles model.
96 | --bin_spacing BIN_SPACING
97 | Distance between bins for LOESS & centiles models.
98 | --bin_width BIN_WIDTH
99 | Width of bins for LOESS & centiles models.
100 | --GP Flag to run Gaussian Process model.
101 | --gp_method GP_METHOD
102 | Method to use for the GP model. Can be set to
103 | 'auto','approx' or 'exact'. In 'auto' mode, the exact
104 | model will be used for datasets smaller than 2000 data
105 | points. SVGP is used for the approximate model. See
106 | documentation for details. Default value is 'auto'.
107 | --gp_num_epochs GP_NUM_EPOCHS
108 | Number of training epochs for SVGP model. See
109 | documentation for details. Default value is 20.
110 | --gp_n_inducing GP_N_INDUCING
111 | Number of inducing points for SVGP model. See
112 | documentation for details. Default value is 500.
113 | --gp_batch_size GP_BATCH_SIZE
114 | Batch size for training and predicting from SVGP
115 | model. See documentation for details. Default value is
116 | 256.
117 | --gp_length_scale GP_LENGTH_SCALE
118 | Length scale of Matern kernel for exact model. See
119 | documentation for details. Default value is 1.
120 | --gp_length_scale_bounds [GP_LENGTH_SCALE_BOUNDS [GP_LENGTH_SCALE_BOUNDS ...]]
121 | The lower and upper bound on length_scale. If set to
122 | 'fixed', length_scale cannot be changed during
123 | hyperparameter tuning. See documentation for details.
124 | Default value is (1e-5,1e5).
125 | --gp_nu NU Nu of Matern kernel for exact and SVGP model. See
126 | documentation for details. Default value is 2.5.
127 | --GAMLSS Flag to run GAMLSS.
128 | --gamlss_mu GAMLSS_MU
129 | Formula for mu (location) parameter of GAMLSS. Default
130 | formula for score is sum of confounds with non-
131 | categorical columns as smooth functions, e.g. 'score ~
132 | ps(age) + sex'.
133 | --gamlss_sigma GAMLSS_SIGMA
134 | Formula for sigma (scale) parameter of GAMLSS. Default
135 | formula is '~ 1'.
136 | --gamlss_nu GAMLSS_NU
137 | Formula for nu (skewness) parameter of GAMLSS. Default
138 | formula is '~ 1'.
139 | --gamlss_tau GAMLSS_TAU
140 | Formula for tau (kurtosis) parameter of GAMLSS. Default
141 | formula is '~ 1'.
142 | --gamlss_family GAMLSS_FAMILY
143 | Family of distributions to use for fitting, default is
144 | 'SHASHo2'. See R documentation for GAMLSS package for
145 | other available families of distributions.
146 | ```
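For example, a minimal invocation fitting the Centiles and LOESS models might look like this (the file paths and column names are placeholders):

```bash
$ pynm --pheno_p data.csv --out_p results/ \
       --confounds "age,c(SEX)" --score score --group group \
       --centiles --LOESS
```
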
147 | ## API Example
148 | ```python
149 | import pandas as pd
150 | from pynm.pynm import PyNM
151 | # Load data
152 | df = pd.read_csv('data.csv')
153 |
154 | # Initialize PyNM with data and confounds
155 | m = PyNM(df, 'score', 'group', confounds=['age', 'c(sex)', 'c(site)'])
156 |
157 | # Run models
158 | m.loess_normative_model()
159 | m.centiles_normative_model()
160 | m.gp_normative_model()
161 | m.gamlss_normative_model()
162 |
163 | # Collect output
164 | data = m.data
165 | ```
166 |
167 | ## Documentation
168 |
169 | All the functions have classical Python docstrings that you can summon with ```help()```. You can also see the [tutorials](https://github.com/ppsp-team/PyNM/tree/master/tutorials) for documented examples.
170 |
171 | ### Training sample
172 | By default, the models are fit on all the controls in the dataset and prediction is then done on the entire dataset. The residuals (scores of the normative model) are then calculated as the difference between the actual value and predicted value for each subject. This paradigm is not meant for situations in which the residuals will then be used in a prediction setting, since any train/test split stratified by proband/control will have information from the training set leaked into the test data.
173 |
174 | To avoid contaminating the test set in a prediction setting, it is important to fit the normative model on a subset of the controls and then leave that subset out of any downstream analysis. This is implemented in PyNM with the `--train_sample` flag, which can be set to:
175 | 1. A number in (0,1]
176 | - This is the simplest usage: the number gives the proportion of controls to use, and PyNM selects a random sample of that size to use as the training group. The default value is 1, which uses the full set of controls.
177 | - The subjects used in the sample are recorded in the column `'train_sample'` of the resulting PyNM.data object. Subjects used in the training sample are encoded as 1s, and the rest as 0s.
178 | 2. `'manual'`
179 | - It is also possible to specify exactly which subjects to use as a training group by providing a column in the input data labeled `'train_sample'`, encoded the same way. A sketch of both options is shown below.
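
A minimal sketch, continuing the API example above and assuming the Python API exposes a `train_sample` argument mirroring the `--train_sample` CLI flag:

```python
# Option 1: fit on a random sample of 80% of the controls.
m = PyNM(df, 'score', 'group', confounds=['age', 'c(sex)'], train_sample=0.8)
m.centiles_normative_model()
print(m.data['train_sample'].value_counts())  # 1 = used for fitting, 0 = held out

# Option 2: choose the training subjects manually via a 'train_sample' column.
df['train_sample'] = ((df['group'] == 0) & (df['age'] < 40)).astype(int)  # hypothetical rule
m = PyNM(df, 'score', 'group', confounds=['age', 'c(sex)'], train_sample='manual')
```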
180 |
181 | ### Models
182 | #### Centiles and LOESS Models
183 | Both the Centiles and LOESS models are non-parametric models based on local approximations. They accept only a single explanatory variable, passed using the `conf` option.
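
For instance, a minimal sketch (assuming `bin_spacing` and `bin_width` are accepted as keyword arguments mirroring the CLI flags; the values shown are placeholders):

```python
# Both models use only the first confound ('age' here).
m = PyNM(df, 'score', 'group', confounds=['age'])
m.centiles_normative_model(bin_spacing=5, bin_width=10)
m.loess_normative_model(bin_spacing=5, bin_width=10)
```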
184 |
185 | #### Gaussian Process Model
186 | Gaussian Process Regression (GPR), which underpins the Gaussian Process Model, can accept an arbitrary number of explanatory variables passed using the `confounds` option. Note: in order for GPR to be effective, the data must be homoskedastic. For a full discussion see [this paper](https://www.biorxiv.org/content/10.1101/2021.05.11.443565v1.full).
187 |
188 | GPR is very memory- and time-intensive. In order to have a scalable method, we've implemented both an exact model for smaller datasets and an approximate method recommended for datasets over ~1000 subjects. The method can be specified using the `method` option; it defaults to `auto`, in which the approximate model is chosen for datasets over 1000 points.
189 |
190 | ##### Exact Model
191 | The exact model implements [scikit-learn](https://scikit-learn.org/stable/index.html)'s Gaussian Process Regressor. The kernel is composed of a constant kernel, a white noise kernel, and a Matern kernel. The Matern kernel has parameters `nu` and `length_scale` that can be specified. The parameter `nu` has special values at 1.5 and 2.5, using other values will significantly increase computation time. See [documentation](https://scikit-learn.org/stable/modules/gaussian_process.html) for an overview of both.
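
A minimal sketch of fitting the exact model, assuming keyword arguments mirroring the `--gp_method`, `--gp_length_scale` and `--gp_nu` CLI flags:

```python
# nu = 1.5 or 2.5 keeps the Matern kernel computation fast.
m.gp_normative_model(method='exact', length_scale=1, nu=2.5)
```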
192 |
193 | ##### Approximate Model
194 | The approximate model implements a Stochastic Variational Gaussian Process (SVGP) model using [GPytorch](https://gpytorch.ai/), with a kernel closely matching the one in the exact model. SVGP is a deep learning technique that needs to be trained on minibatches for a set number of epochs; this can be tuned with the parameters `batch_size` and `num_epoch`. The model speeds up computation by using a subset of the data as inducing points; the `n_inducing` parameter controls how many points to use. See [documentation](https://docs.gpytorch.ai/en/v1.1.1/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html) for an overview.
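
A corresponding sketch for the approximate model, again assuming keyword arguments analogous to the CLI flags; the values shown are the CLI defaults:

```python
# Train the SVGP model on minibatches, with 500 inducing points.
m.gp_normative_model(method='approx', num_epochs=20,
                     n_inducing=500, batch_size=256)
```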
195 |
196 | #### GAMLSS
197 | Generalized Additive Models of Location Shape and Scale (GAMLSS) are a flexible modeling framework that can model heteroskedasticity, non-linear effects of variables, and hierarchical structure of the data. The implementation here is a Python wrapper for the R package gamlss; formulas for each parameter must be specified using functions available in that package (see [documentation](https://cran.r-project.org/web/packages/gamlss/index.html)). For a full discussion of using GAMLSS for normative modeling see [this paper](https://doi.org/10.1101/2021.06.14.448106).
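
A minimal sketch of a GAMLSS fit with explicit formulas, assuming keyword arguments mirroring the `--gamlss_*` CLI flags (formula strings use functions from the R gamlss package, e.g. `ps()` for penalized splines):

```python
# Model the mean as a smooth function of age plus sex, and let the
# scale vary smoothly with age to capture heteroskedasticity.
m.gamlss_normative_model(mu='score ~ ps(age) + sex',
                         sigma='~ ps(age)',
                         family='SHASHo2')
```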
198 |
199 | 
200 |
201 | ## References
202 |
203 | Original papers with Gaussian Processes (GP):
204 | - Marquand et al. Biological Psychiatry 2016 [doi:10.1016/j.biopsych.2015.12.023](https://doi.org/10.1016/j.biopsych.2015.12.023)
205 | - Marquand et al. Molecular Psychiatry 2019 [doi:10.1038/s41380-019-0441-1](https://doi.org/10.1038/s41380-019-0441-1)
206 |
207 | For limitations of Gaussian Processes:
208 | - Xu et al. PLoS ONE 2021, [The pitfalls of using Gaussian Process Regression for normative modeling](https://doi.org/10.1371/journal.pone.0252108)
209 |
210 | Example of use of the LOESS approach:
211 | - Lefebvre et al. Front. Neurosci. 2018 [doi:10.3389/fnins.2018.00662](https://doi.org/10.3389/fnins.2018.00662)
212 | - Maruani et al. Front. Psychiatry 2019 [doi:10.3389/fpsyt.2019.00011](https://doi.org/10.3389/fpsyt.2019.00011)
213 |
214 | For the Centiles approach see:
215 | - Bethlehem et al. Communications Biology 2020 [doi:10.1038/s42003-020-01212-9](https://doi.org/10.1038/s42003-020-01212-9)
216 | - R implementation [here](https://github.com/rb643/Normative_modeling).
217 |
218 | For the SVGP model see:
219 | - Hensman et al. [https://arxiv.org/pdf/1411.2005.pdf](https://arxiv.org/pdf/1411.2005.pdf)
220 |
221 | For GAMLSS see:
222 | - Dinga et al. [https://doi.org/10.1101/2021.06.14.448106](https://doi.org/10.1101/2021.06.14.448106)
223 | - R documentation [https://cran.r-project.org/web/packages/gamlss/index.html](https://cran.r-project.org/web/packages/gamlss/index.html)
224 |
225 | ## How to run tests
226 |
227 | To test the code locally, first make sure R and the required packages are installed, then follow the instructions above under **Installation: Bleeding-edge Installation**. Finally, run:
228 |
229 | ```bash
230 | $ pip install -r requirements.txt
231 | $ pytest test/test_pynm.py
232 | ```
233 |
234 | ## How to report errors
235 |
236 | Spotted a bug :beetle:? Check out the [open issues](https://github.com/ppsp-team/PyNM/issues) to see if we're already working on it. If not, open up a new issue and we will check it out when we can!
237 |
238 | ## How to contribute
239 |
240 | Thank you for considering contributing to our project! Before getting involved, please review our [contribution guidelines](https://github.com/ppsp-team/PyNM/blob/master/CONTRIBUTING.md).
241 |
242 | ## Support
243 |
244 | This work is supported by [IVADO](https://ivado.ca/), [FRQS](http://www.frqs.gouv.qc.ca/en/), [CFI](https://www.innovation.ca/), [MITACS](https://www.mitacs.ca/en), and [Compute Canada](https://computecanada.ca).
245 |
--------------------------------------------------------------------------------
/paper/figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/paper/figure1.png
--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
1 | @Article{marquand:2019,
2 | author={Marquand, Andre F.
3 | and Kia, Seyed Mostafa
4 | and Zabihi, Mariam
5 | and Wolfers, Thomas
6 | and Buitelaar, Jan K.
7 | and Beckmann, Christian F.},
8 | title={Conceptualizing mental disorders as deviations from normative functioning},
9 | journal={Molecular Psychiatry},
10 | year={2019},
11 | month={Oct},
12 | day={01},
13 | volume={24},
14 | number={10},
15 | pages={1415-1424},
16 | abstract={Normative models are a class of emerging statistical techniques useful for understanding the heterogeneous biology underlying psychiatric disorders at the level of the individual participant. Analogous to normative growth charts used in paediatric medicine for plotting child development in terms of height or weight as a function of age, normative models chart variation in clinical cohorts in terms of mappings between quantitative biological measures and clinically relevant variables. An emerging body of literature has demonstrated that such techniques are excellent tools for parsing the heterogeneity in clinical cohorts by providing statistical inferences at the level of the individual participant with respect to the normative range. Here, we provide a unifying review of the theory and application of normative modelling for understanding the biological and clinical heterogeneity underlying mental disorders. We first provide a statistically grounded yet non-technical overview of the conceptual underpinnings of normative modelling and propose a conceptual framework to link the many different methodological approaches that have been proposed for this purpose. We survey the literature employing these techniques, focusing principally on applications of normative modelling to quantitative neuroimaging-based biomarkers in psychiatry and, finally, we provide methodological considerations and recommendations to guide future applications of these techniques. We show that normative modelling provides a means by which the importance of modelling individual differences can be brought from theory to concrete data analysis procedures for understanding heterogeneous mental disorders and ultimately a promising route towards precision medicine in psychiatry.},
17 | issn={1476-5578},
18 | doi={10.1038/s41380-019-0441-1},
19 | url={https://doi.org/10.1038/s41380-019-0441-1}
20 | }
21 |
22 | @Article{marquand:2016,
23 | author={Marquand, Andre F.
24 | and Rezek, Iead
25 | and Buitelaar, Jan
26 | and Beckmann, Christian F.},
27 | title={Understanding Heterogeneity in Clinical Cohorts Using Normative Models: Beyond Case-Control Studies},
28 | journal={Biological psychiatry},
29 | year={2016},
30 | month={Oct},
31 | day={01},
32 | edition={2016/01/06},
33 | publisher={Elsevier},
34 | volume={80},
35 | number={7},
36 | pages={552-561},
37 | keywords={*Gaussian process; *Heterogeneity; *Normative model; *Outlier detection; *Patient stratification; *Research Domain Criteria; Adult; Attention Deficit Disorder with Hyperactivity/*diagnosis; Brain/physiology; Case-Control Studies; Cluster Analysis; *Data Interpretation, Statistical; Delay Discounting/physiology; Female; Functional Neuroimaging; Humans; Impulsive Behavior/physiology; Magnetic Resonance Imaging; Male; *Models, Statistical; Reward; Young Adult},
38 | abstract={BACKGROUND: Despite many successes, the case-control approach is problematic in biomedical science. It introduces an artificial symmetry whereby all clinical groups (e.g., patients and control subjects) are assumed to be well defined, when biologically they are often highly heterogeneous. By definition, it also precludes inference over the validity of the diagnostic labels. In response, the National Institute of Mental Health Research Domain Criteria proposes to map relationships between symptom dimensions and broad behavioral and biological domains, cutting across diagnostic categories. However, to date, Research Domain Criteria have prompted few methods to meaningfully stratify clinical cohorts. METHODS: We introduce normative modeling for parsing heterogeneity in clinical cohorts, while allowing predictions at an individual subject level. This approach aims to map variation within the cohort and is distinct from, and complementary to, existing approaches that address heterogeneity by employing clustering techniques to fractionate cohorts. To demonstrate this approach, we mapped the relationship between trait impulsivity and reward-related brain activity in a large healthy cohort (N = 491). RESULTS: We identify participants who are outliers within this distribution and show that the degree of deviation (outlier magnitude) relates to specific attention-deficit/hyperactivity disorder symptoms (hyperactivity, but not inattention) on the basis of individualized patterns of abnormality. CONCLUSIONS: Normative modeling provides a natural framework to study disorders at the individual participant level without dichotomizing the cohort. Instead, disease can be considered as an extreme of the normal range or as-possibly idiosyncratic-deviation from normal functioning. It also enables inferences over the degree to which behavioral variables, including diagnostic labels, map onto biology.},
39 | note={26927419[pmid]},
40 | note={PMC5023321[pmcid]},
41 | note={S0006-3223(16)00002-0[PII]},
42 | issn={1873-2402},
43 | doi={10.1016/j.biopsych.2015.12.023},
44 | url={https://pubmed.ncbi.nlm.nih.gov/26927419},
45 | url={https://doi.org/10.1016/j.biopsych.2015.12.023},
46 | language={eng}
47 | }
48 |
49 | @Article{loth:2021,
50 | doi = {10.1371/journal.pcbi.1009477},
51 | author = {Loth, Eva
52 | and Ahmad, Jumana
53 | and Chatham, Chris
54 | and López, Beatriz
55 | and Carter, Ben
56 | and Crawley, Daisy
57 | and Oakley, Bethany
58 | and Hayward, Hannah
59 | and Cooke, Jennifer
60 | and San José Cáceres, Antonia
61 | and Bzdok, Danilo
62 | and Jones, Emily
63 | and Charman, Tony
64 | and Beckmann, Christian
65 | and Bourgeron, Thomas
66 | and Toro, Roberto
67 | and Buitelaar, Jan
68 | and Murphy, Declan
69 | and Dumas, Guillaume},
70 | journal = {PLOS Computational Biology},
71 | publisher = {Public Library of Science},
72 | title = {The meaning of significant mean group differences for biomarker discovery},
73 | year = {2021},
74 | month = {11},
75 | volume = {17},
76 | url = {https://doi.org/10.1371/journal.pcbi.1009477},
77 | pages = {1-16},
78 | abstract = {Over the past decade, biomarker discovery has become a key goal in psychiatry to aid in the more reliable diagnosis and prognosis of heterogeneous psychiatric conditions and the development of tailored therapies. Nevertheless, the prevailing statistical approach is still the mean group comparison between “cases” and “controls,” which tends to ignore within-group variability. In this educational article, we used empirical data simulations to investigate how effect size, sample size, and the shape of distributions impact the interpretation of mean group differences for biomarker discovery. We then applied these statistical criteria to evaluate biomarker discovery in one area of psychiatric research—autism research. Across the most influential areas of autism research, effect size estimates ranged from small (d = 0.21, anatomical structure) to medium (d = 0.36 electrophysiology, d = 0.5, eye-tracking) to large (d = 1.1 theory of mind). We show that in normal distributions, this translates to approximately 45% to 63% of cases performing within 1 standard deviation (SD) of the typical range, i.e., they do not have a deficit/atypicality in a statistical sense. For a measure to have diagnostic utility as defined by 80% sensitivity and 80% specificity, Cohen’s d of 1.66 is required, with still 40% of cases falling within 1 SD. However, in both normal and nonnormal distributions, 1 (skewness) or 2 (platykurtic, bimodal) biologically plausible subgroups may exist despite small or even nonsignificant mean group differences. This conclusion drastically contrasts the way mean group differences are frequently reported. Over 95% of studies omitted the “on average” when summarising their findings in their abstracts (“autistic people have deficits in X”), which can be misleading as it implies that the group-level difference applies to all individuals in that group. We outline practical approaches and steps for researchers to explore mean group comparisons for the discovery of stratification biomarkers.},
79 | number = {11},
80 | }
81 |
82 | @Article{xu:2021,
83 | doi = {10.1371/journal.pone.0252108},
84 | author = {Xu, Bohan
85 | and Kuplicki, Rayus
86 | and Sen, Sandip
87 | and Paulus, Martin P.},
88 | journal = {PLOS ONE},
89 | publisher = {Public Library of Science},
90 | title = {The pitfalls of using Gaussian Process Regression for normative modeling},
91 | year = {2021},
92 | month = {09},
93 | volume = {16},
94 | url = {https://doi.org/10.1371/journal.pone.0252108},
95 | pages = {1-14},
96 | abstract = {Normative modeling, a group of methods used to quantify an individual’s deviation from some expected trajectory relative to observed variability around that trajectory, has been used to characterize subject heterogeneity. Gaussian Processes Regression includes an estimate of variable uncertainty across the input domain, which at face value makes it an attractive method to normalize the cohort heterogeneity where the deviation between predicted value and true observation is divided by the derived uncertainty directly from Gaussian Processes Regression. However, we show that the uncertainty directly from Gaussian Processes Regression is irrelevant to the cohort heterogeneity in general.},
97 | number = {9},
98 | }
99 |
100 | @Misc{pcntoolkit,
101 | author = {Andre F. Marquand
102 | and Saige Rutherford
103 | and Seyed Mostafa Kia
104 | and Thomas Wolfers
105 | and Charlotte Fraza
106 | and Richard Dinga
107 | and Mariam Zabihi},
108 | title = {PCNToolkit (0.20)},
109 | year = {2021},
110 | publisher = {Zenodo},
111 | doi = {10.5281/zenodo.5207839},
112 | }
113 |
114 | @InProceedings{kia:2020,
115 | author="Kia, Seyed Mostafa
116 | and Huijsdens, Hester
117 | and Dinga, Richard
118 | and Wolfers, Thomas
119 | and Mennes, Maarten
120 | and Andreassen, Ole A.
121 | and Westlye, Lars T.
122 | and Beckmann, Christian F.
123 | and Marquand, Andre F.",
124 | editor="Martel, Anne L.
125 | and Abolmaesumi, Purang
126 | and Stoyanov, Danail
127 | and Mateus, Diana
128 | and Zuluaga, Maria A.
129 | and Zhou, S. Kevin
130 | and Racoceanu, Daniel
131 | and Joskowicz, Leo",
132 | title="Hierarchical Bayesian Regression for Multi-site Normative Modeling of Neuroimaging Data",
133 | booktitle="Medical Image Computing and Computer Assisted Intervention -- MICCAI 2020",
134 | year="2020",
135 | publisher="Springer International Publishing",
136 | address="Cham",
137 | pages="699--709",
138 | abstract="Clinical neuroimaging has recently witnessed explosive growth in data availability which brings studying heterogeneity in clinical cohorts to the spotlight. Normative modeling is an emerging statistical tool for achieving this objective. However, its application remains technically challenging due to difficulties in properly dealing with nuisance variation, for example due to variability in image acquisition devices. Here, in a fully probabilistic framework, we propose an application of hierarchical Bayesian regression (HBR) for multi-site normative modeling. Our experimental results confirm the superiority of HBR in deriving more accurate normative ranges on large multi-site neuroimaging data compared to widely used methods. This provides the possibility i) to learn the normative range of structural and functional brain measures on large multi-site data; ii) to recalibrate and reuse the learned model on local small data; therefore, HBR closes the technical loop for applying normative modeling as a medical tool for the diagnosis and prognosis of mental disorders.",
139 | isbn="978-3-030-59728-3"
140 | }
141 |
142 | @Article {kia:2021,
143 | author = {Kia, Seyed Mostafa
144 | and Huijsdens, Hester
145 | and Rutherford, Saige
146 | and Dinga, Richard
147 | and Wolfers, Thomas
148 | and Mennes, Maarten
149 | and Andreassen, Ole A.
150 | and Westlye, Lars T.
151 | and Beckmann, Christian F.
152 | and Marquand, Andre F.},
153 | title = {Federated Multi-Site Normative Modeling using Hierarchical Bayesian Regression},
154 | elocation-id = {2021.05.28.446120},
155 | year = {2021},
156 | doi = {10.1101/2021.05.28.446120},
157 | publisher = {Cold Spring Harbor Laboratory},
158 | abstract = {Clinical neuroimaging data availability has grown substantially in the last decade, providing the potential for studying heterogeneity in clinical cohorts on a previously unprecedented scale. Normative modeling is an emerging statistical tool for dissecting heterogeneity in complex brain disorders. However, its application remains technically challenging due to medical data privacy issues and difficulties in dealing with nuisance variation, such as the variability in the image acquisition process. Here, we introduce a federated probabilistic framework using hierarchical Bayesian regression (HBR) for multi-site normative modeling. The proposed method completes the life-cycle of normative modeling by providing the possibilities to learn, update, and adapt the model parameters on decentralized neuroimaging data. Our experimental results confirm the superiority of HBR in deriving more accurate normative ranges on large multi-site neuroimaging datasets compared to the current standard methods. In addition, our approach provides the possibility to recalibrate and reuse the learned model on local datasets and even on datasets with very small sample sizes. The proposed federated framework closes the technical loop for applying normative modeling across multiple sites in a decentralized manner. This will facilitate applications of normative modeling as a medical tool for screening the biological deviations in individuals affected by complex illnesses such as mental disorders.Competing Interest StatementOle A. Andreassen is a consultant to HealthLytix and received a speaker honorarium from Lundbeck. Christian F. Beckmann is a shareholder and director of SBG Neuro.},
159 | URL = {https://www.biorxiv.org/content/early/2021/05/30/2021.05.28.446120},
160 | eprint = {https://www.biorxiv.org/content/early/2021/05/30/2021.05.28.446120.full.pdf},
161 | journal = {bioRxiv}
162 | }
163 |
164 | @Article {dinga:2021,
165 | author = {Dinga, Richard
166 | and Fraza, Charlotte J.
167 | and Bayer, Johanna M.M.
168 | and Kia, Seyed Mostafa
169 | and Beckmann, Christian F.
170 | and Marquand, Andre F.},
171 | title = {Normative modeling of neuroimaging data using generalized additive models of location scale and shape},
172 | elocation-id = {2021.06.14.448106},
173 | year = {2021},
174 | doi = {10.1101/2021.06.14.448106},
175 | publisher = {Cold Spring Harbor Laboratory},
176 | abstract = {Normative modeling aims to quantify the degree to which an individual{\textquoteright}s brain deviates from a reference sample with respect to one or more variables, which can be used as a potential biomarker of a healthy brain and as a tool to study heterogeneity of psychiatric disorders. The application of normative models is hindered by methodological challenges and lacks standards for the usage and evaluation of normative models. In this paper, we present generalized additive models for location scale and shape (GAMLSS) for normative modeling of neuroimaging data, a flexible modeling framework that can model heteroskedasticity, non-linear effects of variables, and hierarchical structure of the data. It can model non-Gaussian distributions, and it allows for an automatic model order selection, thus improving the accuracy of normative models while mitigating problems of overfitting. Furthermore, we describe measures and diagnostic tools suitable for evaluating normative models and step-by-step examples of normative modeling, including fitting several candidate models, selecting the best models, and transferring them to new scan sites.Competing Interest StatementThe authors have declared no competing interest.},
177 | URL = {https://www.biorxiv.org/content/early/2021/06/14/2021.06.14.448106},
178 | eprint = {https://www.biorxiv.org/content/early/2021/06/14/2021.06.14.448106.full.pdf},
179 | journal = {bioRxiv}
180 | }
181 |
182 | @Article{fraza:2021,
183 | title = {Warped Bayesian linear regression for normative modelling of big data},
184 | journal = {NeuroImage},
185 | volume = {245},
186 | pages = {118715},
187 | year = {2021},
188 | issn = {1053-8119},
189 | doi = {10.1016/j.neuroimage.2021.118715},
190 | url = {https://www.sciencedirect.com/science/article/pii/S1053811921009873},
191 | author = {Charlotte J. Fraza
192 | and Richard Dinga
193 | and Christian F. Beckmann
194 | and Andre F. Marquand},
195 | keywords = {Machine learning, UK Biobank, Big data, Bayesian linear regression, Normative modelling},
196 | abstract = {Normative modelling is becoming more popular in neuroimaging due to its ability to make predictions of deviation from a normal trajectory at the level of individual participants. It allows the user to model the distribution of several neuroimaging modalities, giving an estimation for the mean and centiles of variation. With the increase in the availability of big data in neuroimaging, there is a need to scale normative modelling to big data sets. However, the scaling of normative models has come with several challenges. So far, most normative modelling approaches used Gaussian process regression, and although suitable for smaller datasets (up to a few thousand participants) it does not scale well to the large cohorts currently available and being acquired. Furthermore, most neuroimaging modelling methods that are available assume the predictive distribution to be Gaussian in shape. However, deviations from Gaussianity can be frequently found, which may lead to incorrect inferences, particularly in the outer centiles of the distribution. In normative modelling, we use the centiles to give an estimation of the deviation of a particular participant from the ‘normal’ trend. Therefore, especially in normative modelling, the correct estimation of the outer centiles is of utmost importance, which is also where data are sparsest. Here, we present a novel framework based on Bayesian linear regression with likelihood warping that allows us to address these problems, that is, to correctly model non-Gaussian predictive distributions and scale normative modelling elegantly to big data cohorts. In addition, this method provides likelihood-based statistics, which are useful for model selection. To evaluate this framework, we use a range of neuroimaging-derived measures from the UK Biobank study, including image-derived phenotypes (IDPs) and whole-brain voxel-wise measures derived from diffusion tensor imaging. We show good computational scaling and improved accuracy of the warped BLR for certain IDPs and voxels if there was a deviation from normality of these parameters in their residuals. The present results indicate the advantage of a warped BLR in terms of; computational scalability and the flexibility to incorporate non-linearity and non-Gaussianity of the data, giving a wider range of neuroimaging datasets that can be correctly modelled.}
197 | }
198 |
199 | @Article{rutherford:2022a,
200 | article_type = {journal},
201 | title = {Charting brain growth and aging at high spatial precision},
202 | author = {Rutherford, Saige
203 | and Fraza, Charlotte
204 | and Dinga, Richard
205 | and Kia, Seyed Mostafa
206 | and Wolfers, Thomas
207 | and Zabihi, Mariam
208 | and Berthet, Pierre
209 | and Worker, Amanda
210 | and Verdi, Serena
211 | and Andrews, Derek
212 | and Han, Laura KM
213 | and Bayer, Johanna MM
214 | and Dazzan, Paola
215 | and McGuire, Phillip
216 | and Mocking, Roel T
217 | and Schene, Aart
218 | and Sripada, Chandra
219 | and Tso, Ivy F
220 | and Duval, Elizabeth R
221 | and Chang, Soo-Eun
222 | and Penninx, Brenda WJH
223 | and Heitzeg, Mary M
224 | and Burt, S Alexandra
225 | and Hyde, Luke W
226 | and Amaral, David
227 | and Wu Nordahl, Christine
228 | and Andreasssen, Ole A
229 | and Westlye, Lars T
230 | and Zahn, Roland
231 | and Ruhe, Henricus G
232 | and Beckmann, Christian
233 | and Marquand, Andre F},
234 | editor = {Baker, Chris I and Taschler, Bernd and Esteban, Oscar and Constable, Todd},
235 | volume = 11,
236 | year = 2022,
237 | month = {feb},
238 | pub_date = {2022-02-01},
239 | pages = {e72904},
240 | citation = {eLife 2022;11:e72904},
241 | doi = {10.7554/eLife.72904},
242 | url = {https://doi.org/10.7554/eLife.72904},
243 | abstract = {Defining reference models for population variation, and the ability to study individual deviations is essential for understanding inter-individual variability and its relation to the onset and progression of medical conditions. In this work, we assembled a reference cohort of neuroimaging data from 82 sites (N=58,836; ages 2–100) and used normative modeling to characterize lifespan trajectories of cortical thickness and subcortical volume. Models are validated against a manually quality checked subset (N=24,354) and we provide an interface for transferring to new data sources. We showcase the clinical value by applying the models to a transdiagnostic psychiatric sample (N=1985), showing they can be used to quantify variability underlying multiple disorders whilst also refining case-control inferences. These models will be augmented with additional samples and imaging modalities as they become available. This provides a common reference platform to bind results from different studies and ultimately paves the way for personalized clinical decision-making.},
244 | keywords = {normative model, lifespan, growth chart, brain chart, big data, individual prediction},
245 | journal = {eLife},
246 | issn = {2050-084X},
247 | publisher = {eLife Sciences Publications, Ltd},
248 | }
249 |
250 | @Article{rutherford:2022b,
251 | author={Rutherford, Saige
252 | and Kia, Seyed Mostafa
253 | and Wolfers, Thomas
254 | and Fraza, Charlotte
255 | and Zabihi, Mariam
256 | and Dinga, Richard
257 | and Berthet, Pierre
258 | and Worker, Amanda
259 | and Verdi, Serena
260 | and Ruhe, Henricus G.
261 | and Beckmann, Christian F.
262 | and Marquand, Andre F.},
263 | title={The normative modeling framework for computational psychiatry},
264 | journal={Nature Protocols},
265 | year={2022},
266 | month={Jul},
267 | day={01},
268 | volume={17},
269 | number={7},
270 | pages={1711-1734},
271 | abstract={Normative modeling is an emerging and innovative framework for mapping individual differences at the level of a single subject or observation in relation to a reference model. It involves charting centiles of variation across a population in terms of mappings between biology and behavior, which can then be used to make statistical inferences at the level of the individual. The fields of computational psychiatry and clinical neuroscience have been slow to transition away from patient versus `healthy' control analytic approaches, probably owing to a lack of tools designed to properly model biological heterogeneity of mental disorders. Normative modeling provides a solution to address this issue and moves analysis away from case--control comparisons that rely on potentially noisy clinical labels. Here we define a standardized protocol to guide users through, from start to finish, normative modeling analysis using the Predictive Clinical Neuroscience toolkit (PCNtoolkit). We describe the input data selection process, provide intuition behind the various modeling choices and conclude by demonstrating several examples of downstream analyses that the normative model may facilitate, such as stratification of high-risk individuals, subtyping and behavioral predictive modeling. The protocol takes {\textasciitilde}1--3 h to complete.},
272 | issn={1750-2799},
273 | doi={10.1038/s41596-022-00696-5},
274 | url={https://doi.org/10.1038/s41596-022-00696-5}
275 | }
276 |
277 | @article{rigby:2005,
278 | author = {Rigby, R. A. and Stasinopoulos, D. M.},
279 | title = {Generalized additive models for location, scale and shape},
280 | journal = {Journal of the Royal Statistical Society: Series C (Applied Statistics)},
281 | volume = {54},
282 | number = {3},
283 | pages = {507-554},
284 | doi = {10.1111/j.1467-9876.2005.00510.x},
285 | url = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9876.2005.00510.x},
286 | eprint = {https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-9876.2005.00510.x},
287 | year = {2005}
288 | }
289 |
290 | @Article{lefebvre:2018,
291 | author={Lefebvre, Aline
292 | and Delorme, Richard
293 | and Delanoë, Catherine
294 | and Amsellem, Frederique
295 | and Beggiato, Anita
296 | and Germanaud, David
297 | and Bourgeron, Thomas
298 | and Toro, Roberto
299 | and Dumas, Guillaume},
300 | title={Alpha Waves as a Neuromarker of Autism Spectrum Disorder: The Challenge of Reproducibility and Heterogeneity},
301 | journal={Frontiers in Neuroscience},
302 | volume={12},
303 | year={2018},
304 | url={https://www.frontiersin.org/article/10.3389/fnins.2018.00662},
305 | doi={10.3389/fnins.2018.00662},
306 | issn={1662-453X},
307 | abstract={Background: There is no consensus in the literature concerning the presence of abnormal alpha wave profiles in patients with autism spectrum disorder (ASD). This may be due to phenotypic heterogeneity among patients as well as the limited sample sizes utilized. Here we present our results of alpha wave profile analysis based on a sample larger than most of those in the field, performed using a robust processing pipeline.Methods: We compared the alpha waves profiles at rest in children with ASD to those of age-, sex-, and IQ-matched control individuals. We used linear regression and non-parametric normative models using age as covariate forparsing the clinical heterogeneity. We explored the correlation between EEG profiles and the patient’s brain volumes, obtained from structural MRI. We automatized the detection of the alpha peak and visually quality controled our MRI measurements. We assessed the robustness of our results by running the EEG preprocessing with two different versions of Matlab as well as Python.Results: A simple linear regression between peak power or frequency of the alpha waves and the status or age of the participants did not allow to identify any statistically significant relationship. The non-parametric normative model (which took account the non-linear effect of age on the alpha profiles) suggested that participants with ASD displayed more variability than control participants for both frequency and amplitude of the alpha peak (p < 0.05). Independent of the status of the individual, we also observed weak associations (uncorrected p < 0.05) between the alpha frequency, and the volumes of several cortical and subcortical structures (in particular the striatum), but which did not survive correction for multiple testing and changed between analysis pelines.Discussions: Our study did not find evidence for abnormal alpha wave profiles in ASD. We propose, however, an analysis pipeline to perform standardized and automatized EEG analyses on large cohorts. These should help the community to address the challenge of clinical heterogeneity of ASD and to tackle the problems of reproducibility.}
308 | }
309 |
310 | @Article{maruani:2019,
311 | author={Maruani, Anna
312 | and Dumas, Guillaume
313 | and Beggiato, Anita
314 | and Traut, Nicolas
315 | and Peyre, Hugo
316 | and Cohen-Freoua, Alicia
317 | and Amsellem, Frédérique
318 | and Elmaleh, Monique
319 | and Germanaud, David
320 | and Launay, Jean-Marie
321 | and Bourgeron, Thomas
322 | and Toro, Roberto
323 | and Delorme, Richard},
324 | title={Morning Plasma Melatonin Differences in Autism: Beyond the Impact of Pineal Gland Volume},
325 | journal={Frontiers in Psychiatry},
326 | volume={10},
327 | year={2019},
328 | url={https://www.frontiersin.org/article/10.3389/fpsyt.2019.00011},
329 | doi={10.3389/fpsyt.2019.00011},
330 | issn={1664-0640},
331 | abstract={While low plasma melatonin, a neuro-hormone synthesized in the pineal gland, has been frequently associated with autism, our understanding of the mechanisms behind it have remained unclear. In this exploratory study, we hypothesized that low melatonin levels in ASD could be linked to a decrease of the pineal gland volume (PGV). PGV estimates with magnetic resonance imaging (MRI) with a voxel-based volumetric measurement method and early morning plasma melatonin levels were evaluated for 215 participants, including 78 individuals with ASD, 90 unaffected relatives, and 47 controls. We first found that both early morning melatonin level and PGV were lower in patients compared to controls. We secondly built a linear model and observed that plasma melatonin was correlated to the group of the participant, but also to the PGV. To further understand the relationship between PGV and melatonin, we generated a normative model of the PGV relationship with melatonin level based on control participant data. We found an effect of PGV on normalized melatonin levels in ASD. Melatonin deficit appeared however more related to the group of the subject. Thus, melatonin variations in ASD could be mainly driven by melatonin pathway dysregulation.}
332 | }
333 |
334 | @Article{bethlehem:2020,
335 | author={Bethlehem, Richard A. I.
336 | and Seidlitz, Jakob
337 | and Romero-Garcia, Rafael
338 | and Trakoshis, Stavros
339 | and Dumas, Guillaume
340 | and Lombardo, Michael V.},
341 | title={A normative modelling approach reveals age-atypical cortical thickness in a subgroup of males with autism spectrum disorder},
342 | journal={Communications Biology},
343 | year={2020},
344 | month={Sep},
345 | day={04},
346 | volume={3},
347 | number={1},
348 | pages={486},
349 | abstract={Understanding heterogeneity is an important goal on the path to precision medicine for autism spectrum disorders (ASD). We examined how cortical thickness (CT) in ASD can be parameterized as an individualized metric of atypicality relative to typically-developing (TD) age-related norms. Across a large sample (n{\thinspace}={\thinspace}870 per group) and wide age range (5--40 years), we applied normative modelling resulting in individualized whole-brain maps of age-related CT atypicality in ASD and isolating a small subgroup with highly age-atypical CT. Age-normed CT scores also highlights on-average differentiation, and associations with behavioural symptomatology that is separate from insights gleaned from traditional case-control approaches. This work showcases an individualized approach for understanding ASD heterogeneity that could potentially further prioritize work on a subset of individuals with cortical pathophysiology represented in age-related CT atypicality. Only a small subset of ASD individuals are actually highly atypical relative to age-norms. driving small on-average case-control differences.},
350 | issn={2399-3642},
351 | doi={10.1038/s42003-020-01212-9},
352 | url={https://doi.org/10.1038/s42003-020-01212-9}
353 | }
354 |
--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'PyNM: A Lightweight Python Implementation of Normative Modeling'
3 | tags:
4 | - Python
5 | - Normative Modeling
6 | - Heterogeneity
7 | - Heteroskedasticity
8 | - Big Data
9 | - Centiles
10 | - LOESS
11 | - Gaussian Process
12 | - Stochastic Variational Gaussian Process
13 | - GAMLSS
14 | - Computational Psychiatry
15 | - Neuroscience
16 | authors:
17 | - name: Harvey, Annabelle
18 | orcid: 0000-0002-9940-8799
19 | affiliation: "1, 2"
20 | - name: Dumas, Guillaume
21 | orcid: 0000-0002-2253-1844
22 | affiliation: "2, 3"
23 | affiliations:
24 | - name: Centre de Recherche de l’Institut Universitaire de Gériatrie de Montréal, Université de Montréal, QC, Canada
25 | index: 1
26 | - name: Centre de Recherche du CHU Sainte-Justine, Université de Montréal, QC, Canada
27 | index: 2
28 | - name: Mila - Quebec AI Institute, Université de Montréal, QC, Canada
29 | index: 3
30 | date: 10 March 2022
31 | bibliography: paper.bib
32 | ---
33 |
34 |
35 | # Summary
36 |
37 | The majority of studies in neuroimaging and psychiatry are focused on case-control analysis [@marquand:2019]. However, the case-control approach relies on well-defined groups, which are more the exception than the rule in biology. Psychiatric conditions are diagnosed based on symptoms alone, which makes for heterogeneity at the biological level [@marquand:2016]. Relying on mean differences obscures this heterogeneity, and the resulting loss of information can produce unreliable results or misleading conclusions [@loth:2021].
38 |
39 | Normative Modeling is an emerging alternative to case-control analyses that seeks to parse heterogeneity by looking at how individuals deviate from the normal trajectory. Analogous to normative growth charts, normative models map the mean and variance of a trait for a given population against a set of explanatory variables (usually including age). Statistical inferences at the level of the individual participant can then be obtained with respect to the normative range [@marquand:2019]. This framework can detect patterns of abnormality that might not be consistent across the population, and recasts disease as an extreme deviation from the normal range rather than a separate group.
40 |
41 | PyNM is a lightweight Python implementation of Normative Modeling that makes the method approachable and easy to adopt. The package provides:
42 |
43 | - Python API and a command-line interface for wide accessibility
44 | - Automatic dataset splitting and cross-validation
45 | - Five models from various back-ends in a unified interface that cover a broad range of common use cases
46 | - Solutions for very large datasets and heteroskedastic data
47 | - Integrated plotting and evaluation functions to quickly check the validity of the model fit and results
48 | - Comprehensive and interactive tutorials
49 |
50 |
51 | # Statement of need
52 |
53 | The basic idea underpinning Normative Modeling is to fit a model on the controls (or a subset of them) of a dataset, and then apply it to the rest of the participants. The difference between the model’s prediction and the ground truth for an unseen participant, relative to the variance around the prediction, quantifies their deviation from the norm. While simple in concept, implementing Normative Modeling requires some care in managing the dataset and choosing an appropriate model.
54 |
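In code, this deviation is a z-score: the residual between the observed score and the normative prediction, scaled by the predictive standard deviation. A minimal sketch (the arrays `y_true`, `y_pred`, and `sigma` are illustrative placeholders, not PyNM output):

```
import numpy as np

# A normative model fit on controls yields, for each unseen participant,
# a predicted score (y_pred) and a predictive standard deviation (sigma).
y_true = np.array([3.1, 2.4, 5.0])  # observed scores
y_pred = np.array([2.9, 2.5, 3.2])  # normative predictions
sigma = np.array([0.5, 0.4, 0.6])   # predictive standard deviations

z = (y_true - y_pred) / sigma       # deviation from the norm, in units of sigma
```
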
55 | In principle, any model that estimates both the mean and variance of the predictive distribution could be used for Normative Modeling. However, in practice, we impose more constraints. First and foremost, the assumptions of the model must be met by the data. Second, it is important to distinguish between epistemic and aleatoric uncertainty. Epistemic or systematic uncertainty stems from how information about the distribution is collected, whereas aleatoric uncertainty is intrinsic to the distribution and represents the true variation of the population [@xu:2021].
56 |
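As a toy illustration of aleatoric uncertainty that varies across the population (heteroskedasticity), consider a simulated score whose spread grows with age; a model that assumes constant variance would systematically mis-calibrate deviation scores at both ends of the age range. This simulation is purely illustrative:

```
import numpy as np

rng = np.random.default_rng(0)
age = rng.uniform(5, 40, size=1000)
# The noise scale grows with age: the true population variance is not
# constant, so a normative model must estimate sigma as a function of age.
score = 0.1 * age + rng.normal(0, 0.05 * age)
```
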
57 | To the authors’ knowledge, PCNtoolkit [@pcntoolkit] is the only other available package for Normative Modeling. It implements methods that have been applied in a range of psychiatry and neuroimaging studies [@kia:2020; @kia:2021; @rutherford:2022a; @fraza:2021], and is accompanied by thorough [tutorials](https://pcntoolkit.readthedocs.io/en/latest/pages/BLR_normativemodel_protocol.html), a [forum](https://gitter.im/predictive-clinical-neuroscience/community), and a framework for Normative Modeling in computational psychiatry [@rutherford:2022b]. While PCNtoolkit offers more advanced functionality, PyNM emphasizes being lightweight and easy to use, and implements different models than PCNtoolkit, including a wrapper for the GAMLSS package from R, which is a powerful option for Normative Modeling [@dinga:2021].
58 |
59 | PyNM is intended to take users from their first steps in Normative Modeling to using advanced models on complex datasets. Crucially, it manages the dataset and has interactive tutorials – making it quick for new users to try the method either on their own data or on provided simulated data. The tutorials motivate the use of each model and highlight their limitations to help clarify which model is appropriate for what data, and built-in plotting and evaluation functions (\autoref{fig:Figure 1}) make it simple to check the validity of the model output. The package includes five models from various backends in a unified interface, including a wrapper for GAMLSS [@rigby:2005] from R that is otherwise not yet available in Python; the selected models cover many settings, including big data and heteroskedasticity.
60 |
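For instance, heteroskedastic data can be handled by passing GAMLSS an explicit formula for its scale parameter. A sketch using the `m` object from the usage example below (column names are illustrative; formulas follow the R GAMLSS syntax):

```
# Model the mean as a smooth function of age plus sex, and let the
# scale (sigma) also vary smoothly with age to capture heteroskedasticity.
m.gamlss_normative_model(mu='score ~ ps(age) + sex',
                         sigma='~ ps(age)',
                         family='SHASHo2')
```
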
61 | Earlier versions of PyNM code were used in the following publications:
62 |
63 | - @lefebvre:2018
64 | - @maruani:2019
65 | - @bethlehem:2020
66 |
67 | # Usage Example
68 | ```
69 | import pandas as pd
70 | from pynm.pynm import PyNM
71 |
72 | # Load data: df contains columns 'score','group','age','sex','site'
73 | df = pd.read_csv('data.csv')
74 |
75 | # Initialize pynm w/ data and confounds
76 | m = PyNM(df,'score','group', confounds = ['age','c(sex)','c(site)'])
77 |
78 | # Run models
79 | m.loess_normative_model()
80 | m.centiles_normative_model()
81 | m.gp_normative_model()
82 | m.gamlss_normative_model()
83 |
84 | # Collect output
85 | data = m.data
86 | ```
87 |
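After fitting, the returned DataFrame holds each model's predictions, predictive standard deviations, and deviation (z) scores, and fit metrics are stored as attributes of the PyNM object. A sketch based on the column and attribute names defined in `pynm/pynm.py`:

```
# Per-participant deviation scores for each fitted model
z_scores = data[['LOESS_z', 'Centiles_z', 'GP_z']]

# Goodness-of-fit metrics computed on the controls
print(m.RMSE_LOESS, m.SMSE_LOESS)
print(m.RMSE_GP, m.SMSE_GP, m.MSLL_GP)
```
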
88 | # Figures
89 |
90 | 
91 |
92 | # Acknowledgements
93 |
94 | The development of this code has benefited from useful discussions with Andre Marquand, Thomas Wolfers, Eva Loth, Jumana Amad, Richard Bethlehem, and Michael Lombardo. The authors also want to thank the two reviewers Saige Rutherford ([`@saigerutherford`](https://github.com/saigerutherford)) and Seyed Mostafa Kia ([`@smkia`](https://github.com/smkia)) for their insightful feedback.
95 |
96 | Funding: This work is supported by IVADO, FRQS, CFI, MITACS, and Compute Canada.
97 |
98 | # References
99 |
--------------------------------------------------------------------------------
/pynm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm/__init__.py
--------------------------------------------------------------------------------
/pynm/cli.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from pynm import pynm
3 | import pandas as pd
4 |
5 | def _cli_parser():
6 | """Reads command line arguments and returns input specifications
7 |
8 | Returns
9 | -------
10 | dict
11 | Parsed arguments.
12 | """
13 | parser = ArgumentParser()
14 | parser.add_argument("--pheno_p",dest='pheno_p',required=True,
15 | help="Path to phenotype data. Data must be in a .csv file.")
16 | parser.add_argument("--out_p",dest='out_p',required=True,
17 | help="Path to output directory.")
18 | parser.add_argument("--confounds",dest='confounds',required=True,
19 | help="List of confounds to use in the GP model."
20 | "The list must formatted as a string with commas between confounds, "
21 | "each confound must be a column name from the phenotype .csv file. "
22 | "For GP model all confounds will be used, for LOESS and Centiles models "
23 | "only the first is used. For GAMLSS all confounds are used "
24 | "unless formulas are specified. Categorical values must be denoted by c(var) "
25 | "('c' must be lower case), e.g. 'c(SEX)' for column name 'SEX'.")
26 | parser.add_argument("--score",dest='score',required=True,
27 | help="Response variable for all models. "
28 | "Must be a column title from phenotype .csv file.")
29 | parser.add_argument("--group",dest='group',required=True,
30 | help="Column name from the phenotype .csv file that "
31 | "distinguishes probands from controls. The column must be "
32 | "encoded with str labels using 'PROB' for probands and 'CTR' for controls "
33 | "or with int labels using 1 for probands and 0 for controls.")
34 | parser.add_argument("--train_sample",default=1,dest='train_sample',
35 | help="Which method to use for a training sample, can be a float in (0,1] "
36 | "for a percentage of controls or 'manual' to be manually set using a column "
37 | "of the DataFrame labelled 'train_sample'.")
38 | parser.add_argument("--LOESS",dest='LOESS',action='store_true',
39 | help="Flag to run LOESS model.")
40 | parser.add_argument("--centiles",dest='centiles',action='store_true',
41 | help="Flag to run Centiles model.")
42 | parser.add_argument("--bin_spacing",default = -1,dest='bin_spacing',
43 | help="Distance between bins for LOESS & centiles models.")
44 | parser.add_argument("--bin_width",default = -1,dest='bin_width',
45 | help="Width of bins for LOESS & centiles models.")
46 | parser.add_argument("--GP",dest='GP',action='store_true',
47 | help="Flag to run Gaussian Process model.")
48 | parser.add_argument("--gp_method",default = 'auto',dest='gp_method',
49 | help="Method to use for the GP model. Can be set to "
50 | "'auto','approx' or 'exact'. In 'auto' mode, "
51 | "the exact model will be used for datasets smaller "
52 | "than 2000 data points. SVGP is used for the approximate model. "
53 | "See documentation for details. Default value is 'auto'.")
54 | parser.add_argument("--gp_num_epochs",default=20, dest='gp_num_epochs',
55 | help="Number of training epochs for SVGP model. "
56 | "See documentation for details. Default value is 20.")
57 | parser.add_argument("--gp_n_inducing",default=500,dest='gp_n_inducing',
58 | help="Number of inducing points for SVGP model. "
59 | "See documentation for details. Default value is 500.")
60 | parser.add_argument("--gp_batch_size",default=256,dest='gp_batch_size',
61 | help="Batch size for training and predicting from SVGP model. "
62 | "See documentation for details. Default value is 256.")
63 | parser.add_argument("--gp_length_scale",default=1,dest='gp_length_scale',
64 | help="Length scale of Matern kernel for exact model. "
65 | "See documentation for details. Default value is 1.")
66 | parser.add_argument("--gp_length_scale_bounds",default=(1e-5,1e5),dest='gp_length_scale_bounds', nargs='*',
67 | help="The lower and upper bound on length_scale. If set to 'fixed', "
68 | "length_scale cannot be changed during hyperparameter tuning. "
69 | "See documentation for details. Default value is (1e-5,1e5).")
70 | parser.add_argument("--gp_nu",default=2.5,dest='nu',
71 | help="Nu of Matern kernel for exact and SVGP model. "
72 | "See documentation for details. Default value is 2.5.")
73 | parser.add_argument("--GAMLSS",dest='GAMLSS',action='store_true',
74 | help="Flag to run GAMLSS.")
75 | parser.add_argument("--gamlss_mu",default=None,dest='gamlss_mu',
76 | help="Formula for mu (location) parameter of GAMLSS. Default "
77 | "formula for score is sum of confounds with non-categorical "
78 | "columns as smooth functions, e.g. 'score ~ ps(age) + sex'.")
79 | parser.add_argument("--gamlss_sigma",default=None,dest='gamlss_sigma',
80 | help="Formula for mu (location) parameter of GAMLSS. Default "
81 | "formula is '~ 1'.")
82 | parser.add_argument("--gamlss_nu",default=None,dest='gamlss_nu',
83 | help="Formula for mu (location) parameter of GAMLSS. Default "
84 | "formula is '~ 1'.")
85 | parser.add_argument("--gamlss_tau",default=None,dest='gamlss_tau',
86 | help="Formula for mu (location) parameter of GAMLSS. Default "
87 | "formula is '~ 1'.")
88 | parser.add_argument("--gamlss_family",default='SHASHo2',dest='gamlss_family',
89 | help="Family of distributions to use for fitting, default is 'SHASHo2'. "
90 | "See R documentation for GAMLSS package for other available families of distributions.")
91 | return parser.parse_args()
92 |
93 | def get_bounds(bounds):
94 | """Converts gp_length_scale_bounds parameter to appropriate type.
95 |
96 | Returns
97 | -------
98 | pair of floats >= 0 or 'fixed'
99 | Appropriate argument for PyNM.gp_normative_model.
100 |
101 | Raises
102 | ------
103 | ValueError
104 | Unrecognized argument for gp_length_scale_bounds.
105 | """
106 | if isinstance(bounds,list):
107 | if len(bounds)==1 and bounds[0]=='fixed':
108 | return 'fixed'
109 | elif len(bounds)==2:
110 | return (float(bounds[0]),float(bounds[1]))
111 | else:
112 | raise ValueError('Unrecognized argument for gp_length_scale_bounds.')
113 | else:
114 | return bounds
115 |
116 | def main():
117 | params = vars(_cli_parser())
118 |
119 | confounds = params['confounds'].split(',')
120 | data = pd.read_csv(params['pheno_p'])
121 |
122 | m = pynm.PyNM(data,params['score'],params['group'],confounds,params['train_sample'],
123 | bin_spacing=params['bin_spacing'], bin_width=params['bin_width'])
124 |
125 | #Run models
126 | if params['LOESS']:
127 | m.loess_normative_model()
128 | m.bins_num()
129 | if params['centiles']:
130 | m.centiles_normative_model()
131 | m.bins_num()
132 | if params['GP']:
133 | # Convert the CLI string argument to the format expected by gp_normative_model
134 | gp_length_scale_bounds = get_bounds(params['gp_length_scale_bounds'])
135 |
136 | m.gp_normative_model(length_scale=params['gp_length_scale'],
137 | length_scale_bounds = gp_length_scale_bounds, nu=params['gp_nu'],
138 | method=params['gp_method'],batch_size=params['gp_batch_size'],
139 | n_inducing=params['gp_n_inducing'],num_epochs=params['gp_num_epochs'])
140 | if params['GAMLSS']:
141 | m.gamlss_normative_model(mu=params['gamlss_mu'],sigma=params['gamlss_sigma'],nu=params['gamlss_nu'],
142 | tau=params['gamlss_tau'],family=params['gamlss_family'])
143 |
144 | m.data.to_csv(params['out_p'],index=False)
145 |
146 | if __name__ == "__main__":
147 | raise RuntimeError("`pynm/cli.py` should not be run directly. Please install `pynm`.")
--------------------------------------------------------------------------------
/pynm/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm/models/__init__.py
--------------------------------------------------------------------------------
/pynm/models/approx.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import gpytorch
4 | import numpy as np
5 | import statsmodels.api as sm
6 | from tqdm import tqdm
7 | from torch.utils.data import TensorDataset, DataLoader
8 | from gpytorch.models import ApproximateGP
9 | from gpytorch.variational import CholeskyVariationalDistribution
10 | from gpytorch.variational import VariationalStrategy
11 |
12 | class GPModel(ApproximateGP):
13 | """ Class for GPyTorch model.
14 |
15 | Attributes
16 | ----------
17 | mean_module : gpytorch Mean
18 | Module to calculate mean.
19 | covar_module : gpytorch Kernel
20 | Module to calculate covariance.
21 | """
22 | def __init__(self, inducing_points,nu=2.5,length_scale=1,length_scale_bounds=(1e-5,1e5)):
23 | """ Create a GPModel object.
24 |
25 | Parameters
26 | ----------
27 | inducing_points: array
28 | Array of inducing points.
29 | length_scale: float, default=1
30 | Length scale parameter of Matern kernel.
31 | length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5)
32 | The lower and upper bound on length_scale. If set to 'fixed', ‘length_scale’ cannot be changed during hyperparameter tuning.
33 | nu: float, default=2.5
34 | Nu parameter of Matern kernel.
35 |
36 | Raises
37 | ------
38 | ValueError
39 | Invalid argument for length_scale_bounds
40 | """
41 | variational_distribution = CholeskyVariationalDistribution(inducing_points.size(0))
42 | variational_strategy = VariationalStrategy(self, inducing_points, variational_distribution, learn_inducing_locations=True)
43 | super(GPModel, self).__init__(variational_strategy)
44 |
45 | self.mean_module = gpytorch.means.ConstantMean()
46 |
47 | if length_scale_bounds == 'fixed':
48 | constraint = gpytorch.constraints.Interval(length_scale - 0.001,length_scale + 0.001)  # narrow interval to approximate a fixed length scale
49 | elif isinstance(length_scale_bounds,tuple):
50 | constraint = gpytorch.constraints.Interval(length_scale_bounds[0],length_scale_bounds[1])
51 | else:
52 | raise ValueError('Invalid argument for length_scale_bounds.')
53 | prior = gpytorch.priors.NormalPrior(length_scale,1)
54 | self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=nu,lengthscale_prior=prior,lengthscale_constraint=constraint))
55 |
56 | def forward(self, x):
57 | """ Calculate forward pass of GPModel.
58 |
59 | Parameters
60 | ----------
61 | x: Tensor
62 | Data tensor.
63 |
64 | Returns
65 | -------
66 | MultivariateNormal object
67 | """
68 | mean_x = self.mean_module(x)
69 | covar_x = self.covar_module(x)
70 | return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
71 |
72 |
73 | class SVGP:
74 | """ Class for SVGP model.
75 |
76 | Attributes
77 | ----------
78 | train_loader: pytorch DataLoader
79 | DataLoader for training data.
80 | test_loader: pytorch DataLoader
81 | DataLoader for test data.
82 | inducing_points: array
83 | Subset of training data to use as inducing points.
84 | n_train: int
85 | Number of training points.
86 | n_test: int
87 | Number of test points.
88 | model: GPModel
89 | Instance of GPModel class.
90 | likelihood: gpytorch Likelihood
91 | Gaussian likelihood function.
92 | loss: list
93 | Loss for each epoch of training.
94 | """
95 | def __init__(self,X_train,X_test,y_train,y_test,n_inducing=500,batch_size=256,nu=2.5,length_scale=1,length_scale_bounds=(1e-5,1e5)):
96 | """ Create a SVGP object.
97 |
98 | Parameters
99 | ----------
100 | X_train: array
101 | Training confounds with categorical values dummy encoded.
102 | X_test: array
103 | Test confounds with categorical values dummy encoded.
104 | y_train: array
105 | Training score/response variable.
106 | y_test: array
107 | Test score/response variable.
108 | length_scale: float, default=1
109 | Length scale parameter of Matern kernel.
110 | length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5)
111 | The lower and upper bound on length_scale. If set to 'fixed', ‘length_scale’ cannot be changed during hyperparameter tuning.
112 | nu: float, default=2.5
113 | Nu parameter of Matern kernel.
114 | batch_size: int, default=256
115 | Batch size for SVGP model training and prediction.
116 | n_inducing: int, default=500
117 | Number of inducing points for SVGP model.
118 | """
119 | # Get data in torch format
120 | train_x = torch.from_numpy(X_train).contiguous()
121 | test_x = torch.from_numpy(X_test).double().contiguous()
122 | train_y = torch.from_numpy(y_train).contiguous()
123 | test_y = torch.from_numpy(y_test).double().contiguous()
124 |
125 | # Create datasets
126 | train_dataset = TensorDataset(train_x, train_y)
127 | test_dataset = TensorDataset(test_x, test_y)
128 |
129 | # Create dataloaders
130 | self.train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
131 | self.test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
132 | inducing_idx = np.random.choice(np.array(range(train_x.shape[0])),size=n_inducing)
133 | self.inducing_points = train_x[inducing_idx, :]
134 | self.n_train = train_y.size(0)
135 | self.n_test = test_y.size(0)
136 |
137 | self.model = GPModel(inducing_points=self.inducing_points,nu=nu,length_scale=length_scale,length_scale_bounds=length_scale_bounds).double()
138 | self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
139 | self.likelihood.initialize(noise=torch.std(train_y))  # initialize observation noise from the spread of the training targets
140 |
141 | if torch.cuda.is_available():
142 | self.model = self.model.cuda()
143 | self.likelihood = self.likelihood.cuda()
144 |
145 | self.loss = []
146 |
147 | def train(self,num_epochs=20):
148 | """ Trains the SVGP model.
149 |
150 | Parameters
151 | ----------
152 | num_epochs: int
153 | Number of epochs (full passes through dataset) to train for.
154 | """
155 | self.model.train()
156 | self.likelihood.train()
157 |
158 | optimizer = torch.optim.Adam([{'params': self.model.parameters()},{'params': self.likelihood.parameters()}], lr=0.01)
159 |
160 | # Loss object. We're using the VariationalELBO
161 | mll = gpytorch.mlls.VariationalELBO(self.likelihood, self.model, num_data=self.n_train)
162 |
163 | epochs_iter = tqdm(range(num_epochs), desc="Epoch")
164 | for i in epochs_iter:
165 | # Within each iteration, we will go over each minibatch of data
166 | minibatch_iter = tqdm(self.train_loader, desc="Minibatch", leave=False)
167 | for x_batch, y_batch in minibatch_iter:
168 | optimizer.zero_grad()
169 | output = self.model(x_batch)
170 | loss = -mll(output, y_batch)
171 | minibatch_iter.set_postfix(loss=loss.item())
172 | loss.backward()
173 | optimizer.step()
174 | self.loss.append(loss.item())
175 |
176 | def predict(self):
177 | """ Predict from SVGP model.
178 |
179 | Returns
180 | ----------
181 | array
182 | Model predictions (mean of predictive distribution).
183 | array
184 | Model uncertainty (standard deviation of predictive distribution).
185 | """
186 | self.model.eval()
187 | self.likelihood.eval()
188 |
189 | mean = torch.tensor([0.])
190 | sigma = torch.tensor([0.])
191 | with torch.no_grad():
192 | for x_batch, y_batch in self.test_loader:
193 | preds = self.likelihood(self.model(x_batch)) # get likelihood variance + posterior GP variance
194 | mean = torch.cat([mean, preds.mean.cpu()])
195 | sigma = torch.cat([sigma, torch.sqrt(preds.variance.cpu())])
196 | mean = mean[1:]
197 | sigma = sigma[1:]
198 | return mean, sigma
--------------------------------------------------------------------------------
/pynm/models/centiles.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats.mstats import mquantiles
3 |
4 | def centiles_fit(train_data,bins,bin_width):
5 | """ Fit Centiles model.
6 |
7 | Parameters
8 | ----------
9 | train_data: array
10 | Training data for Centiles model. Column 0 is confound, 1 is score.
11 | bins: array
12 | Bins for Centiles model.
13 | bin_width: float
14 | Width of each bin.
15 |
16 | Returns
17 | -------
18 | array
19 | Centiles for each bin.
20 | """
21 | z = np.zeros([bins.shape[0], 101]) # centiles
22 |
23 | for i, bin_center in enumerate(bins):
24 | mu = np.array(bin_center) # bin_center value (age or conf)
25 | bin_mask = (abs(train_data[:, :1] - mu) <
26 | bin_width) * 1. # one hot mask
27 | idx = [u for (u, v) in np.argwhere(bin_mask)]
28 | scores = train_data[idx, 1]
29 |
30 | # if more than 2 non NaN values do the model
31 | if (~np.isnan(scores)).sum() > 2:
32 | # centiles
33 | z[i, :] = mquantiles(scores, prob=np.linspace(0, 1, 101), alphap=0.4, betap=0.4)
34 | else:
35 | z[i] = np.nan
36 |
37 | return z
38 |
39 | def centiles_predict(test_data,bins,z):
40 | """ Predict from Centiles model.
41 |
42 | Parameters
43 | ----------
44 | test_data: array
45 | Test data for Centiles model. Column 0 is confound, 1 is score.
46 | bins: array
47 | Bins for Centiles model.
48 | z: array
49 | Centiles for each bin.
50 |
51 | Returns
52 | -------
53 | array
54 | Centile within which each subject falls.
55 | array
56 | Centiles for each subject.
57 | """
58 | dists = [np.abs(conf - bins) for conf in test_data[:,0]]
59 | idx = [np.argmin(d) for d in dists]
60 | centiles = np.array([z[i] for i in idx])
61 |
62 | result = np.zeros(centiles.shape[0])
63 | max_mask = test_data[:,1] >= np.max(centiles, axis=1)
64 | min_mask = test_data[:,1] < np.min(centiles, axis=1)
65 | else_mask = ~(max_mask | min_mask)
66 | result[max_mask] = 100
67 | result[min_mask] = 0
68 | result[else_mask] = np.array([np.argmin(test_data[:,1][i] >= centiles[i]) for i in range(test_data.shape[0])])[else_mask]
69 |
70 | return result, centiles
--------------------------------------------------------------------------------
/pynm/models/gamlss.py:
--------------------------------------------------------------------------------
1 | import re
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.packages import importr
4 | from rpy2.robjects import numpy2ri
5 | from rpy2.robjects import pandas2ri
6 | from rpy2.robjects import r
7 | from pynm.util import read_confounds
8 |
9 | class GAMLSS:
10 | """Class for GAMLSS model.
11 |
12 | Attributes
13 | ----------
14 | gamlss_data: R package
15 | Python imported R package.
16 | gamlss_dist: R package
17 | Python imported R package.
18 | gamlss: R package
19 | Python imported R package.
20 | mu_f: R formula
21 | Formula for mu (location) parameter.
22 | sigma_f: R formula
23 | Formula for sigma (scale) parameter.
24 | nu_f: R formula
25 | Formula for nu (skewness) parameter.
26 | tau_f: R formula
27 | Formula for tau (kurtosis) parameter.
28 | rfamily: R object
29 | Family of distributions to use for fitting.
30 | method: str
31 | Method to fit GAMLSS.
32 | model: R object
33 | Fitted GAMLSS model.
34 | """
35 |
36 | def __init__(self,mu=None,sigma=None,nu=None,tau=None,family='SHASHo2',method='RS',score=None,confounds=None):
37 | """Create GAMLSS object. Formulas must be written for R, using functions available in the GAMLSS package.
38 |
39 | Parameters
40 | ----------
41 | mu: str, default=None
42 | Formula for mu (location) parameter of GAMLSS. If None, formula for score is sum of confounds
43 | with non-categorical columns as smooth functions, e.g. "score ~ ps(age) + sex".
44 | sigma: str, default=None
45 | Formula for sigma (scale) parameter of GAMLSS. If None, formula is '~ 1'.
46 | nu: str, default=None
47 | Formula for nu (skewness) parameter of GAMLSS. If None, formula is '~ 1'.
48 | tau: str, default=None
49 | Formula for tau (kurtosis) parameter of GAMLSS. If None, formula is '~ 1'.
50 | family: str,default='SHASHo2'
51 | Family of distributions to use for fitting, default is 'SHASHo2'. See R documentation for GAMLSS package for other available families of distributions.
52 | method: str, default = 'RS'
53 | Method for fitting GAMLSS. Can be 'RS' (Rigby and Stasinopoulos algorithm), 'CG' (Cole and Green algorithm) or 'mixed(n,m)' where n & m are integers.
54 | Specifying 'mixed(n,m)' will use the RS algorithm for n iterations and the CG algorithm for up to m additional iterations.
55 | score: str, default=None
56 | Label of score in DataFrame.
57 | confounds: list, default=None
58 | List of labels of confounds in DataFrame.
59 |
60 | Notes
61 | -----
62 | If using 'random()' to model a random effect in any of the formulas, it must be passed a column of the dataframe with categorical values
63 | as a factor: e.g. 'random(as.factor(COL))'.
64 | """
65 | numpy2ri.activate()
66 | pandas2ri.activate()
67 |
68 | self.gamlss_data = importr('gamlss.data')
69 | self.gamlss_dist = importr('gamlss.dist')
70 | self.gamlss = importr('gamlss')
71 | self.base = importr('base')
72 |
73 | self.score = score
74 | self.confounds = confounds
75 | self.mu_f,self.sigma_f,self.nu_f,self.tau_f = self._get_r_formulas(mu,sigma,nu,tau)
76 | self.family = family
77 | self.method = self._get_method(method)
78 | try:
79 | self.rfamily = r[family]
80 | except Exception:
81 | raise ValueError("Provided family not valid, choose 'SHASHo2', 'NO' or see R documentation for GAMLSS package for other available families of distributions.")
82 |
83 | def _get_r_formulas(self,mu,sigma,nu,tau):
84 | """Convert from string input to R formula.
85 |
86 | Parameters
87 | ----------
88 | mu: str or None
89 | Formula for mu (location) parameter of GAMLSS. If None, formula for score is sum of confounds
90 | with non-categorical columns as smooth functions, e.g. "score ~ ps(age) + sex".
91 | sigma: str or None
92 | Formula for sigma (scale) parameter of GAMLSS. If None, formula is '~ 1'.
93 | nu: str or None
94 | Formula for nu (skewness) parameter of GAMLSS. If None, formula is '~ 1'.
95 | tau: str or None
96 | Formula for tau (kurtosis) parameter of GAMLSS. If None, formula is '~ 1'.
97 |
98 | Raises
99 | ------
100 | ValueError
101 | If any of the input strings contains a function call not recognised by the R GAMLSS package.
102 | ValueError
103 | If mu is None and either score or confounds is None.
104 |
105 | Returns
106 | -------
107 | R formula, R formula, R formula, R formula
108 | R formula equivalent for each input string.
109 | """
110 | if mu is None:
111 | if (self.score is None) or (self.confounds is None):
112 | raise ValueError('If mu is None, both score and confounds must be provided i.e. not None.')
113 | _,cat = read_confounds(self.confounds)
114 | formula_conf = ['ps({})'.format(conf) for conf in self.confounds if not conf[2:-1] in cat] + cat
115 | mu = '{} ~ {}'.format(self.score,' + '.join(formula_conf))
116 | if sigma is None:
117 | sigma = '~ 1'
118 | if nu is None:
119 | nu = '~ 1'
120 | if tau is None:
121 | tau = '~ 1'
122 |
123 | # get r functions from formulas
124 | p = re.compile(r"\w*\(")
125 | funcs = []
126 | for s in [mu,sigma,nu,tau]:
127 | for f in p.findall(s):
128 | funcs.append(f[:-1])
129 |
130 | for func in funcs:
131 | try:
132 | r[func]  # look up the function in the R environment to verify it exists
133 | except Exception:
134 | raise ValueError("'{}' function not found in R GAMLSS package. See GAMLSS documentation for available functions.".format(func))
135 |
136 | return mu,sigma,nu,tau
137 |
138 | def _get_method(self,method):
139 | """ Get method parameter in appropriate format for R.
140 |
141 | Raises
142 | ------
143 | TypeError
144 | "Argument 'method' must be of type str."
145 | ValueError
146 | "Unrecognized argument for 'method'."
147 | """
148 | if not isinstance(method,str):
149 | raise TypeError("Argument 'method' must be of type str.")
150 |
151 | pattern = re.compile(r"mixed\([0-9]*,[0-9]*\)")
152 |
153 | if method == 'RS':
154 | return 'RS()'
155 | elif method == 'CG':
156 | return 'CG()'
157 | elif pattern.match(method) is not None:
158 | return method
159 | else:
160 | raise ValueError("Unrecognized argument for 'method'.")
161 |
162 | def fit(self,train_data):
163 | """Create and fit gamlss model.
164 |
165 | Parameters
166 | ----------
167 | train_data: DataFrame
168 | DataFrame with training data.
169 | """
170 | ro.globalenv['train_data'] = train_data
171 |
172 | self.model = r(f'''gamlss({self.mu_f},
173 | sigma.formula={self.sigma_f},
174 | nu.formula={self.nu_f},
175 | tau.formula={self.tau_f},
176 | family={self.family},
177 | data=train_data,
178 | method={self.method})''')
179 |
180 | def predict(self,test_data,what='mu'):
181 | """Predict from fitted gamlss model.
182 |
183 | Parameters
184 | ----------
185 | test_data: DataFrame
186 | DataFrame with test data.
187 | what: str
188 | Which parameter to predict, can be 'mu','sigma', 'nu', or 'tau'.
189 | """
190 | ro.globalenv['model'] = self.model
191 | ro.globalenv['test_data'] = test_data
192 |
193 | res = r(f'''predict(model,newdata=test_data,parameter="{what}")''')
194 | return res
195 |
--------------------------------------------------------------------------------
/pynm/models/loess.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import statsmodels.api as sm
3 | from statsmodels.sandbox.regression.predstd import wls_prediction_std
4 |
5 | def loess_fit(train_data,bins,bin_width):
6 | """ Fit LOESS model.
7 |
8 | Parameters
9 | ----------
10 | train_data: array
11 | Training data for LOESS model. Column 0 is confound, 1 is score.
12 | bins: array
13 | Bins for LOESS model.
14 | bin_width: float
15 | Width of each bin.
16 |
17 | Returns
18 | -------
19 | array
20 | Mean of each bin.
21 | array
22 | Standard deviation of each bin.
23 | array
24 | Confidence interval of each bin.
25 | """
26 | zm = np.zeros(bins.shape[0]) # mean
27 | zstd = np.zeros(bins.shape[0]) # standard deviation
28 | zci = np.zeros([bins.shape[0], 2]) # confidence interval
29 |
30 | for i, bin_center in enumerate(bins):
31 | mu = np.array(bin_center) # bin_center value (age or conf)
32 | bin_mask = (abs(train_data[:, :1] - mu) < bin_width) * 1.
33 | idx = [u for (u, v) in np.argwhere(bin_mask)]
34 |
35 | scores = train_data[idx, 1]
36 | adj_conf = train_data[idx, 0] - mu # confound relative to bin center
37 |
38 | # if more than 2 non NaN values do the model
39 | if (~np.isnan(scores)).sum() > 2:
40 | mod = sm.WLS(scores, sm.tools.add_constant(adj_conf,
41 | has_constant='add'),
42 | missing='drop', weights=bin_mask.flatten()[idx],
43 | hasconst=True).fit()
44 | zm[i] = mod.params[0] # mean
45 |
46 | # std and confidence intervals
47 | prstd,_,_ = wls_prediction_std(mod, [0, 0])
48 | zstd[i] = prstd
49 | zci[i, :] = mod.conf_int()[0, :] # [iv_l, iv_u]
50 |
51 | else:
52 | zm[i] = np.nan
53 | zci[i] = np.nan
54 | zstd[i] = np.nan
55 |
56 | return zm, zstd, zci
57 |
58 | def loess_predict(test_data,bins,zm,zstd):
59 | """ Predict from LOESS model.
60 |
61 | Parameters
62 | ----------
63 | test_data: array
64 | Test data for LOESS model. Column 0 is confound, 1 is score.
65 | bins: array
66 | Bins for LOESS model.
67 | zm: array
68 | Mean of each bin.
69 | zstd: array
70 | Standard deviation of each bin.
71 |
72 | Returns
73 | -------
74 | array
75 | Mean for each subject.
76 | array
77 | Standard deviation for each subject.
78 | """
79 | dists = [np.abs(conf - bins) for conf in test_data[:,0]]
80 | idx = [np.argmin(d) for d in dists]
81 | m = np.array([zm[i] for i in idx])
82 | std = np.array([zstd[i] for i in idx])
83 |
84 | return m, std
--------------------------------------------------------------------------------
/pynm/pynm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # ==============================================================================
4 | # title : PyNM.py
5 | # description : Gaussian Processes, Centiles & LOESS-based normative models
6 | # author : Guillaume Dumas (Institut Pasteur/Université de Montréal)
7 | # Annabelle Harvey (Université de Montréal)
8 | # date : 2021-04-15
9 | # notes : The input dataframe column passed to --group must either
10 | # have controls marked as "CTR" and probands as "PROB", or
11 | # controls marked as 0 and probands as 1.
12 | # The --pheno_p is for the path to the input dataframe.
13 | # The --out_p flag is for the path to save the output
14 | # dataframe, including filename formatted as 'filename.csv'.
15 | # The confound columns must be specified using the
16 | # --confounds flag. The LOESS and centiles models use only
17 | # the first confound in the list; the GP and GAMLSS models
18 | # use all of them (unless GAMLSS formulas are given).
19 | # licence : BSD 3-Clause License
20 | # python_version : 3.7
21 | # ==============================================================================
22 |
23 | import pandas as pd
24 | import numpy as np
25 | import matplotlib.pyplot as plt
26 | import seaborn as sns
27 | import warnings
28 |
29 | from sklearn import gaussian_process
30 | from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
31 | from sklearn.model_selection import KFold
32 | from statsmodels.stats.diagnostic import het_white
33 | from statsmodels.tools.tools import add_constant
34 | from scipy import stats
35 |
36 | from pynm.util import *
37 | from pynm.models.loess import *
38 | from pynm.models.centiles import *
39 |
40 | class PyNM:
41 | """ Class to run normative modeling using LOESS, centiles, or GP model.
42 |
43 | Attributes
44 | ----------
45 | data : dataframe
46 | Dataset to fit model, must at least contain columns corresponding to 'group',
47 | 'score', and 'conf'.
48 | score : str
49 | Label of column from data with score (response variable).
50 | group : str
51 | Label of column from data that encodes whether subjects are probands or controls.
52 | CTR : str or int
53 | Label of controls in 'group' column can be 'CTR' or 0.
54 | PROB: str or int
55 | Label of probands in 'group' column can be 'PROB' or 1.
56 | conf: str
57 | Label of column from data with confound to use for LOESS and centiles models.
58 | confounds: list of str
59 | List of labels of columns from data with confounds. For GP model all confounds will be used,
60 | for LOESS and Centiles models only the first is used. For GAMLSS all confounds are used
61 | unless formulas are specified. Categorical values must be denoted by c(var) ('c' must be lower case).
62 | train_sample: str or float
63 | Which method to use for a training sample, can be a float in (0,1] for a percentage of controls
64 | (1 uses all of them) or 'manual' to be manually set using a column of the DataFrame labelled 'train_sample'.
65 | bin_spacing: int
66 | Distance between bins for LOESS & centiles models.
67 | bin_width: float
68 | Width of bins for LOESS & centiles models.
69 | bins: array
70 | Bins for the centiles and LOESS models.
71 | bin_count: array
72 | Number of controls in each bin.
73 | zm: array
74 | Mean of each bin (LOESS).
75 | zstd: array
76 | Standard deviation of each bin (LOESS).
77 | zci: array
78 | Confidence interval of each bin (LOESS).
79 | z: array
80 | Centiles for each bin.
81 | RMSE_LOESS: float
82 | RMSE of LOESS normative model
83 | SMSE_LOESS: float
84 | SMSE of LOESS normative model
85 | RMSE_Centiles: float
86 | RMSE of Centiles normative model
87 | SMSE_Centiles: float
88 | SMSE of Centiles normative model
89 | RMSE_GP: float
90 | RMSE of Gaussian Process normative model
91 | SMSE_GP: float
92 | SMSE of Gaussian Process normative model
93 | MSLL_GP: float
94 | MSLL of Gaussian Process normative model
95 | RMSE_GAMLSS: float
96 | RMSE of GAMLSS
97 | SMSE_GAMLSS: float
98 | SMSE of GAMLSS
99 | MSLL_GAMLSS: float
100 | MSLL of GAMLSS
101 | """
102 |
103 | def __init__(self, data, score, group, confounds,
104 | train_sample=1, bin_spacing=-1, bin_width=-1, seed=None):
105 | """ Create a PyNM object.
106 |
107 | Parameters
108 | ----------
109 | data : dataframe
110 | Dataset to fit model, must at least contain columns corresponding to 'group',
111 | 'score', and 'conf'.
112 | score : str
113 | Label of column from data with score (response variable).
114 | group : str
115 | Label of column from data that encodes whether subjects are probands or controls.
116 | confounds: list of str
117 | List of labels of columns from data with confounds. For GP model all confounds will be used,
118 | for LOESS and Centiles models only the first is used. For GAMLSS all confounds are used
119 | unless formulas are specified. Categorical values must be denoted by c(var) ('c' must be lower case).
120 | train_sample: str or float, default=1
121 | Which method to use for a training sample, can be a float in (0,1] for a percentage of controls
122 | or 'manual' to be manually set using a column of the DataFrame labelled 'train_sample'.
123 | bin_spacing: int, default=-1
124 | Distance between bins for LOESS & centiles models.
125 | bin_width: float, default=-1
126 | Width of bins for LOESS & centiles models.
127 | seed: int, default=None
128 | Seed for random state generator, if None no seed is set.
129 |
130 | Raises
131 | ------
132 | ValueError
133 | Each row of DataFrame must have a unique index.
134 | """
135 | if data.index.nunique() != data.shape[0]:
136 | raise ValueError('Each row of DataFrame must have a unique index.')
137 | self.data = data.copy()
138 | self.score = score
139 | self.group = group
140 | self.confounds = confounds
141 | self.conf = self.confounds[0]
142 | self.train_sample = train_sample
143 | self.CTR = None
144 | self.PROB = None
145 | self.bin_spacing = bin_spacing
146 | self.bin_width = bin_width
147 | self.bins = None
148 | self.bin_count = None
149 | self.zm = None
150 | self.zstd = None
151 | self.zci = None
152 | self.z = None
153 | self.RMSE_LOESS = None
154 | self.SMSE_LOESS = None
155 | self.RMSE_Centiles = None
156 | self.SMSE_Centiles = None
157 | self.RMSE_GP = None
158 | self.SMSE_GP = None
159 | self.MSLL_GP = None
160 | self.RMSE_GAMLSS = None
161 | self.SMSE_GAMLSS = None
162 | self.MSLL_GAMLSS = None
163 |
164 | if seed is not None:
165 | np.random.seed(seed)
166 |
167 | self._set_group_names()
168 | self._set_group()
169 |
170 | def _make_train_sample(self, train_size):
171 | """ Select a subsample of controls to be used as a training sample for the normative model.
172 |
173 | Parameters
174 | ----------
175 | train_size: float
176 | Percentage of controls to use for training. Must be in (0,1].
177 | """
178 | ctr_idx = self.data[self.data[self.group] == self.CTR].index.tolist()
179 | n_ctr = len(ctr_idx)
180 | n_ctr_train = max(int(train_size*n_ctr), 1)
181 |
182 | # the random state is governed by the seed passed to __init__, if any
183 | ctr_idx_train = np.array(np.random.choice(ctr_idx, size=n_ctr_train, replace=False))
184 |
185 | train_sample = np.zeros(self.data.shape[0])
186 | train_sample[ctr_idx_train] = 1
187 | self.data['train_sample'] = train_sample
188 |
189 | print('Models will be fit with train sample size = {}: using {}/{} of controls.'.format(train_size, n_ctr_train, n_ctr))
190 |
191 | def _set_group(self):
192 | """ Read the specified training sample and set the group attribute to refer to the appropriate column of data.
193 |
194 | Raises
195 | ------
196 | ValueError
197 | With train_sample=1: Dataset has no controls for training sample.
198 | ValueError
199 | With train_sample='manual': Data has no column "train_sample". To manually specify a training sample,
200 | data .csv must contain a column "train_sample" with included subjects marked with 1 and rest as 0.
201 | ValueError
202 | With train_sample='manual': Dataset has no subjects in specified training sample.
203 | ValueError
204 | Value for train_sample not recognized. Must be either a value in (0,1] or 'manual'.
205 | ValueError
206 | With train_sample float: Numerical value for train_sample must be in the range (0,1].
207 | """
208 | if self.train_sample == 1:
209 | print('Models will be fit on full set of controls.')
210 | if self.data[self.data[self.group] == self.CTR].shape[0] == 0:
211 | raise ValueError('Dataset has no controls for training sample.')
212 | elif self.train_sample == 'manual':
213 | print('Models will be fit using specified training sample.')
214 | if 'train_sample' not in self.data.columns:
215 | raise ValueError('Data has no column "train_sample". To manually specify a training sample, data .csv '
216 | 'must contain a column "train_sample" with included subjects marked with 1 and rest as 0.')
217 | self.group = 'train_sample'
218 | self._set_group_names()
219 |
220 | if self.data[self.data[self.group] == self.CTR].shape[0] == 0:
221 | raise ValueError('Dataset has no subjects in specified training sample.')
222 | else:
223 | try:
224 | train_size = float(self.train_sample)
225 | except (TypeError, ValueError):
226 | raise ValueError("Value for train_sample not recognized. Must be either 'manual' or a "
227 | "value in (0,1].")
228 | else:
229 | if (train_size > 1) or (train_size <= 0):
230 | raise ValueError("Numerical value for train_sample must be in the range (0,1].")
231 | else:
232 | self._make_train_sample(train_size)
233 | self.group = 'train_sample'
234 | self._set_group_names()
235 |
236 | def _set_group_names(self):
237 | """ Read whether subjects in data are labeled CTR/PROB or 0/1 and set labels accordingly."""
238 | if self.group == 'train_sample':
239 | self.CTR = 1
240 | self.PROB = 0
241 | else:
242 | labels = list(self.data[self.group].unique())
243 | if ('CTR' in labels) or ('PROB' in labels):
244 | self.CTR = 'CTR'
245 | self.PROB = 'PROB'
246 | else:
247 | self.CTR = 0
248 | self.PROB = 1
249 |
250 | def _get_masks(self):
251 | """ Get masks from data corresponding to controls and probands.
252 |
253 | Returns
254 | -------
255 | array
256 | Control mask: controls marked as True.
257 | array
258 | Proband mask: probands marked as True.
259 | """
260 | ctr = self.data.loc[(self.data[self.group] == self.CTR)]
261 | ctr_mask = self.data.index.isin(ctr.index)
262 | probands = self.data.loc[(self.data[self.group] == self.PROB)]
263 | prob_mask = self.data.index.isin(probands.index)
264 | return ctr_mask, prob_mask
265 |
266 | # Default values for age in days
267 | def _create_bins(self):
268 | """ Create bins for the centiles and LOESS models.
269 | Returns
270 | -------
271 | array
272 | Bins for the centiles and LOESS models.
273 | """
274 | min_conf = self.data[self.conf].min()
275 | max_conf = self.data[self.conf].max()
276 |
277 | if self.bin_width == -1:
278 | self.bin_width = (max_conf - min_conf)/100
279 | if self.bin_spacing == -1:
280 | self.bin_spacing = (max_conf - min_conf)/10
281 |
282 | # define the bins (according to width)
283 | self.bins = np.arange(min_conf, max_conf + self.bin_width, self.bin_spacing)
284 | return self.bins
285 |
286 | def bins_num(self):
287 | """ Give the number of ctr used for the age bin each participant is in.
288 |
289 | Returns
290 | -------
291 | array
292 | Number of controls in each bin.
293 | """
294 | if self.bins is None:
295 | self._create_bins()
296 |
297 | dists = [np.abs(conf - self.bins) for conf in self.data[self.conf]]
298 | idx = [np.argmin(d) for d in dists]
299 | n_ctr = [self.bin_count[i] for i in idx]
300 | self.data['participants'] = n_ctr
301 | return n_ctr
302 |
303 | def _loess_rank(self):
304 | """ Associate ranks to LOESS normative scores."""
305 | self.data.loc[(self.data.LOESS_z <= -2), 'LOESS_rank'] = -2
306 | self.data.loc[(self.data.LOESS_z > -2) &
307 | (self.data.LOESS_z <= -1), 'LOESS_rank'] = -1
308 | self.data.loc[(self.data.LOESS_z > -1) &
309 | (self.data.LOESS_z <= +1), 'LOESS_rank'] = 0
310 | self.data.loc[(self.data.LOESS_z > +1) &
311 | (self.data.LOESS_z <= +2), 'LOESS_rank'] = 1
312 | self.data.loc[(self.data.LOESS_z > +2), 'LOESS_rank'] = 2
313 |
314 | def loess_normative_model(self,cv_folds=1):
315 | """ Compute LOESS normative model.
316 |
317 | Parameters
318 | ----------
319 | cv_folds: int, default=1
320 | How many folds of cross-validation to perform. If 1, there is no cross-validation.
321 | """
322 | if self.bins is None:
323 | self._create_bins()
324 |
325 | # Format data
326 | data = self.data[[self.conf, self.score]].to_numpy(dtype=np.float64)
327 |
328 | # Take the controls
329 | ctr_mask, _ = self._get_masks()
330 | ctr = data[ctr_mask]
331 |
332 | # Cross-validation
333 | if cv_folds == 1:
334 | self.zm,self.zstd,self.zci = loess_fit(ctr,self.bins,self.bin_width)
335 | m, std = loess_predict(data,self.bins,self.zm,self.zstd)
336 |
337 | rmse = RMSE(self.data[self.score].values[ctr_mask],m[ctr_mask])
338 | smse = SMSE(self.data[self.score].values[ctr_mask],m[ctr_mask])
339 |
340 | else:
341 | kf = KFold(n_splits=cv_folds, shuffle=True)
342 | rmse = []
343 | smse = []
344 | print(f'Starting {cv_folds} folds of CV...')
345 | for i, (train_index, test_index) in enumerate(kf.split(ctr)):
346 | ctr_train, ctr_test = ctr[train_index], ctr[test_index]
347 | cv_zm,cv_zstd,_ = loess_fit(ctr_train,self.bins,self.bin_width)
348 | cv_m, _ = loess_predict(ctr_test,self.bins,cv_zm,cv_zstd)
349 | r = RMSE(ctr_test[:,1],cv_m)
350 | s = SMSE(ctr_test[:,1],cv_m)
351 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f}')
352 | rmse.append(r)
353 | smse.append(s)
354 | print('Done!')
355 |
356 | rmse = np.mean(rmse)
357 | smse = np.mean(smse)
358 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f}')
359 |
360 | self.zm,self.zstd,self.zci = loess_fit(ctr,self.bins,self.bin_width)
361 | m, std = loess_predict(data,self.bins,self.zm,self.zstd)
362 |
363 | self.data['LOESS_pred'] = m
364 | self.data['LOESS_sigma'] = std
365 | self.data['LOESS_residuals'] = self.data[self.score] - self.data['LOESS_pred']
366 | self.data['LOESS_z'] = self.data['LOESS_residuals']/self.data['LOESS_sigma']
367 |
368 | self.RMSE_LOESS = rmse
369 | self.SMSE_LOESS = smse
370 |
371 | self._loess_rank()
372 |
373 | def _centiles_rank(self):
374 | """ Associate ranks to centiles associated with normative modeling."""
375 | self.data.loc[(self.data.Centiles_pred <= 5), 'Centiles_rank'] = -2
376 | self.data.loc[(self.data.Centiles_pred > 5) &
377 | (self.data.Centiles_pred <= 25), 'Centiles_rank'] = -1
378 | self.data.loc[(self.data.Centiles_pred > 25) &
379 | (self.data.Centiles_pred <= 75), 'Centiles_rank'] = 0
380 | self.data.loc[(self.data.Centiles_pred > 75) &
381 | (self.data.Centiles_pred <= 95), 'Centiles_rank'] = 1
382 | self.data.loc[(self.data.Centiles_pred > 95), 'Centiles_rank'] = 2
383 |
384 | def centiles_normative_model(self, cv_folds=1):
385 | """ Compute centiles normative model.
386 |
387 | Parameters
388 | ----------
389 | cv_folds: int, default=1
390 | How many folds of cross-validation to perform. If 1, there is no cross-validation.
391 | """
392 | if self.bins is None:
393 | self._create_bins()
394 |
395 | # Format data
396 | data = self.data[[self.conf, self.score]].to_numpy(dtype=np.float64)
397 |
398 | # Take the controls
399 | ctr_mask, _ = self._get_masks()
400 | ctr = data[ctr_mask]
401 |
402 | # Cross-validation
403 | if cv_folds == 1:
404 | self.z = centiles_fit(ctr,self.bins,self.bin_width)
405 | result, centiles = centiles_predict(data,self.bins,self.z)
406 | centiles_50 = np.array([centiles[i, 50] for i in range(self.data.shape[0])])
407 |
408 | rmse = RMSE(self.data[self.score].values[ctr_mask],centiles_50[ctr_mask])
409 | smse = SMSE(self.data[self.score].values[ctr_mask],centiles_50[ctr_mask])
410 |
411 | else:
412 | kf = KFold(n_splits=cv_folds, shuffle=True)
413 | rmse = []
414 | smse = []
415 | print(f'Starting {cv_folds} folds of CV...')
416 | for i, (train_index, test_index) in enumerate(kf.split(ctr)):
417 | ctr_train, ctr_test = ctr[train_index], ctr[test_index]
418 | cv_z = centiles_fit(ctr_train,self.bins,self.bin_width)
419 | _, cv_centiles = centiles_predict(ctr_test, self.bins,cv_z)
420 | cv_50 = np.array([cv_centiles[i, 50] for i in range(ctr_test.shape[0])])
421 | r = RMSE(ctr_test[:,1],cv_50)
422 | s = SMSE(ctr_test[:,1],cv_50)
423 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f}')
424 | rmse.append(r)
425 | smse.append(s)
426 | print('Done!')
427 |
428 | rmse = np.mean(rmse)
429 | smse = np.mean(smse)
430 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f}')
431 |
432 | self.z = centiles_fit(ctr,self.bins,self.bin_width)
433 | result, centiles = centiles_predict(data,self.bins,self.z)
434 |
435 | self.data['Centiles'] = result
436 | self.data['Centiles_5'] = np.array([centiles[i, 5] for i in range(self.data.shape[0])])
437 | self.data['Centiles_32'] = np.array([centiles[i, 32] for i in range(self.data.shape[0])])
438 | self.data['Centiles_pred'] = np.array([centiles[i, 50] for i in range(self.data.shape[0])])
439 | self.data['Centiles_68'] = np.array([centiles[i, 68] for i in range(self.data.shape[0])])
440 | self.data['Centiles_95'] = np.array([centiles[i, 95] for i in range(self.data.shape[0])])
441 | self.data['Centiles_sigma'] = (self.data['Centiles_68'] - self.data['Centiles_32'])/2
442 | self.data['Centiles_residuals'] = self.data[self.score] - self.data['Centiles_pred']
443 | self.data['Centiles_z'] = self.data['Centiles_residuals']/self.data['Centiles_sigma']
444 |
445 | self.RMSE_Centiles = rmse
446 | self.SMSE_Centiles = smse
447 |
448 | self._centiles_rank()
449 |
450 | def _get_conf_mat(self):
451 | """ Get confounds properly formatted from dataframe and input list.
452 |
453 | Returns
454 | -------
455 | array
456 | Confounds with categorical values dummy encoded. Dummy encoding keeps k-1
457 | dummies out of k categorical levels.
458 | """
459 | conf_clean, conf_cat = read_confounds(self.confounds)
460 | conf_mat = pd.get_dummies(self.data[conf_clean], columns=conf_cat,
461 | drop_first=True)
462 | return conf_mat.to_numpy()
463 |
464 | def _get_score(self):
465 | """ Get the score from the PyNM object as an array.
466 |
467 | Raises
468 | ------
469 | ValueError
470 | Method must be one of "auto","approx", or "exact".
471 |
472 | Returns
473 | -------
474 | array
475 | The column of data marked by the user as 'score'.
476 | """
477 | return self.data[self.score].to_numpy()
478 |
479 | def _use_approx(self, method='auto'):
480 | """ Choose wether or not to use SVGP model. If method is set to 'auto' SVGP is chosen
481 | for datasets with more than 2000 points.
482 |
483 | Parameters
484 | ----------
485 | method: str, default='auto'
486 | Which method to use, can be 'exact' for exact GP regression, 'approx' for SVGP,
487 | or 'auto' which will set the method according to the size of the data.
488 |
489 | Raises
490 | ------
491 | ValueError
492 | Method must be one of "auto","approx", or "exact".
493 | """
494 | if method == 'auto':
495 | if self.data.shape[0] > 2000:
496 | return True
497 | else:
498 | return False
499 | elif method == 'approx':
500 | return True
501 | elif method == 'exact':
502 | if self.data.shape[0] > 2000:
503 | warnings.warn("Exact GP model with over 2000 data points requires "
504 | "large amounts of time and memory, continuing with exact model.",Warning)
505 | return False
506 | else:
507 | raise ValueError('Method must be one of "auto","approx", or "exact".')
508 |
509 | def _test_gp_residuals(self,conf_mat):
510 | #Test normal
511 | k2, p_norm = stats.normaltest(self.data['GP_residuals'])
512 | if p_norm < 0.05:
513 | warnings.warn("The residuals are not Gaussian!")
514 |
515 | # Test heteroskedasticity
516 | exog = add_constant(conf_mat)
517 | _,p_het,_,_ = het_white(self.data['GP_residuals'],exog)  # het_white squares the residuals internally
518 | if p_het < 0.05:
519 | warnings.warn("The residuals are heteroskedastic!")
520 |
521 | def gp_normative_model(self, length_scale=1, nu=2.5, length_scale_bounds=(1e-5,1e5),method='auto', batch_size=256, n_inducing=500, num_epochs=20, cv_folds=1):
522 | """ Compute gaussian process normative model. Gaussian process regression is computed using
523 | the Matern Kernel with an added constant and white noise. For Matern kernel see scikit-learn documentation:
524 | https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html.
525 |
526 | Parameters
527 | -------
528 | length_scale: float, default=1
529 | Length scale parameter of Matern kernel.
530 | nu: float, default=2.5
531 | Nu parameter of Matern kernel.
532 | length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5)
533 | The lower and upper bound on length_scale. If set to 'fixed', ‘length_scale’ cannot be changed during hyperparameter tuning.
534 | method: str, default='auto'
535 | Which method to use, can be 'exact' for exact GP regression, 'approx' for SVGP,
536 | or 'auto' which will set the method according to the size of the data.
537 | batch_size: int, default=256
538 | Batch size for SVGP model training and prediction.
539 | n_inducing: int, default=500
540 | Number of inducing points for SVGP model.
541 | num_epochs: int, default=20
542 | Number of epochs (passes through entire dataset) to train SVGP for.
543 | cv_folds: int, default=1
544 | How many folds of cross-validation to perform. If 1, there is no cross-validation.
545 | """
546 | # get proband and control masks
547 | ctr_mask, _ = self._get_masks()
548 |
549 | # get matrix of confounds
550 | conf_mat = self._get_conf_mat()
551 |
552 | # get score
553 | score = self._get_score()
554 |
555 | if self._use_approx(method=method):
556 | self.loss = self._svgp_normative_model(conf_mat,score,ctr_mask,nu=nu,length_scale=length_scale, length_scale_bounds=length_scale_bounds,
557 | batch_size=batch_size,n_inducing=n_inducing,num_epochs=num_epochs,cv_folds=cv_folds)
558 |
559 | else:
560 | kernel = ConstantKernel() + WhiteKernel(noise_level=1) + Matern(length_scale=length_scale, nu=nu,length_scale_bounds=length_scale_bounds)
561 | gp = gaussian_process.GaussianProcessRegressor(kernel=kernel)
562 |
563 | # Define independent and response variables
564 | y = score[ctr_mask].reshape(-1,1)
565 | X = conf_mat[ctr_mask]
566 |
567 | if cv_folds == 1:
568 | gp.fit(X, y)
569 | y_pred, sigma = gp.predict(conf_mat, return_std=True)
570 | y_true = self.data[self.score].to_numpy()
571 |
572 | # For MSLL
573 | y_train_mean = np.mean(y_true[ctr_mask])
574 | y_train_sigma = np.std(y_true[ctr_mask])
575 |
576 | rmse = RMSE(y_true[ctr_mask],y_pred[ctr_mask])
577 | smse = SMSE(y_true[ctr_mask],y_pred[ctr_mask])
578 | msll = MSLL(y_true[ctr_mask],y_pred[ctr_mask],sigma[ctr_mask],y_train_mean,y_train_sigma)
579 | else:
580 | kf = KFold(n_splits=cv_folds, shuffle=True)
581 | rmse = []
582 | smse = []
583 | msll = []
584 | print(f'Starting {cv_folds} folds of CV...')
585 | for i, (train_index, test_index) in enumerate(kf.split(X)):
586 | X_train, X_test = X[train_index], X[test_index]
587 | y_train, y_test = y[train_index], y[test_index]
588 | gp.fit(X_train, y_train)
589 | y_pred, sigma = gp.predict(X_test, return_std=True)
590 |
591 | # For MSLL
592 | y_train_mean = np.mean(y_train)
593 | y_train_sigma = np.std(y_train)
594 |
595 | r = RMSE(y_test,y_pred)
596 | s = SMSE(y_test,y_pred)
597 | m = MSLL(y_test.squeeze(),y_pred.squeeze(),sigma.squeeze(),y_train_mean,y_train_sigma)
598 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f} - MSLL={m:.3f}')
599 | rmse.append(r)
600 | smse.append(s)
601 | msll.append(m)
602 | print('Done!')
603 |
604 | rmse = np.mean(rmse)
605 | smse = np.mean(smse)
606 | msll = np.mean(msll)
607 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f} - MSLL={msll:.3f}')
608 |
609 | gp.fit(X, y)
610 | y_pred, sigma = gp.predict(conf_mat, return_std=True)
611 | y_true = self.data[self.score].to_numpy().reshape(-1,1)
612 |
613 | self.data['GP_pred'] = y_pred
614 | self.data['GP_sigma'] = sigma
615 | self.data['GP_residuals'] = np.squeeze(y_true) - y_pred
616 | self.data['GP_z'] = self.data['GP_residuals'] / self.data['GP_sigma']
617 |
618 | self.RMSE_GP = rmse
619 | self.SMSE_GP = smse
620 | self.MSLL_GP = msll
621 |
622 | self._test_gp_residuals(conf_mat)
623 |
624 | def _svgp_normative_model(self,conf_mat,score,ctr_mask,nu=2.5,length_scale=1,length_scale_bounds=(1e-5,1e5),
625 | batch_size=256,n_inducing=500,num_epochs=20,cv_folds=1):
626 | """ Compute SVGP model. See GPyTorch documentation for further details:
627 | https://docs.gpytorch.ai/en/v1.1.1/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html#Creating-a-SVGP-Model.
628 |
629 | Parameters
630 | ----------
631 | conf_mat: array
632 | Confounds with categorical values dummy encoded.
633 | score: array
634 | Score/response variable.
635 | ctr_mask: array
636 | Mask (boolean array) with controls marked True.
637 | length_scale: float, default=1
638 | Length scale parameter of Matern kernel.
639 | length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5)
640 | The lower and upper bound on length_scale. If set to 'fixed', 'length_scale' cannot be changed during hyperparameter tuning.
641 | nu: float, default=2.5
642 | Nu parameter of Matern kernel.
643 | batch_size: int, default=256
644 | Batch size for SVGP model training and prediction.
645 | n_inducing: int, default=500
646 | Number of inducing points for SVGP model.
647 | num_epochs: int, default=20
648 | Number of epochs (passes through entire dataset) to train SVGP for.
649 | cv_folds: int, default=1
650 | How many folds of cross-validation to perform. If 1, there is no cross-validation.
651 |
652 | Raises
653 | ------
654 | ImportError
655 | GPyTorch or its dependencies aren't installed.
656 |
657 | Returns
658 | -------
659 | array
660 | Loss per epoch of training (temp for debugging).
661 | """
662 | try:
663 | from pynm.models.approx import SVGP
664 | except Exception:
665 | raise ImportError("GPyTorch and its dependencies must be installed to use the SVGP model.")
666 | else:
667 | if cv_folds == 1:
668 | svgp = SVGP(conf_mat[ctr_mask],conf_mat,score[ctr_mask],score,n_inducing=n_inducing,batch_size=batch_size,nu=nu,
669 | length_scale=length_scale,length_scale_bounds=length_scale_bounds)
670 |
671 | svgp.train(num_epochs=num_epochs)
672 | means, sigma = svgp.predict()
673 |
674 | y_pred = means.numpy()
675 | y_true = score
676 | residuals = (y_true - y_pred).astype(float)
677 |
678 | # For MSLL
679 | y_train_mean = np.mean(y_true[ctr_mask])
680 | y_train_sigma = np.std(y_true[ctr_mask])
681 |
682 | rmse = RMSE(y_true[ctr_mask],y_pred[ctr_mask])
683 | smse = SMSE(y_true[ctr_mask],y_pred[ctr_mask])
684 | msll = MSLL(y_true[ctr_mask],y_pred[ctr_mask],sigma.numpy()[ctr_mask],y_train_mean,y_train_sigma)
685 |
686 | else:
687 | X = conf_mat[ctr_mask]
688 | y = score[ctr_mask]
689 |
690 | kf = KFold(n_splits=cv_folds, shuffle=True)
691 | rmse = []
692 | smse = []
693 | msll = []
694 | print(f'Starting {cv_folds} folds of CV...')
695 | for i, (train_index, test_index) in enumerate(kf.split(X)):
696 | X_train, X_test = X[train_index], X[test_index]
697 | y_train, y_test = y[train_index], y[test_index]
698 |
699 | # For MSLL
700 | y_train_mean = np.mean(y_train)
701 | y_train_sigma = np.std(y_train)
702 |
703 | cv_svgp = SVGP(X_train,X_test,y_train,y_test,n_inducing=n_inducing,batch_size=batch_size,nu=nu,
704 | length_scale=length_scale,length_scale_bounds=length_scale_bounds)
705 |
706 | cv_svgp.train(num_epochs=num_epochs)
707 | cv_means, cv_sigma = cv_svgp.predict()
708 |
709 | cv_y_pred = cv_means.numpy()
710 | cv_residuals = (y_test - cv_y_pred).astype(float)
711 |
712 | r = RMSE(y_test,cv_y_pred)
713 | s = SMSE(y_test,cv_y_pred)
714 | m = MSLL(y_test,cv_y_pred,cv_sigma.numpy(),y_train_mean,y_train_sigma)
715 |
716 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f} - MSLL={m:.3f}')
717 | rmse.append(r)
718 | smse.append(s)
719 | msll.append(m)
720 | print('Done!')
721 |
722 | rmse = np.mean(rmse)
723 | smse = np.mean(smse)
724 | msll = np.mean(msll)
725 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f} - MSLL={msll:.3f}')
726 |
727 | svgp = SVGP(conf_mat[ctr_mask],conf_mat,score[ctr_mask],score,n_inducing=n_inducing,batch_size=batch_size,nu=nu,
728 | length_scale=length_scale,length_scale_bounds=length_scale_bounds)
729 |
730 | svgp.train(num_epochs=num_epochs)
731 | means, sigma = svgp.predict()
732 |
733 | y_pred = means.numpy()
734 | y_true = score
735 | residuals = (y_true - y_pred).astype(float)
736 |
737 | self.data['GP_pred'] = y_pred
738 | self.data['GP_sigma'] = sigma.numpy()
739 | self.data['GP_residuals'] = residuals
740 | self.data['GP_z'] = self.data['GP_residuals']/self.data['GP_sigma']
741 |
742 | self.RMSE_GP = rmse
743 | self.SMSE_GP = smse
744 | self.MSLL_GP = msll
745 |
746 |
747 | def gamlss_normative_model(self,mu=None,sigma=None,nu=None,tau=None,family='SHASHo2',method='RS',cv_folds=1):
748 | """Compute GAMLSS normative model.
749 |
750 | Parameters
751 | ----------
752 | mu: str or None
753 | Formula for mu (location) parameter of GAMLSS. If None, the formula for the score is the sum of the confounds,
754 | with non-categorical columns modeled as smooth functions, e.g. "score ~ ps(age) + sex".
755 | sigma: str or None
756 | Formula for sigma (scale) parameter of GAMLSS. If None, formula is '~ 1'.
757 | nu: str or None
758 | Formula for nu (skewness) parameter of GAMLSS. If None, formula is '~ 1'.
759 | tau: str or None
760 | Formula for tau (kurtosis) parameter of GAMLSS. If None, formula is '~ 1'.
761 | family: str,default='SHASHo2'
762 | Family of distributions to use for fitting, default is 'SHASHo2'. See R documentation for GAMLSS package for other available families of distributions.
763 | method: str, default = 'RS'
764 | Method for fitting GAMLSS. Can be 'RS' (Rigby and Stasinopoulos algorithm), 'CG' (Cole and Green algorithm) or 'mixed(n,m)' where n & m are integers.
765 | Specifying 'mixed(n,m)' will use the RS algorithm for n iterations and the CG algorithm for up to m additional iterations.
766 | cv_folds: int, default=1
767 | How many folds of cross-validation to perform. If 1, there is no cross-validation.
768 |
769 | Notes
770 | -----
771 | If using 'random()' to model a random effect in any of the formulas, it must be passed a column of the dataframe with categorical values
772 | as a factor: e.g. 'random(as.factor(COL))'.
773 | """
774 | try:
775 | from pynm.models.gamlss import GAMLSS
776 | except Exception:
777 | raise ImportError("R and the GAMLSS package must be installed to use the GAMLSS model, see documentation for installation help.")
778 | else:
779 | # get proband and control masks
780 | ctr_mask, _ = self._get_masks()
781 |
782 | gamlss = GAMLSS(mu=mu,sigma=sigma,nu=nu,tau=tau,family=family,method=method,
783 | score=self.score,confounds=self.confounds)
784 |
785 | nan_cols = ['LOESS_pred','LOESS_residuals','LOESS_z','LOESS_rank','LOESS_sigma',
786 | 'Centiles_pred','Centiles_residuals','Centiles_z','Centiles','Centiles_rank','Centiles_sigma',
787 | 'Centiles_95','Centiles_5','Centiles_32','Centiles_68']
788 | gamlss_data = self.data[[c for c in self.data.columns if c not in nan_cols]]
789 |
790 | if cv_folds == 1:
791 | gamlss.fit(gamlss_data[ctr_mask])
792 |
793 | mu_pred = gamlss.predict(gamlss_data,what='mu')
794 | sigma_pred = gamlss.predict(gamlss_data,what='sigma')
795 |
796 | # For MSLL
797 | y_train_mean = np.mean(self.data[self.score].values[ctr_mask])
798 | y_train_sigma = np.std(self.data[self.score].values[ctr_mask])
799 |
800 | rmse = RMSE(self.data[self.score].values[ctr_mask],mu_pred[ctr_mask])
801 | smse = SMSE(self.data[self.score].values[ctr_mask],mu_pred[ctr_mask])
802 | msll = MSLL(self.data[self.score].values[ctr_mask],mu_pred[ctr_mask],sigma_pred[ctr_mask],
803 | y_train_mean, y_train_sigma)
804 |
805 | else:
806 | X = gamlss_data[ctr_mask]
807 | kf = KFold(n_splits=cv_folds, shuffle=True)
808 | rmse = []
809 | smse = []
810 | msll = []
811 | print(f'Starting {cv_folds} folds of CV...')
812 | for i, (train_index, test_index) in enumerate(kf.split(X)):
813 | X_train, X_test = X.iloc[train_index], X.iloc[test_index]
814 |
815 | # For MSLL
816 | y_train_mean = np.mean(X_train[self.score].values)
817 | y_train_sigma = np.std(X_train[self.score].values)
818 |
819 | gamlss.fit(X_train)
820 |
821 | cv_mu_pred = gamlss.predict(X_test,what='mu')
822 | cv_sigma_pred = gamlss.predict(X_test,what='sigma')
823 |
824 | r = RMSE(X_test[self.score].values,cv_mu_pred)
825 | s = SMSE(X_test[self.score].values,cv_mu_pred)
826 | m = MSLL(X_test[self.score].values,cv_mu_pred,cv_sigma_pred,y_train_mean,y_train_sigma)
827 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f} - MSLL={m:.3f}')
828 | rmse.append(r)
829 | smse.append(s)
830 | msll.append(m)
831 | print('Done!')
832 |
833 | rmse = np.mean(rmse)
834 | smse = np.mean(smse)
835 | msll = np.mean(msll)
836 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f} - MSLL={msll:.3f}')
837 |
838 | gamlss.fit(gamlss_data[ctr_mask])
839 |
840 | mu_pred = gamlss.predict(gamlss_data,what='mu')
841 | sigma_pred = gamlss.predict(gamlss_data,what='sigma')
842 |
843 | self.data['GAMLSS_pred'] = mu_pred
844 | self.data['GAMLSS_sigma'] = sigma_pred
845 | self.data['GAMLSS_residuals'] = self.data[self.score] - self.data['GAMLSS_pred']
846 | self.data['GAMLSS_z'] = self.data['GAMLSS_residuals']/self.data['GAMLSS_sigma']
847 |
848 | self.RMSE_GAMLSS = rmse
849 | self.SMSE_GAMLSS = smse
850 | self.MSLL_GAMLSS = msll
851 |
852 | def report(self):
853 | """ Prints the values of each metric (SMSE, RMSE, MSLL) for the models that have been run.
854 | """
855 | print("------\nReport\n------")
856 | models = []
857 | for m in ['LOESS','Centiles','GP','GAMLSS']:
858 | if '{}_pred'.format(m) in self.data.columns:
859 | models.append(m)
860 | if len(models)==0:
861 | print('No models have been run.')
862 | return
863 |
864 | print("========= SMSE - RMSE - MSLL")
865 | for m in models:
866 | k = 9 - len(m)
867 | m_formatted = m + k*' '
868 | smse = np.round(getattr(self, f"SMSE_{m}"), 2)
869 | if np.isnan(smse):
870 | smse = 'NaN '
871 | rmse = np.round(getattr(self, f"RMSE_{m}"), 2)
872 | if np.isnan(rmse):
873 | rmse = 'NaN '
874 | msll = 'N/A'
875 | if m in ('GP', 'GAMLSS'):
876 | msll = np.round(getattr(self, f"MSLL_{m}"), 2)
877 | print(f"{m_formatted} {smse} {rmse} {msll}")
878 |
879 | def _plot(self, ax,kind=None,gp_xaxis=None,gamlss_xaxis=None):
880 | """ Plot the data with the normative model overlaid.
881 |
882 | Parameters
883 | ----------
884 | ax: matplotlib axis
885 | Axis on which to plot.
886 | kind: str, default=None
887 | Type of plot among "LOESS" (local polynomial), "Centiles", "GP" (gaussian processes),
888 | or "GAMLSS" (generalized additive models of location scale and shape).
889 | gp_xaxis: str,default=None
890 | Which confound to use for xaxis of GP plot. If set to None, first confound in list passed to model will be used.
891 | gamlss_xaxis: str,default=None
892 | Which confound to use for xaxis of GAMLSS plot. If set to None, first confound in list passed to model will be used.
893 |
894 | Returns
895 | -------
896 | Axis
897 | handle for the matplotlib axis of the plot
898 | """
899 | if kind is None:
900 | sns.scatterplot(data=self.data, x=self.conf, y=self.score,
901 | hue=self.group, style=self.group,ax=ax)
902 | elif kind == 'LOESS':
903 | sns.scatterplot(data=self.data, x=self.conf, y=self.score,
904 | hue=self.group, style=self.group,ax=ax)
905 | tmp=self.data.sort_values(self.conf)
906 | ax.plot(tmp[self.conf], tmp['LOESS_pred'], '-k',label='Prediction')
907 | ax.plot(tmp[self.conf], tmp['LOESS_pred'] - 1.96*tmp['LOESS_sigma'], '--k')
908 | ax.plot(tmp[self.conf], tmp['LOESS_pred'] + 1.96*tmp['LOESS_sigma'], '--k',label='95% CI')
909 | handles, labels = ax.get_legend_handles_labels()
910 | ax.legend(handles, labels)
911 | ax.set_title(f"{kind} SMSE={self.SMSE_LOESS:.3f}")
912 | elif kind == 'Centiles':
913 | sns.scatterplot(data=self.data, x=self.conf, y=self.score,
914 | hue=self.group, style=self.group,ax=ax)
915 | tmp=self.data.sort_values(self.conf)
916 | ax.plot(tmp[self.conf], tmp['Centiles_pred'], '-k',label='Prediction')
917 | ax.plot(tmp[self.conf], tmp['Centiles_5'],'--k')
918 | ax.plot(tmp[self.conf], tmp['Centiles_95'],'--k',label='95% CI')
919 | handles, labels = ax.get_legend_handles_labels()
920 | ax.legend(handles, labels)
921 | ax.set_title(f"{kind} SMSE={self.SMSE_Centiles:.3f}")
922 | elif kind == 'GP':
923 | if gp_xaxis is None:
924 | gp_xaxis = self.conf
925 | sns.scatterplot(data=self.data, x=gp_xaxis, y=self.score,
926 | hue=self.group, style=self.group,ax=ax)
927 | tmp=self.data.sort_values(gp_xaxis)
928 | if len(self.confounds) == 1:
929 | ax.plot(tmp[gp_xaxis], tmp['GP_pred'], '-k',label='Prediction')
930 | ax.plot(tmp[gp_xaxis], tmp['GP_pred'] - 1.96*tmp['GP_sigma'], '--k')
931 | ax.plot(tmp[gp_xaxis], tmp['GP_pred'] + 1.96*tmp['GP_sigma'], '--k',label='95% CI')
932 | else:
933 | ax.scatter(tmp[gp_xaxis], tmp['GP_pred'], label='Prediction',color='black',marker='_',s=25)
934 | ax.scatter(tmp[gp_xaxis], tmp['GP_pred'] - 1.96*tmp['GP_sigma'],color='black',s=0.2)
935 | ax.scatter(tmp[gp_xaxis], tmp['GP_pred'] + 1.96*tmp['GP_sigma'], label='95% CI',color='black',s=0.2)
936 | handles, labels = ax.get_legend_handles_labels()
937 | ax.legend(handles, labels)
938 | ax.set_title(f"{kind} SMSE={self.SMSE_GP:.3f} - MSLL={self.MSLL_GP:.3f}")
939 | elif kind == 'GAMLSS':
940 | if gamlss_xaxis is None:
941 | gamlss_xaxis = self.conf
942 | sns.scatterplot(data=self.data, x=gamlss_xaxis, y=self.score,
943 | hue=self.group, style=self.group,ax=ax)
944 | tmp=self.data.sort_values(gamlss_xaxis)
945 | if len(self.confounds) == 1:
946 | ax.plot(tmp[gamlss_xaxis], tmp['GAMLSS_pred'], '-k',label='Prediction')
947 | ax.plot(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] - 1.96*tmp['GAMLSS_sigma'], '--k')
948 | ax.plot(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] + 1.96*tmp['GAMLSS_sigma'], '--k',label='95% CI')
949 | else:
950 | ax.scatter(tmp[gamlss_xaxis], tmp['GAMLSS_pred'], label='Prediction',color='black',marker='_',s=25)
951 | ax.scatter(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] - 1.96*tmp['GAMLSS_sigma'],color='black',s=0.2)
952 | ax.scatter(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] + 1.96*tmp['GAMLSS_sigma'], label='95% CI',color='black',s=0.2)
953 | handles, labels = ax.get_legend_handles_labels()
954 | ax.legend(handles, labels)
955 | ax.set_title(f"{kind} SMSE={self.SMSE_GAMLSS:.3f} - MSLL={self.MSLL_GAMLSS:.3f}")
956 | return ax
957 |
958 | def plot(self, kind=None,gp_xaxis=None,gamlss_xaxis=None):
959 | """Plot the data with the normative model overlaid.
960 |
961 | Parameters
962 | ----------
963 | kind: list, default=None
964 | Type of plot, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None. If None, all available
965 | results will be plotted; if none are available, a warning will be raised and only the data will be plotted.
966 | gp_xaxis: str,default=None
967 | Which confound to use for xaxis of GP plot. If set to None, first confound in list passed to model will be used.
968 | gamlss_xaxis: str,default=None
969 | Which confound to use for xaxis of GAMLSS plot. If set to None, first confound in list passed to model will be used.
970 |
971 | Raises
972 | ------
973 | ValueError
974 | Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.
975 | """
976 | if kind is None:
977 | kind = []
978 | for k in ['LOESS','Centiles','GP','GAMLSS']:
979 | if '{}_pred'.format(k) in self.data.columns:
980 | kind.append(k)
981 | if len(kind)==0:
982 | warnings.warn('No model results found in data.')
983 |
984 | if set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>1:
985 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
986 | for i,k in enumerate(kind):
987 | self._plot(ax[i],kind=k,gp_xaxis=gp_xaxis,gamlss_xaxis=gamlss_xaxis)
988 | plt.show()
989 | elif set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>0:
990 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
991 | self._plot(ax,kind=kind[0],gp_xaxis=gp_xaxis,gamlss_xaxis=gamlss_xaxis)
992 | plt.show()
993 | elif len(kind)==0:
994 | fig, ax = plt.subplots(1,1)
995 | self._plot(ax,None,gp_xaxis=gp_xaxis,gamlss_xaxis=gamlss_xaxis)
996 | ax.set_title('Data')
997 | plt.show()
998 | else:
999 | raise ValueError('Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.')
1000 |
1001 | def _plot_res_z(self, ax,kind=None, confound=None,z=False):
1002 | """ Plot the residuals of the normative model.
1003 |
1004 | Parameters
1005 | ----------
1006 | ax: matplotlib axis
1007 | Axis on which to plot.
1008 | kind: str, default=None
1009 | Type of plot among "LOESS" (local polynomial), "Centiles", "GP" (gaussian processes),
1010 | or "GAMLSS" (generalized additive models of location scale and shape).
1011 | confound: str or None
1012 | Which confound to use as xaxis of plot, must be categorical or None.
1013 | """
1014 | if kind == 'LOESS':
1015 | if z:
1016 | sns.violinplot(x=confound, y='LOESS_z',
1017 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1018 | else:
1019 | sns.violinplot(x=confound, y='LOESS_residuals',
1020 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1021 | ax.set_title(f"{kind} SMSE={self.SMSE_LOESS:.3f}")
1022 | if kind == 'Centiles':
1023 | if z:
1024 | sns.violinplot(x=confound, y='Centiles_z',
1025 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1026 | else:
1027 | sns.violinplot(x=confound, y='Centiles_residuals',
1028 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1029 | ax.set_title(f"{kind} SMSE={self.SMSE_Centiles:.3f}")
1030 | if kind == 'GP':
1031 | if z:
1032 | sns.violinplot(x=confound, y='GP_z',
1033 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1034 | else:
1035 | sns.violinplot(x=confound, y='GP_residuals',
1036 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1037 | ax.set_title(f"{kind} SMSE={self.SMSE_GP:.3f} - MSLL={self.MSLL_GP:.3f}")
1038 | if kind == 'GAMLSS':
1039 | if z:
1040 | sns.violinplot(x=confound, y='GAMLSS_z',
1041 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1042 | else:
1043 | sns.violinplot(x=confound, y='GAMLSS_residuals',
1044 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1045 | ax.set_title(f"{kind} SMSE={self.SMSE_GAMLSS:.3f} - MSLL={self.MSLL_GAMLSS:.3f}")
1046 | if not isinstance(confound,str):
1047 | ax.set_xticklabels([''])
1048 |
1049 | def _plot_res_z_cont(self, ax,kind=None, confound=None,z=False):
1050 | """ Plot the residuals of the normative model.
1051 |
1052 | Parameters
1053 | ----------
1054 | ax: matplotlib axis
1055 | Axis on which to plot.
1056 | kind: str, default=None
1057 | Type of plot among "LOESS" (local polynomial), "Centiles", "GP" (gaussian processes),
1058 | or "GAMLSS" (generalized additive models of location scale and shape).
1059 | confound: str or None
1060 | Which confound to use as xaxis of plot, must be continuous.
1061 | """
1062 | if kind == 'LOESS':
1063 | if z:
1064 | sns.scatterplot(x=confound, y='LOESS_z',
1065 | data=self.data, hue=self.group,ax=ax)
1066 | else:
1067 | sns.scatterplot(x=confound, y='LOESS_residuals',
1068 | data=self.data, hue=self.group,ax=ax)
1069 | ax.set_title(f"{kind} SMSE={self.SMSE_LOESS:.3f}")
1070 | if kind == 'Centiles':
1071 | if z:
1072 | sns.scatterplot(x=confound, y='Centiles_z',
1073 | data=self.data, hue=self.group,ax=ax)
1074 | else:
1075 | sns.scatterplot(x=confound, y='Centiles_residuals',
1076 | data=self.data, hue=self.group,ax=ax)
1077 | ax.set_title(f"{kind} SMSE={self.SMSE_Centiles:.3f}")
1078 | if kind == 'GP':
1079 | if z:
1080 | sns.scatterplot(x=confound, y='GP_z',
1081 | data=self.data, hue=self.group,ax=ax)
1082 | else:
1083 | sns.scatterplot(x=confound, y='GP_residuals',
1084 | data=self.data, hue=self.group,ax=ax)
1085 | ax.set_title(f"{kind} SMSE={self.SMSE_GP:.3f} - MSLL={self.MSLL_GP:.3f}")
1086 | if kind == 'GAMLSS':
1087 | if z:
1088 | sns.scatterplot(x=confound, y='GAMLSS_z',
1089 | data=self.data, hue=self.group,ax=ax)
1090 | else:
1091 | sns.scatterplot(x=confound, y='GAMLSS_residuals',
1092 | data=self.data, hue=self.group,ax=ax)
1093 | ax.set_title(f"{kind} SMSE={self.SMSE_GAMLSS:.3f} - MSLL={self.MSLL_GAMLSS:.3f}")
1094 |
1095 | def plot_res(self, kind=None, confound=None):
1096 | """Plot the residuals of the normative model.
1097 |
1098 | Parameters
1099 | ----------
1100 | kind: list, default=None
1101 | Type of plot, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None. If None, all available
1102 | results will be plotted; if none are available, a ValueError will be raised.
1103 | confound: str, default=None
1104 | Which confound to use as xaxis of plot.
1105 |
1106 | Raises
1107 | ------
1108 | ValueError
1109 | Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.
1110 | ValueError
1111 | No model results found in data.
1112 | """
1113 | _, cat = read_confounds(self.confounds)
1114 | if confound is None:
1115 | categorical = True
1116 | elif confound in cat:
1117 | categorical = True
1118 | else:
1119 | categorical = False
1120 |
1121 | if kind is None:
1122 | kind = []
1123 | for k in ['LOESS','Centiles','GP','GAMLSS']:
1124 | if '{}_residuals'.format(k) in self.data.columns:
1125 | kind.append(k)
1126 | if len(kind)==0:
1127 | raise ValueError('No model residuals found in data.')
1128 |
1129 | if set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>1:
1130 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1131 | for i,k in enumerate(kind):
1132 | if categorical:
1133 | self._plot_res_z(ax[i],kind=k,confound=confound)
1134 | else:
1135 | self._plot_res_z_cont(ax[i],kind=k,confound=confound)
1136 | plt.show()
1137 | elif set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])):
1138 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1139 | if categorical:
1140 | self._plot_res_z(ax,kind=kind[0],confound=confound)
1141 | else:
1142 | self._plot_res_z_cont(ax,kind=kind[0],confound=confound)
1143 | plt.show()
1144 | else:
1145 | raise ValueError('Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.')
1146 |
1147 | def plot_z(self, kind=None, confound=None):
1148 | """Plot the deviance scores of the normative model.
1149 |
1150 | Parameters
1151 | ----------
1152 | kind: list, default=None
1153 | Type of plot, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None. If None, all available
1154 | results will be plotted; if none are available, a ValueError will be raised.
1155 | confound: str, default=None
1156 | Which confound to use as xaxis of plot.
1157 |
1158 | Raises
1159 | ------
1160 | ValueError
1161 | Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.
1162 | ValueError
1163 | No model results found in data.
1164 | """
1165 | _, cat = read_confounds(self.confounds)
1166 | if confound is None:
1167 | categorical = True
1168 | elif confound in cat:
1169 | categorical = True
1170 | else:
1171 | categorical = False
1172 |
1173 | if kind is None:
1174 | kind = []
1175 | for k in ['LOESS','Centiles','GP','GAMLSS']:
1176 | if '{}_z'.format(k) in self.data.columns:
1177 | kind.append(k)
1178 | if len(kind)==0:
1179 | raise ValueError('No model deviance scores found in data.')
1180 |
1181 | if set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>1:
1182 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1183 | for i,k in enumerate(kind):
1184 | if categorical:
1185 | self._plot_res_z(ax[i],kind=k,confound=confound,z=True)
1186 | else:
1187 | self._plot_res_z_cont(ax[i],kind=k,confound=confound,z=True)
1188 | plt.show()
1189 | elif set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])):
1190 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1191 | if categorical:
1192 | self._plot_res_z(ax,kind=kind[0],confound=confound,z=True)
1193 | else:
1194 | self._plot_res_z_cont(ax,kind=kind[0],confound=confound,z=True)
1195 | plt.show()
1196 | else:
1197 | raise ValueError('Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.')
1198 |
--------------------------------------------------------------------------------
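A minimal usage sketch of the PyNM API above, mirroring calls exercised in test/test_pynm.py.
The dataframe columns ('score', 'group', 'age', 'sex', 'site') and the input filename are
illustrative assumptions, not requirements of the library:

import pandas as pd
from pynm import pynm

# Hypothetical input with score/group/age/sex/site columns.
df = pd.read_csv('data.csv')

# Categorical confounds are wrapped in c(...), as parsed by pynm.util.read_confounds below.
m = pynm.PyNM(df, 'score', 'group', ['age', 'c(sex)', 'c(site)'])

# Gaussian process model: method='auto' switches to the SVGP approximation on large samples.
m.gp_normative_model(method='auto', cv_folds=3)

# GAMLSS model with a smooth term for age and a site random effect, following the Notes
# in the gamlss_normative_model docstring (requires R and the gamlss package).
m.gamlss_normative_model(mu='score ~ ps(age) + c(sex) + random(as.factor(site))',
                         sigma='~ ps(age)', family='SHASHo2', method='mixed(10,50)')

m.report()                      # prints SMSE/RMSE/MSLL for each fitted model
m.plot(kind=['GP', 'GAMLSS'])   # overlays predictions and 95% CI on the data
--------------------------------------------------------------------------------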
/pynm/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def read_confounds(confounds):
4 | """ Process input list of confounds.
5 |
6 | Parameters
7 | ----------
8 | confounds : list of str
9 | List of confounds with categorical variables indicated by c(var) ('c' must be lower case).
10 |
11 | Returns
12 | -------
13 | list
14 | List of all confounds without wrapper on categorical variables: c(var) -> var.
15 | list
16 | List of only categorical confounds without wrapper.
17 | """
18 | categorical = []
19 | clean_confounds = []
20 | for conf in confounds:
21 | if ((conf[0:2] == 'c(') & (conf[-1] == ')')):
22 | categorical.append(conf[2:-1])
23 | clean_confounds.append(conf[2:-1])
24 | else:
25 | clean_confounds.append(conf)
26 | return clean_confounds, categorical
27 |
28 | def RMSE(y_true,y_pred):
29 | """Calculates Root Mean Square Error (RMSE).
30 |
31 | Parameters
32 | ----------
33 | y_true: array
34 | True values for response variable.
35 | y_pred: array
36 | Predicted values for response variable.
37 |
38 | Returns
39 | -------
40 | float
41 | RMSE value for inputs.
42 | """
43 | return (np.mean((y_true - y_pred)**2))**0.5
44 |
45 | def SMSE(y_true,y_pred):
46 | """Calculates Standardized Mean Square Error (SMSE).
47 |
48 | Parameters
49 | ----------
50 | y_true: array
51 | True values for response variable.
52 | y_pred: array
53 | Predicted values for response variable.
54 |
55 | Returns
56 | -------
57 | float
58 | SMSE value for inputs.
59 | """
60 | return (np.mean((y_true - y_pred)**2))**0.5/np.std(y_true)
61 |
62 | def MSLL(y_true,y_pred,sigma,y_train_mean,y_train_sigma):
63 | """Calculates Mean Standardized Log Loss (MSLL).
64 |
65 | Parameters
66 | ----------
67 | y_true: (n,) array
68 | True values for response variable.
69 | y_pred: (n,) array
70 | Predicted values for response variable.
71 | sigma: (n,) array
72 | Standard deviation of predictive distribution.
73 | y_train_mean: float
74 | Mean of training data.
75 | y_train_sigma: float
76 | Standard deviation of training data.
77 |
78 |
79 | Returns
80 | -------
81 | float
82 | MSLL value for inputs.
83 | """
84 | inputs = [y_true,y_pred,sigma]
85 | for i in inputs:
86 | if len(i.shape) > 1:
87 | raise ValueError("Data must be 1-dimensional")
88 |
89 | #SLL = (0.5 * np.log(2 * np.pi * sigma**2) +
90 | # (y_true - y_pred)**2 / (2 * sigma**2) -
91 | # (y_true - np.mean(y_true))**2 /
92 | # (2 * np.std(y_true)))
93 |
94 | # Negative log probability under the model
95 | model = (0.5*np.log(2*np.pi*sigma**2)) + (y_true - y_pred)**2/(2*sigma**2)
96 |
97 | # Negative log probability under trivial model
98 | trivial = (0.5*np.log(2*np.pi*y_train_sigma**2)) + (y_true - y_train_mean)**2/(2*y_train_sigma**2)
99 |
100 | SLL = model - trivial
101 | return np.mean(SLL)
--------------------------------------------------------------------------------
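A quick numeric check of the helpers above, reusing the values from test_rmse, test_smse and
test_msll in test/test_pynm.py:

import numpy as np
from pynm.util import read_confounds, RMSE, SMSE, MSLL

confounds, categorical = read_confounds(['age', 'c(sex)'])
# confounds == ['age', 'sex'], categorical == ['sex']

y_true = np.array([1, 2, 3, 4, 5])
y_pred = np.array([0, 1, 3, 5, 2])
RMSE(y_true, y_pred)  # sqrt(2.4), approx. 1.549
SMSE(y_true, y_pred)  # sqrt(2.4)/np.std(y_true) = sqrt(2.4)/sqrt(2), approx. 1.095

# MSLL scores the model's Gaussian log loss against a trivial model that always predicts
# the training mean and standard deviation; negative values mean the model beats it.
sigma = np.array([1, 2, 1, 2, 1])
MSLL(y_true, y_pred, sigma, y_train_mean=2, y_train_sigma=1)
--------------------------------------------------------------------------------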
/pynm_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm_logo.png
--------------------------------------------------------------------------------
/pynm_models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm_models.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gpytorch >= 1.4.0
2 | matplotlib >= 3.3.4
3 | numpy >= 1.19.5
4 | pandas >= 1.1.5
5 | pytest >= 6.2.3
6 | rpy2 >= 3.5.4
7 | scikit_learn >= 1.1.2
8 | scipy >= 1.5.3
9 | seaborn >= 0.12.0
10 | statsmodels >= 0.13.2
11 | torch >= 1.12.1
12 | tqdm >= 4.59.0
13 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup
3 |
4 | setup(
5 | name="pynm",
6 | version="1.0.1",
7 | author="Annabelle HARVEY, Guillaume DUMAS",
8 | author_email="annabelle.harvey@umontreal.ca, guillaume.dumas@ppsp.team",
9 | description=("Python implementation of Normative Modelling "
10 | "with GAMLSS, Gaussian Processes, LOESS & Centiles approaches."),
11 | long_description_content_type="text/x-rst",
12 | license="BSD",
13 | keywords="gaussian processes statistics modeling",
14 | url="https://github.com/ppsp-team/PyNM",
15 | packages=['pynm', 'test', 'pynm/models'],
16 | classifiers=[
17 | "Development Status :: 5 - Production/Stable",
18 | "Topic :: Scientific/Engineering :: Bio-Informatics",
19 | "License :: OSI Approved :: BSD License",
20 | ],
21 | entry_points={
22 | 'console_scripts': [
23 | 'pynm = pynm.cli:main',
24 | ],
25 | },
26 | install_requires=[
27 | 'gpytorch >= 1.4.0',
28 | 'matplotlib >= 3.3.4',
29 | 'numpy >= 1.19.5',
30 | 'pandas >= 1.1.5',
31 | 'rpy2 >= 3.5.4',
32 | 'scikit_learn >= 1.1.2',
33 | 'scipy >= 1.5.3',
34 | 'seaborn >= 0.12.0',
35 | 'statsmodels >= 0.13.2',
36 | 'torch >= 1.12.1',
37 | 'tqdm >= 4.59.0',
38 | ],
39 | )
40 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/test/__init__.py
--------------------------------------------------------------------------------
/test/test_pynm.py:
--------------------------------------------------------------------------------
1 | from pynm import pynm
2 | import numpy as np
3 | import pandas as pd
4 | import scipy.stats as sp
5 | import math
6 | import pytest
7 | from pynm.util import *
8 | import matplotlib.pyplot as plt
9 | from unittest.mock import patch
10 | from sklearn.model_selection import train_test_split
11 |
12 | def model(age, sex, offset):
13 | noise = np.random.normal(0, 0.1)
14 | return 0.001*age-0.00001*(age-50)**2+0.5 + noise - np.random.uniform(0, 0.3) * sex + offset
15 |
16 |
17 | def model_prob(age, sex, offset):
18 | noise = np.random.normal(0, 0.1)
19 | return 0.001*age-0.00001*(age-50)**2+0.5 + noise - np.random.uniform(0, 0.3) * sex - 0.2 * np.random.uniform() + offset
20 |
21 | # randseed = 3, sample_size = 1, n_sites = 2 has ONE PROB n=6
22 | # randseed = 1, sample_size = 1, n_sites = 2 has NO PROB n=12
23 | def generate_data(group='PROB_CON', sample_size=1, n_sites=2, randseed=3):
24 | np.random.seed(randseed)
25 | n_sites = n_sites
26 | age_min = (np.random.rand(n_sites)*50).astype(int)
27 | sites = pd.DataFrame(data={'sex_ratio': np.random.rand(n_sites),
28 | 'prob_ratio': 0.5*np.random.rand(n_sites),
29 | 'age_min': age_min,
30 | 'age_max': (age_min+5+np.random.rand(n_sites)*50).astype(int),
31 | 'score_shift': np.random.randn(n_sites)/4,
32 | 'sample_size': (sample_size+np.random.rand(n_sites)*sample_size*10).astype(int)})
33 |
34 | participants = []
35 | for site in sites.iterrows():
36 | for participant in range(int(site[1]['sample_size'])):
37 | sex = np.random.binomial(1, site[1]['sex_ratio'])
38 | prob = np.random.binomial(1, site[1]['prob_ratio'])
39 | age = np.random.uniform(site[1]['age_min'], site[1]['age_max'])
40 | if prob:
41 | score = model_prob(age, sex, site[1]['score_shift'])
42 | else:
43 | score = model(age, sex, site[1]['score_shift'])
44 | participants.append([site[0], sex, prob, age, score])
45 |
46 | df = pd.DataFrame(participants, columns=['site', 'sex', 'group', 'age', 'score'])
47 | df.sex.replace({1: 'Female', 0: 'Male'}, inplace=True)
48 | if group == 'PROB_CON':
49 | df.group.replace({1: 'PROB', 0: 'CTR'}, inplace=True)
50 | return df
51 |
52 | def sample_x(low=1,high=100,n_subs=1000,sampling='full'):
53 | if sampling =='full':
54 | x = np.random.uniform(low=low,high=high,size=n_subs)
55 | else:
56 | x = np.concatenate([np.random.normal(20,10,size=int(n_subs/2)),np.random.normal(80,10,size=int(n_subs/2))])
57 | x = x[(x < high) & (x > low)]
58 | return x
59 |
60 | # Homoskedastic, gaussian noise
61 | def dataset_homo(low=1,high=100,n_subs=1000,sampling='full'):
62 | x = sample_x(low=low,high=high,n_subs=n_subs,sampling=sampling)
63 | scores = np.array([np.log(i) + np.random.randn() for i in x])
64 | df = pd.DataFrame([x,scores],index=['x','score']).transpose()
65 | df['train_sample'] = 1
66 | return df
67 |
68 | # Homoskedastic, skew noise
69 | def dataset_skew(low=1,high=100,n_subs=1000,sampling='full'):
70 | x = sample_x(low=low,high=high,n_subs=n_subs,sampling=sampling)
71 | scores = np.array([np.log(i) + sp.skewnorm.rvs(a=2,size=1)[0] for i in x])
72 | df = pd.DataFrame([x,scores],index=['x','score']).transpose()
73 | df['train_sample'] = 1
74 | return df
75 |
76 | # Heteroskedastic linear
77 | def dataset_het(low=1,high=100,n_subs=1000,sampling='full'):
78 | x = sample_x(low=low,high=high,n_subs=n_subs,sampling=sampling)
79 | scores = np.array([np.log(i) + 0.15*np.log(i)*np.random.randn() for i in x])
80 | df = pd.DataFrame([x,scores],index=['x','score']).transpose()
81 | df['train_sample'] = 1
82 | return df
83 |
84 | class TestBasic:
85 | def test_read_confounds_some_categorical(self):
86 | conf = ['a', 'b', 'c(c)']
87 | clean, cat = read_confounds(conf)
88 | assert clean == ['a', 'b', 'c']
89 | assert cat == ['c']
90 |
91 | def test_read_confounds_no_categorical(self):
92 | conf = ['a', 'b', 'c']
93 | clean, cat = read_confounds(conf)
94 | assert clean == conf
95 | assert cat == []
96 |
97 | def test_read_confounds_all_categorical(self):
98 | conf = ['c(a)', 'c(b)', 'c(c)']
99 | clean, cat = read_confounds(conf)
100 | assert clean == ['a', 'b', 'c']
101 | assert cat == ['a', 'b', 'c']
102 |
103 | def test_invalid_init(self):
104 | data1 = generate_data(randseed=1)
105 | data2 = generate_data(randseed=2)
106 | data = pd.concat([data1,data2])
107 | assert data.index.nunique() != data.shape[0]
108 | with pytest.raises(ValueError):
109 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
110 |
111 | def test_set_group_names_PROB_CON_all_CON(self):
112 | data = generate_data(randseed=1)
113 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
114 | assert m.CTR == 'CTR'
115 | assert m.PROB == 'PROB'
116 |
117 | def test_set_group_names_PROB_CON(self):
118 | data = generate_data(randseed=3)
119 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
120 | assert m.CTR == 'CTR'
121 | assert m.PROB == 'PROB'
122 |
123 | def test_set_group_names_01(self):
124 | data = generate_data(randseed=3, group='01')
125 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
126 | assert m.CTR == 0
127 | assert m.PROB == 1
128 |
129 | def test_set_group_controls(self):
130 | data = generate_data(randseed=3, group='01')
131 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample=1)
132 | assert m.group == 'group'
133 |
134 | def test_set_group_33(self):
135 | data = generate_data(randseed=3, group='01')
136 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='0.33')
137 | assert m.group == 'train_sample'
138 | assert m.data['train_sample'].sum() == 1
139 | assert m.data[(m.data['train_sample']==1) & (m.data['group']== 1)].shape[0] == 0
140 |
141 | def test_set_group_manual_no_col(self):
142 | data = generate_data(randseed=3, group='01')
143 | with pytest.raises(ValueError):
144 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='manual')
145 |
146 | def test_set_group_manual_zero_col(self):
147 | data = generate_data(randseed=3, group='01')
148 | data['train_sample'] = 0
149 | with pytest.raises(ValueError):
150 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='manual')
151 |
152 | def test_set_group_manual_good_col(self):
153 | data = generate_data(randseed=3, group='01')
154 | data['train_sample'] = [1,1,0,0,0,0]
155 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='manual')
156 | assert m.PROB == 0
157 | assert m.group == 'train_sample'
158 |
159 | def test_create_bins(self):
160 | data = generate_data(randseed=3)
161 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
162 | m.centiles_normative_model()
163 | assert m.bins is not None
164 |
165 | def test_bins_num(self):
166 | data = generate_data(randseed=11)
167 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=5, bin_width=10)
168 | m._create_bins()
169 | assert len(m.bins) == 6
170 |
171 | def test_loess_rank(self):
172 | data = generate_data(randseed=11)
173 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
174 | m.loess_normative_model()
175 | assert np.sum(m.data.LOESS_rank) == 1
176 |
177 | def test_loess_normative_model(self):
178 | data = generate_data(randseed=11)
179 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
180 | m.loess_normative_model()
181 | assert math.isclose(2.3482, np.sum(m.data.LOESS_z), abs_tol=0.00001)
182 |
183 | def test_centiles_rank(self):
184 | data = generate_data(randseed=11)
185 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
186 | m.centiles_normative_model()
187 | assert np.sum(m.data.Centiles_rank) == -22
188 |
189 | def test_centiles_normative_model(self):
190 | data = generate_data(randseed=11)
191 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
192 | m.centiles_normative_model()
193 | assert np.sum(m.data.Centiles) == 446
194 |
195 | def test_get_masks(self):
196 | a = np.array(list(range(6)))
197 | data = generate_data(randseed=3)
198 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
199 | ctr, prob = m._get_masks()
200 | assert a[ctr].shape[0] == 5
201 | assert a[prob][0] == 3
202 |
203 | def test_get_masks_all_CON(self):
204 | a = np.array(list(range(12)))
205 | data = generate_data(randseed=1)
206 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
207 | ctr, prob = m._get_masks()
208 | assert a[ctr].shape[0] == 12
209 | assert a[prob].shape[0] == 0
210 |
211 | def test_get_conf_mat(self):
212 | data = generate_data(randseed=3)
213 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
214 | conf_mat = m._get_conf_mat()
215 | assert conf_mat.shape[0] == 6
216 | assert conf_mat.shape[1] == 3
217 | for i in range(3):
218 | assert not isinstance(conf_mat[0, i], str)
219 |
220 | def test_use_approx_auto_small(self):
221 | data = generate_data(randseed=3)
222 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
223 | assert m._use_approx(method='auto') == False
224 |
225 | def test_use_approx_auto_big(self):
226 | data = generate_data(randseed=3,sample_size=1000)
227 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
228 | assert m._use_approx(method='auto') == True
229 |
230 | def test_use_approx_approx(self):
231 | data = generate_data(randseed=3,sample_size=1000)
232 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
233 | assert m._use_approx(method='approx') == True
234 |
235 | def test_use_approx_exact(self):
236 | data = generate_data(randseed=3,sample_size=2000)
237 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
238 | with pytest.warns(Warning) as record:
239 | use_approx = m._use_approx(method='exact')
240 | assert len(record) == 1
241 | assert record[0].message.args[0] == "Exact GP model with over 2000 data points requires large amounts of time and memory, continuing with exact model."
242 | assert use_approx == False
243 |
244 | def test_gp_normative_model(self):
245 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
246 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
247 | m.gp_normative_model()
248 | assert 'GP_pred' in m.data.columns
249 | assert math.isclose(0,m.data['GP_residuals'].mean(),abs_tol=0.5)
250 |
251 | def test_homo_res(self):
252 | data = dataset_homo()
253 | m = pynm.PyNM(data,'score','train_sample',['x'])
254 | with pytest.warns(None) as record:
255 | m.gp_normative_model(method='exact')
256 | assert len(record) == 0
257 |
258 | def test_nongaussian_res(self):
259 | data = dataset_skew()
260 | m = pynm.PyNM(data,'score','train_sample',['x'])
261 | with pytest.warns(Warning) as record:
262 | m.gp_normative_model(method='exact')
263 | assert len(record) == 1
264 | assert record[0].message.args[0] == "The residuals are not Gaussian!"
265 |
266 | def test_het_res(self):
267 | data = dataset_het()
268 | m = pynm.PyNM(data,'score','train_sample',['x'])
269 | with pytest.warns(Warning) as record:
270 | m.gp_normative_model(method='exact')
271 | assert len(record) == 1
272 | assert record[0].message.args[0] == "The residuals are heteroskedastic!"
273 |
274 | def test_rmse(self):
275 | y_true = np.array([1,2,3,4,5])
276 | y_pred = np.array([0,1,3,5,2])
277 | assert RMSE(y_true,y_pred) == np.sqrt(2.4)
278 |
279 | def test_smse(self):
280 | y_true = np.array([1,2,3,4,5])
281 | y_pred = np.array([0,1,3,5,2])
282 | assert SMSE(y_true,y_pred) == np.sqrt(2.4)/np.sqrt(2)
283 |
284 | def test_msll(self):
285 | y_true = np.array([1,2,3,4,5])
286 | y_pred = np.array([0,1,3,5,2])
287 | sigma = np.array([1,2,1,2,1])
288 | y_train_mean = 2
289 | y_train_sigma = 1
290 |
291 | term1 = 0.5*np.log(2*np.pi*np.array([1,4,1,4,1])) + np.array([1/2,1/8,0,1/8,9/2])
292 | term2 = 0.5*np.log(2*np.pi) + np.array([1/2,0,1/2,2,4.5])
293 |
294 | assert MSLL(y_true,y_pred,sigma,y_train_mean,y_train_sigma) == np.mean(term1 - term2)
295 |
296 | def test_report(self):
297 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
298 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
299 | m.centiles_normative_model()
300 | m.loess_normative_model()
301 | #m.gp_normative_model()
302 | #m.gamlss_normative_model()
303 | m.report()
304 |
305 | @patch("matplotlib.pyplot.show")
306 | class TestPlot:
307 | def test_plot_default(self,mock_patch):
308 | data = generate_data(randseed=3)
309 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
310 | m.centiles_normative_model()
311 | m.plot()
312 | assert True
313 |
314 | def test_plot_default_two_models(self,mock_patch):
315 | data = generate_data(randseed=3)
316 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
317 | m.centiles_normative_model()
318 | m.loess_normative_model()
319 | assert m.plot() is None
320 |
321 | def test_plot_default_no_models(self,mock_patch):
322 | data = generate_data(randseed=3)
323 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
324 | with pytest.warns(Warning) as record:
325 | m.plot()
326 |
327 | def test_plot_valid_subset(self,mock_patch):
328 | subset = ['Centiles','LOESS']
329 | data = generate_data(randseed=3)
330 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
331 | m.centiles_normative_model()
332 | m.loess_normative_model()
333 | assert m.plot(kind=subset) is None
334 |
335 | def test_plot_invalid_subset1(self,mock_patch):
336 | subset = ['Centiles',None]
337 | data = generate_data(randseed=3)
338 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
339 | m.centiles_normative_model()
340 | with pytest.raises(ValueError):
341 | m.plot(kind=subset)
342 |
343 | def test_plot_invalid_subset2(self,mock_patch):
344 | subset = ['Centiles','GAMLSS']
345 | data = generate_data(randseed=3)
346 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
347 | m.centiles_normative_model()
348 | with pytest.raises(KeyError):
349 | m.plot(kind=subset)
350 |
351 | def test_plot_res_default(self,mock_patch):
352 | data = generate_data(randseed=3)
353 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
354 | m.centiles_normative_model()
355 | assert m.plot_res() is None
356 |
357 | def test_plot_res_default_two_models(self,mock_patch):
358 | data = generate_data(randseed=3)
359 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
360 | m.centiles_normative_model()
361 | m.loess_normative_model()
362 | assert m.plot_res() is None
363 |
364 | def test_plot_res_default_no_models(self,mock_patch):
365 | data = generate_data(randseed=3)
366 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
367 | with pytest.raises(ValueError):
368 | m.plot_res()
369 |
370 | def test_plot_res_valid_subset(self,mock_patch):
371 | subset = ['Centiles','LOESS']
372 | data = generate_data(randseed=3)
373 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
374 | m.centiles_normative_model()
375 | m.loess_normative_model()
376 | assert m.plot_res(kind=subset) is None
377 |
378 | def test_plot_res_invalid_subset1(self,mock_patch):
379 | subset = ['Centiles',None]
380 | data = generate_data(randseed=3)
381 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
382 | m.centiles_normative_model()
383 | with pytest.raises(ValueError):
384 | m.plot_res(kind=subset)
385 |
386 | def test_plot_res_invalid_subset2(self,mock_patch):
387 | subset = ['Centiles','GAMLSS']
388 | data = generate_data(randseed=3)
389 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
390 | m.centiles_normative_model()
391 | with pytest.raises(ValueError):
392 | m.plot_res(kind=subset)
393 |
394 | def test_plot_z_default(self,mock_patch):
395 | data = generate_data(randseed=3)
396 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
397 | m.centiles_normative_model()
398 | assert m.plot_z() is None
399 |
400 | def test_plot_z_default_two_models(self,mock_patch):
401 | data = generate_data(randseed=3)
402 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
403 | m.centiles_normative_model()
404 | m.loess_normative_model()
405 | assert m.plot_z() is None
406 |
407 | def test_plot_z_default_no_models(self,mock_patch):
408 | data = generate_data(randseed=3)
409 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
410 | with pytest.raises(ValueError):
411 | m.plot_z()
412 |
413 | def test_plot_z_valid_subset(self,mock_patch):
414 | subset = ['Centiles','LOESS']
415 | data = generate_data(randseed=3)
416 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
417 | m.centiles_normative_model()
418 | m.loess_normative_model()
419 | assert m.plot_z(kind=subset) is None
420 |
421 | def test_plot_z_invalid_subset1(self,mock_patch):
422 | subset = ['Centiles',None]
423 | data = generate_data(randseed=3)
424 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
425 | m.centiles_normative_model()
426 | with pytest.raises(ValueError):
427 | m.plot_z(kind=subset)
428 |
429 | def test_plot_z_invalid_subset2(self,mock_patch):
430 | subset = ['Centiles','GAMLSS']
431 | data = generate_data(randseed=3)
432 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
433 | m.centiles_normative_model()
434 | with pytest.raises(ValueError):
435 | m.plot_z(kind=subset)
436 |
437 | class TestApprox:
438 | def test_svgp_init(self):
439 | from pynm.models.approx import SVGP
440 |
441 | data = generate_data(randseed=3)
442 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
443 | conf_mat = m._get_conf_mat()
444 | score = m._get_score()
445 |
446 | X_train,X_test,y_train,y_test = train_test_split(conf_mat, score)
447 | svgp = SVGP(X_train,X_test,y_train,y_test)
448 |
449 | def test_svgp_train(self):
450 | from pynm.models.approx import SVGP
451 |
452 | data = generate_data(randseed=3)
453 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
454 | conf_mat = m._get_conf_mat()
455 | score = m._get_score()
456 |
457 | X_train,X_test,y_train,y_test = train_test_split(conf_mat, score)
458 | svgp = SVGP(X_train,X_test,y_train,y_test)
459 | svgp.train(num_epochs = 2)
460 |
461 | assert len(svgp.loss) == 2
462 |
463 | def test_svgp_predict(self):
464 | from pynm.models.approx import SVGP
465 |
466 | data = generate_data(randseed=3)
467 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
468 | conf_mat = m._get_conf_mat()
469 | score = m._get_score()
470 |
471 | X_train,X_test,y_train,y_test = train_test_split(conf_mat, score)
472 | svgp = SVGP(X_train,X_test,y_train,y_test)
473 | svgp.train(num_epochs = 2)
474 | means,sigmas = svgp.predict()
475 |
476 | assert means.size(0) == y_test.shape[0]
477 | assert sigmas.size(0) == y_test.shape[0]
478 |
479 | def test_svgp_model(self):
480 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
481 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
482 | m.gp_normative_model(method='approx')
483 |
484 | assert 'GP_pred' in m.data.columns
485 | assert math.isclose(0, m.data['GP_residuals'].mean(), abs_tol=0.5)
486 |
487 | class TestGAMLSS:
488 | def test_get_r_formulas(self):
489 | from pynm.models import gamlss
490 |
491 | g = gamlss.GAMLSS(mu='score ~ 1')
492 | mu,sigma,_,_ = g._get_r_formulas('score ~ cs(age) + site',None,None,None)
493 | #assert not isinstance(mu,str)
494 | assert mu == 'score ~ cs(age) + site'
495 | assert sigma == '~ 1'
496 |
497 | def test_gamlss(self):
498 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
499 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
500 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)')
501 | assert 'GAMLSS_pred' in m.data.columns
502 |
503 | def test_gamlss_smse(self):
504 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
505 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
506 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)')
507 | assert m.SMSE_GAMLSS > 0
508 |
509 | def test_gamlss_default_formulas(self):
510 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
511 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
512 | m.gamlss_normative_model()
513 | assert 'GAMLSS_pred' in m.data.columns
514 |
515 | def test_gamlss_invalid_init(self):
516 | from pynm.models import gamlss
517 |
518 | with pytest.raises(ValueError):
519 | gamlss.GAMLSS()
520 |
521 | def test_gamlss_nan_issue(self):
522 | df = generate_data(n_sites=4,sample_size=35,randseed=650)
523 | #Initialize pynm w/ data and confounds
524 | m = pynm.PyNM(df,'score','group',['age','c(sex)','c(site)'])
525 | m.loess_normative_model()
526 | m.centiles_normative_model()
527 | m.gamlss_normative_model(mu='score ~ ps(age) + c(sex) + c(site)',sigma = '~ age',family='SHASHo2')
528 |
529 | def test_gamlss_random_effect(self):
530 | df = generate_data(n_sites=4,sample_size=35,randseed=650)
531 | #Initialize pynm w/ data and confounds
532 | m = pynm.PyNM(df,'score','group',
533 | confounds = ['age','c(sex)','c(site)'])
534 | m.gamlss_normative_model(mu='score ~ ps(age) + c(sex) + random(as.factor(site))',sigma = '~ ps(age)',family='SHASHo2',method='mixed(10,50)')
535 |
536 | def test_gamlss_random_effect_not_converged(self):
537 | #TODO: Force example where algorithm not converged warning gets thrown
538 | df = generate_data(n_sites=4,sample_size=35,randseed=650)
539 | #Initialize pynm w/ data and confounds
540 | m = pynm.PyNM(df,'score','group',
541 | confounds = ['age','c(sex)','c(site)'])
542 | m.gamlss_normative_model(mu='score ~ ps(age) + c(sex) + random(as.factor(site))',sigma = '~ ps(age)',family='SHASHo2',method='RS')
543 |
544 | def test_gamlss_bad_formula(self):
545 | df = generate_data(n_sites=4,sample_size=35,randseed=650)
546 | #Initialize pynm w/ data and confounds
547 | m = pynm.PyNM(df,'score','group',
548 | confounds = ['age','c(sex)','c(site)'])
549 | with pytest.raises(ValueError):
550 | m.gamlss_normative_model(mu='score ~ xxx(age) + c(sex) + c(site)',family='SHASHo2')
551 |
552 | class TestCV:
553 | def test_cv_1_loess(self):
554 | data = generate_data(randseed=11)
555 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
556 | m.loess_normative_model()
557 | assert math.isclose(2.3482, np.sum(m.data.LOESS_z), abs_tol=0.00001)
558 |
559 | def test_cv_3_loess(self):
560 | data = generate_data(n_sites=1,sample_size=100,randseed=650)
561 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
562 | m.loess_normative_model(cv_folds=3)
563 | assert not np.isnan(m.RMSE_LOESS)
564 | assert not np.isnan(m.SMSE_LOESS)
565 |
566 | def test_cv_1_centiles(self):
567 | data = generate_data(randseed=11)
568 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
569 | m.centiles_normative_model()
570 | assert np.sum(m.data.Centiles) == 446
571 |
572 | def test_cv_3_centiles(self):
573 | data = generate_data(n_sites=1,sample_size=100,randseed=650)
574 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5)
575 | m.centiles_normative_model(cv_folds=3)
576 | assert not np.isnan(m.RMSE_Centiles)
577 | assert not np.isnan(m.SMSE_Centiles)
578 |
579 | def test_cv_1_gp(self):
580 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
581 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
582 | m.gp_normative_model()
583 | assert 'GP_pred' in m.data.columns
584 | assert math.isclose(0,m.data['GP_residuals'].mean(),abs_tol=0.5)
585 |
586 | def test_cv_3_gp(self):
587 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
588 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
589 | m.gp_normative_model(cv_folds=3)
590 | assert 'GP_pred' in m.data.columns
591 | assert math.isclose(0,m.data['GP_residuals'].mean(),abs_tol=0.5)
592 |
593 | def test_cv_1_svgp(self):
594 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
595 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
596 | m.gp_normative_model(method='approx')
597 |
598 | assert 'GP_pred' in m.data.columns
599 | assert math.isclose(0, m.data['GP_residuals'].mean(), abs_tol=0.5)
600 |
601 | def test_cv_3_svgp(self):
602 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
603 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
604 | m.gp_normative_model(method='approx',cv_folds=3,num_epochs=3)
605 |
606 | assert 'GP_pred' in m.data.columns
607 | assert math.isclose(0, m.data['GP_residuals'].mean(), abs_tol=0.5)
608 |
609 | def test_cv_1_gamlss(self):
610 | data = generate_data(sample_size=4, n_sites=2, randseed=3)
611 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
612 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)')
613 | assert 'GAMLSS_pred' in m.data.columns
614 |
615 | def test_cv_3_gamlss(self):
616 | data = generate_data(sample_size=5, n_sites=2, randseed=3)
617 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'])
618 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)',cv_folds=3)
619 | assert 'GAMLSS_pred' in m.data.columns
--------------------------------------------------------------------------------
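The suite above can be run with pytest; a minimal sketch using pytest's Python entry point
(equivalent to invoking pytest on the file from a shell). The GAMLSS and SVGP tests additionally
require the optional R/gamlss and GPyTorch dependencies noted in the model docstrings:

import pytest

pytest.main(['-q', 'test/test_pynm.py'])
--------------------------------------------------------------------------------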
/tutorials/image1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image1.jpg
--------------------------------------------------------------------------------
/tutorials/image2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image2.jpg
--------------------------------------------------------------------------------
/tutorials/image3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image3.jpg
--------------------------------------------------------------------------------
/tutorials/image4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image4.jpg
--------------------------------------------------------------------------------
/tutorials/image5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image5.jpg
--------------------------------------------------------------------------------