├── .binder
│   └── environment.yml
├── .github
│   └── workflows
│       └── draft_pdf.yml
├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── paper
│   ├── figure1.png
│   ├── paper.bib
│   └── paper.md
├── pynm
│   ├── __init__.py
│   ├── cli.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── approx.py
│   │   ├── centiles.py
│   │   ├── gamlss.py
│   │   └── loess.py
│   ├── pynm.py
│   └── util.py
├── pynm_logo.png
├── pynm_models.png
├── requirements.txt
├── setup.py
├── test
│   ├── __init__.py
│   └── test_pynm.py
└── tutorials
    ├── 0-why_normative_modeling.ipynb
    ├── 1-getting_started.ipynb
    ├── 2-multivariate_confounds.ipynb
    ├── 3-big_data.ipynb
    ├── 4-complex_data.ipynb
    ├── 5-model_selection.ipynb
    ├── 6-downstream_analyses.ipynb
    ├── image1.jpg
    ├── image2.jpg
    ├── image3.jpg
    ├── image4.jpg
    └── image5.jpg

/.binder/environment.yml:
--------------------------------------------------------------------------------
1 | name: python 3.9
2 | 
3 | channels:
4 |   - conda-forge
5 | 
6 | dependencies:
7 |   - python=3.9
8 |   - r-base=4.2
9 |   - r-tidyverse
10 |   - r-gamlss
11 |   - r-gamlss.dist
12 |   - r-gamlss.data
13 |   - pip:
14 |     - pynm
15 | 
--------------------------------------------------------------------------------
/.github/workflows/draft_pdf.yml:
--------------------------------------------------------------------------------
1 | on: [push]
2 | 
3 | jobs:
4 |   paper:
5 |     runs-on: ubuntu-latest
6 |     name: Paper Draft
7 |     steps:
8 |       - name: Checkout
9 |         uses: actions/checkout@v2
10 |       - name: Build draft PDF
11 |         uses: openjournals/openjournals-draft-action@master
12 |         with:
13 |           journal: joss
14 |           # This should be the path to the paper within your repo.
15 |           paper-path: paper/paper.md
16 |       - name: Upload
17 |         uses: actions/upload-artifact@v1
18 |         with:
19 |           name: paper
20 |           # This is the output path where Pandoc will write the compiled
21 |           # PDF. Note, this should be the same directory as the input
22 |           # paper.md
23 |           path: paper/paper.pdf
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | #vscode
107 | .vscode/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "3.6"
4 |   - "3.7"
5 | before_install:
6 |   - sudo apt-get update
7 | # command to install dependencies
8 | install:
9 |   # - pip install -r requirements.txt #(should work from install_requires)
10 |   - pip install .
11 |   - pip install pytest
12 | # command to run tests
13 | script:
14 |   - pytest test/test_pynm.py::TestBasic
15 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to PyNM
2 | Your input is very welcome! Please do not hesitate to contribute to this project, whether it's:
3 | 
4 | - Reporting a bug
5 | - Discussing the current state of the code
6 | - Submitting a fix
7 | - Proposing new features
8 | - Becoming a maintainer
9 | 
10 | ## We Develop with GitHub
11 | We use GitHub to host code, track issues and feature requests, and accept pull requests.
12 | 
13 | ## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
14 | Pull requests are the best way to propose changes to the codebase (we use [GitHub Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests:
15 | 
16 | 1. Fork the repo and create your branch from `main`.
17 | 2. If you've added code that should be tested, add tests.
18 | 3. If you've changed APIs, update the documentation.
19 | 4. Ensure the test suite passes (Travis).
20 | 5. Make sure your code lints (PEP8).
21 | 6. Issue the pull request!
22 | 
23 | ## Any contributions you make will be under the 3-clause BSD License
24 | In short, when you submit code changes, your submissions are understood to be under the same [3-clause BSD License](https://opensource.org/licenses/BSD-3-Clause) that covers the project. Feel free to contact the maintainers if that's a concern.
25 | 
26 | ## Report bugs using GitHub's [issues](https://github.com/ppsp-team/PyNM/issues)
27 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/ppsp-team/PyNM/issues/new/choose); it's that easy!
28 | 
29 | ## Write bug reports with detail, background, and sample code
30 | 
31 | **Great Bug Reports** tend to have:
32 | 
33 | - A quick summary and/or background
34 | - Specific steps to reproduce, with sample code if you can.
35 | - What you expected would happen
36 | - What actually happens
37 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
38 | 
39 | Thanks in advance for taking the time to write **Great** Bug Reports!
40 | 
41 | ## Use a Consistent Coding Style
42 | 
43 | * 4 spaces for indentation rather than tabs
44 | * [PEP 8 Style Guide for Python Code](https://pep8.org/)
45 | * [Numpydoc](https://numpydoc.readthedocs.io/en/latest/) for docstrings
46 | 
47 | ## License
48 | By contributing, you agree that your contributions will be licensed under the project's [3-clause BSD License](https://opensource.org/licenses/BSD-3-Clause).
49 | 
50 | ## References
51 | This document was adapted from [these open-source contribution guidelines](https://gist.github.com/briandk/3d2e8b3ec8daf5a27a62).
52 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2019, Guillaume Dumas
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![PyNM Logo](pynm_logo.png)
2 | 
3 | [![PyPI version shields.io](https://img.shields.io/pypi/v/pynm.svg)](https://pypi.org/project/pynm/) [![license](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
4 | 
5 | PyNM is a lightweight Python implementation of Normative Modeling, making it approachable and easy to adopt. The package provides:
6 | 
7 | - Python API and a command-line interface for wide accessibility
8 | - Automatic dataset splitting and cross-validation
9 | - Five models from various back-ends in a unified interface that cover a broad range of common use cases
10 |   - Centiles
11 |   - LOESS
12 |   - Gaussian Process (GP)
13 |   - Stochastic Variational Gaussian Process (SVGP)
14 |   - Generalized Additive Models for Location, Scale and Shape (GAMLSS)
15 | - Solutions for very large datasets and heteroskedastic data
16 | - Integrated plotting and evaluation functions to quickly check the validity of the model fit and results
17 | - Comprehensive and interactive tutorials
18 | 
19 | The tutorials can be accessed without any local installation via binder: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ppsp-team/PyNM/HEAD)
20 | 
21 | For a more advanced implementation, see the Python library [PCNtoolkit](https://github.com/amarquand/PCNtoolkit).
22 | 
23 | ## Installation
24 | **Note**: a functional installation requires Python 3.9.
25 | 
26 | **Minimal Installation (without R)**
27 | 
28 | Use this if you aren't using the GAMLSS model and don't need to install R.
29 | 
30 | ```bash
31 | $ pip install pynm
32 | ```
33 | 
34 | **Installation with R**
35 | 
36 | Use this if you are using the GAMLSS model.
37 | - You must first have R (v4.2.2) installed, along with the packages:
38 |   - gamlss
39 |   - gamlss.dist
40 |   - gamlss.data
41 | 
42 | Instructions for installing R can be found at [r-project](https://www.r-project.org/). Once R and the `gamlss` packages are installed, install pynm:
43 | ```bash
44 | $ pip install pynm
45 | ```
46 | **Bleeding-edge Installation**
47 | 
48 | Use this if you want to be up to date with the most recent changes to PyNM (not necessarily stable). For the options above, replace `pip install pynm` with:
49 | ```bash
50 | $ git clone https://github.com/ppsp-team/PyNM.git
51 | $ cd pynm
52 | $ pip install .
53 | ```
54 | 
55 | ## Command Line Usage
56 | ```
57 | usage: pynm [-h] --pheno_p PHENO_P --out_p OUT_P --confounds CONFOUNDS --score
58 |             SCORE --group GROUP [--train_sample TRAIN_SAMPLE] [--LOESS]
59 |             [--centiles] [--bin_spacing BIN_SPACING] [--bin_width BIN_WIDTH]
60 |             [--GP] [--gp_method GP_METHOD] [--gp_num_epochs GP_NUM_EPOCHS]
61 |             [--gp_n_inducing GP_N_INDUCING] [--gp_batch_size GP_BATCH_SIZE]
62 |             [--gp_length_scale GP_LENGTH_SCALE]
63 |             [--gp_length_scale_bounds [GP_LENGTH_SCALE_BOUNDS [GP_LENGTH_SCALE_BOUNDS ...]]]
64 |             [--gp_nu NU] [--GAMLSS] [--gamlss_mu GAMLSS_MU]
65 |             [--gamlss_sigma GAMLSS_SIGMA] [--gamlss_nu GAMLSS_NU]
66 |             [--gamlss_tau GAMLSS_TAU] [--gamlss_family GAMLSS_FAMILY]
67 | 
68 | optional arguments:
69 |   -h, --help            show this help message and exit
70 |   --pheno_p PHENO_P     Path to phenotype data. Data must be in a .csv file.
71 |   --out_p OUT_P         Path to output directory.
72 |   --confounds CONFOUNDS
73 |                         List of confounds to use in the GP model. The list must
74 |                         be formatted as a string with commas between confounds;
75 |                         each confound must be a column name from the phenotype
76 |                         .csv file. For the GP model all confounds will be used;
77 |                         for LOESS and Centiles models only the first is used.
78 |                         For GAMLSS all confounds are used unless formulas are
79 |                         specified. Categorical values must be denoted by
80 |                         c(var) ('c' must be lower case), e.g. 'c(SEX)' for
81 |                         column name 'SEX'.
82 |   --score SCORE         Response variable for all models. Must be a column
83 |                         title from the phenotype .csv file.
84 |   --group GROUP         Column name from the phenotype .csv file that
85 |                         distinguishes probands from controls. The column must
86 |                         be encoded with str labels using 'PROB' for probands
87 |                         and 'CTR' for controls, or with int labels using 1 for
88 |                         probands and 0 for controls.
89 |   --train_sample TRAIN_SAMPLE
90 |                         Which method to use for a training sample: can be a
91 |                         float in (0,1] for a proportion of controls, or
92 |                         'manual' to be manually set using a column of the
93 |                         DataFrame labelled 'train_sample'.
94 |   --LOESS               Flag to run the LOESS model.
95 |   --centiles            Flag to run the Centiles model.
96 |   --bin_spacing BIN_SPACING
97 |                         Distance between bins for LOESS & Centiles models.
98 |   --bin_width BIN_WIDTH
99 |                         Width of bins for LOESS & Centiles models.
100 |   --GP                  Flag to run the Gaussian Process model.
101 |   --gp_method GP_METHOD
102 |                         Method to use for the GP model. Can be set to
103 |                         'auto', 'approx' or 'exact'. In 'auto' mode, the exact
104 |                         model will be used for datasets smaller than 2000 data
105 |                         points. SVGP is used for the approximate model. See
106 |                         documentation for details. Default value is 'auto'.
107 |   --gp_num_epochs GP_NUM_EPOCHS
108 |                         Number of training epochs for the SVGP model. See
109 |                         documentation for details. Default value is 20.
110 |   --gp_n_inducing GP_N_INDUCING
111 |                         Number of inducing points for the SVGP model. See
112 |                         documentation for details. Default value is 500.
113 |   --gp_batch_size GP_BATCH_SIZE
114 |                         Batch size for training and predicting from the SVGP
115 |                         model. See documentation for details. Default value is
116 |                         256.
117 |   --gp_length_scale GP_LENGTH_SCALE
118 |                         Length scale of the Matern kernel for the exact model.
119 |                         See documentation for details. Default value is 1.
120 |   --gp_length_scale_bounds [GP_LENGTH_SCALE_BOUNDS [GP_LENGTH_SCALE_BOUNDS ...]]
121 |                         The lower and upper bound on length_scale. If set to
122 |                         'fixed', length_scale cannot be changed during
123 |                         hyperparameter tuning. See documentation for details.
124 |                         Default value is (1e-5,1e5).
125 |   --gp_nu NU            Nu of the Matern kernel for the exact and SVGP models.
126 |                         See documentation for details. Default value is 2.5.
127 |   --GAMLSS              Flag to run GAMLSS.
128 |   --gamlss_mu GAMLSS_MU
129 |                         Formula for the mu (location) parameter of GAMLSS. Default
130 |                         formula for score is the sum of confounds, with non-
131 |                         categorical columns as smooth functions, e.g. 'score ~
132 |                         ps(age) + sex'.
133 |   --gamlss_sigma GAMLSS_SIGMA
134 |                         Formula for the sigma (scale) parameter of GAMLSS. Default
135 |                         formula is '~ 1'.
136 |   --gamlss_nu GAMLSS_NU
137 |                         Formula for the nu (skewness) parameter of GAMLSS. Default
138 |                         formula is '~ 1'.
139 |   --gamlss_tau GAMLSS_TAU
140 |                         Formula for the tau (kurtosis) parameter of GAMLSS. Default
141 |                         formula is '~ 1'.
142 |   --gamlss_family GAMLSS_FAMILY
143 |                         Family of distributions to use for fitting; default is
144 |                         'SHASHo2'. See the R documentation for the GAMLSS package
145 |                         for other available families of distributions.
146 | ```
147 | ## API Example
148 | ```python
149 | import pandas as pd
150 | from pynm.pynm import PyNM
151 | # Load data
152 | df = pd.read_csv('data.csv')
153 | 
154 | # Initialize pynm w/ data and confounds
155 | m = PyNM(df, 'score', 'group', confounds=['age', 'c(sex)', 'c(site)'])
156 | 
157 | # Run models
158 | m.loess_normative_model()
159 | m.centiles_normative_model()
160 | m.gp_normative_model()
161 | m.gamlss_normative_model()
162 | 
163 | # Collect output
164 | data = m.data
165 | ```
166 | 
167 | ## Documentation
168 | 
169 | All the functions have standard Python docstrings that you can summon with `help()`. You can also see the [tutorials](https://github.com/ppsp-team/PyNM/tree/master/tutorials) for documented examples.
170 | 
171 | ### Training sample
172 | By default, the models are fit on all the controls in the dataset and prediction is then done on the entire dataset. The residuals (scores of the normative model) are then calculated as the difference between the actual value and the predicted value for each subject. This paradigm is not meant for situations in which the residuals will then be used in a prediction setting, since any train/test split stratified by proband/control will have information from the training set leaked into the test data.
173 | 
174 | In order to avoid contaminating the test set, in a prediction setting it is important to fit the normative model on a subset of the controls and then leave those out. This is implemented in PyNM with the `--train_sample` flag. It can be set to:
175 | 1. A number in (0,1]
176 |    - This is the simplest usage: it defines the sample size, and PyNM will select a random sample of the controls to use as a training group. The number is the proportion of controls to use; the default value is 1, which uses the full set of controls.
177 |    - The subjects used in the sample are recorded in the column `'train_sample'` of the resulting PyNM.data object. Subjects used in the training sample are encoded as 1s, and the rest as 0s.
178 | 2. `'manual'`
179 |    - It is also possible to specify exactly which subjects to use as a training group by providing a column in the input data labeled `'train_sample'`, encoded the same way. A sketch of this follows below.
180 | 
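Below is a minimal pandas sketch of building the manual `'train_sample'` column (the 80/20 split and the fixed random seed are illustrative choices, not PyNM defaults):

```python
import pandas as pd

# Mark 80% of the controls (group == 0) for model fitting; everyone
# else, including all probands, stays out of the training sample.
df = pd.read_csv('data.csv')
controls = df[df['group'] == 0]
train_idx = controls.sample(frac=0.8, random_state=42).index

df['train_sample'] = 0
df.loc[train_idx, 'train_sample'] = 1
```

PyNM can then be pointed at this column with the `'manual'` option (`--train_sample manual` on the command line).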
181 | ### Models
182 | #### Centiles and LOESS Models
183 | Both the Centiles and LOESS models are non-parametric models based on local approximations. They accept only a single explanatory variable (confound), passed using the `conf` option.
184 | 
185 | #### Gaussian Process Model
186 | Gaussian Process Regression (GPR), which underpins the Gaussian Process Model, can accept an arbitrary number of explanatory variables, passed using the `confounds` option. Note: in order for GPR to be effective, the data must be homoskedastic. For a full discussion see [this paper](https://www.biorxiv.org/content/10.1101/2021.05.11.443565v1.full).
187 | 
188 | GPR is very demanding in both memory and computation time. In order to have a scalable method, we've implemented both an exact model for smaller datasets and an approximate method, recommended for datasets over ~1000 subjects. The method can be specified using the `method` option; it defaults to `auto`, in which the approximate model is chosen for datasets over 1000.
189 | 
190 | ##### Exact Model
191 | The exact model implements [scikit-learn](https://scikit-learn.org/stable/index.html)'s Gaussian Process Regressor. The kernel is composed of a constant kernel, a white noise kernel, and a Matern kernel. The Matern kernel has parameters `nu` and `length_scale` that can be specified. The parameter `nu` has special values at 1.5 and 2.5; using other values will significantly increase computation time. See [documentation](https://scikit-learn.org/stable/modules/gaussian_process.html) for an overview of both.
192 | 
193 | ##### Approximate Model
194 | The approximate model implements a Stochastic Variational Gaussian Process (SVGP) model using [GPyTorch](https://gpytorch.ai/), with a kernel closely matching the one in the exact model. SVGP is a deep learning technique that needs to be trained on minibatches for a set number of epochs; this can be tuned with the parameters `batch_size` and `num_epoch`. The model speeds up computation by using a subset of the data as inducing points; how many points to use is controlled with the parameter `n_inducing`. See [documentation](https://docs.gpytorch.ai/en/v1.1.1/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html) for an overview.
195 | 
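The sketch below shows how both variants might be invoked. It is a hedged example, not a verbatim API reference: the keyword names (`method`, `num_epochs`, `n_inducing`, `batch_size`, `nu`, `length_scale`) are assumptions mirroring the CLI flags documented above, so check `help(PyNM.gp_normative_model)` for the exact signature.

```python
from pynm.pynm import PyNM

m = PyNM(df, 'score', 'group', confounds=['age', 'c(sex)'])

# Smaller dataset: exact scikit-learn GPR with a Matern kernel.
# Keyword names are assumed from the CLI flags; verify with help().
m.gp_normative_model(method='exact', nu=2.5, length_scale=1.0)

# Larger dataset: approximate SVGP model (GPyTorch back-end).
m.gp_normative_model(method='approx', num_epochs=20,
                     n_inducing=500, batch_size=256)
```
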
196 | #### GAMLSS
197 | Generalized Additive Models for Location, Scale and Shape (GAMLSS) are a flexible modeling framework that can model heteroskedasticity, non-linear effects of variables, and hierarchical structure in the data. The implementation here is a Python wrapper for the R package gamlss; formulas for each parameter must be specified using functions available in that package (see [documentation](https://cran.r-project.org/web/packages/gamlss/index.html)). For a full discussion of using GAMLSS for normative modeling, see [this paper](https://doi.org/10.1101/2021.06.14.448106).
198 | 
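A hedged sketch of fitting a GAMLSS with explicit formulas. The default formula style and the `SHASHo2` family come from the CLI documentation above; the keyword names (`mu`, `sigma`, `family`) are assumptions mirroring the CLI flags, so check `help(PyNM.gamlss_normative_model)` for the exact signature.

```python
from pynm.pynm import PyNM

m = PyNM(df, 'score', 'group', confounds=['age', 'c(sex)'])

# Location (mu) varies smoothly with age; letting sigma depend on age
# as well allows the model to capture heteroskedastic data.
# Keyword names are assumed from the CLI flags; verify with help().
m.gamlss_normative_model(mu='score ~ ps(age) + sex',
                         sigma='~ ps(age)',
                         family='SHASHo2')
```
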
199 | ![Available Models](pynm_models.png)
200 | 
201 | ## References
202 | 
203 | Original papers with Gaussian Processes (GP):
204 | - Marquand et al. Biological Psychiatry 2016 [doi:10.1016/j.biopsych.2015.12.023](https://doi.org/10.1016/j.biopsych.2015.12.023)
205 | - Marquand et al. Molecular Psychiatry 2019 [doi:10.1038/s41380-019-0441-1](https://doi.org/10.1038/s41380-019-0441-1)
206 | 
207 | For limitations of Gaussian Processes:
208 | - Xu et al. PLoS ONE 2021, [The pitfalls of using Gaussian Process Regression for normative modeling](https://doi.org/10.1371/journal.pone.0252108)
209 | 
210 | Examples of use of the LOESS approach:
211 | - Lefebvre et al. Front. Neurosci. 2018 [doi:10.3389/fnins.2018.00662](https://doi.org/10.3389/fnins.2018.00662)
212 | - Maruani et al. Front. Psychiatry 2019 [doi:10.3389/fpsyt.2019.00011](https://doi.org/10.3389/fpsyt.2019.00011)
213 | 
214 | For the Centiles approach see:
215 | - Bethlehem et al. Communications Biology 2020 [doi:10.1038/s42003-020-01212-9](https://doi.org/10.1038/s42003-020-01212-9)
216 | - R implementation [here](https://github.com/rb643/Normative_modeling).
217 | 
218 | For the SVGP model see:
219 | - Hensman et al. [https://arxiv.org/pdf/1411.2005.pdf](https://arxiv.org/pdf/1411.2005.pdf)
220 | 
221 | For GAMLSS see:
222 | - Dinga et al. [https://doi.org/10.1101/2021.06.14.448106](https://doi.org/10.1101/2021.06.14.448106)
223 | - R documentation [https://cran.r-project.org/web/packages/gamlss/index.html](https://cran.r-project.org/web/packages/gamlss/index.html)
224 | 
225 | ## How to run tests
226 | 
227 | To test the code locally, first make sure R and the required packages are installed, then follow the instructions above under **Installation: Bleeding-edge Installation**. Finally, run:
228 | 
229 | ```bash
230 | $ pip install -r requirements.txt
231 | $ pytest test/test_pynm.py
232 | ```
233 | 
234 | ## How to report errors
235 | 
236 | Did you spot a bug :beetle:? Check out the [open issues](https://github.com/ppsp-team/PyNM/issues) to see if we're already working on it. If not, open a new issue and we will check it out when we can!
237 | 
238 | ## How to contribute
239 | 
240 | Thank you for considering contributing to our project! Before getting involved, please review our [contribution guidelines](https://github.com/ppsp-team/PyNM/blob/master/CONTRIBUTING.md).
241 | 
242 | ## Support
243 | 
244 | This work is supported by [IVADO](https://ivado.ca/), [FRQS](http://www.frqs.gouv.qc.ca/en/), [CFI](https://www.innovation.ca/), [MITACS](https://www.mitacs.ca/en), and [Compute Canada](https://computecanada.ca).
245 | 
--------------------------------------------------------------------------------
/paper/figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/paper/figure1.png
--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
1 | @Article{marquand:2019,
2 | author={Marquand, Andre F.
3 | and Kia, Seyed Mostafa
4 | and Zabihi, Mariam
5 | and Wolfers, Thomas
6 | and Buitelaar, Jan K.
7 | and Beckmann, Christian F.},
8 | title={Conceptualizing mental disorders as deviations from normative functioning},
9 | journal={Molecular Psychiatry},
10 | year={2019},
11 | month={Oct},
12 | day={01},
13 | volume={24},
14 | number={10},
15 | pages={1415-1424},
16 | abstract={Normative models are a class of emerging statistical techniques useful for understanding the heterogeneous biology underlying psychiatric disorders at the level of the individual participant. Analogous to normative growth charts used in paediatric medicine for plotting child development in terms of height or weight as a function of age, normative models chart variation in clinical cohorts in terms of mappings between quantitative biological measures and clinically relevant variables. An emerging body of literature has demonstrated that such techniques are excellent tools for parsing the heterogeneity in clinical cohorts by providing statistical inferences at the level of the individual participant with respect to the normative range. Here, we provide a unifying review of the theory and application of normative modelling for understanding the biological and clinical heterogeneity underlying mental disorders. We first provide a statistically grounded yet non-technical overview of the conceptual underpinnings of normative modelling and propose a conceptual framework to link the many different methodological approaches that have been proposed for this purpose. We survey the literature employing these techniques, focusing principally on applications of normative modelling to quantitative neuroimaging-based biomarkers in psychiatry and, finally, we provide methodological considerations and recommendations to guide future applications of these techniques. We show that normative modelling provides a means by which the importance of modelling individual differences can be brought from theory to concrete data analysis procedures for understanding heterogeneous mental disorders and ultimately a promising route towards precision medicine in psychiatry.},
17 | issn={1476-5578},
18 | doi={10.1038/s41380-019-0441-1},
19 | url={https://doi.org/10.1038/s41380-019-0441-1}
20 | }
21 | 
22 | @Article{marquand:2016,
23 | author={Marquand, Andre F.
24 | and Rezek, Iead 25 | and Buitelaar, Jan 26 | and Beckmann, Christian F.}, 27 | title={Understanding Heterogeneity in Clinical Cohorts Using Normative Models: Beyond Case-Control Studies}, 28 | journal={Biological psychiatry}, 29 | year={2016}, 30 | month={Oct}, 31 | day={01}, 32 | edition={2016/01/06}, 33 | publisher={Elsevier}, 34 | volume={80}, 35 | number={7}, 36 | pages={552-561}, 37 | keywords={*Gaussian process; *Heterogeneity; *Normative model; *Outlier detection; *Patient stratification; *Research Domain Criteria; Adult; Attention Deficit Disorder with Hyperactivity/*diagnosis; Brain/physiology; Case-Control Studies; Cluster Analysis; *Data Interpretation, Statistical; Delay Discounting/physiology; Female; Functional Neuroimaging; Humans; Impulsive Behavior/physiology; Magnetic Resonance Imaging; Male; *Models, Statistical; Reward; Young Adult}, 38 | abstract={BACKGROUND: Despite many successes, the case-control approach is problematic in biomedical science. It introduces an artificial symmetry whereby all clinical groups (e.g., patients and control subjects) are assumed to be well defined, when biologically they are often highly heterogeneous. By definition, it also precludes inference over the validity of the diagnostic labels. In response, the National Institute of Mental Health Research Domain Criteria proposes to map relationships between symptom dimensions and broad behavioral and biological domains, cutting across diagnostic categories. However, to date, Research Domain Criteria have prompted few methods to meaningfully stratify clinical cohorts. METHODS: We introduce normative modeling for parsing heterogeneity in clinical cohorts, while allowing predictions at an individual subject level. This approach aims to map variation within the cohort and is distinct from, and complementary to, existing approaches that address heterogeneity by employing clustering techniques to fractionate cohorts. To demonstrate this approach, we mapped the relationship between trait impulsivity and reward-related brain activity in a large healthy cohort (N = 491). RESULTS: We identify participants who are outliers within this distribution and show that the degree of deviation (outlier magnitude) relates to specific attention-deficit/hyperactivity disorder symptoms (hyperactivity, but not inattention) on the basis of individualized patterns of abnormality. CONCLUSIONS: Normative modeling provides a natural framework to study disorders at the individual participant level without dichotomizing the cohort. Instead, disease can be considered as an extreme of the normal range or as-possibly idiosyncratic-deviation from normal functioning. 
It also enables inferences over the degree to which behavioral variables, including diagnostic labels, map onto biology.}, 39 | note={26927419[pmid]}, 40 | note={PMC5023321[pmcid]}, 41 | note={S0006-3223(16)00002-0[PII]}, 42 | issn={1873-2402}, 43 | doi={10.1016/j.biopsych.2015.12.023}, 44 | url={https://pubmed.ncbi.nlm.nih.gov/26927419}, 45 | url={https://doi.org/10.1016/j.biopsych.2015.12.023}, 46 | language={eng} 47 | } 48 | 49 | @Article{loth:2021, 50 | doi = {10.1371/journal.pcbi.1009477}, 51 | author = {Loth, Eva 52 | and Ahmad, Jumana 53 | and Chatham, Chris 54 | and López, Beatriz 55 | and Carter, Ben 56 | and Crawley, Daisy 57 | and Oakley, Bethany 58 | and Hayward, Hannah 59 | and Cooke, Jennifer 60 | and San José Cáceres, Antonia 61 | and Bzdok, Danilo 62 | and Jones, Emily 63 | and Charman, Tony 64 | and Beckmann, Christian 65 | and Bourgeron, Thomas 66 | and Toro, Roberto 67 | and Buitelaar, Jan 68 | and Murphy, Declan 69 | and Dumas, Guillaume}, 70 | journal = {PLOS Computational Biology}, 71 | publisher = {Public Library of Science}, 72 | title = {The meaning of significant mean group differences for biomarker discovery}, 73 | year = {2021}, 74 | month = {11}, 75 | volume = {17}, 76 | url = {https://doi.org/10.1371/journal.pcbi.1009477}, 77 | pages = {1-16}, 78 | abstract = {Over the past decade, biomarker discovery has become a key goal in psychiatry to aid in the more reliable diagnosis and prognosis of heterogeneous psychiatric conditions and the development of tailored therapies. Nevertheless, the prevailing statistical approach is still the mean group comparison between “cases” and “controls,” which tends to ignore within-group variability. In this educational article, we used empirical data simulations to investigate how effect size, sample size, and the shape of distributions impact the interpretation of mean group differences for biomarker discovery. We then applied these statistical criteria to evaluate biomarker discovery in one area of psychiatric research—autism research. Across the most influential areas of autism research, effect size estimates ranged from small (d = 0.21, anatomical structure) to medium (d = 0.36 electrophysiology, d = 0.5, eye-tracking) to large (d = 1.1 theory of mind). We show that in normal distributions, this translates to approximately 45% to 63% of cases performing within 1 standard deviation (SD) of the typical range, i.e., they do not have a deficit/atypicality in a statistical sense. For a measure to have diagnostic utility as defined by 80% sensitivity and 80% specificity, Cohen’s d of 1.66 is required, with still 40% of cases falling within 1 SD. However, in both normal and nonnormal distributions, 1 (skewness) or 2 (platykurtic, bimodal) biologically plausible subgroups may exist despite small or even nonsignificant mean group differences. This conclusion drastically contrasts the way mean group differences are frequently reported. Over 95% of studies omitted the “on average” when summarising their findings in their abstracts (“autistic people have deficits in X”), which can be misleading as it implies that the group-level difference applies to all individuals in that group. 
We outline practical approaches and steps for researchers to explore mean group comparisons for the discovery of stratification biomarkers.}, 79 | number = {11}, 80 | } 81 | 82 | @Article{xu:2021, 83 | doi = {10.1371/journal.pone.0252108}, 84 | author = {Xu, Bohan 85 | and Kuplicki, Rayus 86 | and Sen, Sandip 87 | and Paulus, Martin P.}, 88 | journal = {PLOS ONE}, 89 | publisher = {Public Library of Science}, 90 | title = {The pitfalls of using Gaussian Process Regression for normative modeling}, 91 | year = {2021}, 92 | month = {09}, 93 | volume = {16}, 94 | url = {https://doi.org/10.1371/journal.pone.0252108}, 95 | pages = {1-14}, 96 | abstract = {Normative modeling, a group of methods used to quantify an individual’s deviation from some expected trajectory relative to observed variability around that trajectory, has been used to characterize subject heterogeneity. Gaussian Processes Regression includes an estimate of variable uncertainty across the input domain, which at face value makes it an attractive method to normalize the cohort heterogeneity where the deviation between predicted value and true observation is divided by the derived uncertainty directly from Gaussian Processes Regression. However, we show that the uncertainty directly from Gaussian Processes Regression is irrelevant to the cohort heterogeneity in general.}, 97 | number = {9}, 98 | } 99 | 100 | @Misc{pcntoolkit, 101 | author = {Andre F. Marquand 102 | and Saige Rutherford 103 | and Seyed Mostafa Kia 104 | and Thomas Wolfers 105 | and Charlotte Fraza 106 | and Richard Dinga 107 | and Mariam Zabihi}, 108 | title = {PCNToolkit (0.20)}, 109 | year = {2021}, 110 | publisher = {Zenodo}, 111 | doi = {10.5281/zenodo.5207839}, 112 | } 113 | 114 | @InProceedings{kia:2020, 115 | author="Kia, Seyed Mostafa 116 | and Huijsdens, Hester 117 | and Dinga, Richard 118 | and Wolfers, Thomas 119 | and Mennes, Maarten 120 | and Andreassen, Ole A. 121 | and Westlye, Lars T. 122 | and Beckmann, Christian F. 123 | and Marquand, Andre F.", 124 | editor="Martel, Anne L. 125 | and Abolmaesumi, Purang 126 | and Stoyanov, Danail 127 | and Mateus, Diana 128 | and Zuluaga, Maria A. 129 | and Zhou, S. Kevin 130 | and Racoceanu, Daniel 131 | and Joskowicz, Leo", 132 | title="Hierarchical Bayesian Regression for Multi-site Normative Modeling of Neuroimaging Data", 133 | booktitle="Medical Image Computing and Computer Assisted Intervention -- MICCAI 2020", 134 | year="2020", 135 | publisher="Springer International Publishing", 136 | address="Cham", 137 | pages="699--709", 138 | abstract="Clinical neuroimaging has recently witnessed explosive growth in data availability which brings studying heterogeneity in clinical cohorts to the spotlight. Normative modeling is an emerging statistical tool for achieving this objective. However, its application remains technically challenging due to difficulties in properly dealing with nuisance variation, for example due to variability in image acquisition devices. Here, in a fully probabilistic framework, we propose an application of hierarchical Bayesian regression (HBR) for multi-site normative modeling. Our experimental results confirm the superiority of HBR in deriving more accurate normative ranges on large multi-site neuroimaging data compared to widely used methods. 
This provides the possibility i) to learn the normative range of structural and functional brain measures on large multi-site data; ii) to recalibrate and reuse the learned model on local small data; therefore, HBR closes the technical loop for applying normative modeling as a medical tool for the diagnosis and prognosis of mental disorders.", 139 | isbn="978-3-030-59728-3" 140 | } 141 | 142 | @Article {kia:2021, 143 | author = {Kia, Seyed Mostafa 144 | and Huijsdens, Hester 145 | and Rutherford, Saige 146 | and Dinga, Richard 147 | and Wolfers, Thomas 148 | and Mennes, Maarten 149 | and Andreassen, Ole A. 150 | and Westlye, Lars T. 151 | and Beckmann, Christian F. 152 | and Marquand, Andre F.}, 153 | title = {Federated Multi-Site Normative Modeling using Hierarchical Bayesian Regression}, 154 | elocation-id = {2021.05.28.446120}, 155 | year = {2021}, 156 | doi = {10.1101/2021.05.28.446120}, 157 | publisher = {Cold Spring Harbor Laboratory}, 158 | abstract = {Clinical neuroimaging data availability has grown substantially in the last decade, providing the potential for studying heterogeneity in clinical cohorts on a previously unprecedented scale. Normative modeling is an emerging statistical tool for dissecting heterogeneity in complex brain disorders. However, its application remains technically challenging due to medical data privacy issues and difficulties in dealing with nuisance variation, such as the variability in the image acquisition process. Here, we introduce a federated probabilistic framework using hierarchical Bayesian regression (HBR) for multi-site normative modeling. The proposed method completes the life-cycle of normative modeling by providing the possibilities to learn, update, and adapt the model parameters on decentralized neuroimaging data. Our experimental results confirm the superiority of HBR in deriving more accurate normative ranges on large multi-site neuroimaging datasets compared to the current standard methods. In addition, our approach provides the possibility to recalibrate and reuse the learned model on local datasets and even on datasets with very small sample sizes. The proposed federated framework closes the technical loop for applying normative modeling across multiple sites in a decentralized manner. This will facilitate applications of normative modeling as a medical tool for screening the biological deviations in individuals affected by complex illnesses such as mental disorders.Competing Interest StatementOle A. Andreassen is a consultant to HealthLytix and received a speaker honorarium from Lundbeck. Christian F. Beckmann is a shareholder and director of SBG Neuro.}, 159 | URL = {https://www.biorxiv.org/content/early/2021/05/30/2021.05.28.446120}, 160 | eprint = {https://www.biorxiv.org/content/early/2021/05/30/2021.05.28.446120.full.pdf}, 161 | journal = {bioRxiv} 162 | } 163 | 164 | @Article {dinga:2021, 165 | author = {Dinga, Richard 166 | and Fraza, Charlotte J. 167 | and Bayer, Johanna M.M. 168 | and Kia, Seyed Mostafa 169 | and Beckmann, Christian F. 
170 | and Marquand, Andre F.}, 171 | title = {Normative modeling of neuroimaging data using generalized additive models of location scale and shape}, 172 | elocation-id = {2021.06.14.448106}, 173 | year = {2021}, 174 | doi = {10.1101/2021.06.14.448106}, 175 | publisher = {Cold Spring Harbor Laboratory}, 176 | abstract = {Normative modeling aims to quantify the degree to which an individual{\textquoteright}s brain deviates from a reference sample with respect to one or more variables, which can be used as a potential biomarker of a healthy brain and as a tool to study heterogeneity of psychiatric disorders. The application of normative models is hindered by methodological challenges and lacks standards for the usage and evaluation of normative models. In this paper, we present generalized additive models for location scale and shape (GAMLSS) for normative modeling of neuroimaging data, a flexible modeling framework that can model heteroskedasticity, non-linear effects of variables, and hierarchical structure of the data. It can model non-Gaussian distributions, and it allows for an automatic model order selection, thus improving the accuracy of normative models while mitigating problems of overfitting. Furthermore, we describe measures and diagnostic tools suitable for evaluating normative models and step-by-step examples of normative modeling, including fitting several candidate models, selecting the best models, and transferring them to new scan sites.Competing Interest StatementThe authors have declared no competing interest.}, 177 | URL = {https://www.biorxiv.org/content/early/2021/06/14/2021.06.14.448106}, 178 | eprint = {https://www.biorxiv.org/content/early/2021/06/14/2021.06.14.448106.full.pdf}, 179 | journal = {bioRxiv} 180 | } 181 | 182 | @Article{fraza:2021, 183 | title = {Warped Bayesian linear regression for normative modelling of big data}, 184 | journal = {NeuroImage}, 185 | volume = {245}, 186 | pages = {118715}, 187 | year = {2021}, 188 | issn = {1053-8119}, 189 | doi = {10.1016/j.neuroimage.2021.118715}, 190 | url = {https://www.sciencedirect.com/science/article/pii/S1053811921009873}, 191 | author = {Charlotte J. Fraza 192 | and Richard Dinga 193 | and Christian F. Beckmann 194 | and Andre F. Marquand}, 195 | keywords = {Machine learning, UK Biobank, Big data, Bayesian linear regression, Normative modelling}, 196 | abstract = {Normative modelling is becoming more popular in neuroimaging due to its ability to make predictions of deviation from a normal trajectory at the level of individual participants. It allows the user to model the distribution of several neuroimaging modalities, giving an estimation for the mean and centiles of variation. With the increase in the availability of big data in neuroimaging, there is a need to scale normative modelling to big data sets. However, the scaling of normative models has come with several challenges. So far, most normative modelling approaches used Gaussian process regression, and although suitable for smaller datasets (up to a few thousand participants) it does not scale well to the large cohorts currently available and being acquired. Furthermore, most neuroimaging modelling methods that are available assume the predictive distribution to be Gaussian in shape. However, deviations from Gaussianity can be frequently found, which may lead to incorrect inferences, particularly in the outer centiles of the distribution. 
In normative modelling, we use the centiles to give an estimation of the deviation of a particular participant from the ‘normal’ trend. Therefore, especially in normative modelling, the correct estimation of the outer centiles is of utmost importance, which is also where data are sparsest. Here, we present a novel framework based on Bayesian linear regression with likelihood warping that allows us to address these problems, that is, to correctly model non-Gaussian predictive distributions and scale normative modelling elegantly to big data cohorts. In addition, this method provides likelihood-based statistics, which are useful for model selection. To evaluate this framework, we use a range of neuroimaging-derived measures from the UK Biobank study, including image-derived phenotypes (IDPs) and whole-brain voxel-wise measures derived from diffusion tensor imaging. We show good computational scaling and improved accuracy of the warped BLR for certain IDPs and voxels if there was a deviation from normality of these parameters in their residuals. The present results indicate the advantage of a warped BLR in terms of; computational scalability and the flexibility to incorporate non-linearity and non-Gaussianity of the data, giving a wider range of neuroimaging datasets that can be correctly modelled.} 197 | } 198 | 199 | @Article{rutherford:2022a, 200 | article_type = {journal}, 201 | title = {Charting brain growth and aging at high spatial precision}, 202 | author = {Rutherford, Saige 203 | and Fraza, Charlotte 204 | and Dinga, Richard 205 | and Kia, Seyed Mostafa 206 | and Wolfers, Thomas 207 | and Zabihi, Mariam 208 | and Berthet, Pierre 209 | and Worker, Amanda 210 | and Verdi, Serena 211 | and Andrews, Derek 212 | and Han, Laura KM 213 | and Bayer, Johanna MM 214 | and Dazzan, Paola 215 | and McGuire, Phillip 216 | and Mocking, Roel T 217 | and Schene, Aart 218 | and Sripada, Chandra 219 | and Tso, Ivy F 220 | and Duval, Elizabeth R 221 | and Chang, Soo-Eun 222 | and Penninx, Brenda WJH 223 | and Heitzeg, Mary M 224 | and Burt, S Alexandra 225 | and Hyde, Luke W 226 | and Amaral, David 227 | and Wu Nordahl, Christine 228 | and Andreasssen, Ole A 229 | and Westlye, Lars T 230 | and Zahn, Roland 231 | and Ruhe, Henricus G 232 | and Beckmann, Christian 233 | and Marquand, Andre F}, 234 | editor = {Baker, Chris I and Taschler, Bernd and Esteban, Oscar and Constable, Todd}, 235 | volume = 11, 236 | year = 2022, 237 | month = {feb}, 238 | pub_date = {2022-02-01}, 239 | pages = {e72904}, 240 | citation = {eLife 2022;11:e72904}, 241 | doi = {10.7554/eLife.72904}, 242 | url = {https://doi.org/10.7554/eLife.72904}, 243 | abstract = {Defining reference models for population variation, and the ability to study individual deviations is essential for understanding inter-individual variability and its relation to the onset and progression of medical conditions. In this work, we assembled a reference cohort of neuroimaging data from 82 sites (N=58,836; ages 2–100) and used normative modeling to characterize lifespan trajectories of cortical thickness and subcortical volume. Models are validated against a manually quality checked subset (N=24,354) and we provide an interface for transferring to new data sources. We showcase the clinical value by applying the models to a transdiagnostic psychiatric sample (N=1985), showing they can be used to quantify variability underlying multiple disorders whilst also refining case-control inferences. 
These models will be augmented with additional samples and imaging modalities as they become available. This provides a common reference platform to bind results from different studies and ultimately paves the way for personalized clinical decision-making.}, 244 | keywords = {normative model, lifespan, growth chart, brain chart, big data, individual prediction}, 245 | journal = {eLife}, 246 | issn = {2050-084X}, 247 | publisher = {eLife Sciences Publications, Ltd}, 248 | } 249 | 250 | @Article{rutherford:2022b, 251 | author={Rutherford, Saige 252 | and Kia, Seyed Mostafa 253 | and Wolfers, Thomas 254 | and Fraza, Charlotte 255 | and Zabihi, Mariam 256 | and Dinga, Richard 257 | and Berthet, Pierre 258 | and Worker, Amanda 259 | and Verdi, Serena 260 | and Ruhe, Henricus G. 261 | and Beckmann, Christian F. 262 | and Marquand, Andre F.}, 263 | title={The normative modeling framework for computational psychiatry}, 264 | journal={Nature Protocols}, 265 | year={2022}, 266 | month={Jul}, 267 | day={01}, 268 | volume={17}, 269 | number={7}, 270 | pages={1711-1734}, 271 | abstract={Normative modeling is an emerging and innovative framework for mapping individual differences at the level of a single subject or observation in relation to a reference model. It involves charting centiles of variation across a population in terms of mappings between biology and behavior, which can then be used to make statistical inferences at the level of the individual. The fields of computational psychiatry and clinical neuroscience have been slow to transition away from patient versus `healthy' control analytic approaches, probably owing to a lack of tools designed to properly model biological heterogeneity of mental disorders. Normative modeling provides a solution to address this issue and moves analysis away from case--control comparisons that rely on potentially noisy clinical labels. Here we define a standardized protocol to guide users through, from start to finish, normative modeling analysis using the Predictive Clinical Neuroscience toolkit (PCNtoolkit). We describe the input data selection process, provide intuition behind the various modeling choices and conclude by demonstrating several examples of downstream analyses that the normative model may facilitate, such as stratification of high-risk individuals, subtyping and behavioral predictive modeling. The protocol takes {\textasciitilde}1--3 h to complete.}, 272 | issn={1750-2799}, 273 | doi={10.1038/s41596-022-00696-5}, 274 | url={https://doi.org/10.1038/s41596-022-00696-5} 275 | } 276 | 277 | @article{rigby:2005, 278 | author = {Rigby, R. A. and Stasinopoulos, D. 
M.},
279 | title = {Generalized additive models for location, scale and shape},
280 | journal = {Journal of the Royal Statistical Society: Series C (Applied Statistics)},
281 | volume = {54},
282 | number = {3},
283 | pages = {507-554},
284 | doi = {10.1111/j.1467-9876.2005.00510.x},
285 | url = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9876.2005.00510.x},
286 | eprint = {https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-9876.2005.00510.x},
287 | year = {2005}
288 | }
289 | 
290 | @Article{lefebvre:2018,
291 | author={Lefebvre, Aline
292 | and Delorme, Richard
293 | and Delanoë, Catherine
294 | and Amsellem, Frederique
295 | and Beggiato, Anita
296 | and Germanaud, David
297 | and Bourgeron, Thomas
298 | and Toro, Roberto
299 | and Dumas, Guillaume},
300 | title={Alpha Waves as a Neuromarker of Autism Spectrum Disorder: The Challenge of Reproducibility and Heterogeneity},
301 | journal={Frontiers in Neuroscience},
302 | volume={12},
303 | year={2018},
304 | url={https://www.frontiersin.org/article/10.3389/fnins.2018.00662},
305 | doi={10.3389/fnins.2018.00662},
306 | issn={1662-453X},
307 | abstract={Background: There is no consensus in the literature concerning the presence of abnormal alpha wave profiles in patients with autism spectrum disorder (ASD). This may be due to phenotypic heterogeneity among patients as well as the limited sample sizes utilized. Here we present our results of alpha wave profile analysis based on a sample larger than most of those in the field, performed using a robust processing pipeline. Methods: We compared the alpha waves profiles at rest in children with ASD to those of age-, sex-, and IQ-matched control individuals. We used linear regression and non-parametric normative models using age as covariate for parsing the clinical heterogeneity. We explored the correlation between EEG profiles and the patient’s brain volumes, obtained from structural MRI. We automatized the detection of the alpha peak and visually quality controlled our MRI measurements. We assessed the robustness of our results by running the EEG preprocessing with two different versions of Matlab as well as Python. Results: A simple linear regression between peak power or frequency of the alpha waves and the status or age of the participants did not allow to identify any statistically significant relationship. The non-parametric normative model (which took into account the non-linear effect of age on the alpha profiles) suggested that participants with ASD displayed more variability than control participants for both frequency and amplitude of the alpha peak (p < 0.05). Independent of the status of the individual, we also observed weak associations (uncorrected p < 0.05) between the alpha frequency, and the volumes of several cortical and subcortical structures (in particular the striatum), but which did not survive correction for multiple testing and changed between analysis pipelines. Discussions: Our study did not find evidence for abnormal alpha wave profiles in ASD. We propose, however, an analysis pipeline to perform standardized and automatized EEG analyses on large cohorts.
These should help the community to address the challenge of clinical heterogeneity of ASD and to tackle the problems of reproducibility.} 308 | } 309 | 310 | @Article{maruani:2019, 311 | author={Maruani, Anna 312 | and Dumas, Guillaume 313 | and Beggiato, Anita 314 | and Traut, Nicolas 315 | and Peyre, Hugo 316 | and Cohen-Freoua, Alicia 317 | and Amsellem, Frédérique 318 | and Elmaleh, Monique 319 | and Germanaud, David 320 | and Launay, Jean-Marie 321 | and Bourgeron, Thomas 322 | and Toro, Roberto 323 | and Delorme, Richard}, 324 | title={Morning Plasma Melatonin Differences in Autism: Beyond the Impact of Pineal Gland Volume}, 325 | journal={Frontiers in Psychiatry}, 326 | volume={10}, 327 | year={2019}, 328 | url={https://www.frontiersin.org/article/10.3389/fpsyt.2019.00011}, 329 | doi={10.3389/fpsyt.2019.00011}, 330 | issn={1664-0640}, 331 | abstract={While low plasma melatonin, a neuro-hormone synthesized in the pineal gland, has been frequently associated with autism, our understanding of the mechanisms behind it have remained unclear. In this exploratory study, we hypothesized that low melatonin levels in ASD could be linked to a decrease of the pineal gland volume (PGV). PGV estimates with magnetic resonance imaging (MRI) with a voxel-based volumetric measurement method and early morning plasma melatonin levels were evaluated for 215 participants, including 78 individuals with ASD, 90 unaffected relatives, and 47 controls. We first found that both early morning melatonin level and PGV were lower in patients compared to controls. We secondly built a linear model and observed that plasma melatonin was correlated to the group of the participant, but also to the PGV. To further understand the relationship between PGV and melatonin, we generated a normative model of the PGV relationship with melatonin level based on control participant data. We found an effect of PGV on normalized melatonin levels in ASD. Melatonin deficit appeared however more related to the group of the subject. Thus, melatonin variations in ASD could be mainly driven by melatonin pathway dysregulation.} 332 | } 333 | 334 | @Article{bethlehem:2020, 335 | author={Bethlehem, Richard A. I. 336 | and Seidlitz, Jakob 337 | and Romero-Garcia, Rafael 338 | and Trakoshis, Stavros 339 | and Dumas, Guillaume 340 | and Lombardo, Michael V.}, 341 | title={A normative modelling approach reveals age-atypical cortical thickness in a subgroup of males with autism spectrum disorder}, 342 | journal={Communications Biology}, 343 | year={2020}, 344 | month={Sep}, 345 | day={04}, 346 | volume={3}, 347 | number={1}, 348 | pages={486}, 349 | abstract={Understanding heterogeneity is an important goal on the path to precision medicine for autism spectrum disorders (ASD). We examined how cortical thickness (CT) in ASD can be parameterized as an individualized metric of atypicality relative to typically-developing (TD) age-related norms. Across a large sample (n{\thinspace}={\thinspace}870 per group) and wide age range (5--40 years), we applied normative modelling resulting in individualized whole-brain maps of age-related CT atypicality in ASD and isolating a small subgroup with highly age-atypical CT. Age-normed CT scores also highlights on-average differentiation, and associations with behavioural symptomatology that is separate from insights gleaned from traditional case-control approaches. 
This work showcases an individualized approach for understanding ASD heterogeneity that could potentially further prioritize work on a subset of individuals with cortical pathophysiology represented in age-related CT atypicality. Only a small subset of ASD individuals are actually highly atypical relative to age-norms, driving small on-average case-control differences.},
350 | issn={2399-3642},
351 | doi={10.1038/s42003-020-01212-9},
352 | url={https://doi.org/10.1038/s42003-020-01212-9}
353 | }
354 | 
--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'PyNM: a Lightweight Python implementation of Normative Modeling'
3 | tags:
4 |   - Python
5 |   - Normative Modeling
6 |   - Heterogeneity
7 |   - Heteroskedasticity
8 |   - Big Data
9 |   - Centiles
10 |   - LOESS
11 |   - Gaussian Process
12 |   - Stochastic Variational Gaussian Process
13 |   - GAMLSS
14 |   - Computational Psychiatry
15 |   - Neuroscience
16 | authors:
17 |   - name: Harvey, Annabelle
18 |     orcid: 0000-0002-9940-8799
19 |     affiliation: "1, 2"
20 |   - name: Dumas, Guillaume
21 |     orcid: 0000-0002-2253-1844
22 |     affiliation: "2, 3"
23 | affiliations:
24 |   - name: Centre de Recherche de l’Institut Universitaire de Gériatrie de Montréal, Université de Montréal, QC, Canada
25 |     index: 1
26 |   - name: Centre de Recherche du CHU Sainte-Justine, Université de Montréal, QC, Canada
27 |     index: 2
28 |   - name: Mila - Quebec AI Institute, Université de Montréal, QC, Canada
29 |     index: 3
30 | date: 10 March 2022
31 | bibliography: paper.bib
32 | ---
33 | 
34 | 
35 | # Summary
36 | 
37 | The majority of studies in neuroimaging and psychiatry are focussed on case-control analysis [@marquand:2019]. However, case-control analysis relies on well-defined groups, which are more the exception than the rule in biology. Psychiatric conditions are diagnosed based on symptoms alone, which makes for heterogeneity at the biological level [@marquand:2016]. Relying on mean differences obscures this heterogeneity, and the resulting loss of information can produce unreliable results or misleading conclusions [@loth:2021].
38 | 
39 | Normative Modeling is an emerging alternative to case-control analyses that seeks to parse heterogeneity by looking at how individuals deviate from the normal trajectory. Analogous to normative growth charts, normative models map the mean and variance of a trait for a given population against a set of explanatory variables (usually including age). Statistical inferences at the level of the individual participant can then be obtained with respect to the normative range [@marquand:2019]. This framework can detect patterns of abnormality that might not be consistent across the population, and recasts disease as an extreme deviation from the normal range rather than a separate group.
40 | 
41 | PyNM is a lightweight Python implementation of Normative Modeling, making it approachable and easy to adopt. The package provides:
The package provides:
42 | 
43 | - Python API and a command-line interface for wide accessibility
44 | - Automatic dataset splitting and cross-validation
45 | - Five models from various back-ends in a unified interface, covering a broad range of common use cases
46 | - Solutions for very large datasets and heteroskedastic data
47 | - Integrated plotting and evaluation functions to quickly check the validity of the model fit and results
48 | - Comprehensive and interactive tutorials
49 | 
50 | 
51 | # Statement of need
52 | 
53 | The basic idea underpinning Normative Modeling is to fit a model on the controls (or a subset of them) of a dataset, and then apply it to the rest of the participants. The difference between the model’s prediction and the ground truth for an unseen participant, relative to the variance around the prediction, quantifies their deviation from the normal (formalized below). While simple in concept, implementing Normative Modeling requires some care in managing the dataset and choosing an appropriate model.
54 | 
55 | In principle, any model that estimates both the mean and variance of the predictive distribution could be used for Normative Modeling. However, in practice, we impose more constraints. First and foremost, the assumptions of the model must be met by the data. Second, it is important to distinguish between epistemic and aleatoric uncertainty. Epistemic or systematic uncertainty stems from how information about the distribution is collected, whereas aleatoric uncertainty is intrinsic to the distribution and represents the true variation of the population [@xu:2021].
56 | 
57 | To the authors’ knowledge, PCNtoolkit [@pcntoolkit] is the only other available package for Normative Modeling. It implements methods that have been applied in a range of psychiatry and neuroimaging studies [@kia:2020; @kia:2021; @rutherford:2022a; @fraza:2021], and is accompanied by thorough [tutorials](https://pcntoolkit.readthedocs.io/en/latest/pages/BLR_normativemodel_protocol.html), a [forum](https://gitter.im/predictive-clinical-neuroscience/community), and a framework for Normative Modeling in computational psychiatry [@rutherford:2022b]. While PCNtoolkit offers more advanced functionality, PyNM emphasizes being lightweight and easy to use, and it implements different models than PCNtoolkit, including a wrapper for the GAMLSS package from R, which is a powerful option for Normative Modeling [@dinga:2021].
58 | 
59 | PyNM is intended to take users from their first steps in Normative Modeling to using advanced models on complex datasets. Crucially, it manages the dataset and has interactive tutorials, making it quick for new users to try the method either on their own data or on provided simulated data. The tutorials motivate the use of each model and highlight their limitations to help clarify which model is appropriate for what data, and built-in plotting and evaluation functions (\autoref{fig:Figure 1}) make it simple to check the validity of the model output. The package includes five models from various back-ends in a unified interface, including a wrapper for GAMLSS [@rigby:2005] from R that is otherwise not yet available in Python. The selected models cover many settings, including big data and heteroskedasticity.
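60 | 
61 | Concretely, all of the models report individual deviations on the same scale. Writing $y$ for a participant’s observed score, $\hat{y}$ for the model’s prediction, and $\hat{\sigma}$ for the model’s predictive standard deviation, the deviation score stored by the package (e.g., the `LOESS_z` or `GP_z` column) is the z-score
62 | 
63 | $$z = \frac{y - \hat{y}}{\hat{\sigma}}.$$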
64 | 
65 | Earlier versions of PyNM code were used in the following publications:
66 | 
67 | - @lefebvre:2018
68 | - @maruani:2019
69 | - @bethlehem:2020
70 | 
71 | # Usage Example
72 | ```
73 | import pandas as pd
74 | 
75 | from pynm.pynm import PyNM
76 | 
77 | # Load data
78 | # df contains columns 'score','group','age','sex','site'
79 | df = pd.read_csv('data.csv')
80 | 
81 | # Initialize PyNM w/ data and confounds
82 | m = PyNM(df, 'score', 'group', confounds=['age', 'c(sex)', 'c(site)'])
83 | 
84 | # Run models
85 | m.loess_normative_model()
86 | m.centiles_normative_model()
87 | m.gp_normative_model()
88 | m.gamlss_normative_model()
89 | 
90 | # Collect output
91 | data = m.data
92 | ```
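93 | 
94 | Each fitted model attaches its outputs to the returned DataFrame and to the PyNM object itself. As a brief sketch of inspecting them (using the column and attribute names assigned by the models above):
95 | ```
96 | # Deviation (z) scores added as columns by each fitted model
97 | z_scores = data[['LOESS_z', 'Centiles_z', 'GP_z']]
98 | 
99 | # Goodness-of-fit metrics stored as attributes of the PyNM object
100 | print(m.RMSE_LOESS, m.RMSE_Centiles, m.RMSE_GP)
101 | ```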
102 | 
103 | # Figures
104 | 
105 | ![Output of built-in plotting function for model fit and residuals.\label{fig:Figure 1}](figure1.png)
106 | 
107 | # Acknowledgements
108 | 
109 | The development of this code has benefited from useful discussions with Andre Marquand, Thomas Wolfers, Eva Loth, Jumana Amad, Richard Bethlehem, and Michael Lombardo. The authors also want to thank the two reviewers Saige Rutherford ([`@saigerutherford`](https://github.com/saigerutherford)) and Seyed Mostafa Kia ([`@smkia`](https://github.com/smkia)) for their insightful feedback.
110 | 
111 | Funding: This work is supported by IVADO, FRQS, CFI, MITACS, and Compute Canada.
112 | 
113 | # References
114 | 
--------------------------------------------------------------------------------
/pynm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm/__init__.py
--------------------------------------------------------------------------------
/pynm/cli.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from pynm import pynm
3 | import pandas as pd
4 | 
5 | def _cli_parser():
6 |     """Reads command line arguments and returns input specifications
7 | 
8 |     Returns
9 |     -------
10 |     dict
11 |         Parsed arguments.
12 |     """
13 |     parser = ArgumentParser()
14 |     parser.add_argument("--pheno_p",dest='pheno_p',required=True,
15 |                         help="Path to phenotype data. Data must be in a .csv file.")
16 |     parser.add_argument("--out_p",dest='out_p',required=True,
17 |                         help="Path to output file, including filename formatted as 'filename.csv'.")
18 |     parser.add_argument("--confounds",dest='confounds',required=True,
19 |                         help="List of confounds to use in the GP model. "
20 |                         "The list must be formatted as a string with commas between confounds, "
21 |                         "each confound must be a column name from the phenotype .csv file. "
22 |                         "For GP model all confounds will be used, for LOESS and Centiles models "
23 |                         "only the first is used. For GAMLSS all confounds are used "
24 |                         "unless formulas are specified. Categorical values must be denoted by c(var) "
25 |                         "('c' must be lower case), e.g. 'c(SEX)' for column name 'SEX'.")
26 |     parser.add_argument("--score",dest='score',required=True,
27 |                         help="Response variable for all models. "
28 |                         "Must be a column title from phenotype .csv file.")
29 |     parser.add_argument("--group",dest='group',required=True,
30 |                         help="Column name from the phenotype .csv file that "
31 |                         "distinguishes probands from controls. The column must be "
32 |                         "encoded with str labels using 'PROB' for probands and 'CTR' for controls "
33 |                         "or with int labels using 1 for probands and 0 for controls.")
34 |     parser.add_argument("--train_sample",default=1,dest='train_sample',
35 |                         help="Which method to use for a training sample, can be a float in (0,1] "
36 |                         "for a percentage of controls or 'manual' to be manually set using a column "
37 |                         "of the DataFrame labelled 'train_sample'.")
38 |     parser.add_argument("--LOESS",dest='LOESS',action='store_true',
39 |                         help="Flag to run LOESS model.")
40 |     parser.add_argument("--centiles",dest='centiles',action='store_true',
41 |                         help="Flag to run Centiles model.")
42 |     parser.add_argument("--bin_spacing",type=float,default=-1,dest='bin_spacing',
43 |                         help="Distance between bins for LOESS & centiles models.")
44 |     parser.add_argument("--bin_width",type=float,default=-1,dest='bin_width',
45 |                         help="Width of bins for LOESS & centiles models.")
46 |     parser.add_argument("--GP",dest='GP',action='store_true',
47 |                         help="Flag to run Gaussian Process model.")
48 |     parser.add_argument("--gp_method",default='auto',dest='gp_method',
49 |                         help="Method to use for the GP model. Can be set to "
50 |                         "'auto','approx' or 'exact'. In 'auto' mode, "
51 |                         "the exact model will be used for datasets smaller "
52 |                         "than 2000 data points. SVGP is used for the approximate model. "
53 |                         "See documentation for details. Default value is 'auto'.")
54 |     parser.add_argument("--gp_num_epochs",type=int,default=20,dest='gp_num_epochs',
55 |                         help="Number of training epochs for SVGP model. "
56 |                         "See documentation for details. Default value is 20.")
57 |     parser.add_argument("--gp_n_inducing",type=int,default=500,dest='gp_n_inducing',
58 |                         help="Number of inducing points for SVGP model. "
59 |                         "See documentation for details. Default value is 500.")
60 |     parser.add_argument("--gp_batch_size",type=int,default=256,dest='gp_batch_size',
61 |                         help="Batch size for training and predicting from SVGP model. "
62 |                         "See documentation for details. Default value is 256.")
63 |     parser.add_argument("--gp_length_scale",type=float,default=1,dest='gp_length_scale',
64 |                         help="Length scale of Matern kernel for exact model. "
65 |                         "See documentation for details. Default value is 1.")
66 |     parser.add_argument("--gp_length_scale_bounds",default=(1e-5,1e5),dest='gp_length_scale_bounds',nargs='*',
67 |                         help="The lower and upper bound on length_scale. If set to 'fixed', "
68 |                         "length_scale cannot be changed during hyperparameter tuning. "
69 |                         "See documentation for details. Default value is (1e-5,1e5).")
70 |     parser.add_argument("--gp_nu",type=float,default=2.5,dest='gp_nu',
71 |                         help="Nu of Matern kernel for exact and SVGP model. "
72 |                         "See documentation for details. Default value is 2.5.")
73 |     parser.add_argument("--GAMLSS",dest='GAMLSS',action='store_true',
74 |                         help="Flag to run GAMLSS.")
75 |     parser.add_argument("--gamlss_mu",default=None,dest='gamlss_mu',
76 |                         help="Formula for mu (location) parameter of GAMLSS. Default "
77 |                         "formula for score is sum of confounds with non-categorical "
78 |                         "columns as smooth functions, e.g. 'score ~ ps(age) + sex'.")
79 |     parser.add_argument("--gamlss_sigma",default=None,dest='gamlss_sigma',
80 |                         help="Formula for sigma (scale) parameter of GAMLSS. Default "
81 |                         "formula is '~ 1'.")
82 |     parser.add_argument("--gamlss_nu",default=None,dest='gamlss_nu',
83 |                         help="Formula for nu (skewness) parameter of GAMLSS. Default 
"
84 |                         "formula is '~ 1'.")
85 |     parser.add_argument("--gamlss_tau",default=None,dest='gamlss_tau',
86 |                         help="Formula for tau (kurtosis) parameter of GAMLSS. Default "
87 |                         "formula is '~ 1'.")
88 |     parser.add_argument("--gamlss_family",default='SHASHo2',dest='gamlss_family',
89 |                         help="Family of distributions to use for fitting, default is 'SHASHo2'. "
90 |                         "See R documentation for GAMLSS package for other available families of distributions.")
91 |     return parser.parse_args()
92 | 
93 | def get_bounds(bounds):
94 |     """Converts gp_length_scale_bounds parameter to appropriate type.
95 | 
96 |     Returns
97 |     -------
98 |     pair of floats >= 0 or 'fixed'
99 |         Appropriate argument for PyNM.gp_normative_model.
100 | 
101 |     Raises
102 |     ------
103 |     ValueError
104 |         Unrecognized argument for gp_length_scale_bounds.
105 |     """
106 |     if isinstance(bounds,list):
107 |         if len(bounds)==1 and bounds[0]=='fixed':
108 |             return 'fixed'
109 |         elif len(bounds)==2:
110 |             return (float(bounds[0]),float(bounds[1]))
111 |         else:
112 |             raise ValueError('Unrecognized argument for gp_length_scale_bounds.')
113 |     else:
114 |         return bounds
115 | 
116 | def main():
117 |     params = vars(_cli_parser())
118 | 
119 |     confounds = params['confounds'].split(',')
120 |     data = pd.read_csv(params['pheno_p'])
121 | 
122 |     m = pynm.PyNM(data,params['score'],params['group'],confounds,params['train_sample'],
123 |                   bin_spacing=params['bin_spacing'], bin_width=params['bin_width'])
124 | 
125 |     # Run models
126 |     if params['LOESS']:
127 |         m.loess_normative_model()
128 |         m.bins_num()
129 |     if params['centiles']:
130 |         m.centiles_normative_model()
131 |         m.bins_num()
132 |     if params['GP']:
133 |         gp_length_scale_bounds = get_bounds(params['gp_length_scale_bounds'])
134 |         m.gp_normative_model(length_scale=params['gp_length_scale'],
135 |                              length_scale_bounds=gp_length_scale_bounds, nu=params['gp_nu'],
136 |                              method=params['gp_method'], batch_size=params['gp_batch_size'],
137 |                              n_inducing=params['gp_n_inducing'], num_epochs=params['gp_num_epochs'])
138 |     if params['GAMLSS']:
139 |         m.gamlss_normative_model(mu=params['gamlss_mu'], sigma=params['gamlss_sigma'], nu=params['gamlss_nu'],
140 |                                  tau=params['gamlss_tau'], family=params['gamlss_family'])
141 | 
142 |     m.data.to_csv(params['out_p'], index=False)
143 | 
144 | if __name__ == "__main__":
145 |     raise RuntimeError("`pynm/cli.py` should not be run directly. Please install `pynm`.")
--------------------------------------------------------------------------------
/pynm/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm/models/__init__.py
--------------------------------------------------------------------------------
/pynm/models/approx.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import gpytorch
4 | import numpy as np
5 | import statsmodels.api as sm
6 | from tqdm import tqdm
7 | from torch.utils.data import TensorDataset, DataLoader
8 | from gpytorch.models import ApproximateGP
9 | from gpytorch.variational import CholeskyVariationalDistribution
10 | from gpytorch.variational import VariationalStrategy
11 | 
12 | class GPModel(ApproximateGP):
13 |     """ Class for GPyTorch model.
14 | 
15 |     Attributes
16 |     ----------
17 |     mean_module : gpytorch Mean
18 |         Module to calculate mean.
19 |     covar_module : gpytorch Kernel
20 |         Module to calculate covariance.
21 |     """
22 |     def __init__(self, inducing_points,nu=2.5,length_scale=1,length_scale_bounds=(1e-5,1e5)):
23 |         """ Create a GPModel object.
24 | 
25 |         Parameters
26 |         ----------
27 |         inducing_points: array
28 |             Array of inducing points.
29 |         length_scale: float, default=1
30 |             Length scale parameter of Matern kernel.
31 |         length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5)
32 |             The lower and upper bound on length_scale. If set to 'fixed', 'length_scale' cannot be changed during hyperparameter tuning.
33 |         nu: float, default=2.5
34 |             Nu parameter of Matern kernel.
35 | 
36 |         Raises
37 |         ------
38 |         ValueError
39 |             Invalid argument for length_scale_bounds
40 |         """
41 |         variational_distribution = CholeskyVariationalDistribution(inducing_points.size(0))
42 |         variational_strategy = VariationalStrategy(self, inducing_points, variational_distribution, learn_inducing_locations=True)
43 |         super(GPModel, self).__init__(variational_strategy)
44 | 
45 |         self.mean_module = gpytorch.means.ConstantMean()
46 | 
47 |         if length_scale_bounds == 'fixed':
48 |             constraint = gpytorch.constraints.Interval(length_scale - 0.001, length_scale + 0.001)  # narrow symmetric interval around the initial value, i.e. effectively fixed
49 |         elif isinstance(length_scale_bounds,tuple):
50 |             constraint = gpytorch.constraints.Interval(length_scale_bounds[0],length_scale_bounds[1])
51 |         else:
52 |             raise ValueError('Invalid argument for length_scale_bounds.')
53 |         prior = gpytorch.priors.NormalPrior(length_scale,1)
54 |         self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=nu, lengthscale_prior=prior, lengthscale_constraint=constraint))
55 | 
56 |     def forward(self, x):
57 |         """ Calculate forward pass of GPModel.
58 | 
59 |         Parameters
60 |         ----------
61 |         x: Tensor
62 |             Data tensor.
63 | 
64 |         Returns
65 |         -------
66 |         MultivariateNormal object
67 |         """
68 |         mean_x = self.mean_module(x)
69 |         covar_x = self.covar_module(x)
70 |         return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
71 | 
72 | 
73 | class SVGP:
74 |     """ Class for SVGP model.
75 | 
76 |     Attributes
77 |     ----------
78 |     train_loader: pytorch DataLoader
79 |         DataLoader for training data.
80 |     test_loader: pytorch DataLoader
81 |         DataLoader for test data.
82 |     inducing_points: array
83 |         Subset of training data to use as inducing points.
84 |     n_train: int
85 |         Number of training points.
86 |     n_test: int
87 |         Number of test points.
88 |     model: GPModel
89 |         Instance of GPModel class.
90 |     likelihood: gpytorch Likelihood
91 |         Gaussian likelihood function.
92 |     loss: list
93 |         Loss for each epoch of training.
94 |     """
95 |     def __init__(self,X_train,X_test,y_train,y_test,n_inducing=500,batch_size=256,nu=2.5,length_scale=1,length_scale_bounds=(1e-5,1e5)):
96 |         """ Create a SVGP object.
97 | 
98 |         Parameters
99 |         ----------
100 |         X_train: array
101 |             Training confounds with categorical values dummy encoded.
102 |         X_test: array
103 |             Test confounds with categorical values dummy encoded.
104 |         y_train: array
105 |             Training score/response variable.
106 |         y_test: array
107 |             Test score/response variable.
108 |         length_scale: float, default=1
109 |             Length scale parameter of Matern kernel.
110 |         length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5)
111 |             The lower and upper bound on length_scale. If set to 'fixed', 'length_scale' cannot be changed during hyperparameter tuning.
112 |         nu: float, default=2.5
113 |             Nu parameter of Matern kernel.
114 |         batch_size: int, default=256
115 |             Batch size for SVGP model training and prediction.
116 |         n_inducing: int, default=500
117 |             Number of inducing points for SVGP model.
118 |         """
119 |         # Get data in torch format, cast to double to match the model
120 |         train_x = torch.from_numpy(X_train).double().contiguous()
121 |         test_x = torch.from_numpy(X_test).double().contiguous()
122 |         train_y = torch.from_numpy(y_train).double().contiguous()
123 |         test_y = torch.from_numpy(y_test).double().contiguous()
124 | 
125 |         # Create datasets
126 |         train_dataset = TensorDataset(train_x, train_y)
127 |         test_dataset = TensorDataset(test_x, test_y)
128 | 
129 |         # Create dataloaders
130 |         self.train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
131 |         self.test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
132 |         inducing_idx = np.random.choice(np.array(range(train_x.shape[0])),size=n_inducing)
133 |         self.inducing_points = train_x[inducing_idx, :]
134 |         self.n_train = train_y.size(0)
135 |         self.n_test = test_y.size(0)
136 | 
137 |         self.model = GPModel(inducing_points=self.inducing_points,nu=nu,length_scale=length_scale,length_scale_bounds=length_scale_bounds).double()
138 |         self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
139 |         self.likelihood.initialize(noise=torch.std(train_y))  # initialize the likelihood noise from the spread of the response
140 | 
141 |         if torch.cuda.is_available():
142 |             self.model = self.model.cuda()
143 |             self.likelihood = self.likelihood.cuda()
144 | 
145 |         self.loss = []
146 | 
147 |     def train(self,num_epochs=20):
148 |         """ Trains the SVGP model.
149 | 
150 |         Parameters
151 |         ----------
152 |         num_epochs: int
153 |             Number of epochs (full passes through dataset) to train for.
154 |         """
155 |         self.model.train()
156 |         self.likelihood.train()
157 | 
158 |         optimizer = torch.optim.Adam([{'params': self.model.parameters()},{'params': self.likelihood.parameters()}], lr=0.01)
159 | 
160 |         # Loss object. We're using the VariationalELBO
161 |         mll = gpytorch.mlls.VariationalELBO(self.likelihood, self.model, num_data=self.n_train)
162 | 
163 |         epochs_iter = tqdm(range(num_epochs), desc="Epoch")
164 |         for i in epochs_iter:
165 |             # Within each iteration, we will go over each minibatch of data
166 |             minibatch_iter = tqdm(self.train_loader, desc="Minibatch", leave=False)
167 |             for x_batch, y_batch in minibatch_iter:
168 |                 optimizer.zero_grad()
169 |                 output = self.model(x_batch)
170 |                 loss = -mll(output, y_batch)
171 |                 minibatch_iter.set_postfix(loss=loss.item())
172 |                 loss.backward()
173 |                 optimizer.step()
174 |         self.loss.append(loss.item())
175 | 
176 |     def predict(self):
177 |         """ Predict from SVGP model.
178 | 
179 |         Returns
180 |         -------
181 |         array
182 |             Model predictions (mean of predictive distribution).
183 |         array
184 |             Model uncertainty (standard deviation of predictive distribution).
185 | """ 186 | self.model.eval() 187 | self.likelihood.eval() 188 | 189 | mean = torch.tensor([0.]) 190 | sigma = torch.tensor([0.]) 191 | with torch.no_grad(): 192 | for x_batch, y_batch in self.test_loader: 193 | preds = self.likelihood(self.model(x_batch)) # get likelihood variance + posterior GP variance 194 | mean = torch.cat([mean, preds.mean.cpu()]) 195 | sigma = torch.cat([sigma, torch.sqrt(preds.variance.cpu())]) 196 | mean = mean[1:] 197 | sigma = sigma[1:] 198 | return mean, sigma -------------------------------------------------------------------------------- /pynm/models/centiles.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats.mstats import mquantiles 3 | 4 | def centiles_fit(train_data,bins,bin_width): 5 | """ Fit Centiles model. 6 | 7 | Parameters 8 | ---------- 9 | train_data: array 10 | Training data for Centiles model. 11 | bins: array 12 | Bins for Centiles model. 13 | bin_width: float 14 | Width of each bin. 15 | 16 | Returns 17 | ------- 18 | array 19 | Centiles for each bin. 20 | """ 21 | z = np.zeros([bins.shape[0], 101]) # centiles 22 | 23 | for i, bin_center in enumerate(bins): 24 | mu = np.array(bin_center) # bin_center value (age or conf) 25 | bin_mask = (abs(train_data[:, :1] - mu) < 26 | bin_width) * 1. # one hot mask 27 | idx = [u for (u, v) in np.argwhere(bin_mask)] 28 | scores = train_data[idx, 1] 29 | 30 | # if more than 2 non NaN values do the model 31 | if (~np.isnan(scores)).sum() > 2: 32 | # centiles 33 | z[i, :] = mquantiles(scores, prob=np.linspace(0, 1, 101), alphap=0.4, betap=0.4) 34 | else: 35 | z[i] = np.nan 36 | 37 | return z 38 | 39 | def centiles_predict(test_data,bins,z): 40 | """ Predict from Centiles model. 41 | 42 | Parameters 43 | ---------- 44 | test_data: array 45 | Test data for Centiles model. Column 0 is confound, 1 is score. 46 | bins: array 47 | Bins for Centiles model. 48 | z: array 49 | Centiles for each bin. 50 | 51 | Returns 52 | ------- 53 | array 54 | Centile within which each subject falls. 55 | array 56 | Centiles for each subject. 57 | """ 58 | dists = [np.abs(conf - bins) for conf in test_data[:,0]] 59 | idx = [np.argmin(d) for d in dists] 60 | centiles = np.array([z[i] for i in idx]) 61 | 62 | result = np.zeros(centiles.shape[0]) 63 | max_mask = test_data[:,1] >= np.max(centiles, axis=1) 64 | min_mask = test_data[:,1] < np.min(centiles, axis=1) 65 | else_mask = ~(max_mask | min_mask) 66 | result[max_mask] = 100 67 | result[min_mask] = 0 68 | result[else_mask] = np.array([np.argmin(test_data[:,1][i] >= centiles[i]) for i in range(test_data.shape[0])])[else_mask] 69 | 70 | return result, centiles -------------------------------------------------------------------------------- /pynm/models/gamlss.py: -------------------------------------------------------------------------------- 1 | import re 2 | import rpy2.robjects as ro 3 | from rpy2.robjects.packages import importr 4 | from rpy2.robjects import numpy2ri 5 | from rpy2.robjects import pandas2ri 6 | from rpy2.robjects import r 7 | from pynm.util import read_confounds 8 | 9 | class GAMLSS: 10 | """Class for GAMLSS model. 11 | 12 | Attributes 13 | ---------- 14 | gamlss_data: R package 15 | Python imported R package. 16 | gamlss_dist: R package 17 | Python imported R package. 18 | gamlss: R package 19 | Python imported R package. 20 | mu_f: R formula 21 | Formula for mu (location) parameter. 22 | sigma_f: R formula 23 | Formula for sigma (scale) parameter. 
24 | nu_f: R formula 25 | Formula for nu (skewness) parameter. 26 | tau_f: R formula 27 | Formula for tau (kurtosis) parameter. 28 | rfamily: R object 29 | Family of distributions to use for fitting. 30 | method: str 31 | Method to fit GAMLSS. 32 | model: R object 33 | Fitted GAMLSS model. 34 | """ 35 | 36 | def __init__(self,mu=None,sigma=None,nu=None,tau=None,family='SHASHo2',method='RS',score=None,confounds=None): 37 | """Create GAMLSS object. Formulas must be written for R, using functions available in the GAMLSS package. 38 | 39 | Parameters 40 | ---------- 41 | mu: str, default=None 42 | Formula for mu (location) parameter of GAMLSS. If None, formula for score is sum of confounds 43 | with non-categorical columns as smooth functions, e.g. "score ~ ps(age) + sex". 44 | sigma: str, default=None 45 | Formula for sigma (scale) parameter of GAMLSS. If None, formula is '~ 1'. 46 | nu: str, default=None 47 | Formula for nu (skewness) parameter of GAMLSS. If None, formula is '~ 1'. 48 | tau: str, default=None 49 | Formula for tau (kurtosis) parameter of GAMLSS. If None, formula is '~ 1'. 50 | family: str,default='SHASHo2' 51 | Family of distributions to use for fitting, default is 'SHASHo2'. See R documentation for GAMLSS package for other available families of distributions. 52 | method: str, default = 'RS' 53 | Method for fitting GAMLSS. Can be 'RS' (Rigby and Stasinopoulos algorithm), 'CG' (Cole and Green algorithm) or 'mixed(n,m)' where n & m are integers. 54 | Specifying 'mixed(n,m)' will use the RS algorithm for n iterations and the CG algorithm for up to m additional iterations. 55 | score: str, default=None 56 | Label of score in DataFrame. 57 | confounds: list, default=None 58 | List of labels of confounds in DataFrame. 59 | 60 | Notes 61 | ----- 62 | If using 'random()' to model a random effect in any of the formulas, it must be passed a column of the dataframe with categorical values 63 | as a factor: e.g. 'random(as.factor(COL))'. 64 | """ 65 | numpy2ri.activate() 66 | pandas2ri.activate() 67 | 68 | self.gamlss_data = importr('gamlss.data') 69 | self.gamlss_dist = importr('gamlss.dist') 70 | self.gamlss = importr('gamlss') 71 | self.base = importr('base') 72 | 73 | self.score = score 74 | self.confounds = confounds 75 | self.mu_f,self.sigma_f,self.nu_f,self.tau_f = self._get_r_formulas(mu,sigma,nu,tau) 76 | self.family = family 77 | self.method = self._get_method(method) 78 | try: 79 | self.rfamily = r[family] 80 | except: 81 | raise ValueError("Provided family not valid, choose 'SHASHo2', 'NO' or see R documentation for GAMLSS package for other available families of distributions.") 82 | 83 | def _get_r_formulas(self,mu,sigma,nu,tau): 84 | """Convert from string input to R formula. 85 | 86 | Parameters 87 | ---------- 88 | mu: str or None 89 | Formula for mu (location) parameter of GAMLSS. If None, formula for score is sum of confounds 90 | with non-categorical columns as smooth functions, e.g. "score ~ ps(age) + sex". 91 | sigma: str or None 92 | Formula for sigma (scale) parameter of GAMLSS. If None, formula is '~ 1'. 93 | nu: str or None 94 | Formula for nu (skewness) parameter of GAMLSS. If None, formula is '~ 1'. 95 | tau: str or None 96 | Formula for tau (kurtosis) parameter of GAMLSS. If None, formula is '~ 1'. 97 | 98 | Raises 99 | ------ 100 | ValueError 101 | If any of the input strings contains a function call not recognised by the R GAMLSS package. 102 | ValueError 103 | If mu is None and either score or confounds is None. 
104 | 105 | Returns 106 | ------- 107 | R formula, R formula, R formula, R formula 108 | R formula equivalent for each input string. 109 | """ 110 | if mu is None: 111 | if (self.score is None) or (self.confounds is None): 112 | raise ValueError('If mu is None, both score and confounds must be provided i.e. not None.') 113 | _,cat = read_confounds(self.confounds) 114 | formula_conf = ['ps({})'.format(conf) for conf in self.confounds if not conf[2:-1] in cat] + cat 115 | mu = '{} ~ {}'.format(self.score,' + '.join(formula_conf)) 116 | if sigma is None: 117 | sigma = '~ 1' 118 | if nu is None: 119 | nu = '~ 1' 120 | if tau is None: 121 | tau = '~ 1' 122 | 123 | # get r functions from formulas 124 | p = re.compile(r"\w*\(") 125 | funcs = [] 126 | for s in [mu,sigma,nu,tau]: 127 | for f in p.findall(s): 128 | funcs.append(f[:-1]) 129 | 130 | for func in funcs: 131 | try: 132 | exec("{} = r['{}']".format(func,func)) 133 | except: 134 | raise ValueError("'{}' function not found in R GAMLSS package. See GAMLSS documentation for available functions.".format(func)) 135 | 136 | return mu,sigma,nu,tau 137 | 138 | def _get_method(self,method): 139 | """ Get method parameter in appropriate format for R. 140 | 141 | Raises 142 | ------ 143 | TypeError 144 | "Argument 'method' must be of type str." 145 | ValueError 146 | "Unrecognized argument for 'method'." 147 | """ 148 | if not isinstance(method,str): 149 | raise TypeError("Argument 'method' must be of type str.") 150 | 151 | pattern = re.compile(r"mixed\([0-9]*,[0-9]*\)") 152 | 153 | if method == 'RS': 154 | return 'RS()' 155 | elif method == 'CG': 156 | return 'CG()' 157 | elif pattern.match(method) is not None: 158 | return method 159 | else: 160 | raise ValueError("Unrecognized argument for 'method'.") 161 | 162 | def fit(self,train_data): 163 | """Create and fit gamlss model. 164 | 165 | Parameters 166 | ---------- 167 | train_data: DataFrame 168 | DataFrame with training data. 169 | """ 170 | ro.globalenv['train_data'] = train_data 171 | 172 | self.model = r(f'''gamlss({self.mu_f}, 173 | sigma.formula={self.sigma_f}, 174 | nu.formula={self.nu_f}, 175 | tau.formula={self.tau_f}, 176 | family={self.family}, 177 | data=train_data, 178 | method={self.method})''') 179 | 180 | def predict(self,test_data,what='mu'): 181 | """Predict from fitted gamlss model. 182 | 183 | Parameters 184 | ---------- 185 | test_data: DataFrame 186 | DataFrame with test data. 187 | what: str 188 | Which parameter to predict, can be 'mu','sigma', 'nu', or 'tau'. 189 | """ 190 | ro.globalenv['model'] = self.model 191 | ro.globalenv['test_data'] = test_data 192 | 193 | res = r(f'''predict(model,newdata=test_data,parameter="{what}")''') 194 | return res 195 | -------------------------------------------------------------------------------- /pynm/models/loess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import statsmodels.api as sm 3 | from statsmodels.sandbox.regression.predstd import wls_prediction_std 4 | 5 | def loess_fit(train_data,bins,bin_width): 6 | """ Fit LOESS model. 7 | 8 | Parameters 9 | ---------- 10 | train_data: array 11 | Training data for LOESS model. Column 0 is confound, 1 is score. 12 | bins: array 13 | Bins for LOESS model. 14 | bin_width: float 15 | Width of each bin. 16 | 17 | Returns 18 | ------- 19 | array 20 | Mean of each bin. 21 | array 22 | Standard deviation of each bin. 23 | array 24 | Confidence interval of each bin. 
25 | """ 26 | zm = np.zeros(bins.shape[0]) # mean 27 | zstd = np.zeros(bins.shape[0]) # standard deviation 28 | zci = np.zeros([bins.shape[0], 2]) # confidence interval 29 | 30 | for i, bin_center in enumerate(bins): 31 | mu = np.array(bin_center) # bin_center value (age or conf) 32 | bin_mask = (abs(train_data[:, :1] - mu) < bin_width) * 1. 33 | idx = [u for (u, v) in np.argwhere(bin_mask)] 34 | 35 | scores = train_data[idx, 1] 36 | adj_conf = train_data[idx, 0] - mu # confound relative to bin center 37 | 38 | # if more than 2 non NaN values do the model 39 | if (~np.isnan(scores)).sum() > 2: 40 | mod = sm.WLS(scores, sm.tools.add_constant(adj_conf, 41 | has_constant='add'), 42 | missing='drop', weights=bin_mask.flatten()[idx], 43 | hasconst=True).fit() 44 | zm[i] = mod.params[0] # mean 45 | 46 | # std and confidence intervals 47 | prstd,_,_ = wls_prediction_std(mod, [0, 0]) 48 | zstd[i] = prstd 49 | zci[i, :] = mod.conf_int()[0, :] # [iv_l, iv_u] 50 | 51 | else: 52 | zm[i] = np.nan 53 | zci[i] = np.nan 54 | zstd[i] = np.nan 55 | 56 | return zm, zstd, zci 57 | 58 | def loess_predict(test_data,bins,zm,zstd): 59 | """ Predict from LOESS model. 60 | 61 | Parameters 62 | ---------- 63 | test_data: array 64 | Test data for LOESS model. Column 0 is confound, 1 is score. 65 | bins: array 66 | Bins for LOESS model. 67 | zm: array 68 | Mean of each bin. 69 | zstd: array 70 | Standard deviation of each bin. 71 | 72 | Returns 73 | ------- 74 | array 75 | Mean for each subject. 76 | array 77 | Standard deviation for each subject. 78 | """ 79 | dists = [np.abs(conf - bins) for conf in test_data[:,0]] 80 | idx = [np.argmin(d) for d in dists] 81 | m = np.array([zm[i] for i in idx]) 82 | std = np.array([zstd[i] for i in idx]) 83 | 84 | return m, std -------------------------------------------------------------------------------- /pynm/pynm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # ============================================================================== 4 | # title : PyNM.py 5 | # description : Gaussian Processes, Centiles & LOESS-based normative models 6 | # author : Guillaume Dumas (Institut Pasteur/Université de Montréal) 7 | # Annabelle Harvey (Université de Montréal) 8 | # date : 2021-04-15 9 | # notes : The input dataframe column passed to --group must either 10 | # have controls marked as "CTR" and probands as "PROB", or 11 | # controls marked as 0 and probands as 1. 12 | # The --pheno_p is for the path to the input dataframe. 13 | # The --out_p flag is for the path to save the output 14 | # dataframe, including filename formatted as 'filename.csv'. 15 | # The confounds columns for the gaussian process model must 16 | # be specified using the --confounds flag. The confound for 17 | # the LOESS and centiles models must be specified using the 18 | # --conf flag. 
19 | # licence          : BSD 3-Clause License
20 | # python_version   : 3.7
21 | # ==============================================================================
22 | 
23 | import pandas as pd
24 | import numpy as np
25 | import matplotlib.pyplot as plt
26 | import seaborn as sns
27 | import warnings
28 | 
29 | from sklearn import gaussian_process
30 | from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
31 | from sklearn.model_selection import KFold
32 | from statsmodels.stats.diagnostic import het_white
33 | from statsmodels.tools.tools import add_constant
34 | from scipy import stats
35 | 
36 | from pynm.util import *
37 | from pynm.models.loess import *
38 | from pynm.models.centiles import *
39 | 
40 | class PyNM:
41 |     """ Class to run normative modeling using LOESS, Centiles, GP, or GAMLSS models.
42 | 
43 |     Attributes
44 |     ----------
45 |     data : dataframe
46 |         Dataset to fit model, must at least contain columns corresponding to 'group',
47 |         'score', and 'conf'.
48 |     score : str
49 |         Label of column from data with score (response variable).
50 |     group : str
51 |         Label of column from data that encodes whether subjects are probands or controls.
52 |     CTR : str or int
53 |         Label of controls in 'group' column can be 'CTR' or 0.
54 |     PROB: str or int
55 |         Label of probands in 'group' column can be 'PROB' or 1.
56 |     conf: str
57 |         Label of column from data with confound to use for LOESS and centiles models.
58 |     confounds: list of str
59 |         List of labels of columns from data with confounds. For GP model all confounds will be used,
60 |         for LOESS and Centiles models only the first is used. For GAMLSS all confounds are used
61 |         unless formulas are specified. Categorical values must be denoted by c(var) ('c' must be lower case).
62 |     train_sample: str or float
63 |         Which method to use for a training sample, can be 1 to use all the controls,
64 |         'manual' to be manually set, or a float in (0,1] for a percentage of controls.
65 |     bin_spacing: int
66 |         Distance between bins for LOESS & centiles models.
67 |     bin_width: float
68 |         Width of bins for LOESS & centiles models.
69 |     bins: array
70 |         Bins for the centiles and LOESS models.
71 |     bin_count: array
72 |         Number of controls in each bin.
73 |     zm: array
74 |         Mean of each bin (LOESS).
75 |     zstd: array
76 |         Standard deviation of each bin (LOESS).
77 |     zci: array
78 |         Confidence interval of each bin (LOESS).
79 |     z: array
80 |         Centiles for each bin.
81 |     RMSE_LOESS: float
82 |         RMSE of LOESS normative model
83 |     SMSE_LOESS: float
84 |         SMSE of LOESS normative model
85 |     RMSE_Centiles: float
86 |         RMSE of Centiles normative model
87 |     SMSE_Centiles: float
88 |         SMSE of Centiles normative model
89 |     RMSE_GP: float
90 |         RMSE of Gaussian Process normative model
91 |     SMSE_GP: float
92 |         SMSE of Gaussian Process normative model
93 |     MSLL_GP: float
94 |         MSLL of Gaussian Process normative model
95 |     RMSE_GAMLSS: float
96 |         RMSE of GAMLSS
97 |     SMSE_GAMLSS: float
98 |         SMSE of GAMLSS
99 |     MSLL_GAMLSS: float
100 |         MSLL of GAMLSS
101 |     """
102 | 
103 |     def __init__(self, data, score, group, confounds,
104 |                  train_sample=1, bin_spacing=-1, bin_width=-1, seed=None):
105 |         """ Create a PyNM object.
106 | 
107 |         Parameters
108 |         ----------
109 |         data : dataframe
110 |             Dataset to fit model, must at least contain columns corresponding to 'group',
111 |             'score', and 'conf'.
112 |         score : str
113 |             Label of column from data with score (response variable).
114 |         group : str
115 |             Label of column from data that encodes whether subjects are probands or controls.
116 | confounds: list of str 117 | List of labels of columns from data with confounds. For GP model all confounds will be used, 118 | for LOESS and Centiles models only the first is used. For GAMLSS all confounds are used 119 | unless formulas are specified. Categorical values must be denoted by c(var) ('c' must be lower case). 120 | train_sample: str or float, default=1 121 | Which method to use for a training sample, can be a float in (0,1] for a percentage of controls 122 | or 'manual' to be manually set using a column of the DataFrame labelled 'train_sample'. 123 | bin_spacing: int, default=-1 124 | Distance between bins for LOESS & centiles models. 125 | bin_width: float, default=-1 126 | Width of bins for LOESS & centiles models. 127 | seed: int, default=None 128 | Seed for random state generator, if None no seed is set. 129 | 130 | Raises 131 | ------ 132 | ValueError 133 | Each row of DataFrame must have a unique index. 134 | """ 135 | if data.index.nunique() != data.shape[0]: 136 | raise ValueError('Each row of DataFrame must have a unique index.') 137 | self.data = data.copy() 138 | self.score = score 139 | self.group = group 140 | self.confounds = confounds 141 | self.conf = self.confounds[0] 142 | self.train_sample = train_sample 143 | self.CTR = None 144 | self.PROB = None 145 | self.bin_spacing = bin_spacing 146 | self.bin_width = bin_width 147 | self.bins = None 148 | self.bin_count = None 149 | self.zm = None 150 | self.zstd = None 151 | self.zci = None 152 | self.z = None 153 | self.RMSE_LOESS = None 154 | self.SMSE_LOESS = None 155 | self.RMSE_Centiles = None 156 | self.SMSE_Centiles = None 157 | self.RMSE_GP = None 158 | self.SMSE_GP = None 159 | self.MSLL_GP = None 160 | self.RMSE_GAMLSS = None 161 | self.SMSE_GAMLSS = None 162 | self.MSLL_GAMLSS = None 163 | 164 | if seed is not None: 165 | np.random.seed(seed) 166 | 167 | self._set_group_names() 168 | self._set_group() 169 | 170 | def _make_train_sample(self, train_size): 171 | """ Select a subsample of controls to be used as a training sample for the normative model. 172 | 173 | Parameters 174 | ---------- 175 | train_size: float 176 | Percentage of controls to use for training. Must be in (0,1]. 177 | """ 178 | ctr_idx = self.data[self.data[self.group] == self.CTR].index.tolist() 179 | n_ctr = len(ctr_idx) 180 | n_ctr_train = max(int(train_size*n_ctr), 1) 181 | 182 | np.random.seed(1) 183 | ctr_idx_train = np.array(np.random.choice(ctr_idx, size=n_ctr_train, replace=False)) 184 | 185 | train_sample = np.zeros(self.data.shape[0]) 186 | train_sample[ctr_idx_train] = 1 187 | self.data['train_sample'] = train_sample 188 | 189 | print('Models will be fit with train sample size = {}: using {}/{} of controls.'.format(train_size, n_ctr_train, n_ctr)) 190 | 191 | def _set_group(self): 192 | """ Read the specified training sample and set the group attribute to refer to the appropriate column of data. 193 | 194 | Raises 195 | ------ 196 | ValueError 197 | With train_sample=1: Dataset has no controls for training sample. 198 | ValueError 199 | With train_sample='manual': Data has no column "train_sample". To manually specify a training sample, 200 | data .csv must contain a column "train_sample" with included subjects marked with 1 and rest as 0. 201 | ValueError 202 | With train_sample='manual': Dataset has no subjects in specified training sample. 203 | ValueError 204 | Value for train_sample not recognized. Must be either a value in (0,1] or 'manual'. 
205 |         ValueError
206 |             With train_sample float: Numerical value for train_sample must be in the range (0,1].
207 |         """
208 |         if self.train_sample == 1:
209 |             print('Models will be fit on full set of controls.')
210 |             if self.data[self.data[self.group] == self.CTR].shape[0] == 0:
211 |                 raise ValueError('Dataset has no controls for training sample.')
212 |         elif self.train_sample == 'manual':
213 |             print('Models will be fit using specified training sample.')
214 |             if 'train_sample' not in self.data.columns:
215 |                 raise ValueError('Data has no column "train_sample". To manually specify a training sample, data .csv '
216 |                                  'must contain a column "train_sample" with included subjects marked with 1 and rest as 0.')
217 |             self.group = 'train_sample'
218 |             self._set_group_names()
219 | 
220 |             if self.data[self.data[self.group] == self.CTR].shape[0] == 0:
221 |                 raise ValueError('Dataset has no subjects in specified training sample.')
222 |         else:
223 |             try:
224 |                 train_size = float(self.train_sample)
225 |             except (TypeError, ValueError):
226 |                 raise ValueError("Value for train_sample not recognized. Must be either 'manual' or a "
227 |                                  "value in (0,1].")
228 |             else:
229 |                 if (train_size > 1) or (train_size <= 0):
230 |                     raise ValueError("Numerical value for train_sample must be in the range (0,1].")
231 |                 else:
232 |                     self._make_train_sample(train_size)
233 |                     self.group = 'train_sample'
234 |                     self._set_group_names()
235 | 
236 |     def _set_group_names(self):
237 |         """ Read whether subjects in data are labeled CTR/PROB or 0/1 and set labels accordingly."""
238 |         if self.group == 'train_sample':
239 |             self.CTR = 1
240 |             self.PROB = 0
241 |         else:
242 |             labels = list(self.data[self.group].unique())
243 |             if ('CTR' in labels) or ('PROB' in labels):
244 |                 self.CTR = 'CTR'
245 |                 self.PROB = 'PROB'
246 |             else:
247 |                 self.CTR = 0
248 |                 self.PROB = 1
249 | 
250 |     def _get_masks(self):
251 |         """ Get masks from data corresponding to controls and probands.
252 | 
253 |         Returns
254 |         -------
255 |         array
256 |             Control mask: controls marked as True.
257 |         array
258 |             Proband mask: probands marked as True.
259 |         """
260 |         ctr = self.data.loc[(self.data[self.group] == self.CTR)]
261 |         ctr_mask = self.data.index.isin(ctr.index)
262 |         probands = self.data.loc[(self.data[self.group] == self.PROB)]
263 |         prob_mask = self.data.index.isin(probands.index)
264 |         return ctr_mask, prob_mask
265 | 
266 |     # Default values for age in days
267 |     def _create_bins(self):
268 |         """ Create bins for the centiles and LOESS models.
269 |         Returns
270 |         -------
271 |         array
272 |             Bins for the centiles and LOESS models.
273 |         """
274 |         min_conf = self.data[self.conf].min()
275 |         max_conf = self.data[self.conf].max()
276 | 
277 |         if self.bin_width == -1:
278 |             self.bin_width = (max_conf - min_conf)/100
279 |         if self.bin_spacing == -1:
280 |             self.bin_spacing = (max_conf - min_conf)/10
281 | 
282 |         # define the bins (according to width)
283 |         self.bins = np.arange(min_conf, max_conf + self.bin_width, self.bin_spacing)
284 |         return self.bins
285 | 
286 |     def bins_num(self):
287 |         """ Give the number of controls used in the age bin each participant is in.
288 | 
289 |         Returns
290 |         -------
291 |         array
292 |             Number of controls in each participant's bin.
293 |         """
294 |         if self.bins is None:
295 |             self._create_bins()
296 | 
297 |         dists = [np.abs(conf - self.bins) for conf in self.data[self.conf]]
298 |         idx = [np.argmin(d) for d in dists]
299 |         n_ctr = [self.bin_count[i] for i in idx]
300 |         self.data['participants'] = n_ctr
301 |         return n_ctr
302 | 
303 |     def _loess_rank(self):
304 |         """ Associate ranks to LOESS normative scores."""
305 |         self.data.loc[(self.data.LOESS_z <= -2), 'LOESS_rank'] = -2
306 |         self.data.loc[(self.data.LOESS_z > -2) &
307 |                       (self.data.LOESS_z <= -1), 'LOESS_rank'] = -1
308 |         self.data.loc[(self.data.LOESS_z > -1) &
309 |                       (self.data.LOESS_z <= +1), 'LOESS_rank'] = 0
310 |         self.data.loc[(self.data.LOESS_z > +1) &
311 |                       (self.data.LOESS_z <= +2), 'LOESS_rank'] = 1
312 |         self.data.loc[(self.data.LOESS_z > +2), 'LOESS_rank'] = 2
313 | 
314 |     def loess_normative_model(self,cv_folds=1):
315 |         """ Compute LOESS normative model.
316 | 
317 |         Parameters
318 |         ----------
319 |         cv_folds: int, default=1
320 |             How many folds of cross-validation to perform. If 1, there is no cross-validation.
321 |         """
322 |         if self.bins is None:
323 |             self._create_bins()
324 | 
325 |         # Format data
326 |         data = self.data[[self.conf, self.score]].to_numpy(dtype=np.float64)
327 | 
328 |         # Take the controls
329 |         ctr_mask, _ = self._get_masks()
330 |         ctr = data[ctr_mask]
331 | 
332 |         # Cross-validation
333 |         if cv_folds == 1:
334 |             self.zm,self.zstd,self.zci = loess_fit(ctr,self.bins,self.bin_width)
335 |             m, std = loess_predict(data,self.bins,self.zm,self.zstd)
336 | 
337 |             rmse = RMSE(self.data[self.score].values[ctr_mask],m[ctr_mask])
338 |             smse = SMSE(self.data[self.score].values[ctr_mask],m[ctr_mask])
339 | 
340 |         else:
341 |             kf = KFold(n_splits=cv_folds, shuffle=True)
342 |             rmse = []
343 |             smse = []
344 |             print(f'Starting {cv_folds} folds of CV...')
345 |             for i, (train_index, test_index) in enumerate(kf.split(ctr)):
346 |                 ctr_train, ctr_test = ctr[train_index], ctr[test_index]
347 |                 cv_zm,cv_zstd,_ = loess_fit(ctr_train,self.bins,self.bin_width)
348 |                 cv_m, _ = loess_predict(ctr_test,self.bins,cv_zm,cv_zstd)
349 |                 r = RMSE(ctr_test[:,1],cv_m)
350 |                 s = SMSE(ctr_test[:,1],cv_m)
351 |                 print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f}')
352 |                 rmse.append(r)
353 |                 smse.append(s)
354 |             print('Done!')
355 | 
356 |             rmse = np.mean(rmse)
357 |             smse = np.mean(smse)
358 |             print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f}')
359 | 
360 |             self.zm,self.zstd,self.zci = loess_fit(ctr,self.bins,self.bin_width)
361 |             m, std = loess_predict(data,self.bins,self.zm,self.zstd)
362 | 
363 |         self.data['LOESS_pred'] = m
364 |         self.data['LOESS_sigma'] = std
365 |         self.data['LOESS_residuals'] = self.data[self.score] - self.data['LOESS_pred']
366 |         self.data['LOESS_z'] = self.data['LOESS_residuals']/self.data['LOESS_sigma']
367 | 
368 |         self.RMSE_LOESS = rmse
369 |         self.SMSE_LOESS = smse
370 | 
371 |         self._loess_rank()
372 | 
373 |     def _centiles_rank(self):
374 |         """ Associate ranks to centiles associated with normative modeling."""
375 |         self.data.loc[(self.data.Centiles <= 5), 'Centiles_rank'] = -2
376 |         self.data.loc[(self.data.Centiles > 5) &
377 |                       (self.data.Centiles <= 25), 'Centiles_rank'] = -1
378 |         self.data.loc[(self.data.Centiles > 25) &
379 |                       (self.data.Centiles <= 75), 'Centiles_rank'] = 0
380 |         self.data.loc[(self.data.Centiles > 75) &
381 |                       (self.data.Centiles <= 95), 'Centiles_rank'] = 1
382 |         self.data.loc[(self.data.Centiles > 95), 'Centiles_rank'] = 2
383 | 
384 |     def centiles_normative_model(self, cv_folds=1):
385 |         """ Compute 
centiles normative model. 386 | 387 | Parameters 388 | ---------- 389 | cv_folds: int, default=1 390 | How many folds of cross-validation to perform. If 1, there is no cross-validation. 391 | """ 392 | if self.bins is None: 393 | self._create_bins() 394 | 395 | # Format data 396 | data = self.data[[self.conf, self.score]].to_numpy(dtype=np.float64) 397 | 398 | # Take the controls 399 | ctr_mask, _ = self._get_masks() 400 | ctr = data[ctr_mask] 401 | 402 | # Cross-validation 403 | if cv_folds == 1: 404 | self.z = centiles_fit(ctr,self.bins,self.bin_width) 405 | result, centiles = centiles_predict(data,self.bins,self.z) 406 | centiles_50 = np.array([centiles[i, 50] for i in range(self.data.shape[0])]) 407 | 408 | rmse = RMSE(self.data[self.score].values[ctr_mask],centiles_50[ctr_mask]) 409 | smse = SMSE(self.data[self.score].values[ctr_mask],centiles_50[ctr_mask]) 410 | 411 | else: 412 | kf = KFold(n_splits=cv_folds, shuffle=True) 413 | rmse = [] 414 | smse = [] 415 | print(f'Starting {cv_folds} folds of CV...') 416 | for i, (train_index, test_index) in enumerate(kf.split(ctr)): 417 | ctr_train, ctr_test = ctr[train_index], ctr[test_index] 418 | cv_z = centiles_fit(ctr_train,self.bins,self.bin_width) 419 | _, cv_centiles = centiles_predict(ctr_test, self.bins,cv_z) 420 | cv_50 = np.array([cv_centiles[i, 50] for i in range(ctr_test.shape[0])]) 421 | r = RMSE(ctr_test[:,1],cv_50) 422 | s = SMSE(ctr_test[:,1],cv_50) 423 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f}') 424 | rmse.append(r) 425 | smse.append(s) 426 | print('Done!') 427 | 428 | rmse = np.mean(rmse) 429 | smse = np.mean(smse) 430 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f}') 431 | 432 | self.z = centiles_fit(ctr,self.bins,self.bin_width) 433 | result, centiles = centiles_predict(data,self.bins,self.z) 434 | 435 | self.data['Centiles'] = result 436 | self.data['Centiles_5'] = np.array([centiles[i, 5] for i in range(self.data.shape[0])]) 437 | self.data['Centiles_32'] = np.array([centiles[i, 32] for i in range(self.data.shape[0])]) 438 | self.data['Centiles_pred'] = np.array([centiles[i, 50] for i in range(self.data.shape[0])]) 439 | self.data['Centiles_68'] = np.array([centiles[i, 68] for i in range(self.data.shape[0])]) 440 | self.data['Centiles_95'] = np.array([centiles[i, 95] for i in range(self.data.shape[0])]) 441 | self.data['Centiles_sigma'] = (self.data['Centiles_68'] - self.data['Centiles_32'])/2 442 | self.data['Centiles_residuals'] = self.data[self.score] - self.data['Centiles_pred'] 443 | self.data['Centiles_z'] = self.data['Centiles_residuals']/self.data['Centiles_sigma'] 444 | 445 | self.RMSE_Centiles = rmse 446 | self.SMSE_Centiles = smse 447 | 448 | self._centiles_rank() 449 | 450 | def _get_conf_mat(self): 451 | """ Get confounds properly formatted from dataframe and input list. 452 | 453 | Returns 454 | ------- 455 | array 456 | Confounds with categorical values dummy encoded. Dummy encoding keeps k-1 457 | dummies out of k categorical levels. 458 | """ 459 | conf_clean, conf_cat = read_confounds(self.confounds) 460 | conf_mat = pd.get_dummies(self.data[conf_clean], columns=conf_cat, 461 | drop_first=True) 462 | return conf_mat.to_numpy() 463 | 464 | def _get_score(self): 465 | """ Get the score from the PyNM object as an array. 466 | 467 | Raises 468 | ------ 469 | ValueError 470 | Method must be one of "auto","approx", or "exact". 471 | 472 | Returns 473 | ------- 474 | array 475 | The column of data marked by the user as 'score'. 
476 | """ 477 | return self.data[self.score].to_numpy() 478 | 479 | def _use_approx(self, method='auto'): 480 | """ Choose wether or not to use SVGP model. If method is set to 'auto' SVGP is chosen 481 | for datasets with more than 2000 points. 482 | 483 | Parameters 484 | ---------- 485 | method: str, default='auto' 486 | Which method to use, can be 'exact' for exact GP regression, 'approx' for SVGP, 487 | or 'auto' which will set the method according to the size of the data. 488 | 489 | Raises 490 | ------ 491 | ValueError 492 | Method must be one of "auto","approx", or "exact". 493 | """ 494 | if method == 'auto': 495 | if self.data.shape[0] > 2000: 496 | return True 497 | else: 498 | return False 499 | elif method == 'approx': 500 | return True 501 | elif method == 'exact': 502 | if self.data.shape[0] > 2000: 503 | warnings.warn("Exact GP model with over 2000 data points requires " 504 | "large amounts of time and memory, continuing with exact model.",Warning) 505 | return False 506 | else: 507 | raise ValueError('Method must be one of "auto","approx", or "exact".') 508 | 509 | def _test_gp_residuals(self,conf_mat): 510 | #Test normal 511 | k2, p_norm = stats.normaltest(self.data['GP_residuals']) 512 | if p_norm < 0.05: 513 | warnings.warn("The residuals are not Gaussian!") 514 | 515 | # Test heteroskedasticity 516 | exog = add_constant(conf_mat) 517 | _,p_het,_,_ = het_white((self.data['GP_residuals'])**2,exog) 518 | if p_het < 0.05: 519 | warnings.warn("The residuals are heteroskedastic!") 520 | 521 | def gp_normative_model(self, length_scale=1, nu=2.5, length_scale_bounds=(1e-5,1e5),method='auto', batch_size=256, n_inducing=500, num_epochs=20, cv_folds=1): 522 | """ Compute gaussian process normative model. Gaussian process regression is computed using 523 | the Matern Kernel with an added constant and white noise. For Matern kernel see scikit-learn documentation: 524 | https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html. 525 | 526 | Parameters 527 | ------- 528 | length_scale: float, default=1 529 | Length scale parameter of Matern kernel. 530 | nu: float, default=2.5 531 | Nu parameter of Matern kernel. 532 | length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5) 533 | The lower and upper bound on length_scale. If set to 'fixed', ‘length_scale’ cannot be changed during hyperparameter tuning. 534 | method: str, default='auto' 535 | Which method to use, can be 'exact' for exact GP regression, 'approx' for SVGP, 536 | or 'auto' which will set the method according to the size of the data. 537 | batch_size: int, default=256 538 | Batch size for SVGP model training and prediction. 539 | n_inducing: int, default=500 540 | Number of inducing points for SVGP model. 541 | num_epochs: int, default=20 542 | Number of epochs (passes through entire dataset) to train SVGP for. 543 | cv_folds: int, default=1 544 | How many folds of cross-validation to perform. If 1, there is no cross-validation. 
545 | """ 546 | # get proband and control masks 547 | ctr_mask, _ = self._get_masks() 548 | 549 | # get matrix of confounds 550 | conf_mat = self._get_conf_mat() 551 | 552 | # get score 553 | score = self._get_score() 554 | 555 | if self._use_approx(method=method): 556 | self.loss = self._svgp_normative_model(conf_mat,score,ctr_mask,nu=nu,length_scale=length_scale, length_scale_bounds=length_scale_bounds, 557 | batch_size=batch_size,n_inducing=n_inducing,num_epochs=num_epochs,cv_folds=cv_folds) 558 | 559 | else: 560 | kernel = ConstantKernel() + WhiteKernel(noise_level=1) + Matern(length_scale=length_scale, nu=nu,length_scale_bounds=length_scale_bounds) 561 | gp = gaussian_process.GaussianProcessRegressor(kernel=kernel) 562 | 563 | # Define independent and response variables 564 | y = score[ctr_mask].reshape(-1,1) 565 | X = conf_mat[ctr_mask] 566 | 567 | if cv_folds == 1: 568 | gp.fit(X, y) 569 | y_pred, sigma = gp.predict(conf_mat, return_std=True) 570 | y_true = self.data[self.score].to_numpy() 571 | 572 | # For MSLL 573 | y_train_mean = np.mean(y_true[ctr_mask]) 574 | y_train_sigma = np.std(y_true[ctr_mask]) 575 | 576 | rmse = RMSE(y_true[ctr_mask],y_pred[ctr_mask]) 577 | smse = SMSE(y_true[ctr_mask],y_pred[ctr_mask]) 578 | msll = MSLL(y_true[ctr_mask],y_pred[ctr_mask],sigma[ctr_mask],y_train_mean,y_train_sigma) 579 | else: 580 | kf = KFold(n_splits=cv_folds, shuffle=True) 581 | rmse = [] 582 | smse = [] 583 | msll = [] 584 | print(f'Starting {cv_folds} folds of CV...') 585 | for i, (train_index, test_index) in enumerate(kf.split(X)): 586 | X_train, X_test = X[train_index], X[test_index] 587 | y_train, y_test = y[train_index], y[test_index] 588 | gp.fit(X_train, y_train) 589 | y_pred, sigma = gp.predict(X_test, return_std=True) 590 | 591 | # For MSLL 592 | y_train_mean = np.mean(y_train) 593 | y_train_sigma = np.std(y_train) 594 | 595 | r = RMSE(y_test,y_pred) 596 | s = SMSE(y_test,y_pred) 597 | m = MSLL(y_test.squeeze(),y_pred.squeeze(),sigma.squeeze(),y_train_mean,y_train_sigma) 598 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f} - MSLL={m:.3f}') 599 | rmse.append(r) 600 | smse.append(s) 601 | msll.append(m) 602 | print('Done!') 603 | 604 | rmse = np.mean(rmse) 605 | smse = np.mean(smse) 606 | msll = np.mean(msll) 607 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f} - MSLL={msll:.3f}') 608 | 609 | gp.fit(X, y) 610 | y_pred, sigma = gp.predict(conf_mat, return_std=True) 611 | y_true = self.data[self.score].to_numpy().reshape(-1,1) 612 | 613 | self.data['GP_pred'] = y_pred 614 | self.data['GP_sigma'] = sigma 615 | self.data['GP_residuals'] = np.squeeze(y_true) - y_pred 616 | self.data['GP_z'] = self.data['GP_residuals'] / self.data['GP_sigma'] 617 | 618 | self.RMSE_GP = rmse 619 | self.SMSE_GP = smse 620 | self.MSLL_GP = msll 621 | 622 | self._test_gp_residuals(conf_mat) 623 | 624 | def _svgp_normative_model(self,conf_mat,score,ctr_mask,nu=2.5,length_scale=1,length_scale_bounds=(1e-5,1e5), 625 | batch_size=256,n_inducing=500,num_epochs=20,cv_folds=1): 626 | """ Compute SVGP model. See GPyTorch documentation for further details: 627 | https://docs.gpytorch.ai/en/v1.1.1/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html#Creating-a-SVGP-Model. 628 | 629 | Parameters 630 | ---------- 631 | conf_mat: array 632 | Confounds with categorical values dummy encoded. 633 | score: array 634 | Score/response variable. 635 | ctr_mask: array 636 | Mask (boolean array) with controls marked True. 637 | length_scale: float, default=1 638 | Length scale parameter of Matern kernel. 
639 | length_scale_bounds: pair of floats >= 0 or 'fixed', default=(1e-5, 1e5)
640 | The lower and upper bound on length_scale. If set to 'fixed', 'length_scale' cannot be changed during hyperparameter tuning.
641 | nu: float, default=2.5
642 | Nu parameter of Matern kernel.
643 | batch_size: int, default=256
644 | Batch size for SVGP model training and prediction.
645 | n_inducing: int, default=500
646 | Number of inducing points for SVGP model.
647 | num_epochs: int, default=20
648 | Number of epochs (passes through entire dataset) to train SVGP for.
649 | cv_folds: int, default=1
650 | How many folds of cross-validation to perform. If 1, there is no cross-validation.
651 | 
652 | Raises
653 | ------
654 | ImportError
655 | GPyTorch or its dependencies aren't installed.
656 | 
657 | Returns
658 | -------
659 | array
660 | Loss per epoch of training (useful for monitoring convergence).
661 | """
662 | try:
663 | from pynm.models.approx import SVGP
664 | except ImportError:
665 | raise ImportError("GPyTorch and its dependencies must be installed to use the SVGP model.")
666 | else:
667 | if cv_folds == 1:
668 | svgp = SVGP(conf_mat[ctr_mask],conf_mat,score[ctr_mask],score,n_inducing=n_inducing,batch_size=batch_size,nu=nu,
669 | length_scale=length_scale,length_scale_bounds=length_scale_bounds)
670 | 
671 | svgp.train(num_epochs=num_epochs)
672 | means, sigma = svgp.predict()
673 | 
674 | y_pred = means.numpy()
675 | y_true = score
676 | residuals = (y_true - y_pred).astype(float)
677 | 
678 | # For MSLL
679 | y_train_mean = np.mean(y_true[ctr_mask])
680 | y_train_sigma = np.std(y_true[ctr_mask])
681 | 
682 | rmse = RMSE(y_true[ctr_mask],y_pred[ctr_mask])
683 | smse = SMSE(y_true[ctr_mask],y_pred[ctr_mask])
684 | msll = MSLL(y_true[ctr_mask],y_pred[ctr_mask],sigma.numpy()[ctr_mask],y_train_mean,y_train_sigma)
685 | 
686 | else:
687 | X = conf_mat[ctr_mask]
688 | y = score[ctr_mask]
689 | 
690 | kf = KFold(n_splits=cv_folds, shuffle=True)
691 | rmse = []
692 | smse = []
693 | msll = []
694 | print(f'Starting {cv_folds} folds of CV...')
695 | for i, (train_index, test_index) in enumerate(kf.split(X)):
696 | X_train, X_test = X[train_index], X[test_index]
697 | y_train, y_test = y[train_index], y[test_index]
698 | 
699 | # For MSLL
700 | y_train_mean = np.mean(y_train)
701 | y_train_sigma = np.std(y_train)
702 | 
703 | cv_svgp = SVGP(X_train,X_test,y_train,y_test,n_inducing=n_inducing,batch_size=batch_size,nu=nu,
704 | length_scale=length_scale,length_scale_bounds=length_scale_bounds)
705 | 
706 | cv_svgp.train(num_epochs=num_epochs)
707 | cv_means, cv_sigma = cv_svgp.predict()
708 | 
709 | cv_y_pred = cv_means.numpy()
710 | cv_residuals = (y_test - cv_y_pred).astype(float)
711 | 
712 | r = RMSE(y_test,cv_y_pred)
713 | s = SMSE(y_test,cv_y_pred)
714 | m = MSLL(y_test,cv_y_pred,cv_sigma.numpy(),y_train_mean,y_train_sigma)
715 | 
716 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f} - MSLL={m:.3f}')
717 | rmse.append(r)
718 | smse.append(s)
719 | msll.append(m)
720 | print('Done!')
721 | 
722 | rmse = np.mean(rmse)
723 | smse = np.mean(smse)
724 | msll = np.mean(msll)
725 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f} - MSLL={msll:.3f}')
726 | 
727 | svgp = SVGP(conf_mat[ctr_mask],conf_mat,score[ctr_mask],score,n_inducing=n_inducing,batch_size=batch_size,nu=nu,
728 | length_scale=length_scale,length_scale_bounds=length_scale_bounds)
729 | 
730 | svgp.train(num_epochs=num_epochs)
731 | means, sigma = svgp.predict()
732 | 
733 | y_pred = means.numpy()
734 | y_true = score
735 | residuals = (y_true - y_pred).astype(float)
736 | 
737 | 
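# Editorial sketch (not part of the original source): the deviance score written
# into the dataframe below is z_i = (y_i - mu_i) / sigma_i, i.e. the residual
# scaled by the model's predictive uncertainty. For example, an observed score of
# 5.0 with predicted mean 4.0 and predictive sigma 0.5 gives z = (5.0 - 4.0) / 0.5 = 2.0.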
self.data['GP_pred'] = y_pred
738 | self.data['GP_sigma'] = sigma.numpy()
739 | self.data['GP_residuals'] = residuals
740 | self.data['GP_z'] = self.data['GP_residuals']/self.data['GP_sigma']
741 | 
742 | self.RMSE_GP = rmse
743 | self.SMSE_GP = smse
744 | self.MSLL_GP = msll
745 | 
746 | 
747 | def gamlss_normative_model(self,mu=None,sigma=None,nu=None,tau=None,family='SHASHo2',method='RS',cv_folds=1):
748 | """Compute GAMLSS normative model.
749 | 
750 | Parameters
751 | ----------
752 | mu: str or None
753 | Formula for mu (location) parameter of GAMLSS. If None, the formula for the score is the sum of the confounds,
754 | with non-categorical columns entered as smooth functions, e.g. "score ~ ps(age) + sex".
755 | sigma: str or None
756 | Formula for sigma (scale) parameter of GAMLSS. If None, formula is '~ 1'.
757 | nu: str or None
758 | Formula for nu (skewness) parameter of GAMLSS. If None, formula is '~ 1'.
759 | tau: str or None
760 | Formula for tau (kurtosis) parameter of GAMLSS. If None, formula is '~ 1'.
761 | family: str, default='SHASHo2'
762 | Family of distributions to use for fitting. See R documentation for the GAMLSS package for other available families of distributions.
763 | method: str, default='RS'
764 | Method for fitting GAMLSS. Can be 'RS' (Rigby and Stasinopoulos algorithm), 'CG' (Cole and Green algorithm) or 'mixed(n,m)' where n & m are integers.
765 | Specifying 'mixed(n,m)' will use the RS algorithm for n iterations and the CG algorithm for up to m additional iterations.
766 | cv_folds: int, default=1
767 | How many folds of cross-validation to perform. If 1, there is no cross-validation.
768 | 
769 | Notes
770 | -----
771 | If using 'random()' to model a random effect in any of the formulas, it must be passed a column of the dataframe with categorical values
772 | as a factor: e.g. 'random(as.factor(COL))'.
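
Examples
--------
A minimal usage sketch (an editorial illustration, not from the original
source; it assumes a PyNM instance `m` built from a DataFrame with
'score', 'age' and 'sex' columns):

>>> m.gamlss_normative_model(mu='score ~ ps(age) + c(sex)', sigma='~ age')
>>> m.data[['GAMLSS_pred', 'GAMLSS_sigma', 'GAMLSS_z']].head()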
773 | """
774 | try:
775 | from pynm.models.gamlss import GAMLSS
776 | except ImportError:
777 | raise ImportError("R and the GAMLSS package must be installed to use the GAMLSS model; see documentation for installation help.")
778 | else:
779 | # get proband and control masks
780 | ctr_mask, _ = self._get_masks()
781 | 
782 | gamlss = GAMLSS(mu=mu,sigma=sigma,nu=nu,tau=tau,family=family,method=method,
783 | score=self.score,confounds=self.confounds)
784 | 
785 | nan_cols = ['LOESS_pred','LOESS_residuals','LOESS_z','LOESS_rank','LOESS_sigma',
786 | 'Centiles_pred','Centiles_residuals','Centiles_z','Centiles','Centiles_rank','Centiles_sigma',
787 | 'Centiles_95','Centiles_5','Centiles_32','Centiles_68']
788 | gamlss_data = self.data[[c for c in self.data.columns if c not in nan_cols]]
789 | 
790 | if cv_folds == 1:
791 | gamlss.fit(gamlss_data[ctr_mask])
792 | 
793 | mu_pred = gamlss.predict(gamlss_data,what='mu')
794 | sigma_pred = gamlss.predict(gamlss_data,what='sigma')
795 | 
796 | # For MSLL
797 | y_train_mean = np.mean(self.data[self.score].values[ctr_mask])
798 | y_train_sigma = np.std(self.data[self.score].values[ctr_mask])
799 | 
800 | rmse = RMSE(self.data[self.score].values[ctr_mask],mu_pred[ctr_mask])
801 | smse = SMSE(self.data[self.score].values[ctr_mask],mu_pred[ctr_mask])
802 | msll = MSLL(self.data[self.score].values[ctr_mask],mu_pred[ctr_mask],sigma_pred[ctr_mask],
803 | y_train_mean, y_train_sigma)
804 | 
805 | else:
806 | X = gamlss_data[ctr_mask]
807 | kf = KFold(n_splits=cv_folds, shuffle=True)
808 | rmse = []
809 | smse = []
810 | msll = []
811 | print(f'Starting {cv_folds} folds of CV...')
812 | for i, (train_index, test_index) in enumerate(kf.split(X)):
813 | X_train, X_test = X.iloc[train_index], X.iloc[test_index]
814 | 
815 | # For MSLL (computed on the training fold; train_index indexes X, not self.data)
816 | y_train_mean = np.mean(X_train[self.score].values)
817 | y_train_sigma = np.std(X_train[self.score].values)
818 | 
819 | gamlss.fit(X_train)
820 | 
821 | cv_mu_pred = gamlss.predict(X_test,what='mu')
822 | cv_sigma_pred = gamlss.predict(X_test,what='sigma')
823 | 
824 | r = RMSE(X_test[self.score].values,cv_mu_pred)
825 | s = SMSE(X_test[self.score].values,cv_mu_pred)
826 | m = MSLL(X_test[self.score].values,cv_mu_pred,cv_sigma_pred,y_train_mean,y_train_sigma)
827 | print(f'CV Fold {i}: RMSE={r:.3f} - SMSE={s:.3f} - MSLL={m:.3f}')
828 | rmse.append(r)
829 | smse.append(s)
830 | msll.append(m)
831 | print('Done!')
832 | 
833 | rmse = np.mean(rmse)
834 | smse = np.mean(smse)
835 | msll = np.mean(msll)
836 | print(f'Average: RMSE={rmse:.3f} - SMSE={smse:.3f} - MSLL={msll:.3f}')
837 | 
838 | gamlss.fit(gamlss_data[ctr_mask])
839 | 
840 | mu_pred = gamlss.predict(gamlss_data,what='mu')
841 | sigma_pred = gamlss.predict(gamlss_data,what='sigma')
842 | 
843 | self.data['GAMLSS_pred'] = mu_pred
844 | self.data['GAMLSS_sigma'] = sigma_pred
845 | self.data['GAMLSS_residuals'] = self.data[self.score] - self.data['GAMLSS_pred']
846 | self.data['GAMLSS_z'] = self.data['GAMLSS_residuals']/self.data['GAMLSS_sigma']
847 | 
848 | self.RMSE_GAMLSS = rmse
849 | self.SMSE_GAMLSS = smse
850 | self.MSLL_GAMLSS = msll
851 | 
852 | def report(self):
853 | """ Prints the values of each metric (SMSE, RMSE, MSLL) for the models that have been run.
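
A sketch of the output format (values are illustrative only, not from the
original source):

    ------
    Report
    ------
    ========= SMSE - RMSE - MSLL
    LOESS     0.31 1.12 N/A
    Centiles  0.29 1.05 N/A
    GP        0.27 0.98 -0.85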
854 | """
855 | print("------\nReport\n------")
856 | models = []
857 | for m in ['LOESS','Centiles','GP','GAMLSS']:
858 | if '{}_pred'.format(m) in self.data.columns:
859 | models.append(m)
860 | if len(models)==0:
861 | print('No models have been run.')
862 | return
863 | 
864 | print("========= SMSE - RMSE - MSLL")
865 | for m in models:
866 | k = 9 - len(m)
867 | m_formatted = m + k*' '
868 | smse = np.round(getattr(self, f"SMSE_{m}"),2)
869 | if np.isnan(smse):
870 | smse = 'NaN '
871 | rmse = np.round(getattr(self, f"RMSE_{m}"),2)
872 | if np.isnan(rmse):
873 | rmse = 'NaN '
874 | msll = 'N/A'
875 | if (m == 'GP') or (m == 'GAMLSS'):
876 | msll = np.round(getattr(self, f"MSLL_{m}"),2)
877 | print(f"{m_formatted} {smse} {rmse} {msll}")
878 | 
879 | def _plot(self, ax,kind=None,gp_xaxis=None,gamlss_xaxis=None):
880 | """ Plot the data with the normative model overlaid.
881 | 
882 | Parameters
883 | ----------
884 | ax: matplotlib axis
885 | Axis on which to plot.
886 | kind: str, default=None
887 | Type of plot among "LOESS" (local polynomial), "Centiles", "GP" (gaussian processes),
888 | or "GAMLSS" (generalized additive models of location scale and shape).
889 | gp_xaxis: str,default=None
890 | Which confound to use for xaxis of GP plot. If set to None, first confound in list passed to model will be used.
891 | gamlss_xaxis: str,default=None
892 | Which confound to use for xaxis of GAMLSS plot. If set to None, first confound in list passed to model will be used.
893 | 
894 | Returns
895 | -------
896 | Axis
897 | Handle for the matplotlib axis of the plot.
898 | """
899 | if kind is None:
900 | sns.scatterplot(data=self.data, x=self.conf, y=self.score,
901 | hue=self.group, style=self.group,ax=ax)
902 | elif kind == 'LOESS':
903 | sns.scatterplot(data=self.data, x=self.conf, y=self.score,
904 | hue=self.group, style=self.group,ax=ax)
905 | tmp=self.data.sort_values(self.conf)
906 | ax.plot(tmp[self.conf], tmp['LOESS_pred'], '-k',label='Prediction')
907 | ax.plot(tmp[self.conf], tmp['LOESS_pred'] - 1.96*tmp['LOESS_sigma'], '--k')
908 | ax.plot(tmp[self.conf], tmp['LOESS_pred'] + 1.96*tmp['LOESS_sigma'], '--k',label='95% CI')
909 | handles, labels = ax.get_legend_handles_labels()
910 | ax.legend(handles, labels)
911 | ax.set_title(f"{kind} SMSE={self.SMSE_LOESS:.3f}")
912 | elif kind == 'Centiles':
913 | sns.scatterplot(data=self.data, x=self.conf, y=self.score,
914 | hue=self.group, style=self.group,ax=ax)
915 | tmp=self.data.sort_values(self.conf)
916 | ax.plot(tmp[self.conf], tmp['Centiles_pred'], '-k',label='Prediction')
917 | ax.plot(tmp[self.conf], tmp['Centiles_5'],'--k')
918 | ax.plot(tmp[self.conf], tmp['Centiles_95'],'--k',label='95% CI')
919 | handles, labels = ax.get_legend_handles_labels()
920 | ax.legend(handles, labels)
921 | ax.set_title(f"{kind} SMSE={self.SMSE_Centiles:.3f}")
922 | elif kind == 'GP':
923 | if gp_xaxis is None:
924 | gp_xaxis = self.conf
925 | sns.scatterplot(data=self.data, x=gp_xaxis, y=self.score,
926 | hue=self.group, style=self.group,ax=ax)
927 | tmp=self.data.sort_values(gp_xaxis)
928 | if len(self.confounds) == 1:
929 | ax.plot(tmp[gp_xaxis], tmp['GP_pred'], '-k',label='Prediction')
930 | ax.plot(tmp[gp_xaxis], tmp['GP_pred'] - 1.96*tmp['GP_sigma'], '--k')
931 | ax.plot(tmp[gp_xaxis], tmp['GP_pred'] + 1.96*tmp['GP_sigma'], '--k',label='95% CI')
932 | else:
933 | ax.scatter(tmp[gp_xaxis], tmp['GP_pred'], label='Prediction',color='black',marker='_',s=25)
934 | ax.scatter(tmp[gp_xaxis], tmp['GP_pred'] - 1.96*tmp['GP_sigma'],color='black',s=0.2)
935 | 
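# Editorial note: 1.96 is the ~97.5th percentile of the standard normal, so the
# pred +/- 1.96*sigma curves drawn here trace a central 95% predictive interval
# under a Gaussian assumption.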
ax.scatter(tmp[gp_xaxis], tmp['GP_pred'] + 1.96*tmp['GP_sigma'], label='95% CI',color='black',s=0.2)
936 | handles, labels = ax.get_legend_handles_labels()
937 | ax.legend(handles, labels)
938 | ax.set_title(f"{kind} SMSE={self.SMSE_GP:.3f} - MSLL={self.MSLL_GP:.3f}")
939 | elif kind == 'GAMLSS':
940 | if gamlss_xaxis is None:
941 | gamlss_xaxis = self.conf
942 | sns.scatterplot(data=self.data, x=gamlss_xaxis, y=self.score,
943 | hue=self.group, style=self.group,ax=ax)
944 | tmp=self.data.sort_values(gamlss_xaxis)
945 | if len(self.confounds) == 1:
946 | ax.plot(tmp[gamlss_xaxis], tmp['GAMLSS_pred'], '-k',label='Prediction')
947 | ax.plot(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] - 1.96*tmp['GAMLSS_sigma'], '--k')
948 | ax.plot(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] + 1.96*tmp['GAMLSS_sigma'], '--k',label='95% CI')
949 | else:
950 | ax.scatter(tmp[gamlss_xaxis], tmp['GAMLSS_pred'], label='Prediction',color='black',marker='_',s=25)
951 | ax.scatter(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] - 1.96*tmp['GAMLSS_sigma'],color='black',s=0.2)
952 | ax.scatter(tmp[gamlss_xaxis], tmp['GAMLSS_pred'] + 1.96*tmp['GAMLSS_sigma'], label='95% CI',color='black',s=0.2)
953 | handles, labels = ax.get_legend_handles_labels()
954 | ax.legend(handles, labels)
955 | ax.set_title(f"{kind} SMSE={self.SMSE_GAMLSS:.3f} - MSLL={self.MSLL_GAMLSS:.3f}")
956 | return ax
957 | 
958 | def plot(self, kind=None,gp_xaxis=None,gamlss_xaxis=None):
959 | """Plot the data with the normative model overlaid.
960 | 
961 | Parameters
962 | ----------
963 | kind: list, default=None
964 | Type of plot, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None. If None, all available
965 | results will be plotted; if none are available, a warning will be raised and only the data will be plotted.
966 | gp_xaxis: str,default=None
967 | Which confound to use for xaxis of GP plot. If set to None, first confound in list passed to model will be used.
968 | gamlss_xaxis: str,default=None
969 | Which confound to use for xaxis of GAMLSS plot. If set to None, first confound in list passed to model will be used.
970 | 
971 | Raises
972 | ------
973 | ValueError
974 | Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.
975 | """
976 | if kind is None:
977 | kind = []
978 | for k in ['LOESS','Centiles','GP','GAMLSS']:
979 | if '{}_pred'.format(k) in self.data.columns:
980 | kind.append(k)
981 | if len(kind)==0:
982 | warnings.warn('No model results found in data.')
983 | 
984 | if set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>1:
985 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
986 | for i,k in enumerate(kind):
987 | self._plot(ax[i],kind=k,gp_xaxis=gp_xaxis,gamlss_xaxis=gamlss_xaxis)
988 | plt.show()
989 | elif set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>0:
990 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
991 | self._plot(ax,kind=kind[0],gp_xaxis=gp_xaxis,gamlss_xaxis=gamlss_xaxis)
992 | plt.show()
993 | elif len(kind)==0:
994 | fig, ax = plt.subplots(1,1)
995 | self._plot(ax,None,gp_xaxis=gp_xaxis,gamlss_xaxis=gamlss_xaxis)
996 | ax.set_title('Data')
997 | plt.show()
998 | else:
999 | raise ValueError('Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.')
1000 | 
1001 | def _plot_res_z(self, ax,kind=None, confound=None,z=False):
1002 | """ Plot the residuals or deviance scores of the normative model.
1003 | 
1004 | Parameters
1005 | ----------
1006 | ax: matplotlib axis
1007 | Axis on which to plot.
1008 | kind: str, default=None
1009 | Type of plot among "LOESS" (local polynomial), "Centiles", "GP" (gaussian processes),
1010 | or "GAMLSS" (generalized additive models of location scale and shape).
1011 | confound: str or None
1012 | Which confound to use as xaxis of plot, must be categorical or None. z: bool, default=False If True, plot deviance scores (z) rather than raw residuals.
1013 | """
1014 | if kind == 'LOESS':
1015 | if z:
1016 | sns.violinplot(x=confound, y='LOESS_z',
1017 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1018 | else:
1019 | sns.violinplot(x=confound, y='LOESS_residuals',
1020 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1021 | ax.set_title(f"{kind} SMSE={self.SMSE_LOESS:.3f}")
1022 | if kind == 'Centiles':
1023 | if z:
1024 | sns.violinplot(x=confound, y='Centiles_z',
1025 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1026 | else:
1027 | sns.violinplot(x=confound, y='Centiles_residuals',
1028 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1029 | ax.set_title(f"{kind} SMSE={self.SMSE_Centiles:.3f}")
1030 | if kind == 'GP':
1031 | if z:
1032 | sns.violinplot(x=confound, y='GP_z',
1033 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1034 | else:
1035 | sns.violinplot(x=confound, y='GP_residuals',
1036 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1037 | ax.set_title(f"{kind} SMSE={self.SMSE_GP:.3f} - MSLL={self.MSLL_GP:.3f}")
1038 | if kind == 'GAMLSS':
1039 | if z:
1040 | sns.violinplot(x=confound, y='GAMLSS_z',
1041 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1042 | else:
1043 | sns.violinplot(x=confound, y='GAMLSS_residuals',
1044 | data=self.data, split=True, palette='Blues', hue=self.group,ax=ax)
1045 | ax.set_title(f"{kind} SMSE={self.SMSE_GAMLSS:.3f} - MSLL={self.MSLL_GAMLSS:.3f}")
1046 | if not isinstance(confound,str):
1047 | ax.set_xticklabels([''])
1048 | 
1049 | def _plot_res_z_cont(self, ax,kind=None, confound=None,z=False):
1050 | """ Plot the residuals or deviance scores of the normative model.
1051 | 
1052 | Parameters
1053 | ----------
1054 | ax: matplotlib axis
1055 | Axis on which to plot.
1056 | kind: str, default=None
1057 | Type of plot among "LOESS" (local polynomial), "Centiles", "GP" (gaussian processes),
1058 | or "GAMLSS" (generalized additive models of location scale and shape).
1059 | confound: str or None
1060 | Which confound to use as xaxis of plot, must be continuous.
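z: bool, default=False
If True, plot deviance scores (z) rather than raw residuals.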
1061 | """
1062 | if kind == 'LOESS':
1063 | if z:
1064 | sns.scatterplot(x=confound, y='LOESS_z',
1065 | data=self.data, hue=self.group,ax=ax)
1066 | else:
1067 | sns.scatterplot(x=confound, y='LOESS_residuals',
1068 | data=self.data, hue=self.group,ax=ax)
1069 | ax.set_title(f"{kind} SMSE={self.SMSE_LOESS:.3f}")
1070 | if kind == 'Centiles':
1071 | if z:
1072 | sns.scatterplot(x=confound, y='Centiles_z',
1073 | data=self.data, hue=self.group,ax=ax)
1074 | else:
1075 | sns.scatterplot(x=confound, y='Centiles_residuals',
1076 | data=self.data, hue=self.group,ax=ax)
1077 | ax.set_title(f"{kind} SMSE={self.SMSE_Centiles:.3f}")
1078 | if kind == 'GP':
1079 | if z:
1080 | sns.scatterplot(x=confound, y='GP_z',
1081 | data=self.data, hue=self.group,ax=ax)
1082 | else:
1083 | sns.scatterplot(x=confound, y='GP_residuals',
1084 | data=self.data, hue=self.group,ax=ax)
1085 | ax.set_title(f"{kind} SMSE={self.SMSE_GP:.3f} - MSLL={self.MSLL_GP:.3f}")
1086 | if kind == 'GAMLSS':
1087 | if z:
1088 | sns.scatterplot(x=confound, y='GAMLSS_z',
1089 | data=self.data, hue=self.group,ax=ax)
1090 | else:
1091 | sns.scatterplot(x=confound, y='GAMLSS_residuals',
1092 | data=self.data, hue=self.group,ax=ax)
1093 | ax.set_title(f"{kind} SMSE={self.SMSE_GAMLSS:.3f} - MSLL={self.MSLL_GAMLSS:.3f}")
1094 | 
1095 | def plot_res(self, kind=None, confound=None):
1096 | """Plot the residuals of the normative model.
1097 | 
1098 | Parameters
1099 | ----------
1100 | kind: list, default=None
1101 | Type of plot, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None. If None, all available
1102 | results will be plotted; if none are available, a ValueError will be raised.
1103 | confound: str, default=None
1104 | Which confound to use as xaxis of plot.
1105 | 
1106 | Raises
1107 | ------
1108 | ValueError
1109 | Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.
1110 | ValueError
1111 | No model results found in data.
1112 | """
1113 | _, cat = read_confounds(self.confounds)
1114 | if confound is None:
1115 | categorical = True
1116 | elif confound in cat:
1117 | categorical = True
1118 | else:
1119 | categorical = False
1120 | 
1121 | if kind is None:
1122 | kind = []
1123 | for k in ['LOESS','Centiles','GP','GAMLSS']:
1124 | if '{}_residuals'.format(k) in self.data.columns:
1125 | kind.append(k)
1126 | if len(kind)==0:
1127 | raise ValueError('No model residuals found in data.')
1128 | 
1129 | if set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>1:
1130 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1131 | for i,k in enumerate(kind):
1132 | if categorical:
1133 | self._plot_res_z(ax[i],kind=k,confound=confound)
1134 | else:
1135 | self._plot_res_z_cont(ax[i],kind=k,confound=confound)
1136 | plt.show()
1137 | elif set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])):
1138 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1139 | if categorical:
1140 | self._plot_res_z(ax,kind=kind[0],confound=confound)
1141 | else:
1142 | self._plot_res_z_cont(ax,kind=kind[0],confound=confound)
1143 | plt.show()
1144 | else:
1145 | raise ValueError('Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.')
1146 | 
1147 | def plot_z(self, kind=None, confound=None):
1148 | """Plot the deviance scores of the normative model.
1149 | 
1150 | Parameters
1151 | ----------
1152 | kind: list, default=None
1153 | Type of plot, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.
If None, all available
1154 | results will be plotted; if none are available, a ValueError will be raised.
1155 | confound: str, default=None
1156 | Which confound to use as xaxis of plot.
1157 | 
1158 | Raises
1159 | ------
1160 | ValueError
1161 | Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.
1162 | ValueError
1163 | No model results found in data.
1164 | """
1165 | _, cat = read_confounds(self.confounds)
1166 | if confound is None:
1167 | categorical = True
1168 | elif confound in cat:
1169 | categorical = True
1170 | else:
1171 | categorical = False
1172 | 
1173 | if kind is None:
1174 | kind = []
1175 | for k in ['LOESS','Centiles','GP','GAMLSS']:
1176 | if '{}_z'.format(k) in self.data.columns:
1177 | kind.append(k)
1178 | if len(kind)==0:
1179 | raise ValueError('No model deviance scores found in data.')
1180 | 
1181 | if set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])) and len(kind)>1:
1182 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1183 | for i,k in enumerate(kind):
1184 | if categorical:
1185 | self._plot_res_z(ax[i],kind=k,confound=confound,z=True)
1186 | else:
1187 | self._plot_res_z_cont(ax[i],kind=k,confound=confound,z=True)
1188 | plt.show()
1189 | elif set(kind).issubset(set(['LOESS','Centiles','GP','GAMLSS'])):
1190 | fig, ax = plt.subplots(1,len(kind),figsize=(len(kind)*5,5))
1191 | if categorical:
1192 | self._plot_res_z(ax,kind=kind[0],confound=confound,z=True)
1193 | else:
1194 | self._plot_res_z_cont(ax,kind=kind[0],confound=confound,z=True)
1195 | plt.show()
1196 | else:
1197 | raise ValueError('Plot kind not recognized, must be a valid subset of ["Centiles","LOESS","GP","GAMLSS"] or None.')
1198 | 
--------------------------------------------------------------------------------
/pynm/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def read_confounds(confounds):
4 | """ Process input list of confounds.
5 | 
6 | Parameters
7 | ----------
8 | confounds : list of str
9 | List of confounds with categorical variables indicated by c(var) ('c' must be lower case).
10 | 
11 | Returns
12 | -------
13 | list
14 | List of all confounds without wrapper on categorical variables: c(var) -> var.
15 | list
16 | List of only categorical confounds without wrapper.
17 | """
18 | categorical = []
19 | clean_confounds = []
20 | for conf in confounds:
21 | if ((conf[0:2] == 'c(') & (conf[-1] == ')')):
22 | categorical.append(conf[2:-1])
23 | clean_confounds.append(conf[2:-1])
24 | else:
25 | clean_confounds.append(conf)
26 | return clean_confounds, categorical
27 | 
28 | def RMSE(y_true,y_pred):
29 | """Calculates Root Mean Square Error (RMSE).
30 | 
31 | Parameters
32 | ----------
33 | y_true: array
34 | True values for response variable.
35 | y_pred: array
36 | Predicted values for response variable.
37 | 
38 | Returns
39 | -------
40 | float
41 | RMSE value for inputs.
42 | """
43 | return (np.mean((y_true - y_pred)**2))**0.5
44 | 
45 | def SMSE(y_true,y_pred):
46 | """Calculates Standardized Mean Square Error (SMSE).
47 | 
48 | Parameters
49 | ----------
50 | y_true: array
51 | True values for response variable.
52 | y_pred: array
53 | Predicted values for response variable.
54 | 
55 | Returns
56 | -------
57 | float
58 | SMSE value for inputs.
59 | """
60 | return (np.mean((y_true - y_pred)**2))**0.5/np.std(y_true)
61 | 
62 | def MSLL(y_true,y_pred,sigma,y_train_mean,y_train_sigma):
63 | """Calculates Mean Standardized Log Loss (MSLL).
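
The standardized log loss of a single point is the model's negative log
predictive density minus that of a trivial Gaussian fitted to the training
data:

    SLL_i = [0.5*log(2*pi*sigma_i**2) + (y_i - y_pred_i)**2 / (2*sigma_i**2)]
            - [0.5*log(2*pi*y_train_sigma**2) + (y_i - y_train_mean)**2 / (2*y_train_sigma**2)]

MSLL is the mean of SLL_i over all points; negative values indicate the
model is more informative than the trivial fit.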
64 | 
65 | Parameters
66 | ----------
67 | y_true: (n,) array
68 | True values for response variable.
69 | y_pred: (n,) array
70 | Predicted values for response variable.
71 | sigma: (n,) array
72 | Standard deviation of predictive distribution.
73 | y_train_mean: float
74 | Mean of training data.
75 | y_train_sigma: float
76 | Standard deviation of training data.
77 | 
78 | Returns
79 | -------
80 | float
81 | MSLL value for inputs.
82 | """
83 | inputs = [y_true,y_pred,sigma]
84 | for i in inputs:
85 | if len(i.shape) > 1:
86 | raise ValueError("Data must be 1-dimensional")
87 | 
88 | # Negative log probability under the model
89 | model = (0.5*np.log(2*np.pi*sigma**2)) + (y_true - y_pred)**2/(2*sigma**2)
90 | 
91 | # Negative log probability under trivial model
92 | trivial = (0.5*np.log(2*np.pi*y_train_sigma**2)) + (y_true - y_train_mean)**2/(2*y_train_sigma**2)
93 | 
94 | SLL = model - trivial
95 | return np.mean(SLL)
--------------------------------------------------------------------------------
/pynm_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm_logo.png
--------------------------------------------------------------------------------
/pynm_models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/pynm_models.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gpytorch >= 1.4.0
2 | matplotlib >= 3.3.4
3 | numpy >= 1.19.5
4 | pandas >= 1.1.5
5 | pytest >= 6.2.3
6 | rpy2 >= 3.5.4
7 | scikit_learn >= 1.1.2
8 | scipy >= 1.5.3
9 | seaborn >= 0.12.0
10 | statsmodels >= 0.13.2
11 | torch >= 1.12.1
12 | tqdm >= 4.59.0
13 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup
3 | 
4 | setup(
5 | name="pynm",
6 | version="1.0.1",
7 | author="Annabelle HARVEY, Guillaume DUMAS",
8 | author_email="annabelle.harvey@umontreal.ca, guillaume.dumas@ppsp.team",
9 | description=("Python implementation of Normative Modelling "
10 | "with GAMLSS, Gaussian Processes, LOESS & Centiles approaches."),
11 | long_description_content_type="text/x-rst",
12 | license="BSD",
13 | keywords="gaussian processes statistics modeling",
14 | url="https://github.com/ppsp-team/PyNM",
15 | packages=['pynm', 'test', 'pynm/models'],
16 | classifiers=[
17 | "Development Status :: 5 - Production/Stable",
18 | "Topic :: Scientific/Engineering :: Bio-Informatics",
19 | "License :: OSI Approved :: BSD License",
20 | ],
21 | entry_points={
22 | 'console_scripts': [
23 | 'pynm = pynm.cli:main',
24 | ],
25 | },
26 | install_requires=[
27 | 'gpytorch >= 1.4.0',
28 | 'matplotlib >= 3.3.4',
29 | 'numpy >= 1.19.5',
30 | 'pandas >= 1.1.5',
31 | 'rpy2 >= 3.5.4',
32 | 'scikit_learn >= 1.1.2',
33 | 'scipy >= 1.5.3',
34 | 'seaborn >= 0.12.0',
35 | 'statsmodels >= 0.13.2',
36 | 'torch >= 1.12.1',
37 | 'tqdm >= 4.59.0',
38 | ],
39 | )
40 | 
--------------------------------------------------------------------------------
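Before the test suite below, a minimal end-to-end sketch of the PyNM API that
these tests exercise (an editorial illustration, not from the original source;
the toy data here is hypothetical):

    import numpy as np
    import pandas as pd
    from pynm import pynm

    # Toy dataset: one confound (age) and a score, with controls ('CTR') and probands ('PROB').
    rng = np.random.default_rng(0)
    age = rng.uniform(10, 80, 200)
    score = np.log(age) + rng.normal(0, 0.2, 200)
    group = np.where(rng.random(200) < 0.1, 'PROB', 'CTR')
    df = pd.DataFrame({'age': age, 'score': score, 'group': group})

    # Fit two of the available models on controls and print the metric report.
    m = pynm.PyNM(df, 'score', 'group', ['age'])
    m.centiles_normative_model()
    m.loess_normative_model()
    m.report()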
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/test/__init__.py
--------------------------------------------------------------------------------
/test/test_pynm.py:
--------------------------------------------------------------------------------
1 | from pynm import pynm
2 | import numpy as np
3 | import pandas as pd
4 | import scipy.stats as sp
5 | import math
6 | import pytest
7 | from pynm.util import *
8 | import matplotlib.pyplot as plt
9 | from unittest.mock import patch
10 | from sklearn.model_selection import train_test_split
11 | 
12 | def model(age, sex, offset):
13 | noise = np.random.normal(0, 0.1)
14 | return 0.001*age-0.00001*(age-50)**2+0.5 + noise - np.random.uniform(0, 0.3) * sex + offset
15 | 
16 | 
17 | def model_prob(age, sex, offset):
18 | noise = np.random.normal(0, 0.1)
19 | return 0.001*age-0.00001*(age-50)**2+0.5 + noise - np.random.uniform(0, 0.3) * sex - 0.2 * np.random.uniform() + offset
20 | 
21 | # randseed = 3, sample_size = 1, n_sites = 2 has ONE PROB n=6
22 | # randseed = 1, sample_size = 1, n_sites = 2 has NO PROB n=12
23 | def generate_data(group='PROB_CON', sample_size=1, n_sites=2, randseed=3):
24 | np.random.seed(randseed)
25 | n_sites = n_sites
26 | age_min = (np.random.rand(n_sites)*50).astype(int)
27 | sites = pd.DataFrame(data={'sex_ratio': np.random.rand(n_sites),
28 | 'prob_ratio': 0.5*np.random.rand(n_sites),
29 | 'age_min': age_min,
30 | 'age_max': (age_min+5+np.random.rand(n_sites)*50).astype(int),
31 | 'score_shift': np.random.randn(n_sites)/4,
32 | 'sample_size': (sample_size+np.random.rand(n_sites)*sample_size*10).astype(int)})
33 | 
34 | participants = []
35 | for site in sites.iterrows():
36 | for participant in range(int(site[1]['sample_size'])):
37 | sex = np.random.binomial(1, site[1]['sex_ratio'])
38 | prob = np.random.binomial(1, site[1]['prob_ratio'])
39 | age = np.random.uniform(site[1]['age_min'], site[1]['age_max'])
40 | if prob:
41 | score = model_prob(age, sex, site[1]['score_shift'])
42 | else:
43 | score = model(age, sex, site[1]['score_shift'])
44 | participants.append([site[0], sex, prob, age, score])
45 | 
46 | df = pd.DataFrame(participants, columns=['site', 'sex', 'group', 'age', 'score'])
47 | df.sex.replace({1: 'Female', 0: 'Male'}, inplace=True)
48 | if group == 'PROB_CON':
49 | df.group.replace({1: 'PROB', 0: 'CTR'}, inplace=True)
50 | return df
51 | 
52 | def sample_x(low=1,high=100,n_subs=1000,sampling='full'):
53 | if sampling == 'full':
54 | x = np.random.uniform(low=low,high=high,size=n_subs)
55 | else:
56 | x = np.concatenate([np.random.normal(20,10,size=int(n_subs/2)),np.random.normal(80,10,size=int(n_subs/2))])
57 | x = x[(x < high) & (x > low)]
58 | return x
59 | 
60 | # Homoskedastic, gaussian noise
61 | def dataset_homo(low=1,high=100,n_subs=1000,sampling='full'):
62 | x = sample_x(low=low,high=high,n_subs=n_subs,sampling=sampling)
63 | scores = np.array([np.log(i) + np.random.randn() for i in x])
64 | df = pd.DataFrame([x,scores],index=['x','score']).transpose()
65 | df['train_sample'] = 1
66 | return df
67 | 
68 | # Homoskedastic, skew noise
69 | def dataset_skew(low=1,high=100,n_subs=1000,sampling='full'):
70 | x = sample_x(low=low,high=high,n_subs=n_subs,sampling=sampling)
71 | scores = np.array([np.log(i) + sp.skewnorm.rvs(a=2,size=1)[0] for i in x])
72 | df = pd.DataFrame([x,scores],index=['x','score']).transpose()
73 | df['train_sample'] = 1
74 | return df
75 | 
76 | # Heteroskedastic
linear 77 | def dataset_het(low=1,high=100,n_subs=1000,sampling='full'): 78 | x = sample_x(low=low,high=high,n_subs=n_subs,sampling=sampling) 79 | scores = np.array([np.log(i) + 0.15*np.log(i)*np.random.randn() for i in x]) 80 | df = pd.DataFrame([x,scores],index=['x','score']).transpose() 81 | df['train_sample'] = 1 82 | return df 83 | 84 | class TestBasic: 85 | def test_read_confounds_some_categorical(self): 86 | conf = ['a', 'b', 'c(c)'] 87 | clean, cat = read_confounds(conf) 88 | assert clean == ['a', 'b', 'c'] 89 | assert cat == ['c'] 90 | 91 | def test_read_confounds_no_categorical(self): 92 | conf = ['a', 'b', 'c'] 93 | clean, cat = read_confounds(conf) 94 | assert clean == conf 95 | assert cat == [] 96 | 97 | def test_read_confounds_all_categorical(self): 98 | conf = ['c(a)', 'c(b)', 'c(c)'] 99 | clean, cat = read_confounds(conf) 100 | assert clean == ['a', 'b', 'c'] 101 | assert cat == ['a', 'b', 'c'] 102 | 103 | def test_invalid_init(self): 104 | data1 = generate_data(randseed=1) 105 | data2 = generate_data(randseed=2) 106 | data = pd.concat([data1,data2]) 107 | assert data.index.nunique() != data.shape[0] 108 | with pytest.raises(ValueError): 109 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 110 | 111 | def test_set_group_names_PROB_CON_all_CON(self): 112 | data = generate_data(randseed=1) 113 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 114 | assert m.CTR == 'CTR' 115 | assert m.PROB == 'PROB' 116 | 117 | def test_set_group_names_PROB_CON(self): 118 | data = generate_data(randseed=3) 119 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 120 | assert m.CTR == 'CTR' 121 | assert m.PROB == 'PROB' 122 | 123 | def test_set_group_names_01(self): 124 | data = generate_data(randseed=3, group='01') 125 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 126 | assert m.CTR == 0 127 | assert m.PROB == 1 128 | 129 | def test_set_group_controls(self): 130 | data = generate_data(randseed=3, group='01') 131 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample=1) 132 | assert m.group == 'group' 133 | 134 | def test_set_group_33(self): 135 | data = generate_data(randseed=3, group='01') 136 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='0.33') 137 | assert m.group == 'train_sample' 138 | assert m.data['train_sample'].sum() == 1 139 | assert m.data[(m.data['train_sample']==1) & (m.data['group']== 1)].shape[0] == 0 140 | 141 | def test_set_group_manual_no_col(self): 142 | data = generate_data(randseed=3, group='01') 143 | with pytest.raises(ValueError): 144 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='manual') 145 | 146 | def test_set_group_manual_zero_col(self): 147 | data = generate_data(randseed=3, group='01') 148 | data['train_sample'] = 0 149 | with pytest.raises(ValueError): 150 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='manual') 151 | 152 | def test_set_group_manual_good_col(self): 153 | data = generate_data(randseed=3, group='01') 154 | data['train_sample'] = [1,1,0,0,0,0] 155 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],train_sample='manual') 156 | assert m.PROB == 0 157 | assert m.group == 'train_sample' 158 | 159 | def test_create_bins(self): 160 | data = generate_data(randseed=3) 161 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 162 | m.centiles_normative_model() 163 | assert m.bins is not None 164 | 165 | def test_bins_num(self): 166 
| data = generate_data(randseed=11) 167 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=5, bin_width=10) 168 | m._create_bins() 169 | assert len(m.bins) == 6 170 | 171 | def test_loess_rank(self): 172 | data = generate_data(randseed=11) 173 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 174 | m.loess_normative_model() 175 | assert np.sum(m.data.LOESS_rank) == 1 176 | 177 | def test_loess_normative_model(self): 178 | data = generate_data(randseed=11) 179 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 180 | m.loess_normative_model() 181 | assert math.isclose(2.3482, np.sum(m.data.LOESS_z), abs_tol=0.00001) 182 | 183 | def test_centiles_rank(self): 184 | data = generate_data(randseed=11) 185 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 186 | m.centiles_normative_model() 187 | assert np.sum(m.data.Centiles_rank) == -22 188 | 189 | def test_centiles_normative_model(self): 190 | data = generate_data(randseed=11) 191 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 192 | m.centiles_normative_model() 193 | assert np.sum(m.data.Centiles) == 446 194 | 195 | def test_get_masks(self): 196 | a = np.array(list(range(6))) 197 | data = generate_data(randseed=3) 198 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 199 | ctr, prob = m._get_masks() 200 | assert a[ctr].shape[0] == 5 201 | assert a[prob][0] == 3 202 | 203 | def test_get_masks_all_CON(self): 204 | a = np.array(list(range(12))) 205 | data = generate_data(randseed=1) 206 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 207 | ctr, prob = m._get_masks() 208 | assert a[ctr].shape[0] == 12 209 | assert a[prob].shape[0] == 0 210 | 211 | def test_get_conf_mat(self): 212 | data = generate_data(randseed=3) 213 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 214 | conf_mat = m._get_conf_mat() 215 | assert conf_mat.shape[0] == 6 216 | assert conf_mat.shape[1] == 3 217 | for i in range(3): 218 | assert not isinstance(conf_mat[0, i], str) 219 | 220 | def test_use_approx_auto_small(self): 221 | data = generate_data(randseed=3) 222 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 223 | assert m._use_approx(method='auto') == False 224 | 225 | def test_use_approx_auto_big(self): 226 | data = generate_data(randseed=3,sample_size=1000) 227 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 228 | assert m._use_approx(method='auto') == True 229 | 230 | def test_use_approx_approx(self): 231 | data = generate_data(randseed=3,sample_size=1000) 232 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 233 | assert m._use_approx(method='approx') == True 234 | 235 | def test_use_approx_exact(self): 236 | data = generate_data(randseed=3,sample_size=2000) 237 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 238 | with pytest.warns(Warning) as record: 239 | use_approx = m._use_approx(method='exact') 240 | assert len(record) == 1 241 | assert record[0].message.args[0] == "Exact GP model with over 2000 data points requires large amounts of time and memory, continuing with exact model." 
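# Editorial note (a hedged reading of the three _use_approx tests above):
# method='auto' appears to select the SVGP approximation once the sample grows
# large, while method='exact' merely warns above ~2000 points and keeps the exact GP.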
242 | assert use_approx == False 243 | 244 | def test_gp_normative_model(self): 245 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 246 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 247 | m.gp_normative_model() 248 | assert 'GP_pred' in m.data.columns 249 | assert math.isclose(0,m.data['GP_residuals'].mean(),abs_tol=0.5) 250 | 251 | def test_homo_res(self): 252 | data = dataset_homo() 253 | m = pynm.PyNM(data,'score','train_sample',['x']) 254 | with pytest.warns(None) as record: 255 | m.gp_normative_model(method='exact') 256 | assert len(record) == 0 257 | 258 | def test_nongaussian_res(self): 259 | data = dataset_skew() 260 | m = pynm.PyNM(data,'score','train_sample',['x']) 261 | with pytest.warns(Warning) as record: 262 | m.gp_normative_model(method='exact') 263 | assert len(record) == 1 264 | assert record[0].message.args[0] == "The residuals are not Gaussian!" 265 | 266 | def test_het_res(self): 267 | data = dataset_het() 268 | m = pynm.PyNM(data,'score','train_sample',['x']) 269 | with pytest.warns(Warning) as record: 270 | m.gp_normative_model(method='exact') 271 | assert len(record) == 1 272 | assert record[0].message.args[0] == "The residuals are heteroskedastic!" 273 | 274 | def test_rmse(self): 275 | y_true = np.array([1,2,3,4,5]) 276 | y_pred = np.array([0,1,3,5,2]) 277 | assert RMSE(y_true,y_pred) == np.sqrt(2.4) 278 | 279 | def test_smse(self): 280 | y_true = np.array([1,2,3,4,5]) 281 | y_pred = np.array([0,1,3,5,2]) 282 | assert SMSE(y_true,y_pred) == np.sqrt(2.4)/np.sqrt(2) 283 | 284 | def test_msll(self): 285 | y_true = np.array([1,2,3,4,5]) 286 | y_pred = np.array([0,1,3,5,2]) 287 | sigma = np.array([1,2,1,2,1]) 288 | y_train_mean = 2 289 | y_train_sigma = 1 290 | 291 | term1 = 0.5*np.log(2*np.pi*np.array([1,4,1,4,1])) + np.array([1/2,1/8,0,1/8,9/2]) 292 | term2 = 0.5*np.log(2*np.pi) + np.array([1/2,0,1/2,2,4.5]) 293 | 294 | assert MSLL(y_true,y_pred,sigma,y_train_mean,y_train_sigma) == np.mean(term1 - term2) 295 | 296 | def test_report(self): 297 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 298 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 299 | m.centiles_normative_model() 300 | m.loess_normative_model() 301 | #m.gp_normative_model() 302 | #m.gamlss_normative_model() 303 | m.report() 304 | 305 | @patch("matplotlib.pyplot.show") 306 | class TestPlot: 307 | def test_plot_default(self,mock_patch): 308 | data = generate_data(randseed=3) 309 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 310 | m.centiles_normative_model() 311 | m.plot() 312 | assert True 313 | 314 | def test_plot_default_two_models(self,mock_patch): 315 | data = generate_data(randseed=3) 316 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 317 | m.centiles_normative_model() 318 | m.loess_normative_model() 319 | assert m.plot() is None 320 | 321 | def test_plot_default_no_models(self,mock_patch): 322 | data = generate_data(randseed=3) 323 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 324 | with pytest.warns(Warning) as record: 325 | m.plot() 326 | 327 | def test_plot_valid_subset(self,mock_patch): 328 | subset = ['Centiles','LOESS'] 329 | data = generate_data(randseed=3) 330 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 331 | m.centiles_normative_model() 332 | m.loess_normative_model() 333 | assert m.plot(kind=subset) is None 334 | 335 | def test_plot_invalid_subset1(self,mock_patch): 336 | subset = ['Centiles',None] 337 | data = generate_data(randseed=3) 338 | m = 
pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 339 | m.centiles_normative_model() 340 | with pytest.raises(ValueError): 341 | m.plot(kind=subset) 342 | 343 | def test_plot_invalid_subset2(self,mock_patch): 344 | subset = ['Centiles','GAMLSS'] 345 | data = generate_data(randseed=3) 346 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 347 | m.centiles_normative_model() 348 | with pytest.raises(KeyError): 349 | m.plot(kind=subset) 350 | 351 | def test_plot_res_default(self,mock_patch): 352 | data = generate_data(randseed=3) 353 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 354 | m.centiles_normative_model() 355 | assert m.plot_res() is None 356 | 357 | def test_plot_res_default_two_models(self,mock_patch): 358 | data = generate_data(randseed=3) 359 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 360 | m.centiles_normative_model() 361 | m.loess_normative_model() 362 | assert m.plot_res() is None 363 | 364 | def test_plot_res_default_no_models(self,mock_patch): 365 | data = generate_data(randseed=3) 366 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 367 | with pytest.raises(ValueError): 368 | m.plot_res() 369 | 370 | def test_plot_res_valid_subset(self,mock_patch): 371 | subset = ['Centiles','LOESS'] 372 | data = generate_data(randseed=3) 373 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 374 | m.centiles_normative_model() 375 | m.loess_normative_model() 376 | assert m.plot_res(kind=subset) is None 377 | 378 | def test_plot_res_invalid_subset1(self,mock_patch): 379 | subset = ['Centiles',None] 380 | data = generate_data(randseed=3) 381 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 382 | m.centiles_normative_model() 383 | with pytest.raises(ValueError): 384 | m.plot_res(kind=subset) 385 | 386 | def test_plot_res_invalid_subset2(self,mock_patch): 387 | subset = ['Centiles','GAMLSS'] 388 | data = generate_data(randseed=3) 389 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 390 | m.centiles_normative_model() 391 | with pytest.raises(ValueError): 392 | m.plot_res(kind=subset) 393 | 394 | def test_plot_z_default(self,mock_patch): 395 | data = generate_data(randseed=3) 396 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 397 | m.centiles_normative_model() 398 | assert m.plot_z() is None 399 | 400 | def test_plot_z_default_two_models(self,mock_patch): 401 | data = generate_data(randseed=3) 402 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 403 | m.centiles_normative_model() 404 | m.loess_normative_model() 405 | assert m.plot_z() is None 406 | 407 | def test_plot_z_default_no_models(self,mock_patch): 408 | data = generate_data(randseed=3) 409 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 410 | with pytest.raises(ValueError): 411 | m.plot_z() 412 | 413 | def test_plot_z_valid_subset(self,mock_patch): 414 | subset = ['Centiles','LOESS'] 415 | data = generate_data(randseed=3) 416 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 417 | m.centiles_normative_model() 418 | m.loess_normative_model() 419 | assert m.plot_z(kind=subset) is None 420 | 421 | def test_plot_z_invalid_subset1(self,mock_patch): 422 | subset = ['Centiles',None] 423 | data = generate_data(randseed=3) 424 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 425 | m.centiles_normative_model() 426 | with pytest.raises(ValueError): 427 | m.plot_z(kind=subset) 428 | 429 | def test_plot_z_invalid_subset2(self,mock_patch): 430 | 
subset = ['Centiles','GAMLSS'] 431 | data = generate_data(randseed=3) 432 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 433 | m.centiles_normative_model() 434 | with pytest.raises(ValueError): 435 | m.plot_z(kind=subset) 436 | 437 | class TestApprox: 438 | def test_svgp_init(self): 439 | from pynm.models.approx import SVGP 440 | 441 | data = generate_data(randseed=3) 442 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 443 | conf_mat = m._get_conf_mat() 444 | score = m._get_score() 445 | 446 | X_train,X_test,y_train,y_test = train_test_split(conf_mat, score) 447 | svgp = SVGP(X_train,X_test,y_train,y_test) 448 | 449 | def test_svgp_train(self): 450 | from pynm.models.approx import SVGP 451 | 452 | data = generate_data(randseed=3) 453 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 454 | conf_mat = m._get_conf_mat() 455 | score = m._get_score() 456 | 457 | X_train,X_test,y_train,y_test = train_test_split(conf_mat, score) 458 | svgp = SVGP(X_train,X_test,y_train,y_test) 459 | svgp.train(num_epochs = 2) 460 | 461 | assert len(svgp.loss) == 2 462 | 463 | def test_svgp_predict(self): 464 | from pynm.models.approx import SVGP 465 | 466 | data = generate_data(randseed=3) 467 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 468 | conf_mat = m._get_conf_mat() 469 | score = m._get_score() 470 | 471 | X_train,X_test,y_train,y_test = train_test_split(conf_mat, score) 472 | svgp = SVGP(X_train,X_test,y_train,y_test) 473 | svgp.train(num_epochs = 2) 474 | means,sigmas = svgp.predict() 475 | 476 | assert means.size(0) == y_test.shape[0] 477 | assert sigmas.size(0) == y_test.shape[0] 478 | 479 | def test_svgp_model(self): 480 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 481 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 482 | m.gp_normative_model(method='approx') 483 | 484 | assert 'GP_pred' in m.data.columns 485 | assert math.isclose(0, m.data['GP_residuals'].mean(), abs_tol=0.5) 486 | 487 | class TestGAMLSS: 488 | def test_get_r_formulas(self): 489 | from pynm.models import gamlss 490 | 491 | g = gamlss.GAMLSS(mu='score ~ 1') 492 | mu,sigma,_,_ = g._get_r_formulas('score ~ cs(age) + site',None,None,None) 493 | #assert not isinstance(mu,str) 494 | assert mu == 'score ~ cs(age) + site' 495 | assert sigma == '~ 1' 496 | 497 | def test_gamlss(self): 498 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 499 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 500 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)') 501 | assert 'GAMLSS_pred' in m.data.columns 502 | 503 | def test_gamlss_smse(self): 504 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 505 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 506 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)') 507 | assert m.SMSE_GAMLSS > 0 508 | 509 | def test_gamlss_default_formulas(self): 510 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 511 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 512 | m.gamlss_normative_model() 513 | assert 'GAMLSS_pred' in m.data.columns 514 | 515 | def test_gamlss_invalid_init(self): 516 | from pynm.models import gamlss 517 | 518 | with pytest.raises(ValueError): 519 | gamlss.GAMLSS() 520 | 521 | def test_gamlss_nan_issue(self): 522 | df = generate_data(n_sites=4,sample_size=35,randseed=650) 523 | #Initialize pynm w/ data and confounds 524 | m = 
pynm.PyNM(df,'score','group',['age','c(sex)','c(site)']) 525 | m.loess_normative_model() 526 | m.centiles_normative_model() 527 | m.gamlss_normative_model(mu='score ~ ps(age) + c(sex) + c(site)',sigma = '~ age',family='SHASHo2') 528 | 529 | def test_gamlss_random_effect(self): 530 | df = generate_data(n_sites=4,sample_size=35,randseed=650) 531 | #Initialize pynm w/ data and confounds 532 | m = pynm.PyNM(df,'score','group', 533 | confounds = ['age','c(sex)','c(site)']) 534 | m.gamlss_normative_model(mu='score ~ ps(age) + c(sex) + random(as.factor(site))',sigma = '~ ps(age)',family='SHASHo2',method='mixed(10,50)') 535 | 536 | def test_gamlss_random_effect_not_converged(self): 537 | #TODO: Force example where algorithm not converged warning gets thrown 538 | df = generate_data(n_sites=4,sample_size=35,randseed=650) 539 | #Initialize pynm w/ data and confounds 540 | m = pynm.PyNM(df,'score','group', 541 | confounds = ['age','c(sex)','c(site)']) 542 | m.gamlss_normative_model(mu='score ~ ps(age) + c(sex) + random(as.factor(site))',sigma = '~ ps(age)',family='SHASHo2',method='RS') 543 | 544 | def test_gamlss_bad_formula(self): 545 | df = generate_data(n_sites=4,sample_size=35,randseed=650) 546 | #Initialize pynm w/ data and confounds 547 | m = pynm.PyNM(df,'score','group', 548 | confounds = ['age','c(sex)','c(site)']) 549 | with pytest.raises(ValueError): 550 | m.gamlss_normative_model(mu='score ~ xxx(age) + c(sex) + c(site)',family='SHASHo2') 551 | 552 | class TestCV: 553 | def test_cv_1_loess(self): 554 | data = generate_data(randseed=11) 555 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 556 | m.loess_normative_model() 557 | assert math.isclose(2.3482, np.sum(m.data.LOESS_z), abs_tol=0.00001) 558 | 559 | def test_cv_3_loess(self): 560 | data = generate_data(n_sites=1,sample_size=100,randseed=650) 561 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 562 | m.loess_normative_model(cv_folds=3) 563 | assert not np.isnan(m.RMSE_LOESS) 564 | assert not np.isnan(m.SMSE_LOESS) 565 | 566 | def test_cv_1_centiles(self): 567 | data = generate_data(randseed=11) 568 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 569 | m.centiles_normative_model() 570 | assert np.sum(m.data.Centiles) == 446 571 | 572 | def test_cv_3_centiles(self): 573 | data = generate_data(n_sites=1,sample_size=100,randseed=650) 574 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)'],bin_spacing=8,bin_width=1.5) 575 | m.centiles_normative_model(cv_folds=3) 576 | assert not np.isnan(m.RMSE_Centiles) 577 | assert not np.isnan(m.SMSE_Centiles) 578 | 579 | def test_cv_1_gp(self): 580 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 581 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 582 | m.gp_normative_model() 583 | assert 'GP_pred' in m.data.columns 584 | assert math.isclose(0,m.data['GP_residuals'].mean(),abs_tol=0.5) 585 | 586 | def test_cv_3_gp(self): 587 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 588 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 589 | m.gp_normative_model(cv_folds=3) 590 | assert 'GP_pred' in m.data.columns 591 | assert math.isclose(0,m.data['GP_residuals'].mean(),abs_tol=0.5) 592 | 593 | def test_cv_1_svgp(self): 594 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 595 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 596 | m.gp_normative_model(method='approx') 597 | 598 | assert 
'GP_pred' in m.data.columns 599 | assert math.isclose(0, m.data['GP_residuals'].mean(), abs_tol=0.5) 600 | 601 | def test_cv_3_svgp(self): 602 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 603 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 604 | m.gp_normative_model(method='approx',cv_folds=3,num_epochs=3) 605 | 606 | assert 'GP_pred' in m.data.columns 607 | assert math.isclose(0, m.data['GP_residuals'].mean(), abs_tol=0.5) 608 | 609 | def test_cv_1_gamlss(self): 610 | data = generate_data(sample_size=4, n_sites=2, randseed=3) 611 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 612 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)') 613 | assert 'GAMLSS_pred' in m.data.columns 614 | 615 | def test_cv_3_gamlss(self): 616 | data = generate_data(sample_size=5, n_sites=2, randseed=3) 617 | m = pynm.PyNM(data,'score','group',['age','c(sex)','c(site)']) 618 | m.gamlss_normative_model(mu='score ~ cs(age)',sigma='~ age + site',tau='~ c(sex)',cv_folds=3) 619 | assert 'GAMLSS_pred' in m.data.columns -------------------------------------------------------------------------------- /tutorials/image1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image1.jpg -------------------------------------------------------------------------------- /tutorials/image2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image2.jpg -------------------------------------------------------------------------------- /tutorials/image3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image3.jpg -------------------------------------------------------------------------------- /tutorials/image4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image4.jpg -------------------------------------------------------------------------------- /tutorials/image5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ppsp-team/PyNM/52de5b73ffd2cb4b04352e348f042dd695be3c87/tutorials/image5.jpg --------------------------------------------------------------------------------
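A closing editorial sketch (not part of the repository): the exact-GP normative model implemented in pynm.py, reduced to its essentials -- fit a Matern-kernel Gaussian process on controls, predict for all subjects, and convert residuals to deviance scores using the predictive sigma. The toy data and mask here are hypothetical.

    import numpy as np
    from sklearn import gaussian_process
    from sklearn.gaussian_process.kernels import ConstantKernel, WhiteKernel, Matern

    rng = np.random.default_rng(0)
    X = rng.uniform(1, 100, (300, 1))              # one confound, e.g. age
    y = np.log(X[:, 0]) + rng.normal(0, 0.2, 300)  # synthetic score
    ctr = rng.random(300) < 0.9                    # boolean mask of controls

    # Same kernel family as pynm's gp_normative_model: constant + white noise + Matern.
    kernel = ConstantKernel() + WhiteKernel(noise_level=1) + Matern(length_scale=1, nu=2.5)
    gp = gaussian_process.GaussianProcessRegressor(kernel=kernel)
    gp.fit(X[ctr], y[ctr])

    # Predict for everyone; z-scores are residuals scaled by predictive sigma.
    y_pred, sigma = gp.predict(X, return_std=True)
    z = (y - y_pred) / sigma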