├── .github └── workflows │ └── publish.yaml ├── LICENSE ├── Pygformula_Documentation.pdf ├── README.md ├── datasets ├── example_data_absorbing.csv ├── example_data_basicdata.csv ├── example_data_basicdata_nocomp.csv ├── example_data_binary_eof.csv ├── example_data_categorical.csv ├── example_data_censor.csv ├── example_data_continuous_eof.csv ├── example_data_multiple_treatments.csv ├── example_data_truncated_normal.csv ├── example_data_visit_process.csv ├── example_data_zero_inflated_normal.csv └── example_threshold_data.csv ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── Contact.rst │ ├── Datasets.rst │ ├── Get Started.rst │ ├── Installation.rst │ ├── Specifications │ ├── Censoring event.rst │ ├── Competing event.rst │ ├── Covariate models.rst │ ├── Deterministic knowledge.rst │ ├── Hazard ratio.rst │ ├── Input data.rst │ ├── Interventions.rst │ ├── Outcome model.rst │ ├── Output.rst │ ├── Visit process.rst │ └── index.rst │ ├── conf.py │ ├── index.rst │ └── media │ ├── absorbing_cov_example_output.png │ ├── binary_cov_example_output.png │ ├── binary_eof_example_output.png │ ├── bounded_normal_cov_example.png │ ├── categorical_cov_example_output.png │ ├── categorical_time_cov_example.png │ ├── censor_example_output.png │ ├── comp_restriction_example_output.png │ ├── competing_as_cens_output.png │ ├── competing_not_cens_output.png │ ├── continuous_eof_example_output.png │ ├── data_example.png │ ├── data_example_censor.png │ ├── data_example_competing.png │ ├── dynamic_example_output.png │ ├── example_hazardratio_output.png │ ├── get_started_example.png │ ├── get_started_example_all.jpg │ ├── get_started_example_bootstrap.jpg │ ├── get_started_example_intervention_curve.jpg │ ├── natural_course_output.png │ ├── natural_grace_period.png │ ├── normal_cov_example_output.png │ ├── random_forest_cov.png │ ├── restriction_example_output.png │ ├── static_example_one_treatment_output.png │ ├── static_example_two_treatments.png │ ├── 
static_multiple_interventions.png │ ├── survival_example_output.png │ ├── test_hazard_ratio.png │ ├── threshold_example_output.png │ ├── truncated_normal_cov_example.png │ ├── uniform_grace_period.png │ ├── visitprocess_example_output.png │ ├── yrestriction_example_output.png │ └── zero_inflated_normal_cov_example.png ├── pygformula ├── __init__.py ├── comparisons.py ├── data.py ├── interventions.py ├── parametric_gformula │ ├── __init__.py │ ├── bootstrap.py │ ├── fit.py │ ├── histories.py │ ├── parametric_gformula.py │ └── simulate.py ├── plot.py ├── utils │ ├── __init__.py │ ├── helper.py │ └── util.py └── version.py ├── readthedocs.yaml ├── requirements.txt ├── running_examples ├── get_started_example.py ├── test_absorbing_cov.py ├── test_binary_cov.py ├── test_binary_eof.py ├── test_bounded_normal_cov.py ├── test_categorical_cov.py ├── test_categorical_time.py ├── test_censor.py ├── test_comp_restrictions.py ├── test_competing_event.py ├── test_continuous_eof.py ├── test_custom_ymodel.py ├── test_dynamic_intervention.py ├── test_fit_random_forest.py ├── test_natural_course.py ├── test_natural_grace_period.py ├── test_normal_cov.py ├── test_restrictions.py ├── test_square_time.py ├── test_static_multiple_treatments.py ├── test_static_one_treatment.py ├── test_threshold_intervention.py ├── test_truncated_normal.py ├── test_uniform_grace_period.py ├── test_visit_process.py ├── test_yrestrictions.py └── test_zero_inflated_normal_cov.py └── setup.py /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI when a Release is Created 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | pypi-publish: 9 | name: Publish release to PyPI 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: pypi 13 | url: https://pypi.org/p/pygformula 14 | permissions: 15 | id-token: write 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python 19 | uses: 
actions/setup-python@v4 20 | with: 21 | python-version: "3.x" 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel 26 | - name: Build package 27 | run: | 28 | python setup.py sdist bdist_wheel # Could also be python -m build 29 | - name: Publish package distributions to PyPI 30 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 The President and Fellows of Harvard College 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pygformula_Documentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/Pygformula_Documentation.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pygformula: a python implementation of the parametric g-formula 2 | 3 | [![PyPI version](https://badge.fury.io/py/pygformula.svg)](https://pypi.org/project/pygformula) 4 | [![Documentation Status](https://readthedocs.org/projects/pygformula/badge/?version=latest)](https://pygformula.readthedocs.io) 5 | [![Downloads](https://static.pepy.tech/badge/pygformula)](https://pepy.tech/project/pygformula) 6 | 7 | **Authors: Jing Li, Sophia Rein, Sean McGrath, Roger Logan, Ryan O’Dea, Miguel Hernán** 8 | 9 | 10 | ## Overview 11 | The pygformula package implements the non-iterative conditional expectation (NICE) estimator of the g-formula algorithm 12 | (Robins, 1986). The g-formula can estimate an outcome’s counterfactual mean or risk under hypothetical treatment strategies 13 | (interventions) when there is sufficient information on time-varying treatments and confounders. 14 | 15 | 16 | ### Features 17 | 18 | * Treatments: discrete or continuous time-varying treatments. 19 | * Outcomes: failure time outcomes or continuous/binary end of follow-up outcomes. 20 | * Interventions: interventions on a single treatment or joint interventions on multiple treatments. 21 | * Random measurement/visit process. 22 | * Incorporation of a priori knowledge of the data structure. 23 | * Censoring events. 24 | * Competing events. 
25 | 26 | 27 | ## Requirements 28 | 29 | The package requires python 3.8+ and these necessary dependencies: 30 | 31 | - cmprsk 32 | - joblib 33 | - lifelines 34 | - matplotlib 35 | - numpy 36 | - pandas 37 | - prettytable 38 | - pytruncreg 39 | - scipy 40 | - seaborn 41 | - statsmodels 42 | - tqdm 43 | 44 | 45 | ## Documentation 46 | 47 | The online documentation is available at [pygformula documentation](https://pygformula.readthedocs.io). 48 | 49 | ## Issues 50 | 51 | If you have any issues, please open an [issue](https://github.com/CausalInference/pygformula/issues) on github, we will 52 | regularly check the questions. For any additional questions or comments, please email jing_li@hsph.harvard.edu. -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=5.3.0 2 | sphinx_rtd_theme>=1.1.1 3 | readthedocs-sphinx-search>=0.3.2 4 | PyQt5>=5.15.11 5 | -------------------------------------------------------------------------------- /docs/source/Contact.rst: -------------------------------------------------------------------------------- 1 | Contact 2 | '''''''''''''''''''' 3 | 4 | The pygformula package was developed in the CAUSALab by: 5 | 6 | - Jing Li, jing_li@hsph.harvard.edu 7 | - Sophia Rein, srein@hsph.harvard.edu 8 | - Sean McGrath, sean_mcgrath@g.harvard.edu 9 | - Roger Logan, rwlogan@hsph.harvard.edu 10 | - Ryan O’Dea, 
ryanodea@hsph.harvard.edu 11 | - Miguel Hernán, mhernan@hsph.harvard.edu 12 | 13 | 14 | If you have any questions or suggestions about this package, please contact jing_li@hsph.harvard.edu. 15 | As an ongoing open-source project, contributions are highly welcome for any bug reports or 16 | feature suggestions. 17 | 18 | - Issue reports: if you have any issues, please let us know by opening an `issue `_ 19 | on github. 20 | 21 | - Feature requests: if you want to contribute any new feature implementation, please make a 22 | `pull request `_ to post the feature requests. 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/source/Datasets.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | ''''''''''''''''''' 3 | 4 | We provide simulation datasets for users to run the different examples in this tutorial. 5 | Additionally, code for replicating all test examples can be found in `"running examples" `_. 6 | 7 | 8 | .. note:: 9 | 10 | The data can be downloaded by the following command, e.g., downloading the "example_data_basicdata_nocomp" dataset: 11 | 12 | .. code-block:: 13 | 14 | from pygformula.data import load_basicdata_nocomp 15 | 16 | or be accessed directly from the github repository `pygformula `_. 17 | -------------------------------------------------------------------------------- /docs/source/Get Started.rst: -------------------------------------------------------------------------------- 1 | Get Started 2 | '''''''''''''''''''' 3 | 4 | =================== 5 | Algorithm outline 6 | =================== 7 | 8 | 9 | The parametric g-formula estimator of the noniterative conditional expectation (NICE) requires 10 | the specification of models for the joint density of the confounders, treatments, and outcomes over time. 
11 | The algorithm has three steps: (1) Parametric estimation, (2) Monte Carlo simulation, 12 | and (3) Calculation of risk/mean under each intervention. 13 | 14 | + **Parametric estimation**: (a) estimate the conditional densities of each covariate given past covariate history 15 | by fitting user-specified regression models, (b) estimate the discrete hazard (for survival outcome) or mean 16 | (for binary/continuous end of follow-up) of the outcome conditional on past covariate history by fitting a user-specified 17 | regression model, (c) if the event of interest is subject to competing events and competing events are not treated as censoring events, estimate the conditional probability of the competing event 18 | conditional on past covariate history by fitting a user-specified regression model for the competing event. 19 | 20 | + **Monte Carlo simulation**: (a) generate a new dataset which is usually larger than the original dataset; for each covariate, 21 | generate simulated values at each time step using the estimated covariate models from step (1), (b) for the 22 | covariates that are to undergo intervention, their values are assigned according to the user-specified intervention rule, 23 | (c) obtain the discrete hazard / mean of the outcome based on the estimated outcome model from step (1), 24 | (d) if the event of interest is subject to competing events and competing events are not treated as censoring events, 25 | obtain the discrete hazard of the competing event based on the estimated competing event model from step (1). 26 | 27 | + **Calculation of risk/mean under each intervention**: for binary/continuous end of follow-up, the final estimate is the mean of 28 | the estimated outcome of all individuals in the new dataset computed from step (2). For survival outcomes, 29 | the final estimate is obtained by calculating the mean of cumulative risks for all individuals using the discrete hazards computed from step (2). 30 | 31 | 32 | 33 | Arguments: 34 | 35 | .. 
automodule:: pygformula.parametric_gformula 36 | .. autosummary:: ParametricGformula 37 | .. autoclass:: ParametricGformula 38 | 39 | 40 | 41 | =================== 42 | Example 43 | =================== 44 | The observational dataset 45 | `example_data_basicdata_nocomp `_ consists of 13,170 observations on 2,500 individuals with a maximum of 7 follow-up 46 | times. The dataset contains the following variables: 47 | 48 | - id: Unique identifier for each individual. 49 | - t0: Time index. 50 | - L1: Binary time-varying covariate. 51 | - L2: Continuous time-varying covariate. 52 | - L3: Categorical baseline covariate. 53 | - A: Binary treatment variable. 54 | - Y: Outcome of interest; time-varying indicator of failure. 55 | 56 | We are interested in the risk by the end of follow-up under the static interventions ‘‘Never treat’’ (set treatment 57 | to 0 at all times) and ‘‘Always treat’’ (set treatment to 1 at all times). 58 | 59 | - First, import the g-formula method ParametricGformula: 60 | 61 | .. code-block:: 62 | 63 | from pygformula import ParametricGformula 64 | 65 | - Then, load the data (here is an example of loading simulated `data `_ in the package, 66 | users can also load their own data) as required pandas DataFrame type 67 | 68 | .. code:: 69 | 70 | from pygformula.data import load_basicdata_nocomp 71 | obs_data = load_basicdata_nocomp() 72 | 73 | - Specify the name of the time variable, and the name of the individual identifier in the input data 74 | 75 | .. code-block:: 76 | 77 | time_name = 't0' 78 | id = 'id' 79 | 80 | - Specify covariate names, covariate types, and corresponding model statements 81 | 82 | .. 
code-block:: 83 | 84 | covnames = ['L1', 'L2', 'A'] 85 | covtypes = ['binary', 'bounded normal', 'binary'] 86 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 87 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 88 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 89 | 90 | If there are baseline covariates (i.e., covariates with the same value at all times) in the model statement, specify them in the 91 | ‘‘basecovs’’ argument: 92 | 93 | .. code:: 94 | 95 | basecovs = ['L3'] 96 | 97 | 98 | - Specify the static interventions of interest: 99 | 100 | .. code-block:: 101 | 102 | from pygformula.interventions import static 103 | 104 | time_points = np.max(np.unique(obs_data[time_name])) + 1 105 | int_descript = ['Never treat', 'Always treat'] 106 | 107 | Intervention1_A = [static, np.zeros(time_points)], 108 | Intervention2_A = [static, np.ones(time_points)], 109 | 110 | - Specify the outcome name, outcome model statement, and the outcome type 111 | 112 | .. code-block:: 113 | 114 | outcome_name = 'Y' 115 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 116 | outcome_type = 'survival' 117 | 118 | - Specify all the arguments in the "ParametricGformula" class and call its "fit" function: 119 | 120 | .. code-block:: 121 | 122 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 123 | covnames=covnames, covtypes=covtypes, 124 | covmodels=covmodels, basecovs=basecovs, 125 | time_points=time_points, 126 | Intervention1_A = [static, np.zeros(time_points)], 127 | Intervention2_A = [static, np.ones(time_points)], 128 | outcome_name=outcome_name, ymodel=ymodel, 129 | outcome_type = outcome_type) 130 | 131 | g.fit() 132 | 133 | - Finally, get the output: 134 | 135 | .. image:: media/get_started_example.png 136 | :align: center 137 | 138 | 139 | - "Intervention": the name of natural course intervention and user-specified interventions. 
140 | - "NP-risk": the nonparametric estimates of the natural course risk. 141 | - "g-formula risk": the parametric g-formula estimates of each intervention. 142 | - "Risk Ratio (RR)": the risk ratio comparing each intervention and reference intervention. 143 | - "Risk Difference (RD)": the risk difference comparing each intervention and reference intervention. 144 | 145 | In the output table, the g-formula risk results under the specified interventions are shown, as well as the natural course. 146 | Furthermore, the nonparametric risk under the natural course is provided, which can be used to assess model misspecification of the parametric 147 | g-formula. The risk ratio and risk difference comparing the specific intervention and the reference 148 | intervention (set to natural course by default) are also calculated. 149 | 150 | Users can also get the standard errors and 95% confidence intervals of the g-formula estimates by specifying the ‘‘nsamples’’ argument. 151 | For example, specifying ‘‘nsamples’’ as 20 with parallel processing using 8 cores: 152 | 153 | .. code-block:: 154 | 155 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 156 | time_points = time_points, 157 | Intervention1_A = [static, np.zeros(time_points)], 158 | Intervention2_A = [static, np.ones(time_points)], 159 | covnames=covnames, covtypes=covtypes, 160 | covmodels=covmodels, basecovs=basecovs, 161 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type, 162 | nsamples=20, parallel=True, ncores=8) 163 | g.fit() 164 | 165 | The package will return the following results: 166 | 167 | .. image:: media/get_started_example_bootstrap.jpg 168 | :align: center 169 | :width: 8.5in 170 | :height: 2in 171 | 172 | The result table contains 95% lower bound and upper bound for the risk, risk difference and risk ratio for all interventions. 173 | 174 | The pygformula package also provides plots for risk curves of interventions, which can be called by: 175 | 176 | .. 
code:: 177 | 178 | g.plot_interventions() 179 | 180 | It will return the g-formula risk (with 95% confidence intervals if using bootstrap samples) at all follow-up times under each intervention: 181 | 182 | .. image:: media/get_started_example_intervention_curve.jpg 183 | :align: center 184 | :width: 5in 185 | :height: 4in 186 | 187 | User can also get the plots of parametric and nonparametric estimates of 188 | the risks and covariate means under natural course by: 189 | 190 | .. code:: 191 | 192 | g.plot_natural_course() 193 | 194 | 195 | .. image:: media/get_started_example_all.jpg 196 | :align: center 197 | 198 | 199 | 200 | **Running example** `[code] `_: 201 | 202 | .. code-block:: 203 | 204 | import numpy as np 205 | from pygformula import ParametricGformula 206 | from pygformula.interventions import static 207 | from pygformula.data import load_basicdata_nocomp 208 | 209 | obs_data = load_basicdata_nocomp() 210 | time_name = 't0' 211 | id = 'id' 212 | 213 | covnames = ['L1', 'L2', 'A'] 214 | covtypes = ['binary', 'bounded normal', 'binary'] 215 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 216 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 217 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 218 | 219 | basecovs = ['L3'] 220 | 221 | outcome_name = 'Y' 222 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 223 | outcome_type = 'survival' 224 | 225 | time_points = np.max(np.unique(obs_data[time_name])) + 1 226 | int_descript = ['Never treat', 'Always treat'] 227 | 228 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 229 | time_points = time_points, int_descript = int_descript, 230 | covnames=covnames, covtypes=covtypes, 231 | covmodels=covmodels, basecovs=basecovs, 232 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type, 233 | Intervention1_A = [static, np.zeros(time_points)], 234 | Intervention2_A = [static, 
np.ones(time_points)], 235 | nsamples=20, parallel=True, ncores=8) 236 | g.fit() 237 | g.plot_natural_course() 238 | g.plot_interventions() 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /docs/source/Installation.rst: -------------------------------------------------------------------------------- 1 | '''''''''''''''''''' 2 | Installation 3 | '''''''''''''''''''' 4 | 5 | Requirements 6 | ^^^^^^^^^^^^ 7 | 8 | The package requires python ≥ 3.8 and these necessary dependencies: 9 | 10 | * cmprsk 11 | * joblib 12 | * lifelines 13 | * matplotlib 14 | * numpy 15 | * pandas 16 | * prettytable 17 | * pytruncreg 18 | * scipy 19 | * seaborn 20 | * statsmodels 21 | * tqdm 22 | 23 | 24 | All the dependencies needed by the pygformula are listed in the file 25 | `"requirements.txt" `_ , users can 26 | install them by: 27 | 28 | .. code:: 29 | 30 | pip install -r requirements.txt 31 | 32 | 33 | Install pygformula 34 | ^^^^^^^^^^^^^^^^^^^^^^^^ 35 | 36 | Users can use the following command to install the pygformula package: 37 | 38 | .. code:: 39 | 40 | pip install pygformula 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /docs/source/Specifications/Censoring event.rst: -------------------------------------------------------------------------------- 1 | .. _Censoring event: 2 | 3 | Censoring event 4 | =================== 5 | 6 | When there are censoring events, the package provides the option to obtain inverse probability weighted (IPW) estimates 7 | for comparison with the g-formula estimates. The comparison of these two estimates can be useful to assess model misspecification 8 | of the g-formula [1]_. 9 | To get the IPW estimate, the name of the censoring variable in the input data should be specified, 10 | users also need to specify a censor model to obtain the weights. 
11 | 12 | Note that the arguments ‘‘censor_name’’ and ‘‘censor_model’’ are only needed when users want to 13 | get the IPW estimate. The package will return the nonparametric observed risk in general cases. 14 | 15 | 16 | The arguments for censoring events: 17 | 18 | .. list-table:: 19 | :header-rows: 1 20 | 21 | * - Arguments 22 | - Description 23 | * - censor_name 24 | - (Optional) A string specifying the name of the censoring variable in obs_data. Only applicable when using inverse 25 | probability weights to estimate the natural course means / risk from the observed data. 26 | * - censor_model 27 | - (Optional) A string specifying the model statement for the censoring variable. Only applicable when using inverse 28 | probability weights to estimate the natural course means / risk from the observed data. 29 | * - ipw_cutoff_quantile 30 | - (Optional) Percentile value for truncation of the inverse probability weights. 31 | * - ipw_cutoff_value 32 | - (Optional) Absolute value for truncation of the inverse probability weights. 33 | 34 | Users can also specify a percentile value (in the argument ‘‘ipw_cutoff_quantile’’) or an absolute value 35 | (in the argument ‘‘ipw_cutoff_value’’) to truncate the inverse probability weights. 36 | 37 | 38 | **Sample syntax**: 39 | 40 | .. code-block:: 41 | 42 | censor_name = 'C' 43 | censor_model = 'C ~ A + L' 44 | 45 | g = ParametricGformula(..., censor_name = censor_name, censor_model = censor_model, ...) 46 | 47 | .. note:: 48 | 49 | When there are categorical covariates (which are assigned a 'C' symbol) in the model statement of the censoring variable, 50 | please name the censoring variable any name except 'C' to avoid name confusion. 51 | 52 | 53 | **Running example** `[code] `_: 54 | 55 | .. 
code-block:: 56 | 57 | import numpy as np 58 | from pygformula import ParametricGformula 59 | from pygformula.interventions import static 60 | from pygformula.data import load_censor_data 61 | 62 | obs_data = load_censor_data() 63 | time_name = 't0' 64 | id = 'id' 65 | 66 | covnames = ['L', 'A'] 67 | covtypes = ['binary', 'normal'] 68 | 69 | covmodels = ['L ~ lag1_L + t0', 70 | 'A ~ lag1_A + L + t0'] 71 | 72 | outcome_name = 'Y' 73 | ymodel = 'Y ~ A + L' 74 | 75 | censor_name = 'C' 76 | censor_model = 'C ~ A + L' 77 | 78 | time_points = np.max(np.unique(obs_data[time_name])) + 1 79 | int_descript = ['Never treat', 'Always treat'] 80 | 81 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 82 | time_points = time_points, 83 | int_descript=int_descript, 84 | Intervention1_A = [static, np.zeros(time_points)], 85 | Intervention2_A = [static, np.ones(time_points)], 86 | censor_name= censor_name, censor_model=censor_model, 87 | covnames = covnames, covtypes = covtypes, covmodels = covmodels, 88 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 89 | g.fit() 90 | 91 | 92 | **Output**: 93 | 94 | .. image:: ../media/censor_example_output.png 95 | :align: center 96 | 97 | .. [1] Yu-Han Chiu, Lan Wen, Sean McGrath, Roger Logan, Issa J Dahabreh, and Miguel A Hernán. 2022. Evaluating model specification when using the parametric g-formula in the presence of censoring. American Journal of Epidemiology. -------------------------------------------------------------------------------- /docs/source/Specifications/Competing event.rst: -------------------------------------------------------------------------------- 1 | .. _Competing event: 2 | 3 | Competing event 4 | =================== 5 | 6 | In the presence of competing events, users may choose whether to treat competing 7 | events as censoring events. 
When competing events are treated as censoring events, 8 | risks under different interventions are calculated under elimination of 9 | competing events, and are obtained by the Kaplan–Meier estimator. 10 | When competing events are not treated as censoring events, risks under different interventions are calculated without elimination of 11 | competing events, and are obtained by using an estimate of the subdistribution cumulative incidence function [1]_ :sup:`,` [2]_. 12 | 13 | The arguments for competing events: 14 | 15 | .. list-table:: 16 | :header-rows: 1 17 | 18 | * - Arguments 19 | - Description 20 | * - compevent_name 21 | - (Optional) A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes. 22 | * - compevent_model 23 | - (Optional) A string specifying the model statement for the competing event variable. Only applicable for survival outcomes. 24 | * - compevent_cens 25 | - (Optional) A boolean value indicating whether to treat competing events as censoring events. Default is False. 26 | 27 | 28 | **Sample syntax**: 29 | 30 | .. code-block:: 31 | 32 | compevent_name = 'D' 33 | compevent_model = 'D ~ A + L1 + L2 + L3 + t0' 34 | compevent_cens = False 35 | 36 | g = ParametricGformula(..., compevent_name = compevent_name, compevent_model = compevent_model, compevent_cens = compevent_cens, ...) 37 | 38 | The name of competing event in the input data should be specified in the argument ‘‘compevent_name’’. 39 | The model statement for the competing event variable should be specified in the argument ‘‘compevent_model’’. 40 | Users should also specify the argument ‘‘compevent_cens’’ as True or False indicating whether they want to treat the competing 41 | event as censoring event (the default is False). 42 | 43 | Setting ‘‘compevent_cens’’ as default (False): 44 | 45 | **Running example** `[code] `_: 46 | 47 | .. 
code-block:: 48 | 49 | from pygformula import ParametricGformula 50 | from pygformula.interventions import static 51 | from pygformula.data import load_basicdata 52 | 53 | obs_data = load_basicdata() 54 | 55 | covnames = ['L1', 'L2', 'A'] 56 | covtypes = ['binary', 'bounded normal', 'binary'] 57 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 58 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 59 | 'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 60 | 61 | ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2' 62 | 63 | time_name = 't0' 64 | id = 'id' 65 | outcome_name = 'Y' 66 | basecovs = ['L3'] 67 | 68 | compevent_name = 'D' 69 | compevent_model = 'D ~ A + L1 + L2 + L3 + t0' 70 | 71 | time_points = np.max(np.unique(obs_data[time_name])) + 1 72 | int_descript = ['Never treat', 'Always treat'] 73 | 74 | g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points, 75 | time_name=time_name, int_descript = int_descript, 76 | Intervention1_A = [static, np.zeros(time_points)], 77 | Intervention2_A = [static, np.ones(time_points)], 78 | basecovs =basecovs, covnames=covnames, 79 | covtypes=covtypes, covmodels=covmodels, 80 | compevent_name = compevent_name, compevent_model=compevent_model, 81 | outcome_name=outcome_name, outcome_type='survival', ymodel=ymodel) 82 | g.fit() 83 | 84 | 85 | **Output**: 86 | 87 | .. image:: ../media/competing_not_cens_output.png 88 | :align: center 89 | 90 | Setting ‘‘compevent_cens’’ as True: 91 | 92 | .. code-block:: 93 | 94 | compevent_name = 'D' 95 | compevent_model = 'D ~ A + L1 + L2 + L3 + t0' 96 | compevent_cens = True 97 | 98 | g = ParametricGformula(..., compevent_name = compevent_name, compevent_model = compevent_model, compevent_cens = compevent_cens, ...) 99 | 100 | **Output**: 101 | 102 | .. image:: ../media/competing_as_cens_output.png 103 | :align: center 104 | 105 | 106 | .. 
[1] Young JG, Stensrud MJ, Tchetgen Tchetgen EJ, Hernán MA. A causal framework for classical statistical estimands 107 | in failure-time settings with competing events. Statistics in Medicine. 2020;39:1199-236. 108 | .. [2] Fine JP and Gray RJ. A proportional hazards model for the subdistribution of a competing risk. Journal of the American Statistical Association, 94(446):496–509, 1999. 109 | 110 | -------------------------------------------------------------------------------- /docs/source/Specifications/Deterministic knowledge.rst: -------------------------------------------------------------------------------- 1 | .. _Deterministic knowledge: 2 | 3 | 4 | Deterministic knowledge 5 | ============================================== 6 | When there is known a priori deterministic knowledge, it can be incorporated into the g-formula algorithm to avoid unnecessary 7 | extrapolation. The package allows users to apply restrictions of the deterministic knowledge on the covariates, 8 | outcome or competing event. 9 | 10 | 11 | Restrictions on covariates 12 | ------------------------------- 13 | 14 | When incorporating the deterministic knowledge of one time-varying covariate Z, the estimation is changed as follows: 15 | 16 | 1. In step 1 of the algorithm, restrict the chosen method of estimating the mean of Z given 17 | “history” to only records where deterministic knowledge is absent. 18 | 19 | 2. In step 2 of the algorithm, set Z deterministically to its known value for histories under which this 20 | value is known. Otherwise, draw Z according to the model-based estimated conditional distribution of Z. 21 | 22 | For example, suppose there are two time-varying covariates: one indicator of whether an individual has started menopause 23 | by a given interval k (menopause), and another indicator of whether she is pregnant in interval k (pregnancy). 24 | The deterministic knowledge is that given menopause == 1, the probability that pregnancy == 0 is 1. 
In the first 25 | estimation step, only records with menopause == 0 are used for model estimation of pregnancy. Then in the second 26 | simulation step, if the value of menopause in step 1 at time k is 1 then pregnancy is set to 0. Otherwise, the value 27 | of pregnancy at time k is drawn from the estimated distribution in step 1. 28 | 29 | The package allows deterministic knowledge incorporation for covariates by the argument ‘‘restrictions’’: 30 | 31 | .. list-table:: 32 | :header-rows: 1 33 | 34 | * - Arguments 35 | - Description 36 | * - restrictions 37 | - (Optional) List of lists. For each inner list, its first entry is the name of the covariate whose deterministic knowledge 38 | is known; its second entry is a dictionary whose keys are covariate names and whose values are the conditions which should be True when the covariate 39 | is modeled; the third entry is the value that is set to the covariate during simulation when the conditions 40 | in the second entry are not True. 41 | 42 | Note that each restricted covariate and its conditioning covariates need to follow this order in ‘‘covnames’’, i.e., 43 | the restricted covariate should come after its conditioning covariates. 44 | 45 | An example of the restrictions that encodes the relationship between menopause and pregnancy above: 46 | 47 | .. code-block:: 48 | 49 | restrictions = [['pregnancy', {'menopause': lambda x: x == 0}, 0]] 50 | g = ParametricGformula(..., restrictions = restrictions, ...) 51 | 52 | **Sample syntax**: 53 | 54 | An example with one deterministic knowledge condition for one covariate 'L2': if L1 equals 0, L2 is estimated 55 | by its parametric model, otherwise, it is set to a known value 0.5. 56 | 57 | .. code-block:: 58 | 59 | restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5]] 60 | g = ParametricGformula(..., restrictions = restrictions, ...)
61 | 62 | An example with multiple deterministic knowledge conditions for one covariate 'A': if L1 equals 0 and L2 is greater than 0.5, A is estimated 63 | by its parametric model, otherwise, it is set to a known value 1. 64 | 65 | .. code-block:: 66 | 67 | restrictions = [['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]] 68 | g = ParametricGformula(..., restrictions = restrictions, ...) 69 | 70 | An example with multiple restrictions, one for covariate L2 and one for covariate A: 71 | 72 | .. code-block:: 73 | 74 | restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5], ['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]] 75 | g = ParametricGformula(..., restrictions = restrictions, ...) 76 | 77 | If the assigned value of the covariate is not a static value, but determined by a user-specified function, 78 | the ‘‘restrictions’’ allows an input as a function type. In this case, the third entry for a restriction is a function 79 | instead of a value. 80 | 81 | For each custom restriction function, the input should be the parameters (not necessary to use all): 82 | 83 | * new_df: A DataFrame that contains the observed or simulated data at time t. 84 | * pool: A DataFrame that contains the observed or simulated data up to time t. 85 | * time_name: A string specifying the name of the time variable in pool. 86 | * t: An integer specifying the current time index. 87 | 88 | The function output should be a list of values that users wish to assign for the restricted covariate at time t. 89 | The package will automatically assign these values for records that are not restricted by the conditions. 90 | 91 | An example with one deterministic knowledge condition for covariate L2: if L1 equals 0, L2 is estimated 92 | by its parametric model, otherwise, its previous value is carried forward. 93 | 94 | .. 
code-block:: 95 | 96 | def carry_forward(new_df, pool, time_name, t): 97 | assigned_values = pool.loc[pool[time_name] == t-1, 'L2'] 98 | return assigned_values 99 | 100 | restrictions = [['L2', {'L1': lambda x: x == 0}, carry_forward]] 101 | g = ParametricGformula(..., restrictions = restrictions, ...) 102 | 103 | **Running example** `[code] `_: 104 | 105 | .. code-block:: 106 | 107 | import numpy as np 108 | from pygformula.interventions import static 109 | from pygformula import ParametricGformula 110 | from pygformula.data import load_basicdata_nocomp 111 | 112 | obs_data = load_basicdata_nocomp() 113 | 114 | time_name = 't0' 115 | id = 'id' 116 | 117 | covnames = ['L1', 'L2', 'A'] 118 | covtypes = ['binary', 'normal', 'binary'] 119 | covmodels = ['L1 ~ lag1_L1 + lag1_A', 120 | 'L2 ~ L1 + lag1_L2', 121 | 'A ~ L1 + L2'] 122 | 123 | basecovs = ['L3'] 124 | outcome_name = 'Y' 125 | ymodel = 'Y ~ L1 + L2 + A' 126 | 127 | # define interventions 128 | time_points = np.max(np.unique(obs_data[time_name])) + 1 129 | int_descript = ['Never treat', 'Always treat'] 130 | 131 | restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5], ['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]] 132 | 133 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 134 | time_points = time_points, 135 | int_descript = int_descript, 136 | Intervention1_A = [static, np.zeros(time_points)], 137 | Intervention2_A = [static, np.ones(time_points)], 138 | covnames=covnames, covtypes=covtypes, 139 | covmodels=covmodels, basecovs=basecovs, 140 | restrictions=restrictions, outcome_name=outcome_name, 141 | ymodel=ymodel, outcome_type='survival') 142 | g.fit() 143 | 144 | 145 | **Output**: 146 | 147 | .. 
image:: ../media/restriction_example_output.png 148 | :align: center 149 | 150 | 151 | Restrictions on outcome 152 | --------------------------------- 153 | 154 | When there is deterministic knowledge of the outcome variable Y, the package offers the argument 155 | ‘‘yrestrictions’’ to incorporate the knowledge: 156 | 157 | .. list-table:: 158 | :header-rows: 1 159 | 160 | * - Arguments 161 | - Description 162 | * - yrestrictions 163 | - (Optional) List of lists. For each inner list, its first entry is a dictionary whose key is the conditions which 164 | should be True when the outcome is modeled, the second entry is the value that is set to the outcome during 165 | simulation when the conditions in the first entry are not True. 166 | 167 | 168 | **Sample syntax**: 169 | 170 | An example with one deterministic knowledge condition for outcome Y: if L1 equals 0, the probability of outcome Y is estimated 171 | by its parametric model, otherwise, it is set to value 1. 172 | 173 | .. code-block:: 174 | 175 | yrestrictions = [[{'L1': lambda x: x == 0}, 1]] 176 | g = ParametricGformula(..., yrestrictions = yrestrictions, ...) 177 | 178 | An example with multiple restrictions for outcome Y: if L1 equals 0, 179 | the probability of outcome Y is estimated by its parametric model, otherwise, it is set to a value 0; if L2 is greater than 0.5, 180 | the probability of outcome Y is estimated by its parametric model, otherwise, it is set to a value 0.1: 181 | 182 | .. code-block:: 183 | 184 | yrestrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]] 185 | g = ParametricGformula(..., yrestrictions = yrestrictions, ...) 186 | 187 | 188 | **Running example** `[code] `_: 189 | 190 | ..
code-block:: 191 | 192 | from pygformula import ParametricGformula 193 | from pygformula.interventions import static 194 | from pygformula.data import load_basicdata_nocomp 195 | 196 | obs_data = load_basicdata_nocomp() 197 | 198 | time_name = 't0' 199 | id = 'id' 200 | 201 | covnames = ['L1', 'L2', 'A'] 202 | covtypes = ['binary', 'normal', 'binary'] 203 | covmodels = ['L1 ~ lag1_L1 + lag1_A', 204 | 'L2 ~ L1 + lag1_L2', 205 | 'A ~ L1 + L2'] 206 | 207 | basecovs = ['L3'] 208 | outcome_name = 'Y' 209 | ymodel = 'Y ~ L1 + L2 + A' 210 | 211 | # define interventions 212 | time_points = np.max(np.unique(obs_data[time_name])) + 1 213 | int_descript = ['Never treat', 'Always treat'] 214 | 215 | yrestrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]] 216 | 217 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 218 | time_points = time_points, 219 | int_descript = int_descript, 220 | Intervention1_A = [static, np.zeros(time_points)], 221 | Intervention2_A = [static, np.ones(time_points)], 222 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 223 | yrestrictions=yrestrictions, outcome_name=outcome_name, 224 | ymodel=ymodel, outcome_type='survival') 225 | g.fit() 226 | 227 | 228 | **Output**: 229 | 230 | .. image:: ../media/yrestriction_example_output.png 231 | :align: center 232 | 233 | 234 | Restrictions on competing event 235 | ----------------------------------- 236 | 237 | When there is a competing event D and there is known deterministic knowledge of the competing event, 238 | the package offers the argument ‘‘compevent_restrictions’’ for incorporation: 239 | 240 | .. list-table:: 241 | :header-rows: 1 242 | 243 | * - Arguments 244 | - Description 245 | * - compevent_restrictions 246 | - (Optional) List of lists. 
For each inner list, its first entry is a dictionary whose key is the conditions which 247 | should be True when the competing event is modeled, the second entry is the value that is set to the competing 248 | event during simulation when the conditions in the first entry are not True. Only applicable for survival outcomes. 249 | 250 | 251 | **Sample syntax**: 252 | 253 | An example with one deterministic knowledge condition for competing event D: if L1 equals 0, the probability of competing 254 | event is estimated by its parametric model, otherwise, it is set to a value 1. 255 | 256 | .. code-block:: 257 | 258 | compevent_restrictions = [[{'L1': lambda x: x == 0}, 1]] 259 | g = ParametricGformula(..., compevent_restrictions = compevent_restrictions, ...) 260 | 261 | An example with multiple restrictions for competing event D: if L1 equals 0, the probability of competing 262 | event is estimated by its parametric model, otherwise, it is set to a value 0; if L2 is greater than 0.5, 263 | the probability of competing event is estimated by its parametric model, otherwise, 264 | it is set to a value 0.1: 265 | 266 | .. code-block:: 267 | 268 | compevent_restrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]] 269 | g = ParametricGformula(..., compevent_restrictions = compevent_restrictions, ...) 270 | 271 | 272 | **Running example** `[code] `_: 273 | 274 | ..
code-block:: 275 | 276 | from pygformula import ParametricGformula 277 | from pygformula.interventions import static 278 | from pygformula.data import load_basicdata 279 | 280 | obs_data = load_basicdata() 281 | 282 | covnames = ['L1', 'L2', 'A'] 283 | covtypes = ['binary', 'bounded normal', 'binary'] 284 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 285 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 286 | 'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 287 | 288 | ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2' 289 | 290 | time_name = 't0' 291 | id = 'id' 292 | outcome_name = 'Y' 293 | basecovs = ['L3'] 294 | 295 | compevent_name = 'D' 296 | compevent_model = 'D ~ A + L1 + L2 + L3 + t0' 297 | compevent_cens = False 298 | 299 | time_points = np.max(np.unique(obs_data[time_name])) + 1 300 | int_descript = ['Never treat', 'Always treat'] 301 | 302 | compevent_restrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]] 303 | 304 | g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points, 305 | time_name=time_name, int_descript = int_descript, 306 | Intervention1_A = [static, np.zeros(time_points)], 307 | Intervention2_A = [static, np.ones(time_points)], 308 | basecovs =basecovs, covnames=covnames, 309 | covtypes=covtypes, covmodels=covmodels, 310 | compevent_restrictions = compevent_restrictions, 311 | compevent_cens= compevent_cens, compevent_name = compevent_name, 312 | compevent_model=compevent_model, outcome_name=outcome_name, 313 | outcome_type='survival', ymodel=ymodel) 314 | g.fit() 315 | 316 | 317 | **Output**: 318 | 319 | .. image:: ../media/comp_restriction_example_output.png 320 | :align: center -------------------------------------------------------------------------------- /docs/source/Specifications/Hazard ratio.rst: -------------------------------------------------------------------------------- 1 | ..
_Hazard ratio: 2 | 3 | Hazard ratio 4 | ============================ 5 | For survival outcomes, pygformula provides the option of calculating the hazard ratio comparing any 6 | two interventions of interest. In the presence of competing events, it will return the subdistribution hazard ratio 7 | [1]_. Note that there is an order requirement for the input data structure that it should have the competing event before the outcome event. 8 | 9 | *Prerequisite*: If users want to calculate the hazard ratio with competing event, they need to install the additional “rpy2” package 10 | and install the python `"cmprsk" `_ package. Please follow the steps below to install: 11 | 12 | - Install R to set up R environment 13 | 14 | - Install cmprsk R package in R environment: 15 | 16 | .. code:: 17 | 18 | install.packages("cmprsk") 19 | 20 | - Install rpy2 package in python environment: 21 | 22 | .. code:: 23 | 24 | pip install rpy2 25 | 26 | - Install cmprsk package in python environment: 27 | 28 | .. code:: 29 | 30 | pip install cmprsk 31 | 32 | .. note:: 33 | 34 | If you encounter the problem of not finding the R environment, you can set up the R path 35 | in your environment using the following command in the code: 36 | 37 | .. code-block:: 38 | 39 | import os 40 | os.environ["R_HOME"] = 'R_HOME' 41 | 42 | where R_HOME is the R home directory path. 43 | 44 | The argument for calculating the hazard ratio: 45 | 46 | .. list-table:: 47 | :header-rows: 1 48 | 49 | * - Arguments 50 | - Description 51 | * - intcomp 52 | - (Optional) List of two numbers indicating a pair of interventions to be compared by a hazard ratio. 53 | 54 | Users can specify the two interventions by: 55 | 56 | .. code:: 57 | 58 | intcomp = [1, 2] 59 | 60 | The integer i in ‘‘intcomp’’ denotes the i-th intervention in the user-specified interventions. 0 denotes the natural course intervention. 61 | 62 | 63 | **Running example** `[code] `_: 64 | 65 | 66 | ..
code-block:: 67 | 68 | from pygformula import ParametricGformula 69 | from pygformula.interventions import static 70 | from pygformula.data import load_basicdata_nocomp 71 | 72 | obs_data = load_basicdata_nocomp() 73 | time_name = 't0' 74 | id = 'id' 75 | 76 | covnames = ['L2', 'A'] 77 | covtypes = ['bounded normal', 'binary'] 78 | covmodels = ['L2 ~ lag1_A + lag_cumavg1_L2 + L3 + t0', 79 | 'A ~ lag1_A + L2 + lag_cumavg1_L2 + L3 + t0'] 80 | 81 | basecovs = ['L3'] 82 | 83 | outcome_name = 'Y' 84 | ymodel = 'Y ~ L2 + A + lag1_A + L3 + t0' 85 | outcome_type = 'survival' 86 | 87 | time_points = np.max(np.unique(obs_data[time_name])) + 1 88 | int_descript = ['Never treat', 'Always treat'] 89 | 90 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 91 | time_points = time_points, 92 | int_descript = int_descript, intcomp=[1, 2], 93 | Intervention1_A = [static, np.zeros(time_points)], 94 | Intervention2_A = [static, np.ones(time_points)], 95 | covnames=covnames, covtypes=covtypes, 96 | covmodels=covmodels, basecovs=basecovs, 97 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type) 98 | g.fit() 99 | 100 | 101 | **Output**: 102 | 103 | .. image:: ../media/test_hazard_ratio.png 104 | :align: center 105 | 106 | 107 | .. [1] Fine JP and Gray RJ. A proportional hazards model for the subdistribution of a competing risk. Journal of the American Statistical Association, 94(446):496–509, 1999. 108 | 109 | 110 | -------------------------------------------------------------------------------- /docs/source/Specifications/Input data.rst: -------------------------------------------------------------------------------- 1 | .. 
_Input data: 2 | 3 | Input data 4 | ============================ 5 | 6 | The input dataset is specified by the ‘‘obs_data’’ argument which should contain: ‘‘id’’ specifying 7 | the individual identifier, ‘‘time_name’’ specifying the time index, ‘‘covnames’’ specifying the names of 8 | time-varying covariates, ‘‘outcome_name’’ specifying the name of the outcome of interest, ‘‘compevent_name’’ 9 | indicating the competing event status (if present), ‘‘censor_name’’ indicating the censoring event status (if present). 10 | 11 | 12 | **The related arguments**: 13 | 14 | .. list-table:: 15 | :header-rows: 1 16 | 17 | * - Arguments 18 | - Description 19 | * - obs_data 20 | - (Required) A data frame containing the observed data. 21 | * - id 22 | - (Required) A string specifying the name of the id variable in obs_data. 23 | * - time_name 24 | - (Required) A string specifying the name of the time variable in obs_data. 25 | * - outcome_name 26 | - (Required) A string specifying the name of the outcome variable in obs_data. 27 | * - covnames 28 | - (Required) A list of strings specifying the names of the time-varying covariates in obs_data. 29 | * - compevent_name 30 | - (Optional) A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes. 31 | * - censor_name 32 | - (Optional) A string specifying the name of the censoring variable in obs_data. Only applicable when using inverse 33 | probability weights to estimate the natural course means / risk from the observed data. 34 | * - time_points 35 | - (Optional) An integer indicating the number of time points to simulate. It is set equal to the maximum number of records (K) 36 | that obs_data contains for any individual plus 1, if not specified by users. 37 | 38 | 39 | The input data should contain one record for each follow-up time k for each subject (identified by the individual identifier). 
40 | The time index k for each subject should increment by 1 for each subsequent interval (the starting index is 0 in the following 41 | examples, pre-baseline times are also allowed). 42 | The record at each line in the data corresponds to an interval k, which contains the 43 | covariate measurements at interval k and the outcome measurement at interval k+1. 44 | 45 | 46 | Here is an example of input data structure for one subject which contains 7 records on 47 | the measurements of three time-varying covariates ‘‘L1’’, ‘‘L2’’, ‘‘A’’, 48 | one baseline covariate ‘‘L3’’ and the outcome ‘‘Y’’. See `"example_data_basicdata_nocomp" `_ for complete example data. 49 | 50 | .. image:: ../media/data_example.png 51 | :align: center 52 | :width: 5.2in 53 | :height: 1.8in 54 | 55 | **Censoring events.** When there are censoring events, and users want to compute the natural course estimate via 56 | inverse probability weighting, there should be a variable in the input data set that is an 57 | indicator of censoring in the time between covariate measurements in interval k and interval k+1. 58 | 1 indicates the subject is censored (C_k+1 = 1) and 0 indicates the subject is not censored (C_k+1 = 0). 59 | Subjects have no more records after they are censored. Note that the censoring indicator is not needed 60 | if users don't want to compute the natural course estimate using IPW. 61 | 62 | For survival outcome, the outcome Y_k+1 on the line where the individual is censored (C_k+1 = 1) can be coded NA or 0. 63 | This choice will make no difference to estimates in the algorithm when intervals are made small enough 64 | such that there are no failures in intervals where there are censoring events. The choice depends on 65 | whether such subjects are counted in the time k risk set or not [1]_ :sup:`,` [2]_. For fixed binary/continuous end of follow-up, the 66 | outcome Y_k+1 should be coded NA. 67 | 68 | Here is an example of input data structure with a censoring event (identified by ‘‘C’’).
The subject contains 8 records on the measurements of 69 | two time-varying covariates ‘‘L’’, ‘‘A’’, the outcome ‘‘Y’’ and is censored at time index k+1=8. See `"example_data_censor" `_ for complete example data. 70 | 71 | .. image:: ../media/data_example_censor.png 72 | :align: center 73 | :width: 4.5in 74 | :height: 2in 75 | 76 | **Competing events.** When there are competing events in the data, if the user chooses to treat competing 77 | events as censoring events, the data should be structured as censoring case above. If competing events 78 | are not treated as censoring events, there should be a variable in the input data set that is an 79 | indicator of competing event between interval k and k+1 covariate measurements, where 80 | 1 indicates there is a competing event for the subject (D_k+1 = 1) and 0 indicates no competing event (D_k+1 = 0). 81 | If D_k+1 = 1 on a record line k for a given subject, that subject will only have k+1 lines 82 | in the follow-up data with follow-up time k on the last line, and on that line, Y_k+1 should be coded NA. 83 | Note that the competing case is only applicable for survival outcome. 84 | 85 | Here is an example of input data structure with a competing event (identified by ‘‘D’’). The subject contains 7 records on 86 | three time-varying covariates ‘‘L1’’, ‘‘L2’’, ‘‘A’’, one baseline covariate ‘‘L3’’ and the outcome ‘‘Y’’. 87 | The subject experiences a competing event after measurement of interval k=6 covariates. See `"example_data_basicdata" `_ for complete example data. 88 | 89 | .. image:: ../media/data_example_competing.png 90 | :align: center 91 | :width: 6in 92 | :height: 1.8in 93 | 94 | 95 | + Note that the ‘‘time_points’’ argument specifies the desired end of follow-up (a 96 | follow-up interval k that is no more than the maximum number of records for an individual in the dataset), 97 | and is only applicable for survival outcome. 98 | 99 | 100 | .. 
[1] McGrath S, Lin V, Zhang Z, Petito LC, Logan RW, Hernán MA, Young JG. gfoRmula: An R Package for Estimating the Effects of Sustained Treatment Strategies via the Parametric g-formula. Patterns (N Y). 2020;1(3):100008. `gfoRmula `_. 101 | 102 | .. [2] Roger W. Logan, Jessica G. Young, Sarah Taubman, Yu-Han Chiu, Sara Lodi, Sally Picciotto, Goodarz Danaei, Miguel A. Hernán. `GFORMULA SAS `_. 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/source/Specifications/Outcome model.rst: -------------------------------------------------------------------------------- 1 | .. _Outcome model: 2 | 3 | 4 | Outcome model 5 | =================== 6 | 7 | The package supports g-formula analysis on three types of outcomes: survival outcomes, fixed binary 8 | end of follow-up outcomes and continuous end of follow-up outcomes. 9 | 10 | For all types of outcomes, users should specify the name of outcome in the argument ‘‘outcome_name’’, and the model 11 | statement for outcome variable in the argument ‘‘ymodel’’. If users are interested in the probability of failing of an event by 12 | a specified follow-up time k under different interventions, they need to specify the type of outcome as 13 | 'survival' in the argument ‘‘outcome_type’’. If users are interested in the outcome mean at a fixed time point, 14 | and the outcome distribution is binary, they need to specify the type of outcome as 15 | 'binary_eof'. Similarly, they need to specify the type of outcome as 'continuous_eof' when the distribution of the outcome is continuous. 16 | 17 | The package uses generalized linear model (glm) to estimate the outcome model by default. If users want to use a custom 18 | model for estimation, they can use the arguments ‘‘ymodel_fit_custom’’ and ‘‘ymodel_predict_custom’’ for specification. 19 | 20 | 21 | .. 
list-table:: 22 | :header-rows: 1 23 | 24 | * - Arguments 25 | - Description 26 | * - outcome_name 27 | - (Required) A string specifying the name of the outcome variable in obs_data. 28 | * - ymodel 29 | - (Required) A string specifying the model statement for the outcome variable. 30 | * - outcome_type 31 | - (Required) A string specifying the "type" of outcome. The possible "types" are: "survival", "continuous_eof", and "binary_eof". 32 | * - ymodel_fit_custom 33 | - (Optional) A user-specified fit function for the outcome variable. 34 | * - ymodel_predict_custom 35 | - (Optional) A user-specified predict function for the outcome variable. 36 | 37 | 38 | Survival outcome 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | For survival outcomes, the package will output estimates of contrasts in failure risks by a specified follow-up time k 42 | under different user-specified interventions. 43 | 44 | 45 | **Sample syntax**: 46 | 47 | .. code-block:: 48 | 49 | outcome_name = 'Y' 50 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 51 | outcome_type = 'survival' 52 | time_points = 5 53 | 54 | g = ParametricGformula(..., outcome_name = outcome_name, outcome_type = outcome_type, ymodel = ymodel, time_points = time_points, ...) 55 | 56 | Users can also specify the follow-up time of interest for survival outcome by the argument ‘‘time_points’’. 57 | 58 | 59 | **Running example** `[code] `_: 60 | 61 | .. 
code-block:: 62 | 63 | import numpy as np 64 | from pygformula import ParametricGformula 65 | from pygformula.interventions import static 66 | from pygformula.data import load_basicdata_nocomp 67 | 68 | obs_data = load_basicdata_nocomp() 69 | time_name = 't0' 70 | id = 'id' 71 | 72 | covnames = ['L1', 'L2', 'A'] 73 | covtypes = ['binary', 'bounded normal', 'binary'] 74 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 75 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 76 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 77 | 78 | basecovs = ['L3'] 79 | 80 | outcome_name = 'Y' 81 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 82 | outcome_type = 'survival' 83 | 84 | time_points = np.max(np.unique(obs_data[time_name])) + 1 85 | int_descript = ['Never treat', 'Always treat'] 86 | 87 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 88 | time_points = time_points, int_descript = int_descript, 89 | covnames=covnames, covtypes=covtypes, 90 | covmodels=covmodels, basecovs=basecovs, 91 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type, 92 | Intervention1_A = [static, np.zeros(time_points)], 93 | Intervention2_A = [static, np.ones(time_points)]) 94 | g.fit() 95 | 96 | 97 | **Output**: 98 | 99 | .. image:: ../media/get_started_example.png 100 | :align: center 101 | 102 | 103 | Binary end of follow-up outcome 104 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 105 | 106 | For binary end of follow-up outcomes, the package will output estimates of contrasts in the outcome probability 107 | under different user-specified treatment strategies. 108 | 109 | **Sample syntax**: 110 | 111 | .. code-block:: 112 | 113 | outcome_name = 'Y' 114 | ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0' 115 | outcome_type = 'binary_eof' 116 | 117 | g = ParametricGformula(..., outcome_name = outcome_name, outcome_type = outcome_type, ymodel = ymodel, ...)
118 | 119 | **Running example** `[code] `_: 120 | 121 | .. code-block:: 122 | 123 | import numpy as np 124 | from pygformula import ParametricGformula 125 | from pygformula.interventions import threshold 126 | from pygformula.data import load_binary_eof 127 | 128 | obs_data = load_binary_eof() 129 | time_name = 't0' 130 | id = 'id' 131 | 132 | covnames = ['L1', 'L2', 'A'] 133 | covtypes = ['binary', 'zero-inflated normal', 'normal'] 134 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + L3 + t0', 135 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 136 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 137 | 138 | basecovs = ['L3'] 139 | 140 | outcome_name = 'Y' 141 | ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0' 142 | outcome_type = 'binary_eof' 143 | 144 | int_descript = ['Threshold intervention'] 145 | 146 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 147 | int_descript = int_descript, 148 | Intervention1_A = [threshold, [0.5, float('inf')]], 149 | covnames=covnames, covtypes=covtypes, 150 | covmodels=covmodels, basecovs=basecovs, 151 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type) 152 | g.fit() 153 | 154 | **Output**: 155 | 156 | .. image:: ../media/binary_eof_example_output.png 157 | :align: center 158 | 159 | 160 | Continuous end of follow-up outcome 161 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 162 | 163 | For continuous end of follow-up outcomes, the package will output estimates of contrasts in the outcome mean 164 | under different user-specified treatment strategies. 165 | 166 | **Sample syntax**: 167 | 168 | .. code-block:: 169 | 170 | outcome_name = 'Y' 171 | ymodel = 'Y ~ C(L1) + L2 + A' 172 | outcome_type = 'continuous_eof' 173 | 174 | g = ParametricGformula(..., outcome_name = outcome_name, outcome_type = outcome_type, ymodel = ymodel, ...) 175 | 176 | 177 | 178 | **Running example** `[code] `_: 179 | 180 | .. 
code-block:: 181 | 182 | import numpy as np 183 | from pygformula import ParametricGformula 184 | from pygformula.interventions import static 185 | from pygformula.data import load_continuous_eof 186 | 187 | obs_data = load_continuous_eof() 188 | time_name = 't0' 189 | id = 'id' 190 | 191 | covnames = ['L1', 'L2', 'A'] 192 | covtypes = ['categorical', 'normal', 'binary'] 193 | covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0', 194 | 'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0', 195 | 'A ~ C(L1) + L2 + t0'] 196 | 197 | basecovs = ['L3'] 198 | 199 | outcome_name = 'Y' 200 | ymodel = 'Y ~ C(L1) + L2 + A' 201 | outcome_type = 'continuous_eof' 202 | 203 | time_points = np.max(np.unique(obs_data[time_name])) + 1 204 | int_descript = ['Never treat', 'Always treat'] 205 | 206 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 207 | int_descript=int_descript, 208 | Intervention1_A = [static, np.zeros(time_points)], 209 | Intervention2_A = [static, np.ones(time_points)], 210 | covnames=covnames, covtypes=covtypes, 211 | covmodels=covmodels, basecovs=basecovs, 212 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type) 213 | g.fit() 214 | 215 | 216 | 217 | **Output**: 218 | 219 | .. image:: ../media/continuous_eof_example_output.png 220 | :align: center 221 | 222 | 223 | Custom model 224 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 225 | 226 | 227 | The custom fit function needs to contain the input parameters: 228 | 229 | * ymodel: model statement of the outcome 230 | * fit_data: data used to fit the outcome model 231 | 232 | and return a fitted model which is used to make predictions in the simulation step. 233 | 234 | 235 | An example using random forest to fit an outcome model: 236 | 237 | ..
code-block:: 238 | 239 | def ymodel_fit_custom(ymodel, fit_data): 240 | y_name, x_name = re.split('~', ymodel.replace(' ', '')) 241 | x_name = re.split(r'\+', x_name.replace(' ', '')) 242 | # get feature and target data to fit ymodel 243 | y = fit_data[y_name].to_numpy() 244 | X = fit_data[x_name].to_numpy() 245 | fit_rf = RandomForestRegressor() 246 | fit_rf.fit(X, y) 247 | return fit_rf 248 | 249 | 250 | The custom predict function needs to contain the input parameters: 251 | 252 | * ymodel: model statement of the outcome 253 | * new_df: simulated data at time t. 254 | * fit: fitted model of the custom function 255 | 256 | and return a list of predicted values at time t. For survival and binary end-of-follow-up outcomes, the predict 257 | function should return the estimated probability. For continuous end-of-follow-up outcomes, it should return the 258 | estimated mean. 259 | 260 | 261 | The example of custom predict function using the random forest model: 262 | 263 | .. code-block:: 264 | 265 | def ymodel_predict_custom(ymodel, new_df, fit): 266 | y_name, x_name = re.split('~', ymodel.replace(' ', '')) 267 | x_name = re.split(r'\+', x_name.replace(' ', '')) 268 | # get feature data to predict 269 | X = new_df[x_name].to_numpy() 270 | prediction = fit.predict(X) 271 | return prediction 272 | 273 | 274 | **Running example** `[code] `_: 275 | 276 | ..
code-block:: 277 | 278 | import numpy as np 279 | import pygformula 280 | from pygformula import ParametricGformula 281 | from pygformula.interventions import static 282 | from pygformula.data import load_continuous_eof 283 | 284 | obs_data = load_continuous_eof() 285 | 286 | time_name = 't0' 287 | id = 'id' 288 | 289 | covnames = ['L1', 'L2', 'A'] 290 | covtypes = ['categorical', 'normal', 'binary'] 291 | covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0', 292 | 'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0', 293 | 'A ~ C(L1) + L2 + t0'] 294 | 295 | basecovs = ['L3'] 296 | 297 | outcome_name = 'Y' 298 | 299 | ymodel = 'Y ~ lag1_L2 + L2 + lag1_A + A' 300 | 301 | # define interventions 302 | time_points = np.max(np.unique(obs_data[time_name])) + 1 303 | int_descript = ['Never treat', 'Always treat'] 304 | 305 | 306 | def ymodel_fit_custom(ymodel, fit_data): 307 | y_name, x_name = re.split('~', ymodel.replace(' ', '')) 308 | x_name = re.split('\+', x_name.replace(' ', '')) 309 | # get feature and target data to fit ymodel 310 | y = fit_data[y_name].to_numpy() 311 | X = fit_data[x_name].to_numpy() 312 | fit_rf = RandomForestRegressor() 313 | fit_rf.fit(X, y) 314 | return fit_rf 315 | 316 | def ymodel_predict_custom(ymodel, new_df, fit): 317 | y_name, x_name = re.split('~', ymodel.replace(' ', '')) 318 | x_name = re.split('\+', x_name.replace(' ', '')) 319 | # get feature data to predict 320 | X = new_df[x_name].to_numpy() 321 | prediction = fit.predict(X) 322 | return prediction 323 | 324 | 325 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 326 | int_descript = int_descript, 327 | Intervention1_A = [static, np.zeros(time_points)], basecovs=['L3'], 328 | Intervention2_A = [static, np.ones(time_points)], 329 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, 330 | ymodel_fit_custom = ymodel_fit_custom, ymodel_predict_custom=ymodel_predict_custom, 331 | outcome_name=outcome_name, ymodel=ymodel, 
outcome_type='continuous_eof') 332 | g.fit() 333 | 334 | 335 | 336 | .. note:: 337 | 338 | Note that when there are categorical covariates in the model statement, adding the ‘‘C( )’’ only applies to the 339 | default model fitting function. If users want to include it in a custom model fitting function, they need to 340 | process the categorical data in addition. -------------------------------------------------------------------------------- /docs/source/Specifications/Output.rst: -------------------------------------------------------------------------------- 1 | .. _Output: 2 | 3 | 4 | Output 5 | ================= 6 | 7 | 8 | Numerical results 9 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 10 | 11 | The package provides the following outputs: 12 | 13 | + **Data table of g-formula estimates**: The result table of g-formula estimates is returned by the fit function, containing (1) the nonparametric estimates 14 | of the natural course risk/mean outcome, (2) the parametric g-formula estimates of the risk/mean outcome under each user-specified intervention, 15 | (3) the risk ratio between each intervention and the reference intervention (natural course by default, can be specified in the argument ‘‘ref_int’’), 16 | (4) the risk difference between each intervention and the reference intervention. 17 | 18 | 19 | + **Simulated data table for interventions**: The package gives the simulated data table in the simulation step under 20 | each specified intervention, which can be obtained by: 21 | 22 | .. code:: 23 | 24 | sim_data = g.summary_dict['sim_data'] 25 | 26 | To get the simulated data under a particular intervention: 27 | 28 | .. code:: 29 | 30 | sim_data = g.summary_dict['sim_data'][intervention_name] 31 | 32 | 33 | + **The IP weights**: To get the inverse probability weights when there is censoring event: 34 | 35 | .. 
code:: 36 | 37 | ip_weights = g.summary_dict['IP_weights'] 38 | 39 | 40 | + **The model summary**: The package gives the model summary for each covariate, outcome, 41 | competing event (if applicable), censoring event (if applicable). 42 | First the argument ‘‘model_fits’’ should be set to True, then the model summary can be obtained by: 43 | 44 | .. code:: 45 | 46 | fitted_models = g.summary_dict['model_fits_summary'] 47 | 48 | To get the fitted model for a particular variable: 49 | 50 | .. code:: 51 | 52 | fitted_model = g.summary_dict['model_fits_summary'][variable_name] 53 | 54 | + **The coefficients**: The package gives the parameter estimates of all the models, which can be obtained by: 55 | 56 | .. code:: 57 | 58 | model_coeffs = g.summary_dict['model_coeffs'] 59 | 60 | To get the coefficients of the model for a particular variable, please use: 61 | 62 | .. code:: 63 | 64 | model_coeffs = g.summary_dict['model_coeffs'][variable_name] 65 | 66 | 67 | + **The standard errors**: The package gives the standard errors of the parameter estimates of all the models, which can be obtained by: 68 | 69 | .. code:: 70 | 71 | model_stderrs = g.summary_dict['model_stderrs'] 72 | 73 | To get the standard errors of the model for a particular variable, please use: 74 | 75 | .. code:: 76 | 77 | model_stderrs = g.summary_dict['model_stderrs'][variable_name] 78 | 79 | + **The variance-covariance matrices**: The package gives the variance-covariance matrices of the parameter estimates of all the models, 80 | which can be obtained by: 81 | 82 | .. code:: 83 | 84 | model_vcovs = g.summary_dict['model_vcovs'] 85 | 86 | To get the variance-covariance matrix of the parameter estimates of the model for a particular variable, please use: 87 | 88 | .. code:: 89 | 90 | model_vcovs = g.summary_dict['model_vcovs'][variable_name] 91 | 92 | 93 | + **The root mean square error**: The package gives the RMSE values of the models, which can be obtained by: 94 | 95 | .. 
code:: 96 | 97 | rmses = g.summary_dict['rmses'] 98 | 99 | To get the RMSE of the model for a particular variable, please use: 100 | 101 | .. code:: 102 | 103 | rmses = g.summary_dict['rmses'][variable_name] 104 | 105 | + **Nonparametric estimates at each time point**: The package gives the nonparametric estimates of all covariates and risk at each time point for survival outcomes, which can be obtained by: 106 | 107 | .. code:: 108 | 109 | obs_estimates = g.summary_dict['obs_plot'] 110 | 111 | To get the nonparametric estimates of a particular variable, e.g., risk, please use: 112 | 113 | .. code:: 114 | 115 | obs_estimates = g.summary_dict['obs_plot']['risk'] 116 | 117 | + **Parametric estimates at each time point**: The package gives the parametric estimates of all covariates and risk at each time point for survival outcomes, which can be obtained by: 118 | 119 | .. code:: 120 | 121 | est_estimates = g.summary_dict['est_plot'] 122 | 123 | To get the parametric estimates of a particular variable, e.g., risk, please use: 124 | 125 | .. code:: 126 | 127 | est_estimates = g.summary_dict['est_plot']['risk'] 128 | 129 | 130 | + **Hazard ratio**: The package gives the hazard ratio value for the two interventions specified, which can be obtained by: 131 | 132 | .. code:: 133 | 134 | hazard_ratio = g.summary_dict['hazard_ratio'] 135 | 136 | The package also implements nonparametric bootstrapping to obtain 95% confidence intervals for risk/mean estimates 137 | by repeating the algorithm for many bootstrap samples. Users can choose the argument ‘‘nsamples’’ to specify the number of newly generated bootstrap samples. 138 | Users may choose the argument ‘‘parallel’’ to parallelize bootstrapping and simulation steps under each intervention to 139 | make the algorithm run faster. The argument ‘‘ncores’’ can be used to specify the desired number of CPU cores 140 | in parallelization. 
141 | 142 | The package provides two ways for calculating the confidence intervals 143 | in argument ‘‘ci_method’’, ‘‘percentile’’ means using percentile bootstrap method which takes the 2.5th and 97.5th percentiles of the bootstrap estimates to get the 95% confidence interval, 144 | "normal" means using the normal bootstrap method which uses the original estimate and 145 | the standard deviation of the bootstrap estimates to get the normal approximation 95% confidence interval. 146 | 147 | + **The g-formula estimates of bootstrap samples**: The package gives the parametric g-formula estimates of all 148 | bootstrap samples, which can be obtained by: 149 | 150 | .. code-block:: 151 | 152 | g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', ...) 153 | g.fit() 154 | bootests = g.summary_dict['bootests'] 155 | 156 | To get the parametric g-formula estimates of a particular bootstrap sample, please use: 157 | 158 | .. code:: 159 | 160 | g.summary_dict['bootests']['sample_{id}_estimates'] 161 | 162 | where id is the sample id which should be an integer between 0 and ‘‘nsamples’’ - 1. 163 | 164 | 165 | + **The coefficients of bootstrap samples**: The package gives the parameter estimates of all the models for all generated 166 | bootstrap samples, which can be obtained by: 167 | 168 | .. code-block:: 169 | 170 | g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', boot_diag=True, ...) 171 | g.fit() 172 | bootcoeffs = g.summary_dict['bootcoeffs'] 173 | 174 | Note that the ‘‘boot_diag’’ should be set to True if users want to obtain the coefficients, standard errors or variance-covariance matrices 175 | of bootstrap samples. 176 | 177 | To get the coefficients of a particular bootstrap sample, please use: 178 | 179 | .. 
code:: 180 | 181 | g.summary_dict['bootcoeffs']['sample_{id}_coeffs'] 182 | 183 | + **The standard errors of bootstrap samples**: The package gives the standard errors of the parameter estimates of all the models for all generated 184 | bootstrap samples, which can be obtained by: 185 | 186 | .. code-block:: 187 | 188 | g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', boot_diag=True, ...) 189 | g.fit() 190 | bootstderrs = g.summary_dict['bootstderrs'] 191 | 192 | To get the standard errors of a particular bootstrap sample, please use: 193 | 194 | .. code:: 195 | 196 | g.summary_dict['bootstderrs']['sample_{id}_stderrs'] 197 | 198 | 199 | + **The variance-covariance matrices of bootstrap samples**: The package gives the variance-covariance matrices of the parameter estimates of all the models for all generated 200 | bootstrap samples, which can be obtained by: 201 | 202 | .. code-block:: 203 | 204 | g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', boot_diag=True, ...) 205 | g.fit() 206 | bootvcovs = g.summary_dict['bootvcovs'] 207 | 208 | To get the variance-covariance matrices of a particular bootstrap sample, please use: 209 | 210 | .. code:: 211 | 212 | g.summary_dict['bootvcovs']['sample_{id}_vcovs'] 213 | 214 | 215 | Note that to get bootstrap results of coefficients, standard errors, and variance-covariance matrices, the argument 216 | ‘‘boot_diag’’ must be set to True. 217 | 218 | All the output results above can be saved by the argument ‘‘save_results’’, once it is set to True, 219 | results will be saved locally by creating a folder automatically. Users can also specify the folder path by the 220 | argument ‘‘save_path’’: 221 | 222 | .. code-block:: 223 | 224 | g = ParametricGformula(..., save_results = True, save_path = 'user-specified path', ...) 225 | g.fit() 226 | 227 | 228 | **Arguments**: 229 | 230 | .. 
list-table:: 231 | :header-rows: 1 232 | 233 | * - Arguments 234 | - Description 235 | * - n_simul 236 | - (Optional) An integer indicating the number of subjects for whom to simulate data. It is set equal to the number (M) of 237 | subjects in obs_data, if not specified by users. 238 | * - ref_int 239 | - (Optional) An integer indicating the intervention to be used as the reference for calculating the end-of-follow-up mean/risk 240 | ratio and mean/risk difference. 0 denotes the natural course, while subsequent integers denote user-specified 241 | interventions in the order that they are named in interventions. It is set to 0 if not specified by users. 242 | * - nsamples 243 | - (Optional) An integer specifying the number of bootstrap samples to generate. 244 | * - parallel 245 | - (Optional) A boolean value indicating whether to parallelize simulations of different interventions to multiple cores. 246 | * - ncores 247 | - (Optional) An integer indicating the number of cores used in parallelization. It is set to 1 if not specified by users. 248 | * - model_fits 249 | - (Optional) A boolean value indicating whether to return the parameter estimates of the models. 250 | * - ci_method 251 | - (Optional) A string specifying the method for calculating the bootstrap 95% confidence intervals, if applicable. 252 | The options are "percentile" and "normal". It is set to "percentile" if not specified by users. 253 | * - boot_diag 254 | - (Optional) A boolean value indicating whether to return the parametric g-formula estimates as well as the coefficients, 255 | standard errors, and variance-covariance matrices of the parameters of the fitted models in the bootstrap samples. 256 | * - save_results 257 | - (Optional) A boolean value indicating whether to save all the returned results to the save_path. 258 | * - save_path 259 | - (Optional) A path to save all the returned results. 
A folder will be created automatically in the current working directory 260 | if the save_path is not specified by users. 261 | * - seed 262 | - (Optional) An integer indicating the starting seed for simulations and bootstrapping. It is set to 1234 if not specified by users. 263 | 264 | 265 | 266 | 267 | 268 | Graphical results 269 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 270 | 271 | .. automodule:: pygformula.plot 272 | 273 | 274 | The package also provides two plotting functions: "plot_natural_course" and "plot_interventions". 275 | The plot_natural_course function plots the curves of each covariate mean (for all types of outcomes) and risk (for survival outcomes only) under g-formula parametric and 276 | non-parametric estimation. 277 | 278 | .. autosummary:: plot_natural_course 279 | .. autofunction:: plot_natural_course 280 | 281 | 282 | The plot_interventions function plots the curves of risk under interventions of interest (for survival outcomes only). 283 | 284 | 285 | .. autosummary:: plot_interventions 286 | .. autofunction:: plot_interventions 287 | 288 | 289 | Arguments for plotting: 290 | 291 | .. list-table:: 292 | :header-rows: 1 293 | 294 | * - Arguments 295 | - Description 296 | * - plot_name 297 | - A string specifying the name for plotting, which is set to "all", "risk" or one specific covariate name. Only 298 | applicable for the plot_natural_course function. The default is "all". 299 | * - colors 300 | - For plot_natural_course function, it is a list with two elements, specifying the non-parametric estimate curve and parametric curve respectively. 301 | Users can choose colors from `matplotlib colors `_. 302 | For plot_interventions function, it is a list with m elements with m the number of interventions plus 1, 303 | specifying all intervention curves. If not specified, the function will use default colors. 304 | * - marker 305 | - A string used to customize the appearance of points in plotting. 
Users can also choose markers from 306 | `matplotlib markers `_ library. 307 | * - markersize 308 | - An integer that specifies the size of the markers in plotting. 309 | * - linewidth 310 | - A number that specifies the width of the line in plotting. 311 | * - save_figure 312 | - A boolean value indicating whether to save the figure or not. 313 | 314 | 315 | Users can call the 'plot_natural_course' function by: 316 | 317 | .. code-block:: 318 | 319 | g.plot_natural_course() 320 | 321 | Users can call the 'plot_interventions' function by: 322 | 323 | .. code-block:: 324 | 325 | g.plot_interventions() 326 | 327 | 328 | Note that the plotting functions can only be applied after calling the 'g.fit' function. 329 | 330 | The figures can be saved by the argument ‘‘save_figure’’, once it is set to True, 331 | results will be saved locally by creating a folder automatically. If the argument ‘‘save_path’’ is specified, the figure will be saved to the corresponding folder. 332 | 333 | 334 | **Sample syntax**: 335 | 336 | .. code-block:: 337 | 338 | g.plot_natural_course(plot_name='L1', colors=['blue', 'red'], markersize=5, linewidth=1, marker='v', save_figure=True) 339 | g.plot_interventions(colors =['green', 'red', 'yellow'], markersize=5, linewidth=1, marker='v', save_figure=True) 340 | 341 | .. note:: 342 | 343 | We recommend setting the ‘‘save_figure’’ as True if users want to access the figure 344 | when running the package on a Linux system. -------------------------------------------------------------------------------- /docs/source/Specifications/Visit process.rst: -------------------------------------------------------------------------------- 1 | .. _Visit process: 2 | 3 | 4 | Visit process 5 | ================= 6 | 7 | When the data are not recorded at regular intervals but rather are recorded every time the patient visits the 8 | clinic, the times at which the time-varying covariates are measured will vary by subject. 
In this setting, 9 | it is typical to construct the data such that (i) at a time when there is no visit/measurement, 10 | the last measured value of a covariate is carried forward, and (ii) a subject is censored after a certain number of consecutive times 11 | with no visit/measurement [1]_ :sup:`,` [2]_. 12 | 13 | In pygformula, the deterministic knowledge (i) and (ii) can be incorporated via the argument ‘‘visitprocess’’. 14 | Each vector in ‘‘visitprocess’’ contains three parameters that attach a visit process to one covariate. 15 | The first parameter is the name of a time-varying indicator in the input data set of whether a covariate was measured in each interval 16 | (1 means there is a visit/measurement, 0 means there is no visit/measurement). 17 | The second parameter is the name of the covariate. The third parameter is the maximum number s of missed measurements of this covariate allowed 18 | since the last measurement before a subject is censored. 19 | 20 | For the visit indicator, in the fitting step, the probability of a visit is estimated only using records 21 | where the sum of consecutive missed visits through previous k-1 time points is less than the maximum number of consecutive missed visits s. 22 | Then in the simulation step, if the sum of consecutive missed visits through previous k-1 time points is less than s, then the visit 23 | indicator is simulated from a distribution based on this estimate; otherwise, the visit indicator is set to 1 so 24 | as to eliminate subjects with more than s consecutive missed visits. For the covariate, in the fitting step, the conditional mean of the covariate will be estimated 25 | only for data records where there is a current visit. If the visit indicator equals 1, then in simulation step, the value of the 26 | dependent covariate will be generated from a distribution based on this estimate; otherwise, the last value is 27 | carried forward. 28 | 29 | 30 | 31 | The argument for visit process: 32 | 33 | .. 
list-table:: 34 | :header-rows: 1 35 | 36 | * - Arguments 37 | - Description 38 | * - visitprocess 39 | - (Optional) List of lists. Each inner list contains as its first entry the covariate name of a visit process; as its second entry 40 | the name of a covariate whose modeling depends on the visit process; and as its third entry the maximum number 41 | of consecutive visits that can be missed before an individual is censored. 42 | 43 | .. code:: 44 | 45 | covnames = ['visit_cd4', 'visit_rna', 'cd4_v', 'rna_v', 'everhaart'] 46 | covtypes = ['binary', 'binary', 'normal', 'normal', 'binary'] 47 | covmodels = ['visit_cd4 ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month', 48 | 'visit_rna ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month', 49 | 'cd4_v ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month', 50 | 'rna_v ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month', 51 | 'everhaart ~ lag1_everhaart + cd4_v + rna_v + sex + race + month'] 52 | 53 | visitprocess = [['visit_cd4', 'cd4_v', 3], ['visit_rna', 'rna_v', 3]] 54 | 55 | g = ParametricGformula(..., covnames = covnames, covtypes = covtypes, covmodels = covmodels, visitprocess = visitprocess, ...) 56 | 57 | 58 | Here is an example in clinical cohorts of HIV-positive patients, ‘‘cd4_v’’ is a time-varying covariate of CD4 cell count measurement, 59 | the visit indicator ‘‘visit_cd4’’ indicates whether the CD4 cell count measurements were taken in interval k. 60 | 3 means that the data is constructed such that the subjects are censored once they have not had CD4 measured for 3 consecutive intervals. 61 | Note that for the visit indicator ‘‘visit_cd4’’, it should come before the dependent covariate ‘‘cd4_v’’ and be assigned 62 | the ‘‘binary’’ covariate type in ‘‘covtypes’’. 63 | 64 | 65 | **Running example** `[code] `_: 66 | 67 | .. 
code-block:: 68 | 69 | from pygformula import ParametricGformula 70 | from pygformula.interventions import static 71 | from pygformula.data import load_visit_process 72 | 73 | obs_data = load_visit_process() 74 | time_name = 'month' 75 | id = 'id' 76 | 77 | covnames = ['visit_cd4', 'visit_rna', 'cd4_v', 'rna_v', 'everhaart'] 78 | covtypes = ['binary', 'binary', 'normal', 'normal', 'binary'] 79 | covmodels = ['visit_cd4 ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month', 80 | 'visit_rna ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month', 81 | 'cd4_v ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month', 82 | 'rna_v ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month', 83 | 'everhaart ~ lag1_everhaart + cd4_v + rna_v + sex + race + month'] 84 | 85 | basecovs = ['sex', 'race', 'age'] 86 | 87 | visitprocess = [['visit_cd4', 'cd4_v', 3], ['visit_rna', 'rna_v', 3]] 88 | 89 | outcome_name = 'event' 90 | ymodel = 'event ~ cd4_v + rna_v + everhaart + sex + race + month' 91 | 92 | time_points = np.max(np.unique(obs_data[time_name])) + 1 93 | 94 | int_descript = ['Never treat', 'Always treat'] 95 | 96 | g = ParametricGformula(obs_data = obs_data, id = id, time_name = time_name, 97 | visitprocess = visitprocess, 98 | int_descript = int_descript, 99 | Intervention1_everhaart = [static, np.zeros(time_points)], 100 | Intervention2_everhaart = [static, np.ones(time_points)], 101 | covnames=covnames, covtypes=covtypes, 102 | covmodels=covmodels, basecovs = basecovs, 103 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 104 | g.fit() 105 | 106 | 107 | **Output**: 108 | 109 | .. image:: ../media/visitprocess_example_output.png 110 | :align: center 111 | 112 | 113 | .. [1] Hernán MA, McAdams M, McGrath N, Lanoy E, Costagliola D. Observation plans in longitudinal studies with 114 | time-varying treatments. Statistical Methods in Medical Research 2009;18(1):27-52. 115 | 116 | .. [2] Young JG, Cain LE, Robins JM, O’Reilly E, Hernán MA. 
Comparative effectiveness of dynamic treatment regimes: 117 | an application of the parametric g-formula. Statistics in Biosciences 2011; 3:119-143. 118 | -------------------------------------------------------------------------------- /docs/source/Specifications/index.rst: -------------------------------------------------------------------------------- 1 | Specifications 2 | =================== 3 | 4 | 5 | 6 | The ‘‘Specifications’’ section gives detailed instructions about how to specify the required or optional 7 | arguments in different modules of pygformula to construct a specific analysis. To use the g-formula method in the package, 8 | the first step is to make sure that the input data meets the requirement of 9 | :doc:`Input data`. 10 | Then, users need to specify their parametric covariate model (see :doc:`Covariate models`), 11 | parametric outcome model (see :doc:`Outcome model`) 12 | , as well as the intervention of interest (see :doc:`Interventions`). 13 | Once these required modules are well-defined, the g-formula in pygformula can be called and output the results of the method. 14 | 15 | Additionally, if there are censoring events, the package provides the option to obtain inverse probability weighted estimates 16 | for comparison with the g-formula estimates, 17 | see :doc:`Censoring event`. 18 | If there are competing events, the package provides two options for handling competing events in the case of survival outcomes, see 19 | :doc:`Competing event`. 20 | The package also provides option for calculating the hazard ratio of any two interventions of interest in 21 | :doc:`Hazard ratio`. 22 | If the data structure contains visit process, users can also perform g-formula analysis for this setting in 23 | :doc:`Visit process`. 24 | If there is deterministic knowledge about the relationship between the variables, it can be incorporated into the estimation 25 | of g-formula by applying restrictions, see :doc:`Deterministic knowledge`. 
26 | 27 | 28 | 29 | 30 | **Contents**: 31 | 32 | .. toctree:: 33 | :maxdepth: 2 34 | 35 | Input data 36 | Interventions 37 | Covariate models 38 | Outcome model 39 | Censoring event 40 | Competing event 41 | Hazard ratio 42 | Visit process 43 | Deterministic knowledge 44 | Output 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'pygformula' 21 | copyright = '2024, The President and Fellows of Harvard College' 22 | 23 | 24 | import os 25 | import sys 26 | sys.path.insert(0, os.path.abspath('../..')) 27 | 28 | from pygformula import __version__ 29 | 30 | release = __version__ 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 
37 | 38 | 39 | extensions = ['sphinx.ext.autodoc', 40 | 'sphinx.ext.autosummary', 41 | 'sphinx.ext.napoleon' 42 | ] 43 | 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # List of patterns, relative to source directory, that match files and 49 | # directories to ignore when looking for source files. 50 | # This pattern also affects html_static_path and html_extra_path. 51 | exclude_patterns = [] 52 | 53 | # The suffix of source filenames. 54 | source_suffix = ".rst" 55 | 56 | 57 | # -- Options for HTML output ------------------------------------------------- 58 | 59 | # The theme to use for HTML and HTML Help pages. See the documentation for 60 | # a list of builtin themes. 61 | # 62 | html_theme = 'sphinx_rtd_theme' 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | 69 | 70 | latex_elements = { 71 | } 72 | 73 | 74 | latex_documents = [ 75 | ('index', 'pygformula.tex', 'Pygformula Documentation', '', 76 | 'manual'), 77 | ] 78 | 79 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Welcome to pygformula's documentation! 3 | ====================================== 4 | 5 | The `pygformula `_ package implements the non-iterative 6 | conditional expectation (NICE) estimator of the g-formula algorithm [1]_ :sup:`,` [2]_. The g-formula can estimate an 7 | outcome’s counterfactual mean or risk under hypothetical treatment strategies (interventions) when there 8 | is sufficient information on time-varying treatments and confounders. 
9 | 10 | This package can be used for discrete or continuous time-varying treatments and for failure time outcomes or 11 | continuous/binary end of follow-up outcomes. The package can handle a random measurement/visit process and a 12 | priori knowledge of the data structure, as well as censoring (e.g., by loss to follow-up) and two options for 13 | handling competing events for failure time outcomes. Interventions can be flexibly specified, both as 14 | interventions on a single treatment or as joint interventions on multiple treatments. 15 | 16 | For a quick overview of how to use the pygformula, see a simple example in :doc:`Get Started`. 17 | For a detailed list of options, see :doc:`Specifications/index`. 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | 22 | Installation 23 | Get Started 24 | 25 | .. toctree:: 26 | :maxdepth: 4 27 | 28 | Specifications/index 29 | 30 | .. toctree:: 31 | :maxdepth: 2 32 | 33 | Datasets 34 | Contact 35 | 36 | 37 | .. [1] Robins JM. A new approach to causal inference in mortality studies with a sustained exposure period: 38 | application to the healthy worker survivor effect. Mathematical Modelling. 1986;7:1393–1512. [Errata (1987) 39 | in Computers and Mathematics with Applications 14, 917-921. Addendum (1987) in Computers and Mathematics 40 | with Applications 14, 923-945. Errata (1987) to addendum in Computers and Mathematics with Applications 41 | 18, 477. 42 | .. [2] Hernán, M.A., and Robins, J. (2020). Causal Inference: What If (Chapman & Hall/CRC). 
43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/source/media/absorbing_cov_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/absorbing_cov_example_output.png -------------------------------------------------------------------------------- /docs/source/media/binary_cov_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/binary_cov_example_output.png -------------------------------------------------------------------------------- /docs/source/media/binary_eof_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/binary_eof_example_output.png -------------------------------------------------------------------------------- /docs/source/media/bounded_normal_cov_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/bounded_normal_cov_example.png -------------------------------------------------------------------------------- /docs/source/media/categorical_cov_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/categorical_cov_example_output.png -------------------------------------------------------------------------------- /docs/source/media/categorical_time_cov_example.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/categorical_time_cov_example.png -------------------------------------------------------------------------------- /docs/source/media/censor_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/censor_example_output.png -------------------------------------------------------------------------------- /docs/source/media/comp_restriction_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/comp_restriction_example_output.png -------------------------------------------------------------------------------- /docs/source/media/competing_as_cens_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/competing_as_cens_output.png -------------------------------------------------------------------------------- /docs/source/media/competing_not_cens_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/competing_not_cens_output.png -------------------------------------------------------------------------------- /docs/source/media/continuous_eof_example_output.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/continuous_eof_example_output.png -------------------------------------------------------------------------------- /docs/source/media/data_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/data_example.png -------------------------------------------------------------------------------- /docs/source/media/data_example_censor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/data_example_censor.png -------------------------------------------------------------------------------- /docs/source/media/data_example_competing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/data_example_competing.png -------------------------------------------------------------------------------- /docs/source/media/dynamic_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/dynamic_example_output.png -------------------------------------------------------------------------------- /docs/source/media/example_hazardratio_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/example_hazardratio_output.png -------------------------------------------------------------------------------- 
/docs/source/media/get_started_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example.png -------------------------------------------------------------------------------- /docs/source/media/get_started_example_all.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example_all.jpg -------------------------------------------------------------------------------- /docs/source/media/get_started_example_bootstrap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example_bootstrap.jpg -------------------------------------------------------------------------------- /docs/source/media/get_started_example_intervention_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example_intervention_curve.jpg -------------------------------------------------------------------------------- /docs/source/media/natural_course_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/natural_course_output.png -------------------------------------------------------------------------------- /docs/source/media/natural_grace_period.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/natural_grace_period.png -------------------------------------------------------------------------------- /docs/source/media/normal_cov_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/normal_cov_example_output.png -------------------------------------------------------------------------------- /docs/source/media/random_forest_cov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/random_forest_cov.png -------------------------------------------------------------------------------- /docs/source/media/restriction_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/restriction_example_output.png -------------------------------------------------------------------------------- /docs/source/media/static_example_one_treatment_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/static_example_one_treatment_output.png -------------------------------------------------------------------------------- /docs/source/media/static_example_two_treatments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/static_example_two_treatments.png 
-------------------------------------------------------------------------------- /docs/source/media/static_multiple_interventions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/static_multiple_interventions.png -------------------------------------------------------------------------------- /docs/source/media/survival_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/survival_example_output.png -------------------------------------------------------------------------------- /docs/source/media/test_hazard_ratio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/test_hazard_ratio.png -------------------------------------------------------------------------------- /docs/source/media/threshold_example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/threshold_example_output.png -------------------------------------------------------------------------------- /docs/source/media/truncated_normal_cov_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/truncated_normal_cov_example.png -------------------------------------------------------------------------------- /docs/source/media/uniform_grace_period.png: 
import numpy as np


def _weighted_cov_means(data, level_source, time_name, time_points, covnames, covtypes, weight_name):
    """
    Weighted mean of every covariate at each time point.

    Rows where the covariate is missing are dropped before grouping by time_name. Within each time group the
    mean is computed as mean(weight * value) / mean(weight). For 'categorical' covariates one list of weighted
    proportions is produced per level, where the levels are the unique values found in level_source[covname].

    Returns a dictionary mapping the covariate name to a list (or a list of per-level lists) truncated to
    time_points entries. An empty dictionary is returned when covnames is None.
    """
    means = {}
    if covnames is None:
        return means
    for k, covname in enumerate(covnames):
        by_time = data[data[covname].notna()].groupby(time_name)
        if covtypes[k] == 'categorical':
            level_means = []
            for level in np.unique(level_source[covname]):
                level_prob = by_time.apply(
                    lambda g: ((g[covname] == level) * g[weight_name]).mean() / g[weight_name].mean()
                ).tolist()[:time_points]
                level_means.append(level_prob)
            means[covname] = level_means
        else:
            means[covname] = by_time.apply(
                lambda g: (g[weight_name] * g[covname]).mean() / g[weight_name].mean()
            ).tolist()[:time_points]
    return means


def _cumulative_risk(hazard, time_points, comp_hazard=None):
    """
    Turn per-time-point conditional event probabilities into cumulative risks.

    hazard (and comp_hazard, when given) are Series with one entry per time point. The risk increment at time k
    is hazard[k] multiplied by the probability of having had no event at times 0..k-1; when comp_hazard is
    given, the increment is additionally multiplied by (1 - comp_hazard[k]) and by the probability of no
    competing event at times 0..k-1. An IndexError is raised when fewer than time_points entries are available.
    """
    h = hazard.tolist()
    event_free = (1 - hazard).cumprod().tolist()
    if comp_hazard is None:
        increments = [h[k] if k == 0 else h[k] * event_free[k - 1] for k in range(time_points)]
    else:
        d = comp_hazard.tolist()
        comp_free = (1 - comp_hazard).cumprod().tolist()
        increments = [h[k] * (1 - d[k]) if k == 0
                      else h[k] * (1 - d[k]) * event_free[k - 1] * comp_free[k - 1]
                      for k in range(time_points)]
    return np.array(increments).cumsum().tolist()[:time_points]


def _add_parametric_weights(nc_pool, id_name, time_name, outcome_type, competing_as_event):
    """
    Add the column 'w_cov' (and the intermediate columns it is built from) to nc_pool in place.

    For survival outcomes the weight at time t > 0 is the previous row's cumulative product of 'prob0' within
    an individual, multiplied by the previous row's cumulative product of (1 - 'prob_D') when competing events
    are not treated as censoring. The weight is 1 at time 0 and for non-survival outcomes.
    """
    if outcome_type != 'survival':
        nc_pool['w_cov'] = 1
        return
    if competing_as_event:
        nc_pool['p0_cum'] = nc_pool.groupby(id_name)['prob0'].cumprod()
        nc_pool['pd_0'] = 1 - nc_pool['prob_D']
        nc_pool['pd_0_cum'] = nc_pool.groupby(id_name)['pd_0'].cumprod()
        nc_pool['w_cov'] = np.where(nc_pool[time_name] > 0,
                                    nc_pool['p0_cum'].shift(1) * nc_pool['pd_0_cum'].shift(1), 1)
    else:
        nc_pool['p0_cum'] = nc_pool.groupby([id_name])['prob0'].cumprod()
        nc_pool['w_cov'] = np.where(nc_pool[time_name] > 0, nc_pool['p0_cum'].shift(1), 1)


def comparison_calculate(obs_data, time_name, time_points, id, covnames, covtypes, outcome_name, outcome_type,
                         nc_pool, nc_risk, censor, censor_name, censor_fit, ipw_cutoff_quantile, ipw_cutoff_value,
                         competing=None, compevent_name=None, compevent_cens=False, compevent_fit=None):
    """
    This is an internal function to calculate the mean observed values of covariates at each time point, as well as
    the mean observed risk, together with their parametric counterparts from the natural course simulation.

    Note that helper columns are added to obs_data and nc_pool in place ('censor_p0_inv', 'censor_inv_cum',
    'comprisk_p0_inv', 'IP_weight', 'IP_weight_cov', 'w_elimD' on obs_data; 'p0_cum', 'pd_0', 'pd_0_cum',
    'w_cov' on nc_pool, depending on the options used).

    Parameters
    ----------
    obs_data: DataFrame
        A data frame containing the observed data.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    time_points: Int
        An integer indicating the number of time points to simulate. It is set equal to the maximum number of records
        that obs_data contains for any individual plus 1, if not specified by users.

    id: Str
        A string specifying the name of the id variable in obs_data.

    covnames: List
        A list of strings specifying the names of the time-varying covariates in obs_data.

    covtypes: List
        A list of strings specifying the "type" of each time-varying covariate included in covnames.
        The supported types: "binary", "normal", "categorical", "bounded normal", "zero-inflated normal",
        "truncated normal", "absorbing", "categorical time", "square time" and "custom". The list must be the same
        length as covnames and in the same order.

    outcome_name: Str
        A string specifying the name of the outcome variable in obs_data.

    outcome_type: Str
        A string specifying the "type" of outcome. The possible "types" are: "survival", "continuous_eof", and
        "binary_eof".

    nc_pool: DataFrame
        A dataframe of the simulated data under natural course.

    nc_risk: List
        A list containing the parametric risk at all the time points for the natural course.

    censor: Bool
        A boolean value indicating whether there is a censoring event.

    censor_name: Str
        A string specifying the name of the censoring variable in obs_data. Only applicable when using inverse
        probability weights to estimate the natural course means / risk from the observed data.

    censor_fit: Class
        A class object of the fitted model for the censoring event. Its predict method is called on obs_data.

    ipw_cutoff_quantile: Float
        Percentile value for truncation of the inverse probability weights.

    ipw_cutoff_value: Float
        Absolute value for truncation of the inverse probability weights.

    competing: Bool
        A boolean value indicating whether there is a competing event.

    compevent_name: Str
        A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes.

    compevent_cens: Bool
        A boolean value indicating whether to treat competing events as censoring events.

    compevent_fit: Class
        A class object of the fitted model for the competing event.

    Returns
    -------
    obs_means: Dict
        A dictionary, where the key is the covariate / risk name and the value is its observational mean at all the
        time points. For categorical covariates the value is a list with one list of proportions per level.

    est_means: Dict
        A dictionary, where the key is the covariate / risk name and the value is its parametric mean at all the
        time points.

    obs_res: Float
        A value of the observational risk / mean at final time point.

    IP_weights: List
        A list containing the inverse probability weights from the censor model, or None when censor is False.

    """
    # Competing events enter the risk formula explicitly only when they are not handled as censoring.
    competing_as_event = competing and not compevent_cens

    if censor:
        # for non-parametric cov means and risks
        censor_pre = censor_fit.predict(obs_data)
        censor_p0_inv = 1 / (1 - censor_pre)
        obs_data['censor_p0_inv'] = censor_p0_inv
        censor_inv_cum = obs_data.groupby([id])['censor_p0_inv'].cumprod()
        obs_data['censor_inv_cum'] = censor_inv_cum
        w_censor = censor_inv_cum * (1 - obs_data[censor_name])
        if outcome_type == 'survival' and compevent_cens:
            comprisk_p0_inv = 1 / (1 - compevent_fit.predict(obs_data))
            obs_data['comprisk_p0_inv'] = comprisk_p0_inv
            comprisk_inv_cum = obs_data.groupby([id])['comprisk_p0_inv'].cumprod()
            w_comp = np.where((obs_data[compevent_name].isna()) | (obs_data[compevent_name] == 1), 0, comprisk_inv_cum)
            w = w_comp * w_censor
        else:
            w = w_censor
        obs_data['IP_weight'] = w

        if ipw_cutoff_quantile:
            # NOTE(review): the truncation threshold is taken from the censoring weights only, even when the
            # competing-event weights are multiplied in above -- confirm this is intended.
            quantile_w = np.percentile(list(w_censor), ipw_cutoff_quantile * 100)
            obs_data.loc[obs_data['IP_weight'] > quantile_w, 'IP_weight'] = quantile_w
        if ipw_cutoff_value:
            obs_data.loc[obs_data['IP_weight'] > ipw_cutoff_value, 'IP_weight'] = ipw_cutoff_value

        # Covariates at time t are weighted by the previous row's weight; the weight is 1 at time 0.
        obs_data['IP_weight_cov'] = np.where(obs_data[time_name] > 0, obs_data['IP_weight'].shift(1), 1)

        # Categorical covariates are stored per level here as well (previously the per-level lists were built
        # but never written into obs_means).
        obs_means = _weighted_cov_means(obs_data, obs_data, time_name, time_points, covnames, covtypes,
                                        'IP_weight_cov')

        if outcome_type == 'binary_eof' or outcome_type == 'continuous_eof':
            obs_data_last_record = obs_data.loc[obs_data[outcome_name].notna()]
            obs_mean_Ey = (obs_data_last_record[outcome_name] * obs_data_last_record['IP_weight']).mean() \
                / obs_data_last_record['IP_weight'].mean()

        if outcome_type == 'survival':
            if competing_as_event:
                w_elimD = obs_data['IP_weight'] * (1 - obs_data[compevent_name])
                obs_data['w_elimD'] = w_elimD
                h_k = obs_data[obs_data[outcome_name].notna()].groupby(time_name).apply(
                    lambda g: (g['w_elimD'] * g[outcome_name]).mean() / g['w_elimD'].mean())
                h_k2 = obs_data[obs_data[compevent_name].notna()].groupby(time_name).apply(
                    lambda g: (g['IP_weight'] * g[compevent_name]).mean() / g['IP_weight'].mean())
                risks = _cumulative_risk(h_k, time_points, h_k2)
            else:
                weight_outcome_mean = obs_data[obs_data[outcome_name].notna()].groupby(time_name).apply(
                    lambda g: (g['IP_weight'] * g[outcome_name]).mean() / g['IP_weight'].mean())
                risks = _cumulative_risk(weight_outcome_mean, time_points)
            obs_means['risk'] = risks
            obs_risk = risks[-1]

    else:
        # for non-parametric cov means and risks
        obs_means = {}
        if covnames is not None:
            for k, covname in enumerate(covnames):
                if covtypes[k] == 'categorical':
                    all_levels = np.unique(obs_data[covname])
                    all_levels_obs_prob_mean = []
                    for level in all_levels:
                        # Rows with a missing covariate are not filtered here, so they count in the denominator.
                        obs_level_prob = obs_data.groupby([time_name]).apply(
                            lambda g: ((g[covname] == level)).mean()).tolist()[:time_points]
                        all_levels_obs_prob_mean.append(obs_level_prob)
                    obs_means[covname] = all_levels_obs_prob_mean
                else:
                    obs_mean = obs_data.groupby([time_name])[covname].mean().tolist()[:time_points]
                    obs_means[covname] = obs_mean

        if outcome_type == 'binary_eof' or outcome_type == 'continuous_eof':
            obs_mean_Ey = obs_data.loc[obs_data[time_name] == time_points - 1][outcome_name].mean()

        if outcome_type == 'survival':
            p1_mean = obs_data[obs_data[outcome_name].notna()].groupby(time_name)[outcome_name].mean()
            if competing_as_event:
                pd_mean = obs_data[obs_data[compevent_name].notna()].groupby(time_name)[compevent_name].mean()
                risks = _cumulative_risk(p1_mean, time_points, pd_mean)
            else:
                risks = _cumulative_risk(p1_mean, time_points)
            obs_means['risk'] = risks
            obs_risk = risks[-1]

    # for parametric cov means and risks (identical whether or not there is censoring)
    _add_parametric_weights(nc_pool, id, time_name, outcome_type, competing_as_event)
    est_means = _weighted_cov_means(nc_pool, obs_data, time_name, time_points, covnames, covtypes, 'w_cov')
    if outcome_type == 'survival':
        est_means['risk'] = nc_risk

    obs_res = obs_risk if outcome_type == 'survival' else obs_mean_Ey
    IP_weights = obs_data['IP_weight'].tolist() if censor else None

    return obs_means, est_means, obs_res, IP_weights
17 | 18 | Returns 19 | ------- 20 | A pandas dataframe 21 | """ 22 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_basicdata.csv' 23 | data = pd.read_csv(data_url) 24 | return data 25 | 26 | def load_basicdata_nocomp(): 27 | """ 28 | Data description: a survival dataset that contains 13,170 observations on 2,500 individuals over 7 time points. 29 | Each row in the dataset corresponds to the record of one individual at one time point. 30 | 31 | id: Unique identifier for each individual. 32 | t0: Time index. 33 | L1: Binary time-varying covariate. 34 | L2: Continuous time-varying covariate. 35 | L3: Categorical baseline covariate. For each individual, the baseline values are repeated at each time point. 36 | A: Binary treatment variable. 37 | Y: Outcome of interest; time-varying indicator of failure. 38 | 39 | This is a survival dataset that contains 2500 individuals with maximum 7 time points. There are 40 | one binary covariate L1, one normal covariate L2, one baseline covariate L3, one binary treatment variable A, and a 41 | binary outcome Y. 42 | 43 | Returns 44 | ------- 45 | A pandas dataframe 46 | """ 47 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_basicdata_nocomp.csv' 48 | data = pd.read_csv(data_url) 49 | return data 50 | 51 | def load_absorbing_data(): 52 | """ 53 | Data description: a survival dataset that contains 6,033 observations, 1,000 individuals over 10 time points. 54 | Each row in the dataset corresponds to the record of one individual at one time point. 55 | 56 | id: Unique identifier for each individual. 57 | t0: Time index. 58 | L: Binary time-varying covariate with absorbing type, once it takes value 1, it keeps 1 at subsequent time points. 59 | A: Binary treatment variable. 60 | Y: Outcome of interest; time-varying indicator of failure. 
61 | 62 | Returns 63 | ------- 64 | A pandas dataframe 65 | """ 66 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_absorbing.csv' 67 | data = pd.read_csv(data_url) 68 | return data 69 | 70 | def load_binary_eof(): 71 | """ 72 | Data description: a dataset that contains 17,500 observations on 2,500 individuals over 7 time points. 73 | Each row in the dataset corresponds to the record of one individual at one time point. 74 | 75 | id: Unique identifier for each individual. 76 | t0: Time index. 77 | L1: Binary time-varying covariate. 78 | L2: Continuous time-varying covariate. 79 | L3: Categorical baseline covariate. For each individual, the baseline values are repeated at each time point. 80 | A: Continuous treatment variable. 81 | Y: Binary outcome of interest. Because this outcome is only defined at the end of follow-up, values of NA are given 82 | in all other time points. 83 | 84 | Returns 85 | ------- 86 | A pandas dataframe 87 | """ 88 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_binary_eof.csv' 89 | data = pd.read_csv(data_url) 90 | return data 91 | 92 | def load_categorical(): 93 | """ 94 | Data description: a survival dataset that contains 7,822 observations, 1,000 individuals over 10 time points. 95 | Each row in the dataset corresponds to the record of one individual at one time point. 96 | 97 | id: Unique identifier for each individual. 98 | t0: Time index. 99 | L: Categorical covariate with 5 categories. 100 | A: Binary treatment variable. 101 | Y: Outcome of interest; time-varying indicator of failure. 
102 | 103 | Returns 104 | ------- 105 | A pandas dataframe 106 | """ 107 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_categorical.csv' 108 | data = pd.read_csv(data_url) 109 | return data 110 | 111 | def load_censor_data(): 112 | """ 113 | Data description: a survival dataset with censoring event that contains 118,725 observations, 10,000 individuals 114 | over 10 time points. Each row in the dataset corresponds to the record of one individual at one time point. 115 | Individuals who are censored at time k+1 only have a total of k+1 records, which correspond to time indices 0,..., k. 116 | 117 | id: Unique identifier for each individual. 118 | t0: Time index. 119 | L: Binary time-varying covariate. 120 | A: Continuous treatment variable. 121 | C: Censoring indicator. 122 | Y: Outcome of interest; time-varying indicator of failure. 123 | 124 | Returns 125 | ------- 126 | A pandas dataframe 127 | """ 128 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_censor.csv' 129 | data = pd.read_csv(data_url) 130 | return data 131 | 132 | def load_continuous_eof(): 133 | """ 134 | Data description: a dataset that contains 17,500 observations on 2,500 individuals over 7 time points. 135 | Each row in the dataset corresponds to the record of one individual at one time point. 136 | 137 | id: Unique identifier for each individual. 138 | t0: Time index. 139 | L1: Categorical time-varying covariate with 3 categories. 140 | L2: Continuous time-varying covariate. 141 | L3: Categorical baseline covariate. For each individual, the baseline values are repeated at each time point. 142 | A: Binary treatment variable. 143 | Y: Continuous outcome of interest. Because this outcome is only defined at the end of follow-up, values of NA are 144 | given in all other time points. 
145 | 146 | Returns 147 | ------- 148 | A pandas dataframe 149 | """ 150 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_continuous_eof.csv' 151 | data = pd.read_csv(data_url) 152 | return data 153 | 154 | def load_visit_process(): 155 | """ 156 | Data description: a survival dataset with visit process that contains 1,739 observations on 200 individuals over 37 time points. 157 | Each row in the dataset corresponds to the record of one individual at one time point. 158 | 159 | id: Unique identifier for each individual. 160 | month: Time index. 161 | sex: Binary baseline covariate. For each individual, the baseline values are repeated at each time point. 162 | age: Continuous baseline covariate. 163 | race: Categorical baseline covariate. 164 | cd4_v: Continuous time-varying covariate. 165 | visit_cd4: Indicator of whether there is a cd4 visit/measurement. 166 | rna_v: Continuous time-varying covariate. 167 | visit_rna: Indicator of whether there is a rna visit/measurement. 168 | everhaart: Binary treatment variable. 169 | event: Outcome of interest; time-varying indicator of failure. 170 | 171 | Returns 172 | ------- 173 | A pandas dataframe 174 | """ 175 | data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_visit_process.csv' 176 | data = pd.read_csv(data_url) 177 | return data 178 | 179 | def load_truncated_normal(): 180 | """ 181 | Data description: a survival dataset with visit process that contains 7,855 observations on 1,000 individuals over 10 time points. 182 | Each row in the dataset corresponds to the record of one individual at one time point. 183 | 184 | id: Unique identifier for each individual. 185 | t0: Time index. 186 | L: Continuous time-varying covariate with truncated normal distribution. 187 | A: Binary treatment variable. 188 | Y: Outcome of interest; time-varying indicator of failure. 
def load_zero_inflated_normal():
    """
    Download the example survival dataset whose covariate follows a zero-inflated normal distribution.

    Data description: 7,678 observations on 1,000 individuals over 10 time points. Every row is the record of
    one individual at one time point.

    id: Unique identifier for each individual.
    t0: Time index.
    L: Continuous time-varying covariate with zero-inflated normal distribution.
    A: Binary treatment variable.
    Y: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_zero_inflated_normal.csv'
    )
def load_threshold_data():
    """
    Download the example survival dataset used for threshold interventions.

    Data description: 1,853 observations on 1,000 individuals over 5 time points. Every row is the record of
    one individual at one time point.

    id: Unique identifier for each individual.
    t0: Time index.
    L1: Binary time-varying covariate.
    L2: Continuous time-varying covariate.
    A: Continuous treatment variable.
    Y: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_threshold_data.csv'
    )
def static(new_df, pool, int_var, int_values, time_name, t):
    """
    Internal helper that applies a static intervention in place.

    Parameters
    ----------
    new_df: DataFrame
        A DataFrame that contains the observed or simulated data at time t.

    pool: DataFrame
        A DataFrame that contains the observed or simulated data up to time t. Not used by this function.

    int_var: List
        A list containing strings of treatment names to be intervened in a particular intervention.

    int_values: List
        A list indexed by time; the entry at position t is the value assigned to the treatment at time t.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    t: Int
        An integer indicating the current time index to be intervened.

    Returns
    -------
    Nothing is returned, the new_df is changed under a particular intervention.

    """
    # Look up the value for this time index first, then overwrite only the rows recorded at time t.
    value_at_t = int_values[t]
    rows_at_t = new_df[time_name] == t
    new_df.loc[rows_at_t, int_var] = value_at_t
def natural_grace_period(new_df, pool, int_var, nperiod, conditions, time_name, t):
    """
    Pre-coded natural grace period intervention, applied to new_df in place. Once a covariate meets a
    threshold level, the treatment (int_var) is initiated within m (nperiod) time intervals, which is the
    duration of the grace period. During the grace period the treatment takes its natural value.

    Parameters
    ----------
    new_df: DataFrame
        A DataFrame that contains the observed or simulated data at time t.

    pool: DataFrame
        A DataFrame that contains the observed or simulated data up to time t.

    int_var: Str
        A string specifying the treatment variable to be intervened.

    nperiod: Int
        An integer indicating the duration of the grace period.

    conditions: Dict
        A dictionary that maps a covariate name to a function of a single value; the function returns whether
        that value satisfies the condition for initiating the treatment.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    t: Int
        An integer indicating the current time index to be intervened.

    Returns
    -------
    Nothing is returned, the new_df is changed under a particular intervention.

    """

    def all_conditions_hold(frame):
        # element-wise AND over every per-covariate condition evaluated on the rows of `frame`
        checks = [frame[cov].apply(rule) for cov, rule in conditions.items()]
        return reduce(operator.and_, checks)

    # rows meeting every condition at time t keep their current treatment value, all other rows get 0
    meets_now = all_conditions_hold(new_df)
    new_df[int_var] = np.where(meets_now, new_df[int_var], 0)

    # rows that met the conditions nperiod steps ago have used up the grace period: treatment is set to 1
    if t >= nperiod:
        grace_start_rows = pool[pool[time_name] == t - nperiod]
        met_at_start = all_conditions_hold(grace_start_rows)
        new_df[int_var] = np.where(met_at_start, 1, new_df[int_var])

    # rows whose treatment was 1 at the previous time step stay at 1
    if t > 0:
        treated_before = pool.loc[pool[time_name] == t - 1, int_var] == 1
        new_df[int_var] = np.where(treated_before, 1, new_df[int_var]).tolist()
178 | 179 | conditions: Dict 180 | A dictionary that contains the covariate and its coditions for initiating the treatment. 181 | 182 | time_name: Str 183 | A string specifying the name of the time variable in obs_data. 184 | 185 | t: Int 186 | An integer indicating the current time index to be intervened. 187 | 188 | Returns 189 | ------- 190 | Nothing is returned, the new_df is changed under a particular intervention. 191 | 192 | """ 193 | 194 | def sample(prob): 195 | treatment = np.random.binomial(1, prob) 196 | return treatment 197 | 198 | masks = [] 199 | for cond_var, condition in conditions.items(): 200 | mask = new_df[cond_var].apply(condition) 201 | masks.append(mask) 202 | cond_initiation = reduce(operator.and_, masks) 203 | 204 | if t == 0: 205 | # initialize counts: the number of consecutive intervals up to t that an individual failed to receive treatment 206 | new_df['counts'] = 0 207 | 208 | # if condition is True, start initiation of the treatment according to a uniform distribution with grace period 209 | new_df['uni_prob'] = np.where(cond_initiation, 1 / (nperiod + 1 - new_df['counts']), 0) 210 | new_df[int_var] = np.where(cond_initiation, new_df['uni_prob'].apply(sample), 0) 211 | 212 | # update counts according to current treatment value 213 | new_df['counts'] = np.where(cond_initiation & (new_df[int_var] == 0), 1, 0) 214 | pool.loc[pool[time_name] == t, 'counts'] = new_df['counts'] 215 | 216 | else: 217 | # calculate the uniform probability for initiation when 1) the grace period has started in previous step, or 2) the grace period started at current step 218 | new_df['uni_prob'] = np.where(pool.loc[pool[time_name] == t - 1, 'counts'] > 0, 1 / (nperiod + 1 - pool.loc[pool[time_name] == t - 1, 'counts']), 219 | np.where(cond_initiation, 1 / (nperiod + 1 - pool.loc[pool[time_name] == t - 1, 'counts']), 0)) 220 | 221 | # get the teatment value according to the uniform probability 222 | new_df[int_var] = np.where((pool.loc[pool[time_name] == t - 1, 
def intervention_func(new_df, pool, intervention, time_name, t):

    """
    Internal dispatcher that applies the user-specified interventions to the data during simulation.

    Parameters
    ----------
    new_df: DataFrame
        A DataFrame that contains the observed or simulated data at time t.

    pool: DataFrame
        A DataFrame that contains the observed or simulated data up to time t.

    intervention: List
        List of lists. The k-th list is the intervention list on the k-th treatment name in the intervention.
        Its first entry is the treatment variable and its second entry is the intervention function. For the
        static and threshold functions the third entry holds the required values; for the two grace period
        functions the third entry is [nperiod, conditions]. In those cases an optional fourth entry lists the
        time points at which the intervention is applied. For any other (dynamic or custom) function the
        optional third entry is that list of time points. Without a list of time points the intervention is
        applied at every time.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    t: Int
        An integer indicating the current time index to be intervened.

    Returns
    -------
    Nothing is returned.

    """

    # the natural course leaves the data untouched
    if intervention == natural:
        return

    for spec in intervention:
        int_var = spec[0]
        int_func = spec[1]

        # work out the function-specific arguments and where the optional list of times would sit
        if int_func == static or int_func == threshold:
            extra_args = (spec[2],)
            times_pos = 3
        elif int_func == natural_grace_period or int_func == uniform_grace_period:
            extra_args = (spec[2][0], spec[2][1])
            times_pos = 3
        else:  # dynamic or custom intervention
            extra_args = ()
            times_pos = 2

        # no int_times specified: intervene on all times; otherwise only on the listed times
        if len(spec) == times_pos or t in spec[times_pos]:
            int_func(new_df, pool, int_var, *extra_args, time_name, t)
def Bootstrap(obs_data, boot_id, boot_seeds, int_descript, intervention_dicts, covnames,
              basecovs, cov_hist, time_points, n_simul, time_name, id, custom_histvars, custom_histories,
              covmodels, hazardratio, intcomp, covtypes, covfits_custom, covpredict_custom,
              ymodel_fit_custom, ymodel_predict_custom,
              ymodel, outcome_type, outcome_name, competing, compevent_name, compevent_model, compevent_cens,
              boot_diag, trunc_params, visit_names, visit_covs, ts_visit_names, max_visits, time_thresholds,
              below_zero_indicator, baselags, restrictions, yrestrictions, compevent_restrictions, sim_trunc):
    """
    This is an internal function to get the results of parametric g-formula for each bootstrap sample.

    The function resamples individuals with replacement, rebuilds the history terms, refits the covariate,
    outcome and (optionally) competing event models on the resampled data, simulates every intervention in
    int_descript and, if requested, estimates the hazard ratio between the two interventions in intcomp.
    Any exception raised along the way is turned into a warning and a dictionary of None values is returned.

    Parameters
    ----------
    obs_data: DataFrame
        A data frame containing the observed data.

    boot_id: Int
        An integer indicating the id of the bootstrap sample.

    boot_seeds: List
        A list that stores the random seeds of all bootstrap samples.

    int_descript: List
        A list of strings, each of which describes a user-specified intervention.

    intervention_dicts: Dict
        A dictionary whose key is the intervention description and the value is the intervention list for all treatment
        variables in this intervention.

    covnames: List
        A list of strings specifying the names of the time-varying covariates in obs_data.

    basecovs: List
        A list of strings specifying the names of baseline covariates in obs_data. These covariates should not be
        included in covnames.

    cov_hist: Dict
        A dictionary whose keys are covariate names and values are sub-dictionaries with historical information for
        covariates. Each sub-dictionary contains keys 'lagged', 'cumavg' and 'lagavg', the corresponding value for the
        key 'lagged' is a two-element list where the first element is a list with all lagged terms, the second element
        is a list with the corresponding lagged numbers. Same for the key 'lagavg'. The corresponding value for the key
        'cumavg' is a list with all cumavg terms.

    time_points: Int
        An integer indicating the number of time points to simulate. It is set equal to the maximum number of records (K)
        that obs_data contains for any individual plus 1, if not specified by users.

    n_simul: Int
        An integer indicating the number of subjects for whom to simulate data. It is set equal to the number (M) of
        subjects in obs_data, if not specified by users.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    id: Str
        A string specifying the name of the id variable in obs_data.

    custom_histvars: List
        A list of strings, each of which specifies the names of the time-varying covariates with user-specified custom histories.

    custom_histories: List
        A list of functions, each function is the user-specified custom history functions for covariates. The list must
        be the same length as custom_histvars and in the same order.

    covmodels: List
        A list of strings, where each string is the model statement of the time-varying covariate. The list must be the
        same length as covnames and in the same order. If a model is not required for a certain covariate, it should be
        set to 'NA' at that index.

    hazardratio: Bool
        A boolean value indicating whether to calculate the hazard ratio of the two compared interventions.

    intcomp: List
        A list of two numbers indicating a pair of interventions to be compared by a hazard ratio.

    covtypes: List
        A list of strings specifying the “type” of each time-varying covariate included in covnames. The supported types:
        "binary", "normal", "categorical", "bounded normal", "zero-inflated normal", "truncated normal", "absorbing",
        "categorical time", "square time" and "custom". The list must be the same length as covnames and in the same order.

    covfits_custom: List
        A list, each element could be 'NA' or a user-specified fit function. The non-NA value is set
        for the covariates with custom type. The 'NA' value is set for other covariates. The list must be the
        same length as covnames and in the same order.

    covpredict_custom: List
        A list, each element could be 'NA' or a user-specified predict function. The non-NA value is set
        for the covariates with custom type. The 'NA' value is set for other covariates. The list must be the
        same length as covnames and in the same order.

    ymodel_fit_custom: Function
        A user-specified fit function for the outcome variable.

    ymodel_predict_custom: Function
        A user-specified predict function for the outcome variable.

    ymodel: Str
        A string specifying the model statement for the outcome variable.

    outcome_type: Str
        A string specifying the "type" of outcome. The possible "types" are: "survival", "continuous_eof", and "binary_eof".

    outcome_name: Str
        A string specifying the name of the outcome variable in obs_data.

    competing: Bool
        A boolean value indicating if there is a competing event in obs_data.

    compevent_name: Str
        A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes.

    compevent_model: Str
        A string specifying the model statement for the competing event variable. Only applicable for survival outcomes.

    compevent_cens: Bool
        A boolean value indicating whether to treat competing events as censoring events.

    boot_diag: Bool
        A boolean value indicating whether to return the parametric g-formula estimates as well as the coefficients,
        standard errors, and variance-covariance matrices of the parameters of the fitted models in the bootstrap samples.

    trunc_params: List
        A list, each element could be 'NA' or a two-element list. If not 'NA', the first element specifies the truncated
        value and the second element specifies the truncated direction (‘left’ or ‘right’). The non-NA value is set
        for the truncated normal covariates. The 'NA' value is set for other covariates. The list should be the same
        length as covnames and in the same order.

    visit_names: List
        A list, each of which is a string specifying the covariate name of a visit process.

    visit_covs: List
        A list of strings, each of which specifies the name of a covariate whose modeling depends on the visit process.

    ts_visit_names: List
        A list of strings, each of which indicates the number of consecutive missed visits for one covariate before an
        individual is censored.

    max_visits: List
        A list of integers, each integer indicates the maximum number of consecutive missed visits for one covariate that
        has a visit process.

    time_thresholds: List
        A list of integers that splits the time points into different intervals. It is used to create the variable
        "categorical time".

    below_zero_indicator: Bool
        A boolean value indicating if the obs_data contains pre-baseline times.

    baselags: Bool
        A boolean value specifying the convention used for lagi and lag_cumavgi terms in the model statements when
        pre-baseline times are not included in obs_data and when the current time index, t, is such that t < i. If this
        argument is set to False, the value of all lagi and lag_cumavgi terms in this context are set to 0 (for
        non-categorical covariates) or the reference level (for categorical covariates). If this argument is set to
        True, the value of lagi and lag_cumavgi terms are set to their values at time 0. The default is False.

    restrictions: List
        List of lists. Each inner list contains as its first entry the name of the covariate whose deterministic knowledge
        is known; its second entry is a dictionary whose key is the conditions which should be True when the covariate
        is modeled, the third entry is the value that is set to the covariate during simulation when the conditions
        in the second entry are not True.

    yrestrictions: List
        List of lists. For each inner list, its first entry is a dictionary whose key is the conditions which
        should be True when the outcome is modeled, the second entry is the value that is set to the outcome during
        simulation when the conditions in the first entry are not True.

    compevent_restrictions: List
        List of lists. For each inner list, its first entry is a dictionary whose key is the conditions which
        should be True when the competing event is modeled, the second entry is the value that is set to the competing
        event during simulation when the conditions in the first entry are not True. Only applicable for survival outcomes.

    sim_trunc: Bool
        A boolean value indicating if the simulated values of normal covariates are truncated by the observed ranges.

    Returns
    -------
    boot_results_dict: Dict
        A dictionary that contains the keys 'boot_results', 'bootcoeffs', 'bootstderrs' and 'bootvcovs' for a bootstrap
        sample, plus 'boot_hr' when hazardratio is True. If an error occurred, the first four keys are present with
        the value None and 'boot_hr' is absent.

    """
    try:
        # seed the global NumPy generator so the resampling below is reproducible for this bootstrap sample
        np.random.seed(boot_seeds[boot_id])

        # draw individuals with replacement; each draw gets a fresh id (its position in the draw) so that
        # repeated individuals are treated as distinct subjects
        data_list = dict(list(obs_data.groupby(id, group_keys=False)))
        ids = np.unique(obs_data[id])
        new_ids = np.random.choice(ids, len(ids), replace=True)

        new_df = []
        for index, new_id in enumerate(new_ids):
            new_id_df = data_list[new_id].copy()
            new_id_df[id] = index
            new_df.append(new_id_df)
        resample_data = pd.concat(new_df, ignore_index=True)

        # rebuild the history terms on the resampled data
        update_precoded_history(pool=resample_data, covnames=covnames, cov_hist=cov_hist, covtypes=covtypes,
                                time_name=time_name, id=id, below_zero_indicator=below_zero_indicator,
                                baselags=baselags, ts_visit_names = ts_visit_names)
        if custom_histvars is not None:
            for t in range(time_points):
                update_custom_history(resample_data, custom_histvars, custom_histories, time_name, t, id)

        # refit the covariate and outcome models on the resampled data
        covariate_fits, bounds, rmses, cov_model_coeffs, cov_model_stderrs, cov_model_vcovs, cov_model_fits_summary = \
            fit_covariate_model(covmodels=covmodels, covnames=covnames, covtypes=covtypes,
                                covfits_custom=covfits_custom, time_name=time_name, obs_data=resample_data,
                                return_fits=boot_diag, trunc_params=trunc_params, visit_names=visit_names,
                                max_visits=max_visits, ts_visit_names=ts_visit_names,
                                visit_covs=visit_covs, restrictions=restrictions)

        outcome_fit, ymodel_coeffs, ymodel_stderrs, ymodel_vcovs, ymodel_fits_summary = \
            fit_ymodel(ymodel=ymodel, outcome_type=outcome_type, outcome_name=outcome_name,
                       ymodel_fit_custom=ymodel_fit_custom, time_name=time_name, obs_data=resample_data,
                       competing=competing, compevent_name=compevent_name, return_fits=boot_diag,
                       yrestrictions=yrestrictions)

        # merge the per-model diagnostics; model_fits_summary is assembled here but is not part of the
        # returned dictionary
        model_coeffs = {**cov_model_coeffs, **ymodel_coeffs}
        model_stderrs = {**cov_model_stderrs, **ymodel_stderrs}
        model_vcovs = {**cov_model_vcovs, **ymodel_vcovs}
        model_fits_summary = {**cov_model_fits_summary, **ymodel_fits_summary}

        if competing:
            compevent_fit, comp_model_coeffs, comp_model_stderrs, comp_model_vcovs, comp_model_fits_summary = \
                fit_compevent_model(compevent_model=compevent_model, compevent_name=compevent_name,
                                    time_name=time_name, obs_data=resample_data, return_fits=boot_diag,
                                    compevent_restrictions=compevent_restrictions)
            model_coeffs.update(comp_model_coeffs)
            model_stderrs.update(comp_model_stderrs)
            model_vcovs.update(comp_model_vcovs)
            model_fits_summary.update(comp_model_fits_summary)
        else:
            compevent_fit = None

        # when the requested simulation size differs from the number of resampled individuals, draw a new
        # sample of n_simul individuals from obs_data to simulate from; the models fitted above are kept
        # NOTE(review): this second sample is not passed through update_precoded_history /
        # update_custom_history here — presumably simulate() rebuilds the history terms; confirm.
        if n_simul != len(np.unique(resample_data[id])):
            data_list = dict(list(obs_data.groupby(id, group_keys=True)))
            ids = np.unique(obs_data[id])
            new_ids = np.random.choice(ids, n_simul, replace=True)

            new_df = []
            for index, new_id in enumerate(new_ids):
                new_id_df = data_list[new_id].copy()
                new_id_df[id] = index
                new_df.append(new_id_df)
            resample_data = pd.concat(new_df, ignore_index=True)

        # simulate every intervention with the same seed; keep the estimate and the simulated pool of each
        boot_results = []
        boot_pools = []
        for intervention_name in int_descript:
            boot_result = simulate(seed=boot_seeds[boot_id], time_points=time_points, time_name=time_name,
                                   id=id, covnames=covnames, basecovs=basecovs,
                                   covmodels=covmodels, covtypes=covtypes, cov_hist=cov_hist,
                                   covariate_fits=covariate_fits, rmses=rmses, bounds=bounds, outcome_type=outcome_type,
                                   obs_data=resample_data,
                                   intervention=intervention_dicts[intervention_name],
                                   custom_histvars = custom_histvars, custom_histories=custom_histories,
                                   covpredict_custom=covpredict_custom, ymodel=ymodel,
                                   ymodel_predict_custom=ymodel_predict_custom,
                                   outcome_fit=outcome_fit, outcome_name=outcome_name,
                                   competing=competing, compevent_name=compevent_name,
                                   compevent_fit=compevent_fit, compevent_model=compevent_model,
                                   compevent_cens=compevent_cens, trunc_params=trunc_params, visit_names=visit_names,
                                   visit_covs=visit_covs, ts_visit_names=ts_visit_names,
                                   max_visits=max_visits, time_thresholds=time_thresholds,
                                   baselags=baselags, below_zero_indicator=below_zero_indicator,
                                   restrictions=restrictions, yrestrictions=yrestrictions,
                                   compevent_restrictions=compevent_restrictions, sim_trunc=sim_trunc
                                   )
            boot_results.append(boot_result['g_result'])
            boot_pools.append(boot_result['pool'])

        boot_results_dict = {'boot_results': boot_results, 'bootcoeffs': model_coeffs, 'bootstderrs': model_stderrs,
                             'bootvcovs': model_vcovs}

        if hazardratio:
            # intcomp holds the positions (within int_descript) of the two interventions to compare
            pool1 = boot_pools[intcomp[0]]
            pool2 = boot_pools[intcomp[1]]

            if competing and not compevent_cens:
                # competing events are not treated as censoring: fit cmprsk's crr with event code 2 for the
                # competing event; cmprsk is imported only on this path
                import cmprsk.cmprsk as cmprsk

                new_pool1 = pool1.groupby(id, group_keys=False).apply(hr_comp_data_helper,
                                                                      outcome_name=outcome_name, compevent_name=compevent_name)
                new_pool2 = pool2.groupby(id, group_keys=False).apply(hr_comp_data_helper,
                                                                      outcome_name=outcome_name, compevent_name=compevent_name)
                new_pool1['regime'] = 0
                new_pool2['regime'] = 1
                concat_data = pd.concat([new_pool1, new_pool2])
                concat_data = concat_data[[time_name, outcome_name, compevent_name, 'regime']]
                concat_data = concat_data.reset_index(drop=True)
                concat_data['event'] = np.where(concat_data[compevent_name] == 1, 2,
                                                concat_data[outcome_name]).tolist()
                ftime = concat_data[time_name]
                fstatus = concat_data['event']
                crr_res = cmprsk.crr(failure_time=ftime, failure_status=fstatus, static_covariates=concat_data[['regime']])
                hazard_ratio = crr_res.hazard_ratio()[0][0]
            else:
                # otherwise fit a Cox model with the regime indicator as the only covariate
                new_pool1 = pool1.groupby(id, group_keys=False).apply(hr_data_helper, outcome_name=outcome_name)
                new_pool2 = pool2.groupby(id, group_keys=False).apply(hr_data_helper, outcome_name=outcome_name)
                new_pool1['regime'] = 0
                new_pool2['regime'] = 1
                concat_data = pd.concat([new_pool1, new_pool2])
                concat_data = concat_data[[time_name, outcome_name, 'regime']]
                cph = CoxPHFitter()
                cph.fit(concat_data, duration_col=time_name, event_col=outcome_name)
                hazard_ratio = cph.hazard_ratios_.values[0]

            boot_results_dict['boot_hr'] = hazard_ratio

    except Exception as e:
        # a failed bootstrap sample is reported as a warning instead of aborting the caller
        warnings.warn("An error occurred at bootstrap sample {0}: {1}. "
                      "The analysis should likely be repeated with more parsimonious models.".format(boot_id, e))
        # NOTE(review): this fallback has no 'boot_hr' key even when hazardratio is True — confirm that the
        # caller does not index 'boot_hr' unconditionally.
        boot_results_dict = {'boot_results': None, 'bootcoeffs': None, 'bootstderrs': None, 'bootvcovs': None}

    return boot_results_dict
19 | 20 | cov_hist : Dict 21 | A dictionary whose keys are covariate names and values are sub-dictionaries with historical information for 22 | covariates. Each sub-dictionaty contains keys 'lagged', 'cumavg' and 'lagavg', the corresponding value for the 23 | key 'lagged' is a two-element list where the first element is a list with all lagged terms, the second element 24 | is a list with the corresponding lagged numbers. Same for the key 'lagavg'. The corresponding value for the key 25 | 'cumavg' is a list with all cumavg terms. 26 | 27 | covtypes : List 28 | A list of strings specifying the type of each time-varying covariate included in covnames. The list must be 29 | the same length as covnames and in the same order. The supported types: “binary”, “normal”, “categorical”, 30 | “bounded normal”, “zero-inflated normal”, “truncated normal”, “absorbing”, “categorical time”, "square time" 31 | and "custom". 32 | 33 | time_name : Str 34 | A string specifying the name of the time variable in obs_data. 35 | 36 | id : Str 37 | A string specifying the name of the id variable in obs_data. 38 | 39 | below_zero_indicator : Bool 40 | A boolean variable indicating if the obs_data contains pre-baseline times. 41 | 42 | baselags : Bool 43 | A boolean value specifying the convention used for lagi and lag_cumavgi terms in the model statements when 44 | pre-baseline times are not included in obs_data and when the current time index, t, is such that t < i. If this 45 | argument is set to False, the value of all lagi and lag_cumavgi terms in this context are set to 0 (for 46 | non-categorical covariates) or the reference level (for categorical covariates). If this argument is set to 47 | True, the value of lagi and lag_cumavgi terms are set to their values at time 0. 48 | 49 | ts_visit_names : List 50 | A list of strings, each of which indicates the number of consecutive missed visits for one covariate before an 51 | individual is censored. 
52 | 53 | Returns 54 | ------- 55 | None : The original input pool has been updated and nothing is returned. 56 | 57 | """ 58 | 59 | if ts_visit_names: 60 | covnames = covnames + ts_visit_names 61 | 62 | for k, cov in enumerate(covnames): 63 | if ts_visit_names is not None: 64 | cov_type = covtypes[k] if cov not in ts_visit_names else None 65 | else: 66 | cov_type = covtypes[k] 67 | 68 | lagged_covs = cov_hist[cov]['lagged'][0] 69 | lagged_nums = cov_hist[cov]['lagged'][1] 70 | if len(lagged_covs) > 0: # create lag variable 71 | for i, lagged_cov in enumerate(lagged_covs): 72 | if cov_type == 'categorical': 73 | if below_zero_indicator: 74 | pool[lagged_cov] = np.array(pool.groupby([id])[cov].shift(lagged_nums[i])) 75 | else: 76 | fill_values = pool.groupby([id])[cov].transform('first') if baselags else \ 77 | pd.Categorical(pool[cov]).categories[0] 78 | pool[lagged_cov] = np.where(pool[time_name] >= lagged_nums[i], 79 | pool.groupby([id])[cov].shift(lagged_nums[i]), fill_values) 80 | pool[lagged_cov] = pd.Categorical(pool[lagged_cov]) 81 | else: 82 | if below_zero_indicator: 83 | pool[lagged_cov] = np.array(pool.groupby([id])[cov].shift(lagged_nums[i])) 84 | else: 85 | fill_values = pool.groupby(id)[cov].transform('first') if baselags else 0 86 | pool[lagged_cov] = np.where(pool[time_name] >= lagged_nums[i], 87 | pool.groupby([id])[cov].shift(lagged_nums[i]), fill_values) 88 | 89 | if len(cov_hist[cov]['cumavg']) > 0: # create cumavg variable 90 | pool['_'.join(['cumavg', str(cov)])] = np.array(pool.groupby([id])[cov].expanding().mean()) 91 | 92 | lagavg_covs = cov_hist[cov]['lagavg'][0] 93 | lagavg_nums = cov_hist[cov]['lagavg'][1] 94 | if len(lagavg_covs) > 0: # create lagavg variable 95 | if len(cov_hist[cov]['cumavg']) == 0: # if cumavg variable has not been created yet, create cumavg variable 96 | pool['_'.join(['cumavg', str(cov)])] = np.array(pool.groupby([id])[cov].expanding().mean()) 97 | 98 | for i, lagavg_cov in enumerate(lagavg_covs): 99 | if 
def ave_last3(pool, histvar, time_name, t, id):
    """
    This is an example historical function which generates the average of the three most recent values for a specified
    covariate.

    Parameters
    ----------
    pool : DataFrame
        A DataFrame that contains the observed or simulated data up to time t. The historical term at time t in the data
        table is to be updated.

    histvar : Str
        A string that specifies the name of the variable for which the history function is to be applied.

    time_name : Str
        A string specifying the name of the time variable in pool.

    t : Int
        An integer specifying the current time index.

    id : Str
        A string specifying the name of the id variable in the obs_data.

    Returns
    -------
    None : The original input pool has been updated and nothing is returned. The averages are written to the column
        'ave_last3_<histvar>' on the rows whose time equals t; other rows of that column are left as they are.

    """
    times = pool[time_name]
    if t < 3:
        # fewer than three steps are available: average everything from time 0 up to t
        in_window = (times >= 0) & (times <= t)
    else:
        in_window = (times > t - 3) & (times <= t)

    # One mean per individual over the window, keyed by id.
    window_means = pool.loc[in_window].groupby(id)[histvar].mean()

    # Look each row's mean up by its own id so the result does not depend on the row order of pool;
    # to_numpy() keeps the assignment positional even if pool's index has repeated labels.
    at_t = times == t
    pool.loc[at_t, '_'.join(['ave_last3', str(histvar)])] = pool.loc[at_t, id].map(window_means).to_numpy()
def update_custom_history(pool, histvars, histories, time_name, t, id):
    """
    This internal function is used to add new columns to the original pool for the user-specified custom historical
    terms.

    Parameters
    ----------
    pool : DataFrame
        A DataFrame that contains the observed or simulated data up to time t. The historical term at time t in the data
        table is to be updated.

    histvars : List
        A list of strings, each of which specifies the name of the variable for which its custom history function
        is to be applied.

    histories : List
        A list of custom functions, each of which is applied to the variable with the same index in histvars.
        Each function is called with the keyword arguments pool, histvar, time_name, t and id.

    time_name : Str
        A string specifying the name of the time variable in obs_data.

    t : Int
        An integer specifying the current time index.

    id : Str
        A string specifying the name of the id variable in obs_data.

    Returns
    -------
    None : The original input pool has been updated and nothing is returned.

    Raises
    ------
    IndexError
        If fewer history functions than history variables are supplied. The check runs before any function is
        called, so the pool is not left partially updated.

    """
    if len(histories) < len(histvars):
        raise IndexError('{0} history variables were given but only {1} history functions.'.format(
            len(histvars), len(histories)))

    for histvar, history in zip(histvars, histories):
        history(pool=pool, histvar=histvar, time_name=time_name, t=t, id=id)
def get_cov_hist_info(covnames, covmodels, covtypes, ymodel, compevent_model=None, censor_model=None,
                      visit_covs=None, ts_visit_names=None):
    """
    This is an internal function to get the lagged term and its number indicator, cumavg term, and lagavg term and its number
    indicator for each covariate from user-specified models.

    Parameters
    ----------
    covnames : List
        A list of strings specifying the names of the time-varying covariates in obs_data.

    covmodels : List
        A list of strings, where each string is the model statement of the time-varying covariate. The list must be the
        same length as covnames and in the same order. If a model is not required for a certain covariate, it should be
        set to 'NA' at that index.

    covtypes : List
        A list of strings specifying the "type" of each time-varying covariate included in covnames. The supported types:
        "binary", "normal", "categorical", "bounded normal", "zero-inflated normal", "truncated normal", "absorbing",
        "categorical time", "square time" and "custom". The list must be the same length as covnames and in the same order.

    ymodel : Str
        A string specifying the model statement for the outcome variable.

    compevent_model : Str, (default=None)
        A string specifying model statement for the competing event variable. Only applicable for survival outcomes.

    censor_model : Str, (default=None)
        A string specifying the model statement for the censoring variable. Only applicable when using inverse
        probability weights to estimate the natural course means / risk from the observed data.

    visit_covs : List, (default=None)
        A list of strings, each of which specifies the name of a covariate whose modeling depends on the visit process.

    ts_visit_names : List, (default=None)
        A list of strings, each of which indicates the number of consecutive missed visits for one covariate before an
        individual is censored. The list has the same length as visit_covs.

    Returns
    -------
    cov_hist_infos : Dict
        A dictionary whose keys are covariate names and values are sub-dictionaries with historical information for
        covariates. Each sub-dictionary contains keys 'lagged', 'cumavg' and 'lagavg', the corresponding value for the
        key 'lagged' is a two-element list where the first element is a list with all lagged terms, the second element
        is a list with the corresponding lagged numbers. Same for the key 'lagavg'. The corresponding value for the key
        'cumavg' is a list with all cumavg terms. Each term is listed once.

    Raises
    ------
    ValueError
        If a cumavg or lag_cumavg term is requested for a covariate of type 'categorical' or 'categorical time'.

    """

    statements = list(covmodels) + [ymodel]
    if compevent_model is not None:
        statements.append(compevent_model)
    if censor_model is not None:
        statements.append(censor_model)

    # Unique model terms in a deterministic (sorted) order.
    terms = sorted({term for statement in statements
                    for term in re.split('[~|+]', statement.replace(' ', ''))})

    if ts_visit_names:
        covnames = covnames + ts_visit_names

    cov_hist_infos = {}
    for k, cov in enumerate(covnames):
        # Visit counters are appended after the typed covariates and have no entry in covtypes.
        cov_type = covtypes[k] if k < len(covtypes) else None
        is_categorical = cov_type in ('categorical', 'categorical time')

        # Match the covariate as a whole name so that e.g. 'lag1_A2' is not read as a lag of 'A'.
        name = re.escape(cov)
        lag_pattern = re.compile(r'(?<!\w)lag(\d+)_{0}(?!\w)'.format(name))
        cumavg_pattern = re.compile(r'(?<!\w)cumavg_{0}(?!\w)'.format(name))
        lagavg_pattern = re.compile(r'(?<!\w)lag_cumavg(\d+)_{0}(?!\w)'.format(name))

        # Dicts keep first-seen order and drop repeated terms; values are the lag numbers.
        lagged, lagavg, cumavg = {}, {}, {}
        for term in terms:
            for match in lag_pattern.finditer(term):
                lagged.setdefault(match.group(0), int(match.group(1)))

            for match in cumavg_pattern.finditer(term):
                if is_categorical:
                    raise ValueError('Cannot apply cumulative average function to categorical covariates.')
                cumavg.setdefault(match.group(0), None)

            for match in lagavg_pattern.finditer(term):
                if is_categorical:
                    raise ValueError('Cannot apply lagged cumulative average function to categorical covariates.')
                lagavg.setdefault(match.group(0), int(match.group(1)))

        # These covariates always get a lag1 term, whether or not a model statement mentions it.
        needs_lag1 = (cov_type == 'absorbing'
                      or bool(visit_covs and cov in visit_covs)
                      or bool(ts_visit_names and cov in ts_visit_names))
        if needs_lag1:
            lagged.setdefault('lag1_{0}'.format(cov), 1)

        cov_hist_infos[cov] = {
            'lagged': [list(lagged), list(lagged.values())],
            'cumavg': list(cumavg),
            'lagavg': [list(lagavg), list(lagavg.values())],
        }

    return cov_hist_infos
def visit_func(df, time_name, visit_name, ts_visit_name):
    """
    An internal function that assists the implementation of a visit process; it creates a new column named
    ts_visit_name.

    Parameters
    ----------
    df : DataFrame
        A pandas DataFrame of the input obs_data.

    time_name : Str
        A string specifying the name of the time variable in obs_data.

    visit_name : Str
        A string specifying the covariate name of a visit process.

    ts_visit_name : Str
        A string indicating the number of consecutive missed visits before an individual is censored.

    Returns
    -------
    df : DataFrame
        A pandas DataFrame with a new column ts_visit_name created. The input DataFrame itself is modified and
        returned.

    """

    times = df[time_name]
    df.loc[times == 0, ts_visit_name] = 0

    def visited(step):
        # The first row found at a time step decides whether a visit took place at that step.
        return df.loc[times == step, visit_name].values[0] == 1

    extra_missed = 0  # missed visits beyond the first one of the current run
    # int() lets a float-typed time column (e.g. 3.0) drive range(), which only accepts integers.
    last_step = int(times.max())
    for t in range(1, last_step + 1):
        at_t = times == t
        if visited(t):
            df.loc[at_t, ts_visit_name] = 0
        elif visited(t - 1):  # restart the count with new visit
            df.loc[at_t, ts_visit_name] = 1
            extra_missed = 0
        else:  # continue to count the missed visit number
            extra_missed += 1
            df.loc[at_t, ts_visit_name] = 1 + extra_missed
    return df
def categorical_func(t, time_thresholds):
    """
    Return the index of the time category that t falls into.

    Parameters
    ----------
    t : Int
        The time index to categorize.

    time_thresholds : List
        Upper bounds (inclusive) of the time categories, scanned in the given order.

    Returns
    -------
    categorical_t : Int
        The position of the first threshold that is greater than or equal to t, or len(time_thresholds) when t
        exceeds every threshold (0 for an empty list).

    """
    for category, threshold in enumerate(time_thresholds):
        if t <= threshold:
            return category
    return len(time_thresholds)


def _first_flagged_row(df, flagged):
    """Return the first row of df whose flag is set, or the last row of df when no flag is set."""
    if len(df) == 0:
        raise ValueError('Cannot select an event row from an empty DataFrame.')
    flags = flagged.to_numpy()
    # argmax on a boolean array gives the position of the first True
    position = int(flags.argmax()) if flags.any() else len(df) - 1
    return df.iloc[position]


def hr_data_helper(df, outcome_name):
    """
    Reduce the records of one individual to a single row for the hazard ratio calculation.

    Parameters
    ----------
    df : DataFrame
        The rows of one individual, in time order.

    outcome_name : Str
        A string specifying the name of the outcome variable.

    Returns
    -------
    row : Series
        The first row in which the outcome equals 1, or the last row if the outcome never equals 1.

    """
    return _first_flagged_row(df, df[outcome_name] == 1)


def hr_comp_data_helper(df, outcome_name, compevent_name):
    """
    Reduce the records of one individual to a single row for the hazard ratio calculation with a competing event.

    Parameters
    ----------
    df : DataFrame
        The rows of one individual, in time order.

    outcome_name : Str
        A string specifying the name of the outcome variable.

    compevent_name : Str
        A string specifying the name of the competing event variable.

    Returns
    -------
    row : Series
        The first row in which the competing event or the outcome equals 1, or the last row if neither ever
        equals 1.

    """
    return _first_flagged_row(df, (df[compevent_name] == 1) | (df[outcome_name] == 1))
notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | # Pygformula: a python implementation of the parametric g-formula 22 | # The pygformula 1.0 implements the non-iterative conditional expectation (NICE) algorithm of the g-formula with 23 | # parametric models for covariates, treatments and the outcome. 24 | 25 | __version__ = '1.1.6' 26 | -------------------------------------------------------------------------------- /readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.8" 7 | jobs: 8 | post_create_environment: 9 | - python -m pip install sphinx_rtd_theme 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | formats: 15 | - pdf 16 | - epub 17 | 18 | python: 19 | install: 20 | - method: pip 21 | path: . 
22 | - requirements: docs/requirements.txt 23 | - requirements: requirements.txt 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=1.2.0 2 | lifelines>=0.27.4 3 | matplotlib>=3.5.1 4 | numpy>=1.22.0 5 | pandas>=1.5.2 6 | prettytable>=3.10.0 7 | pytruncreg>=0.1.2 8 | scipy>=1.10.0 9 | seaborn>=0.11.2 10 | statsmodels>=0.14.0 11 | tqdm>=4.64.0 12 | PyQt5>=5.15.11 13 | -------------------------------------------------------------------------------- /running_examples/get_started_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A'] 11 | covtypes = ['binary', 'bounded normal', 'binary'] 12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 13 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 14 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 15 | 16 | basecovs = ['L3'] 17 | 18 | outcome_name = 'Y' 19 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 20 | outcome_type = 'survival' 21 | 22 | time_points = np.max(np.unique(obs_data[time_name])) + 1 23 | int_descript = ['Never treat', 'Always treat'] 24 | 25 | 26 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 27 | int_descript = int_descript, 28 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type, 30 | Intervention1_A = [static, np.zeros(time_points)], 31 | Intervention2_A = [static, np.ones(time_points)], 32 | 
nsamples=20, parallel=True, ncores=8, 33 | ) 34 | g.fit() 35 | g.plot_natural_course() 36 | g.plot_interventions() -------------------------------------------------------------------------------- /running_examples/test_absorbing_cov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_absorbing_data 5 | 6 | obs_data = load_absorbing_data() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L', 'A'] 11 | covtypes = ['absorbing', 'binary'] 12 | covmodels = ['L ~ lag1_L + lag1_A + t0', 13 | 'A ~ lag1_A + L + t0'] 14 | 15 | outcome_name = 'Y' 16 | ymodel = 'Y ~ L + A + t0' 17 | outcome_type = 'survival' 18 | 19 | time_points = np.max(np.unique(obs_data[time_name])) + 1 20 | int_descript = ['Never treat', 'Always treat'] 21 | 22 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 23 | int_descript = int_descript, 24 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, 25 | Intervention1_A = [static, np.zeros(time_points)], 26 | Intervention2_A = [static, np.ones(time_points)], 27 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 28 | ) 29 | g.fit() 30 | -------------------------------------------------------------------------------- /running_examples/test_binary_cov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'A'] 11 | covtypes = ['binary', 'binary'] 12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + L3 + t0', 13 | 'A ~ lag1_A + L1 + lag_cumavg1_L1 + L3 + t0'] 14 | 15 | basecovs = 
['L3'] 16 | 17 | outcome_name = 'Y' 18 | ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0' 19 | outcome_type = 'survival' 20 | 21 | time_points = np.max(np.unique(obs_data[time_name])) + 1 22 | int_descript = ['Never treat', 'Always treat'] 23 | 24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 25 | int_descript = int_descript, 26 | Intervention1_A = [static, np.zeros(time_points)], 27 | Intervention2_A = [static, np.ones(time_points)], 28 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 30 | ) 31 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_binary_eof.py: -------------------------------------------------------------------------------- 1 | from pygformula import ParametricGformula 2 | from pygformula.interventions import threshold 3 | from pygformula.data import load_binary_eof 4 | 5 | obs_data = load_binary_eof() 6 | time_name = 't0' 7 | id = 'id' 8 | 9 | covnames = ['L1', 'L2', 'A'] 10 | covtypes = ['binary', 'zero-inflated normal', 'normal'] 11 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + L3 + t0', 12 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 13 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 14 | 15 | basecovs = ['L3'] 16 | 17 | outcome_name = 'Y' 18 | ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0' 19 | outcome_type = 'binary_eof' 20 | 21 | int_descript = ['Threshold intervention'] 22 | 23 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 24 | int_descript = int_descript, 25 | Intervention1_A = [threshold, [0.5, float('inf')]], 26 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 27 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 28 | ) 29 | g.fit() 
-------------------------------------------------------------------------------- /running_examples/test_bounded_normal_cov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L2', 'A'] 11 | covtypes = ['bounded normal', 'binary'] 12 | covmodels = ['L2 ~ lag1_A + lag_cumavg1_L2 + L3 + t0', 13 | 'A ~ lag1_A + L2 + lag_cumavg1_L2 + L3 + t0'] 14 | 15 | basecovs = ['L3'] 16 | 17 | outcome_name = 'Y' 18 | ymodel = 'Y ~ L2 + A + lag1_A + L3 + t0' 19 | outcome_type = 'survival' 20 | 21 | time_points = np.max(np.unique(obs_data[time_name])) + 1 22 | int_descript = ['Never treat', 'Always treat'] 23 | 24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 25 | int_descript = int_descript, intcomp=[1, 2], 26 | Intervention1_A = [static, np.zeros(time_points)], 27 | Intervention2_A = [static, np.ones(time_points)], 28 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 30 | ) 31 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_categorical_cov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_categorical 5 | 6 | obs_data = load_categorical() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = [ 'L', 'A'] 11 | covtypes = ['categorical', 'binary'] 12 | covmodels = [ 'L ~ C(lag1_L) + t0', 13 | 'A ~ C(L) + C(lag1_L) + t0'] 14 | 15 | outcome_name = 'Y' 16 | ymodel = 'Y ~ C(lag1_L) + 
A' 17 | 18 | time_points = np.max(np.unique(obs_data[time_name])) + 1 19 | int_descript = ['Never treat', 'Always treat'] 20 | 21 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 22 | int_descript = int_descript, 23 | Intervention1_A = [static, np.zeros(time_points)], 24 | Intervention2_A = [static, np.ones(time_points)], 25 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, outcome_name=outcome_name, 26 | ymodel=ymodel, outcome_type='survival') 27 | g.fit() 28 | 29 | -------------------------------------------------------------------------------- /running_examples/test_categorical_time.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A', 't0_f'] 11 | covtypes = ['binary', 'bounded normal', 'binary', 'categorical time'] 12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + C(t0_f)', 13 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + C(t0_f)', 14 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + C(t0_f)', 15 | 'NA'] 16 | 17 | time_thresholds = [1, 3, 5] 18 | 19 | basecovs = ['L3'] 20 | 21 | outcome_name = 'Y' 22 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 23 | outcome_type = 'survival' 24 | 25 | time_points = np.max(np.unique(obs_data[time_name])) + 1 26 | int_descript = ['Never treat', 'Always treat'] 27 | 28 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 29 | int_descript = int_descript, time_thresholds = time_thresholds, 30 | Intervention1_A = [static, np.zeros(time_points)], 31 | Intervention2_A = [static, np.ones(time_points)], 32 | 
covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 33 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type, 34 | ) 35 | g.fit() 36 | -------------------------------------------------------------------------------- /running_examples/test_censor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_censor_data 5 | 6 | obs_data = load_censor_data() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L', 'A'] 11 | covtypes = ['binary', 'normal'] 12 | 13 | covmodels = ['L ~ lag1_L + t0', 14 | 'A ~ lag1_A + L + t0'] 15 | 16 | outcome_name = 'Y' 17 | ymodel = 'Y ~ A + L' 18 | 19 | censor_name = 'C' 20 | censor_model = 'C ~ A + L' 21 | 22 | time_points = np.max(np.unique(obs_data[time_name])) + 1 23 | int_descript = ['Never treat', 'Always treat'] 24 | 25 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 26 | int_descript=int_descript, 27 | Intervention1_A = [static, np.zeros(time_points)], 28 | Intervention2_A = [static, np.ones(time_points)], 29 | censor_name= censor_name, censor_model=censor_model, 30 | covnames = covnames, covtypes = covtypes, covmodels = covmodels, 31 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 32 | g.fit() 33 | -------------------------------------------------------------------------------- /running_examples/test_comp_restrictions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata 5 | 6 | obs_data = load_basicdata() 7 | 8 | covnames = ['L1', 'L2', 'A'] 9 | covtypes = ['binary', 'bounded normal', 'binary'] 10 | covmodels = ['L1 ~ lag1_A + lag2_A + 
lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 11 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 12 | 'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 13 | 14 | ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2' 15 | 16 | time_name = 't0' 17 | id = 'id' 18 | outcome_name = 'Y' 19 | basecovs = ['L3'] 20 | 21 | compevent_name = 'D' 22 | compevent_model = 'D ~ A + L1 + L2 + L3 + t0' 23 | compevent_cens = False 24 | 25 | time_points = np.max(np.unique(obs_data[time_name])) + 1 26 | int_descript = ['Never treat', 'Always treat'] 27 | 28 | 29 | compevent_restrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]] 30 | 31 | g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points, time_name=time_name, 32 | int_descript = int_descript, 33 | Intervention1_A = [static, np.zeros(time_points)], 34 | Intervention2_A = [static, np.ones(time_points)], 35 | basecovs =basecovs, covnames=covnames, covtypes=covtypes, covmodels=covmodels, 36 | compevent_restrictions = compevent_restrictions, 37 | compevent_cens= compevent_cens, compevent_name = compevent_name, compevent_model=compevent_model, 38 | outcome_name=outcome_name, outcome_type='survival', ymodel=ymodel) 39 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_competing_event.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata 5 | 6 | obs_data = load_basicdata() 7 | 8 | covnames = ['L1', 'L2', 'A'] 9 | covtypes = ['binary', 'bounded normal', 'binary'] 10 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 11 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 12 | 'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 13 | 14 | 
ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2' 15 | 16 | time_name = 't0' 17 | id = 'id' 18 | outcome_name = 'Y' 19 | basecovs = ['L3'] 20 | 21 | compevent_name = 'D' 22 | compevent_model = 'D ~ A + L1 + L2 + L3 + t0' 23 | 24 | time_points = np.max(np.unique(obs_data[time_name])) + 1 25 | int_descript = ['Never treat', 'Always treat'] 26 | 27 | g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points, time_name=time_name, 28 | int_descript = int_descript, 29 | Intervention1_A = [static, np.zeros(time_points)], 30 | Intervention2_A = [static, np.ones(time_points)], 31 | basecovs =basecovs, covnames=covnames, covtypes=covtypes, covmodels=covmodels, 32 | compevent_name = compevent_name, compevent_model=compevent_model, 33 | outcome_name=outcome_name, outcome_type='survival', ymodel=ymodel) 34 | g.fit() 35 | -------------------------------------------------------------------------------- /running_examples/test_continuous_eof.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_continuous_eof 5 | 6 | obs_data = load_continuous_eof() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A'] 11 | covtypes = ['categorical', 'normal', 'binary'] 12 | covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0', 13 | 'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0', 14 | 'A ~ C(L1) + L2 + t0'] 15 | 16 | basecovs = ['L3'] 17 | 18 | outcome_name = 'Y' 19 | ymodel = 'Y ~ C(L1) + L2 + A' 20 | outcome_type = 'continuous_eof' 21 | 22 | time_points = np.max(np.unique(obs_data[time_name])) + 1 23 | int_descript = ['Never treat', 'Always treat'] 24 | 25 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, 26 | int_descript=int_descript, 27 | Intervention1_A = [static, np.zeros(time_points)], 28 | Intervention2_A = [static, np.ones(time_points)], 29 | 
covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 30 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 31 | ) 32 | g.fit() 33 | -------------------------------------------------------------------------------- /running_examples/test_custom_ymodel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | from sklearn.ensemble import RandomForestRegressor 4 | 5 | from pygformula.interventions import static 6 | from pygformula import ParametricGformula 7 | from pygformula.data import load_continuous_eof 8 | 9 | obs_data = load_continuous_eof() 10 | 11 | time_name = 't0' 12 | id = 'id' 13 | 14 | covnames = ['L1', 'L2', 'A'] 15 | covtypes = ['categorical', 'normal', 'binary'] 16 | covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0', 17 | 'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0', 18 | 'A ~ C(L1) + L2 + t0'] 19 | 20 | basecovs = ['L3'] 21 | 22 | outcome_name = 'Y' 23 | 24 | ymodel = 'Y ~ lag1_L2 + L2 + lag1_A + A' 25 | 26 | # define interventions 27 | time_points = np.max(np.unique(obs_data[time_name])) + 1 28 | int_descript = ['Never treat', 'Always treat'] 29 | 30 | 31 | def ymodel_fit_custom(ymodel, fit_data): 32 | y_name, x_name = re.split('~', ymodel.replace(' ', '')) 33 | x_name = re.split('\+', x_name.replace(' ', '')) 34 | # get feature and target data to fit ymodel 35 | y = fit_data[y_name].to_numpy() 36 | X = fit_data[x_name].to_numpy() 37 | fit_rf = RandomForestRegressor() 38 | fit_rf.fit(X, y) 39 | return fit_rf 40 | 41 | def ymodel_predict_custom(ymodel, new_df, fit): 42 | y_name, x_name = re.split('~', ymodel.replace(' ', '')) 43 | x_name = re.split('\+', x_name.replace(' ', '')) 44 | # get feature data to predict 45 | X = new_df[x_name].to_numpy() 46 | prediction = fit.predict(X) 47 | return prediction 48 | 49 | 50 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 51 | int_descript = int_descript, 
52 | Intervention1_A = [static, np.zeros(time_points)], basecovs=['L3'], 53 | Intervention2_A = [static, np.ones(time_points)], 54 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, 55 | ymodel_fit_custom = ymodel_fit_custom, ymodel_predict_custom=ymodel_predict_custom, 56 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='continuous_eof') 57 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_dynamic_intervention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.data import load_basicdata_nocomp 4 | 5 | obs_data = load_basicdata_nocomp() 6 | time_name = 't0' 7 | id = 'id' 8 | 9 | covnames = ['L1', 'L2', 'A'] 10 | covtypes = ['binary', 'bounded normal', 'binary'] 11 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 12 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 13 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 14 | 15 | basecovs = ['L3'] 16 | 17 | time_points = np.max(np.unique(obs_data[time_name])) + 1 18 | 19 | def dynamic_intervention(new_df, pool, int_var, time_name, t): 20 | new_df.loc[new_df[time_name] == t, int_var] = 0 21 | new_df.loc[new_df['L2'] > 0.75, int_var] = 1 22 | 23 | int_descript = ['Dynamic intervention'] 24 | 25 | outcome_name = 'Y' 26 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 27 | 28 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 29 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 30 | int_descript = int_descript, 31 | Intervention1_A = [dynamic_intervention], 32 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 33 | g.fit() -------------------------------------------------------------------------------- 
/running_examples/test_fit_random_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | from sklearn.ensemble import RandomForestRegressor 4 | 5 | from pygformula.interventions import static 6 | from pygformula import ParametricGformula 7 | from pygformula.data import load_basicdata_nocomp 8 | 9 | obs_data = load_basicdata_nocomp() 10 | 11 | time_name = 't0' 12 | id = 'id' 13 | 14 | covnames = ['L1', 'L2', 'A'] 15 | covtypes = ['binary', 'custom', 'binary'] 16 | covmodels = ['L1 ~ lag1_A + lag2_A + lag1_L1 + lag_cumavg1_L2 + t0', 17 | 'L2 ~ lag1_A + L1 + lag1_L1 + lag_cumavg1_L2 + t0', 18 | 'A ~ lag1_A + L1 + L2 +lag1_L1 + lag_cumavg1_L2 + t0'] 19 | 20 | 21 | outcome_name = 'Y' 22 | ymodel = 'Y ~ L1 + L2 + A' 23 | 24 | # define interventions 25 | time_points = np.max(np.unique(obs_data[time_name])) + 1 26 | int_descript = ['Never treat', 'Always treat'] 27 | 28 | 29 | def fit_rf(covmodel, covname, fit_data): 30 | max_depth = 2 31 | y_name, x_name = re.split('~', covmodel.replace(' ', '')) 32 | x_name = re.split('\+', x_name.replace(' ', '')) 33 | y = fit_data[y_name].to_numpy() 34 | X = fit_data[x_name].to_numpy() 35 | fit_rf = RandomForestRegressor(max_depth=max_depth, random_state=0) 36 | fit_rf.fit(X, y) 37 | return fit_rf 38 | 39 | def predict_rf(covmodel, new_df, fit): 40 | y_name, x_name = re.split('~', covmodel.replace(' ', '')) 41 | x_name = re.split('\+', x_name.replace(' ', '')) 42 | X = new_df[x_name].to_numpy() 43 | prediction = fit.predict(X) 44 | return prediction 45 | 46 | covfits_custom = ['NA', fit_rf, 'NA'] 47 | covpredict_custom = ['NA', predict_rf, 'NA'] 48 | 49 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 50 | int_descript = int_descript, 51 | Intervention1_A = [static, np.zeros(time_points)], 52 | Intervention2_A = [static, np.ones(time_points)], 53 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, 
54 | covfits_custom = covfits_custom, covpredict_custom=covpredict_custom, 55 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 56 | g.fit() 57 | -------------------------------------------------------------------------------- /running_examples/test_natural_course.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.data import load_basicdata_nocomp 4 | 5 | obs_data = load_basicdata_nocomp() 6 | time_name = 't0' 7 | id = 'id' 8 | 9 | covnames = ['L1', 'L2', 'A'] 10 | covtypes = ['binary', 'bounded normal', 'binary'] 11 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 12 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 13 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 14 | 15 | basecovs = ['L3'] 16 | 17 | outcome_name = 'Y' 18 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 19 | outcome_type = 'survival' 20 | 21 | time_points = np.max(np.unique(obs_data[time_name])) + 1 22 | 23 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 24 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 25 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 26 | ) 27 | g.fit() 28 | -------------------------------------------------------------------------------- /running_examples/test_natural_grace_period.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import natural_grace_period 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A'] 11 | covtypes = ['binary', 'bounded normal', 'binary'] 12 | covmodels = ['L1 ~ 
lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 13 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 14 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 15 | 16 | basecovs = ['L3'] 17 | 18 | time_points = np.max(np.unique(obs_data[time_name])) + 1 19 | 20 | int_descript = ['natural grace period intervention'] 21 | 22 | outcome_name = 'Y' 23 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 24 | 25 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 26 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 27 | int_descript = int_descript, 28 | Intervention1_A = [natural_grace_period, [3, {'L1': lambda x: x == 1}]], 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 30 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_normal_cov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L2', 'A'] 11 | covtypes = ['normal', 'binary'] 12 | covmodels = ['L2 ~ lag1_A + lag_cumavg1_L2 + L3 + t0', 13 | 'A ~ lag1_A + L2 + lag_cumavg1_L2 + L3 + t0'] 14 | 15 | basecovs = ['L3'] 16 | 17 | outcome_name = 'Y' 18 | ymodel = 'Y ~ L2 + A + lag1_A + L3 + t0' 19 | outcome_type = 'survival' 20 | 21 | time_points = np.max(np.unique(obs_data[time_name])) + 1 22 | int_descript = ['Never treat', 'Always treat'] 23 | 24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 25 | int_descript = int_descript, 26 | Intervention1_A = [static, np.zeros(time_points)], 27 | Intervention2_A = [static, np.ones(time_points)], 28 | 
covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 30 | ) 31 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_restrictions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | 8 | time_name = 't0' 9 | id = 'id' 10 | 11 | covnames = ['L1', 'L2', 'A'] 12 | covtypes = ['binary', 'normal', 'binary'] 13 | covmodels = ['L1 ~ lag1_L1 + lag1_A', 14 | 'L2 ~ L1 + lag1_L2', 15 | 'A ~ L1 + L2'] 16 | 17 | basecovs = ['L3'] 18 | outcome_name = 'Y' 19 | ymodel = 'Y ~ L1 + L2 + A' 20 | 21 | # define interventions 22 | time_points = np.max(np.unique(obs_data[time_name])) + 1 23 | int_descript = ['Never treat', 'Always treat'] 24 | 25 | 26 | restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5], ['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]] 27 | 28 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 29 | int_descript = int_descript, 30 | Intervention1_A = [static, np.zeros(time_points)], 31 | Intervention2_A = [static, np.ones(time_points)], 32 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 33 | restrictions=restrictions, outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 34 | g.fit() 35 | -------------------------------------------------------------------------------- /running_examples/test_square_time.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | 
obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A', 'square_t0'] 11 | covtypes = ['binary', 'bounded normal', 'binary', 'square time'] 12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + square_t0', 13 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + square_t0', 14 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + square_t0', 15 | 'NA'] 16 | 17 | basecovs = ['L3'] 18 | 19 | outcome_name = 'Y' 20 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0 + square_t0' 21 | outcome_type = 'survival' 22 | 23 | time_points = np.max(np.unique(obs_data[time_name])) + 1 24 | int_descript = ['Never treat', 'Always treat'] 25 | 26 | 27 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 28 | int_descript = int_descript, 29 | Intervention1_A = [static, np.zeros(time_points)], 30 | Intervention2_A = [static, np.ones(time_points)], 31 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 32 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type, 33 | ) 34 | g.fit() 35 | -------------------------------------------------------------------------------- /running_examples/test_static_multiple_treatments.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_multiple_treatments_data 5 | 6 | obs_data = load_multiple_treatments_data() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A1', 'A2'] 11 | covtypes = ['binary', 'bounded normal', 'binary', 'binary'] 12 | covmodels = ['L1 ~ lag1_L1', 13 | 'L2 ~ lag1_L1 + lag1_L2 + lag1_A2 + L1', 14 | 'A1 ~ lag1_L1 + lag1_L2', 15 | 'A2 ~ lag1_A1'] 16 | 17 | time_points = np.max(np.unique(obs_data[time_name])) + 1 18 | 
int_descript = ['Always treat on A1 & A2'] 19 | 20 | 21 | outcome_name = 'Y' 22 | ymodel = 'Y ~ L1 + L2 + A1 + A2' 23 | 24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 25 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, 26 | int_descript = int_descript, 27 | Intervention1_A1 = [static, np.ones(time_points)], 28 | Intervention1_A2 = [static, np.ones(time_points)], 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 30 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_static_one_treatment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A'] 11 | covtypes = ['binary', 'bounded normal', 'binary'] 12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 13 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 14 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 15 | 16 | basecovs = ['L3'] 17 | 18 | time_points = np.max(np.unique(obs_data[time_name])) + 1 19 | int_descript = ['Always treat'] 20 | 21 | outcome_name = 'Y' 22 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 23 | 24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 25 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 26 | int_descript = int_descript, 27 | Intervention1_A = [static, np.ones(time_points), [0, 1, 4]], 28 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 29 | g.fit() -------------------------------------------------------------------------------- 
/running_examples/test_threshold_intervention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import threshold 4 | from pygformula.data import load_threshold_data 5 | 6 | obs_data = load_threshold_data() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A'] 11 | covtypes = ['binary', 'bounded normal', 'normal'] 12 | covmodels = ['L1 ~ lag1_L1', 13 | 'L2 ~ lag1_L1 + lag1_L2 + L1', 14 | 'A ~ L1 + L2'] 15 | 16 | time_points = np.max(np.unique(obs_data[time_name])) + 1 17 | 18 | int_descript = ['Threshold intervention'] 19 | 20 | outcome_name = 'Y' 21 | ymodel = 'Y ~ L1 + L2 + A' 22 | 23 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 24 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, 25 | int_descript = int_descript, 26 | Intervention1_A = [threshold, [0.5, float('inf')]], 27 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 28 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_truncated_normal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_truncated_normal 5 | 6 | obs_data = load_truncated_normal() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L', 'A'] 11 | covtypes = ['truncated normal', 'binary'] 12 | covmodels = ['L ~ lag1_A + lag1_L + t0', 13 | 'A ~ lag1_A + lag1_L + L + t0'] 14 | 15 | trunc_params = [[1, 'right'], 'NA'] 16 | 17 | outcome_name = 'Y' 18 | ymodel = 'Y ~ L + A + t0' 19 | outcome_type = 'survival' 20 | 21 | time_points = np.max(np.unique(obs_data[time_name])) + 1 22 | int_descript = ['Never treat', 'Always treat'] 23 | 24 | g = 
ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 25 | int_descript = int_descript, 26 | Intervention1_A = [static, np.zeros(time_points)], 27 | Intervention2_A = [static, np.ones(time_points)], 28 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, trunc_params=trunc_params, 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 30 | ) 31 | g.fit() 32 | -------------------------------------------------------------------------------- /running_examples/test_uniform_grace_period.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import uniform_grace_period 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L1', 'L2', 'A'] 11 | covtypes = ['binary', 'bounded normal', 'binary'] 12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 13 | 'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0', 14 | 'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0'] 15 | 16 | basecovs = ['L3'] 17 | 18 | time_points = np.max(np.unique(obs_data[time_name])) + 1 19 | 20 | int_descript = ['uniform grace period intervention'] 21 | 22 | outcome_name = 'Y' 23 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0' 24 | 25 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 26 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 27 | int_descript = int_descript, 28 | Intervention1_A = [uniform_grace_period, [3, {'L1': lambda x: x == 1}]], 29 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 30 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_visit_process.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_visit_process 5 | 6 | obs_data = load_visit_process() 7 | time_name = 'month' 8 | id = 'id' 9 | 10 | covnames = ['visit_cd4', 'visit_rna', 'cd4_v', 'rna_v', 'everhaart'] 11 | covtypes = ['binary', 'binary', 'normal', 'normal', 'binary'] 12 | covmodels = ['visit_cd4 ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month', 13 | 'visit_rna ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month', 14 | 'cd4_v ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month', 15 | 'rna_v ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month', 16 | 'everhaart ~ lag1_everhaart + cd4_v + rna_v + sex + race + month'] 17 | 18 | basecovs = ['sex', 'race', 'age'] 19 | 20 | visitprocess = [['visit_cd4', 'cd4_v', 3], ['visit_rna', 'rna_v', 3]] 21 | 22 | outcome_name = 'event' 23 | ymodel = 'event ~ cd4_v + rna_v + everhaart + sex + race + month' 24 | 25 | time_points = np.max(np.unique(obs_data[time_name])) + 1 26 | 27 | int_descript = ['Never treat', 'Always treat'] 28 | 29 | 30 | g = ParametricGformula(obs_data = obs_data, id = id, time_name = time_name, visitprocess = visitprocess, 31 | int_descript = int_descript, 32 | Intervention1_everhaart = [static, np.zeros(time_points)], 33 | Intervention2_everhaart = [static, np.ones(time_points)], 34 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs = basecovs, 35 | outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 36 | g.fit() 37 | g.plot_interventions() 38 | g.plot_natural_course() -------------------------------------------------------------------------------- /running_examples/test_yrestrictions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from 
pygformula.interventions import static 4 | from pygformula.data import load_basicdata_nocomp 5 | 6 | obs_data = load_basicdata_nocomp() 7 | 8 | time_name = 't0' 9 | id = 'id' 10 | 11 | covnames = ['L1', 'L2', 'A'] 12 | covtypes = ['binary', 'normal', 'binary'] 13 | covmodels = ['L1 ~ lag1_L1 + lag1_A', 14 | 'L2 ~ L1 + lag1_L2', 15 | 'A ~ L1 + L2'] 16 | 17 | basecovs = ['L3'] 18 | outcome_name = 'Y' 19 | ymodel = 'Y ~ L1 + L2 + A' 20 | 21 | # define interventions 22 | time_points = np.max(np.unique(obs_data[time_name])) + 1 23 | int_descript = ['Never treat', 'Always treat'] 24 | 25 | yrestrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]] 26 | 27 | 28 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 29 | int_descript = int_descript, 30 | Intervention1_A = [static, np.zeros(time_points)], 31 | Intervention2_A = [static, np.ones(time_points)], 32 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs, 33 | yrestrictions=yrestrictions, outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival') 34 | g.fit() -------------------------------------------------------------------------------- /running_examples/test_zero_inflated_normal_cov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pygformula import ParametricGformula 3 | from pygformula.interventions import static 4 | from pygformula.data import load_zero_inflated_normal 5 | 6 | obs_data = load_zero_inflated_normal() 7 | time_name = 't0' 8 | id = 'id' 9 | 10 | covnames = ['L', 'A'] 11 | covtypes = ['zero-inflated normal', 'binary'] 12 | covmodels = ['L ~ lag1_L + lag1_A + t0', 13 | 'A ~ lag1_A + L + t0'] 14 | 15 | outcome_name = 'Y' 16 | ymodel = 'Y ~ L + A + t0' 17 | outcome_type = 'survival' 18 | 19 | time_points = np.max(np.unique(obs_data[time_name])) + 1 20 | int_descript = ['Never treat', 'Always treat'] 21 | 22 | g = 
ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points, 23 | int_descript = int_descript, 24 | Intervention1_A = [static, np.zeros(time_points)], 25 | Intervention2_A = [static, np.ones(time_points)], 26 | covnames=covnames, covtypes=covtypes, covmodels=covmodels, 27 | outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type 28 | ) 29 | g.fit() 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | INSTALL_REQUIRES = [ 4 | 'joblib>=1.2', 5 | 'lifelines>=0.27', 6 | 'matplotlib>=3.5', 7 | 'numpy>=1.22', 8 | 'pandas>=1.5', 9 | 'prettytable>=3.10', 10 | 'pytruncreg>=0.1', 11 | 'scipy>=1.10', 12 | 'seaborn>=0.11', 13 | 'statsmodels>=0.14', 14 | 'tqdm>=4.64', 15 | 'PyQt5>=5.15' 16 | ] 17 | 18 | version = {} 19 | with open("pygformula/version.py") as fp: 20 | exec(fp.read(), version) 21 | 22 | with open('README.md', 'r', encoding='utf-8') as f: 23 | long_description = f.read() 24 | 25 | setuptools.setup( 26 | name='pygformula', 27 | version=version['__version__'], 28 | maintainer='Jing Li', 29 | maintainer_email='jing_li@hsph.harvard.edu', 30 | description='A python implementation of the parametric g-formula', 31 | long_description=long_description, 32 | long_description_content_type='text/markdown', 33 | packages=setuptools.find_packages(), 34 | install_requires=INSTALL_REQUIRES, 35 | python_requires='>=3.8' 36 | ) --------------------------------------------------------------------------------