├── .github
    └── workflows
    │   └── publish.yaml
├── LICENSE
├── Pygformula_Documentation.pdf
├── README.md
├── datasets
    ├── example_data_absorbing.csv
    ├── example_data_basicdata.csv
    ├── example_data_basicdata_nocomp.csv
    ├── example_data_binary_eof.csv
    ├── example_data_categorical.csv
    ├── example_data_censor.csv
    ├── example_data_continuous_eof.csv
    ├── example_data_multiple_treatments.csv
    ├── example_data_truncated_normal.csv
    ├── example_data_visit_process.csv
    ├── example_data_zero_inflated_normal.csv
    └── example_threshold_data.csv
├── docs
    ├── Makefile
    ├── make.bat
    ├── requirements.txt
    └── source
    │   ├── Contact.rst
    │   ├── Datasets.rst
    │   ├── Get Started.rst
    │   ├── Installation.rst
    │   ├── Specifications
    │       ├── Censoring event.rst
    │       ├── Competing event.rst
    │       ├── Covariate models.rst
    │       ├── Deterministic knowledge.rst
    │       ├── Hazard ratio.rst
    │       ├── Input data.rst
    │       ├── Interventions.rst
    │       ├── Outcome model.rst
    │       ├── Output.rst
    │       ├── Visit process.rst
    │       └── index.rst
    │   ├── conf.py
    │   ├── index.rst
    │   └── media
    │       ├── absorbing_cov_example_output.png
    │       ├── binary_cov_example_output.png
    │       ├── binary_eof_example_output.png
    │       ├── bounded_normal_cov_example.png
    │       ├── categorical_cov_example_output.png
    │       ├── categorical_time_cov_example.png
    │       ├── censor_example_output.png
    │       ├── comp_restriction_example_output.png
    │       ├── competing_as_cens_output.png
    │       ├── competing_not_cens_output.png
    │       ├── continuous_eof_example_output.png
    │       ├── data_example.png
    │       ├── data_example_censor.png
    │       ├── data_example_competing.png
    │       ├── dynamic_example_output.png
    │       ├── example_hazardratio_output.png
    │       ├── get_started_example.png
    │       ├── get_started_example_all.jpg
    │       ├── get_started_example_bootstrap.jpg
    │       ├── get_started_example_intervention_curve.jpg
    │       ├── natural_course_output.png
    │       ├── natural_grace_period.png
    │       ├── normal_cov_example_output.png
    │       ├── random_forest_cov.png
    │       ├── restriction_example_output.png
    │       ├── static_example_one_treatment_output.png
    │       ├── static_example_two_treatments.png
    │       ├── static_multiple_interventions.png
    │       ├── survival_example_output.png
    │       ├── test_hazard_ratio.png
    │       ├── threshold_example_output.png
    │       ├── truncated_normal_cov_example.png
    │       ├── uniform_grace_period.png
    │       ├── visitprocess_example_output.png
    │       ├── yrestriction_example_output.png
    │       └── zero_inflated_normal_cov_example.png
├── pygformula
    ├── __init__.py
    ├── comparisons.py
    ├── data.py
    ├── interventions.py
    ├── parametric_gformula
    │   ├── __init__.py
    │   ├── bootstrap.py
    │   ├── fit.py
    │   ├── histories.py
    │   ├── parametric_gformula.py
    │   └── simulate.py
    ├── plot.py
    ├── utils
    │   ├── __init__.py
    │   ├── helper.py
    │   └── util.py
    └── version.py
├── readthedocs.yaml
├── requirements.txt
├── running_examples
    ├── get_started_example.py
    ├── test_absorbing_cov.py
    ├── test_binary_cov.py
    ├── test_binary_eof.py
    ├── test_bounded_normal_cov.py
    ├── test_categorical_cov.py
    ├── test_categorical_time.py
    ├── test_censor.py
    ├── test_comp_restrictions.py
    ├── test_competing_event.py
    ├── test_continuous_eof.py
    ├── test_custom_ymodel.py
    ├── test_dynamic_intervention.py
    ├── test_fit_random_forest.py
    ├── test_natural_course.py
    ├── test_natural_grace_period.py
    ├── test_normal_cov.py
    ├── test_restrictions.py
    ├── test_square_time.py
    ├── test_static_multiple_treatments.py
    ├── test_static_one_treatment.py
    ├── test_threshold_intervention.py
    ├── test_truncated_normal.py
    ├── test_uniform_grace_period.py
    ├── test_visit_process.py
    ├── test_yrestrictions.py
    └── test_zero_inflated_normal_cov.py
└── setup.py


/.github/workflows/publish.yaml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package to PyPI when a Release is Created
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   pypi-publish:
 9 |     name: Publish release to PyPI
10 |     runs-on: ubuntu-latest
11 |     environment:
12 |       name: pypi
13 |       url: https://pypi.org/p/pygformula
14 |     permissions:
15 |       id-token: write
16 |     steps:
17 |       - uses: actions/checkout@v4
18 |       - name: Set up Python
19 |         uses: actions/setup-python@v4
20 |         with:
21 |           python-version: "3.x"
22 |       - name: Install dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           pip install setuptools wheel
26 |       - name: Build package
27 |         run: |
28 |           python setup.py sdist bdist_wheel  # Could also be python -m build
29 |       - name: Publish package distributions to PyPI
30 |         uses: pypa/gh-action-pypi-publish@release/v1


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 The President and Fellows of Harvard College
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Pygformula_Documentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/Pygformula_Documentation.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pygformula: a python implementation of the parametric g-formula
 2 | 
 3 | [![PyPI version](https://badge.fury.io/py/pygformula.svg)](https://pypi.org/project/pygformula)
 4 | [![Documentation Status](https://readthedocs.org/projects/pygformula/badge/?version=latest)](https://pygformula.readthedocs.io)
 5 | [![Downloads](https://static.pepy.tech/badge/pygformula)](https://pepy.tech/project/pygformula)
 6 | 
 7 | **Authors: Jing Li, Sophia Rein, Sean McGrath, Roger Logan, Ryan O’Dea, Miguel Hernán**
 8 | 
 9 | 
10 | ## Overview
11 | The pygformula package implements the non-iterative conditional expectation (NICE) estimator of the g-formula algorithm
12 | (Robins, 1986). The g-formula can estimate an outcome’s counterfactual mean or risk under hypothetical treatment strategies
13 | (interventions) when there is sufficient information on time-varying treatments and confounders.
14 | 
15 | 
16 | ### Features
17 | 
18 | * Treatments: discrete or continuous time-varying treatments.
19 | * Outcomes: failure time outcomes or continuous/binary end of follow-up outcomes.
20 | * Interventions: interventions on a single treatment or joint interventions on multiple treatments.
21 | * Random measurement/visit process.
22 | * Incorporation of a priori knowledge of the data structure.
23 | * Censoring events.
24 | * Competing events.
25 | 
26 | 
27 | ## Requirements
28 | 
29 | The package requires python 3.8+ and these necessary dependencies:
30 | 
31 | - cmprsk
32 | - joblib
33 | - lifelines
34 | - matplotlib
35 | - numpy
36 | - pandas
37 | - prettytable
38 | - pytruncreg
39 | - scipy
40 | - seaborn
41 | - statsmodels
42 | - tqdm
43 | 
44 | 
45 | ## Documentation
46 | 
47 | The online documentation is available at [pygformula documentation](https://pygformula.readthedocs.io).
48 | 
49 | ## Issues
50 | 
51 | If you have any issues, please open an [issue](https://github.com/CausalInference/pygformula/issues) on GitHub; we will
52 | regularly check the questions. For any additional questions or comments, please email jing_li@hsph.harvard.edu.


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.https://www.sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=5.3.0
2 | sphinx_rtd_theme>=1.1.1
3 | readthedocs-sphinx-search>=0.3.2
4 | PyQt5>=5.15.11
5 | 


--------------------------------------------------------------------------------
/docs/source/Contact.rst:
--------------------------------------------------------------------------------
 1 | Contact
 2 | ''''''''''''''''''''
 3 | 
 4 | The pygformula package was developed in the CAUSALab by:
 5 | 
 6 |  - Jing Li, jing_li@hsph.harvard.edu
 7 |  - Sophia Rein, srein@hsph.harvard.edu
 8 |  - Sean McGrath, sean_mcgrath@g.harvard.edu
 9 |  - Roger Logan, rwlogan@hsph.harvard.edu
10 |  - Ryan O’Dea, ryanodea@hsph.harvard.edu
11 |  - Miguel Hernán, mhernan@hsph.harvard.edu
12 | 
13 | 
14 | If you have any questions or suggestions about this package, please contact jing_li@hsph.harvard.edu.
15 | As an ongoing open-source project, contributions are highly welcome for any bug reports or
16 | feature suggestions.
17 | 
18 |  - Issue reports: if you have any issues, please let us know by opening an `issue <https://github.com/CausalInference/pygformula/issues>`_
19 |    on github.
20 | 
21 |  - Feature requests: if you want to contribute any new feature implementation, please make a
22 |    `pull request <https://github.com/CausalInference/pygformula/pulls>`_ to post the feature requests.
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/docs/source/Datasets.rst:
--------------------------------------------------------------------------------
 1 | Datasets
 2 | '''''''''''''''''''
 3 | 
 4 | We provide simulation datasets for users to run the different examples in this tutorial.
 5 | Additionally, code for replicating all test examples can be found in `"running examples" <https://github.com/CausalInference/pygformula/tree/main/running_examples>`_.
 6 | 
 7 | 
 8 | .. note::
 9 | 
10 |    The data can be downloaded by the following command, e.g., downloading the "example_data_basicdata_nocomp" dataset:
11 | 
12 |      .. code-block::
13 | 
14 |        from pygformula.data import load_basicdata_nocomp
15 | 
16 |    or be accessed directly from the github repository `pygformula <https://github.com/CausalInference/pygformula/tree/main/datasets>`_.
17 | 


--------------------------------------------------------------------------------
/docs/source/Get Started.rst:
--------------------------------------------------------------------------------
  1 | Get Started
  2 | ''''''''''''''''''''
  3 | 
  4 | ===================
  5 | Algorithm outline
  6 | ===================
  7 | 
  8 | 
  9 | The parametric g-formula estimator of the noniterative conditional expectation (NICE) requires
 10 | the specification of models for the joint density of the confounders, treatments, and outcomes over time.
 11 | The algorithm has three steps: (1) Parametric estimation, (2) Monte Carlo simulation,
 12 | and (3) Calculation of risk/mean under each intervention.
 13 | 
 14 | +  **Parametric estimation**: (a) estimate the conditional densities of each covariate given past covariate history
 15 |    by fitting user-specified regression models, (b) estimate the discrete hazard (for survival outcome) or mean
 16 |    (for binary/continuous end of follow-up) of the outcome conditional on past covariate history by fitting a user-specified
 17 |    regression model, (c) if the event of interest is subject to competing events and competing events are not treated as censoring events, estimate the conditional probability of the competing event
 18 |    conditional on past covariate history by fitting a user-specified regression model for the competing event.
 19 | 
 20 | +  **Monte Carlo simulation**: (a) generate a new dataset, which is usually larger than the original dataset; for each covariate,
 21 |    generate simulated values at each time step using the estimated covariate models from step (1), (b) for the
 22 |    covariates that are to undergo intervention, their values are assigned according to the user-specified intervention rule,
 23 |    (c) obtain the discrete hazard / mean of the outcome based on the estimated outcome model from step (1),
 24 |    (d) if the event of interest is subject to competing events and competing events are not treated as censoring events,
 25 |    obtain the discrete hazard of the competing event based on the estimated competing model from step (1).
 26 | 
 27 | +  **Calculation of risk/mean under each intervention**: for binary/continuous end of follow-up, the final estimate is the mean of
 28 |    the estimated outcome of all individuals in the new dataset computed from Step (2). For survival outcome,
 29 |    the final estimate is obtained by calculating the mean of cumulative risks for all individuals using the discrete hazards computed from step (2).
 30 | 
 31 | 
 32 | 
 33 | Arguments:
 34 | 
 35 | .. automodule:: pygformula.parametric_gformula
 36 | .. autosummary:: ParametricGformula
 37 | .. autoclass:: ParametricGformula
 38 | 
 39 | 
 40 | 
 41 | ===================
 42 | Example
 43 | ===================
 44 | The observational dataset
 45 | `example_data_basicdata_nocomp <https://github.com/CausalInference/pygformula/blob/main/datasets/example_data_basicdata_nocomp.csv>`_ consists of 13,170 observations on 2,500 individuals with a maximum of 7 follow-up
 46 | times. The dataset contains the following variables:
 47 | 
 48 |  - id: Unique identifier for each individual.
 49 |  - t0: Time index.
 50 |  - L1: Binary time-varying covariate.
 51 |  - L2: Continuous time-varying covariate.
 52 |  - L3: Categorical baseline covariate.
 53 |  - A: Binary treatment variable.
 54 |  - Y: Outcome of interest; time-varying indicator of failure.
 55 | 
 56 | We are interested in the risk by the end of follow-up under the static interventions ‘‘Never treat’’ (set treatment
 57 | to 0 at all times) and ‘‘Always treat’’ (set treatment to 1 at all times).
 58 | 
 59 | - First, import the g-formula method ParametricGformula:
 60 | 
 61 |   .. code-block::
 62 | 
 63 |       from pygformula import ParametricGformula
 64 | 
 65 | - Then, load the data (here is an example of loading simulated `data <https://github.com/CausalInference/pygformula/blob/main/datasets/example_data_basicdata_nocomp.csv>`_ in the package,
 66 |   users can also load their own data) as the required pandas DataFrame type:
 67 | 
 68 |   .. code::
 69 | 
 70 |       from pygformula.data import load_basicdata_nocomp
 71 |       obs_data = load_basicdata_nocomp()
 72 | 
 73 | - Specify the name of the time variable, and the name of the individual identifier in the input data
 74 | 
 75 |   .. code-block::
 76 | 
 77 |       time_name = 't0'
 78 |       id = 'id'
 79 | 
 80 | - Specify covariate names, covariate types, and corresponding model statements
 81 | 
 82 |   .. code-block::
 83 | 
 84 |       covnames = ['L1', 'L2', 'A']
 85 |       covtypes = ['binary', 'bounded normal', 'binary']
 86 |       covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
 87 |                    'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
 88 |                    'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
 89 | 
 90 |   If there are baseline covariates (i.e., covariate with same value at all times) in the model statement, specify them in the
 91 |   ‘‘basecovs’’ argument:
 92 | 
 93 |   .. code::
 94 | 
 95 |       basecovs = ['L3']
 96 | 
 97 | 
 98 | - Specify the static interventions of interest:
 99 | 
100 |   .. code-block::
101 | 
102 |       from pygformula.interventions import static
103 | 
104 |       time_points = np.max(np.unique(obs_data[time_name])) + 1
105 |       int_descript = ['Never treat', 'Always treat']
106 | 
107 |       Intervention1_A = [static, np.zeros(time_points)]
108 |       Intervention2_A = [static, np.ones(time_points)]
109 | 
110 | - Specify the outcome name, outcome model statement, and the outcome type
111 | 
112 |   .. code-block::
113 | 
114 |       outcome_name = 'Y'
115 |       ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
116 |       outcome_type = 'survival'
117 | 
118 | - Specify all the arguments in the "ParametricGformula" class and call its "fit" function:
119 | 
120 |   .. code-block::
121 | 
122 |       g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
123 |           covnames=covnames, covtypes=covtypes,
124 |           covmodels=covmodels, basecovs=basecovs,
125 |           time_points=time_points,
126 |           Intervention1_A = [static, np.zeros(time_points)],
127 |           Intervention2_A = [static, np.ones(time_points)],
128 |           outcome_name=outcome_name, ymodel=ymodel,
129 |           outcome_type = outcome_type)
130 | 
131 |       g.fit()
132 | 
133 | - Finally, get the output:
134 | 
135 |   .. image:: media/get_started_example.png
136 |      :align: center
137 | 
138 | 
139 |   - "Intervention": the name of natural course intervention and user-specified interventions.
140 |   - "NP-risk": the nonparametric estimates of the natural course risk.
141 |   - "g-formula risk": the parametric g-formula estimates of each intervention.
142 |   - "Risk Ratio (RR)": the risk ratio comparing each intervention and reference intervention.
143 |   - "Risk Difference (RD)": the risk difference comparing each intervention and reference intervention.
144 | 
145 | In the output table, the g-formula risk results under the specified interventions are shown, as well as the natural course.
146 | Furthermore, the nonparametric risk under the natural course is provided, which can be used to assess model misspecification of parametric
147 | g-formula. The risk ratio and risk difference comparing the specific intervention and the reference
148 | intervention (set to natural course by default) are also calculated.
149 | 
150 | Users can also get the standard errors and 95% confidence intervals of the g-formula estimates by specifying the ‘‘nsamples’’ argument.
151 | For example, specifying ‘‘nsamples’’ as 20 with parallel processing using 8 cores:
152 | 
153 |   .. code-block::
154 | 
155 |       g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
156 |           time_points = time_points,
157 |           Intervention1_A = [static, np.zeros(time_points)],
158 |           Intervention2_A = [static, np.ones(time_points)],
159 |           covnames=covnames, covtypes=covtypes,
160 |           covmodels=covmodels, basecovs=basecovs,
161 |           outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type,
162 |           nsamples=20, parallel=True, ncores=8)
163 |       g.fit()
164 | 
165 | The package will return the following results:
166 | 
167 |   .. image:: media/get_started_example_bootstrap.jpg
168 |      :align: center
169 |      :width: 8.5in
170 |      :height: 2in
171 | 
172 | The result table contains the 95% lower and upper bounds for the risk, risk difference and risk ratio for all interventions.
173 | 
174 | The pygformula also provides plots for risk curves of interventions, which can be called by:
175 | 
176 |   .. code::
177 | 
178 |      g.plot_interventions()
179 | 
180 | It will return the g-formula risk (with 95% confidence intervals if using bootstrap samples) at all follow-up times under each intervention:
181 | 
182 |   .. image:: media/get_started_example_intervention_curve.jpg
183 |      :align: center
184 |      :width: 5in
185 |      :height: 4in
186 | 
187 | Users can also get the plots of parametric and nonparametric estimates of
188 | the risks and covariate means under natural course by:
189 | 
190 |   .. code::
191 | 
192 |      g.plot_natural_course()
193 | 
194 | 
195 |   .. image:: media/get_started_example_all.jpg
196 |      :align: center
197 | 
198 | 
199 | 
200 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/get_started_example.py>`_:
201 | 
202 |    .. code-block::
203 | 
204 |     import numpy as np
205 |     from pygformula import ParametricGformula
206 |     from pygformula.interventions import static
207 |     from pygformula.data import load_basicdata_nocomp
208 | 
209 |     obs_data = load_basicdata_nocomp()
210 |     time_name = 't0'
211 |     id = 'id'
212 | 
213 |     covnames = ['L1', 'L2', 'A']
214 |     covtypes = ['binary', 'bounded normal', 'binary']
215 |     covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
216 |                'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
217 |                'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
218 | 
219 |     basecovs = ['L3']
220 | 
221 |     outcome_name = 'Y'
222 |     ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
223 |     outcome_type = 'survival'
224 | 
225 |     time_points = np.max(np.unique(obs_data[time_name])) + 1
226 |     int_descript = ['Never treat', 'Always treat']
227 | 
228 |     g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
229 |                  time_points = time_points, int_descript = int_descript,
230 |                  covnames=covnames, covtypes=covtypes,
231 |                  covmodels=covmodels, basecovs=basecovs,
232 |                  outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type,
233 |                  Intervention1_A = [static, np.zeros(time_points)],
234 |                  Intervention2_A = [static, np.ones(time_points)],
235 |                  nsamples=20, parallel=True, ncores=8)
236 |     g.fit()
237 |     g.plot_natural_course()
238 |     g.plot_interventions()
239 | 
240 | 
241 | 
242 | 
243 | 
244 | 
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 


--------------------------------------------------------------------------------
/docs/source/Installation.rst:
--------------------------------------------------------------------------------
 1 | ''''''''''''''''''''
 2 | Installation
 3 | ''''''''''''''''''''
 4 | 
 5 | Requirements
 6 | ^^^^^^^^^^^^
 7 | 
 8 | The package requires python ≥ 3.8 and these necessary dependencies:
 9 | 
10 |    * cmprsk
11 |    * joblib
12 |    * lifelines
13 |    * matplotlib
14 |    * numpy
15 |    * pandas
16 |    * prettytable
17 |    * pytruncreg
18 |    * scipy
19 |    * seaborn
20 |    * statsmodels
21 |    * tqdm
22 | 
23 | 
24 | All the dependencies needed by the pygformula are listed in the file
25 | `"requirements.txt" <https://github.com/CausalInference/pygformula/blob/main/requirements.txt>`_ , users can
26 | install them by:
27 | 
28 | .. code::
29 | 
30 |     pip install -r requirements.txt
31 | 
32 | 
33 | Install pygformula
34 | ^^^^^^^^^^^^^^^^^^^^^^^^
35 | 
36 | Users can use the following command to install the pygformula package:
37 | 
38 | .. code::
39 | 
40 |   pip install pygformula
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/docs/source/Specifications/Censoring event.rst:
--------------------------------------------------------------------------------
 1 | .. _Censoring event:
 2 | 
 3 | Censoring event
 4 | ===================
 5 | 
 6 | When there are censoring events, the package provides the option to obtain inverse probability weighted (IPW) estimates
 7 | for comparison with the g-formula estimates. The comparison of these two estimates can be useful to assess model misspecification
 8 | of the g-formula [1]_.
 9 | To get the IPW estimate, the name of the censoring variable in the input data should be specified;
10 | users also need to specify a censor model to obtain the weights.
11 | 
12 | Note that the arguments ‘‘censor_name’’ and ‘‘censor_model’’ are only needed when users want to
13 | get the IPW estimate. The package will return the nonparametric observed risk in general cases.
14 | 
15 | 
16 | The arguments for censoring events:
17 | 
18 | .. list-table::
19 |     :header-rows: 1
20 | 
21 |     * - Arguments
22 |       - Description
23 |     * - censor_name
24 |       - (Optional) A string specifying the name of the censoring variable in obs_data. Only applicable when using inverse
25 |         probability weights to estimate the natural course means / risk from the observed data.
26 |     * - censor_model
27 |       - (Optional) A string specifying the model statement for the censoring variable. Only applicable when using inverse
28 |         probability weights to estimate the natural course means / risk from the observed data.
29 |     * - ipw_cutoff_quantile
30 |       - (Optional) Percentile value for truncation of the inverse probability weights.
31 |     * - ipw_cutoff_value
32 |       - (Optional) Absolute value for truncation of the inverse probability weights.
33 | 
34 | Users can also specify a percentile value (in the argument ‘‘ipw_cutoff_quantile’’) or an absolute value
35 | (in the argument ‘‘ipw_cutoff_value’’) to truncate the inverse probability weights.
36 | 
37 | 
38 | **Sample syntax**:
39 | 
40 | .. code-block::
41 | 
42 |        censor_name = 'C'
43 |        censor_model = 'C ~ A + L'
44 | 
45 |        g = ParametricGformula(..., censor_name = censor_name, censor_model = censor_model, ...)
46 | 
47 | .. note::
48 | 
49 |    When there are categorical covariates (which are assigned a 'C' symbol) in the model statement of censoring variable,
50 |    please name the censoring variable any name except 'C' to avoid name confusion.
51 | 
52 | 
53 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_censor.py>`_:
54 | 
55 | .. code-block::
56 | 
57 |         import numpy as np
58 |         from pygformula import ParametricGformula
59 |         from pygformula.interventions import static
60 |         from pygformula.data import load_censor_data
61 | 
62 |         obs_data = load_censor_data()
63 |         time_name = 't0'
64 |         id = 'id'
65 | 
66 |         covnames = ['L', 'A']
67 |         covtypes = ['binary', 'normal']
68 | 
69 |         covmodels = ['L ~ lag1_L + t0',
70 |                      'A ~ lag1_A + L + t0']
71 | 
72 |         outcome_name = 'Y'
73 |         ymodel = 'Y ~ A + L'
74 | 
75 |         censor_name = 'C'
76 |         censor_model = 'C ~ A + L'
77 | 
78 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
79 |         int_descript = ['Never treat', 'Always treat']
80 | 
81 |         g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
82 |             time_points = time_points,
83 |             int_descript=int_descript,
84 |             Intervention1_A = [static, np.zeros(time_points)],
85 |             Intervention2_A = [static, np.ones(time_points)],
86 |             censor_name= censor_name, censor_model=censor_model,
87 |             covnames = covnames, covtypes = covtypes, covmodels = covmodels,
88 |             outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
89 |         g.fit()
90 | 
91 | 
92 | **Output**:
93 | 
94 |     .. image:: ../media/censor_example_output.png
95 |          :align: center
96 | 
97 | .. [1] Yu-Han Chiu, Lan Wen, Sean McGrath, Roger Logan, Issa J Dahabreh, and Miguel A Hernán. 2022. Evaluating model specification when using the parametric g-formula in the presence of censoring. American Journal of Epidemiology.


--------------------------------------------------------------------------------
/docs/source/Specifications/Competing event.rst:
--------------------------------------------------------------------------------
  1 | .. _Competing event:
  2 | 
  3 | Competing event
  4 | ===================
  5 | 
  6 | In the presence of competing events, users may choose whether to treat competing
  7 | events as censoring events. When competing events are treated as censoring events,
  8 | risks under different interventions are calculated under elimination of
  9 | competing events, and are obtained by the Kaplan–Meier estimator.
 10 | When competing events are not treated as censoring events, risks under different interventions are calculated without elimination of
 11 | competing events, and are obtained by using an estimate of the subdistribution cumulative incidence function [1]_ :sup:`,` [2]_.
 12 | 
 13 | The arguments for competing events:
 14 | 
 15 | .. list-table::
 16 |     :header-rows: 1
 17 | 
 18 |     * - Arguments
 19 |       - Description
 20 |     * - compevent_name
 21 |       - (Optional) A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes.
 22 |     * - compevent_model
 23 |       - (Optional) A string specifying the model statement for the competing event variable. Only applicable for survival outcomes.
 24 |     * - compevent_cens
 25 |       - (Optional) A boolean value indicating whether to treat competing events as censoring events. Default is False.
 26 | 
 27 | 
 28 | **Sample syntax**:
 29 | 
 30 | .. code-block::
 31 | 
 32 |         compevent_name = 'D'
 33 |         compevent_model = 'D ~ A + L1 + L2 + L3 + t0'
 34 |         compevent_cens = False
 35 | 
 36 |         g = ParametricGformula(..., compevent_name = compevent_name, compevent_model = compevent_model, compevent_cens = compevent_cens, ...)
 37 | 
 38 | The name of competing event in the input data should be specified in the argument ‘‘compevent_name’’.
 39 | The model statement for the competing event variable should be specified in the argument ‘‘compevent_model’’.
 40 | Users should also specify the argument ‘‘compevent_cens’’ as True or False indicating whether they want to treat the competing
 41 | event as a censoring event (the default is False).
 42 | 
 43 | Setting ‘‘compevent_cens’’ as default (False):
 44 | 
 45 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_competing_event.py>`_:
 46 | 
 47 | .. code-block::
 48 | 
 49 |         from pygformula import ParametricGformula
 50 |         from pygformula.interventions import static
 51 |         from pygformula.data import load_basicdata
 52 | 
 53 |         obs_data = load_basicdata()
 54 | 
 55 |         covnames = ['L1', 'L2', 'A']
 56 |         covtypes = ['binary', 'bounded normal', 'binary']
 57 |         covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
 58 |                      'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2  + L3 + t0',
 59 |                      'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
 60 | 
 61 |         ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2'
 62 | 
 63 |         time_name = 't0'
 64 |         id = 'id'
 65 |         outcome_name = 'Y'
 66 |         basecovs = ['L3']
 67 | 
 68 |         compevent_name = 'D'
 69 |         compevent_model = 'D ~ A + L1 + L2 + L3 + t0'
 70 | 
 71 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
 72 |         int_descript = ['Never treat', 'Always treat']
 73 | 
 74 |         g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points,
 75 |                       time_name=time_name, int_descript = int_descript,
 76 |                       Intervention1_A = [static, np.zeros(time_points)],
 77 |                       Intervention2_A = [static, np.ones(time_points)],
 78 |                       basecovs =basecovs, covnames=covnames,
 79 |                       covtypes=covtypes, covmodels=covmodels,
 80 |                       compevent_name = compevent_name, compevent_model=compevent_model,
 81 |                       outcome_name=outcome_name, outcome_type='survival', ymodel=ymodel)
 82 |         g.fit()
 83 | 
 84 | 
 85 | **Output**:
 86 | 
 87 |     .. image:: ../media/competing_not_cens_output.png
 88 |          :align: center
 89 | 
 90 | Setting ‘‘compevent_cens’’ as True:
 91 | 
 92 | .. code-block::
 93 | 
 94 |         compevent_name = 'D'
 95 |         compevent_model = 'D ~ A + L1 + L2 + L3 + t0'
 96 |         compevent_cens = True
 97 | 
 98 |         g = ParametricGformula(..., compevent_name = compevent_name, compevent_model = compevent_model, compevent_cens = compevent_cens, ...)
 99 | 
100 | **Output**:
101 | 
102 |     .. image:: ../media/competing_as_cens_output.png
103 |          :align: center
104 | 
105 | 
106 | .. [1] Young JG, Stensrud MJ, Tchetgen Tchetgen EJ, Hernán MA. A causal framework for classical statistical estimands
107 |        in failure-time settings with competing events. Statistics in Medicine. 2020;39:1199-236.
108 | .. [2] Fine JP and Gray RJ. A proportional hazards model for the subdistribution of a competing risk. Journal of the American Statistical Association, 94(446):496–509, 1999.
109 | 
110 | 


--------------------------------------------------------------------------------
/docs/source/Specifications/Deterministic knowledge.rst:
--------------------------------------------------------------------------------
  1 | .. _Deterministic knowledge:
  2 | 
  3 | 
  4 | Deterministic knowledge
  5 | ==============================================
  6 | When there is a priori deterministic knowledge, it can be incorporated into the g-formula algorithm to avoid unnecessary
  7 | extrapolation. The package allows users to apply restrictions of the deterministic knowledge on the covariates,
  8 | outcome or competing event.
  9 | 
 10 | 
 11 | Restrictions on covariates
 12 | -------------------------------
 13 | 
 14 | When incorporating the deterministic knowledge of one time-varying covariate Z, the estimation is changed as follows:
 15 | 
 16 | 1. In step 1 of the algorithm, restrict the chosen method of estimating the mean of Z given
 17 | “history” to only records where deterministic knowledge is absent.
 18 | 
 19 | 2. In step 2 of the algorithm, set Z deterministically to its known value for histories under which this
 20 | value is known. Otherwise, draw Z according to the model-based estimate conditional distribution of Z.
 21 | 
 22 | For example, when there are two time-varying covariates, one indicator of whether an individual has started menopause
 23 | by a given interval k (menopause), and another indicator of whether she is pregnant in interval k (pregnancy).
 24 | The deterministic knowledge is that given menopause == 1, the probability that pregnancy == 0 is 1. In the first
 25 | estimation step, only records with menopause == 0 are used for model estimation of pregnancy. Then in the second
 26 | simulation step, if the value of menopause in step 1 at time k is 1 then pregnancy is set to 0. Otherwise, the value
 27 | of pregnancy at time k is drawn from the estimated distribution in step 1.
 28 | 
 29 | The package allows deterministic knowledge incorporation for covariates by the argument ‘‘restrictions’’:
 30 | 
 31 | .. list-table::
 32 |     :header-rows: 1
 33 | 
 34 |     * - Arguments
 35 |       - Description
 36 |     * - restrictions
 37 |       - (Optional) List of lists. For each inner list, its first entry is the name of the covariate for which deterministic knowledge
 38 |         is known; its second entry is a dictionary mapping covariate names to the conditions which should be True when the covariate
 39 |         is modeled; its third entry is the value that is set to the covariate during simulation when the conditions
 40 |         in the second entry are not True.
 41 | 
 42 | Note that each restricted covariate and the covariates used in its conditions must be ordered consistently in ‘‘covnames’’, i.e.,
 43 | the restricted covariate should come after the covariates used in its conditions.
 44 | 
 45 | An example of the restrictions that encodes the relationship between menopause and pregnancy above:
 46 | 
 47 | .. code-block::
 48 | 
 49 |         restrictions = [['pregnancy', {'menopause': lambda x: x == 0}, 0]]
 50 |         g = ParametricGformula(..., restrictions = restrictions, ...)
 51 | 
 52 | **Sample syntax**:
 53 | 
 54 | An example with one deterministic knowledge condition for one covariate 'L2': if L1 equals 0, L2 is estimated
 55 | by its parametric model, otherwise, it is set to a known value 0.5.
 56 | 
 57 | .. code-block::
 58 | 
 59 |       restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5]]
 60 |       g = ParametricGformula(..., restrictions = restrictions, ...)
 61 | 
 62 | An example with multiple deterministic knowledge conditions for one covariate 'A': if L1 equals 0 and L2 is greater than 0.5, A is estimated
 63 | by its parametric model, otherwise, it is set to a known value 1.
 64 | 
 65 | .. code-block::
 66 | 
 67 |       restrictions = [['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]]
 68 |       g = ParametricGformula(..., restrictions = restrictions, ...)
 69 | 
 70 | An example with multiple restrictions, one for covariate L2 and one for covariate A:
 71 | 
 72 | .. code-block::
 73 | 
 74 |       restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5], ['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]]
 75 |       g = ParametricGformula(..., restrictions = restrictions, ...)
 76 | 
 77 | If the assigned value of the covariate is not a static value, but determined by a user-specified function,
 78 | the ‘‘restrictions’’ allows an input as a function type. In this case, the third entry for a restriction is a function
 79 | instead of a value.
 80 | 
 81 | For each custom restriction function, the input should be the parameters (not necessary to use all):
 82 | 
 83 | * new_df: A DataFrame that contains the observed or simulated data at time t.
 84 | * pool: A DataFrame that contains the observed or simulated data up to time t.
 85 | * time_name: A string specifying the name of the time variable in pool.
 86 | * t: An integer specifying the current time index.
 87 | 
 88 | The function output should be a list of values that users wish to assign for the restricted covariate at time t.
 89 | The package will automatically assign these values to the records for which the conditions are not True.
 90 | 
 91 | An example with one deterministic knowledge condition for covariate L2: if L1 equals 0, L2 is estimated
 92 | by its parametric model, otherwise, its previous value is carried forward.
 93 | 
 94 | .. code-block::
 95 | 
 96 |       def carry_forward(new_df, pool, time_name, t):
 97 |           assigned_values = pool.loc[pool[time_name] == t-1, 'L2']
 98 |           return assigned_values
 99 | 
100 |       restrictions = [['L2', {'L1': lambda x: x == 0}, carry_forward]]
101 |       g = ParametricGformula(..., restrictions = restrictions, ...)
102 | 
103 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_restrictions.py>`_:
104 | 
105 | .. code-block::
106 | 
107 |         import numpy as np
108 |         from pygformula.interventions import static
109 |         from pygformula import ParametricGformula
110 |         from pygformula.data import load_basicdata_nocomp
111 | 
112 |         obs_data = load_basicdata_nocomp()
113 | 
114 |         time_name = 't0'
115 |         id = 'id'
116 | 
117 |         covnames = ['L1', 'L2', 'A']
118 |         covtypes = ['binary', 'normal', 'binary']
119 |         covmodels = ['L1 ~ lag1_L1 + lag1_A',
120 |                      'L2 ~ L1 + lag1_L2',
121 |                       'A ~ L1 + L2']
122 | 
123 |         basecovs = ['L3']
124 |         outcome_name = 'Y'
125 |         ymodel = 'Y ~ L1 + L2 + A'
126 | 
127 |         # define interventions
128 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
129 |         int_descript = ['Never treat', 'Always treat']
130 | 
131 |         restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5], ['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]]
132 | 
133 |         g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
134 |             time_points = time_points,
135 |             int_descript = int_descript,
136 |             Intervention1_A = [static, np.zeros(time_points)],
137 |             Intervention2_A = [static, np.ones(time_points)],
138 |             covnames=covnames, covtypes=covtypes,
139 |             covmodels=covmodels, basecovs=basecovs,
140 |             restrictions=restrictions, outcome_name=outcome_name,
141 |             ymodel=ymodel, outcome_type='survival')
142 |         g.fit()
143 | 
144 | 
145 | **Output**:
146 | 
147 |     .. image:: ../media/restriction_example_output.png
148 |          :align: center
149 | 
150 | 
151 | Restrictions on outcome
152 | ---------------------------------
153 | 
154 | When there is deterministic knowledge of the outcome variable Y, the package offers the argument
155 | ‘‘yrestrictions’’ to incorporate the knowledge:
156 | 
157 | .. list-table::
158 |     :header-rows: 1
159 | 
160 |     * - Arguments
161 |       - Description
162 |     * - yrestrictions
163 |       - (Optional) List of lists. For each inner list, its first entry is a dictionary whose key is the conditions which
164 |         should be True when the outcome is modeled, the second entry is the value that is set to the outcome during
165 |         simulation when the conditions in the first entry are not True.
166 | 
167 | 
168 | **Sample syntax**:
169 | 
170 | An example with one deterministic knowledge condition for outcome Y: if L1 equals 0, the probability of outcome Y is estimated
171 | by its parametric model, otherwise, it is set to value 1.
172 | 
173 | .. code-block::
174 | 
175 |       yrestrictions = [[{'L1': lambda x: x == 0}, 1]]
176 |       g = ParametricGformula(..., yrestrictions = yrestrictions, ...)
177 | 
178 | An example with multiple restrictions for outcome Y: if L1 equals 0,
179 | the probability of outcome Y is estimated by its parametric model, otherwise, it is set to a value 0; if L2 is greater than 0.5,
180 | the probability of outcome Y is estimated by its parametric model, otherwise, it is set to a value 0.1.
181 | 
182 | .. code-block::
183 | 
184 |       yrestrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]]
185 |       g = ParametricGformula(..., yrestrictions = yrestrictions, ...)
186 | 
187 | 
188 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_yrestrictions.py>`_:
189 | 
190 | .. code-block::
191 | 
192 |         from pygformula import ParametricGformula
193 |         from pygformula.interventions import static
194 |         from pygformula.data import load_basicdata_nocomp
195 | 
196 |         obs_data = load_basicdata_nocomp()
197 | 
198 |         time_name = 't0'
199 |         id = 'id'
200 | 
201 |         covnames = ['L1', 'L2', 'A']
202 |         covtypes = ['binary', 'normal', 'binary']
203 |         covmodels = ['L1 ~ lag1_L1 + lag1_A',
204 |                      'L2 ~ L1 + lag1_L2',
205 |                       'A ~ L1 + L2']
206 | 
207 |         basecovs = ['L3']
208 |         outcome_name = 'Y'
209 |         ymodel = 'Y ~ L1 + L2 + A'
210 | 
211 |         # define interventions
212 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
213 |         int_descript = ['Never treat', 'Always treat']
214 | 
215 |         yrestrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]]
216 | 
217 |         g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
218 |             time_points = time_points,
219 |             int_descript = int_descript,
220 |             Intervention1_A = [static, np.zeros(time_points)],
221 |             Intervention2_A = [static, np.ones(time_points)],
222 |             covnames=covnames,  covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
223 |             yrestrictions=yrestrictions, outcome_name=outcome_name,
224 |             ymodel=ymodel, outcome_type='survival')
225 |         g.fit()
226 | 
227 | 
228 | **Output**:
229 | 
230 |     .. image:: ../media/yrestriction_example_output.png
231 |          :align: center
232 | 
233 | 
234 | Restrictions on competing event
235 | -----------------------------------
236 | 
237 | When there is a competing event D and there is known deterministic knowledge of the competing event,
238 | the package offers the argument ‘‘compevent_restrictions’’ for incorporation:
239 | 
240 | .. list-table::
241 |     :header-rows: 1
242 | 
243 |     * - Arguments
244 |       - Description
245 |     * - compevent_restrictions
246 |       - (Optional) List of lists. For each inner list, its first entry is a dictionary whose key is the conditions which
247 |         should be True when the competing event is modeled, the second entry is the value that is set to the competing
248 |         event during simulation when the conditions in the first entry are not True. Only applicable for survival outcomes.
249 | 
250 | 
251 | **Sample syntax**:
252 | 
253 | An example with one deterministic knowledge condition for competing event D: if L1 equals 0, the probability of competing
254 | event is estimated by its parametric model, otherwise, it is set to a value 1.
255 | 
256 | .. code-block::
257 | 
258 |       compevent_restrictions = [[{'L1': lambda x: x == 0}, 1]]
259 |       g = ParametricGformula(..., compevent_restrictions = compevent_restrictions, ...)
260 | 
261 | An example with multiple restrictions for competing event D: if L1 equals 0, the probability of competing
262 | event is estimated by its parametric model, otherwise, it is set to a value 0; if L2 is greater than 0.5,
263 | the probability of competing event is estimated by its parametric model, otherwise,
264 | it is set to a value 0.1.
265 | 
266 | .. code-block::
267 | 
268 |       compevent_restrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]]
269 |       g = ParametricGformula(..., compevent_restrictions = compevent_restrictions, ...)
270 | 
271 | 
272 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_comp_restrictions.py>`_:
273 | 
274 | .. code-block::
275 | 
276 |         from pygformula import ParametricGformula
277 |         from pygformula.interventions import static
278 |         from pygformula.data import load_basicdata
279 | 
280 |         obs_data = load_basicdata()
281 | 
282 |         covnames = ['L1', 'L2', 'A']
283 |         covtypes = ['binary', 'bounded normal', 'binary']
284 |         covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
285 |                      'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2  + L3 + t0',
286 |                      'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
287 | 
288 |         ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2'
289 | 
290 |         time_name = 't0'
291 |         id = 'id'
292 |         outcome_name = 'Y'
293 |         basecovs = ['L3']
294 | 
295 |         compevent_name = 'D'
296 |         compevent_model = 'D ~ A + L1 + L2 + L3 + t0'
297 |         compevent_cens = False
298 | 
299 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
300 |         int_descript = ['Never treat', 'Always treat']
301 | 
302 |         compevent_restrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]]
303 | 
304 |         g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points,
305 |             time_name=time_name, int_descript = int_descript,
306 |             Intervention1_A = [static, np.zeros(time_points)],
307 |             Intervention2_A = [static, np.ones(time_points)],
308 |             basecovs =basecovs, covnames=covnames,
309 |             covtypes=covtypes, covmodels=covmodels,
310 |             compevent_restrictions = compevent_restrictions,
311 |             compevent_cens= compevent_cens, compevent_name = compevent_name,
312 |             compevent_model=compevent_model, outcome_name=outcome_name,
313 |             outcome_type='survival', ymodel=ymodel)
314 |         g.fit()
315 | 
316 | 
317 | **Output**:
318 | 
319 |     .. image:: ../media/comp_restriction_example_output.png
320 |          :align: center


--------------------------------------------------------------------------------
/docs/source/Specifications/Hazard ratio.rst:
--------------------------------------------------------------------------------
  1 | .. _Hazard ratio:
  2 | 
  3 | Hazard ratio
  4 | ============================
  5 | For survival outcomes, the pygformula provides the option of calculating the hazard ratio comparing any
  6 | two interventions of interest. In the presence of competing events, it will return the subdistribution hazard ratio
  7 | [1]_. Note that there is an order requirement for the input data structure that it should have the competing event before the outcome event.
  8 | 
  9 | *Prerequisite*: If users want to calculate the hazard ratio with competing events, they need to install the additional “rpy2” package
 10 | and the Python `"cmprsk" <https://pypi.org/project/cmprsk>`_ package. Please follow the steps below to install:
 11 | 
 12 |  - Install R to set up R environment
 13 | 
 14 |  - Install cmprsk R package in R environment:
 15 | 
 16 |    .. code::
 17 | 
 18 |       install.packages("cmprsk")
 19 | 
 20 |  - Install rpy2 package in python environment:
 21 | 
 22 |    .. code::
 23 | 
 24 |       pip install rpy2
 25 | 
 26 |  - Install cmprsk package in python environment:
 27 | 
 28 |    .. code::
 29 | 
 30 |       pip install cmprsk
 31 | 
 32 | .. note::
 33 | 
 34 |    If you encounter the problem of not finding the R environment, you can set up the R path
 35 |    in your environment using the following command in the code:
 36 | 
 37 |     .. code-block::
 38 | 
 39 |        import os
 40 |        os.environ["R_HOME"] = 'R_HOME'
 41 | 
 42 |     where R_HOME is the R home directory path.
 43 | 
 44 | The argument for calculating the hazard ratio:
 45 | 
 46 | .. list-table::
 47 |     :header-rows: 1
 48 | 
 49 |     * - Arguments
 50 |       - Description
 51 |     * - intcomp
 52 |       - (Optional) List of two numbers indicating a pair of interventions to be compared by a hazard ratio.
 53 | 
 54 | Users can specify the two interventions by:
 55 | 
 56 | .. code::
 57 | 
 58 |        intcomp = [1, 2]
 59 | 
 60 | The integer i in ‘‘intcomp’’ denotes the i-th intervention in the user-specified interventions. 0 denotes the natural course intervention.
 61 | 
 62 | 
 63 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_bounded_normal_cov.py>`_:
 64 | 
 65 | 
 66 | .. code-block::
 67 | 
 68 |     from pygformula import ParametricGformula
 69 |     from pygformula.interventions import static
 70 |     from pygformula.data import load_basicdata_nocomp
 71 | 
 72 |     obs_data = load_basicdata_nocomp()
 73 |     time_name = 't0'
 74 |     id = 'id'
 75 | 
 76 |     covnames = ['L2', 'A']
 77 |     covtypes = ['bounded normal', 'binary']
 78 |     covmodels = ['L2 ~ lag1_A + lag_cumavg1_L2 + L3 + t0',
 79 |                'A ~ lag1_A + L2 + lag_cumavg1_L2 + L3 + t0']
 80 | 
 81 |     basecovs = ['L3']
 82 | 
 83 |     outcome_name = 'Y'
 84 |     ymodel = 'Y ~ L2 + A + lag1_A + L3 + t0'
 85 |     outcome_type = 'survival'
 86 | 
 87 |     time_points = np.max(np.unique(obs_data[time_name])) + 1
 88 |     int_descript = ['Never treat', 'Always treat']
 89 | 
 90 |     g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
 91 |         time_points = time_points,
 92 |         int_descript = int_descript, intcomp=[1, 2],
 93 |         Intervention1_A = [static, np.zeros(time_points)],
 94 |         Intervention2_A = [static, np.ones(time_points)],
 95 |         covnames=covnames, covtypes=covtypes,
 96 |         covmodels=covmodels, basecovs=basecovs,
 97 |         outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type)
 98 |     g.fit()
 99 | 
100 | 
101 | **Output**:
102 | 
103 |     .. image:: ../media/test_hazard_ratio.png
104 |          :align: center
105 | 
106 | 
107 | .. [1] Fine JP and Gray RJ. A proportional hazards model for the subdistribution of a competing risk. Journal of the American Statistical Association, 94(446):496–509, 1999.
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/docs/source/Specifications/Input data.rst:
--------------------------------------------------------------------------------
  1 | .. _Input data:
  2 | 
  3 | Input data
  4 | ============================
  5 | 
  6 | The input dataset is specified by the ‘‘obs_data’’ argument which should contain: ‘‘id’’ specifying
  7 | the individual identifier, ‘‘time_name’’ specifying the time index, ‘‘covnames’’ specifying the names of
  8 | time-varying covariates, ‘‘outcome_name’’ specifying the name of the outcome of interest, ‘‘compevent_name’’
  9 | indicating the competing event status (if present), ‘‘censor_name’’ indicating the censoring event status (if present).
 10 | 
 11 | 
 12 | **The related arguments**:
 13 | 
 14 | .. list-table::
 15 |     :header-rows: 1
 16 | 
 17 |     * - Arguments
 18 |       - Description
 19 |     * - obs_data
 20 |       - (Required) A data frame containing the observed data.
 21 |     * - id
 22 |       - (Required) A string specifying the name of the id variable in obs_data.
 23 |     * - time_name
 24 |       - (Required) A string specifying the name of the time variable in obs_data.
 25 |     * - outcome_name
 26 |       - (Required) A string specifying the name of the outcome variable in obs_data.
 27 |     * - covnames
 28 |       - (Required) A list of strings specifying the names of the time-varying covariates in obs_data.
 29 |     * - compevent_name
 30 |       - (Optional) A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes.
 31 |     * - censor_name
 32 |       - (Optional) A string specifying the name of the censoring variable in obs_data. Only applicable when using inverse
 33 |         probability weights to estimate the natural course means / risk from the observed data.
 34 |     * - time_points
 35 |       - (Optional) An integer indicating the number of time points to simulate. It is set equal to the maximum number of records (K)
 36 |         that obs_data contains for any individual plus 1, if not specified by users.
 37 | 
 38 | 
 39 | The input data should contain one record for each follow-up time k for each subject (identified by the individual identifier).
 40 | The time index k for each subject should increment by 1 for each subsequent interval (the starting index is 0 in the following
 41 | examples, pre-baseline times are also allowed).
 42 | The record at each line in the data corresponds to an interval k, which contains the
 43 | covariate measurements at interval k and the outcome measurement at interval k+1.
 44 | 
 45 | 
 46 | Here is an example of input data structure for one subject which contains 7 records on
 47 | the measurements of three time-varying covariates ‘‘L1’’, ‘‘L2’’, ‘‘A’’,
 48 | one baseline covariate ‘‘L3’’ and the outcome ‘‘Y’’. See `"example_data_basicdata_nocomp" <https://github.com/CausalInference/pygformula/blob/main/datasets/example_data_basicdata_nocomp.csv>`_ for complete example data.
 49 | 
 50 |     .. image:: ../media/data_example.png
 51 |          :align: center
 52 |          :width: 5.2in
 53 |          :height: 1.8in
 54 | 
 55 | **Censoring events.** When there are censoring events, and users want to compute the natural course estimate via
 56 | inverse probability weighting, there should be a variable in the input data set that is an
 57 | indicator of censoring in the time between covariate measurements in interval k and interval k+1.
 58 | 1 indicates the subject is censored (C_k+1 = 1) and 0 indicates the subject is not censored (C_k+1 = 0).
 59 | Subjects have no more records after they are censored. Note that the censoring indicator is not needed
 60 | if users don't want to compute the natural course estimate using IPW.
 61 | 
 62 | For survival outcome, the outcome Y_k+1 on the line where individual is censored (C_k+1 = 1) can be coded NA or 0.
 63 | This choice will make no difference to estimates in the algorithm when intervals are made small enough
 64 | such that there are no failures in intervals where there are censoring events. It depends on
 65 | whether to count such subjects in the time k risk set or not [1]_ :sup:`,` [2]_. For fixed binary/continuous end of follow-up, the
 66 | outcome Y_k+1 should be coded NA.
 67 | 
 68 | Here is an example of input data structure with a censoring event (identified by ‘‘C’’). The subject contains 8 records on the measurements of
 69 | two time-varying covariates ‘‘L’’, ‘‘A’’, the outcome ‘‘Y’’ and is censored at time index k+1=8. See `"example_data_censor" <https://github.com/CausalInference/pygformula/blob/main/datasets/example_data_censor.csv>`_ for complete example data.
 70 | 
 71 |     .. image:: ../media/data_example_censor.png
 72 |          :align: center
 73 |          :width: 4.5in
 74 |          :height: 2in
 75 | 
 76 | **Competing events.** When there are competing events in the data, if the user chooses to treat competing
 77 | events as censoring events, the data should be structured as censoring case above. If competing events
 78 | are not treated as censoring events, there should be a variable in the input data set that is an
 79 | indicator of competing event between interval k and k+1 covariate measurements, where
 80 | 1 indicates there is a competing event for the subject (D_k+1 = 1) and 0 indicates no competing event (D_k+1 = 0).
 81 | If D_k+1 = 1 on a record line k for a given subject, that subject will only have k+1 lines
 82 | in the follow-up data with follow-up time k on the last line, and on that line, Y_k+1 should be coded NA.
 83 | Note that the competing case is only applicable for survival outcome.
 84 | 
 85 | Here is an example of input data structure with a competing event (identified by ‘‘D’’). The subject contains 7 records on
 86 | three time-varying covariates ‘‘L1’’, ‘‘L2’’, ‘‘A’’, one baseline covariate ‘‘L3’’ and the outcome ‘‘Y’’.
 87 | The subject experiences a competing event after measurement of interval k=6 covariates. See `"example_data_basicdata" <https://github.com/CausalInference/pygformula/blob/main/datasets/example_data_basicdata.csv>`_ for complete example data.
 88 | 
 89 |     .. image:: ../media/data_example_competing.png
 90 |          :align: center
 91 |          :width: 6in
 92 |          :height: 1.8in
 93 | 
 94 | 
 95 | +  Note that the ‘‘time_points’’ argument specifies the desired end of follow-up (a
 96 |    follow-up interval k that is no more than the maximum number of records for an individual in the dataset),
 97 |    and is only applicable for survival outcome.
 98 | 
 99 | 
100 | .. [1] McGrath S, Lin V, Zhang Z, Petito LC, Logan RW, Hernán MA, Young JG. gfoRmula: An R Package for Estimating the Effects of Sustained Treatment Strategies via the Parametric g-formula. Patterns (N Y). 2020;1(3):100008. `gfoRmula <https://github.com/CausalInference/gfoRmula>`_.
101 | 
102 | .. [2] Roger W. Logan, Jessica G. Young, Sarah Taubman, Yu-Han Chiu, Sara Lodi, Sally Picciotto, Goodarz Danaei, Miguel A. Hernán. `GFORMULA SAS <https://github.com/CausalInference/GFORMULA-SAS>`_.
103 | 
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/docs/source/Specifications/Outcome model.rst:
--------------------------------------------------------------------------------
  1 | .. _Outcome model:
  2 | 
  3 | 
  4 | Outcome model
  5 | ===================
  6 | 
  7 | The package supports g-formula analysis on three types of outcomes: survival outcomes, fixed binary
  8 | end of follow-up outcomes and continuous end of follow-up outcomes.
  9 | 
 10 | For all types of outcomes, users should specify the name of outcome in the argument ‘‘outcome_name’’, and the model
 11 | statement for the outcome variable in the argument ‘‘ymodel’’. If users are interested in the probability of experiencing an event by
 12 | a specified follow-up time k under different interventions, they need to specify the type of outcome as
 13 | 'survival' in the argument ‘‘outcome_type’’. If users are interested in the outcome mean at a fixed time point,
 14 | and the outcome distribution is binary, they need to specify the type of outcome as
 15 | 'binary_eof'. Similarly, they need to specify the type of outcome as 'continuous_eof' when the distribution of the outcome is continuous.
 16 | 
 17 | The package uses a generalized linear model (GLM) to estimate the outcome model by default. If users want to use a custom
 18 | model for estimation, they can use the arguments ‘‘ymodel_fit_custom’’ and ‘‘ymodel_predict_custom’’ for specification.
 19 | 
 20 | 
 21 | .. list-table::
 22 |     :header-rows: 1
 23 | 
 24 |     * - Arguments
 25 |       - Description
 26 |     * - outcome_name
 27 |       - (Required) A string specifying the name of the outcome variable in obs_data.
 28 |     * - ymodel
 29 |       - (Required) A string specifying the model statement for the outcome variable.
 30 |     * - outcome_type
 31 |       - (Required) A string specifying the "type" of outcome. The possible "types" are: "survival", "continuous_eof", and "binary_eof".
 32 |     * - ymodel_fit_custom
 33 |       - (Optional) A user-specified fit function for the outcome variable.
 34 |     * - ymodel_predict_custom
 35 |       - (Optional) A user-specified predict function for the outcome variable.
 36 | 
 37 | 
 38 | Survival outcome
 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 40 | 
 41 | For survival outcomes, the package will output estimates of contrasts in failure risks by a specified follow-up time k
 42 | under different user-specified interventions.
 43 | 
 44 | 
 45 | **Sample syntax**:
 46 | 
 47 | .. code-block::
 48 | 
 49 |        outcome_name = 'Y'
 50 |        ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
 51 |        outcome_type = 'survival'
 52 |        time_points = 5
 53 | 
 54 |        g = ParametricGformula(..., outcome_name = outcome_name, outcome_type = outcome_type, ymodel = ymodel, time_points = time_points, ...)
 55 | 
 56 | Users can also specify the follow-up time of interest for survival outcome by the argument ‘‘time_points’’.
 57 | 
 58 | 
 59 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/get_started_example.py>`_:
 60 | 
 61 | .. code-block::
 62 | 
 63 |         import numpy as np
 64 |         from pygformula import ParametricGformula
 65 |         from pygformula.interventions import static
 66 |         from pygformula.data import load_basicdata_nocomp
 67 | 
 68 |         obs_data = load_basicdata_nocomp()
 69 |         time_name = 't0'
 70 |         id = 'id'
 71 | 
 72 |         covnames = ['L1', 'L2', 'A']
 73 |         covtypes = ['binary', 'bounded normal', 'binary']
 74 |         covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
 75 |                    'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
 76 |                    'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
 77 | 
 78 |         basecovs = ['L3']
 79 | 
 80 |         outcome_name = 'Y'
 81 |         ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
 82 |         outcome_type = 'survival'
 83 | 
 84 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
 85 |         int_descript = ['Never treat', 'Always treat']
 86 | 
 87 |         g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
 88 |             time_points = time_points, int_descript = int_descript,
 89 |             covnames=covnames, covtypes=covtypes,
 90 |             covmodels=covmodels, basecovs=basecovs,
 91 |             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type,
 92 |             Intervention1_A = [static, np.zeros(time_points)],
 93 |             Intervention2_A = [static, np.ones(time_points)])
 94 |         g.fit()
 95 | 
 96 | 
 97 | **Output**:
 98 | 
 99 |     .. image:: ../media/get_started_example.png
100 |          :align: center
101 | 
102 | 
103 | Binary end of follow-up outcome
104 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
105 | 
106 | For binary end of follow-up outcomes, the package will output estimates of contrasts in the outcome probability
107 | under different user-specified treatment strategies.
108 | 
109 | **Sample syntax**:
110 | 
111 | .. code-block::
112 | 
113 |        outcome_name = 'Y'
114 |        ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0'
115 |        outcome_type = 'binary_eof'
116 | 
117 |        g = ParametricGformula(..., outcome_name = outcome_name, outcome_type = outcome_type, ymodel = ymodel, ...)
118 | 
119 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_binary_eof.py>`_:
120 | 
121 | .. code-block::
122 | 
123 |         import numpy as np
124 |         from pygformula import ParametricGformula
125 |         from pygformula.interventions import threshold
126 |         from pygformula.data import load_binary_eof
127 | 
128 |         obs_data = load_binary_eof()
129 |         time_name = 't0'
130 |         id = 'id'
131 | 
132 |         covnames = ['L1', 'L2', 'A']
133 |         covtypes = ['binary', 'zero-inflated normal', 'normal']
134 |         covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + L3 + t0',
135 |                      'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
136 |                      'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
137 | 
138 |         basecovs = ['L3']
139 | 
140 |         outcome_name = 'Y'
141 |         ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0'
142 |         outcome_type = 'binary_eof'
143 | 
144 |         int_descript = ['Threshold intervention']
145 | 
146 |         g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
147 |             int_descript = int_descript,
148 |             Intervention1_A = [threshold, [0.5, float('inf')]],
149 |             covnames=covnames, covtypes=covtypes,
150 |             covmodels=covmodels, basecovs=basecovs,
151 |             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type)
152 |         g.fit()
153 | 
154 | **Output**:
155 | 
156 |     .. image:: ../media/binary_eof_example_output.png
157 |          :align: center
158 | 
159 | 
160 | Continuous end of follow-up outcome
161 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
162 | 
163 | For continuous end of follow-up outcomes, the package will output estimates of contrasts in the outcome mean
164 | under different user-specified treatment strategies.
165 | 
166 | **Sample syntax**:
167 | 
168 | .. code-block::
169 | 
170 |         outcome_name = 'Y'
171 |         ymodel = 'Y ~ C(L1) + L2 + A'
172 |         outcome_type = 'continuous_eof'
173 | 
174 |         g = ParametricGformula(..., outcome_name = outcome_name, outcome_type = outcome_type, ymodel = ymodel, ...)
175 | 
176 | 
177 | 
178 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_continuous_eof.py>`_:
179 | 
180 | .. code-block::
181 | 
182 |         import numpy as np
183 |         from pygformula import ParametricGformula
184 |         from pygformula.interventions import static
185 |         from pygformula.data import load_continuous_eof
186 | 
187 |         obs_data = load_continuous_eof()
188 |         time_name = 't0'
189 |         id = 'id'
190 | 
191 |         covnames = ['L1', 'L2', 'A']
192 |         covtypes = ['categorical', 'normal', 'binary']
193 |         covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0',
194 |                      'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0',
195 |                       'A ~ C(L1) + L2 + t0']
196 | 
197 |         basecovs = ['L3']
198 | 
199 |         outcome_name = 'Y'
200 |         ymodel = 'Y ~ C(L1) + L2 + A'
201 |         outcome_type = 'continuous_eof'
202 | 
203 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
204 |         int_descript = ['Never treat', 'Always treat']
205 | 
206 |         g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
207 |             int_descript=int_descript,
208 |             Intervention1_A = [static, np.zeros(time_points)],
209 |             Intervention2_A = [static, np.ones(time_points)],
210 |             covnames=covnames, covtypes=covtypes,
211 |             covmodels=covmodels, basecovs=basecovs,
212 |             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type)
213 |         g.fit()
214 | 
215 | 
216 | 
217 | **Output**:
218 | 
219 |     .. image:: ../media/continuous_eof_example_output.png
220 |          :align: center
221 | 
222 | 
223 | Custom model
224 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
225 | 
226 | 
227 | The custom fit function needs to contain the input parameters:
228 | 
229 | * ymodel: model statement of the outcome
230 | * fit_data: data used to fit the outcome model
231 | 
232 | and return a fitted model which is used to make prediction in the simulation step.
233 | 
234 | 
235 | An example using random forest to fit an outcome model:
236 | 
237 | .. code-block::
238 | 
239 |       def ymodel_fit_custom(ymodel, fit_data):
240 |           y_name, x_name = re.split('~', ymodel.replace(' ', ''))
241 |           x_name = re.split('\+', x_name.replace(' ', ''))
242 |           # get feature and target data to fit ymodel
243 |           y = fit_data[y_name].to_numpy()
244 |           X = fit_data[x_name].to_numpy()
245 |           fit_rf = RandomForestRegressor()
246 |           fit_rf.fit(X, y)
247 |           return fit_rf
248 | 
249 | 
250 | The custom predict function needs to contain the input parameters:
251 | 
252 | * ymodel: model statement of the outcome
253 | * new_df: simulated data at time t.
254 | * fit: fitted model of the custom function
255 | 
256 | and return a list of predicted values at time t. For survival and binary end-of-follow-up outcomes, the predict
257 | function should return the estimated probability. For continuous end-of-follow-up outcomes, it should return the
258 | estimated mean.
259 | 
260 | 
261 | An example of a custom predict function using the random forest model:
262 | 
263 | .. code-block::
264 | 
265 |       def ymodel_predict_custom(ymodel, new_df, fit):
266 |           y_name, x_name = re.split('~', ymodel.replace(' ', ''))
267 |           x_name = re.split('\+', x_name.replace(' ', ''))
268 |           # get feature data to predict
269 |           X = new_df[x_name].to_numpy()
270 |           prediction = fit.predict(X)
271 |           return prediction
272 | 
273 | 
274 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_custom_ymodel.py>`_:
275 | 
276 | .. code-block::
277 | 
278 |         import numpy as np
279 |         import pygformula
280 |         from pygformula import ParametricGformula
281 |         from pygformula.interventions import static
282 |         from pygformula.data import load_continuous_eof
283 | 
284 |         obs_data = load_continuous_eof()
285 | 
286 |         time_name = 't0'
287 |         id = 'id'
288 | 
289 |         covnames = ['L1', 'L2', 'A']
290 |         covtypes = ['categorical', 'normal', 'binary']
291 |         covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0',
292 |                      'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0',
293 |                       'A ~ C(L1) + L2 + t0']
294 | 
295 |         basecovs = ['L3']
296 | 
297 |         outcome_name = 'Y'
298 | 
299 |         ymodel = 'Y ~ lag1_L2 + L2 + lag1_A + A'
300 | 
301 |         # define interventions
302 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
303 |         int_descript = ['Never treat', 'Always treat']
304 | 
305 | 
306 |         def ymodel_fit_custom(ymodel, fit_data):
307 |             y_name, x_name = re.split('~', ymodel.replace(' ', ''))
308 |             x_name = re.split('\+', x_name.replace(' ', ''))
309 |             # get feature and target data to fit ymodel
310 |             y = fit_data[y_name].to_numpy()
311 |             X = fit_data[x_name].to_numpy()
312 |             fit_rf = RandomForestRegressor()
313 |             fit_rf.fit(X, y)
314 |             return fit_rf
315 | 
316 |         def ymodel_predict_custom(ymodel, new_df, fit):
317 |             y_name, x_name = re.split('~', ymodel.replace(' ', ''))
318 |             x_name = re.split('\+', x_name.replace(' ', ''))
319 |             # get feature data to predict
320 |             X = new_df[x_name].to_numpy()
321 |             prediction = fit.predict(X)
322 |             return prediction
323 | 
324 | 
325 |         g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
326 |                      int_descript = int_descript,
327 |                      Intervention1_A = [static, np.zeros(time_points)], basecovs=['L3'],
328 |                      Intervention2_A = [static, np.ones(time_points)],
329 |                      covnames=covnames,  covtypes=covtypes, covmodels=covmodels,
330 |                      ymodel_fit_custom = ymodel_fit_custom, ymodel_predict_custom=ymodel_predict_custom,
331 |                      outcome_name=outcome_name, ymodel=ymodel, outcome_type='continuous_eof')
332 |         g.fit()
333 | 
334 | 
335 | 
336 | .. note::
337 | 
338 |    Note that when there are categorical covariates in the model statement, adding the ‘‘C( )’’ only applies to the
339 |    default model fitting function. If users want to include it in a custom model fitting function, they need to
340 |    process the categorical data in addition.


--------------------------------------------------------------------------------
/docs/source/Specifications/Output.rst:
--------------------------------------------------------------------------------
  1 | .. _Output:
  2 | 
  3 | 
  4 | Output
  5 | =================
  6 | 
  7 | 
  8 | Numerical results
  9 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
 10 | 
 11 | The package provides the following outputs:
 12 | 
 13 | + **Data table of g-formula estimates**: The result table of g-formula estimates is returned by the fit function, containing (1) the nonparametric estimates
 14 |   of the natural course risk/mean outcome, (2) the parametric g-formula estimates of the risk/mean outcome under each user-specified intervention,
 15 |   (3) the risk ratio between each intervention and the reference intervention (natural course by default, can be specified in the argument ‘‘ref_int’’),
 16 |   (4) the risk difference between each intervention and the reference intervention.
 17 | 
 18 | 
 19 | + **Simulated data table for interventions**: The package gives the simulated data table in the simulation step under
 20 |   each specified intervention, which can be obtained by:
 21 | 
 22 |      .. code::
 23 | 
 24 |         sim_data = g.summary_dict['sim_data']
 25 | 
 26 |   To get the simulated data under a particular intervention:
 27 | 
 28 |      .. code::
 29 | 
 30 |         sim_data = g.summary_dict['sim_data'][intervention_name]
 31 | 
 32 | 
 33 | + **The IP weights**: To get the inverse probability weights when there is censoring event:
 34 | 
 35 |      .. code::
 36 | 
 37 |         ip_weights = g.summary_dict['IP_weights']
 38 | 
 39 | 
 40 | + **The model summary**: The package gives the model summary for each covariate, outcome,
 41 |   competing event (if applicable), censoring event (if applicable).
 42 |   First the argument ‘‘model_fits’’ should be set to True, then the model summary can be obtained by:
 43 | 
 44 |      .. code::
 45 | 
 46 |         fitted_models = g.summary_dict['model_fits_summary']
 47 | 
 48 |   To get the fitted model for a particular variable:
 49 | 
 50 |      .. code::
 51 | 
 52 |         fitted_model = g.summary_dict['model_fits_summary'][variable_name]
 53 | 
 54 | + **The coefficients**: The package gives the parameter estimates of all the models, which can be obtained by:
 55 | 
 56 |      .. code::
 57 | 
 58 |         model_coeffs = g.summary_dict['model_coeffs']
 59 | 
 60 |   To get the coefficients of the model for a particular variable, please use:
 61 | 
 62 |      .. code::
 63 | 
 64 |         model_coeffs = g.summary_dict['model_coeffs'][variable_name]
 65 | 
 66 | 
 67 | + **The standard errors**: The package gives the standard errors of the parameter estimates of all the models, which can be obtained by:
 68 | 
 69 |      .. code::
 70 | 
 71 |         model_stderrs = g.summary_dict['model_stderrs']
 72 | 
 73 |   To get the standard errors of the model for a particular variable, please use:
 74 | 
 75 |      .. code::
 76 | 
 77 |         model_stderrs = g.summary_dict['model_stderrs'][variable_name]
 78 | 
 79 | + **The variance-covariance matrices**: The package gives the variance-covariance matrices of the parameter estimates of all the models,
 80 |   which can be obtained by:
 81 | 
 82 |      .. code::
 83 | 
 84 |         model_vcovs = g.summary_dict['model_vcovs']
 85 | 
 86 |   To get the variance-covariance matrix of the parameter estimates of the model for a particular variable, please use:
 87 | 
 88 |      .. code::
 89 | 
 90 |         model_vcovs = g.summary_dict['model_vcovs'][variable_name]
 91 | 
 92 | 
 93 | + **The root mean square error**: The package gives the RMSE values of the models, which can be obtained by:
 94 | 
 95 |      .. code::
 96 | 
 97 |         rmses = g.summary_dict['rmses']
 98 | 
 99 |   To get the RMSE of the model for a particular variable, please use:
100 | 
101 |      .. code::
102 | 
103 |         rmses = g.summary_dict['rmses'][variable_name]
104 | 
105 | + **Nonparametric estimates at each time point**: The package gives the nonparametric estimates of all covariates and risk at each time point for survival outcomes, which can be obtained by:
106 | 
107 |      .. code::
108 | 
109 |         obs_estimates = g.summary_dict['obs_plot']
110 | 
111 |   To get the nonparametric estimates of a particular variable, e.g., risk, please use:
112 | 
113 |      .. code::
114 | 
115 |         obs_estimates = g.summary_dict['obs_plot']['risk']
116 | 
117 | + **Parametric estimates at each time point**: The package gives the parametric estimates of all covariates and risk at each time point for survival outcomes, which can be obtained by:
118 | 
119 |      .. code::
120 | 
121 |         est_estimates = g.summary_dict['est_plot']
122 | 
123 |   To get the parametric estimates of a particular variable, e.g., risk, please use:
124 | 
125 |      .. code::
126 | 
127 |         est_estimates = g.summary_dict['est_plot']['risk']
128 | 
129 | 
130 | + **Hazard ratio**: The package gives hazard ratio value for the two interventions specified, which can be obtained by:
131 | 
132 |      .. code::
133 | 
134 |         hazard_ratio = g.summary_dict['hazard_ratio']
135 | 
136 | The package also implements nonparametric bootstrapping to obtain 95% confidence intervals for risk/mean estimates
137 | by repeating the algorithm for many bootstrap samples. Users can choose the argument ‘‘nsamples’’ to specify the number of newly generated bootstrap samples.
138 | Users may choose the argument ‘‘parallel’’ to parallelize bootstrapping and simulation steps under each intervention to
139 | make the algorithm run faster. The argument ‘‘ncores’’ can be used to specify the desired number of CPU cores
140 | in parallelization.
141 | 
142 | The package provides two ways of calculating the confidence intervals, selected via the argument ‘‘ci_method’’:
143 | "percentile" means using the percentile bootstrap method, which takes the 2.5th and 97.5th percentiles of the bootstrap estimates to get the 95% confidence interval;
144 | "normal" means using the normal bootstrap method, which uses the original estimate and
145 | the standard deviation of the bootstrap estimates to get the normal approximation 95% confidence interval.
146 | 
147 | + **The g-formula estimates of bootstrap samples**: The package gives the parametric g-formula estimates of all
148 |   bootstrap samples, which can be obtained by:
149 | 
150 |      .. code-block::
151 | 
152 |         g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', ...)
153 |         g.fit()
154 |         bootests = g.summary_dict['bootests']
155 | 
156 |   To get the parametric g-formula estimates of a particular bootstrap sample, please use:
157 | 
158 |      .. code::
159 | 
160 |         g.summary_dict['bootests']['sample_{id}_estimates']
161 | 
162 |   where id is the sample id which should be an integer between 0 and ‘‘nsamples’’ - 1.
163 | 
164 | 
165 | + **The coefficients of bootstrap samples**: The package gives the parameter estimates of all the models for all generated
166 |   bootstrap samples, which can be obtained by:
167 | 
168 |      .. code-block::
169 | 
170 |         g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', boot_diag=True, ...)
171 |         g.fit()
172 |         bootcoeffs = g.summary_dict['bootcoeffs']
173 | 
174 |   Note that the argument ‘‘boot_diag’’ should be set to True if users want to obtain the coefficients, standard errors or variance-covariance matrices
175 |   of bootstrap samples.
176 | 
177 |   To get the coefficients of a particular bootstrap sample, please use:
178 | 
179 |      .. code::
180 | 
181 |         g.summary_dict['bootcoeffs']['sample_{id}_coeffs']
182 | 
183 | + **The standard errors of bootstrap samples**: The package gives the standard errors of the parameter estimates of all the models for all generated
184 |   bootstrap samples, which can be obtained by:
185 | 
186 |      .. code-block::
187 | 
188 |         g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', boot_diag=True, ...)
189 |         g.fit()
190 |         bootstderrs = g.summary_dict['bootstderrs']
191 | 
192 |   To get the standard errors of a particular bootstrap sample, please use:
193 | 
194 |      .. code::
195 | 
196 |         g.summary_dict['bootstderrs']['sample_{id}_stderrs']
197 | 
198 | 
199 | + **The variance-covariance matrices of bootstrap samples**: The package gives the variance-covariance matrices of the parameter estimates of all the models for all generated
200 |   bootstrap samples, which can be obtained by:
201 | 
202 |      .. code-block::
203 | 
204 |         g = ParametricGformula(..., nsamples = 20, parallel=True, ncores=10, ci_method = 'percentile', boot_diag=True, ...)
205 |         g.fit()
206 |         bootvcovs = g.summary_dict['bootvcovs']
207 | 
208 |   To get the variance-covariance matrices of a particular bootstrap sample, please use:
209 | 
210 |      .. code::
211 | 
212 |         g.summary_dict['bootvcovs']['sample_{id}_vcovs']
213 | 
214 | 
215 | Note that to get bootstrap results of coefficients, standard errors, and variance-covariance matrices, the argument
216 | ‘‘boot_diag’’ must be set to True.
217 | 
218 | All the output results above can be saved by the argument ‘‘save_results’’, once it is set to True,
219 | results will be saved locally by creating a folder automatically. Users can also specify the folder path by the
220 | argument ‘‘save_path’’:
221 | 
222 |      .. code-block::
223 | 
224 |         g = ParametricGformula(..., save_results = True, save_path = 'user-specified path', ...)
225 |         g.fit()
226 | 
227 | 
228 | **Arguments**:
229 | 
230 | .. list-table::
231 |     :header-rows: 1
232 | 
233 |     * - Arguments
234 |       - Description
235 |     * - n_simul
236 |       - (Optional) An integer indicating the number of subjects for whom to simulate data. It is set equal to the number (M) of
237 |         subjects in obs_data, if not specified by users.
238 |     * - ref_int
239 |       - (Optional) An integer indicating the intervention to be used as the reference for calculating the end-of-follow-up mean/risk
240 |         ratio and mean/risk difference. 0 denotes the natural course, while subsequent integers denote user-specified
241 |         interventions in the order that they are named in interventions. It is set to 0 if not specified by users.
242 |     * - nsamples
243 |       - (Optional) An integer specifying the number of bootstrap samples to generate.
244 |     * - parallel
245 |       - (Optional) A boolean value indicating whether to parallelize simulations of different interventions to multiple cores.
246 |     * - ncores
247 |       - (Optional) An integer indicating the number of cores used in parallelization. It is set to 1 if not specified by users.
248 |     * - model_fits
249 |       - (Optional) A boolean value indicating whether to return the parameter estimates of the models.
250 |     * - ci_method
251 |       - (Optional) A string specifying the method for calculating the bootstrap 95% confidence intervals, if applicable.
252 |         The options are "percentile" and "normal". It is set to "percentile" if not specified by users.
253 |     * - boot_diag
254 |       - (Optional) A boolean value indicating whether to return the parametric g-formula estimates as well as the coefficients,
255 |         standard errors, and variance-covariance matrices of the parameters of the fitted models in the bootstrap samples.
256 |     * - save_results
257 |       - (Optional) A boolean value indicating whether to save all the returned results to the save_path.
258 |     * - save_path
259 |       - (Optional) A path to save all the returned results. A folder will be created automatically in the current working directory
260 |         if the save_path is not specified by users.
261 |     * - seed
262 |       - (Optional) An integer indicating the starting seed for simulations and bootstrapping. It is set to 1234 if not specified by users.
263 | 
264 | 
265 | 
266 | 
267 | 
268 | Graphical results
269 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
270 | 
271 | .. automodule:: pygformula.plot
272 | 
273 | 
274 | The package also provides two plotting functions: "plot_natural_course" and "plot_interventions".
275 | The plot_natural_course function plots the curves of each covariate mean (for all types of outcomes) and risk (for survival outcomes only) under g-formula parametric and
276 | non-parametric estimation.
277 | 
278 | .. autosummary:: plot_natural_course
279 | .. autofunction:: plot_natural_course
280 | 
281 | 
282 | The plot_interventions function plots the curves of risk under interventions of interest (for survival outcomes only).
283 | 
284 | 
285 | .. autosummary:: plot_interventions
286 | .. autofunction:: plot_interventions
287 | 
288 | 
289 | Arguments for plotting:
290 | 
291 | .. list-table::
292 |     :header-rows: 1
293 | 
294 |     * - Arguments
295 |       - Description
296 |     * - plot_name
297 |       - A string specifying the name for plotting, which is set to "all", "risk" or one specific covariate name. Only
298 |         applicable for the plot_natural_course function. The default is "all".
299 |     * - colors
300 |       - For the plot_natural_course function, it is a list with two elements, specifying the colors of the non-parametric estimate curve and the parametric curve respectively.
301 |         Users can choose colors from `matplotlib colors <https://matplotlib.org/stable/gallery/color/named_colors.html>`_.
302 |         For the plot_interventions function, it is a list with m elements, where m is the number of interventions plus 1,
303 |         specifying the colors of all intervention curves. If not specified, the function will use default colors.
304 |     * - marker
305 |       - A string used to customize the appearance of points in plotting. Users can also choose markers from
306 |         `matplotlib markers <https://matplotlib.org/stable/api/markers_api.html>`_ library.
307 |     * - markersize
308 |       - An integer specifying the size of the markers in plotting.
309 |     * - linewidth
310 |       - A number that specifies the width of the line in plotting.
311 |     * - save_figure
312 |       - A boolean value indicating whether to save the figure or not.
313 | 
314 | 
315 | Users can call the 'plot_natural_course' function by:
316 | 
317 |      .. code-block::
318 | 
319 |         g.plot_natural_course()
320 | 
321 | Users can call the 'plot_interventions' function by:
322 | 
323 |      .. code-block::
324 | 
325 |         g.plot_interventions()
326 | 
327 | 
328 | Note that the plotting functions can only be applied after calling the 'g.fit' function.
329 | 
330 | The figures can be saved by the argument ‘‘save_figure’’, once it is set to True,
331 | results will be saved locally by creating a folder automatically. If the argument ‘‘save_path’’ is specified, the figure will be saved to the corresponding folder.
332 | 
333 | 
334 | **Sample syntax**:
335 | 
336 | .. code-block::
337 | 
338 |         g.plot_natural_course(plot_name='L1', colors=['blue', 'red'], markersize=5, linewidth=1, marker='v', save_figure=True)
339 |         g.plot_interventions(colors =['green', 'red', 'yellow'], markersize=5, linewidth=1, marker='v', save_figure=True)
340 | 
341 | .. note::
342 | 
343 |    We recommend setting ‘‘save_figure’’ to True if users want to access the figure
344 |    when running the package on a Linux system.


--------------------------------------------------------------------------------
/docs/source/Specifications/Visit process.rst:
--------------------------------------------------------------------------------
  1 | .. _Visit process:
  2 | 
  3 | 
  4 | Visit process
  5 | =================
  6 | 
  7 | When the data are not recorded at regular intervals but rather are recorded every time the patient visits the
  8 | clinic, the times at which the time-varying covariates are measured will vary by subject. In this setting,
  9 | it is typical to construct the data such that (i) at a time when there is no visit/measurement,
 10 | the last measured value of a covariate is carried forward, and (ii) a subject is censored after a certain number of consecutive times
 11 | with no visit/measurement [1]_ :sup:`,` [2]_.
 12 | 
 13 | In pygformula, the deterministic knowledge (i) and (ii) can be incorporated via the argument ‘‘visitprocess’’.
 14 | Each vector in ‘‘visitprocess’’ contains three parameters that attach a visit process to one covariate.
 15 | The first parameter is the name of a time-varying indicator in the input data set of whether a covariate was measured in each interval
 16 | (1 means there is a visit/measurement, 0 means there is no visit/measurement).
 17 | The second parameter is the name of the covariate. The third parameter is the maximum number s of missed measurements of this covariate allowed
 18 | since the last measurement before a subject is censored.
 19 | 
 20 | For the visit indicator, in the fitting step, the probability of a visit is estimated only using records
 21 | where the sum of consecutive missed visits through previous k-1 time points is less than the maximum number of consecutive missed visits s.
 22 | Then in the simulation step, if the sum of consecutive missed visits through previous k-1 time points is less than s, then the visit
 23 | indicator is simulated from a distribution based on this estimate; otherwise, the visit indicator is set to 1 so
 24 | as to eliminate subjects with more than s consecutive missed visits. For the covariate, in the fitting step, the conditional mean of the covariate will be estimated
 25 | only for data records where there is a current visit. If the visit indicator equals 1, then in simulation step, the value of the
 26 | dependent covariate will be generated from a distribution based on this estimate; otherwise, the last value is
 27 | carried forward.
 28 | 
 29 | 
 30 | 
 31 | The argument for visit process:
 32 | 
 33 | .. list-table::
 34 |     :header-rows: 1
 35 | 
 36 |     * - Arguments
 37 |       - Description
 38 |     * - visitprocess
 39 |       - (Optional) List of lists. Each inner list contains, as its first entry, the covariate name of a visit process; as its second entry,
 40 |         the name of a covariate whose modeling depends on the visit process; and as its third entry, the maximum number
 41 |         of consecutive visits that can be missed before an individual is censored.
 42 | 
 43 | .. code::
 44 | 
 45 |         covnames = ['visit_cd4', 'visit_rna', 'cd4_v', 'rna_v', 'everhaart']
 46 |         covtypes = ['binary', 'binary', 'normal', 'normal', 'binary']
 47 |         covmodels = ['visit_cd4 ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month',
 48 |                      'visit_rna ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month',
 49 |                      'cd4_v ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month',
 50 |                      'rna_v ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month',
 51 |                      'everhaart ~ lag1_everhaart + cd4_v + rna_v + sex + race + month']
 52 | 
 53 |         visitprocess = [['visit_cd4', 'cd4_v', 3], ['visit_rna', 'rna_v', 3]]
 54 | 
 55 |         g = ParametricGformula(..., covnames = covnames, covtypes = covtypes, covmodels = covmodels, visitprocess = visitprocess, ...)
 56 | 
 57 | 
 58 | Here is an example from clinical cohorts of HIV-positive patients: ‘‘cd4_v’’ is a time-varying covariate of CD4 cell count measurement, and
 59 | the visit indicator ‘‘visit_cd4’’ indicates whether the CD4 cell count measurements were taken in interval k.
 60 | The value 3 means that the data is constructed such that the subjects are censored once they have not had CD4 measured for 3 consecutive intervals.
 61 | Note that for the visit indicator ‘‘visit_cd4’’, it should come before the dependent covariate ‘‘cd4_v’’ and be assigned
 62 | the ‘‘binary’’ covariate type in ‘‘covtypes’’.
 63 | 
 64 | 
 65 | **Running example** `[code] <https://github.com/CausalInference/pygformula/blob/main/running_examples/test_visit_process.py>`_:
 66 | 
 67 | .. code-block::
 68 | 
 69 |         from pygformula import ParametricGformula
 70 |         from pygformula.interventions import static
 71 |         from pygformula.data import load_visit_process
 72 |         import numpy as np
 73 |         obs_data = load_visit_process()
 74 |         time_name = 'month'
 75 |         id = 'id'
 76 | 
 77 |         covnames = ['visit_cd4', 'visit_rna', 'cd4_v', 'rna_v', 'everhaart']
 78 |         covtypes = ['binary', 'binary', 'normal', 'normal', 'binary']
 79 |         covmodels = ['visit_cd4 ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month',
 80 |                      'visit_rna ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month',
 81 |                      'cd4_v ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month',
 82 |                      'rna_v ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month',
 83 |                      'everhaart ~ lag1_everhaart + cd4_v + rna_v + sex + race + month']
 84 | 
 85 |         basecovs = ['sex', 'race', 'age']
 86 | 
 87 |         visitprocess = [['visit_cd4', 'cd4_v', 3], ['visit_rna', 'rna_v', 3]]
 88 | 
 89 |         outcome_name = 'event'
 90 |         ymodel = 'event ~ cd4_v + rna_v + everhaart + sex + race + month'
 91 | 
 92 |         time_points = np.max(np.unique(obs_data[time_name])) + 1
 93 | 
 94 |         int_descript = ['Never treat', 'Always treat']
 95 | 
 96 |         g = ParametricGformula(obs_data = obs_data, id = id,  time_name = time_name,
 97 |             visitprocess = visitprocess,
 98 |             int_descript = int_descript,
 99 |             Intervention1_everhaart = [static, np.zeros(time_points)],
100 |             Intervention2_everhaart = [static, np.ones(time_points)],
101 |             covnames=covnames, covtypes=covtypes,
102 |             covmodels=covmodels, basecovs = basecovs,
103 |             outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
104 |         g.fit()
105 | 
106 | 
107 | **Output**:
108 | 
109 |     .. image:: ../media/visitprocess_example_output.png
110 |          :align: center
111 | 
112 | 
113 | .. [1] Hernán MA, McAdams M, McGrath N, Lanoy E, Costagliola D. Observation plans in longitudinal studies with
114 |        time-varying treatments. Statistical Methods in Medical Research 2009;18(1):27-52.
115 | 
116 | .. [2] Young JG, Cain LE, Robins JM, O’Reilly E, Hernán MA. Comparative effectiveness of dynamic treatment regimes:
117 |        an application of the parametric g-formula. Statistics in Biosciences 2011; 3:119-143.
118 | 


--------------------------------------------------------------------------------
/docs/source/Specifications/index.rst:
--------------------------------------------------------------------------------
 1 | Specifications
 2 | ===================
 3 | 
 4 | 
 5 | 
 6 | The ‘‘Specifications’’ section gives detailed instructions about how to specify the required or optional
 7 | arguments in different modules of pygformula to construct a specific analysis. To use the g-formula method in the package,
 8 | the first step is to make sure that the input data meets the requirement of
 9 | :doc:`Input data`.
10 | Then, users need to specify their parametric covariate model (see :doc:`Covariate models`),
11 | parametric outcome model (see :doc:`Outcome model`)
12 | , as well as the intervention of interest (see :doc:`Interventions`).
13 | Once these required modules are well-defined, the g-formula in pygformula can be called and output the results of the method.
14 | 
15 | Additionally, if there are censoring events, the package provides the option to obtain inverse probability weighted estimates
16 | for comparison with the g-formula estimates,
17 | see :doc:`Censoring event`.
18 | If there are competing events, the package provides two options for handling competing events in the case of survival outcomes, see
19 | :doc:`Competing event`.
20 | The package also provides an option for calculating the hazard ratio of any two interventions of interest in
21 | :doc:`Hazard ratio`.
22 | If the data structure contains a visit process, users can also perform g-formula analysis for this setting in
23 | :doc:`Visit process`.
24 | If there is deterministic knowledge about the relationship between the variables, it can be incorporated into the estimation
25 | of g-formula by applying restrictions, see :doc:`Deterministic knowledge`.
26 | 
27 | 
28 | 
29 | 
30 | **Contents**:
31 | 
32 | .. toctree::
33 |     :maxdepth: 2
34 | 
35 |     Input data
36 |     Interventions
37 |     Covariate models
38 |     Outcome model
39 |     Censoring event
40 |     Competing event
41 |     Hazard ratio
42 |     Visit process
43 |     Deterministic knowledge
44 |     Output
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | 
17 | 
18 | # -- Project information -----------------------------------------------------
19 | 
20 | project = 'pygformula'
21 | copyright = '2024, The President and Fellows of Harvard College'
22 | 
23 | # Prepend the directory two levels up to sys.path so the pygformula package can be imported below.
24 | import os
25 | import sys
26 | sys.path.insert(0, os.path.abspath('../..'))
27 | 
28 | from pygformula import __version__
29 | # Use the package's own version string as the documentation release.
30 | release = __version__
31 | 
32 | # -- General configuration ---------------------------------------------------
33 | 
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | 
38 | 
39 | extensions = ['sphinx.ext.autodoc',
40 |               'sphinx.ext.autosummary',
41 |               'sphinx.ext.napoleon'
42 |               ]
43 | 
44 | 
45 | # Add any paths that contain templates here, relative to this directory.
46 | templates_path = ['_templates']
47 | 
48 | # List of patterns, relative to source directory, that match files and
49 | # directories to ignore when looking for source files.
50 | # This pattern also affects html_static_path and html_extra_path.
51 | exclude_patterns = []
52 | 
53 | # The suffix of source filenames.
54 | source_suffix = ".rst"
55 | 
56 | 
57 | # -- Options for HTML output -------------------------------------------------
58 | 
59 | # The theme to use for HTML and HTML Help pages.  See the documentation for
60 | # a list of builtin themes.
61 | #
62 | html_theme = 'sphinx_rtd_theme'
63 | 
64 | # Add any paths that contain custom static files (such as style sheets) here,
65 | # relative to this directory. They are copied after the builtin static files,
66 | # so a file named "default.css" will overwrite the builtin "default.css".
67 | html_static_path = ['_static']
68 | 
69 | # No LaTeX element overrides are configured.
70 | latex_elements = {
71 | }
72 | 
73 | # NOTE(review): per the Sphinx docs each tuple is (start document, target .tex name, title, author, theme); author is left empty here.
74 | latex_documents = [
75 |   ('index', 'pygformula.tex', 'Pygformula Documentation', '',
76 |    'manual'),
77 | ]
78 | 
79 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Welcome to pygformula's documentation!
 3 | ======================================
 4 | 
 5 | The `pygformula <https://github.com/CausalInference/pygformula>`_ package implements the non-iterative
 6 | conditional expectation (NICE) estimator of the g-formula algorithm [1]_ :sup:`,` [2]_. The g-formula can estimate an
 7 | outcome’s counterfactual mean or risk under hypothetical treatment strategies (interventions) when there
 8 | is sufficient information on time-varying treatments and confounders.
 9 | 
10 | This package can be used for discrete or continuous time-varying treatments and for failure time outcomes or
11 | continuous/binary end of follow-up outcomes. The package can handle a random measurement/visit process and a
12 | priori knowledge of the data structure, as well as censoring (e.g., by loss to follow-up) and two options for
13 | handling competing events for failure time outcomes. Interventions can be flexibly specified, both as
14 | interventions on a single treatment or as joint interventions on multiple treatments.
15 | 
16 | For a quick overview of how to use the pygformula, see a simple example in :doc:`Get Started`.
17 | For a detailed list of options, see :doc:`Specifications/index`.
18 | 
19 | .. toctree::
20 |    :maxdepth: 2
21 | 
22 |    Installation
23 |    Get Started
24 | 
25 | .. toctree::
26 |    :maxdepth: 4
27 | 
28 |    Specifications/index
29 | 
30 | .. toctree::
31 |    :maxdepth: 2
32 | 
33 |    Datasets
34 |    Contact
35 | 
36 | 
37 | .. [1] Robins JM. A new approach to causal inference in mortality studies with a sustained exposure period:
38 |        application to the healthy worker survivor effect. Mathematical Modelling. 1986;7:1393–1512. [Errata (1987)
39 |        in Computers and Mathematics with Applications 14, 917-921. Addendum (1987) in Computers and Mathematics
40 |        with Applications 14, 923-945. Errata (1987) to addendum in Computers and Mathematics with Applications
41 |        18, 477.]
42 | .. [2] Hernán, M.A., and Robins, J. (2020). Causal Inference: What If (Chapman & Hall/CRC).
43 | 
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/docs/source/media/absorbing_cov_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/absorbing_cov_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/binary_cov_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/binary_cov_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/binary_eof_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/binary_eof_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/bounded_normal_cov_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/bounded_normal_cov_example.png


--------------------------------------------------------------------------------
/docs/source/media/categorical_cov_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/categorical_cov_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/categorical_time_cov_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/categorical_time_cov_example.png


--------------------------------------------------------------------------------
/docs/source/media/censor_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/censor_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/comp_restriction_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/comp_restriction_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/competing_as_cens_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/competing_as_cens_output.png


--------------------------------------------------------------------------------
/docs/source/media/competing_not_cens_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/competing_not_cens_output.png


--------------------------------------------------------------------------------
/docs/source/media/continuous_eof_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/continuous_eof_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/data_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/data_example.png


--------------------------------------------------------------------------------
/docs/source/media/data_example_censor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/data_example_censor.png


--------------------------------------------------------------------------------
/docs/source/media/data_example_competing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/data_example_competing.png


--------------------------------------------------------------------------------
/docs/source/media/dynamic_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/dynamic_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/example_hazardratio_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/example_hazardratio_output.png


--------------------------------------------------------------------------------
/docs/source/media/get_started_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example.png


--------------------------------------------------------------------------------
/docs/source/media/get_started_example_all.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example_all.jpg


--------------------------------------------------------------------------------
/docs/source/media/get_started_example_bootstrap.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example_bootstrap.jpg


--------------------------------------------------------------------------------
/docs/source/media/get_started_example_intervention_curve.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/get_started_example_intervention_curve.jpg


--------------------------------------------------------------------------------
/docs/source/media/natural_course_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/natural_course_output.png


--------------------------------------------------------------------------------
/docs/source/media/natural_grace_period.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/natural_grace_period.png


--------------------------------------------------------------------------------
/docs/source/media/normal_cov_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/normal_cov_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/random_forest_cov.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/random_forest_cov.png


--------------------------------------------------------------------------------
/docs/source/media/restriction_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/restriction_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/static_example_one_treatment_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/static_example_one_treatment_output.png


--------------------------------------------------------------------------------
/docs/source/media/static_example_two_treatments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/static_example_two_treatments.png


--------------------------------------------------------------------------------
/docs/source/media/static_multiple_interventions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/static_multiple_interventions.png


--------------------------------------------------------------------------------
/docs/source/media/survival_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/survival_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/test_hazard_ratio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/test_hazard_ratio.png


--------------------------------------------------------------------------------
/docs/source/media/threshold_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/threshold_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/truncated_normal_cov_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/truncated_normal_cov_example.png


--------------------------------------------------------------------------------
/docs/source/media/uniform_grace_period.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/uniform_grace_period.png


--------------------------------------------------------------------------------
/docs/source/media/visitprocess_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/visitprocess_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/yrestriction_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/yrestriction_example_output.png


--------------------------------------------------------------------------------
/docs/source/media/zero_inflated_normal_cov_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/docs/source/media/zero_inflated_normal_cov_example.png


--------------------------------------------------------------------------------
/pygformula/__init__.py:
--------------------------------------------------------------------------------
1 | from .version import __version__
2 | from .parametric_gformula import ParametricGformula
3 | 


--------------------------------------------------------------------------------
/pygformula/comparisons.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | 
  4 | def comparison_calculate(obs_data, time_name, time_points, id, covnames, covtypes, outcome_name, outcome_type,
  5 |                          nc_pool, nc_risk, censor, censor_name, censor_fit, ipw_cutoff_quantile, ipw_cutoff_value,
  6 |                          competing=None, compevent_name=None, compevent_cens=False, compevent_fit=None):
  7 |     """
  8 |     This is an internal function to calculate the mean observed values of covariates at each time point, as well as mean
  9 |     observed risk.
 10 | 
 11 |     Parameters
 12 |     ----------
 13 |     obs_data: DataFrame
 14 |         A data frame containing the observed data.
 15 | 
 16 |     time_name: Str
 17 |         A string specifying the name of the time variable in obs_data.
 18 | 
 19 |     time_points: Int
 20 |         An integer indicating the number of time points to simulate. It is set equal to the maximum number of records
 21 |         that obs_data contains for any individual plus 1, if not specified by users.
 22 | 
 23 |     id: Str
 24 |         A string specifying the name of the id variable in obs_data.
 25 | 
 26 |     covnames: List
 27 |         A list of strings specifying the names of the time-varying covariates in obs_data.
 28 | 
 29 |     covtypes: List
 30 |         A list of strings specifying the “type” of each time-varying covariate included in covnames.
 31 |         The supported types: "binary", "normal", "categorical", "bounded normal", "zero-inflated normal",
 32 |         "truncated normal", "absorbing", "categorical time" "square time" and "custom". The list must be the same length
 33 |         as covnames and in the same order.
 34 | 
 35 |     outcome_name: Str
 36 |         A string specifying the name of the outcome variable in obs_data.
 37 | 
 38 |     outcome_type: Str
 39 |         A string specifying the "type" of outcome. The possible "types" are: "survival", "continuous_eof", and "binary_eof".\
 40 | 
 41 |     nc_pool: DataFrame
 42 |         A dataframe of the simulated data under natural course.
 43 | 
 44 |     nc_risk: List
 45 |         A list contains the parametric risk of all the time points for natural course.
 46 | 
 47 |     censor: Bool
 48 |         A boolean value indicating the if there is a censoring event.
 49 | 
 50 |     censor_name: Str
 51 |         A string specifying the name of the censoring variable in obs_data. Only applicable when using inverse
 52 |         probability weights to estimate the natural course means / risk from the observed data.
 53 | 
 54 |     censor_fit: Class
 55 |         A class object of the fitted model for the censoring event.
 56 | 
 57 |     ipw_cutoff_quantile: Float
 58 |         Percentile value for truncation of the inverse probability weights
 59 | 
 60 |     ipw_cutoff_value: Float
 61 |         Absolute value for truncation of the inverse probability weights.
 62 | 
 63 |     competing: Bool
 64 |         A boolean value indicating the if there is a competing event.
 65 | 
 66 |     compevent_name: Str
 67 |         A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes.
 68 | 
 69 |     compevent_cens: Bool
 70 |         A boolean value indicating whether to treat competing events as censoring events.
 71 | 
 72 |     compevent_fit: Class
 73 |         A class object of the fitted model for the competing event.
 74 | 
 75 |     Returns
 76 |     -------
 77 |     obs_means: Dict
 78 |         A dictionary, where the key is the covariate / risk name and the value is its observational mean at all the time points.
 79 | 
 80 |     est_means: Dict
 81 |         A dictionary, where the key is the covariate / risk name and the value is its parametric mean at all the time points.
 82 | 
 83 |     obs_res: Float
 84 |         A value of the observational risk / mean at final time point.
 85 | 
 86 |     IP_weights: List
 87 |         A list contains the inverse probability weights from the censor model.
 88 | 
 89 |     """
 90 |     if censor:
 91 |         # for non-parametric cov means and risks
 92 |         censor_pre = censor_fit.predict(obs_data)
 93 |         censor_p0_inv = 1 / (1 - censor_pre)
 94 |         obs_data['censor_p0_inv'] = censor_p0_inv
 95 |         censor_inv_cum = obs_data.groupby([id])['censor_p0_inv'].cumprod()
 96 |         obs_data['censor_inv_cum'] = censor_inv_cum
 97 |         w_censor = censor_inv_cum * (1 - obs_data[censor_name])
 98 |         if outcome_type == 'survival' and compevent_cens:
 99 |             comprisk_p0_inv = 1 / (1 - compevent_fit.predict(obs_data))
100 |             obs_data['comprisk_p0_inv'] = comprisk_p0_inv
101 |             comprisk_inv_cum = obs_data.groupby([id])['comprisk_p0_inv'].cumprod()
102 |             w_comp = np.where((obs_data[compevent_name].isna()) | (obs_data[compevent_name] == 1), 0, comprisk_inv_cum)
103 |             w = w_comp * w_censor
104 |         else:
105 |             w = w_censor
106 |         obs_data['IP_weight'] = w
107 | 
108 |         if ipw_cutoff_quantile:
109 |             quantile_w = np.percentile(list(w_censor), ipw_cutoff_quantile * 100)
110 |             obs_data.loc[obs_data['IP_weight'] > quantile_w, 'IP_weight'] = quantile_w
111 |         if ipw_cutoff_value:
112 |             obs_data.loc[obs_data['IP_weight'] > ipw_cutoff_value, 'IP_weight'] = ipw_cutoff_value
113 | 
114 |         obs_data['IP_weight_cov'] = np.where(obs_data[time_name] > 0, obs_data['IP_weight'].shift(1), 1)
115 | 
116 |         obs_means = {}
117 |         if covnames is not None:
118 |             for k, covname in enumerate(covnames):
119 |                 if covtypes[k] == 'categorical':
120 |                     all_levels = np.unique(obs_data[covname])
121 |                     all_levels_obs_prob = []
122 |                     for level in all_levels:
123 |                         obs_level_prob = obs_data[obs_data[covname].notna()].groupby([time_name]).apply(lambda g:
124 |                                         (((g[covname] == level) * g['IP_weight_cov']).mean()) /g['IP_weight_cov'].mean()).tolist()[:time_points]
125 |                         all_levels_obs_prob.append(obs_level_prob)
126 |                 else:
127 |                     cov_mean = obs_data[obs_data[covname].notna()].groupby(time_name).apply(lambda g: (g['IP_weight_cov'] * g[covname]).mean() / g['IP_weight_cov'].mean()).tolist()[:time_points]
128 |                     obs_means[covname] = cov_mean
129 | 
130 |         if outcome_type == 'binary_eof' or outcome_type == 'continuous_eof':
131 |             obs_data_last_record =  obs_data.loc[obs_data[outcome_name].notna()]
132 |             obs_mean_Ey = (obs_data_last_record[outcome_name] * obs_data_last_record['IP_weight']).mean() / obs_data_last_record['IP_weight'].mean()
133 | 
134 |         if outcome_type == 'survival':
135 |             if competing and not compevent_cens:
136 |                 w_elimD = obs_data['IP_weight'] * (1 - obs_data[compevent_name])
137 |                 obs_data['w_elimD'] = w_elimD
138 |                 h_k = obs_data[obs_data[outcome_name].notna()].groupby(time_name).apply(
139 |                     lambda g: (g['w_elimD'] * g[outcome_name]).mean() / g['w_elimD'].mean())
140 |                 h_k2 = obs_data[obs_data[compevent_name].notna()].groupby(time_name).apply(
141 |                     lambda g: (g['IP_weight'] * g[compevent_name]).mean() / g['IP_weight'].mean())
142 |                 risks = np.array([list(h_k)[k] * (1 - list(h_k2)[k]) if k == 0 else list(h_k)[k]
143 |                                   * (1 - list(h_k2)[k]) * list((1 - h_k).cumprod())[k - 1] * list((1 - h_k2).cumprod())[k - 1]
144 |                                   for k in range(time_points)]).cumsum().tolist()[:time_points]
145 |                 obs_means['risk'] = risks
146 |                 obs_risk = risks[-1]
147 |             else:
148 |                 weight_outcome_mean = obs_data[obs_data[outcome_name].notna()].groupby(time_name).apply(
149 |                     lambda g: (g['IP_weight'] * g[outcome_name]).mean() / g['IP_weight'].mean())
150 |                 weight_p0_mean = 1 - weight_outcome_mean
151 |                 risks = np.array([weight_outcome_mean.tolist()[k] if k == 0 else weight_outcome_mean.tolist()[k] *
152 |                                  weight_p0_mean.cumprod().tolist()[k - 1] for k in
153 |                                  range(time_points)]).cumsum().tolist()[:time_points]
154 |                 obs_means['risk'] = risks
155 |                 obs_risk = risks[-1]
156 | 
157 |         if outcome_type == 'survival':
158 |             # for parametric cov means and risks
159 |             if competing and not compevent_cens:
160 |                 nc_pool['p0_cum'] = nc_pool.groupby(id)['prob0'].cumprod()
161 |                 nc_pool['pd_0'] = 1 - nc_pool['prob_D']
162 |                 nc_pool['pd_0_cum'] = nc_pool.groupby(id)['pd_0'].cumprod()
163 |                 nc_pool['w_cov'] = np.where(nc_pool[time_name] > 0,
164 |                                             nc_pool['p0_cum'].shift(1) * nc_pool['pd_0_cum'].shift(1), 1)
165 |             else:
166 |                 nc_pool['p0_cum'] = nc_pool.groupby([id])['prob0'].cumprod()
167 |                 nc_pool['w_cov'] = np.where(nc_pool[time_name] > 0, nc_pool['p0_cum'].shift(1), 1)
168 |         else:
169 |             nc_pool['w_cov'] = 1
170 | 
171 |         est_means = {}
172 |         if covnames is not None:
173 |             for k, covname in enumerate(covnames):
174 |                 if covtypes[k] == 'categorical':
175 |                     all_levels = np.unique(obs_data[covname])
176 |                     all_levels_est_prob_mean = []
177 |                     for level in all_levels:
178 |                         est_level_prob = nc_pool[nc_pool[covname].notna()].groupby([time_name]).apply(
179 |                             lambda g: ((g[covname] == level) * g['w_cov']).mean() / g['w_cov'].mean()).tolist()[:time_points]
180 |                         all_levels_est_prob_mean.append(est_level_prob)
181 |                     est_means[covname] = all_levels_est_prob_mean
182 |                 else:
183 |                     cov_mean = nc_pool[nc_pool[covname].notna()].groupby(time_name).apply(
184 |                         lambda g: (g['w_cov'] * g[covname]).mean() / g['w_cov'].mean()).tolist()[:time_points]
185 |                     est_means[covname] = cov_mean
186 |         if outcome_type == 'survival':
187 |             est_means['risk'] = nc_risk
188 | 
189 |     else:
190 |         # for non-parametric cov means and risks
191 |         obs_means = {}
192 |         if covnames is not None:
193 |             for k, covname in enumerate(covnames):
194 |                 if covtypes[k] == 'categorical':
195 |                     all_levels = np.unique(obs_data[covname])
196 |                     all_levels_obs_prob_mean = []
197 |                     for level in all_levels:
198 |                         obs_level_prob = obs_data.groupby([time_name]).apply(lambda g: ((g[covname] == level)).mean()).tolist()[:time_points]
199 |                         all_levels_obs_prob_mean.append(obs_level_prob)
200 |                     obs_means[covname] = all_levels_obs_prob_mean
201 |                 else:
202 |                     obs_mean = obs_data.groupby([time_name])[covname].mean().tolist()[:time_points]
203 |                     obs_means[covname] = obs_mean
204 | 
205 |         if outcome_type == 'binary_eof' or outcome_type == 'continuous_eof':
206 |             obs_mean_Ey = obs_data.loc[obs_data[time_name] == time_points - 1][outcome_name].mean()
207 | 
208 |         if outcome_type == 'survival':
209 |             if competing and not compevent_cens:
210 |                 p1_mean = obs_data[obs_data[outcome_name].notna()].groupby(time_name)[outcome_name].mean()
211 |                 pd_mean = obs_data[obs_data[compevent_name].notna()].groupby(time_name)[compevent_name].mean()
212 |                 comrisks = np.array(
213 |                     [list(p1_mean)[k] * (1 - list(pd_mean)[k]) if k == 0 else list(p1_mean)[k] * (1 - list(pd_mean)[k]) *
214 |                      list((1 - p1_mean).cumprod())[k - 1] * list((1 - pd_mean).cumprod())[k - 1]
215 |                      for k in range(time_points)]).cumsum().tolist()[:time_points]
216 |                 obs_means['risk'] = comrisks
217 |                 obs_risk = comrisks[-1]
218 |             else:
219 |                 p1_mean_obs = obs_data[obs_data[outcome_name].notna()].groupby(time_name)[outcome_name].mean()
220 |                 p0_mean_obs = 1 - p1_mean_obs
221 |                 risks = np.array(
222 |                     [p1_mean_obs.tolist()[k] if k == 0 else p1_mean_obs.tolist()[k] * p0_mean_obs.cumprod().tolist()[k - 1]
223 |                      for k in range(time_points)]).cumsum().tolist()[:time_points]
224 |                 obs_means['risk'] = risks
225 |                 obs_risk = risks[-1]
226 | 
227 |         if outcome_type == 'survival':
228 |             # for parametric cov means and risks
229 |             if competing and not compevent_cens:
230 |                 nc_pool['p0_cum'] = nc_pool.groupby(id)['prob0'].cumprod()
231 |                 nc_pool['pd_0'] = 1 - nc_pool['prob_D']
232 |                 nc_pool['pd_0_cum'] = nc_pool.groupby(id)['pd_0'].cumprod()
233 |                 nc_pool['w_cov'] = np.where(nc_pool[time_name] > 0,
234 |                                             nc_pool['p0_cum'].shift(1) * nc_pool['pd_0_cum'].shift(1), 1)
235 |             else:
236 |                 nc_pool['p0_cum'] = nc_pool.groupby([id])['prob0'].cumprod()
237 |                 nc_pool['w_cov'] = np.where(nc_pool[time_name] > 0, nc_pool['p0_cum'].shift(1), 1)
238 |         else:
239 |             nc_pool['w_cov'] = 1
240 | 
241 |         est_means = {}
242 |         if covnames is not None:
243 |             for k, covname in enumerate(covnames):
244 |                 if covtypes[k] == 'categorical':
245 |                     all_levels = np.unique(obs_data[covname])
246 |                     all_levels_est_prob_mean = []
247 |                     for level in all_levels:
248 |                         est_level_prob = nc_pool[nc_pool[covname].notna()].groupby([time_name]).apply(
249 |                             lambda g: ((g[covname] == level) * g['w_cov']).mean() / g['w_cov'].mean()).tolist()[:time_points]
250 |                         all_levels_est_prob_mean.append(est_level_prob)
251 |                     est_means[covname] = all_levels_est_prob_mean
252 |                 else:
253 |                     est_mean = nc_pool[nc_pool[covname].notna()].groupby(time_name).apply(lambda g:
254 |                                                                               (g['w_cov'] * g[covname]).mean()
255 |                                                                              / g['w_cov'].mean()).tolist()[:time_points]
256 |                     est_means[covname] = est_mean
257 |         if outcome_type == 'survival':
258 |             est_means['risk'] = nc_risk
259 | 
260 |     obs_res = obs_risk if outcome_type == 'survival' else obs_mean_Ey
261 |     IP_weights = obs_data['IP_weight'].tolist() if censor else None
262 | 
263 |     return obs_means, est_means, obs_res, IP_weights
264 | 
265 | 


--------------------------------------------------------------------------------
/pygformula/data.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | 
  4 | def load_basicdata():
  5 |     """
  6 |     Data description: a survival dataset that contains 11,332 observations on 2,500 individuals over 7 time points.
  7 |     Each row in the dataset corresponds to the record of one individual at one time point.
  8 | 
  9 |     id: Unique identifier for each individual.
 10 |     t0: Time index.
 11 |     L1: Binary time-varying covariate.
 12 |     L2: Continuous time-varying covariate.
 13 |     L3: Categorical baseline covariate. For each individual, the baseline values are repeated at each time point.
 14 |     A: Binary treatment variable.
 15 |     D: Competing event; time-varying indicator of failure.
 16 |     Y: Outcome of interest; time-varying indicator of failure.
 17 | 
 18 |     Returns
 19 |     -------
 20 |     A pandas dataframe
 21 |     """
 22 |     data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_basicdata.csv'
 23 |     data = pd.read_csv(data_url)
 24 |     return data
 25 | 
 26 | def load_basicdata_nocomp():
 27 |     """
 28 |     Data description: a survival dataset that contains 13,170 observations on 2,500 individuals over 7 time points.
 29 |     Each row in the dataset corresponds to the record of one individual at one time point.
 30 | 
 31 |     id: Unique identifier for each individual.
 32 |     t0: Time index.
 33 |     L1: Binary time-varying covariate.
 34 |     L2: Continuous time-varying covariate.
 35 |     L3: Categorical baseline covariate. For each individual, the baseline values are repeated at each time point.
 36 |     A: Binary treatment variable.
 37 |     Y: Outcome of interest; time-varying indicator of failure.
 38 | 
 39 |     This is a survival dataset that contains 2500 individuals with maximum 7 time points. There are
 40 |     one binary covariate L1, one normal covariate L2, one baseline covariate L3, one binary treatment variable A, and a
 41 |     binary outcome Y.
 42 | 
 43 |     Returns
 44 |     -------
 45 |     A pandas dataframe
 46 |     """
 47 |     data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_basicdata_nocomp.csv'
 48 |     data = pd.read_csv(data_url)
 49 |     return data
 50 | 
 51 | def load_absorbing_data():
 52 |     """
 53 |     Data description: a survival dataset that contains 6,033 observations, 1,000 individuals over 10 time points.
 54 |     Each row in the dataset corresponds to the record of one individual at one time point.
 55 | 
 56 |     id: Unique identifier for each individual.
 57 |     t0: Time index.
 58 |     L: Binary time-varying covariate with absorbing type, once it takes value 1, it keeps 1 at subsequent time points.
 59 |     A: Binary treatment variable.
 60 |     Y: Outcome of interest; time-varying indicator of failure.
 61 | 
 62 |     Returns
 63 |     -------
 64 |     A pandas dataframe
 65 |     """
 66 |     data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_absorbing.csv'
 67 |     data = pd.read_csv(data_url)
 68 |     return data
 69 | 
 70 | def load_binary_eof():
 71 |     """
 72 |     Data description: a dataset that contains 17,500 observations on 2,500 individuals over 7 time points.
 73 |     Each row in the dataset corresponds to the record of one individual at one time point.
 74 | 
 75 |     id: Unique identifier for each individual.
 76 |     t0: Time index.
 77 |     L1: Binary time-varying covariate.
 78 |     L2: Continuous time-varying covariate.
 79 |     L3: Categorical baseline covariate. For each individual, the baseline values are repeated at each time point.
 80 |     A: Continuous treatment variable.
 81 |     Y: Binary outcome of interest. Because this outcome is only defined at the end of follow-up, values of NA are given
 82 |     in all other time points.
 83 | 
 84 |     Returns
 85 |     -------
 86 |     A pandas dataframe
 87 |     """
 88 |     data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_binary_eof.csv'
 89 |     data = pd.read_csv(data_url)
 90 |     return data
 91 | 
 92 | def load_categorical():
 93 |     """
 94 |     Data description: a survival dataset that contains 7,822 observations, 1,000 individuals over 10 time points.
 95 |     Each row in the dataset corresponds to the record of one individual at one time point.
 96 | 
 97 |     id: Unique identifier for each individual.
 98 |     t0: Time index.
 99 |     L: Categorical covariate with 5 categories.
100 |     A: Binary treatment variable.
101 |     Y: Outcome of interest; time-varying indicator of failure.
102 | 
103 |     Returns
104 |     -------
105 |     A pandas dataframe
106 |     """
107 |     data_url = 'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_categorical.csv'
108 |     data = pd.read_csv(data_url)
109 |     return data
110 | 
def load_censor_data():
    """
    Download the survival example dataset with a censoring event.

    The dataset holds 118,725 observations on 10,000 individuals over 10 time points; every row is the record
    of one individual at one time point. Individuals who are censored at time k+1 only have a total of k+1
    records, which correspond to time indices 0,..., k.

    Columns
    -------
    id: Unique identifier for each individual.
    t0: Time index.
    L: Binary time-varying covariate.
    A: Continuous treatment variable.
    C: Censoring indicator.
    Y: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe read from the CSV file hosted in the pygformula GitHub repository.
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_censor.csv'
    )
131 | 
def load_continuous_eof():
    """
    Download the example dataset with a continuous end-of-follow-up outcome.

    The dataset holds 17,500 observations on 2,500 individuals over 7 time points; every row is the record
    of one individual at one time point.

    Columns
    -------
    id: Unique identifier for each individual.
    t0: Time index.
    L1: Categorical time-varying covariate with 3 categories.
    L2: Continuous time-varying covariate.
    L3: Categorical baseline covariate. For each individual, the baseline values are repeated at each time point.
    A: Binary treatment variable.
    Y: Continuous outcome of interest. Because this outcome is only defined at the end of follow-up, values of NA
    are given in all other time points.

    Returns
    -------
    A pandas dataframe read from the CSV file hosted in the pygformula GitHub repository.
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_continuous_eof.csv'
    )
153 | 
def load_visit_process():
    """
    Download the survival example dataset with a visit process.

    The dataset holds 1,739 observations on 200 individuals over 37 time points; every row is the record
    of one individual at one time point.

    Columns
    -------
    id: Unique identifier for each individual.
    month: Time index.
    sex: Binary baseline covariate. For each individual, the baseline values are repeated at each time point.
    age: Continuous baseline covariate.
    race: Categorical baseline covariate.
    cd4_v: Continuous time-varying covariate.
    visit_cd4: Indicator of whether there is a cd4 visit/measurement.
    rna_v: Continuous time-varying covariate.
    visit_rna: Indicator of whether there is a rna visit/measurement.
    everhaart: Binary treatment variable.
    event: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe read from the CSV file hosted in the pygformula GitHub repository.
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_visit_process.csv'
    )
178 | 
def load_truncated_normal():
    """
    Download the survival example dataset with a truncated normal covariate.

    The dataset holds 7,855 observations on 1,000 individuals over 10 time points; every row is the record
    of one individual at one time point.

    Columns
    -------
    id: Unique identifier for each individual.
    t0: Time index.
    L: Continuous time-varying covariate with truncated normal distribution.
    A: Binary treatment variable.
    Y: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe read from the CSV file hosted in the pygformula GitHub repository.
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_truncated_normal.csv'
    )
197 | 
def load_zero_inflated_normal():
    """
    Download the survival example dataset with a zero-inflated normal covariate.

    The dataset holds 7,678 observations on 1,000 individuals over 10 time points; every row is the record
    of one individual at one time point.

    Columns
    -------
    id: Unique identifier for each individual.
    t0: Time index.
    L: Continuous time-varying covariate with zero-inflated normal distribution.
    A: Binary treatment variable.
    Y: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe read from the CSV file hosted in the pygformula GitHub repository.
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_zero_inflated_normal.csv'
    )
216 | 
217 | 
def load_multiple_treatments_data():
    """
    Download the survival example dataset with two treatment variables.

    The dataset holds 3,416 observations on 1,000 individuals over 5 time points; every row is the record
    of one individual at one time point.

    Columns
    -------
    id: Unique identifier for each individual.
    t0: Time index.
    L1: Binary time-varying covariate.
    L2: Continuous time-varying covariate.
    A1: Binary treatment variable.
    A2: Binary treatment variable.
    Y: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe read from the CSV file hosted in the pygformula GitHub repository.
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_data_multiple_treatments.csv'
    )
238 | 
239 | 
def load_threshold_data():
    """
    Download the survival example dataset used for threshold interventions.

    The dataset holds 1,853 observations on 1,000 individuals over 5 time points; every row is the record
    of one individual at one time point.

    Columns
    -------
    id: Unique identifier for each individual.
    t0: Time index.
    L1: Binary time-varying covariate.
    L2: Continuous time-varying covariate.
    A: Continuous treatment variable.
    Y: Outcome of interest; time-varying indicator of failure.

    Returns
    -------
    A pandas dataframe read from the CSV file hosted in the pygformula GitHub repository.
    """
    # The CSV is fetched over the network on every call.
    return pd.read_csv(
        'https://raw.githubusercontent.com/CausalInference/pygformula/main/datasets/example_threshold_data.csv'
    )


--------------------------------------------------------------------------------
/pygformula/interventions.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from functools import reduce
  3 | import operator
  4 | 
  5 | 
  6 | def natural(new_df, pool, int_var, time_name, t):
  7 |     """
  8 |     This is an internal function used by natural course which does nothing on the new_df data.
  9 | 
 10 |     Parameters
 11 |     ----------
 12 |     new_df: DataFrame
 13 |         A DataFrame that contains the observed or simulated data at time t.
 14 | 
 15 |     pool: DataFrame
 16 |         A DataFrame that contains the observed or simulated data up to time t.
 17 | 
 18 |     int_var: List
 19 |         A list containing strings of treatment names to be intervened in a particular intervention.
 20 | 
 21 |     time_name: Str
 22 |         A string specifying the name of the time variable in obs_data.
 23 | 
 24 |     t: Int
 25 |         An integer indicating the current time index to be intervened.
 26 | 
 27 |     Returns
 28 |     -------
 29 |     None
 30 | 
 31 |     """
 32 |     pass
 33 | 
 34 | 
 35 | def static(new_df, pool, int_var, int_values, time_name, t):
 36 |     """
 37 |     This is an internal function to perform a static intervention.
 38 | 
 39 |     Parameters
 40 |     ----------
 41 |     new_df: DataFrame
 42 |         A DataFrame that contains the observed or simulated data at time t.
 43 | 
 44 |     pool: DataFrame
 45 |         A DataFrame that contains the observed or simulated data up to time t.
 46 | 
 47 |     int_var: List
 48 |         A list containing strings of treatment names to be intervened in a particular intervention.
 49 | 
 50 |     int_values: List
 51 |         A list containing the value needed when performing a particular intervention function.
 52 | 
 53 |     time_name: Str
 54 |         A string specifying the name of the time variable in obs_data.
 55 | 
 56 |     t: Int
 57 |         An integer indicating the current time index to be intervened.
 58 | 
 59 |     Returns
 60 |     -------
 61 |     Nothing is returned, the new_df is changed under a particular intervention.
 62 | 
 63 |     """
 64 |     new_df.loc[new_df[time_name] == t, int_var] = int_values[t]
 65 | 
 66 | 
 67 | def threshold(new_df, pool, int_var, threshold_values, time_name, t):
 68 |     """
 69 |     This is an internal function to perform a threshold intervention.
 70 | 
 71 |     Parameters
 72 |     ----------
 73 |     new_df: DataFrame
 74 |         A DataFrame that contains the observed or simulated data at time t.
 75 | 
 76 |     pool: DataFrame
 77 |         A DataFrame that contains the observed or simulated data up to time t.
 78 | 
 79 |     int_var: List
 80 |         A list containing strings of treatment names to be intervened in a particular intervention.
 81 | 
 82 |     threshold_values: List
 83 |         A list containing the threshold values needed when performing a threshold intervention function.
 84 | 
 85 |     time_name: Str
 86 |         A string specifying the name of the time variable in obs_data.
 87 | 
 88 |     t: Int
 89 |         An integer indicating the current time index to be intervened.
 90 | 
 91 |     Returns
 92 |     -------
 93 |     Nothing is returned, the new_df is changed under a particular intervention.
 94 | 
 95 |     """
 96 |     new_df.loc[new_df[time_name] == t, int_var] = new_df[int_var].where(new_df[int_var] > threshold_values[0], threshold_values[0])
 97 |     new_df.loc[new_df[time_name] == t, int_var] = new_df[int_var].where(new_df[int_var] < threshold_values[1], threshold_values[1])
 98 | 
 99 | 
def natural_grace_period(new_df, pool, int_var, nperiod, conditions, time_name, t):
    """
    Pre-coded natural grace period intervention. Once a covariate meets a threshold level, the
    treatment (int_var) is initiated within m (nperiod) time intervals, which is the duration of the
    grace period. During the grace period, the treatment takes its natural value.

    The function works in three steps: (1) rows of new_df that do not satisfy every condition get
    treatment 0, the others keep their current value; (2) when t >= nperiod, the treatment is set to 1
    wherever the conditions were satisfied in the pool records of time t - nperiod; (3) when t > 0, the
    treatment is set to 1 wherever it was 1 in the pool records of time t - 1.

    Parameters
    ----------
    new_df: DataFrame
        A DataFrame that contains the observed or simulated data at time t. It is modified in place.

    pool: DataFrame
        A DataFrame that contains the observed or simulated data up to time t.
        Steps (2) and (3) combine pool rows with new_df rows by position - this assumes both hold the
        individuals in the same order (to be confirmed against the caller).

    int_var: Str
        A string specifying the treatment variable to be intervened.

    nperiod: Int
        An integer indicating the duration of the grace period.

    conditions: Dict
        A dictionary that maps a covariate name to a function, applied to each value of that covariate,
        which tells whether the condition for initiating the treatment is met.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    t: Int
        An integer indicating the current time index to be intervened.

    Returns
    -------
    Nothing is returned, the new_df is changed under a particular intervention.

    """

    def _all_conditions_met(frame):
        # element-wise AND of every per-covariate condition evaluated on frame
        checks = [frame[cov_name].apply(rule) for cov_name, rule in conditions.items()]
        return reduce(operator.and_, checks)

    # where the initiation conditions hold the treatment keeps its value, elsewhere it is set to 0
    meets_now = _all_conditions_met(new_df)
    new_df[int_var] = np.where(meets_now, new_df[int_var], 0)

    # the grace period is over: whoever met the conditions nperiod intervals ago must be treated
    if t >= nperiod:
        records_at_start = pool[pool[time_name] == t - nperiod]
        met_at_start = _all_conditions_met(records_at_start)
        new_df[int_var] = np.where(met_at_start, 1, new_df[int_var])

    # once initiated, the treatment stays at 1
    if t > 0:
        treated_last_time = pool.loc[pool[time_name] == t - 1, int_var] == 1
        new_df[int_var] = np.where(treated_last_time, 1, new_df[int_var]).tolist()
156 | 
157 | 
def uniform_grace_period(new_df, pool, int_var, nperiod, conditions, time_name, t):
    """
    This is a pre-coded function to perform a uniform grace period intervention. Once a covariate
    meets a threshold level, the treatment (int_var) is initiated within m (nperiod) time intervals which is the duration
    of the grace period. During grace period, treatment initiation is randomly allocated with a uniform probability of
    starting treatment in each time interval of the grace period.

    Besides the treatment column, the function writes two helper columns into new_df: 'uni_prob' (the initiation
    probability used for the Bernoulli draw) and 'counts' (the number of consecutive intervals up to t that an
    individual failed to receive treatment). At t == 0 the 'counts' values are also copied into pool.
    The draws use the global numpy random state (np.random.binomial).

    Parameters
    ----------
    new_df: DataFrame
        A DataFrame that contains the observed or simulated data at time t. It is modified in place.

    pool: DataFrame
        A DataFrame that contains the observed or simulated data up to time t. For t > 0 it must already
        contain a 'counts' column for the records of time t - 1.

    int_var: Str
        A string specifying the treatment variable to be intervened.

    nperiod: Int
        An integer indicating the duration of the grace period.

    conditions: Dict
        A dictionary that maps a covariate name to a function, applied to each value of that covariate,
        which tells whether the condition for initiating the treatment is met.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    t: Int
        An integer indicating the current time index to be intervened.

    Returns
    -------
    Nothing is returned, the new_df is changed under a particular intervention.

    """

    def sample(prob):
        # single Bernoulli draw with success probability prob
        treatment = np.random.binomial(1, prob)
        return treatment

    # element-wise AND of all initiation conditions evaluated on the current records
    masks = []
    for cond_var, condition in conditions.items():
        mask = new_df[cond_var].apply(condition)
        masks.append(mask)
    cond_initiation = reduce(operator.and_, masks)

    if t == 0:
        # initialize counts: the number of consecutive intervals up to t that an individual failed to receive treatment
        new_df['counts'] = 0

        # if condition is True, start initiation of the treatment according to a uniform distribution with grace period
        new_df['uni_prob'] = np.where(cond_initiation, 1 / (nperiod + 1 - new_df['counts']), 0)
        # a draw is made for every row (rows with probability 0 included); only eligible rows keep their draw
        new_df[int_var] = np.where(cond_initiation, new_df['uni_prob'].apply(sample), 0)

        # update counts according to current treatment value
        new_df['counts'] = np.where(cond_initiation & (new_df[int_var] == 0), 1, 0)
        pool.loc[pool[time_name] == t, 'counts'] = new_df['counts']

    else:
        # NOTE(review): the expressions below mix np.where (positional) with `|` / `&` between pandas Series
        # (aligned on index labels); presumably the pool records of time t - 1 and new_df share index labels
        # and order - confirm against the caller.

        # calculate the uniform probability for initiation when 1) the grace period has started in previous step, or 2) the grace period started at current step
        new_df['uni_prob'] = np.where(pool.loc[pool[time_name] == t - 1, 'counts'] > 0, 1 / (nperiod + 1 - pool.loc[pool[time_name] == t - 1, 'counts']),
                                      np.where(cond_initiation, 1 / (nperiod + 1 - pool.loc[pool[time_name] == t - 1, 'counts']), 0))

        # get the treatment value according to the uniform probability
        new_df[int_var] = np.where((pool.loc[pool[time_name] == t - 1, 'counts'] > 0) | cond_initiation, new_df['uni_prob'].apply(sample), 0)

        # treatment is initiated by the end of the grace period
        if t >= nperiod:
            previous_pool_data = pool[pool[time_name] == t - nperiod]
            masks = []
            for cond_var, condition in conditions.items():
                mask = previous_pool_data[cond_var].apply(condition)
                masks.append(mask)
            pre_cond_initiation = reduce(operator.and_, masks)
            new_df[int_var] = np.where(pre_cond_initiation, 1, new_df[int_var])

        # treatment is set to 1 once it is initiated
        new_df[int_var] = np.where(pool.loc[pool[time_name] == t - 1, int_var] == 1, 1, new_df[int_var])

        # update current counts according to current treatment value
        # NOTE(review): unlike the t == 0 branch, 'counts' is not copied into pool here - presumably the caller
        # carries new_df over into pool afterwards; verify.
        new_df['counts'] = np.where((pool.loc[pool[time_name] == t - 1, 'counts'] > 0) & (new_df[int_var] == 0), pool.loc[pool[time_name] == t - 1, 'counts'] + 1,
                               np.where(cond_initiation & (new_df[int_var] == 0), pool.loc[pool[time_name] == t - 1, 'counts'] + 1, 0))
240 | 
241 | 
def intervention_func(new_df, pool, intervention, time_name, t):

    """
    Internal dispatcher that applies every treatment-specific part of one user-specified intervention
    to the data at time t during simulation.

    Parameters
    ----------
    new_df: DataFrame
        A DataFrame that contains the observed or simulated data at time t. It is passed on to each
        intervention function that is called.

    pool: DataFrame
        A DataFrame that contains the observed or simulated data up to time t. It is passed on to each
        intervention function that is called.

    intervention: List
        If this argument compares equal to ``natural``, nothing is done. Otherwise it is a list of lists,
        one inner list per intervened treatment. The first entry of an inner list is the treatment name and
        the second entry is the intervention function. The remaining entries depend on the function:

        - ``static`` / ``threshold``: the third entry holds the values handed to the function, an optional
          fourth entry holds the time points at which the intervention is applied.
        - ``natural_grace_period`` / ``uniform_grace_period``: the third entry is a pair
          (nperiod, conditions), an optional fourth entry holds the time points.
        - any other (dynamic or custom) function: an optional third entry holds the time points.

        When the time points are omitted the intervention is applied at every time.

    time_name: Str
        A string specifying the name of the time variable in obs_data.

    t: Int
        An integer indicating the current time index to be intervened.

    Returns
    -------
    Nothing is returned.

    """

    if intervention == natural:
        return

    for spec in intervention:
        int_var, int_func = spec[0], spec[1]

        # Work out which extra positional arguments the function expects and at which
        # position of the inner list the optional time points would be stored.
        if int_func == static or int_func == threshold:
            extra_args = (spec[2],)
            times_pos = 3
        elif int_func == natural_grace_period or int_func == uniform_grace_period:
            extra_args = (spec[2][0], spec[2][1])
            times_pos = 3
        else:  # dynamic or custom intervention
            extra_args = ()
            times_pos = 2

        if len(spec) == times_pos:
            # no int_times specified, intervene on all times
            applies_now = True
        else:
            # intervene only on the specified int_times
            applies_now = t in spec[times_pos]

        if applies_now:
            int_func(new_df, pool, int_var, *extra_args, time_name, t)
315 | 
316 | 
317 | 
318 | 
319 | 


--------------------------------------------------------------------------------
/pygformula/parametric_gformula/__init__.py:
--------------------------------------------------------------------------------
1 | from .parametric_gformula import ParametricGformula
2 | 
3 | 


--------------------------------------------------------------------------------
/pygformula/parametric_gformula/bootstrap.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import warnings
  4 | from lifelines import CoxPHFitter
  5 | from .histories import update_precoded_history, update_custom_history
  6 | from .simulate import simulate
  7 | from .fit import fit_covariate_model, fit_ymodel, fit_compevent_model
  8 | from ..utils.helper import hr_data_helper, hr_comp_data_helper
  9 | 
 10 | 
 11 | def Bootstrap(obs_data, boot_id, boot_seeds, int_descript, intervention_dicts, covnames,
 12 |               basecovs, cov_hist, time_points, n_simul, time_name, id, custom_histvars, custom_histories,
 13 |               covmodels, hazardratio, intcomp, covtypes, covfits_custom, covpredict_custom,
 14 |               ymodel_fit_custom, ymodel_predict_custom,
 15 |               ymodel, outcome_type, outcome_name, competing, compevent_name, compevent_model, compevent_cens,
 16 |               boot_diag, trunc_params, visit_names, visit_covs, ts_visit_names, max_visits, time_thresholds,
 17 |               below_zero_indicator, baselags, restrictions, yrestrictions, compevent_restrictions, sim_trunc):
 18 |     """
 19 |     This is an internal function to get the results of parametric g-formula for each bootstrap sample.
 20 | 
 21 |     Parameters
 22 |     ----------
 23 |     obs_data: DataFrame
 24 |         A data frame containing the observed data.
 25 | 
 26 |     boot_id: Int
 27 |         An integer indicating the id of the bootstrap sample.
 28 | 
 29 |     boot_seeds: List
 30 |         A list that stores the random seeds of all bootstrap samples.
 31 | 
 32 |     int_descript: List
 33 |         A list of strings, each of which describes a user-specified intervention.
 34 | 
 35 |     intervention_dicts: Dict
 36 |         A dictionary whose key is the intervention decription and the value is the intervention list for all treatment
 37 |         variables in this intervention.
 38 | 
 39 |     covnames: List
 40 |         A list of strings specifying the names of the time-varying covariates in obs_data.
 41 | 
 42 |     basecovs: List
 43 |         A list of strings specifying the names of baseline covariates in obs_data. These covariates should not be
 44 |         included in covnames.
 45 | 
 46 |     cov_hist: Dict
 47 |         A dictionary whose keys are covariate names and values are sub-dictionaries with historical information for
 48 |         covariates. Each sub-dictionaty contains keys 'lagged', 'cumavg' and 'lagavg', the corresponding value for the
 49 |         key 'lagged' is a two-element list where the first element is a list with all lagged terms, the second element
 50 |         is a list with the corresponding lagged numbers. Same for the key 'lagavg'. The corresponding value for the key
 51 |         'cumavg' is a list with all cumavg terms.
 52 | 
 53 |     time_points: Int
 54 |         An integer indicating the number of time points to simulate. It is set equal to the maximum number of records (K)
 55 |         that obs_data contains for any individual plus 1, if not specified by users.
 56 | 
 57 |     n_simul: Int
 58 |         An integer indicating the number of subjects for whom to simulate data. It is set equal to the number (M) of
 59 |         subjects in obs_data, if not specified by users.
 60 | 
 61 |     time_name: Str
 62 |         A string specifying the name of the time variable in obs_data.
 63 | 
 64 |     id: Str
 65 |         A string specifying the name of the id variable in obs_data.
 66 | 
 67 |     custom_histvars: List
 68 |         A list of strings, each of which specifies the names of the time-varying covariates with user-specified custom histories.
 69 | 
 70 |     custom_histories: List
 71 |         A list of functions, each function is the user-specified custom history functions for covariates. The list must
 72 |         be the same length as custom_histvars and in the same order.
 73 | 
 74 |     covmodels: List
 75 |         A list of strings, where each string is the model statement of the time-varying covariate. The list must be the
 76 |         same length as covnames and in the same order. If a model is not required for a certain covariate, it should be
 77 |         set to 'NA' at that index.
 78 | 
 79 |     hazardratio: Bool
 80 |         A boolean value indicating whether to calculate the hazard ratio of the two compared interventions.
 81 | 
 82 |     intcomp: List
 83 |         A list of two numbers indicating a pair of interventions to be compared by a hazard ratio.
 84 | 
 85 |     covtypes: List
 86 |         A list of strings specifying the “type” of each time-varying covariate included in covnames. The supported types:
 87 |         "binary", "normal", "categorical", "bounded normal", "zero-inflated normal", "truncated normal", "absorbing",
 88 |         "categorical time", "square time" and "custom". The list must be the same length as covnames and in the same order.
 89 | 
 90 |     covfits_custom: List
 91 |         A list, each element could be 'NA' or a user-specified fit function. The non-NA value is set
 92 |         for the covariates with custom type. The 'NA' value is set for other covariates. The list must be the
 93 |         same length as covnames and in the same order.
 94 | 
 95 |     covpredict_custom: List
 96 |         A list, each element could be 'NA' or a user-specified predict function. The non-NA value is set
 97 |         for the covariates with custom type. The 'NA' value is set for other covariates. The list must be the
 98 |         same length as covnames and in the same order.
 99 | 
100 |     ymodel_fit_custom: Function
101 |         A user-specified fit function for the outcome variable.
102 | 
103 |     ymodel_predict_custom: Function
104 |         A user-specified predict function for the outcome variable.
105 | 
106 |     ymodel: Str
107 |         A string specifying the model statement for the outcome variable.
108 | 
109 |     outcome_type: Str
110 |         A string specifying the "type" of outcome. The possible "types" are: "survival", "continuous_eof", and "binary_eof".
111 | 
112 |     outcome_name: Str
113 |         A string specifying the name of the outcome variable in obs_data.
114 | 
115 |     competing: Bool
116 |         A boolean value indicating if there is a competing event in obs_data.
117 | 
118 |     compevent_name: Str
119 |         A string specifying the name of the competing event variable in obs_data. Only applicable for survival outcomes.
120 | 
121 |     compevent_model: Str
122 |         A string specifying the model statement for the competing event variable. Only applicable for survival outcomes.
123 | 
124 |     compevent_cens: Bool
125 |         A boolean value indicating whether to treat competing events as censoring events.
126 | 
127 |     boot_diag: Bool
128 |         A boolean value indicating whether to return the parametric g-formula estimates as well as the coefficients,
129 |         standard errors, and variance-covariance matrices of the parameters of the fitted models in the bootstrap samples.
130 | 
131 |     trunc_params: List
132 |         A list, each element could be 'NA' or a two-element list. If not 'NA', the first element specifies the truncated
133 |         value and the second element specifies the truncated direction (‘left’ or ‘right’). The non-NA value is set
134 |         for the truncated normal covariates. The 'NA' value is set for other covariates. The list should be the same
135 |         length as covnames and in the same order.
136 | 
137 |     visit_names: List
138 |         A list, each of which is a string specifying the covariate name of a visit process.
139 | 
140 |     visit_covs: List
141 |         A list of strings, each of which specifies the name of a covariate whose modeling depends on the visit process.
142 | 
143 |     ts_visit_names: List
144 |         A list of strings, each of which indicates the number of consecutive missed visits for one covariate before an
145 |         individual is censored.
146 | 
147 |     max_visits: List
148 |         A list of integers, each integer indicates the maximum number of consecutive missed visits for one covariate that
149 |         has a visit process.
150 | 
151 |     time_thresholds: List
152 |         A list of integers that splits the time points into different intervals. It is used to create the variable
153 |         "categorical time".
154 | 
155 |     below_zero_indicator: Bool
156 |         A boolean value indicating if the obs_data contains pre-baseline times.
157 | 
158 |     baselags: Bool
159 |         A boolean value specifying the convention used for lagi and lag_cumavgi terms in the model statements when
160 |         pre-baseline times are not included in obs_data and when the current time index, t, is such that t < i. If this
161 |         argument is set to False, the value of all lagi and lag_cumavgi terms in this context are set to 0 (for
162 |         non-categorical covariates) or the reference level (for categorical covariates). If this argument is set to
163 |         True, the value of lagi and lag_cumavgi terms are set to their values at time 0. The default is False.
164 | 
165 |     restrictions: List
166 |         List of lists. Each inner list contains its first entry the covariate name of that its deterministic knowledge
167 |         is known; its second entry is a dictionary whose key is the conditions which should be True when the covariate
168 |         is modeled, the third entry is the value that is set to the covariate during simulation when the conditions
169 |         in the second entry are not True.
170 | 
171 |     yrestrictions: List
172 |         List of lists. For each inner list, its first entry is a dictionary whose key is the conditions which
173 |         should be True when the outcome is modeled, the second entry is the value that is set to the outcome during
174 |         simulation when the conditions in the first entry are not True.
175 | 
176 |     compevent_restrictions: List
177 |         List of lists. For each inner list, its first entry is a dictionary whose key is the conditions which
178 |         should be True when the competing event is modeled, the second entry is the value that is set to the competing
179 |         event during simulation when the conditions in the first entry are not True. Only applicable for survival outcomes.
180 | 
181 |     sim_trunc: Bool
182 |         A boolean value indicating if the simulated values of normal covariates are truncated by the observed ranges.
183 | 
184 |     Returns
185 |     -------
186 |     boot_results_dict: Dict
187 |         A dictionary contains the 'boot_results', 'bootcoeffs', 'bootstderrs', 'bootvcovs' and 'boot_hr' for a bootstrap sample.
188 | 
189 |     """
190 |     try:
191 |         np.random.seed(boot_seeds[boot_id])
192 | 
193 |         data_list = dict(list(obs_data.groupby(id, group_keys=False)))
194 |         ids = np.unique(obs_data[id])
195 |         new_ids = np.random.choice(ids, len(ids), replace=True)
196 | 
197 |         new_df = []
198 |         for index, new_id in enumerate(new_ids):
199 |             new_id_df = data_list[new_id].copy()
200 |             new_id_df[id] = index
201 |             new_df.append(new_id_df)
202 |         resample_data = pd.concat(new_df, ignore_index=True)
203 | 
204 |         update_precoded_history(pool=resample_data, covnames=covnames, cov_hist=cov_hist, covtypes=covtypes,
205 |                                 time_name=time_name, id=id, below_zero_indicator=below_zero_indicator,
206 |                                 baselags=baselags, ts_visit_names = ts_visit_names)
207 |         if custom_histvars is not None:
208 |             for t in range(time_points):
209 |                 update_custom_history(resample_data, custom_histvars, custom_histories, time_name, t, id)
210 | 
211 |         covariate_fits, bounds, rmses, cov_model_coeffs, cov_model_stderrs, cov_model_vcovs, cov_model_fits_summary = \
212 |             fit_covariate_model(covmodels=covmodels, covnames=covnames, covtypes=covtypes,
213 |                                 covfits_custom=covfits_custom, time_name=time_name, obs_data=resample_data,
214 |                                 return_fits=boot_diag, trunc_params=trunc_params, visit_names=visit_names,
215 |                                 max_visits=max_visits, ts_visit_names=ts_visit_names,
216 |                                 visit_covs=visit_covs, restrictions=restrictions)
217 | 
218 |         outcome_fit, ymodel_coeffs, ymodel_stderrs, ymodel_vcovs, ymodel_fits_summary = \
219 |             fit_ymodel(ymodel=ymodel, outcome_type=outcome_type, outcome_name=outcome_name,
220 |                               ymodel_fit_custom=ymodel_fit_custom, time_name=time_name, obs_data=resample_data,
221 |                               competing=competing, compevent_name=compevent_name, return_fits=boot_diag,
222 |                               yrestrictions=yrestrictions)
223 | 
224 |         model_coeffs = {**cov_model_coeffs, **ymodel_coeffs}
225 |         model_stderrs = {**cov_model_stderrs, **ymodel_stderrs}
226 |         model_vcovs = {**cov_model_vcovs, **ymodel_vcovs}
227 |         model_fits_summary = {**cov_model_fits_summary, **ymodel_fits_summary}
228 | 
229 |         if competing:
230 |             compevent_fit, comp_model_coeffs, comp_model_stderrs, comp_model_vcovs, comp_model_fits_summary = \
231 |                 fit_compevent_model(compevent_model=compevent_model, compevent_name=compevent_name,
232 |                                     time_name=time_name, obs_data=resample_data, return_fits=boot_diag,
233 |                                     compevent_restrictions=compevent_restrictions)
234 |             model_coeffs.update(comp_model_coeffs)
235 |             model_stderrs.update(comp_model_stderrs)
236 |             model_vcovs.update(comp_model_vcovs)
237 |             model_fits_summary.update(comp_model_fits_summary)
238 |         else:
239 |             compevent_fit = None
240 | 
241 |         if n_simul != len(np.unique(resample_data[id])):
242 |             data_list = dict(list(obs_data.groupby(id, group_keys=True)))
243 |             ids = np.unique(obs_data[id])
244 |             new_ids = np.random.choice(ids, n_simul, replace=True)
245 | 
246 |             new_df = []
247 |             for index, new_id in enumerate(new_ids):
248 |                 new_id_df = data_list[new_id].copy()
249 |                 new_id_df[id] = index
250 |                 new_df.append(new_id_df)
251 |             resample_data = pd.concat(new_df, ignore_index=True)
252 | 
253 |         boot_results = []
254 |         boot_pools = []
255 |         for intervention_name in int_descript:
256 |             boot_result = simulate(seed=boot_seeds[boot_id], time_points=time_points, time_name=time_name,
257 |                                        id=id, covnames=covnames, basecovs=basecovs,
258 |                                        covmodels=covmodels,  covtypes=covtypes, cov_hist=cov_hist,
259 |                                        covariate_fits=covariate_fits, rmses=rmses, bounds=bounds, outcome_type=outcome_type,
260 |                                        obs_data=resample_data,
261 |                                        intervention=intervention_dicts[intervention_name],
262 |                                        custom_histvars = custom_histvars, custom_histories=custom_histories,
263 |                                        covpredict_custom=covpredict_custom, ymodel=ymodel,
264 |                                        ymodel_predict_custom=ymodel_predict_custom,
265 |                                        outcome_fit=outcome_fit, outcome_name=outcome_name,
266 |                                        competing=competing, compevent_name=compevent_name,
267 |                                        compevent_fit=compevent_fit, compevent_model=compevent_model,
268 |                                        compevent_cens=compevent_cens, trunc_params=trunc_params, visit_names=visit_names,
269 |                                        visit_covs=visit_covs, ts_visit_names=ts_visit_names,
270 |                                        max_visits=max_visits, time_thresholds=time_thresholds,
271 |                                        baselags=baselags, below_zero_indicator=below_zero_indicator,
272 |                                        restrictions=restrictions, yrestrictions=yrestrictions,
273 |                                        compevent_restrictions=compevent_restrictions, sim_trunc=sim_trunc
274 |                                    )
275 |             boot_results.append(boot_result['g_result'])
276 |             boot_pools.append(boot_result['pool'])
277 | 
278 |         boot_results_dict = {'boot_results': boot_results, 'bootcoeffs': model_coeffs, 'bootstderrs': model_stderrs,
279 |                              'bootvcovs': model_vcovs}
280 | 
281 |         if hazardratio:
282 |             pool1 = boot_pools[intcomp[0]]
283 |             pool2 = boot_pools[intcomp[1]]
284 | 
285 |             if competing and not compevent_cens:
286 |                 import cmprsk.cmprsk as cmprsk
287 | 
288 |                 new_pool1 = pool1.groupby(id, group_keys=False).apply(hr_comp_data_helper,
289 |                             outcome_name=outcome_name, compevent_name=compevent_name)
290 |                 new_pool2 = pool2.groupby(id, group_keys=False).apply(hr_comp_data_helper,
291 |                             outcome_name=outcome_name, compevent_name=compevent_name)
292 |                 new_pool1['regime'] = 0
293 |                 new_pool2['regime'] = 1
294 |                 concat_data = pd.concat([new_pool1, new_pool2])
295 |                 concat_data = concat_data[[time_name, outcome_name, compevent_name, 'regime']]
296 |                 concat_data = concat_data.reset_index(drop=True)
297 |                 concat_data['event'] = np.where(concat_data[compevent_name] == 1, 2,
298 |                                                 concat_data[outcome_name]).tolist()
299 |                 ftime = concat_data[time_name]
300 |                 fstatus = concat_data['event']
301 |                 crr_res = cmprsk.crr(failure_time=ftime, failure_status=fstatus, static_covariates=concat_data[['regime']])
302 |                 hazard_ratio = crr_res.hazard_ratio()[0][0]
303 |             else:
304 |                 new_pool1 = pool1.groupby(id, group_keys=False).apply(hr_data_helper, outcome_name=outcome_name)
305 |                 new_pool2 = pool2.groupby(id, group_keys=False).apply(hr_data_helper, outcome_name=outcome_name)
306 |                 new_pool1['regime'] = 0
307 |                 new_pool2['regime'] = 1
308 |                 concat_data = pd.concat([new_pool1, new_pool2])
309 |                 concat_data = concat_data[[time_name, outcome_name, 'regime']]
310 |                 cph = CoxPHFitter()
311 |                 cph.fit(concat_data, duration_col=time_name, event_col=outcome_name)
312 |                 hazard_ratio = cph.hazard_ratios_.values[0]
313 | 
314 |             boot_results_dict['boot_hr'] = hazard_ratio
315 | 
316 |     except Exception as e:
317 |         warnings.warn("An error occurred at bootstrap sample {0}: {1}. "
318 |                       "The analysis should likely be repeated with more parsimonious models.".format(boot_id, e))
319 |         boot_results_dict = {'boot_results': None, 'bootcoeffs': None, 'bootstderrs': None, 'bootvcovs': None}
320 | 
321 |     return boot_results_dict
322 | 
323 | 
324 | 


--------------------------------------------------------------------------------
/pygformula/parametric_gformula/histories.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | 
  5 | def update_precoded_history(pool, covnames, cov_hist, covtypes, time_name, id, below_zero_indicator, baselags,
  6 |                    ts_visit_names=None):
  7 |     """
  8 |     This internal function is used to add new columns to the original pool for the three precoded historical terms (the
  9 |     lagged term, cumavg term, and lagavg term) in the model statement.
 10 | 
 11 |     Parameters
 12 |     ----------
 13 |     pool : DataFrame
 14 |         A DataFrame that contains the observed or simulated data up to the maximum time step of the data table.
 15 |         The historical terms at all time steps in the data table are to be updated.
 16 | 
 17 |     covnames : List
 18 |         A list of strings specifying the names of the time-varying covariates.
 19 | 
 20 |     cov_hist : Dict
 21 |         A dictionary whose keys are covariate names and values are sub-dictionaries with historical information for
 22 |         covariates. Each sub-dictionaty contains keys 'lagged', 'cumavg' and 'lagavg', the corresponding value for the
 23 |         key 'lagged' is a two-element list where the first element is a list with all lagged terms, the second element
 24 |         is a list with the corresponding lagged numbers. Same for the key 'lagavg'. The corresponding value for the key
 25 |         'cumavg' is a list with all cumavg terms.
 26 | 
 27 |     covtypes : List
 28 |         A list of strings specifying the type of each time-varying covariate included in covnames. The list must be
 29 |         the same length as covnames and in the same order. The supported types: “binary”, “normal”, “categorical”,
 30 |         “bounded normal”, “zero-inflated normal”, “truncated normal”, “absorbing”, “categorical time”, "square time"
 31 |         and "custom".
 32 | 
 33 |     time_name : Str
 34 |         A string specifying the name of the time variable in obs_data.
 35 | 
 36 |     id : Str
 37 |         A string specifying the name of the id variable in obs_data.
 38 | 
 39 |     below_zero_indicator : Bool
 40 |         A boolean variable indicating if the obs_data contains pre-baseline times.
 41 | 
 42 |     baselags : Bool
 43 |         A boolean value specifying the convention used for lagi and lag_cumavgi terms in the model statements when
 44 |         pre-baseline times are not included in obs_data and when the current time index, t, is such that t < i. If this
 45 |         argument is set to False, the value of all lagi and lag_cumavgi terms in this context are set to 0 (for
 46 |         non-categorical covariates) or the reference level (for categorical covariates). If this argument is set to
 47 |         True, the value of lagi and lag_cumavgi terms are set to their values at time 0.
 48 | 
 49 |     ts_visit_names : List
 50 |         A list of strings, each of which indicates the number of consecutive missed visits for one covariate before an
 51 |         individual is censored.
 52 | 
 53 |     Returns
 54 |     -------
 55 |     None : The original input pool has been updated and nothing is returned.
 56 | 
 57 |     """
 58 | 
 59 |     if ts_visit_names:
 60 |         covnames = covnames + ts_visit_names
 61 | 
 62 |     for k, cov in enumerate(covnames):
 63 |         if ts_visit_names is not None:
 64 |             cov_type = covtypes[k] if cov not in ts_visit_names else None
 65 |         else:
 66 |             cov_type = covtypes[k]
 67 | 
 68 |         lagged_covs = cov_hist[cov]['lagged'][0]
 69 |         lagged_nums = cov_hist[cov]['lagged'][1]
 70 |         if len(lagged_covs) > 0:  # create lag variable
 71 |             for i, lagged_cov in enumerate(lagged_covs):
 72 |                 if cov_type == 'categorical':
 73 |                     if below_zero_indicator:
 74 |                         pool[lagged_cov] = np.array(pool.groupby([id])[cov].shift(lagged_nums[i]))
 75 |                     else:
 76 |                         fill_values = pool.groupby([id])[cov].transform('first') if baselags else \
 77 |                             pd.Categorical(pool[cov]).categories[0]
 78 |                         pool[lagged_cov] = np.where(pool[time_name] >= lagged_nums[i],
 79 |                                                         pool.groupby([id])[cov].shift(lagged_nums[i]), fill_values)
 80 |                     pool[lagged_cov] = pd.Categorical(pool[lagged_cov])
 81 |                 else:
 82 |                     if below_zero_indicator:
 83 |                         pool[lagged_cov] = np.array(pool.groupby([id])[cov].shift(lagged_nums[i]))
 84 |                     else:
 85 |                         fill_values = pool.groupby(id)[cov].transform('first') if baselags else 0
 86 |                         pool[lagged_cov] = np.where(pool[time_name] >= lagged_nums[i],
 87 |                                                     pool.groupby([id])[cov].shift(lagged_nums[i]), fill_values)
 88 | 
 89 |         if len(cov_hist[cov]['cumavg']) > 0:  # create cumavg variable
 90 |             pool['_'.join(['cumavg', str(cov)])] = np.array(pool.groupby([id])[cov].expanding().mean())
 91 | 
 92 |         lagavg_covs = cov_hist[cov]['lagavg'][0]
 93 |         lagavg_nums = cov_hist[cov]['lagavg'][1]
 94 |         if len(lagavg_covs) > 0:  # create lagavg variable
 95 |             if len(cov_hist[cov]['cumavg']) == 0:  # if cumavg variable has not been created yet, create cumavg variable
 96 |                 pool['_'.join(['cumavg', str(cov)])] = np.array(pool.groupby([id])[cov].expanding().mean())
 97 | 
 98 |             for i, lagavg_cov in enumerate(lagavg_covs):
 99 |                 if below_zero_indicator:
100 |                     pool[lagavg_cov] = np.array(pool.groupby([id])['_'.join(['cumavg', str(cov)])].shift(lagavg_nums[i]))
101 |                 else:
102 |                     fill_values = pool.groupby(id)[cov].transform('first') if baselags else 0
103 |                     pool[lagavg_cov] = np.where(pool[time_name] >= lagavg_nums[i],
104 |                                            pool.groupby([id])['_'.join(['cumavg', str(cov)])].shift(lagavg_nums[i]), fill_values)
105 | 
106 | 
def ave_last3(pool, histvar, time_name, t, id):
    """
    Example history function: for every individual, store at time t the average of the covariate over
    the three most recent time points (or over times 0..t when t is smaller than 3).

    Parameters
    ----------
    pool : DataFrame
        A DataFrame that contains the observed or simulated data up to time t. The rows with time t receive
        the new value in the column named 'ave_last3_' followed by the covariate name.

    histvar : Str
        A string that specifies the name of the variable for which the history function is to be applied.

    time_name : Str
        A string specifying the name of the time variable in pool.

    t : Int
         An integer specifying the current time index.

    id : Str
        A string specifying the name of the id variable in the obs_data.

    Returns
    -------
    None : The original input pool has been updated and nothing is returned.

    """
    def window_mean(records):
        times = records[time_name]
        if t < 3:
            in_window = (times >= 0) & (times <= t)
        else:
            in_window = (times > t - 3) & (times <= t)
        return np.mean(records[in_window][histvar])

    # only individuals whose records reach time t contribute a value
    reaches_t = pool.groupby(id).filter(lambda records: max(records[time_name]) >= t)
    averages = list(reaches_t.groupby(id).apply(window_mean))

    target_column = 'ave_last3_' + str(histvar)
    pool.loc[pool[time_name] == t, target_column] = averages
145 | 
146 | 
def update_custom_history(pool, histvars, histories, time_name, t, id):
    """
    Internal function that calls every user-specified custom history function on its variable so that
    the corresponding history columns of the pool are updated.

    Parameters
    ----------
    pool :  DataFrame
        A DataFrame that contains the observed or simulated data up to time t. It is handed to each
        history function, which is expected to update the historical term at time t.

    histvars : List
        A list of strings, each of which specifies the name of the variable for which its custom history
        function is to be applied.

    histories : List
        A list of custom functions, each of which is applied to the variable with the same index in histvars.
        Each function is called with the keyword arguments pool, histvar, time_name, t and id.

    time_name : Str
        A string specifying the name of the time variable in obs_data.

    t : Int
         An integer specifying the current time index.

    id : Str
        A string specifying the name of the id variable in obs_data.

    Returns
    -------
    None : The original input pool has been updated and nothing is returned.

    """
    for position, histvar in enumerate(histvars):
        history_func = histories[position]
        history_func(pool=pool, histvar=histvar, time_name=time_name, t=t, id=id)
181 | 


--------------------------------------------------------------------------------
/pygformula/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalInference/pygformula/a94748658fe52f368f989d08952f1cedd433cc52/pygformula/utils/__init__.py


--------------------------------------------------------------------------------
/pygformula/utils/helper.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import numpy as np
  3 | 
  4 | 
  5 | def get_cov_hist_info(covnames, covmodels, covtypes, ymodel, compevent_model=None, censor_model=None,
  6 |                       visit_covs=None, ts_visit_names=None):
  7 |     """
  8 |     This is an internal function to get the lagged term and its number indicator, cumavg term, and lagavg term and its number
  9 |     indicator for each covariate from user-specified models.
 10 | 
 11 |     Parameters
 12 |     ----------
 13 |     covnames : List
 14 |         A list of strings specifying the names of the time-varying covariates in obs_data.
 15 | 
 16 |     covmodels : List
 17 |         A list of strings, where each string is the model statement of the time-varying covariate. The list must be the
 18 |         same length as covnames and in the same order. If a model is not required for a certain covariate, it should be
 19 |         set to 'NA' at that index.
 20 | 
 21 |     covtypes : List
 22 |         A list of strings specifying the “type” of each time-varying covariate included in covnames. The supported types:
 23 |         "binary", "normal", "categorical", "bounded normal", "zero-inflated normal", "truncated normal", "absorbing",
 24 |         "categorical time", "square time" and "custom". The list must be the same length as covnames and in the same order.
 25 | 
 26 |     ymodel : Str
 27 |         A string specifying the model statement for the outcome variable.
 28 | 
 29 |     compevent_model : Str, (default=None)
 30 |         A string specifying model statement for the competing event variable. Only applicable for survival outcomes.
 31 | 
 32 |     censor_model : Str, (default=None)
 33 |         A string specifying the model statement for the censoring variable. Only applicable when using inverse
 34 |         probability weights to estimate the natural course means / risk from the observed data.
 35 | 
 36 |     visit_covs : List, (default=None)
 37 |         A list of strings, each of which specifies the name of a covariate whose modeling depends on the visit process.
 38 | 
 39 |     ts_visit_names : List, (default=None)
 40 |         A list of strings, each of which indicates the number of consecutive missed visits for one covariate before an
 41 |         individual is censored. The list has the same length as visit_covs.
 42 | 
 43 |     Returns
 44 |     -------
 45 |     cov_hist_infos : Dict
 46 |         A dictionary whose keys are covariate names and values are sub-dictionaries with historical information for
 47 |         covariates. Each sub-dictionaty contains keys 'lagged', 'cumavg' and 'lagavg', the corresponding value for the
 48 |         key 'lagged' is a two-element list where the first element is a list with all lagged terms, the second element
 49 |         is a list with the corresponding lagged numbers. Same for the key 'lagavg'. The corresponding value for the key
 50 |         'cumavg' is a list with all cumavg terms.
 51 | 
 52 |     """
 53 | 
 54 |     all_variables = []
 55 |     for model in covmodels:
 56 |         all_variables.extend(re.split('[~|+]', model.replace(' ', '')))
 57 |     all_variables.extend(re.split('[~|+]', ymodel.replace(' ', '')))
 58 | 
 59 |     if compevent_model is not None:
 60 |         all_variables.extend(re.split('[~|+]', compevent_model.replace(' ', '')))
 61 |     if censor_model is not None:
 62 |         all_variables.extend(re.split('[~|+]', censor_model.replace(' ', '')))
 63 | 
 64 |     if ts_visit_names:
 65 |         covnames = covnames + ts_visit_names
 66 | 
 67 |     cov_hist_infos = {}
 68 |     for k, cov in enumerate(covnames):
 69 |         cov_list = np.unique([str_cov for str_cov in all_variables if cov in str_cov])
 70 |         if k < len(covtypes):
 71 |             if covtypes[k] == 'absorbing':
 72 |                 cov_list = np.append(cov_list, 'lag1_{0}'.format(cov))
 73 |         if visit_covs and cov in visit_covs:
 74 |             cov_list = np.append(cov_list, 'lag1_{0}'.format(cov))
 75 |         if ts_visit_names and cov in ts_visit_names:
 76 |             cov_list = np.append(cov_list, 'lag1_{0}'.format(cov))
 77 | 
 78 |         cov_hist = {}
 79 |         lagavg_variables, cumavg_variables, lagged_variables = [], [], []
 80 |         lagged_numbers, lagavg_numbers = [], []
 81 |         for item in cov_list:
 82 |             if 'lag' in item and 'lag_cumavg' not in item:
 83 |                 pattern = re.compile(r'lag\d+_{0}'.format(cov))
 84 |                 lag_names = pattern.findall(item)
 85 |                 for lag_name in lag_names:
 86 |                     lagged_variables.append(lag_name)
 87 |                     lagged_numbers.append(int(lag_name.split('_')[0].split('lag')[1]))
 88 | 
 89 |             if 'cumavg' in item and 'lag_cumavg' not in item:
 90 |                 if covtypes[k] == 'categorical' or covtypes[k] == 'categorical time':
 91 |                     raise ValueError('Cannot apply cumulative average function to categorical covariates.')
 92 |                 pattern = re.compile(r'cumavg_{0}'.format(cov))
 93 |                 cumavg_names = pattern.findall(item)
 94 |                 for cumavg_name in cumavg_names:
 95 |                     cumavg_variables.append(cumavg_name)
 96 | 
 97 |             if 'lag_cumavg' in item:
 98 |                 if covtypes[k] == 'categorical' or covtypes[k] == 'categorical time':
 99 |                     raise ValueError('Cannot apply lagged cumulative average function to categorical covariates.')
100 |                 pattern = re.compile(r'lag_cumavg\d+_{0}'.format(cov))
101 |                 lagavg_names = pattern.findall(item)
102 |                 for lagavg_name in lagavg_names:
103 |                     lagavg_variables.append(lagavg_name)
104 |                     lagavg_numbers.append(int(lagavg_name.split('_')[1].split('cumavg')[1]))
105 | 
106 |         cov_hist['lagged'] = [lagged_variables, lagged_numbers]
107 |         cov_hist['cumavg'] = cumavg_variables
108 |         cov_hist['lagavg'] = [lagavg_variables, lagavg_numbers]
109 |         cov_hist_infos[cov] = cov_hist
110 | 
111 |     return cov_hist_infos
112 | 
113 | 
def visit_func(df, time_name, visit_name, ts_visit_name):
    """
    Internal helper for the visit process: fills the column ts_visit_name of df with a running count of missed
    visits, walking through the time indices 1 .. max(time) in order.

    Parameters
    ----------
    df : DataFrame
        A pandas DataFrame of the input obs_data.

    time_name : Str
        A string specifying the name of the time variable in obs_data.

    visit_name : Str
        A string specifying the covariate name of a visit process.

    ts_visit_name : Str
        A string indicating the number of consecutive missed visits before an individual is censored.

    Returns
    -------
    df : DataFrame
        The same DataFrame, with the column ts_visit_name created.

    """

    def visited_at(time_index):
        # Visit indicator of the first row recorded at this time index.
        return df.loc[df[time_name] == time_index, visit_name].values[0] == 1

    def record(time_index, value):
        df.loc[df[time_name] == time_index, ts_visit_name] = value

    record(0, 0)
    extra_missed = 0
    for current in range(1, max(df[time_name]) + 1):
        if visited_at(current):
            record(current, 0)
            continue
        if visited_at(current - 1):
            # First miss right after a visit: the count restarts at 1.
            record(current, 1)
            extra_missed = 0
            continue
        # Another miss in a row: extend the running count.
        extra_missed += 1
        record(current, 1 + extra_missed)
    return df
152 | 
153 | 
def categorical_func(t, time_thresholds):
    """
    Map a time index to the index of the time category it falls into.

    Parameters
    ----------
    t : Int
        The time index to categorize.

    time_thresholds : List
        Upper bounds (inclusive) of the time categories, checked in the given order.

    Returns
    -------
    categorical_t : Int
        The position of the first threshold that is greater than or equal to t. When t exceeds every threshold,
        or when time_thresholds is empty, len(time_thresholds) is returned.

    """
    for index, threshold in enumerate(time_thresholds):
        if t <= threshold:
            return index
    # Past the last threshold (or no thresholds at all): the final, open-ended category.
    return len(time_thresholds)
162 | 
163 | 
def hr_data_helper(df, outcome_name):
    """
    Return the first row of df whose outcome_name value equals 1; if no row qualifies, return the last row.

    Parameters
    ----------
    df : DataFrame
        The rows to scan, in their existing order.

    outcome_name : Str
        Name of the column holding the outcome indicator.

    Returns
    -------
    record : Series
        The selected row as produced by DataFrame.iterrows.

    """
    for _, record in df.iterrows():
        if record[outcome_name] == 1:
            break
    # After the loop, record is either the row that triggered the break or the final row.
    return record
169 | 
170 | 
def hr_comp_data_helper(df, outcome_name, compevent_name):
    """
    Return the first row of df in which either the competing event or the outcome equals 1; if no row qualifies,
    return the last row.

    Parameters
    ----------
    df : DataFrame
        The rows to scan, in their existing order.

    outcome_name : Str
        Name of the column holding the outcome indicator.

    compevent_name : Str
        Name of the column holding the competing event indicator.

    Returns
    -------
    record : Series
        The selected row as produced by DataFrame.iterrows.

    """
    for _, record in df.iterrows():
        # The competing event is checked first, then the outcome.
        if record[compevent_name] == 1 or record[outcome_name] == 1:
            break
    return record


--------------------------------------------------------------------------------
/pygformula/version.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024 The President and Fellows of Harvard College
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | # of this software and associated documentation files (the "Software"), to deal
 5 | # in the Software without restriction, including without limitation the rights
 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | # copies of the Software, and to permit persons to whom the Software is
 8 | # furnished to do so, subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in all
11 | # copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | # SOFTWARE.
20 | 
21 | # Pygformula: a python implementation of the parametric g-formula
22 | # The pygformula 1.0 implements the non-iterative conditional expectation (NICE) algorithm of the g-formula with
23 | # parametric models for covariates, treatments and the outcome.
24 | 
25 | __version__ = '1.1.6'
26 | 


--------------------------------------------------------------------------------
/readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: ubuntu-22.04
 5 |   tools:
 6 |     python: "3.8"
 7 |   jobs:
 8 |     post_create_environment:
 9 |       - python -m pip install sphinx_rtd_theme
10 | 
11 | sphinx:
12 |   configuration: docs/source/conf.py
13 | 
14 | formats:
15 |    - pdf
16 |    - epub
17 | 
18 | python:
19 |    install:
20 |      - method: pip
21 |        path: .
22 |      - requirements: docs/requirements.txt
23 |      - requirements: requirements.txt
24 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | joblib>=1.2.0
 2 | lifelines>=0.27.4
 3 | matplotlib>=3.5.1
 4 | numpy>=1.22.0
 5 | pandas>=1.5.2
 6 | prettytable>=3.10.0
 7 | pytruncreg>=0.1.2
 8 | scipy>=1.10.0
 9 | seaborn>=0.11.2
10 | statsmodels>=0.14.0
11 | tqdm>=4.64.0
12 | PyQt5>=5.15.11
13 | 


--------------------------------------------------------------------------------
/running_examples/get_started_example.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_basicdata_nocomp()
time_name = 't0'
id = 'id'

# Time-varying covariates: names, types and model statements, index-aligned across the three lists.
covnames = ['L1', 'L2', 'A']
covtypes = ['binary', 'bounded normal', 'binary']
covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
           'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
           'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']

# L3 is passed as a baseline covariate.
basecovs = ['L3']

# Outcome column, its model statement and the outcome type.
outcome_name = 'Y'
ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
outcome_type = 'survival'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
# Labels for the interventions; presumably matched by position to Intervention1_A / Intervention2_A.
int_descript = ['Never treat', 'Always treat']


# Two static interventions on A: all zeros and all ones over time_points.
# NOTE(review): nsamples / parallel / ncores presumably control bootstrap sampling and its parallelism - confirm
# against the ParametricGformula documentation.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
             int_descript = int_descript,
             covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type,
             Intervention1_A = [static, np.zeros(time_points)],
             Intervention2_A = [static, np.ones(time_points)],
             nsamples=20, parallel=True, ncores=8,
             )
# Fit, then draw the natural-course and intervention plots.
g.fit()
g.plot_natural_course()
g.plot_interventions()


--------------------------------------------------------------------------------
/running_examples/test_absorbing_cov.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_absorbing_data
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_absorbing_data()
time_name = 't0'
id = 'id'

# L is declared with the 'absorbing' covariate type; A is binary. Lists are index-aligned.
covnames = ['L', 'A']
covtypes = ['absorbing', 'binary']
covmodels = ['L ~ lag1_L + lag1_A + t0',
              'A ~ lag1_A + L + t0']

# Outcome column, its model statement and the outcome type.
outcome_name = 'Y'
ymodel = 'Y ~ L + A + t0'
outcome_type = 'survival'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones over time_points.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
             int_descript = int_descript,
             covnames=covnames, covtypes=covtypes, covmodels=covmodels,
             Intervention1_A = [static, np.zeros(time_points)],
             Intervention2_A = [static, np.ones(time_points)],
             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
             )
g.fit()
30 | 


--------------------------------------------------------------------------------
/running_examples/test_binary_cov.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_basicdata_nocomp()
time_name = 't0'
id = 'id'

# Both time-varying covariates are declared binary; lists are index-aligned.
covnames = ['L1', 'A']
covtypes = ['binary', 'binary']
covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + L3 + t0',
           'A ~ lag1_A + L1 + lag_cumavg1_L1 + L3 + t0']

# L3 is passed as a baseline covariate.
basecovs = ['L3']

# Outcome column, its model statement and the outcome type.
outcome_name = 'Y'
ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0'
outcome_type = 'survival'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones over time_points.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
             int_descript = int_descript,
             Intervention1_A = [static, np.zeros(time_points)],
             Intervention2_A = [static, np.ones(time_points)],
             covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
             )
g.fit()


--------------------------------------------------------------------------------
/running_examples/test_binary_eof.py:
--------------------------------------------------------------------------------
 1 | from pygformula import ParametricGformula
 2 | from pygformula.interventions import threshold
 3 | from pygformula.data import load_binary_eof
 4 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_binary_eof()
time_name = 't0'
id = 'id'

# Time-varying covariates: names, types and model statements, index-aligned across the three lists.
covnames = ['L1', 'L2', 'A']
covtypes = ['binary', 'zero-inflated normal', 'normal']
covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + L3 + t0',
             'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
             'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']

# L3 is passed as a baseline covariate.
basecovs = ['L3']

# Outcome column, its model statement and the outcome type ('binary_eof').
outcome_name = 'Y'
ymodel = 'Y ~ L1 + A + lag1_A + lag1_L1 + L3 + t0'
outcome_type = 'binary_eof'

int_descript = ['Threshold intervention']

# A single threshold intervention on A with bounds [0.5, inf]; no time_points argument is passed here.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
             int_descript = int_descript,
             Intervention1_A = [threshold, [0.5, float('inf')]],
             covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
             )
g.fit()


--------------------------------------------------------------------------------
/running_examples/test_bounded_normal_cov.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_basicdata_nocomp()
time_name = 't0'
id = 'id'

# L2 is declared 'bounded normal'; A is binary. Lists are index-aligned.
covnames = ['L2', 'A']
covtypes = ['bounded normal', 'binary']
covmodels = ['L2 ~ lag1_A + lag_cumavg1_L2 + L3 + t0',
           'A ~ lag1_A + L2 + lag_cumavg1_L2 + L3 + t0']

# L3 is passed as a baseline covariate.
basecovs = ['L3']

# Outcome column, its model statement and the outcome type.
outcome_name = 'Y'
ymodel = 'Y ~ L2 + A + lag1_A + L3 + t0'
outcome_type = 'survival'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones over time_points.
# NOTE(review): intcomp=[1, 2] presumably selects interventions 1 and 2 for comparison - confirm against the
# ParametricGformula documentation.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
             int_descript = int_descript, intcomp=[1, 2],
             Intervention1_A = [static, np.zeros(time_points)],
             Intervention2_A = [static, np.ones(time_points)],
             covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
             )
g.fit()


--------------------------------------------------------------------------------
/running_examples/test_categorical_cov.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_categorical
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_categorical()
time_name = 't0'
id = 'id'

# L is declared categorical and enters the model statements wrapped in C(...); A is binary.
covnames = [ 'L', 'A']
covtypes = ['categorical', 'binary']
covmodels = [ 'L ~ C(lag1_L) + t0',
              'A ~ C(L) + C(lag1_L) + t0']

# Outcome column and its model statement.
outcome_name = 'Y'
ymodel = 'Y ~ C(lag1_L) + A'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones over time_points; the outcome type is given inline.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
               int_descript = int_descript,
               Intervention1_A = [static, np.zeros(time_points)],
               Intervention2_A = [static, np.ones(time_points)],
               covnames=covnames,  covtypes=covtypes, covmodels=covmodels, outcome_name=outcome_name,
               ymodel=ymodel, outcome_type='survival')
g.fit()
28 | 
29 | 


--------------------------------------------------------------------------------
/running_examples/test_categorical_time.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_basicdata_nocomp()
time_name = 't0'
id = 'id'

# 't0_f' is declared with the 'categorical time' type and has no model of its own ('NA'); the other models
# include it as C(t0_f). Lists are index-aligned.
covnames = ['L1', 'L2', 'A', 't0_f']
covtypes = ['binary', 'bounded normal', 'binary', 'categorical time']
covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + C(t0_f)',
           'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + C(t0_f)',
           'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + C(t0_f)',
           'NA']

# Thresholds passed as time_thresholds; presumably the cut points used to build the categorical time variable.
time_thresholds = [1, 3, 5]

# L3 is passed as a baseline covariate.
basecovs = ['L3']

# Outcome column, its model statement and the outcome type.
outcome_name = 'Y'
ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
outcome_type = 'survival'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones over time_points.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
             int_descript = int_descript, time_thresholds = time_thresholds,
             Intervention1_A = [static, np.zeros(time_points)],
             Intervention2_A = [static, np.ones(time_points)],
             covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type,
             )
g.fit()
36 | 


--------------------------------------------------------------------------------
/running_examples/test_censor.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_censor_data
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_censor_data()
time_name = 't0'
id = 'id'

# Time-varying covariates: names and types, index-aligned with covmodels below.
covnames = ['L', 'A']
covtypes = ['binary', 'normal']

covmodels = ['L ~ lag1_L + t0',
             'A ~ lag1_A + L + t0']

# Outcome column and its model statement.
outcome_name = 'Y'
ymodel = 'Y ~ A + L'

# Censoring column and its model statement.
censor_name = 'C'
censor_model = 'C ~ A + L'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones over time_points; the outcome type is given inline.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
             int_descript=int_descript,
             Intervention1_A = [static, np.zeros(time_points)],
             Intervention2_A = [static, np.ones(time_points)],
             censor_name= censor_name, censor_model=censor_model,
             covnames = covnames, covtypes = covtypes, covmodels = covmodels,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
g.fit()
33 | 


--------------------------------------------------------------------------------
/running_examples/test_comp_restrictions.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata
 5 | 
# Load the example data.
obs_data = load_basicdata()

# Time-varying covariates: names, types and model statements, index-aligned across the three lists.
covnames = ['L1', 'L2', 'A']
covtypes = ['binary', 'bounded normal', 'binary']
covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
             'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2  + L3 + t0',
             'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']

# Model statement for the outcome.
ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2'

# Column names for time, id and outcome; L3 is passed as a baseline covariate.
time_name = 't0'
id = 'id'
outcome_name = 'Y'
basecovs = ['L3']

# Competing event column, its model statement, and the compevent_cens flag (set to False here).
compevent_name = 'D'
compevent_model = 'D ~ A + L1 + L2 + L3 + t0'
compevent_cens = False

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']


# Each restriction pairs a {column: predicate} dict with a value.
# NOTE(review): the exact meaning of the predicate and the paired value is defined by ParametricGformula - confirm
# against its documentation.
compevent_restrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]]

# Two static interventions on A: all zeros and all ones over time_points; the outcome type is given inline.
g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points, time_name=time_name,
                  int_descript = int_descript,
                  Intervention1_A = [static, np.zeros(time_points)],
                  Intervention2_A = [static, np.ones(time_points)],
                  basecovs =basecovs, covnames=covnames,  covtypes=covtypes, covmodels=covmodels,
                  compevent_restrictions = compevent_restrictions,
                  compevent_cens= compevent_cens, compevent_name = compevent_name, compevent_model=compevent_model,
                  outcome_name=outcome_name, outcome_type='survival', ymodel=ymodel)
g.fit()


--------------------------------------------------------------------------------
/running_examples/test_competing_event.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata
 5 | 
# Load the example data.
obs_data = load_basicdata()

# Time-varying covariates: names, types and model statements, index-aligned across the three lists.
covnames = ['L1', 'L2', 'A']
covtypes = ['binary', 'bounded normal', 'binary']
covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
             'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2  + L3 + t0',
             'A ~ lag1_A + L1 + L2 +lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']

# Model statement for the outcome.
ymodel = 'Y ~ A + L1 + L2 + L3 + lag1_A + lag1_L1 + lag1_L2'

# Column names for time, id and outcome; L3 is passed as a baseline covariate.
time_name = 't0'
id = 'id'
outcome_name = 'Y'
basecovs = ['L3']

# Competing event column and its model statement.
compevent_name = 'D'
compevent_model = 'D ~ A + L1 + L2 + L3 + t0'

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones over time_points; the outcome type is given inline.
g = ParametricGformula(obs_data = obs_data, id = id, time_points = time_points, time_name=time_name,
                  int_descript = int_descript,
                  Intervention1_A = [static, np.zeros(time_points)],
                  Intervention2_A = [static, np.ones(time_points)],
                  basecovs =basecovs, covnames=covnames,  covtypes=covtypes, covmodels=covmodels,
                  compevent_name = compevent_name, compevent_model=compevent_model,
                  outcome_name=outcome_name, outcome_type='survival', ymodel=ymodel)
g.fit()
35 | 


--------------------------------------------------------------------------------
/running_examples/test_continuous_eof.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_continuous_eof
 5 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_continuous_eof()
time_name = 't0'
id = 'id'

# L1 is declared categorical and enters the model statements wrapped in C(...). Lists are index-aligned.
covnames = ['L1', 'L2', 'A']
covtypes = ['categorical', 'normal', 'binary']
covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0',
             'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0',
              'A ~ C(L1) + L2 + t0']

# L3 is passed as a baseline covariate.
basecovs = ['L3']

# Outcome column, its model statement and the outcome type ('continuous_eof').
outcome_name = 'Y'
ymodel = 'Y ~ C(L1) + L2 + A'
outcome_type = 'continuous_eof'

# Largest observed value of the time column plus one; used below only to size the intervention arrays.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']

# Two static interventions on A: all zeros and all ones; no time_points argument is passed to the constructor.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name,
             int_descript=int_descript,
             Intervention1_A = [static, np.zeros(time_points)],
             Intervention2_A = [static, np.ones(time_points)],
             covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
             )
g.fit()
33 | 


--------------------------------------------------------------------------------
/running_examples/test_custom_ymodel.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import re
 3 | from sklearn.ensemble import RandomForestRegressor
 4 | 
 5 | from pygformula.interventions import static
 6 | from pygformula import ParametricGformula
 7 | from pygformula.data import load_continuous_eof
 8 | 
# Load the example data.
obs_data = load_continuous_eof()

# Columns that hold the time index and the individual id.
time_name = 't0'
id = 'id'

# L1 is declared categorical and enters the model statements wrapped in C(...). Lists are index-aligned.
covnames = ['L1', 'L2', 'A']
covtypes = ['categorical', 'normal', 'binary']
covmodels = ['L1 ~ C(lag1_L1) + lag1_L2 + t0',
             'L2 ~ lag1_L2 + C(lag1_L1) + lag1_A + t0',
              'A ~ C(L1) + L2 + t0']

# NOTE(review): this variable is not used below; the constructor call passes basecovs=['L3'] as a literal.
basecovs = ['L3']

outcome_name = 'Y'

# The outcome model statement; the custom fit/predict functions below parse it into target and feature names.
ymodel = 'Y ~ lag1_L2 + L2 + lag1_A + A'

# define interventions
# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
int_descript = ['Never treat', 'Always treat']
29 | 
30 | 
def ymodel_fit_custom(ymodel, fit_data):
    """
    Fit a random forest regressor for the outcome using the variables named in the model statement.

    Parameters
    ----------
    ymodel : Str
        Model statement of the form 'Y ~ x1 + x2 + ...'; spaces are ignored.

    fit_data : DataFrame
        Data containing the outcome column and every feature column named in ymodel.

    Returns
    -------
    fit_rf : RandomForestRegressor
        The regressor fitted on the feature columns against the outcome column.

    """
    # Split 'Y ~ a + b' into the target name and the list of feature names with plain string methods.
    y_name, x_part = ymodel.replace(' ', '').split('~')
    x_name = x_part.split('+')
    # get feature and target data to fit ymodel
    y = fit_data[y_name].to_numpy()
    X = fit_data[x_name].to_numpy()
    fit_rf = RandomForestRegressor()
    fit_rf.fit(X, y)
    return fit_rf
40 | 
def ymodel_predict_custom(ymodel, new_df, fit):
    """
    Predict the outcome with a previously fitted model, using the feature columns named in the model statement.

    Parameters
    ----------
    ymodel : Str
        Model statement of the form 'Y ~ x1 + x2 + ...'; spaces are ignored.

    new_df : DataFrame
        Data containing every feature column named in ymodel.

    fit : Object
        A fitted model exposing a predict(X) method.

    Returns
    -------
    prediction : Array-like
        Whatever fit.predict returns for the feature matrix built from new_df.

    """
    # Only the right-hand side of the statement is needed for prediction.
    _, x_part = ymodel.replace(' ', '').split('~')
    x_name = x_part.split('+')
    # get feature data to predict
    X = new_df[x_name].to_numpy()
    prediction = fit.predict(X)
    return prediction
48 | 
49 | 
# Two static interventions on A (all zeros, all ones); the outcome model is fitted and predicted through the
# user-supplied ymodel_fit_custom / ymodel_predict_custom callables.
g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
             int_descript = int_descript,
             Intervention1_A = [static, np.zeros(time_points)], basecovs=['L3'],
             Intervention2_A = [static, np.ones(time_points)],
             covnames=covnames,  covtypes=covtypes, covmodels=covmodels,
             ymodel_fit_custom = ymodel_fit_custom, ymodel_predict_custom=ymodel_predict_custom,
             outcome_name=outcome_name, ymodel=ymodel, outcome_type='continuous_eof')
g.fit()


--------------------------------------------------------------------------------
/running_examples/test_dynamic_intervention.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.data import load_basicdata_nocomp
 4 | 
# Load the example data and name the columns that hold the time index and the individual id.
obs_data = load_basicdata_nocomp()
time_name = 't0'
id = 'id'

# Time-varying covariates: names, types and model statements, index-aligned across the three lists.
covnames = ['L1', 'L2', 'A']
covtypes = ['binary', 'bounded normal', 'binary']
covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
             'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
             'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']

# L3 is passed as a baseline covariate.
basecovs = ['L3']

# Number of time points: largest observed value of the time column plus one.
time_points = np.max(np.unique(obs_data[time_name])) + 1
18 | 
19 | def dynamic_intervention(new_df, pool, int_var, time_name, t):
20 |     new_df.loc[new_df[time_name] == t, int_var] = 0
21 |     new_df.loc[new_df['L2'] > 0.75, int_var] = 1
22 | 
23 | int_descript = ['Dynamic intervention']
24 | 
25 | outcome_name = 'Y'
26 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
27 | 
28 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
29 |              covnames=covnames,  covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
30 |              int_descript = int_descript,
31 |              Intervention1_A = [dynamic_intervention],
32 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
33 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_fit_random_forest.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import re
 3 | from sklearn.ensemble import RandomForestRegressor
 4 | 
 5 | from pygformula.interventions import static
 6 | from pygformula import ParametricGformula
 7 | from pygformula.data import load_basicdata_nocomp
 8 | 
 9 | obs_data = load_basicdata_nocomp()
10 | 
11 | time_name = 't0'
12 | id = 'id'
13 | 
14 | covnames = ['L1', 'L2', 'A']
15 | covtypes = ['binary', 'custom', 'binary']
16 | covmodels = ['L1 ~ lag1_A + lag2_A + lag1_L1 + lag_cumavg1_L2 + t0',
17 |              'L2 ~ lag1_A + L1 + lag1_L1 + lag_cumavg1_L2 + t0',
18 |              'A ~ lag1_A + L1 + L2 +lag1_L1 + lag_cumavg1_L2 + t0']
19 | 
20 | 
21 | outcome_name = 'Y'
22 | ymodel = 'Y ~ L1 + L2 + A'
23 | 
24 | # define interventions
25 | time_points = np.max(np.unique(obs_data[time_name])) + 1
26 | int_descript = ['Never treat', 'Always treat']
27 | 
28 | 
29 | def fit_rf(covmodel, covname, fit_data):
30 |     max_depth = 2
31 |     y_name, x_name = re.split('~', covmodel.replace(' ', ''))
32 |     x_name = re.split('\+', x_name.replace(' ', ''))
33 |     y = fit_data[y_name].to_numpy()
34 |     X = fit_data[x_name].to_numpy()
35 |     fit_rf = RandomForestRegressor(max_depth=max_depth, random_state=0)
36 |     fit_rf.fit(X, y)
37 |     return fit_rf
38 | 
39 | def predict_rf(covmodel, new_df, fit):
40 |     y_name, x_name = re.split('~', covmodel.replace(' ', ''))
41 |     x_name = re.split('\+', x_name.replace(' ', ''))
42 |     X = new_df[x_name].to_numpy()
43 |     prediction = fit.predict(X)
44 |     return prediction
45 | 
46 | covfits_custom = ['NA', fit_rf, 'NA']
47 | covpredict_custom = ['NA', predict_rf, 'NA']
48 | 
49 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
50 |              int_descript = int_descript,
51 |              Intervention1_A = [static, np.zeros(time_points)],
52 |              Intervention2_A = [static, np.ones(time_points)],
53 |              covnames=covnames,  covtypes=covtypes, covmodels=covmodels,
54 |              covfits_custom = covfits_custom, covpredict_custom=covpredict_custom,
55 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
56 | g.fit()
57 | 


--------------------------------------------------------------------------------
/running_examples/test_natural_course.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.data import load_basicdata_nocomp
 4 | 
 5 | obs_data = load_basicdata_nocomp()
 6 | time_name = 't0'
 7 | id = 'id'
 8 | 
 9 | covnames = ['L1', 'L2', 'A']
10 | covtypes = ['binary', 'bounded normal', 'binary']
11 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
12 |            'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
13 |            'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
14 | 
15 | basecovs = ['L3']
16 | 
17 | outcome_name = 'Y'
18 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
19 | outcome_type = 'survival'
20 | 
21 | time_points = np.max(np.unique(obs_data[time_name])) + 1
22 | 
23 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
24 |              covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
25 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
26 |              )
27 | g.fit()
28 | 


--------------------------------------------------------------------------------
/running_examples/test_natural_grace_period.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import natural_grace_period
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
 6 | obs_data = load_basicdata_nocomp()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L1', 'L2', 'A']
11 | covtypes = ['binary', 'bounded normal', 'binary']
12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
13 |              'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
14 |              'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
15 | 
16 | basecovs = ['L3']
17 | 
18 | time_points = np.max(np.unique(obs_data[time_name])) + 1
19 | 
20 | int_descript = ['natural grace period intervention']
21 | 
22 | outcome_name = 'Y'
23 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
24 | 
25 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
26 |              covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
27 |              int_descript = int_descript,
28 |              Intervention1_A = [natural_grace_period, [3, {'L1': lambda x: x == 1}]],
29 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
30 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_normal_cov.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
 6 | obs_data = load_basicdata_nocomp()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L2', 'A']
11 | covtypes = ['normal', 'binary']
12 | covmodels = ['L2 ~ lag1_A + lag_cumavg1_L2 + L3 + t0',
13 |            'A ~ lag1_A + L2 + lag_cumavg1_L2 + L3 + t0']
14 | 
15 | basecovs = ['L3']
16 | 
17 | outcome_name = 'Y'
18 | ymodel = 'Y ~ L2 + A + lag1_A + L3 + t0'
19 | outcome_type = 'survival'
20 | 
21 | time_points = np.max(np.unique(obs_data[time_name])) + 1
22 | int_descript = ['Never treat', 'Always treat']
23 | 
24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
25 |              int_descript = int_descript,
26 |              Intervention1_A = [static, np.zeros(time_points)],
27 |              Intervention2_A = [static, np.ones(time_points)],
28 |              covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
29 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
30 |              )
31 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_restrictions.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
 6 | obs_data = load_basicdata_nocomp()
 7 | 
 8 | time_name = 't0'
 9 | id = 'id'
10 | 
11 | covnames = ['L1', 'L2', 'A']
12 | covtypes = ['binary', 'normal', 'binary']
13 | covmodels = ['L1 ~ lag1_L1 + lag1_A',
14 |              'L2 ~ L1 + lag1_L2',
15 |               'A ~ L1 + L2']
16 | 
17 | basecovs = ['L3']
18 | outcome_name = 'Y'
19 | ymodel = 'Y ~ L1 + L2 + A'
20 | 
21 | # define interventions
22 | time_points = np.max(np.unique(obs_data[time_name])) + 1
23 | int_descript = ['Never treat', 'Always treat']
24 | 
25 | 
26 | restrictions = [['L2', {'L1': lambda x: x == 0}, 0.5], ['A', {'L1': lambda x: x == 0, 'L2': lambda x: x > 0.5}, 1]]
27 | 
28 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
29 |              int_descript = int_descript,
30 |              Intervention1_A = [static, np.zeros(time_points)],
31 |              Intervention2_A = [static, np.ones(time_points)],
32 |              covnames=covnames,  covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
33 |              restrictions=restrictions, outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
34 | g.fit()
35 | 


--------------------------------------------------------------------------------
/running_examples/test_square_time.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
 6 | obs_data = load_basicdata_nocomp()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L1', 'L2', 'A', 'square_t0']
11 | covtypes = ['binary', 'bounded normal', 'binary', 'square time']
12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + square_t0',
13 |            'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + square_t0',
14 |            'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0 + square_t0',
15 |            'NA']
16 | 
17 | basecovs = ['L3']
18 | 
19 | outcome_name = 'Y'
20 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0 + square_t0'
21 | outcome_type = 'survival'
22 | 
23 | time_points = np.max(np.unique(obs_data[time_name])) + 1
24 | int_descript = ['Never treat', 'Always treat']
25 | 
26 | 
27 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
28 |              int_descript = int_descript,
29 |              Intervention1_A = [static, np.zeros(time_points)],
30 |              Intervention2_A = [static, np.ones(time_points)],
31 |              covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
32 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type,
33 |              )
34 | g.fit()
35 | 


--------------------------------------------------------------------------------
/running_examples/test_static_multiple_treatments.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_multiple_treatments_data
 5 | 
 6 | obs_data = load_multiple_treatments_data()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L1', 'L2', 'A1', 'A2']
11 | covtypes = ['binary', 'bounded normal', 'binary', 'binary']
12 | covmodels = ['L1 ~ lag1_L1',
13 |              'L2 ~ lag1_L1 + lag1_L2 + lag1_A2 + L1',
14 |              'A1 ~ lag1_L1 + lag1_L2',
15 |              'A2 ~ lag1_A1']
16 | 
17 | time_points = np.max(np.unique(obs_data[time_name])) + 1
18 | int_descript = ['Always treat on A1 & A2']
19 | 
20 | 
21 | outcome_name = 'Y'
22 | ymodel = 'Y ~ L1 + L2 + A1 + A2'
23 | 
24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
25 |              covnames=covnames,  covtypes=covtypes, covmodels=covmodels,
26 |              int_descript = int_descript,
27 |              Intervention1_A1 = [static, np.ones(time_points)],
28 |              Intervention1_A2 = [static, np.ones(time_points)],
29 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
30 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_static_one_treatment.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
 6 | obs_data = load_basicdata_nocomp()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L1', 'L2', 'A']
11 | covtypes = ['binary', 'bounded normal', 'binary']
12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
13 |              'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
14 |              'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
15 | 
16 | basecovs = ['L3']
17 | 
18 | time_points = np.max(np.unique(obs_data[time_name])) + 1
19 | int_descript = ['Always treat']
20 | 
21 | outcome_name = 'Y'
22 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
23 | 
24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
25 |              covnames=covnames, covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
26 |              int_descript = int_descript,
27 |              Intervention1_A = [static, np.ones(time_points), [0, 1, 4]],
28 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
29 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_threshold_intervention.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import threshold
 4 | from pygformula.data import load_threshold_data
 5 | 
 6 | obs_data = load_threshold_data()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L1', 'L2', 'A']
11 | covtypes = ['binary', 'bounded normal', 'normal']
12 | covmodels = ['L1 ~ lag1_L1',
13 |              'L2 ~ lag1_L1 + lag1_L2 + L1',
14 |              'A ~ L1 + L2']
15 | 
16 | time_points = np.max(np.unique(obs_data[time_name])) + 1
17 | 
18 | int_descript = ['Threshold intervention']
19 | 
20 | outcome_name = 'Y'
21 | ymodel = 'Y ~ L1 + L2 + A'
22 | 
23 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
24 |              covnames=covnames,  covtypes=covtypes, covmodels=covmodels,
25 |              int_descript = int_descript,
26 |              Intervention1_A = [threshold, [0.5, float('inf')]],
27 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
28 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_truncated_normal.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_truncated_normal
 5 | 
 6 | obs_data = load_truncated_normal()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L', 'A']
11 | covtypes = ['truncated normal', 'binary']
12 | covmodels = ['L ~ lag1_A + lag1_L + t0',
13 |            'A ~ lag1_A + lag1_L + L + t0']
14 | 
15 | trunc_params = [[1, 'right'], 'NA']
16 | 
17 | outcome_name = 'Y'
18 | ymodel = 'Y ~ L + A + t0'
19 | outcome_type = 'survival'
20 | 
21 | time_points = np.max(np.unique(obs_data[time_name])) + 1
22 | int_descript = ['Never treat', 'Always treat']
23 | 
24 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
25 |              int_descript = int_descript,
26 |              Intervention1_A = [static, np.zeros(time_points)],
27 |              Intervention2_A = [static, np.ones(time_points)],
28 |              covnames=covnames, covtypes=covtypes, covmodels=covmodels, trunc_params=trunc_params,
29 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
30 |              )
31 | g.fit()
32 | 


--------------------------------------------------------------------------------
/running_examples/test_uniform_grace_period.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import uniform_grace_period
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
 6 | obs_data = load_basicdata_nocomp()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L1', 'L2', 'A']
11 | covtypes = ['binary', 'bounded normal', 'binary']
12 | covmodels = ['L1 ~ lag1_A + lag2_A + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
13 |              'L2 ~ lag1_A + L1 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0',
14 |              'A ~ lag1_A + L1 + L2 + lag_cumavg1_L1 + lag_cumavg1_L2 + L3 + t0']
15 | 
16 | basecovs = ['L3']
17 | 
18 | time_points = np.max(np.unique(obs_data[time_name])) + 1
19 | 
20 | int_descript = ['uniform grace period intervention']
21 | 
22 | outcome_name = 'Y'
23 | ymodel = 'Y ~ L1 + L2 + L3 + A + lag1_A + lag1_L1 + lag1_L2 + t0'
24 | 
25 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
26 |              covnames=covnames,  covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
27 |              int_descript = int_descript,
28 |              Intervention1_A = [uniform_grace_period, [3, {'L1': lambda x: x == 1}]],
29 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
30 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_visit_process.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_visit_process
 5 | 
 6 | obs_data = load_visit_process()
 7 | time_name = 'month'
 8 | id = 'id'
 9 | 
10 | covnames = ['visit_cd4', 'visit_rna', 'cd4_v', 'rna_v', 'everhaart']
11 | covtypes = ['binary', 'binary', 'normal', 'normal', 'binary']
12 | covmodels = ['visit_cd4 ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month',
13 |              'visit_rna ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month',
14 |              'cd4_v ~ lag1_everhaart + lag_cumavg1_cd4_v + sex + race + month',
15 |              'rna_v ~ lag1_everhaart + lag_cumavg1_rna_v + sex + race + month',
16 |              'everhaart ~ lag1_everhaart + cd4_v + rna_v + sex + race + month']
17 | 
18 | basecovs = ['sex', 'race', 'age']
19 | 
20 | visitprocess = [['visit_cd4', 'cd4_v', 3], ['visit_rna', 'rna_v', 3]]
21 | 
22 | outcome_name = 'event'
23 | ymodel = 'event ~ cd4_v + rna_v + everhaart + sex + race + month'
24 | 
25 | time_points = np.max(np.unique(obs_data[time_name])) + 1
26 | 
27 | int_descript = ['Never treat', 'Always treat']
28 | 
29 | 
30 | g = ParametricGformula(obs_data = obs_data, id = id,  time_name = time_name, visitprocess = visitprocess,
31 |                   int_descript = int_descript,
32 |                   Intervention1_everhaart = [static, np.zeros(time_points)],
33 |                   Intervention2_everhaart = [static, np.ones(time_points)],
34 |                   covnames=covnames,  covtypes=covtypes, covmodels=covmodels, basecovs = basecovs,
35 |                   outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
36 | g.fit()
37 | g.plot_interventions()
38 | g.plot_natural_course()


--------------------------------------------------------------------------------
/running_examples/test_yrestrictions.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_basicdata_nocomp
 5 | 
 6 | obs_data = load_basicdata_nocomp()
 7 | 
 8 | time_name = 't0'
 9 | id = 'id'
10 | 
11 | covnames = ['L1', 'L2', 'A']
12 | covtypes = ['binary', 'normal', 'binary']
13 | covmodels = ['L1 ~ lag1_L1 + lag1_A',
14 |              'L2 ~ L1 + lag1_L2',
15 |               'A ~ L1 + L2']
16 | 
17 | basecovs = ['L3']
18 | outcome_name = 'Y'
19 | ymodel = 'Y ~ L1 + L2 + A'
20 | 
21 | # define interventions
22 | time_points = np.max(np.unique(obs_data[time_name])) + 1
23 | int_descript = ['Never treat', 'Always treat']
24 | 
25 | yrestrictions = [[{'L1': lambda x: x == 0}, 0], [{'L2': lambda x: x > 0.5}, 0.1]]
26 | 
27 | 
28 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
29 |              int_descript = int_descript,
30 |              Intervention1_A = [static, np.zeros(time_points)],
31 |              Intervention2_A = [static, np.ones(time_points)],
32 |              covnames=covnames,  covtypes=covtypes, covmodels=covmodels, basecovs=basecovs,
33 |              yrestrictions=yrestrictions, outcome_name=outcome_name, ymodel=ymodel, outcome_type='survival')
34 | g.fit()


--------------------------------------------------------------------------------
/running_examples/test_zero_inflated_normal_cov.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pygformula import ParametricGformula
 3 | from pygformula.interventions import static
 4 | from pygformula.data import load_zero_inflated_normal
 5 | 
 6 | obs_data = load_zero_inflated_normal()
 7 | time_name = 't0'
 8 | id = 'id'
 9 | 
10 | covnames = ['L', 'A']
11 | covtypes = ['zero-inflated normal', 'binary']
12 | covmodels = ['L ~ lag1_L + lag1_A + t0',
13 |               'A ~ lag1_A + L + t0']
14 | 
15 | outcome_name = 'Y'
16 | ymodel = 'Y ~ L + A + t0'
17 | outcome_type = 'survival'
18 | 
19 | time_points = np.max(np.unique(obs_data[time_name])) + 1
20 | int_descript = ['Never treat', 'Always treat']
21 | 
22 | g = ParametricGformula(obs_data = obs_data, id = id, time_name=time_name, time_points = time_points,
23 |              int_descript = int_descript,
24 |              Intervention1_A = [static, np.zeros(time_points)],
25 |              Intervention2_A = [static, np.ones(time_points)],
26 |              covnames=covnames, covtypes=covtypes, covmodels=covmodels,
27 |              outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type
28 |              )
29 | g.fit()
30 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | INSTALL_REQUIRES = [
 4 |     'joblib>=1.2',
 5 |     'lifelines>=0.27',
 6 |     'matplotlib>=3.5',
 7 |     'numpy>=1.22',
 8 |     'pandas>=1.5',
 9 |     'prettytable>=3.10',
10 |     'pytruncreg>=0.1',
11 |     'scipy>=1.10',
12 |     'seaborn>=0.11',
13 |     'statsmodels>=0.14',
14 |     'tqdm>=4.64',
15 |     'PyQt5>=5.15'
16 | ]
17 | 
18 | version = {}
19 | with open("pygformula/version.py") as fp:
20 |     exec(fp.read(), version)
21 | 
22 | with open('README.md', 'r', encoding='utf-8') as f:
23 |     long_description = f.read()
24 | 
25 | setuptools.setup(
26 |     name='pygformula',
27 |     version=version['__version__'],
28 |     maintainer='Jing Li',
29 |     maintainer_email='jing_li@hsph.harvard.edu',
30 |     description='A python implementation of the parametric g-formula',
31 |     long_description=long_description,
32 |     long_description_content_type='text/markdown',
33 |     packages=setuptools.find_packages(),
34 |     install_requires=INSTALL_REQUIRES,
35 |     python_requires='>=3.8'
36 | )


--------------------------------------------------------------------------------