├── .github └── workflows │ ├── codecov.yml │ ├── lint.yml │ ├── pages.yml │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── README.md ├── data ├── basque.csv ├── germany.csv └── texas.csv ├── doc ├── Makefile ├── make.bat └── source │ ├── augsynth.rst │ ├── biblio.bib │ ├── bibliography.rst │ ├── conf.py │ ├── dataprep.rst │ ├── generator.rst │ ├── index.rst │ ├── penalized.rst │ ├── placebo.rst │ ├── robust.rst │ └── synth.rst ├── examples ├── augsynth │ └── basque_augsynth.ipynb ├── basque.ipynb ├── factor-model.ipynb ├── germany.ipynb ├── penalized │ └── basque_penalized.ipynb ├── robust │ └── basque_robust.ipynb └── texas.ipynb ├── pyproject.toml ├── pysyncon ├── __init__.py ├── augsynth.py ├── base.py ├── dataprep.py ├── generator.py ├── inference.py ├── penalized.py ├── robust.py ├── synth.py └── utils.py ├── requirements-dev.txt ├── setup.cfg └── tests ├── test_augsynth.py ├── test_augsynth_basque.py ├── test_conformal_interence.py ├── test_dataprep.py ├── test_linear_factor_model.py ├── test_penalized.py ├── test_penalized_basque.py ├── test_robust.py ├── test_robust_basque.py ├── test_synth.py ├── test_synth_basque.py ├── test_synth_germany.py ├── test_synth_texas.py └── test_utils.py /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Generate Code Coverage report and upload to codecov.io 2 | 3 | on: 4 | pull_request: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "tests/**" 9 | push: 10 | branches: main 11 | paths: 12 | - "pysyncon/**" 13 | - "tests/**" 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | codecov: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | - name: Set up Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: "3.9" 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install -r requirements-dev.txt 32 | python -m pip install codecov 33 | - name: Run tests and collect coverage 34 | run: coverage run -m unittest discover -s tests 35 | - name: Upload coverage reports to Codecov with GitHub Action 36 | uses: codecov/codecov-action@v4.2.0 37 | env: 38 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 39 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint codebase using Black 2 | 3 | on: 4 | pull_request: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "tests/**" 9 | - "examples/**" 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | lint: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: "3.9" 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install -r requirements-dev.txt 28 | - name: Lint 29 | run: black --check . 
30 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: Build html using Sphinx and upload to Github-pages 2 | 3 | on: 4 | push: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "doc/**" 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | pages: 15 | runs-on: ubuntu-latest 16 | environment: 17 | name: github-pages 18 | url: ${{ steps.deployment.outputs.page_url }} 19 | permissions: 20 | pages: write 21 | id-token: write 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v4 25 | - name: Set up Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: "3.9" 29 | - name: Install Sphinx 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install -r requirements-dev.txt 33 | python -m pip install sphinx 34 | python -m pip install sphinxcontrib-bibtex 35 | - name: Build html 36 | run: sphinx-build -b html ./doc/source/ ./doc/build/ 37 | - name: Setup Pages 38 | uses: actions/configure-pages@v2 39 | - name: Upload artifact 40 | uses: actions/upload-pages-artifact@v3 41 | with: 42 | path: ./doc/build/ 43 | - name: Deploy to GitHub Pages 44 | id: deployment 45 | uses: actions/deploy-pages@v1 46 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Build package and upload to PyPI 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | upload: 12 | runs-on: ubuntu-latest 13 | permissions: 14 | id-token: write 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build 25 | - name: Build package 26 | run: python -m build 27 | - name: Publish package 28 | uses: pypa/gh-action-pypi-publish@79739dc2f2bf6bcfd21ecf9af9f06bd643dbeeae 29 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Run unittests 2 | 3 | on: 4 | pull_request: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "tests/**" 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | tests: 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | buildplat: [ubuntu-20.04, windows-2019] 19 | python: ["3.8", "3.9", "3.10", "3.11", "3.12"] 20 | runs-on: ${{ matrix.buildplat }} 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | - name: Set up Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install -r requirements-dev.txt 32 | - name: Run tests 33 | run: python -m unittest discover -s tests 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python,r,virtualenv,visualstudiocode 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,r,virtualenv,visualstudiocode 3 | 4 | ### Python ### 5 | # Byte-compiled / 
optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | 171 | ### R ### 172 | # History files 173 | .Rhistory 174 | .Rapp.history 175 | 176 | # Session Data files 177 | .RData 178 | .RDataTmp 179 | 180 | # User-specific files 181 | .Ruserdata 182 | 183 | # Example code in package build process 184 | *-Ex.R 185 | 186 | # Output files from R CMD build 187 | /*.tar.gz 188 | 189 | # Output files from R CMD check 190 | /*.Rcheck/ 191 | 192 | # RStudio files 193 | .Rproj.user/ 194 | 195 | # produced vignettes 196 | vignettes/*.html 197 | vignettes/*.pdf 198 | 199 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 200 | .httr-oauth 201 | 202 | # knitr and R markdown default cache directories 203 | *_cache/ 204 | /cache/ 205 | 206 | # Temporary files created by R markdown 207 | *.utf8.md 208 | *.knit.md 209 | 210 | # R Environment Variables 211 | .Renviron 212 | 213 | # pkgdown site 214 | docs/ 215 | 216 | # translation temp files 217 | po/*~ 218 | 219 | # RStudio Connect folder 220 | rsconnect/ 221 | 222 | ### R.Bookdown Stack ### 223 | # R package: bookdown caching files 224 | /*_files/ 225 | 226 | ### VirtualEnv ### 227 | # Virtualenv 228 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 229 | [Bb]in 230 | [Ii]nclude 231 | [Ll]ib 232 | [Ll]ib64 233 | [Ll]ocal 234 | [Ss]cripts 235 | pyvenv.cfg 236 | pip-selfcheck.json 237 | 238 | ### VisualStudioCode ### 239 | .vscode/* 240 | !.vscode/settings.json 241 | !.vscode/tasks.json 242 | !.vscode/launch.json 243 | !.vscode/extensions.json 244 | !.vscode/*.code-snippets 245 | 246 | # Local History for Visual Studio Code 247 | .history/ 248 | 249 | # Built Visual Studio Code Extensions 250 | *.vsix 251 | 252 | ### VisualStudioCode Patch ### 253 | # Ignore all local history of files 254 | .history 255 | .ionide 256 | 257 | # Settings 258 | settings.json 259 | 260 | # End of https://www.toptal.com/developers/gitignore/api/python,r,virtualenv,visualstudiocode 261 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.0.0 2 | message: "If you use this software in your research, please cite it as below." 
3 | authors: 4 | - family-names: "Fordham" 5 | given-names: "Stiofán" 6 | orcid: "https://orcid.org/0009-0003-1345-3252" 7 | title: "pysyncon: a Python package for the Synthetic Control Method" 8 | date-released: 2022-12-20 9 | url: "https://github.com/sdfordham/pysyncon" 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Stiofán Fordham 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # pysyncon ![](https://img.shields.io/badge/python-3.8+-blue.svg) [![codecov](https://codecov.io/gh/sdfordham/pysyncon/graph/badge.svg?token=hmi7xHQ4OT)](https://codecov.io/gh/sdfordham/pysyncon) 4 | 5 | A Python package for the synthetic control method that provides implementations of: 6 | 7 | - Synthetic Control Method (Abadie & Gardeazabal 2003) 8 | - Robust Synthetic Control Method (Amjad, Shah & Shen 2018) 9 | - Augmented Synthetic Control Method (Ben-Michael, Feller & Rothstein 2021) 10 | - Penalized Synthetic Control Method (Abadie & L'Hour 2021) 11 | 12 | The package also provides methods for performing placebo tests and generating confidence intervals. 13 | 14 | The implementation of the synthetic control method aims to be reconcilable with the R package [Synth](https://CRAN.R-project.org/package=Synth); similarly, the implementation of the augmented synthetic control method aims to be reconcilable with the R package [augsynth](https://github.com/ebenmichael/augsynth). 15 | 16 | ## Installation 17 | Install it from PyPI using pip: 18 | 19 | ````bash 20 | python -m pip install pysyncon 21 | ```` 22 | 23 | ## Usage 24 | 25 | Documentation is available on [github-pages](https://sdfordham.github.io/pysyncon/). In the examples folder are notebooks reproducing the weights from: 26 | 27 | - The Economic Costs of Conflict: A Case Study of the Basque Country, Alberto Abadie and Javier Gardeazabal; The American Economic Review Vol. 93, No. 1 (Mar., 2003), pp. 113-132. ([notebook here](examples/basque.ipynb)) 28 | - The worked example 'Prison construction and Black male incarceration' from the last chapter of 'Causal Inference: The Mixtape' by Scott Cunningham.
([notebook here](examples/texas.ipynb)) 29 | - Comparative Politics and the Synthetic Control Method, Alberto Abadie, Alexis Diamond and Jens Hainmueller; American Journal of Political Science Vol. 59, No. 2 (April 2015), pp. 495-510. ([notebook here](examples/germany.ipynb)) 30 | 31 | ## Citation 32 | 33 | If you use this package in your research, you can cite it as below. 34 | 35 | ``` 36 | @software{pysyncon, 37 | author = {Fordham, Stiofán}, 38 | month = dec, 39 | title = {{pysyncon: a Python package for the Synthetic Control Method}}, 40 | url = {https://github.com/sdfordham/pysyncon}, 41 | year = {2022} 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/source/augsynth.rst: -------------------------------------------------------------------------------- 1 | 2 | Augmented Synthetic Control Method 3 | ================================== 4 | 5 | The *Augmented Synthetic Control Method* is due to Ben-Michael, Feller & Rothstein 6 | :cite:`augsynth2021` and adapts the :doc:`Synthetic Control Method ` in an 7 | effort to adjust for poor pre-treatment fit. 8 | 9 | The authors do this by adjusting the Synthetic Control Method estimate with an 10 | additional term that captures the imbalance in a particular function of the pre-treatment outcomes. 11 | In the *Ridge Augmented Synthetic Control Method* this function is 12 | linear in the pre-treatment outcomes and fit by ridge regression of the control 13 | post-treatment outcomes against pre-treatment outcomes.
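As a rough illustration, the ridge adjustment added to the vanilla SCM weights has a closed form; the following is a minimal ``numpy`` sketch mirroring the ``solve_ridge`` helper in ``pysyncon/augsynth.py`` (the array names ``X0``, ``X1``, ``w_scm`` and ``lam`` are hypothetical):

.. code-block:: python

    import numpy as np

    def ridge_adjustment(X1, X0, w_scm, lam):
        # residual imbalance left over after the vanilla SCM fit
        resid = X1 - X0 @ w_scm
        # spread the residual across the control units via ridge regression
        return resid @ np.linalg.inv(X0 @ X0.T + lam * np.identity(X0.shape[0])) @ X0

    # the augmented weights are then: w = w_scm + ridge_adjustment(X1, X0, w_scm, lam)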
14 | 15 | In particular, the method constructs a vector of weights :math:`w = (w_1, w_2, \dots, w_k)` 16 | such that 17 | 18 | .. math:: 19 | w = w_\mathrm{scm} + w_\mathrm{aug}, 20 | 21 | where :math:`w_\mathrm{scm}` are the weights obtained from the standard 22 | :doc:`Synthetic Control Method ` and :math:`w_\mathrm{aug}` are 23 | augmentations that are included when the treated unit lies outside the 24 | convex hull defined by the control units. The weights may be negative or 25 | larger than 1; the degree of extrapolation is controlled by a ridge 26 | parameter :math:`\lambda`. 27 | 28 | In general, this method will obtain weights at least as good as the synthetic 29 | control method in terms of pre-treatment fit. 30 | 31 | The :class:`AugSynth` class 32 | *************************** 33 | 34 | The :class:`AugSynth ` class implements the Ridge Augmented 35 | Synthetic Control Method. The expected way to use the class is to first create a 36 | :class:`Dataprep ` object that defines the study data and 37 | then use it as input to an :class:`AugSynth ` object. See the 38 | `examples folder `_ 39 | of the repository for examples illustrating usage. 40 | 41 | The implementation is based on the same method in the R 42 | `augsynth package `_ 43 | and aims to produce results that can be reconciled with that package. 44 | 45 | .. autoclass:: pysyncon.AugSynth 46 | :members: 47 | :inherited-members: 48 | -------------------------------------------------------------------------------- /doc/source/biblio.bib: -------------------------------------------------------------------------------- 1 | @article{basque2003, 2 | Author = {Abadie, Alberto and Gardeazabal, Javier}, 3 | Title = {The Economic Costs of Conflict: A Case Study of the Basque Country}, 4 | Journal = {American Economic Review}, 5 | Volume = {93}, 6 | Number = {1}, 7 | Year = {2003}, 8 | Month = {March}, 9 | Pages = {113-132}, 10 | DOI = {10.1257/000282803321455188}, 11 | } 12 | @book{california2007, 13 | title={Synthetic Control Methods for Comparative Case Studies: Estimating the Effect of California's Tobacco Control Program}, 14 | DOI={10.3386/w12831}, 15 | publisher={National Bureau of Economic Research}, 16 | author={Abadie, Alberto and Diamond, Alexis and Hainmueller, Jens}, 17 | year={2007}, 18 | month={January} 19 | } 20 | @article{germany2015, 21 | title = {Comparative Politics and the Synthetic Control Method}, 22 | author = {Abadie, Alberto and Diamond, Alexis and Hainmueller, Jens}, 23 | year = {2015}, 24 | journal = {American Journal of Political Science}, 25 | volume = {59}, 26 | number = {2}, 27 | pages = {495--510}, 28 | doi = {10.1111/ajps.12116} 29 | } 30 | @article{penalized2021, 31 | author = {Alberto Abadie and Jérémy L'Hour}, 32 | title = {A Penalized Synthetic Control Estimator for Disaggregated Data}, 33 | journal = {Journal of the American Statistical Association}, 34 | volume = {116}, 35 | number = {536}, 36 | pages = {1817-1834}, 37 | year = {2021}, 38 | publisher = {Taylor \& Francis}, 39 | doi = {10.1080/01621459.2021.1971535}, 40 | } 41 | @article{robust2018, 42 | author = {Muhammad Amjad and Devavrat Shah and Dennis Shen}, 43 | title = {Robust Synthetic Control}, 44 | journal = {Journal of Machine Learning Research}, 45 | year = {2018}, 46 | volume = {19}, 47 | number = {22}, 48 | pages = {1-51}, 49 | url = {http://jmlr.org/papers/v19/17-777.html} 50 | } 51 | @article{augsynth2021, 52 | author = {Eli Ben-Michael and Avi Feller and Jesse Rothstein}, 53 | title = {The Augmented Synthetic Control Method}, 54 |
journal = {Journal of the American Statistical Association}, 55 | volume = {116}, 56 | number = {536}, 57 | pages = {1789-1803}, 58 | year = {2021}, 59 | publisher = {Taylor \& Francis}, 60 | doi = {10.1080/01621459.2021.1929245}, 61 | } 62 | @article{fp2018, 63 | author = {Firpo, Sergio and Possebom, Vitor}, 64 | title = {Synthetic Control Method: Inference, Sensitivity Analysis and Confidence Sets}, 65 | journal = {Journal of Causal Inference}, 66 | volume = {6}, 67 | number = {2}, 68 | year = {2018}, 69 | pages = {20160026}, 70 | publisher = {De Gruyter}, 71 | doi = {10.1515/jci-2016-0026}, 72 | } 73 | @article{inference2021, 74 | author = {Victor Chernozhukov and Kaspar Wüthrich and Yinchu Zhu}, 75 | title = {An Exact and Robust Conformal Inference Method for Counterfactual and Synthetic Controls}, 76 | journal = {Journal of the American Statistical Association}, 77 | volume = {116}, 78 | number = {536}, 79 | year = {2021}, 80 | pages = {1849--1864}, 81 | publisher = {Taylor \& Francis}, 82 | doi = {10.1080/01621459.2021.1920957}, 83 | } 84 | -------------------------------------------------------------------------------- /doc/source/bibliography.rst: -------------------------------------------------------------------------------- 1 | Bibliography 2 | ============ 3 | 4 | .. bibliography:: 5 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | 5 | sys.path.insert(0, os.path.abspath("../../")) 6 | 7 | project = "pysyncon" 8 | copyright = "2025, Stiofán Fordham" 9 | author = "Stiofán Fordham" 10 | release = "1.5.2" 11 | 12 | extensions = [ 13 | "sphinx.ext.autodoc", 14 | "sphinx.ext.napoleon", 15 | "sphinx.ext.mathjax", 16 | "sphinx.ext.githubpages", 17 | "sphinxcontrib.bibtex", 18 | ] 19 | html_theme = "alabaster" 20 | bibtex_bibfiles = ["biblio.bib"] 21 | -------------------------------------------------------------------------------- /doc/source/dataprep.rst: -------------------------------------------------------------------------------- 1 | :class:`Dataprep` class 2 | ======================== 3 | 4 | This class and its API are based on the similarly named function in the R 5 | `Synth package `_. 6 | 7 | The ``Dataprep`` class defines all the information necessary for the synthetic 8 | control study. It takes as arguments a ``pandas.DataFrame`` `foo` containing 9 | the panel data, a list of predictors, special predictors, the statistical operation to 10 | apply to the predictors over the selected time frame, the dependent variable, 11 | the column denoting the unit labels, the labels denoting the control units, 12 | the label denoting the treated unit, the time period over which to carry out the optimisation 13 | procedure and the time period over which to apply the statistical operation to the 14 | predictors. See below for further details about each individual argument, and also see 15 | the `examples folder `_ 16 | of the repository to see how this class is set up in three real research contexts. 17 | 18 | The principal difference between the function signature here and the one in 19 | the ``R`` ``synth`` package is that whereas there are two arguments `unit.variable` 20 | and `unit.names.variable` in that package, in this package these are 21 | consolidated into one argument `unit_variable`, since it is unnecessary to have 22 | both. 23 | 24 | ..
autoclass:: pysyncon.Dataprep 25 | :members: 26 | -------------------------------------------------------------------------------- /doc/source/generator.rst: -------------------------------------------------------------------------------- 1 | Sample data generation 2 | ====================== 3 | 4 | The package provides a method for generating simulated data for testing purposes. 5 | 6 | Linear Factor model 7 | ******************* 8 | 9 | Let :math:`Y_{jt}^N` (resp. :math:`Y_{jt}^I`) denote the outcome for unit :math:`j` at time :math:`t` 10 | in the absence of treatment (resp. in the presence of treatment). The :class:`LinearFactorModel` 11 | generates sample potential outcomes data according to a Linear 12 | Factor model: 13 | 14 | .. math:: 15 | 16 | Y_{jt}^N &= \theta_t^T Z_j + \lambda_t^T \mu_j + \epsilon_{tj},\\ 17 | Y_{jt}^I &= Y_{jt}^N + \delta_t, 18 | 19 | where :math:`Z_j` denotes a vector of observable covariates, :math:`\mu_j` is a vector of unobservable 20 | covariates and :math:`\epsilon_{tj}` are mean-zero normal shocks. The :math:`\delta_t` are 21 | the treatment effects and the remaining variables are model parameters. 22 | 23 | .. autoclass:: pysyncon.generator.LinearFactorModel 24 | :members: 25 | :inherited-members: -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | pysyncon 2 | ======== 3 | 4 | pysyncon is a Python package that implements the synthetic control 5 | method and several derivative methods. 6 | 7 | The types of synthetic control studies available in the package are: 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | Synthetic Control Method 13 | Augmented Synthetic Control Method 14 | Robust Synthetic Control Method 15 | Penalized Synthetic Control Method 16 | 17 | The package also provides a method for performing permutation tests/placebo 18 | tests with the above methods: 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | 23 | Placebo Tests 24 | 25 | The main helper class that is used to describe the study data and used as 26 | input to a synthetic control method is the ``Dataprep`` class: 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | 31 | Dataprep 32 | 33 | How to use the package 34 | ********************** 35 | 36 | There are notebooks in the examples folder illustrating how 37 | to use the package `here `_. 38 | -------------------------------------------------------------------------------- /doc/source/penalized.rst: -------------------------------------------------------------------------------- 1 | Penalized Synthetic Control Method 2 | ================================== 3 | 4 | The penalized synthetic control method is due to Abadie & L'Hour :cite:`penalized2021`. 5 | 6 | This version of the synthetic control method adds a penalization term to the loss 7 | function that serves to reduce the interpolation bias. It does this 8 | by penalizing the pairwise discrepancies between the treated unit and each unit 9 | contributing to the synthetic control. 10 | 11 | The :class:`PenalizedSynth` class 12 | ********************************* 13 | 14 | The :class:`PenalizedSynth ` class implements the penalized 15 | synthetic control method. The expected way to use the class is to first create a 16 | :class:`Dataprep ` object that defines the study data and 17 | then use it as input to a :class:`PenalizedSynth ` object.
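As a rough sketch of the objective being minimised, following the description above (the ``numpy`` names are hypothetical; see :cite:`penalized2021` for the precise formulation):

.. code-block:: python

    import numpy as np

    def penalized_loss(w, X0, X1, lambda_):
        # fit term: discrepancy between the treated unit and the synthetic unit
        fit = np.sum((X1 - X0 @ w) ** 2)
        # penalty term: weighted pairwise discrepancies between the treated
        # unit and each control unit contributing to the synthetic control
        pairwise = w @ np.sum((X0 - X1[:, None]) ** 2, axis=0)
        return fit + lambda_ * pairwise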
See the 18 | `examples folder `_ 19 | of the repository for examples illustrating usage. 20 | 21 | .. autoclass:: pysyncon.PenalizedSynth 22 | :members: 23 | :inherited-members: 24 | -------------------------------------------------------------------------------- /doc/source/placebo.rst: -------------------------------------------------------------------------------- 1 | Placebo Tests 2 | ============= 3 | 4 | A placebo test is used to assess the significance of a synthetic control study 5 | by running the study once with each control unit in turn set as the treated unit and the 6 | remaining control units set as controls. See :cite:`germany2015` (section I.B) 7 | for a motivation. An example of usage is in the Python notebook reproducing 8 | the weights from that paper in the package repository 9 | `here `_. 10 | 11 | The :class:`PlaceboTest` class 12 | ****************************** 13 | 14 | .. autoclass:: pysyncon.utils.PlaceboTest 15 | :members: 16 | -------------------------------------------------------------------------------- /doc/source/robust.rst: -------------------------------------------------------------------------------- 1 | Robust Synthetic Control Method 2 | =============================== 3 | 4 | The Robust Synthetic Control Method is due to Amjad, Shah & Shen :cite:`robust2018`. 5 | 6 | This method de-noises the data matrix of the control units by 7 | applying a threshold to the singular values of the observation matrix 8 | and then fits a linear model using ridge regression of the de-noised control 9 | post-treatment outcomes against pre-treatment outcomes. Similarly to the 10 | :doc:`Ridge Augmented Synthetic Control Method ` the weights here 11 | may be negative or larger than 1. 12 | 13 | The :class:`RobustSynth` class 14 | ****************************** 15 | 16 | The :class:`RobustSynth ` class implements the robust synthetic 17 | control method. The expected way to use the class is to first create a 18 | :class:`Dataprep ` object that defines the study data and 19 | then use it as input to a :class:`RobustSynth ` object. See the 20 | `examples folder `_ 21 | of the repository for examples illustrating usage. 22 | 23 | .. autoclass:: pysyncon.RobustSynth 24 | :members: 25 | :inherited-members: 26 | -------------------------------------------------------------------------------- /doc/source/synth.rst: -------------------------------------------------------------------------------- 1 | Synthetic Control Method 2 | ======================== 3 | 4 | Overview 5 | ******** 6 | 7 | The synthetic control method is due to Abadie and Gardeazabal :cite:`basque2003` 8 | (also see Abadie, Diamond and Hainmueller :cite:`california2007` :cite:`germany2015`). 9 | This method constructs a weighted combination of the control units that 10 | most resembles the selected characteristics of the treated unit in a time period 11 | prior to the treatment time. The "synthetic control unit" constructed in this way can then be 12 | compared with the treated unit to investigate the causal effect of the treatment. 13 | 14 | Details 15 | ******* 16 | 17 | In particular, this method constructs a vector of non-negative weights 18 | :math:`w = (w_1, w_2, \dots, w_k)`, where :math:`k` is the number 19 | of control units, whose sum is 1 and which minimizes 20 | 21 | ..
math:: 22 | \|x_1-X_0w^T\|_V, 23 | 24 | where 25 | 26 | - :math:`\|A\|_V=\sqrt{A^TVA}`, where :math:`V` is a diagonal matrix 27 | with non-negative entries that captures the relationship between the 28 | outcome variable and the predictors, 29 | - :math:`X_0` is a matrix of the values for the control units of the chosen 30 | statistic for the chosen predictors over the selected (pre-intervention) 31 | time-period (each column corresponds to a control), 32 | - :math:`x_1` is a (column) vector of the corresponding values for the 33 | treated unit. 34 | 35 | The matrix :math:`V` can be supplied; otherwise it is part of the 36 | optimization problem: it is obtained by minimizing the quantity 37 | 38 | .. math:: 39 | \|z_1-Z_0w^T\|, 40 | 41 | where 42 | 43 | - :math:`Z_0` is a matrix of the values of the outcome variable for the 44 | control units over the (pre-intervention) time-period (each column 45 | corresponds to a control), 46 | - :math:`z_1` is a (column) vector of the corresponding values for the 47 | treated unit. 48 | 49 | The :class:`Synth` class 50 | ************************ 51 | 52 | The :class:`Synth ` class implements the synthetic control 53 | method. The expected way to use the class is to first create a 54 | :class:`Dataprep ` object that defines the study data and 55 | then use it as input to a :class:`Synth ` object. See the 56 | `examples folder `_ 57 | of the repository for examples illustrating usage. 58 | 59 | The implementation is based on the same method in the R 60 | `Synth package `_ 61 | and aims to produce results that can be reconciled with that package. 62 | 63 | .. autoclass:: pysyncon.Synth 64 | :members: 65 | :inherited-members: 66 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /pysyncon/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.5.1" 2 | 3 | from .dataprep import Dataprep 4 | from .synth import Synth 5 | from .augsynth import AugSynth 6 | from .robust import RobustSynth 7 | from .penalized import PenalizedSynth 8 | -------------------------------------------------------------------------------- /pysyncon/augsynth.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from .dataprep import Dataprep 8 | from .base import BaseSynth, VanillaOptimMixin 9 | from .utils import HoldoutSplitter, CrossValidationResult 10 | 11 | 12 | class AugSynth(BaseSynth, VanillaOptimMixin): 13 | """Implementation of the augmented synthetic control method due to Ben- 14 | Michael, Feller & Rothstein :cite:`augsynth2021`. 15 | 16 | The implementation follows the augsynth R package with the option 17 | `progfunc="Ridge"`. 18 | """ 19 | 20 | def __init__(self) -> None: 21 | super().__init__() 22 | self.lambda_: Optional[float] = None 23 | self.cv_result: Optional[CrossValidationResult] = None 24 | 25 | def fit(self, dataprep: Dataprep, lambda_: Optional[float] = None) -> None: 26 | """Fit the model/calculate the weights. 27 | 28 | Parameters 29 | ---------- 30 | dataprep : Dataprep 31 | :class:`Dataprep` object containing data to model.
32 | lambda_ : float, optional 33 | Ridge parameter to use. If not supplied, then it is obtained by 34 | cross-validation, by default None 35 | """ 36 | if ( 37 | isinstance(dataprep.treatment_identifier, (list, tuple)) 38 | and len(dataprep.treatment_identifier) > 1 39 | ): 40 | raise ValueError("AugSynth requires exactly one treated unit.") 41 | self.dataprep = dataprep 42 | Z0, Z1 = self.dataprep.make_covariate_mats() 43 | X0, X1 = self.dataprep.make_outcome_mats() 44 | 45 | X0_demean, X1_demean, Z0_normal, Z1_normal = self._normalize(X0, X1, Z0, Z1) 46 | X0_stacked = pd.concat([X0_demean, Z0_normal], axis=0) 47 | X1_stacked = pd.concat([X1_demean, Z1_normal], axis=0) 48 | 49 | if lambda_ is None: 50 | lambdas = self.generate_lambdas(X0) 51 | self.cv_result = self.cross_validate(X0, X1, lambdas) 52 | self.lambda_ = self.cv_result.best_lambda() 53 | else: 54 | self.lambda_ = lambda_ 55 | 56 | n_r, _ = X0.shape 57 | V_mat = np.diag(np.full(n_r, 1 / n_r)) 58 | W, _ = self.w_optimize(V_mat=V_mat, X0=X0.to_numpy(), X1=X1.to_numpy()) 59 | 60 | W_ridge = self.solve_ridge( 61 | X1_stacked.to_numpy(), X0_stacked.to_numpy(), W, self.lambda_ 62 | ) 63 | self.W = W + W_ridge 64 | 65 | @staticmethod 66 | def solve_ridge( 67 | A: np.ndarray, B: np.ndarray, W: np.ndarray, lambda_: float 68 | ) -> np.ndarray: 69 | """Calculate the ridge adjustment to the weights. 70 | 71 | :meta private: 72 | """ 73 | M = A - B @ W 74 | N = np.linalg.inv(B @ B.T + lambda_ * np.identity(B.shape[0])) 75 | return M @ N @ B 76 | 77 | def _normalize( 78 | self, X0: pd.DataFrame, X1: pd.Series, Z0: pd.DataFrame, Z1: pd.Series 79 | ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: 80 | """Normalise the data before the weight calculation. 81 | 82 | :meta private: 83 | """ 84 | X0_demean = X0.subtract(X0.mean(axis=1), axis=0) 85 | X1_demean = X1.subtract(X0.mean(axis=1), axis=0) 86 | 87 | Z0_demean = Z0.subtract(Z0.mean(axis=1), axis=0) 88 | Z1_demean = Z1.subtract(Z0.mean(axis=1), axis=0) 89 | 90 | Z0_std = Z0_demean.std(axis=1) 91 | X0_std = X0_demean.to_numpy().std(ddof=1).item() 92 | 93 | Z0_normal = Z0_demean.divide(Z0_std, axis=0) * X0_std 94 | Z1_normal = Z1_demean.divide(Z0_std, axis=0) * X0_std 95 | return X0_demean, X1_demean, Z0_normal, Z1_normal 96 | 97 | def cross_validate( 98 | self, X0: pd.DataFrame, X1: pd.Series, lambdas: np.ndarray, holdout_len: int = 1 99 | ) -> CrossValidationResult: 100 | """Method that calculates the mean error and the standard error of the mean 101 | error using a cross-validation procedure for the given ridge parameter 102 | values. 103 | 104 | :meta private: 105 | """ 106 | V = np.identity(X0.shape[0] - holdout_len) 107 | res = list() 108 | for X0_t, X0_v, X1_t, X1_v in HoldoutSplitter(X0, X1, holdout_len=holdout_len): 109 | W, _ = self.w_optimize(V_mat=V, X0=X0_t.to_numpy(), X1=X1_t.to_numpy()) 110 | this_res = list() 111 | for lam in lambdas: 112 | ridge_weights = self.solve_ridge(A=X1_t, B=X0_t, W=W, lambda_=lam) 113 | W_aug = W + ridge_weights 114 | err = (X1_v - X0_v @ W_aug).pow(2).sum() 115 | this_res.append(err.item()) 116 | res.append(this_res) 117 | means = np.array(res).mean(axis=0) 118 | ses = np.array(res).std(axis=0) / np.sqrt(len(res))  # standard error of the mean over the holdout splits 119 | return CrossValidationResult(lambdas, means, ses) 120 | 121 | def generate_lambdas( 122 | self, X: pd.DataFrame, lambda_min_ratio: float = 1e-8, n_lambda: int = 20 123 | ) -> np.ndarray: 124 | """Generate a suitable set of lambdas to run the cross-validation 125 | procedure on.
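The candidates form a geometric sequence: the largest value is the square of the leading singular value of ``X.T`` and successive values decay towards ``lambda_min_ratio`` times that maximum.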
126 | 127 | :meta private: 128 | """ 129 | _, sing, _ = np.linalg.svd(X.T) 130 | lambda_max = sing[0].item() ** 2.0 131 | scaler = lambda_min_ratio ** (1 / n_lambda) 132 | return lambda_max * (scaler ** np.array(range(n_lambda))) 133 | -------------------------------------------------------------------------------- /pysyncon/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional, Literal, Sequence 3 | from abc import ABCMeta, abstractmethod 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from scipy.optimize import minimize, Bounds, LinearConstraint 9 | 10 | from .dataprep import Dataprep, IsinArg_t 11 | 12 | 13 | class BaseSynth(metaclass=ABCMeta): 14 | """Abstract base class that defines methods common to synthetic control methods.""" 15 | 16 | def __init__(self) -> None: 17 | self.dataprep: Optional[Dataprep] = None 18 | self.W: Optional[np.ndarray] = None 19 | self.W_names: Optional[Sequence] = None 20 | 21 | @abstractmethod 22 | def fit(*args, **kwargs) -> None: 23 | raise NotImplementedError 24 | 25 | def _synthetic(self, Z0: pd.DataFrame) -> pd.Series: 26 | """Assemble the synthetic unit using the calculated weight matrix. 27 | 28 | Parameters 29 | ---------- 30 | Z0 : pandas.DataFrame, shape (n, c) 31 | A matrix of the time series of the outcome variable with each 32 | column corresponding to a control unit and the rows are the time 33 | steps 34 | 35 | Returns 36 | ------- 37 | pd.Series 38 | Time series of the synthetic unit. 39 | """ 40 | ts_synthetic = (Z0 * self.W).sum(axis=1) 41 | return ts_synthetic 42 | 43 | def path_plot( 44 | self, 45 | time_period: Optional[IsinArg_t] = None, 46 | treatment_time: Optional[int] = None, 47 | grid: bool = True, 48 | Z0: Optional[pd.DataFrame] = None, 49 | Z1: Optional[pd.Series] = None, 50 | ) -> None: 51 | """Plot the outcome variable over time for the treated unit and the 52 | synthetic control. 53 | 54 | Parameters 55 | ---------- 56 | time_period : Iterable | pandas.Series | dict, optional 57 | Time range to plot, if none is supplied then the time range used 58 | is the time period over which the optimisation happens, by default 59 | None 60 | treatment_time : int, optional 61 | If supplied, plot a vertical line at the time period that the 62 | treatment time occurred, by default None 63 | grid : bool, optional 64 | Whether or not to plot a grid, by default True 65 | Z0 : pandas.DataFrame, shape (n, c), optional 66 | The matrix of the time series of the outcome variable for the control units. 67 | If no dataprep is set, then this must be supplied along with Z1, by default None. 68 | Z1 : pandas.Series, shape (n, 1), optional 69 | The matrix of the time series of the outcome variable for the treated unit. 70 | If no dataprep is set, then this must be supplied along with Z0, by default None.
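Example (hypothetical years; assumes a fitted synthetic control object ``synth``)::

    synth.path_plot(time_period=range(1990, 2011), treatment_time=2000)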
71 | 72 | Raises 73 | ------ 74 | ValueError 75 | If there is no weight matrix available 76 | ValueError 77 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 78 | """ 79 | if self.dataprep is not None: 80 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=time_period) 81 | elif Z0 is None or Z1 is None: 82 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for plots.") 83 | if self.W is None: 84 | raise ValueError("No weight matrix available; fit data first.") 85 | 86 | ts_synthetic = self._synthetic(Z0=Z0) 87 | plt.plot(Z1, color="black", linewidth=1, label=Z1.name) 88 | plt.plot( 89 | ts_synthetic, 90 | color="black", 91 | linewidth=1, 92 | linestyle="dashed", 93 | label="Synthetic", 94 | ) 95 | if self.dataprep is not None: 96 | plt.ylabel(self.dataprep.dependent) 97 | if treatment_time: 98 | plt.axvline(x=treatment_time, ymin=0.05, ymax=0.95, linestyle="dashed") 99 | plt.legend() 100 | plt.grid(grid) 101 | plt.show() 102 | 103 | def _gaps(self, Z0: pd.DataFrame, Z1: pd.Series) -> pd.Series: 104 | """Calculate the gaps (difference between factual 105 | and estimated counterfactual) 106 | 107 | Parameters 108 | ---------- 109 | Z0 : pandas.DataFrame, shape (n, c) 110 | A matrix of the time series of the outcome variable with each 111 | column corresponding to a control unit and the rows are the time 112 | steps 113 | Z1 : pandas.DataFrame, shape (n, 1) 114 | A matrix of the time series of the outcome variable for the treated 115 | unit and the rows are the time steps 116 | 117 | Returns 118 | ------- 119 | pd.Series 120 | Series containing the gaps 121 | 122 | :meta private: 123 | """ 124 | ts_synthetic = self._synthetic(Z0=Z0) 125 | ts_gap = Z1 - ts_synthetic 126 | return ts_gap 127 | 128 | def gaps_plot( 129 | self, 130 | time_period: Optional[IsinArg_t] = None, 131 | treatment_time: Optional[int] = None, 132 | grid: bool = True, 133 | Z0: Optional[pd.DataFrame] = None, 134 | Z1: Optional[pd.Series] = None, 135 | ) -> None: 136 | """Plots the gap between the treated unit and the synthetic unit over 137 | time. 138 | 139 | Parameters 140 | ---------- 141 | time_period : Iterable | pandas.Series | dict, optional 142 | Time range to plot, if none is supplied then the time range used 143 | is the time period over which the optimisation happens, by default 144 | None 145 | treatment_time : int, optional 146 | If supplied, plot a vertical line at the time period that the 147 | treatment time occurred, by default None 148 | grid : bool, optional 149 | Whether or not to plot a grid, by default True 150 | Z0 : pandas.DataFrame, shape (n, c), optional 151 | The matrix of the time series of the outcome variable for the control units. 152 | If no dataprep is set, then this must be supplied along with Z1, by default None. 153 | Z1 : pandas.Series, shape (n, 1), optional 154 | The matrix of the time series of the outcome variable for the treated unit. 155 | If no dataprep is set, then this must be supplied along with Z0, by default None. 
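The plotted gap is the treated unit's outcome minus the synthetic unit's outcome, so positive values indicate periods where the treated unit lies above its synthetic control.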
156 | 157 | Raises 158 | ------ 159 | ValueError 160 | If there is no weight matrix available 161 | ValueError 162 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 163 | """ 164 | if self.dataprep is not None: 165 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=time_period) 166 | elif Z0 is None or Z1 is None: 167 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for plots.") 168 | if self.W is None: 169 | raise ValueError("No weight matrix available; fit data first.") 170 | 171 | ts_gap = self._gaps(Z0=Z0, Z1=Z1) 172 | plt.plot(ts_gap, color="black", linewidth=1) 173 | if self.dataprep is not None: 174 | plt.ylabel(self.dataprep.dependent) 175 | plt.hlines( 176 | y=0, 177 | xmin=min(ts_gap.index), 178 | xmax=max(ts_gap.index), 179 | color="black", 180 | linestyle="dashed", 181 | ) 182 | if treatment_time: 183 | plt.axvline(x=treatment_time, ymin=0.05, ymax=0.95, linestyle="dashed") 184 | plt.grid(grid) 185 | plt.show() 186 | 187 | def weights(self, round: int = 3, threshold: Optional[float] = None) -> pd.Series: 188 | """Return a ``pandas.Series`` of the weights for each control unit. 189 | 190 | Parameters 191 | ---------- 192 | round : int, optional 193 | Round the weights to the given number of places, by default 3 194 | threshold : float, optional 195 | If supplied, will only show weights at or above this value, by default 196 | None 197 | 198 | Returns 199 | ------- 200 | pandas.Series 201 | The weights computed 202 | 203 | Raises 204 | ------ 205 | ValueError 206 | If there is no weight matrix available 207 | """ 208 | if self.W is None: 209 | raise ValueError("No weight matrix available; fit data first.") 210 | if self.dataprep is None: 211 | weights_ser = pd.Series(self.W, index=self.W_names, name="weights") 212 | else: 213 | weights_ser = pd.Series( 214 | self.W, index=list(self.dataprep.controls_identifier), name="weights" 215 | ) 216 | weights_ser = ( 217 | weights_ser[weights_ser >= threshold] if threshold else weights_ser 218 | ) 219 | return weights_ser.round(round) 220 | 221 | def summary( 222 | self, 223 | round: int = 3, 224 | X0: Optional[pd.DataFrame] = None, 225 | X1: Optional[pd.Series] = None, 226 | ) -> pd.DataFrame: 227 | """Generates a ``pandas.DataFrame`` with summary data. The 228 | first column shows the mean value of each predictor over the time 229 | period ``time_predictors_prior`` for the treated unit, the second 230 | column shows the same for the synthetic unit, and finally there is a 231 | column 'sample mean' that shows the mean value of each predictor 232 | over the time period ``time_predictors_prior`` across all the control 233 | units, i.e. this will be the same as a synthetic control where all 234 | the weights are equal. 235 | 236 | Parameters 237 | ---------- 238 | round : int, optional 239 | Round the table values to the given number of places, by 240 | default 3 241 | X0 : pd.DataFrame, shape (n_cov, n_controls), optional 242 | Matrix with each column corresponding to a control unit and each 243 | row is a covariate. If no dataprep is set, then this must be 244 | supplied along with X1, by default None. 245 | X1 : pandas.Series, shape (n_cov, 1), optional 246 | Column vector giving the covariate values for the treated unit. 247 | If no dataprep is set, then this must be supplied along with X0, 248 | by default None. 249 | 250 | Returns 251 | ------- 252 | pandas.DataFrame 253 | Summary data.
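The 'synthetic' column is the weighted average of the control units' predictor values using the fitted weights, i.e. ``(X0 * W).sum(axis=1)``.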
254 | 255 | Raises 256 | ------ 257 | ValueError 258 | If there is no weight matrix available 259 | ValueError 260 | If there is no :class:`Dataprep` object set or (X0, X1) is not supplied 261 | """ 262 | if self.W is None: 263 | raise ValueError("No weight matrix available; fit data first.") 264 | if self.dataprep is not None: 265 | X0, X1 = self.dataprep.make_covariate_mats() 266 | elif X0 is None or X1 is None: 267 | raise ValueError( 268 | "dataprep must be set or (X0, X1) must be set for summary." 269 | ) 270 | 271 | treated = X1.rename("treated") 272 | synthetic = (X0 * self.W).sum(axis=1).rename("synthetic") 273 | sample_mean = X0.mean(axis=1).rename("sample mean") 274 | 275 | return pd.concat([treated, synthetic, sample_mean], axis=1).round(round) 276 | 277 | def att( 278 | self, 279 | time_period: IsinArg_t, 280 | Z0: Optional[pd.DataFrame] = None, 281 | Z1: Optional[pd.Series] = None, 282 | ) -> dict[str, float]: 283 | """Computes the average treatment effect on the treated unit (ATT) and 284 | the standard error of the value over the chosen time-period. 285 | 286 | Parameters 287 | ---------- 288 | time_period : Iterable | pandas.Series | dict 289 | Time period to compute the ATT over. 290 | Z0 : pandas.DataFrame, shape (n, c), optional 291 | The matrix of the time series of the outcome variable for the control units. 292 | If no dataprep is set, then this must be supplied along with Z1, by default None. 293 | Z1 : pandas.Series, shape (n, 1), optional 294 | The matrix of the time series of the outcome variable for the treated unit. 295 | If no dataprep is set, then this must be supplied along with Z0, by default None. 296 | 297 | Returns 298 | ------- 299 | dict 300 | A dictionary with the ATT value and the standard error of the ATT. 301 | 302 | Raises 303 | ------ 304 | ValueError 305 | If there is no weight matrix available 306 | ValueError 307 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 308 | """ 309 | if self.W is None: 310 | raise ValueError("No weight matrix available; fit data first.") 311 | if self.dataprep is not None: 312 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=time_period) 313 | gaps = self._gaps(Z0=Z0, Z1=Z1) 314 | elif Z0 is not None and Z1 is not None: 315 | gaps = self._gaps(Z0=Z0.loc[time_period, :], Z1=Z1.loc[time_period]) 316 | else: 317 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for att.") 318 | att = np.mean(gaps) 319 | se = np.std(gaps, ddof=1) / np.sqrt(len(time_period)) 320 | 321 | return {"att": att.item(), "se": se.item()} 322 | 323 | def mspe( 324 | self, Z0: Optional[pd.DataFrame] = None, Z1: Optional[pd.Series] = None 325 | ) -> float: 326 | """Returns the mean square prediction error in the fit of 327 | the synthetic control versus the treated unit over the 328 | optimization time-period. 329 | 330 | Parameters 331 | ---------- 332 | Z0 : pandas.DataFrame, shape (n, c), optional 333 | The matrix of the time series of the outcome variable for the control units. 334 | If no dataprep is set, then this must be supplied along with Z1, by default None. 335 | Z1 : pandas.Series, shape (n, 1), optional 336 | The matrix of the time series of the outcome variable for the treated unit. 337 | If no dataprep is set, then this must be supplied along with Z0, by default None. 338 | 339 | Returns 340 | ------- 341 | float 342 | Mean square prediction error 343 | 344 | Raises 345 | ------ 346 | ValueError 347 | If the fit method has not been run (no weights available).
348 | ValueError 349 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 350 | """ 351 | if self.W is None: 352 | raise ValueError("No weight matrix available; fit data first.") 353 | if self.dataprep is not None: 354 | Z0, Z1 = self.dataprep.make_outcome_mats( 355 | time_period=self.dataprep.time_optimize_ssr 356 | ) 357 | if Z0 is None or Z1 is None: 358 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for mspe.") 359 | ts_synthetic = self._synthetic(Z0=Z0) 360 | 361 | n = len(ts_synthetic) 362 | return (1 / n) * (Z1 - ts_synthetic).pow(2).sum().item() 363 | 364 | def mape( 365 | self, Z0: Optional[pd.DataFrame] = None, Z1: Optional[pd.Series] = None 366 | ) -> float: 367 | """Returns the mean absolute percentage error in the fit of 368 | the synthetic control versus the treated unit over the 369 | optimization time-period. 370 | 371 | Parameters 372 | ---------- 373 | Z0 : pandas.DataFrame, shape (n, c), optional 374 | The matrix of the time series of the outcome variable for the control units. 375 | If no dataprep is set, then this must be supplied along with Z1, by default None. 376 | Z1 : pandas.Series, shape (n, 1), optional 377 | The matrix of the time series of the outcome variable for the treated unit. 378 | If no dataprep is set, then this must be supplied along with Z0, by default None. 379 | 380 | Returns 381 | ------- 382 | float 383 | Mean absolute percentage error 384 | 385 | Raises 386 | ------ 387 | ValueError 388 | If the fit method has not been run (no weights available). 389 | ValueError 390 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 391 | """ 392 | if self.W is None: 393 | raise ValueError("No weight matrix available; fit data first.") 394 | if self.dataprep is not None: 395 | Z0, Z1 = self.dataprep.make_outcome_mats( 396 | time_period=self.dataprep.time_optimize_ssr 397 | ) 398 | if Z0 is None or Z1 is None: 399 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for mape.") 400 | ts_synthetic = self._synthetic(Z0=Z0) 401 | 402 | n = len(ts_synthetic) 403 | return (1 / n) * ((Z1 - ts_synthetic) / Z1).abs().sum().item() 404 | 405 | def mae( 406 | self, Z0: Optional[pd.DataFrame] = None, Z1: Optional[pd.Series] = None 407 | ) -> float: 408 | """Returns the mean absolute error in the fit of 409 | the synthetic control versus the treated unit over the 410 | optimization time-period. 411 | 412 | Parameters 413 | ---------- 414 | Z0 : pandas.DataFrame, shape (n, c), optional 415 | The matrix of the time series of the outcome variable for the control units. 416 | If no dataprep is set, then this must be supplied along with Z1, by default None. 417 | Z1 : pandas.Series, shape (n, 1), optional 418 | The matrix of the time series of the outcome variable for the treated unit. 419 | If no dataprep is set, then this must be supplied along with Z0, by default None. 420 | 421 | Returns 422 | ------- 423 | float 424 | Mean absolute error 425 | 426 | Raises 427 | ------ 428 | ValueError 429 | If the fit method has not been run (no weights available).
430 | ValueError 431 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 432 | """ 433 | if self.W is None: 434 | raise ValueError("No weight matrix available; fit data first.") 435 | if self.dataprep is not None: 436 | Z0, Z1 = self.dataprep.make_outcome_mats( 437 | time_period=self.dataprep.time_optimize_ssr 438 | ) 439 | if Z0 is None or Z1 is None: 440 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for mae.") 441 | ts_synthetic = self._synthetic(Z0=Z0) 442 | 443 | n = len(ts_synthetic) 444 | return (1 / n) * (Z1 - ts_synthetic).abs().sum().item() 445 | 446 | 447 | class VanillaOptimMixin: 448 | @staticmethod 449 | def w_optimize( 450 | V_mat: np.ndarray, 451 | X0: np.ndarray, 452 | X1: np.ndarray, 453 | qp_method: Literal["SLSQP"] = "SLSQP", 454 | qp_options: dict = {"maxiter": 1000}, 455 | ) -> tuple[np.ndarray, float]: 456 | """Solves the inner part of the quadratic minimization problem for a 457 | given V matrix (see Abadie and Gardeazabal :cite:`basque2003`). 458 | 459 | Parameters 460 | ---------- 461 | V_mat : numpy.ndarray, shape (c, c) 462 | V matrix, using the notation of the Abadie, Diamond & Hainmueller 463 | paper. 464 | X0 : numpy.ndarray, shape (m, c) 465 | Matrix with each column corresponding to a control unit and each 466 | row corresponding to a covariate. 467 | X1 : numpy.ndarray, shape (m,) 468 | Column vector giving the covariate values for the treated unit. 469 | qp_method : str, optional 470 | Minimization routine to use in scipy minimize to solve the 471 | problem, by default "SLSQP" 472 | qp_options : dict, optional 473 | Options for scipy minimize, by default {"maxiter": 1000} 474 | 475 | Returns 476 | ------- 477 | tuple[np.ndarray, float] 478 | tuple of the optimal weights and the loss 479 | 480 | :meta private: 481 | """ 482 | _, n_c = X0.shape 483 | 484 | P = X0.T @ V_mat @ X0 485 | q = X1.T @ V_mat @ X0 486 | 487 | def fun(x): 488 | return 0.5 * x.T @ P @ x - q.T @ x 489 | 490 | bounds = Bounds(lb=np.full(n_c, 0.0), ub=np.full(n_c, 1.0)) 491 | constraints = LinearConstraint(A=np.full(n_c, 1.0), lb=1.0, ub=1.0) 492 | 493 | x0 = np.full(n_c, 1 / n_c) 494 | res = minimize( 495 | fun=fun, 496 | x0=x0, 497 | bounds=bounds, 498 | constraints=constraints, 499 | method=qp_method, 500 | options=qp_options, 501 | ) 502 | W, loss_W = res["x"], res["fun"] 503 | return W, loss_W.item() 504 | -------------------------------------------------------------------------------- /pysyncon/dataprep.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Any, Iterable, Union, Optional, Literal, Sequence, Mapping, Tuple 3 | 4 | import pandas as pd 5 | from pandas._typing import Axes 6 | 7 | 8 | AGG_OP = ("mean", "std", "median", "sum", "count", "max", "min", "var") 9 | PredictorsOp_t = Literal["mean", "std", "median", "sum", "count", "max", "min", "var"] 10 | IsinArg_t = Union[Iterable, pd.Series, dict] 11 | SpecialPredictor_t = Tuple[ 12 | Any, Union[pd.Series, pd.DataFrame, Sequence, Mapping], PredictorsOp_t 13 | ] 14 | 15 | 16 | class Dataprep: 17 | """Helper class that takes in the panel data and all the information 18 | needed to describe the study setup. It is used to automatically generate 19 | the matrices needed for the optimisation methods, plots of the results etc.
20 | 
21 |     Parameters
22 |     ----------
23 |     foo : pandas.DataFrame
24 |         A pandas DataFrame containing the panel data where the columns are
25 |         predictor/outcome variables and each row is a time-step for some unit
26 |     predictors : Axes
27 |         The columns of ``foo`` to use as predictors
28 |     predictors_op : "mean" | "std" | "median" | "sum" | "count" | "max" | "min" | "var"
29 |         The statistical operation to use on the predictors - the time range that
30 |         the operation is applied to is ``time_predictors_prior``
31 |     dependent : Any
32 |         The column of ``foo`` to use as the dependent variable
33 |     unit_variable : Any
34 |         The column of ``foo`` that contains the unit labels
35 |     time_variable : Any
36 |         The column of ``foo`` that contains the time period
37 |     treatment_identifier : Any
38 |         The unit label that denotes the treated unit
39 |     controls_identifier : Iterable
40 |         The unit labels denoting the control units
41 |     time_predictors_prior : Iterable
42 |         The time range over which to apply the statistical operation to the
43 |         predictors (see ``predictors_op`` argument)
44 |     time_optimize_ssr : Iterable
45 |         The time range over which the loss function should be minimised
46 |     special_predictors : Iterable[SpecialPredictor_t], optional
47 |         An iterable of special predictors, which are additional predictors
48 |         that should be aggregated over a custom time period using an indicated
49 |         statistical operator. In particular, a special predictor
50 |         consists of a triple of:
51 | 
52 |         - ``column``: the column of ``foo`` containing the predictor to use,
53 |         - ``time-range``: the time range to apply ``operator`` over - it should
54 |           have the same type as ``time_predictors_prior`` or ``time_optimize_ssr``
55 |         - ``operator``: the statistical operator to apply to ``column`` - it should
56 |           have the same type as ``predictors_op``
57 | 
58 |         by default None
59 | 
60 |     Raises
61 |     ------
62 |     TypeError
63 |         if ``foo`` is not of type ``pandas.DataFrame``
64 |     ValueError
65 |         if any of ``predictors`` is not a column of ``foo``
66 |     ValueError
67 |         if ``predictors_op`` is not one of "mean", "std", "median",
68 |         "sum", "count", "max", "min" or "var".
69 |     ValueError
70 |         if ``dependent`` is not a column of ``foo``
71 |     ValueError
72 |         if ``unit_variable`` is not a column of ``foo``
73 |     ValueError
74 |         if ``time_variable`` is not a column of ``foo``
75 |     ValueError
76 |         if ``treatment_identifier`` is not present in ``foo['unit_variable']``
77 |     TypeError
78 |         if ``controls_identifier`` is not a list or a tuple
79 |     ValueError
80 |         if ``treatment_identifier`` is in the list of controls
81 |     ValueError
82 |         if any of the controls is not in ``foo['unit_variable']``
83 |     ValueError
84 |         if any element of ``special_predictors`` is not a tuple of length
85 |         3
86 |     ValueError
87 |         if a predictor in an element of ``special_predictors`` is not a column
88 |         of ``foo``
89 |     ValueError
90 |         if one of the operators in an element of ``special_predictors`` is not
91 |         one of "mean", "std", "median", "sum", "count", "max", "min" or "var".
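
    Example
    -------
    A minimal illustrative sketch; the panel ``df`` and the column and unit
    names below are hypothetical, and the top-level import assumes the
    package exports :class:`Dataprep` from ``pysyncon``:

    >>> import pandas as pd
    >>> from pysyncon import Dataprep
    >>> df = pd.DataFrame(
    ...     {
    ...         "region": ["A"] * 3 + ["B"] * 3 + ["C"] * 3,
    ...         "year": [2000, 2001, 2002] * 3,
    ...         "gdp": [1.0, 1.1, 1.3, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
    ...     }
    ... )
    >>> dataprep = Dataprep(
    ...     foo=df,
    ...     predictors=["gdp"],
    ...     predictors_op="mean",
    ...     dependent="gdp",
    ...     unit_variable="region",
    ...     time_variable="year",
    ...     treatment_identifier="A",
    ...     controls_identifier=["B", "C"],
    ...     time_predictors_prior=range(2000, 2002),
    ...     time_optimize_ssr=range(2000, 2003),
    ... )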
92 | """ 93 | 94 | def __init__( 95 | self, 96 | foo: pd.DataFrame, 97 | predictors: Axes, 98 | predictors_op: PredictorsOp_t, 99 | dependent: Any, 100 | unit_variable: Any, 101 | time_variable: Any, 102 | treatment_identifier: Union[Any, list, tuple], 103 | controls_identifier: Union[list, tuple], 104 | time_predictors_prior: IsinArg_t, 105 | time_optimize_ssr: IsinArg_t, 106 | special_predictors: Optional[Iterable[SpecialPredictor_t]] = None, 107 | ) -> None: 108 | if not isinstance(foo, pd.DataFrame): 109 | raise TypeError("foo must be pandas.DataFrame.") 110 | self.foo = foo 111 | 112 | for predictor in predictors: 113 | if predictor not in foo.columns: 114 | raise ValueError(f"predictor {predictor} not in foo columns.") 115 | self.predictors = predictors 116 | 117 | if predictors_op not in AGG_OP: 118 | agg_op_str = ", ".join([f'"{o}"' for o in AGG_OP]) 119 | raise ValueError(f"predictors_op must be one of {agg_op_str}.") 120 | self.predictors_op = predictors_op 121 | 122 | if dependent not in foo.columns: 123 | raise ValueError(f"dependent {dependent} not in foo columns.") 124 | self.dependent = dependent 125 | 126 | if unit_variable not in foo.columns: 127 | raise ValueError(f"unit_variable {unit_variable} not in foo columns.") 128 | self.unit_variable = unit_variable 129 | 130 | if time_variable not in foo.columns: 131 | raise ValueError(f"time_variable {time_variable} not in foo columns.") 132 | self.time_variable = time_variable 133 | 134 | if foo[[unit_variable, time_variable]].duplicated().any(): 135 | raise ValueError( 136 | "Multiple rows found in `foo` for same [unit, time] pairs." 137 | ) 138 | 139 | if isinstance(treatment_identifier, (list, tuple)): 140 | for treated in treatment_identifier: 141 | # This throws FutureWarning (see https://stackoverflow.com/a/46721064/11594901) 142 | if treated not in foo[unit_variable].values: 143 | raise ValueError( 144 | f'treatment_identifier {treated} not found in foo["{unit_variable}"].' 145 | ) 146 | else: 147 | # This throws FutureWarning (see https://stackoverflow.com/a/46721064/11594901) 148 | if treatment_identifier not in foo[unit_variable].values: 149 | raise ValueError( 150 | f'treatment_identifier {treatment_identifier} not found in foo["{unit_variable}"].' 151 | ) 152 | if ( 153 | isinstance(treatment_identifier, (list, tuple)) 154 | and len(treatment_identifier) == 1 155 | ): 156 | self.treatment_identifier = treatment_identifier[0] 157 | else: 158 | self.treatment_identifier = treatment_identifier 159 | 160 | if not isinstance(controls_identifier, (list, tuple)): 161 | raise TypeError("controls_identifier should be an list or tuple") 162 | for control in controls_identifier: 163 | if isinstance(self.treatment_identifier, (list, tuple)): 164 | if control in treatment_identifier: 165 | raise ValueError( 166 | f"{control} in both treatment_identifier and controls_identifier." 167 | ) 168 | else: 169 | if control == treatment_identifier: 170 | raise ValueError("treatment_identifier in controls_identifier.") 171 | if control not in foo[unit_variable].values: 172 | raise ValueError( 173 | f'controls_identifier {control} not found in foo["{unit_variable}"].' 174 | ) 175 | self.controls_identifier = controls_identifier 176 | 177 | if self.foo[self.foo[self.time_variable].isin(time_predictors_prior)].empty: 178 | raise ValueError( 179 | f"foo has no rows in the time range `time_predictors_prior`." 
180 | ) 181 | self.time_predictors_prior = time_predictors_prior 182 | 183 | if self.foo[self.foo[self.time_variable].isin(time_optimize_ssr)].empty: 184 | raise ValueError(f"foo has no rows in the time range `time_optimize_ssr`.") 185 | self.time_optimize_ssr = time_optimize_ssr 186 | 187 | if special_predictors: 188 | for el in special_predictors: 189 | if not isinstance(el, tuple) or len(el) != 3: 190 | raise ValueError( 191 | "Elements of special_predictors should be tuples of length 3." 192 | ) 193 | predictor, time_range, op = el 194 | if predictor not in foo.columns: 195 | raise ValueError( 196 | f"{predictor} in special_predictors not in foo columns." 197 | ) 198 | if self.foo[self.foo[self.time_variable].isin(time_range)].empty: 199 | raise ValueError( 200 | f"foo has no rows in the time range {time_range} for `special_predictor` {el}." 201 | ) 202 | if op not in AGG_OP: 203 | agg_op_str = ", ".join([f'"{o}"' for o in AGG_OP]) 204 | raise ValueError( 205 | f"{op} in special_predictors must be one of {agg_op_str}." 206 | ) 207 | self.special_predictors = special_predictors 208 | 209 | def make_covariate_mats( 210 | self, 211 | ) -> tuple[pd.DataFrame, Union[pd.Series, pd.DataFrame]]: 212 | """Generate the covariate matrices to use as input to the fit method 213 | of the synthetic control computation. 214 | 215 | Returns 216 | ------- 217 | tuple[pandas.DataFrame, pandas.Series] 218 | Returns the matrices :math:`X_0`, :math:`X_1` (using the notation of Abadie 219 | & Gardeazabal :cite:`basque2003`). 220 | 221 | :meta private: 222 | """ 223 | X_nonspecial = ( 224 | self.foo[self.foo[self.time_variable].isin(self.time_predictors_prior)] 225 | .groupby(self.unit_variable)[self.predictors] 226 | .agg(self.predictors_op) 227 | .T 228 | ) 229 | X1_nonspecial = X_nonspecial[self.treatment_identifier] 230 | X0_nonspecial = X_nonspecial[list(self.controls_identifier)] 231 | 232 | if self.special_predictors is None: 233 | return X0_nonspecial, X1_nonspecial 234 | 235 | X0_special = list() 236 | for control in self.controls_identifier: 237 | this_control = list() 238 | for predictor, time_period, op in self.special_predictors: 239 | mask = (self.foo[self.unit_variable] == control) & ( 240 | self.foo[self.time_variable].isin(time_period) 241 | ) 242 | this_control.append(self.foo[mask][predictor].agg(op)) 243 | X0_special.append(this_control) 244 | 245 | X0_special_columns = list() 246 | for idx, (predictor, _, _) in enumerate(self.special_predictors, 1): 247 | X0_special_columns.append(f"special.{idx}.{predictor}") 248 | 249 | X0_special = pd.DataFrame( 250 | X0_special, columns=X0_special_columns, index=self.controls_identifier 251 | ).T 252 | X0 = pd.concat([X0_nonspecial, X0_special], axis=0) 253 | 254 | X1_special = list() 255 | if isinstance(self.treatment_identifier, (list, tuple)): 256 | for treated in self.treatment_identifier: 257 | this_treated = list() 258 | for predictor, time_period, op in self.special_predictors: 259 | mask = (self.foo[self.unit_variable] == treated) & ( 260 | self.foo[self.time_variable].isin(time_period) 261 | ) 262 | this_treated.append(self.foo[mask][predictor].agg(op)) 263 | X1_special.append(this_treated) 264 | X1_special = pd.DataFrame( 265 | X1_special, columns=X0_special_columns, index=self.treatment_identifier 266 | ).T 267 | else: 268 | for predictor, time_period, op in self.special_predictors: 269 | mask = (self.foo[self.unit_variable] == self.treatment_identifier) & ( 270 | self.foo[self.time_variable].isin(time_period) 271 | ) 272 | 
X1_special.append(self.foo[mask][predictor].agg(op)) 273 | 274 | X1_special = pd.Series(X1_special, index=X0_special_columns).rename( 275 | self.treatment_identifier 276 | ) 277 | X1 = pd.concat([X1_nonspecial, X1_special], axis=0) 278 | return X0, X1 279 | 280 | def make_outcome_mats( 281 | self, time_period: Optional[IsinArg_t] = None 282 | ) -> tuple[pd.DataFrame, Union[pd.Series, pd.DataFrame]]: 283 | """Generates the time-series (outcome) matrices to use as input to the fit 284 | method of the synthetic control computation. 285 | 286 | Parameters 287 | ---------- 288 | time_period : Iterable | pandas.Series | dict, optional 289 | Time period to use when generating the matrices, defaults to 290 | time_optimize_ssr set when initialising the class, by default None 291 | 292 | Returns 293 | ------- 294 | tuple[pd.DataFrame, Union[pd.Series, pd.DataFrame]] 295 | Returns the matrices :math:`Z_0`, :math:`Z_1` (using the notation 296 | of Abadie & Gardeazabal :cite:`basque2003`). 297 | 298 | :meta private: 299 | """ 300 | time_period = time_period if time_period is not None else self.time_optimize_ssr 301 | 302 | Z = self.foo[self.foo[self.time_variable].isin(time_period)].pivot( 303 | index=self.time_variable, columns=self.unit_variable, values=self.dependent 304 | ) 305 | Z0, Z1 = Z[list(self.controls_identifier)], Z[self.treatment_identifier] 306 | return Z0, Z1 307 | 308 | def __str__(self) -> str: 309 | str_rep = ( 310 | "Dataprep\n" 311 | f"Treated unit: {self.treatment_identifier}\n" 312 | f"Dependent variable: {self.dependent}\n" 313 | f"Control units: {', '.join([str(c) for c in self.controls_identifier])}\n" 314 | f"Time range in data: {min(self.foo[self.time_variable])}" 315 | f" - {max(self.foo[self.time_variable])}\n" 316 | f"Time range for loss minimization: {self.time_optimize_ssr}\n" 317 | f"Time range for predictors: {self.time_predictors_prior}\n" 318 | f"Predictors: {', '.join([str(p) for p in self.predictors])}\n" 319 | ) 320 | 321 | if self.special_predictors: 322 | str_special_pred = "" 323 | for predictor, time_range, op in self.special_predictors: 324 | rep = f" `{predictor}` over `{time_range}` using `{op}`\n" 325 | str_special_pred = str_special_pred + rep 326 | str_rep = str_rep + f"Special predictors:\n" + str_special_pred 327 | return str_rep 328 | -------------------------------------------------------------------------------- /pysyncon/generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | class LinearFactorModel: 9 | """Generates potential outcomes following a linear factor model""" 10 | 11 | def __init__( 12 | self, 13 | observed_dist: tuple[int] = (0, 1), 14 | observed_params_dist: tuple[int] = (0, 10), 15 | unobserved_dist: tuple[int] = (0, 1), 16 | unobserved_params_dist: tuple[int] = (0, 10), 17 | effect_dist: tuple[int] = (0, 20), 18 | shocks_dist: tuple[int] = (0, 1), 19 | seed: Optional[int] = None, 20 | rng: Optional[np.random.Generator] = None, 21 | ) -> None: 22 | """Generates potential outcomes following a linear factor model 23 | 24 | Parameters 25 | ---------- 26 | observed_dist : tuple, optional 27 | Parameters for the uniform distribution that the observed 28 | covariates follow, by default (0, 1) 29 | observed_params_dist : tuple, optional 30 | Parameters for the uniform distribution that the observed 31 | covariates model parameters follow, by default (0, 10) 32 | 
unobserved_dist : tuple, optional
33 |             Parameters for the uniform distribution that the unobserved
34 |             covariates follow, by default (0, 1)
35 |         unobserved_params_dist : tuple, optional
36 |             Parameters for the uniform distribution that the unobserved
37 |             covariates model parameters follow, by default (0, 10)
38 |         effect_dist : tuple, optional
39 |             Parameters for the uniform distribution that the treatment
40 |             effect follows, by default (0, 20)
41 |         shocks_dist : tuple, optional
42 |             Parameters for the normal distribution that the shocks follow, by default (0, 1)
43 |         seed : int, optional
44 |             Random number generator seed, by default None
45 |         rng : numpy.random.Generator, optional
46 |             Provide a numpy random number generator, by default None
47 |         """
48 |         self.observed_dist = observed_dist
49 |         self.observed_params_dist = observed_params_dist
50 |         self.unobserved_dist = unobserved_dist
51 |         self.unobserved_params_dist = unobserved_params_dist
52 |         self.effect_dist = effect_dist
53 |         self.shocks_dist = shocks_dist
54 |         self.seed = seed
55 |         self.rng = rng
56 | 
57 |     def generate(
58 |         self,
59 |         n_units: int,
60 |         n_observable: int,
61 |         n_unobservable: int,
62 |         n_periods_pre: int,
63 |         n_periods_post: int,
64 |     ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
65 |         """Generate the matrices (:math:`X_0`, :math:`X_1`, :math:`Z_0`,
66 |         :math:`Z_1`) that can be used as input to a synthetic control
67 |         method (using the notation of Abadie & Gardeazabal :cite:`basque2003`).
68 | 
69 |         Parameters
70 |         ----------
71 |         n_units : int
72 |             Number of units in the model
73 |         n_observable : int
74 |             Number of observable covariates in the model
75 |         n_unobservable : int
76 |             Number of unobservable covariates in the model
77 |         n_periods_pre : int
78 |             Number of time periods prior to the intervention
79 |         n_periods_post : int
80 |             Number of time periods post the intervention
81 | 
82 |         Returns
83 |         -------
84 |         tuple[pandas.DataFrame, pandas.Series, pandas.DataFrame, pandas.Series]
85 |             Returns a tuple of 4 pandas objects: :math:`X_0` a pandas DataFrame
86 |             of shape (n_observable, n_units - 1), :math:`X_1` a
87 |             pandas Series of shape (n_observable, 1), :math:`Z_0` a pandas
88 |             DataFrame of shape (n_periods_pre + n_periods_post, n_units - 1),
89 |             :math:`Z_1` a pandas Series of shape (n_periods_pre + n_periods_post, 1).
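
        Example
        -------
        A minimal sketch of drawing one simulated study; the sizes used
        below are arbitrary illustrative choices:

        >>> from pysyncon.generator import LinearFactorModel
        >>> lfm = LinearFactorModel(seed=1234)
        >>> X0, X1, Z0, Z1 = lfm.generate(
        ...     n_units=11, n_observable=4, n_unobservable=4,
        ...     n_periods_pre=60, n_periods_post=20,
        ... )
        >>> Z0.shape, Z1.shape
        ((80, 10), (80,))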
90 |         """
91 |         rng = self.rng if self.rng is not None else np.random.default_rng(seed=self.seed)
92 | 
93 |         n_periods = n_periods_pre + n_periods_post
94 | 
95 |         delta = rng.uniform(*self.effect_dist, size=n_periods).reshape(-1, 1)
96 |         delta = np.column_stack([delta] * n_units)
97 | 
98 |         Z = rng.uniform(*self.observed_dist, size=(n_observable, n_units))
99 |         mu = rng.uniform(*self.unobserved_dist, size=(n_unobservable, n_units))
100 |         theta = rng.uniform(*self.observed_params_dist, size=(n_observable, n_periods))
101 |         lambda_ = rng.uniform(
102 |             *self.unobserved_params_dist, size=(n_unobservable, n_periods)
103 |         )
104 |         epsilon = rng.normal(*self.shocks_dist, size=(n_periods, n_units))
105 | 
106 |         Y_N = theta.T @ Z + lambda_.T @ mu + epsilon
107 |         Y_I = Y_N + delta
108 | 
109 |         X0 = pd.DataFrame(
110 |             data=Z[:, 1:],
111 |             columns=range(2, n_units + 1),
112 |             index=[f"observable{i}" for i in range(1, n_observable + 1)],
113 |         )
114 |         X1 = pd.Series(
115 |             data=Z[:, 0],
116 |             name=1,
117 |             index=[f"observable{i}" for i in range(1, n_observable + 1)],
118 |         )
119 |         Z0 = pd.DataFrame(
120 |             data=Y_N[:, 1:],
121 |             columns=range(2, n_units + 1),
122 |             index=range(1, n_periods + 1),
123 |         )
124 |         Z1 = pd.Series(
125 |             data=np.concatenate(
126 |                 [Y_N[:n_periods_pre, 0], Y_I[n_periods_pre:, 0]], axis=0
127 |             ),
128 |             name=1,
129 |             index=range(1, n_periods + 1),
130 |         )
131 |         return X0, X1, Z0, Z1
132 | 
--------------------------------------------------------------------------------
/pysyncon/inference.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Optional, Callable, Literal
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | from pysyncon.base import BaseSynth
8 | 
9 | 
10 | class ConformalInference:
11 |     """Implementation of the conformal inference based confidence intervals
12 |     following Chernozhukov et al. :cite:`inference2021`
13 |     """
14 | 
15 |     def __init__(self) -> None:
16 |         pass
17 | 
18 |     def confidence_intervals(
19 |         self,
20 |         alpha: float,
21 |         scm: BaseSynth,
22 |         Z0: pd.DataFrame,
23 |         Z1: pd.Series,
24 |         pre_periods: list,
25 |         post_periods: list,
26 |         tol: float = 0.1,
27 |         max_iter: int = 50,
28 |         step_sz: Optional[float] = None,
29 |         step_sz_div: float = 20.0,
30 |         verbose: bool = True,
31 |         scm_fit_args: dict = {},
32 |     ) -> pd.DataFrame:
33 |         """Confidence intervals obtained from test-inversion, where
34 |         the p-values are obtained by adjusted re-fits of the data
35 |         following Chernozhukov et al. :cite:`inference2021`.
36 | 
37 |         Parameters
38 |         ----------
39 |         alpha : float
40 |             The required significance level, e.g. alpha = 0.05 will
41 |             yield a confidence level of 100 * (1 - alpha) = 95%.
42 |         scm : BaseSynth
43 |             The synth object to calculate the confidence interval for.
44 |         Z0 : pandas.DataFrame, shape (n, c)
45 |             A matrix of the time series of the outcome variable with each
46 |             column corresponding to a control unit and the rows are the time
47 |             steps.
48 |         Z1 : pd.Series
49 |             Column vector giving the outcome variable values over time for the
50 |             treated unit.
51 |         tol : float, optional
52 |             The tolerance (accuracy) required when calculating the
53 |             lower/upper cut-off point of the confidence interval. The search
54 |             will try to obtain this tolerance level but will not exceed `max_iter`
55 |             iterations trying to achieve that, by default 0.1.
56 |         pre_periods : list
57 |             The time-periods to use for the optimization when refitting the
58 |             data with the adjusted outcomes.
59 | post_periods : list 60 | The time-periods to calculate confidence intervals for. 61 | max_iter : int, optional 62 | Maximum number of times to re-fit the data when trying to locate 63 | the lower/upper cut-off point, by default 50 64 | step_sz : Optional[float], optional 65 | Step size to use when searching for an interval that contains the 66 | lower or upper cut-off point of the confidence interval, by default None 67 | step_sz_div : float, optional 68 | Alternative way to define step size: it is the fraction that defines 69 | step-size in terms of the standard deviation of the att, i.e. if 70 | `step_sz_div=20.0` then the step size used will be (att +/- 2.5 * std(att)) / 20.0, 71 | by default 20.0 72 | verbose : bool, optional 73 | Print output, by default True 74 | scm_fit_args : dict, optional 75 | A dictionary defining anything extra that should be provided to the 76 | synthetic control object `fit` method when doing the refits, by default {} 77 | 78 | Returns 79 | ------- 80 | pd.DataFrame 81 | A pandas.DataFrame indexed by `post_periods`, with 3 columns: `value` that 82 | gives the calculated treatment effect, `lower_ci` that gives the value 83 | defining the lower-end of the confidence interval, `upper_ci` that gives 84 | the value defining the upper-end of the confidence interval. 85 | 86 | Raises 87 | ------ 88 | TypeError 89 | if `alpha` is not a float 90 | ValueError 91 | if `alpha` is not in the open interval (0, 1). 92 | TypeError 93 | if `max_iter` is not an integer 94 | ValueError 95 | if `max_iter` is not at least 1 96 | TypeError 97 | if `tol` is not a float 98 | ValueError 99 | if `tol` is less than 0.0 100 | TypeError 101 | if `step_sz` is not a float 102 | ValueError 103 | if `step_sz` is not greater than 0.0 104 | TypeError 105 | if `step_sz_div` is not a float 106 | ValueError 107 | if `step_sz_div` is not greater than 0.0 108 | """ 109 | if not isinstance(alpha, float): 110 | raise TypeError("`alpha` must be a float") 111 | elif not 0.0 < alpha < 1.0: 112 | raise ValueError("`alpha` must be greater than 0.0 and less than 1.0") 113 | if not isinstance(max_iter, int): 114 | raise TypeError("`max_iter` must be an integer") 115 | elif max_iter < 1: 116 | raise ValueError("`max_iter` must be at least 1") 117 | if not isinstance(tol, float): 118 | raise TypeError("`tol` must be a float") 119 | elif tol <= 0.0: 120 | raise ValueError("`tol` must be greater than 0.0") 121 | if step_sz != None: 122 | if not isinstance(step_sz, float): 123 | raise TypeError("`step_sz` should be a float") 124 | elif step_sz <= 0.0: 125 | raise ValueError("`step_sz` should be greater than 0.0") 126 | elif step_sz <= tol: 127 | raise ValueError("`step_sz` must be greater than `tol`.") 128 | if not isinstance(step_sz_div, float): 129 | raise TypeError("`step_sz_div` must be a float") 130 | elif step_sz_div <= 0.0: 131 | raise ValueError("`step_sz_div` must be greater than 0.0") 132 | if scm.W is None: 133 | raise ValueError("No weight matrix available; fit data first.") 134 | 135 | gaps = scm._gaps(Z0=Z0, Z1=Z1) 136 | if step_sz is None: 137 | # Try to guess a step-size 138 | if len(post_periods) > 1: 139 | factor = np.std(gaps.loc[post_periods]) 140 | else: 141 | factor = gaps.loc[post_periods].item() / 2.0 142 | step_sz = 2.5 * factor / step_sz_div 143 | if step_sz <= tol: 144 | # Failed to guess a sensible step-size :( 145 | step_sz = 1.1 * tol 146 | 147 | conf_interval = dict() 148 | n_periods = len(post_periods) 149 | for idx, post_period in enumerate(post_periods, 1): 150 | if 
verbose: 151 | print( 152 | f"({idx}/{n_periods}) Calculating confidence interval " 153 | f"for time-period t={post_period}..." 154 | ) 155 | new_time_range = pre_periods + [post_period] 156 | Z0_new, Z1_new = Z0.loc[new_time_range], Z1.loc[new_time_range] 157 | Z1_post_orig = Z1_new.loc[post_period].item() 158 | 159 | def _compute_p_value(g): 160 | Z1_new.loc[post_period] = Z1_post_orig - g 161 | scm.fit(Z0=Z0_new, Z1=Z1_new, **scm_fit_args) 162 | _gaps = scm._gaps(Z0=Z0_new, Z1=Z1_new) 163 | 164 | u_hat = _gaps.loc[new_time_range] 165 | u_hat_post = u_hat.loc[post_period] 166 | return np.mean(abs(u_hat) >= abs(u_hat_post)) 167 | 168 | lower_ci = self._root_search( 169 | fn=lambda x: _compute_p_value(x) - alpha, 170 | x0=gaps.loc[post_period], 171 | direction=-1.0, 172 | tol=tol, 173 | step_sz=step_sz, 174 | max_iter=max_iter, 175 | ) 176 | 177 | upper_ci = self._root_search( 178 | fn=lambda x: _compute_p_value(x) - alpha, 179 | x0=gaps.loc[post_period], 180 | direction=1.0, 181 | tol=tol, 182 | step_sz=step_sz, 183 | max_iter=max_iter, 184 | ) 185 | 186 | conf_interval[post_period] = (lower_ci, upper_ci) 187 | if verbose: 188 | print( 189 | f"\t{100 * (1 - alpha)}% CI: [{round(lower_ci, 3)}, {round(upper_ci, 3)}]" 190 | ) 191 | 192 | df_ci = pd.DataFrame.from_dict( 193 | conf_interval, orient="index", columns=["lower_ci", "upper_ci"] 194 | ) 195 | df_ci = pd.concat([gaps.loc[post_periods].rename("value"), df_ci], axis=1) 196 | df_ci.index.name = "time" 197 | return df_ci 198 | 199 | def _root_search( 200 | self, 201 | fn: Callable, 202 | x0: float, 203 | direction: Literal[+1, -1], 204 | tol: float, 205 | step_sz: float, 206 | max_iter: int, 207 | theta: float = 0.75, 208 | phi: float = 1.3, 209 | ) -> float: 210 | """Search for a root 211 | 212 | Parameters 213 | ---------- 214 | fn : callable 215 | Function to find a root of 216 | x0 : float 217 | Starting point 218 | direction : int 219 | Direction, either -1.0 or +1.0. 220 | tol : float 221 | Tolerance 222 | step_sz : float 223 | Step size in the search 224 | max_iter : int 225 | Maximum number of iterations 226 | theta : float, optional 227 | Step size reduction factor, should be positive and < 1.0, by default 0.75 228 | phi : float, optional 229 | Step size increase factor, should be positive and > 1.0, by default 1.3 230 | 231 | Returns 232 | ------- 233 | float 234 | Root of the function 235 | 236 | Raises 237 | ------ 238 | Exception 239 | if `max_iter` iterations exceeded before satisfying tolerance condition. 240 | 241 | :meta private: 242 | """ 243 | x, gamma = x0, step_sz 244 | for _ in range(max_iter): 245 | if gamma <= tol: 246 | return x 247 | y = fn(x + gamma * direction) 248 | if y > 0.0: 249 | x = x + gamma * direction 250 | gamma = phi * gamma 251 | else: 252 | gamma = theta * gamma 253 | raise Exception( 254 | "Exceeded `max_iter` iterations without satisfying tolerance requirement." 
255 | ) 256 | -------------------------------------------------------------------------------- /pysyncon/penalized.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional, Literal, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.optimize import minimize, Bounds, LinearConstraint 7 | 8 | from .dataprep import Dataprep 9 | from .base import BaseSynth 10 | 11 | 12 | class PenalizedOptimMixin: 13 | @staticmethod 14 | def w_optimize( 15 | V_mat: np.ndarray, 16 | X0: np.ndarray, 17 | X1: np.ndarray, 18 | lambda_: float, 19 | qp_method: Literal["SLSQP"] = "SLSQP", 20 | qp_options: dict = {"maxiter": 1000}, 21 | ) -> tuple[np.ndarray, float]: 22 | """Solves the weight optimisation problem in the penalized setting, 23 | see Abadie & L'Hour :cite:`penalized2021`. 24 | 25 | Parameters 26 | ---------- 27 | V_mat : numpy.ndarray, shape (c, c) 28 | The V matrix (using the notation of the Abadie, Diamond & 29 | Hainmueller paper, this matrix is denoted by Γ in the Abadie and 30 | L'Hour paper). 31 | X0 : numpy.ndarray, shape (c, m) 32 | Matrix with each column corresponding to a control unit and each 33 | row is covariates. 34 | X1 : numpy.ndarray, shape (c,) 35 | Column vector giving the covariate values for the treated unit. 36 | lambda_ : float, 37 | Regularization parameter. 38 | qp_method : str, optional 39 | Minimization routine to use in scipy minimize to solve the problem 40 | , by default "SLSQP" 41 | qp_options : dict, optional 42 | Options for scipy minimize, by default {"maxiter": 1000} 43 | 44 | Returns 45 | ------- 46 | tuple[np.ndarray, float] 47 | tuple of the optimal weights and the loss 48 | 49 | :meta private: 50 | """ 51 | _, n_c = X0.shape 52 | 53 | diff = np.subtract(X0, X1.reshape(-1, 1)) 54 | r = np.diag(diff.T @ V_mat @ diff) 55 | 56 | P = X0.T @ V_mat @ X0 57 | q = -1.0 * X1.T @ V_mat @ X0 + (lambda_ / 2.0) * r.T 58 | 59 | def fun(x): 60 | return q.T @ x + 0.5 * x.T @ P @ x 61 | 62 | bounds = Bounds(lb=np.full(n_c, 0.0), ub=np.full(n_c, 1.0)) 63 | constraints = LinearConstraint(A=np.full(n_c, 1.0), lb=1.0, ub=1.0) 64 | 65 | x0 = np.full(n_c, 1 / n_c) 66 | res = minimize( 67 | fun=fun, 68 | x0=x0, 69 | bounds=bounds, 70 | constraints=constraints, 71 | method=qp_method, 72 | options=qp_options, 73 | ) 74 | W, loss_W = res["x"], res["fun"] 75 | return W, loss_W.item() 76 | 77 | 78 | class PenalizedSynth(BaseSynth, PenalizedOptimMixin): 79 | """Implementation of the penalized synthetic control method due to 80 | Abadie & L'Hour :cite:`penalized2021`. 81 | """ 82 | 83 | def __init__(self) -> None: 84 | super().__init__() 85 | self.loss_W: Optional[float] = None 86 | self.lambda_: Optional[float] = None 87 | 88 | def fit( 89 | self, 90 | dataprep: Optional[Dataprep] = None, 91 | X0: Optional[pd.DataFrame] = None, 92 | X1: Optional[Union[pd.Series, pd.DataFrame]] = None, 93 | lambda_: Optional[float] = 0.01, 94 | custom_V: Optional[np.ndarray] = None, 95 | ) -> None: 96 | """Fit the model/calculate the weights. 97 | 98 | Parameters 99 | ---------- 100 | dataprep : Dataprep, optional 101 | :class:`Dataprep` object containing data to model, by default None. 102 | X0 : pd.DataFrame, shape (c, m), optional 103 | Matrix with each column corresponding to a control unit and each 104 | row is a covariate value, by default None. 105 | X1 : pandas.Series, shape (c, 1), optional 106 | Column vector giving the covariate values for the treated unit, by 107 | default None. 
108 | lambda_ : float, optional 109 | Ridge parameter to use, default 0.01 110 | custom_V : numpy.ndarray, shape (c, c), optional 111 | Provide a V matrix (using the notation of the Abadie, Diamond & 112 | Hainmueller paper, this matrix is denoted by Γ in the Abadie and 113 | L'Hour paper), if not provided then the identity matrix is used 114 | (equal importance to all covariates). 115 | 116 | Returns 117 | ------- 118 | NoneType 119 | None 120 | 121 | Raises 122 | ------ 123 | ValueError 124 | if neither a Dataprep object nor all of (X0, X1) are 125 | supplied. 126 | """ 127 | if dataprep: 128 | if ( 129 | isinstance(dataprep.treatment_identifier, (list, tuple)) 130 | and len(dataprep.treatment_identifier) > 1 131 | ): 132 | raise ValueError("PenalizedSynth requires exactly one treated unit.") 133 | self.dataprep = dataprep 134 | X0, X1 = dataprep.make_covariate_mats() 135 | else: 136 | if X0 is None or X1 is None: 137 | raise ValueError("dataprep must be set or (X0, X1) must all be set.") 138 | if not isinstance(X1, pd.Series): 139 | raise TypeError("X1 must be of type `pandas.Series`.") 140 | self.lambda_ = lambda_ 141 | 142 | X = pd.concat([X0, X1], axis=1) 143 | X_scaled = X.divide(X.std(axis=1), axis=0) 144 | X0_scaled, X1_scaled = X_scaled.drop(columns=X1.name), X_scaled[X1.name] 145 | 146 | X0_arr = X0_scaled.to_numpy() 147 | X1_arr = X1_scaled.to_numpy() 148 | 149 | if custom_V is None: 150 | V_mat = np.identity(X0_arr.shape[0]) 151 | else: 152 | V_mat = np.diag(custom_V) 153 | 154 | W, loss_W = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr, lambda_=lambda_) 155 | self.W, self.loss_W = W, loss_W 156 | -------------------------------------------------------------------------------- /pysyncon/robust.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | 4 | import numpy as np 5 | 6 | from .dataprep import Dataprep 7 | from .base import BaseSynth 8 | 9 | 10 | class RobustSynth(BaseSynth): 11 | """Implementation of the robust synthetic control method due to 12 | Amjad, Shah & Shen :cite:`robust2018`. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | super().__init__() 17 | self.W: Optional[np.ndarray] = None 18 | self.lambda_: Optional[float] = None 19 | 20 | def fit( 21 | self, 22 | dataprep: Dataprep, 23 | lambda_: float, 24 | threshold: Optional[float] = None, 25 | sv_count: Optional[int] = None, 26 | ) -> None: 27 | """Fit the model/calculate the weights. 28 | 29 | Parameters 30 | ---------- 31 | dataprep : Dataprep 32 | :class:`Dataprep` object containing data to model. 33 | lambda_ : float 34 | Ridge parameter to use. 35 | threshold: float, optional 36 | Remove singular values that are less than this threshold. 
37 | sv_count: int, optional 38 | Keep this many of the largest singular values when 39 | reducing the outcome matrix 40 | """ 41 | if ( 42 | isinstance(dataprep.treatment_identifier, (list, tuple)) 43 | and len(dataprep.treatment_identifier) > 1 44 | ): 45 | raise ValueError("RobustSynth requires exactly one treated unit.") 46 | self.dataprep = dataprep 47 | time_period_min = dataprep.foo[dataprep.time_variable].astype("int").min() 48 | time_period_max = dataprep.foo[dataprep.time_variable].astype("int").max() 49 | 50 | X0, X1 = dataprep.make_outcome_mats( 51 | time_period=range(time_period_min, 1 + time_period_max) 52 | ) 53 | Y = X0.T.values 54 | 55 | M_hat = self._sv_decomposition(Y, threshold, sv_count).T 56 | 57 | time_optim_end = 1 + dataprep.time_optimize_ssr[-1] 58 | end_idx = X0.index.to_list().index(time_optim_end) 59 | M_hat_neg = M_hat[:end_idx, :] 60 | Y1_neg = X1.to_numpy()[:end_idx] 61 | 62 | self.W = np.matmul( 63 | np.linalg.inv( 64 | M_hat_neg.T @ M_hat_neg + lambda_ * np.identity(M_hat_neg.shape[1]) 65 | ), 66 | M_hat_neg.T @ Y1_neg, 67 | ) 68 | 69 | def _sv_decomposition( 70 | self, 71 | Y: np.ndarray, 72 | threshold: Optional[float] = None, 73 | sv_count: Optional[int] = None, 74 | ) -> np.ndarray: 75 | """Calculate the :math:`\hat{M}` matrix from the paper (see :cite:`robust2018`) by 76 | carrying out an SVD of the outcome matrix and remove the specified number 77 | of singular values. 78 | 79 | Parameters 80 | ---------- 81 | Y : np.ndarray 82 | The outcome matrix (:math:`Y` matrix in the notation of the paper). 83 | threshold : Optional[float], optional 84 | Remove singular values that are less that `threshold`, 85 | either this must be specified or `sv_count`, by default None 86 | sv_count : Optional[int], optional 87 | Keep this many of the largest singular values, 88 | either this must be specified or `threshold`, by default None 89 | 90 | Returns 91 | ------- 92 | np.ndarray 93 | The :math:`\hat{M}` matrix from the paper (see :cite:`robust2018`). 94 | 95 | Raises 96 | ------ 97 | ValueError 98 | If neither `threshold` nor `sv_count` are supplied. 99 | 100 | :meta private: 101 | """ 102 | if not threshold and not sv_count: 103 | raise ValueError("One of `threshold` or `sv_count` must be supplied.") 104 | u, s, v = np.linalg.svd(Y) 105 | s_shape = s.shape[0] - 1 106 | if threshold: 107 | idx = 0 108 | while s[idx] > threshold and idx < s_shape: 109 | idx += 1 110 | else: 111 | idx = sv_count 112 | 113 | s_res = np.zeros_like(Y) 114 | s_res[:idx, :idx] = np.diag(s[:idx]) 115 | 116 | r, c = Y.shape 117 | p_hat = max(np.count_nonzero(Y) / (r * c), 1 / (r * c)) 118 | M_hat = (1 / p_hat) * (u @ s_res @ v) 119 | return M_hat 120 | -------------------------------------------------------------------------------- /pysyncon/synth.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Union, Optional, Literal 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.optimize import minimize 7 | 8 | from .dataprep import Dataprep 9 | from .base import BaseSynth, VanillaOptimMixin 10 | from .inference import ConformalInference 11 | 12 | 13 | OptimizerMethod_t = Literal[ 14 | "Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC", "COBYLA", "trust-constr" 15 | ] 16 | 17 | 18 | class Synth(BaseSynth, VanillaOptimMixin): 19 | """Implementation of the synthetic control method due to 20 | Abadie & Gardeazabal :cite:`basque2003`. 
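
    Example
    -------
    A sketch of the typical workflow; ``dataprep`` stands for a
    :class:`Dataprep` instance describing the study data (see the
    :class:`Dataprep` documentation for how to build one):

    >>> from pysyncon import Synth
    >>> synth = Synth()
    >>> synth.fit(dataprep=dataprep)
    >>> synth.summary(round=3)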
21 | """ 22 | 23 | def __init__(self) -> None: 24 | super().__init__() 25 | self.loss_W: Optional[float] = None 26 | self.V: Optional[np.ndarray] = None 27 | self.loss_V: Optional[float] = None 28 | 29 | def fit( 30 | self, 31 | dataprep: Optional[Dataprep] = None, 32 | X0: Optional[pd.DataFrame] = None, 33 | X1: Optional[pd.Series] = None, 34 | Z0: Optional[pd.DataFrame] = None, 35 | Z1: Optional[pd.Series] = None, 36 | custom_V: Optional[np.ndarray] = None, 37 | optim_method: OptimizerMethod_t = "Nelder-Mead", 38 | optim_initial: Literal["equal", "ols"] = "equal", 39 | optim_options: dict = {"maxiter": 1000}, 40 | ) -> None: 41 | """Fit the model/calculate the weights. Either a :class:`Dataprep` object 42 | should be provided or otherwise matrices (:math:`X_0`, :math:`X_1`, :math:`Z_0`, 43 | :math:`Z_1`) should be provided (using the notation of Abadie & 44 | Gardeazabal :cite:`basque2003`). 45 | 46 | Parameters 47 | ---------- 48 | dataprep : Dataprep, optional 49 | :class:`Dataprep` object containing data to model, by default None. 50 | X0 : pd.DataFrame, shape (m, c), optional 51 | Matrix with each column corresponding to a control unit and each 52 | row is covariates, by default None. 53 | X1 : pandas.Series, shape (m, 1), optional 54 | Column vector giving the covariate values for the treated unit, by 55 | default None. 56 | Z0 : pandas.DataFrame, shape (n, c), optional 57 | A matrix of the time series of the outcome variable with each 58 | column corresponding to a control unit and the rows are the time 59 | steps; the columns correspond with the columns of X0, by default 60 | None. 61 | Z1 : pandas.Series, shape (n, 1), optional 62 | Column vector giving the outcome variable values over time for the 63 | treated unit, by default None. 64 | custom_V : numpy.ndarray, shape (c, c), optional 65 | Provide a V matrix (using the notation of the Abadie, Diamond & 66 | Hainmueller paper), the optimisation problem will only then be 67 | solved for the weight matrix W, by default None. 68 | optim_method : str, optional 69 | Optimisation method to use for the outer optimisation, can be 70 | any of the valid options for scipy minimize that do not require a 71 | jacobian matrix, namely 72 | 73 | - 'Nelder-Mead' 74 | - 'Powell' 75 | - 'CG' 76 | - 'BFGS' 77 | - 'L-BFGS-B' 78 | - 'TNC' 79 | - 'COBYLA' 80 | - 'trust-constr' 81 | 82 | By default 'Nelder-Mead'. 83 | optim_initial : str, optional 84 | Starting value for the outer optimisation, possible starting 85 | values are 86 | 87 | - 'equal', where the weights are all equal, 88 | - 'ols', which uses a starting value obtained for fitting a 89 | regression. 90 | 91 | By default 'equal'. 92 | optim_options : dict, optional 93 | options to provide to the outer part of the optimisation, value 94 | options are any option that can be provided to scipy minimize for 95 | the given optimisation method, by default `{'maxiter': 1000}`. 96 | 97 | Returns 98 | ------- 99 | NoneType 100 | None 101 | 102 | Raises 103 | ------ 104 | ValueError 105 | if neither a `Dataprep` object nor all of (:math:`X_0`, :math:`X_1`, 106 | :math:`Z_0`, :math:`Z_1`) are supplied. 107 | TypeError 108 | if (:math:`X1`, :math:`Z1`) are not of type `pandas.Series`. 109 | ValueError 110 | if `optim_initial=ols` and there is collinearity in the data. 111 | ValueError 112 | if `optim_initial` is not one of `'equal'` or `'ols'`. 
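
        Example
        -------
        A sketch of fitting from raw matrices rather than a :class:`Dataprep`
        object, using data simulated with the package's
        :class:`pysyncon.generator.LinearFactorModel` (the sizes are arbitrary
        illustrative choices; only the 60 pre-intervention outcome periods are
        passed as Z0, Z1):

        >>> from pysyncon import Synth
        >>> from pysyncon.generator import LinearFactorModel
        >>> X0, X1, Z0, Z1 = LinearFactorModel(seed=1234).generate(
        ...     n_units=11, n_observable=4, n_unobservable=4,
        ...     n_periods_pre=60, n_periods_post=20,
        ... )
        >>> synth = Synth()
        >>> synth.fit(X0=X0, X1=X1, Z0=Z0.loc[:60], Z1=Z1.loc[:60])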
113 | """ 114 | if dataprep: 115 | if ( 116 | isinstance(dataprep.treatment_identifier, (list, tuple)) 117 | and len(dataprep.treatment_identifier) > 1 118 | ): 119 | raise ValueError("Synth requires exactly one treated unit.") 120 | self.dataprep = dataprep 121 | X0, X1 = dataprep.make_covariate_mats() 122 | Z0, Z1 = dataprep.make_outcome_mats() 123 | else: 124 | if X0 is None or X1 is None or Z0 is None or Z1 is None: 125 | raise ValueError( 126 | "dataprep must be set or (X0, X1, Z0, Z1) must all be set." 127 | ) 128 | if not isinstance(X1, pd.Series) or not isinstance(Z1, pd.Series): 129 | raise TypeError("X1 and Z1 must be of type `pandas.Series`.") 130 | 131 | X = pd.concat([X0, X1], axis=1) 132 | X_scaled = X.divide(X.std(axis=1), axis=0) 133 | X0_scaled, X1_scaled = X_scaled.drop(columns=X1.name), X_scaled[X1.name] 134 | 135 | X0_arr = X0_scaled.to_numpy() 136 | X1_arr = X1_scaled.to_numpy() 137 | Z0_arr = Z0.to_numpy() 138 | Z1_arr = Z1.to_numpy() 139 | 140 | if custom_V is not None: 141 | V_mat = np.diag(custom_V) 142 | W, loss_W = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr) 143 | loss_V = self.calc_loss_V(W=W, Z0=Z0_arr, Z1=Z1_arr) 144 | self.W, self.loss_W, self.V, self.loss_V = W, loss_W, custom_V, loss_V 145 | return 146 | 147 | n_r, _ = X0_arr.shape 148 | 149 | if optim_initial == "equal": 150 | x0 = [1 / n_r] * n_r 151 | elif optim_initial == "ols": 152 | X_arr = np.hstack([X0_arr, X1_arr.reshape(-1, 1)]) 153 | X_arr = np.hstack([np.full((X_arr.shape[1], 1), 1), X_arr.T]) 154 | Z_arr = np.hstack([Z0_arr, Z1_arr.reshape(-1, 1)]) 155 | 156 | try: 157 | beta = np.linalg.inv(X_arr.T @ X_arr) @ X_arr.T @ Z_arr.T 158 | except np.linalg.LinAlgError: 159 | raise ValueError( 160 | 'Could not invert X^T.X required for `optim_initial="ols"`, ' 161 | "probably there is collinearity in your data." 162 | ) 163 | 164 | beta = beta[1:,] # fmt: skip 165 | x0 = np.diag(beta @ beta.T) 166 | x0 = x0 / sum(x0) 167 | else: 168 | raise ValueError("Unknown option for `optim_initial`.") 169 | 170 | def fun(x): 171 | V_mat = np.diag(np.abs(x)) / np.sum(np.abs(x)) 172 | W, _ = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr) 173 | loss_V = self.calc_loss_V(W=W, Z0=Z0_arr, Z1=Z1_arr) 174 | return loss_V 175 | 176 | res = minimize(fun=fun, x0=x0, method=optim_method, options=optim_options) 177 | V_mat = np.diag(np.abs(res["x"])) / np.sum(np.abs(res["x"])) 178 | W, loss_W = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr) 179 | loss_V = self.calc_loss_V(W=W, Z0=Z0_arr, Z1=Z1_arr) 180 | 181 | self.W, self.loss_W, self.V, self.loss_V = W, loss_W, V_mat.diagonal(), loss_V 182 | self.W_names = Z0.columns 183 | 184 | @staticmethod 185 | def calc_loss_V(W: np.ndarray, Z0: np.ndarray, Z1: np.ndarray) -> float: 186 | """Calculates the V loss. 187 | 188 | Parameters 189 | ---------- 190 | W : numpy.ndarray, shape (n,) 191 | Vector of the control weights 192 | Z0 : numpy.ndarray, shape (m, n) 193 | Matrix of the time series of the outcome variable with each 194 | column corresponding to a control unit and the rows are the time 195 | steps. 196 | Z1 : numpy.ndarray, shape (m,) 197 | Column vector giving the outcome variable values over time for the 198 | treated unit 199 | 200 | Returns 201 | ------- 202 | float 203 | V loss. 
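
            In symbols, this is

            .. math::
                L(V) = \frac{(Z_1 - Z_0 W)^{\top} (Z_1 - Z_0 W)}{m}

            where :math:`m` is the number of time periods (rows of
            :math:`Z_0`), matching the computation
            ``(Z1 - Z0 @ W).T @ (Z1 - Z0 @ W) / len(Z0)`` below.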
204 | 
205 |         :meta private:
206 |         """
207 |         loss_V = (Z1 - Z0 @ W).T @ (Z1 - Z0 @ W) / len(Z0)
208 |         return loss_V.item()
209 | 
210 |     def summary(
211 |         self,
212 |         round: int = 3,
213 |         X0: Optional[pd.DataFrame] = None,
214 |         X1: Optional[pd.Series] = None,
215 |     ) -> pd.DataFrame:
216 |         """Generates a ``pandas.DataFrame`` with summary data. In particular,
217 |         it shows the values of the V matrix for each predictor, then the mean
218 |         value of each predictor over the time period ``time_predictors_prior``
219 |         for the treated unit and for the synthetic unit, and finally a column
220 |         'sample mean' that shows the mean value of each predictor over the
221 |         time period ``time_predictors_prior`` across all the control units,
222 |         i.e. this will be the same as a synthetic control where all the
223 |         weights are equal.
224 | 
225 |         Parameters
226 |         ----------
227 |         round : int, optional
228 |             Round the numbers to the given number of places, by default 3
229 |         X0 : pd.DataFrame, shape (n_cov, n_controls), optional
230 |             Matrix with each column corresponding to a control unit and each
231 |             row is a covariate. If no dataprep is set, then this must be
232 |             supplied along with X1, by default None.
233 |         X1 : pandas.Series, shape (n_cov, 1), optional
234 |             Column vector giving the covariate values for the treated unit.
235 |             If no dataprep is set, then this must be supplied along with X0,
236 |             by default None.
237 | 
238 |         Returns
239 |         -------
240 |         pandas.DataFrame
241 |             Summary data.
242 | 
243 |         Raises
244 |         ------
245 |         ValueError
246 |             If there is no V matrix available
247 |         ValueError
248 |             If there is no :class:`Dataprep` object set or (X0, X1) is not supplied
249 |         ValueError
250 |             If there is no weight matrix available
251 |         """
252 |         if self.V is None:
253 |             raise ValueError("No V matrix available; fit data first.")
254 |         summary_ser = super().summary(round=round, X0=X0, X1=X1)
255 | 
256 |         V = pd.Series(self.V, index=summary_ser.index, name="V")
257 |         return pd.concat([V, summary_ser], axis=1).round(round)
258 | 
259 |     def confidence_interval(
260 |         self,
261 |         alpha: float,
262 |         time_periods: list,
263 |         tol: float,
264 |         pre_periods: Optional[list] = None,
265 |         dataprep: Optional[Dataprep] = None,
266 |         X0: Optional[pd.DataFrame] = None,
267 |         X1: Optional[pd.Series] = None,
268 |         Z0: Optional[pd.DataFrame] = None,
269 |         Z1: Optional[pd.Series] = None,
270 |         custom_V: Optional[np.ndarray] = None,
271 |         optim_method: OptimizerMethod_t = None,
272 |         optim_initial: Literal["equal", "ols"] = None,
273 |         optim_options: dict = None,
274 |         method: Literal["conformal"] = "conformal",
275 |         max_iter: int = 50,
276 |         step_sz: Optional[float] = None,
277 |         step_sz_div: float = 20.0,
278 |         verbose: bool = True,
279 |     ) -> pd.DataFrame:
280 |         """Confidence intervals obtained from test-inversion, where
281 |         the p-values are obtained by adjusted refits of the data
282 |         following Chernozhukov et al. :cite:`inference2021`.
283 | 
284 |         Parameters
285 |         ----------
286 |         alpha : float
287 |             The required significance level, e.g. alpha = 0.05 will
288 |             yield a confidence level of 100 * (1 - alpha) = 95%.
289 |         time_periods : list
290 |             The time-periods to calculate confidence intervals for.
291 |         tol : float
292 |             The tolerance (accuracy) required when calculating the
293 |             lower/upper cut-off point of the confidence interval. The search
294 |             will try to obtain this tolerance level but will not exceed `max_iter`
295 |             iterations trying to achieve that.
296 |         pre_periods : Optional[list], optional
297 |             The time-periods to use for the optimization when refitting the
298 |             data with the adjusted outcomes, by default None.
299 |         dataprep : Optional[Dataprep], optional
300 |             Dataprep object defining the study data, if this is not supplied
301 |             then either self.dataprep must be set or else (X0, X1, Z0, Z1) must
302 |             all be supplied, by default None.
303 |         X0 : pd.DataFrame, shape (m, c), optional
304 |             Matrix with each column corresponding to a control unit and each
305 |             row is a covariate, if this is not supplied then either `dataprep` must
306 |             be supplied or `self.dataprep` must be set, by default None.
307 |         X1 : pandas.Series, shape (m, 1), optional
308 |             Column vector giving the covariate values for the treated unit, if
309 |             this is not supplied then either `dataprep` must
310 |             be supplied or `self.dataprep` must be set, by default None.
311 |         Z0 : pandas.DataFrame, shape (n, c), optional
312 |             A matrix of the time series of the outcome variable with each
313 |             column corresponding to a control unit and the rows are the time
314 |             steps; the columns correspond with the columns of X0, if this
315 |             is not supplied then either `dataprep` must be supplied or
316 |             `self.dataprep` must be set, by default None.
317 |         Z1 : pandas.Series, shape (n, 1), optional
318 |             Column vector giving the outcome variable values over time for the
319 |             treated unit, if this is not supplied then either `dataprep` must
320 |             be supplied or `self.dataprep` must be set, by default None.
321 |         custom_V : numpy.ndarray, shape (c, c), optional
322 |             Provide a V matrix (using the notation of the Abadie, Diamond &
323 |             Hainmueller paper), the optimisation problem will only then be
324 |             solved for the weight matrix W. This is the same argument
325 |             as in the `fit` method, by default None.
326 |         optim_method : str, optional
327 |             Optimisation method to use for the outer optimisation, can be
328 |             any of the valid options for scipy minimize that do not require a
329 |             jacobian matrix, namely
330 | 
331 |             - 'Nelder-Mead'
332 |             - 'Powell'
333 |             - 'CG'
334 |             - 'BFGS'
335 |             - 'L-BFGS-B'
336 |             - 'TNC'
337 |             - 'COBYLA'
338 |             - 'trust-constr'
339 | 
340 |             This is the same argument as in the `fit` method, by default
341 |             'Nelder-Mead'.
342 |         optim_initial : str, optional
343 |             Starting value for the outer optimisation, possible starting
344 |             values are
345 | 
346 |             - 'equal', where the weights are all equal,
347 |             - 'ols', which uses a starting value obtained by fitting a
348 |               regression.
349 | 
350 |             This is the same argument as in the `fit` method, by default
351 |             'equal'.
352 |         optim_options : dict, optional
353 |             Options to provide to the outer part of the optimisation, valid
354 |             options are any option that can be provided to scipy minimize for
355 |             the given optimisation method. This is the same argument as in
356 |             the `fit` method, by default `{'maxiter': 1000}`.
357 |         method : str, optional
358 |             The type of method to use when computing the confidence intervals,
359 |             currently only conformal inference (`conformal`) is implemented,
360 |             by default "conformal".
361 |         max_iter : int, optional
362 |             Maximum number of times to re-fit the data when trying to locate
363 |             the lower/upper cut-off point and when binary searching for the
364 |             cut-off point, by default 50.
365 |         step_sz : Optional[float], optional
366 |             Step size to use when searching for an interval that contains the
367 |             lower or upper cut-off point of the confidence interval, by default None.
368 | step_sz_div : float, optional 369 | Alternative way to define step size: it is the fraction that defines 370 | step-size in terms of the standard deviation of the att, i.e. if 371 | `step_sz_div=20.0` then the step size used will be (att +/- 2.5 * std(att)) / 20.0, 372 | by default 20.0. 373 | verbose : bool, optional 374 | Print output, by default True. 375 | 376 | Returns 377 | ------- 378 | pd.DataFrame 379 | A pandas.DataFrame indexed by `post_periods`, with 3 columns: `value` that 380 | gives the calculated treatment effect, `lower_ci` that gives the value 381 | defining the lower-end of the confidence interval, `upper_ci` that gives 382 | the value defining the upper-end of the confidence interval. 383 | 384 | Raises 385 | ------ 386 | ValueError 387 | If there is no :class:`Dataprep` object set or (X0, X1, Z0, Z1) is not supplied or 388 | `self.dataprep` is not set. 389 | TypeError 390 | if (:math:`X1`, :math:`Z1`) are not of type `pandas.Series`. 391 | ValueError 392 | if `dataprep` is not set and `pre-periods` is not set. 393 | ValueError 394 | if an invalid option for `method` is given, currently only `conformal` is supported. 395 | """ 396 | if method == "conformal": 397 | if dataprep is not None: 398 | X0, X1 = dataprep.make_covariate_mats() 399 | if pre_periods is None: 400 | pre_periods = list(dataprep.time_optimize_ssr) 401 | if 1.0 / len(pre_periods) > alpha: 402 | raise ValueError( 403 | "Too few pre-intervention time-periods available for " 404 | f"significance level `alpha`={alpha}, either increase `alpha` " 405 | "or use more pre-intervention time-periods." 406 | ) 407 | all_time_periods = time_periods + list(pre_periods) 408 | Z0, Z1 = dataprep.make_outcome_mats(time_period=all_time_periods) 409 | elif self.dataprep is not None: 410 | X0, X1 = self.dataprep.make_covariate_mats() 411 | if pre_periods is None: 412 | pre_periods = list(self.dataprep.time_optimize_ssr) 413 | if 1.0 / len(pre_periods) > alpha: 414 | raise ValueError( 415 | "Too few pre-intervention time-periods available for " 416 | f"significance level `alpha`={alpha}, either increase `alpha` " 417 | "or use more pre-intervention time-periods." 418 | ) 419 | all_time_periods = time_periods + list(pre_periods) 420 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=all_time_periods) 421 | else: 422 | if X0 is None or X1 is None or Z0 is None or Z1 is None: 423 | raise ValueError( 424 | "dataprep must be set or (X0, X1, Z0, Z1) must all be set." 425 | ) 426 | if not isinstance(X1, pd.Series) or not isinstance(Z1, pd.Series): 427 | raise TypeError("X1 and Z1 must be of type `pandas.Series`.") 428 | if pre_periods is None: 429 | raise ValueError("`pre_periods` must be set if not using dataprep.") 430 | if 1.0 / len(pre_periods) > alpha: 431 | raise ValueError( 432 | "Too few pre-intervention time-periods available for " 433 | f"significance level `alpha`={alpha}, either increase `alpha` " 434 | "or use more pre-intervention time-periods." 
435 | ) 436 | 437 | scm_fit_args = {"X0": X0, "X1": X1} 438 | if custom_V is not None: 439 | scm_fit_args["custom_V"] = custom_V 440 | if optim_method: 441 | scm_fit_args["optim_method"] = optim_method 442 | if optim_initial: 443 | scm_fit_args["optim_initial"] = optim_initial 444 | if optim_options: 445 | scm_fit_args["optim_options"] = optim_options 446 | 447 | conformal_inf = ConformalInference() 448 | df_cis = conformal_inf.confidence_intervals( 449 | alpha=alpha, 450 | scm=self, 451 | Z0=Z0, 452 | Z1=Z1, 453 | pre_periods=pre_periods, 454 | post_periods=time_periods, 455 | scm_fit_args=scm_fit_args, 456 | max_iter=max_iter, 457 | tol=tol, 458 | step_sz=step_sz, 459 | step_sz_div=step_sz_div, 460 | verbose=verbose, 461 | ) 462 | return df_cis 463 | else: 464 | raise ValueError("Invalid option for `method`.") 465 | -------------------------------------------------------------------------------- /pysyncon/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional, Union 3 | from concurrent import futures 4 | import copy 5 | from dataclasses import dataclass 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | 11 | from .dataprep import Dataprep, IsinArg_t 12 | from .base import BaseSynth 13 | 14 | 15 | class HoldoutSplitter: 16 | """Iterator that prepares the time series for cross-validation by 17 | progressively removing blocks of length `holdout_len`. 18 | """ 19 | 20 | def __init__(self, df: pd.DataFrame, ser: pd.Series, holdout_len: int = 1): 21 | """Iterator that prepares the time series for cross-validation by 22 | progressively removing blocks of length `holdout_len`. 23 | 24 | Parameters 25 | ---------- 26 | df : pandas.DataFrame, shape (r, c) 27 | Dataframe that will be split for the cross-validation. 28 | ser : pandas.Series, shape (r, 1) 29 | Series that will split for the cross-validation. 30 | holdout_len : int, optional 31 | Number of days to remove in each iteration, by default 1. 32 | 33 | Raises 34 | ------ 35 | ValueError 36 | if df and ser do not have the same number of rows. 37 | ValueError 38 | if `holdout_len` is not >= 1. 39 | ValueError 40 | if `holdout_len` is larger than the number of rows of df. 41 | """ 42 | if df.shape[0] != ser.shape[0]: 43 | raise ValueError("`df` and `ser` must have the same number of rows.") 44 | if holdout_len < 1: 45 | raise ValueError("`holdout_len` must be at least 1.") 46 | if holdout_len >= df.shape[0]: 47 | raise ValueError("`holdout_len` must be less than df.shape[0]") 48 | self.df = df 49 | self.ser = ser 50 | self.holdout_len = holdout_len 51 | self.idx = 0 52 | 53 | def __iter__(self): 54 | self.idx = 0 55 | return self 56 | 57 | def __next__(self) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: 58 | if (self.idx + self.holdout_len) > self.df.shape[0]: 59 | raise StopIteration 60 | holdout = slice(self.idx, self.idx + self.holdout_len) 61 | 62 | df_holdout = self.df.iloc[holdout,] # fmt: skip 63 | ser_holdout = self.ser.iloc[holdout] 64 | 65 | df = self.df.drop(index=self.df.index[holdout]) 66 | ser = self.ser.drop(index=self.ser.index[holdout]) 67 | 68 | self.idx += 1 69 | return df, df_holdout, ser, ser_holdout 70 | 71 | 72 | @dataclass 73 | class CrossValidationResult: 74 | """Convenience class for holding the results of the cross-validation 75 | procedure from the AugSynth. 
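
    Example
    -------
    An illustrative sketch with made-up numbers (not output from a real
    cross-validation run):

    >>> import numpy as np
    >>> res = CrossValidationResult(
    ...     lambdas=np.array([0.1, 1.0, 10.0]),
    ...     errors_mean=np.array([2.0, 1.0, 1.5]),
    ...     errors_se=np.array([0.2, 0.2, 0.2]),
    ... )
    >>> res.best_lambda()
    1.0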
76 | """ 77 | 78 | lambdas: np.ndarray 79 | errors_mean: np.ndarray 80 | errors_se: np.ndarray 81 | 82 | def best_lambda(self, min_1se: bool = True) -> float: 83 | """Return the best lambda. 84 | 85 | Parameters 86 | ---------- 87 | min_1se : bool, optional 88 | return the largest lambda within 1 standard error of the minimum 89 | , by default True 90 | 91 | Returns 92 | ------- 93 | float 94 | """ 95 | if min_1se: 96 | return ( 97 | self.lambdas[ 98 | self.errors_mean 99 | <= self.errors_mean.min() 100 | + self.errors_se[self.errors_mean.argmin()] 101 | ] 102 | .max() 103 | .item() 104 | ) 105 | return self.lambdas[self.errors_mean.argmin()].item() 106 | 107 | def plot(self) -> None: 108 | """Plots the mean errors against the lambda values with the standard 109 | errors as error bars. 110 | """ 111 | plt.errorbar( 112 | x=self.lambdas, 113 | y=self.errors_mean, 114 | yerr=self.errors_se, 115 | ecolor="black", 116 | capsize=2, 117 | ) 118 | plt.xlabel("Lambda") 119 | plt.ylabel("Mean error") 120 | plt.xscale("log") 121 | plt.yscale("log") 122 | plt.title("Cross validation result") 123 | plt.grid() 124 | plt.show() 125 | 126 | 127 | class PlaceboTest: 128 | """Class that carries out placebo tests by running a synthetic control 129 | study using each possible control unit as the treated unit and the 130 | remaining control units as controls. See :cite:`germany2015` for more details. 131 | """ 132 | 133 | def __init__(self) -> None: 134 | self.paths: Optional[pd.DataFrame] = None 135 | self.treated_path: Optional[pd.DataFrame] = None 136 | self.gaps: Optional[pd.DataFrame] = None 137 | self.treated_gap: Optional[pd.DataFrame] = None 138 | self.time_optimize_ssr: Optional[IsinArg_t] = None 139 | 140 | def fit( 141 | self, 142 | dataprep: Dataprep, 143 | scm: BaseSynth, 144 | scm_options: dict = {}, 145 | max_workers: Optional[int] = None, 146 | verbose: bool = True, 147 | ): 148 | """Run the placebo tests. This method is multi-process and by default 149 | will use all available processors. Use the `max_workers` option to change 150 | this behaviour. 151 | 152 | Parameters 153 | ---------- 154 | dataprep : Dataprep 155 | :class:`Dataprep` object containing data to model, by default None. 
127 | class PlaceboTest: 128 | """Class that carries out placebo tests by running a synthetic control 129 | study using each possible control unit as the treated unit and the 130 | remaining control units as controls. See :cite:`germany2015` for more details. 131 | """ 132 | 133 | def __init__(self) -> None: 134 | self.paths: Optional[pd.DataFrame] = None 135 | self.treated_path: Optional[pd.DataFrame] = None 136 | self.gaps: Optional[pd.DataFrame] = None 137 | self.treated_gap: Optional[pd.DataFrame] = None 138 | self.time_optimize_ssr: Optional[IsinArg_t] = None 139 | 140 | def fit( 141 | self, 142 | dataprep: Dataprep, 143 | scm: BaseSynth, 144 | scm_options: dict = {}, 145 | max_workers: Optional[int] = None, 146 | verbose: bool = True, 147 | ): 148 | """Run the placebo tests. This method is multi-process and by default 149 | will use all available processors. Use the `max_workers` option to change 150 | this behaviour. 151 | 152 | Parameters 153 | ---------- 154 | dataprep : Dataprep 155 | :class:`Dataprep` object containing the data to model. 156 | scm : Synth | AugSynth 157 | Synthetic control study to use 158 | scm_options : dict, optional 159 | Options to provide to the fit method of the synthetic control 160 | study, valid options are any valid option that `scm` 161 | takes, by default {} 162 | max_workers : Optional[int], optional 163 | Maximum number of processes to use, if not provided then will use 164 | all available, by default None 165 | verbose : bool, optional 166 | Whether or not to output progress, by default True 167 | """ 168 | paths, gaps = list(), list() 169 | n_tests = len(dataprep.controls_identifier) 170 | with futures.ProcessPoolExecutor(max_workers=max_workers) as executor: 171 | to_do = list() 172 | for treated, controls in self.placebo_iter(dataprep.controls_identifier): 173 | _dataprep = copy.copy(dataprep) 174 | _dataprep.treatment_identifier = treated 175 | _dataprep.controls_identifier = controls 176 | to_do.append( 177 | executor.submit( 178 | self._single_placebo, 179 | dataprep=_dataprep, 180 | scm=scm, 181 | scm_options=scm_options, 182 | ) 183 | ) 184 | for idx, future in enumerate(futures.as_completed(to_do), 1): 185 | path, gap = future.result() 186 | if verbose: 187 | print(f"({idx}/{n_tests}) Completed placebo test for {path.name}.") 188 | paths.append(path) 189 | gaps.append(gap) 190 | 191 | self.paths = pd.concat(paths, axis=1) 192 | self.gaps = pd.concat(gaps, axis=1) 193 | self.time_optimize_ssr = dataprep.time_optimize_ssr 194 | 195 | print("Calculating treated unit gaps.") 196 | self.treated_path, self.treated_gap = self._single_placebo( 197 | dataprep=dataprep, scm=scm, scm_options=scm_options 198 | ) 199 | print("Done.") 200 | 201 | @staticmethod 202 | def placebo_iter(controls: list[str]) -> Iterator[tuple[str, list[str]]]: 203 | """Generates combinations of (treated unit, control units) for the 204 | placebo tests. 205 | 206 | Parameters 207 | ---------- 208 | controls : list[str] 209 | List of unit labels to use 210 | 211 | Yields 212 | ------ 213 | tuple[str, list[str]] 214 | Tuple of (treated unit label, control unit labels) 215 | 216 | :meta private: 217 | """ 218 | for control in controls: 219 | yield (control, [c for c in controls if c != control]) 220 |
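# For example, placebo_iter(["A", "B", "C"]) yields, in turn,
# ("A", ["B", "C"]), ("B", ["A", "C"]) and ("C", ["A", "B"]).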
221 | @staticmethod 222 | def _single_placebo( 223 | dataprep: Dataprep, scm: BaseSynth, scm_options: dict = {} 224 | ) -> tuple[pd.Series, pd.Series]: 225 | """Run a single placebo test. 226 | 227 | Parameters 228 | ---------- 229 | dataprep : Dataprep 230 | :class:`Dataprep` object containing data to model 231 | scm : Synth | AugSynth 232 | Type of synthetic control study to use 233 | scm_options : dict, optional 234 | Options to provide to the fit method of the synthetic control 235 | study, valid options are any valid option that `scm` takes, by 236 | default {} 237 | 238 | Returns 239 | ------- 240 | tuple[pandas.Series, pandas.Series] 241 | A time-series of the path of the synthetic control and a 242 | time-series of the gap between the treated unit and the synthetic 243 | control. 244 | 245 | :meta private: 246 | """ 247 | scm.fit(dataprep=dataprep, **scm_options) 248 | 249 | Z0, Z1 = dataprep.make_outcome_mats( 250 | time_period=dataprep.foo[dataprep.time_variable] 251 | ) 252 | synthetic = scm._synthetic(Z0=Z0) 253 | gaps = scm._gaps(Z0=Z0, Z1=Z1) 254 | return synthetic.rename(dataprep.treatment_identifier), gaps.rename( 255 | dataprep.treatment_identifier 256 | ) 257 | 258 | def gaps_plot( 259 | self, 260 | time_period: Optional[IsinArg_t] = None, 261 | grid: bool = True, 262 | treatment_time: Optional[int] = None, 263 | mspe_threshold: Optional[float] = None, 264 | exclude_units: Optional[list] = None, 265 | ): 266 | """Plot the gaps between the treated unit and the synthetic control 267 | for each placebo test. 268 | 269 | Parameters 270 | ---------- 271 | time_period : Iterable | pandas.Series | dict, optional 272 | Time range to plot, if none is supplied then the time range used 273 | is the time period over which the optimisation happens, by default 274 | None 275 | grid : bool, optional 276 | Whether or not to plot a grid, by default True 277 | treatment_time : int, optional 278 | If supplied, plot a vertical line at the time period that the 279 | treatment time occurred, by default None 280 | mspe_threshold : float, optional 281 | Remove any non-treated units whose MSPE pre-treatment is :math:`>` 282 | mspe_threshold :math:`\\times` the MSPE of the treated unit pre-treatment; 283 | this excludes units whose synthetic control matched poorly pre-treatment relative to the treated unit's. 284 | exclude_units : list, optional 285 | List of units to exclude from the plot, by default None 286 | 287 | Raises 288 | ------ 289 | ValueError 290 | if no placebo test has been run yet 291 | ValueError 292 | if `mspe_threshold` is supplied but `treatment_time` is not. 293 | """ 294 | if self.gaps is None: 295 | raise ValueError("No gaps available; run a placebo test first.") 296 | time_period = time_period if time_period is not None else self.time_optimize_ssr 297 | 298 | gaps = self.gaps.drop(columns=exclude_units) if exclude_units else self.gaps 299 | 300 | if mspe_threshold: 301 | if not treatment_time: 302 | raise ValueError("Need `treatment_time` to use `mspe_threshold`.") 303 | pre_mspe = gaps.loc[:treatment_time].pow(2).sum(axis=0) 304 | pre_mspe_treated = self.treated_gap.loc[:treatment_time].pow(2).sum(axis=0) 305 | keep = pre_mspe[pre_mspe < mspe_threshold * pre_mspe_treated].index 306 | placebo_gaps = gaps[gaps.index.isin(time_period)][keep] 307 | else: 308 | placebo_gaps = gaps[gaps.index.isin(time_period)] 309 | 310 | plt.plot(placebo_gaps, color="black", alpha=0.1) 311 | plt.plot(self.treated_gap, color="black", alpha=1.0) 312 | if treatment_time: 313 | plt.axvline(x=treatment_time, ymin=0.05, ymax=0.95, linestyle="dashed") 314 | plt.grid(grid) 315 | plt.show() 316 | 317 | def pvalue(self, treatment_time: int) -> float: 318 | """Calculate the p-value of Abadie et al.'s version of Fisher's 319 | exact hypothesis test for the null hypothesis of no treatment 320 | effect, see section 2.2 of :cite:`fp2018`. 321 | 322 | Parameters 323 | ---------- 324 | treatment_time : int 325 | The time period in which the treatment occurred 326 | 327 | Returns 328 | ------- 329 | float 330 | p-value for the null hypothesis of no treatment effect 331 | 332 | Raises 333 | ------ 334 | ValueError 335 | if no placebo test has been run yet 336 | """ 337 | if self.gaps is None or self.treated_gap is None: 338 | raise ValueError("Run a placebo test first.") 339 | 340 | all_ = pd.concat([self.gaps, self.treated_gap], axis=1) 341 | 342 | denom = all_.loc[:treatment_time].pow(2).sum(axis=0) 343 | num = all_.loc[treatment_time:].pow(2).sum(axis=0) 344 | 345 | t, _ = self.gaps.shape 346 | t0, _ = self.gaps.loc[:treatment_time].shape 347 | 348 | rmspe = (num / (t - t0)) / (denom / t0) 349 | return sum( 350 | rmspe.drop(index=self.treated_gap.name) >= rmspe.loc[self.treated_gap.name] 351 | ) / len(rmspe) 352 |
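Taken together, a hedged end-to-end sketch of the placebo workflow implemented above (an editorial example: `dataprep` is assumed to be a configured `Dataprep`, e.g. the Basque study set-up used in the tests, and the option values shown are illustrative):

    from pysyncon import Dataprep, Synth
    from pysyncon.utils import PlaceboTest

    # dataprep = Dataprep(...)  # see tests/test_synth_basque.py for a full set-up
    placebo = PlaceboTest()
    placebo.fit(
        dataprep=dataprep,
        scm=Synth(),
        scm_options={"optim_method": "Nelder-Mead", "optim_initial": "equal"},
    )
    placebo.gaps_plot(treatment_time=1975, mspe_threshold=20.0)
    print(placebo.pvalue(treatment_time=1975))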
-------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdfordham/pysyncon/9aa6b546d7f96c1e699e9ed145214d6ff17fee12/requirements-dev.txt -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pysyncon 3 | version = 1.5.2 4 | author = Stiofán Fordham 5 | url = https://github.com/sdfordham/pysyncon/ 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | keywords = Synth,augsynth,synthetic-control-method,causal-inference 9 | license = MIT License 10 | 11 | [options] 12 | packages = pysyncon, 13 | python_requires = >=3.8 14 | install_requires = 15 | numpy >= 1.24.0 16 | matplotlib >= 3.6.2 17 | pandas >= 1.5.2 18 | scipy >= 1.9.3 19 | 20 | [options.extras_require] 21 | dev = 22 | black == 23.10.1 23 | -------------------------------------------------------------------------------- /tests/test_augsynth.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pysyncon 6 | 7 | 8 | class TestAugSynth(unittest.TestCase): 9 | def setUp(self): 10 | self.foo = pd.DataFrame( 11 | { 12 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 13 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 14 | "dependent": np.random.random(12), 15 | "predictor1": np.random.random(12), 16 | "predictor2": np.random.random(12), 17 | } 18 | ) 19 | self.predictors = ["predictor1"] 20 | self.predictors_op = "mean" 21 | self.dependent = "dependent" 22 | self.unit_variable = "name" 23 | self.time_variable = "time" 24 | self.treatment_identifier = 1 25 | self.treatment_identifier_list = [1, 2] 26 | self.controls_identifier = [2, 3] 27 | self.controls_identifier_alt = [3] 28 | self.time_predictors_prior = [2, 3] 29 | self.time_optimize_ssr = [1, 2, 3] 30 | self.special_predictors = [ 31 | ("predictor1", [2], "mean"), 32 | ("predictor2", [1, 2], "median"), 33 | ("predictor2", [1, 2], "std"), 34 | ] 35 | 36 | def test_fit_treated(self): 37 | kwargs = { 38 | "foo": self.foo, 39 | "predictors": self.predictors, 40 | "predictors_op": self.predictors_op, 41 | "dependent": self.dependent, 42 | "unit_variable": self.unit_variable, 43 | "time_variable": self.time_variable, 44 | "time_predictors_prior": self.time_predictors_prior, 45 | "time_optimize_ssr": self.time_optimize_ssr, 46 | "special_predictors":
self.special_predictors, 47 | } 48 | 49 | dataprep = pysyncon.Dataprep( 50 | treatment_identifier=self.treatment_identifier_list, 51 | controls_identifier=self.controls_identifier_alt, 52 | **kwargs, 53 | ) 54 | augsynth = pysyncon.AugSynth() 55 | self.assertRaises(ValueError, augsynth.fit, dataprep) 56 | 57 | dataprep = pysyncon.Dataprep( 58 | treatment_identifier=self.treatment_identifier, 59 | controls_identifier=self.controls_identifier, 60 | **kwargs, 61 | ) 62 | augsynth = pysyncon.AugSynth() 63 | try: 64 | augsynth.fit(dataprep) 65 | except Exception as e: 66 | self.fail(f"Augsynth fit with single treated failed: {e}.") 67 | 68 | dataprep = pysyncon.Dataprep( 69 | treatment_identifier=[self.treatment_identifier], 70 | controls_identifier=self.controls_identifier, 71 | **kwargs, 72 | ) 73 | augsynth = pysyncon.AugSynth() 74 | try: 75 | augsynth.fit(dataprep) 76 | except Exception as e: 77 | self.fail(f"Augsynth fit with single treated in list failed: {e}.") 78 | -------------------------------------------------------------------------------- /tests/test_augsynth_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, AugSynth 5 | 6 | 7 | class TestAugsynthBasque(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/basque.csv") 10 | self.dataprep = Dataprep( 11 | foo=df, 12 | predictors=[ 13 | "school.illit", 14 | "school.prim", 15 | "school.med", 16 | "school.high", 17 | "school.post.high", 18 | "invest", 19 | ], 20 | predictors_op="mean", 21 | time_predictors_prior=range(1964, 1970), 22 | special_predictors=[ 23 | ("gdpcap", range(1960, 1970), "mean"), 24 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 25 | ("sec.energy", range(1961, 1970, 2), "mean"), 26 | ("sec.industry", range(1961, 1970, 2), "mean"), 27 | ("sec.construction", range(1961, 1970, 2), "mean"), 28 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 29 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 30 | ("popdens", [1969], "mean"), 31 | ], 32 | dependent="gdpcap", 33 | unit_variable="regionname", 34 | time_variable="year", 35 | treatment_identifier="Basque Country (Pais Vasco)", 36 | controls_identifier=[ 37 | "Andalucia", 38 | "Aragon", 39 | "Baleares (Islas)", 40 | "Canarias", 41 | "Cantabria", 42 | "Castilla-La Mancha", 43 | "Castilla Y Leon", 44 | "Cataluna", 45 | "Comunidad Valenciana", 46 | "Extremadura", 47 | "Galicia", 48 | "Madrid (Comunidad De)", 49 | "Murcia (Region de)", 50 | "Navarra (Comunidad Foral De)", 51 | "Principado De Asturias", 52 | "Rioja (La)", 53 | "Spain (Espana)", 54 | ], 55 | time_optimize_ssr=range(1960, 1970), 56 | ) 57 | self.optim_method = "Nelder-Mead" 58 | self.optim_initial = "equal" 59 | self.weights = { 60 | "Andalucia": 0.113627911, 61 | "Aragon": 1.774922286, 62 | "Baleares (Islas)": -0.713432799, 63 | "Canarias": 1.19397534, 64 | "Cantabria": 0.497825351, 65 | "Castilla-La Mancha": 0.131573892, 66 | "Castilla Y Leon": -1.405974956, 67 | "Cataluna": 1.31890027, 68 | "Comunidad Valenciana": -1.731140541, 69 | "Extremadura": -1.134362989, 70 | "Galicia": 1.982136937, 71 | "Madrid (Comunidad De)": 0.110801212, 72 | "Murcia (Region de)": -1.31476635, 73 | "Navarra (Comunidad Foral De)": -1.303045915, 74 | "Principado De Asturias": -0.02423815, 75 | "Rioja (La)": 1.58950474, 76 | "Spain (Espana)": -0.086306241, 77 | } 78 | 79 | def test_weights(self): 80 | augsynth = AugSynth() 81 | augsynth.fit(dataprep=self.dataprep) 82 | 83 | 
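# Note: AugSynth's ridge-augmented weights are not constrained to the
# simplex, unlike vanilla Synth weights, which is why the reference
# values in setUp above include negative entries.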
weights = pd.Series(self.weights, name="weights") 84 | # Allow a tolerance of 2.5% 85 | pd.testing.assert_series_equal( 86 | weights, augsynth.weights(round=9), check_exact=False, atol=0.025 87 | ) 88 | -------------------------------------------------------------------------------- /tests/test_conformal_interence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from pysyncon import Synth 6 | from pysyncon.inference import ConformalInference 7 | 8 | 9 | class TestConformalInference(unittest.TestCase): 10 | def setUp(self): 11 | self.rng = np.random.default_rng() 12 | self.alpha = 0.05 13 | self.scm = Synth() 14 | self.Z0 = pd.DataFrame( 15 | data=self.rng.random(size=(30, 10)), 16 | index=range(1, 31), 17 | columns=range(1, 11), 18 | ) 19 | self.Z1 = pd.Series( 20 | data=self.rng.random(size=(30,)), index=range(1, 31), name=0 21 | ) 22 | self.X0 = pd.DataFrame( 23 | data=self.rng.random(size=(4, 10)), 24 | index=range(1, 5), 25 | columns=range(1, 11), 26 | ) 27 | self.X1 = pd.Series(data=self.rng.random(size=(4,)), index=range(1, 5), name=0) 28 | self.pre_periods = list(range(1, 21)) 29 | self.post_periods = list(range(21, 31)) 30 | self.max_iter = 20 31 | self.tol = 0.1 32 | self.step_sz = None 33 | self.step_sz_div = 20.0 34 | self.verbose = False 35 | 36 | def test_alpha(self): 37 | kwargs = { 38 | "scm": self.scm, 39 | "Z0": self.Z0, 40 | "Z1": self.Z1, 41 | "pre_periods": self.pre_periods, 42 | "post_periods": self.post_periods, 43 | "max_iter": self.max_iter, 44 | "tol": self.tol, 45 | "step_sz": self.step_sz, 46 | "step_sz_div": self.step_sz_div, 47 | "verbose": self.verbose, 48 | } 49 | 50 | conformal_inf = ConformalInference() 51 | 52 | cases = [-1.0, 0.0, 1.0, 17.0] 53 | for case in cases: 54 | with self.subTest(case=case): 55 | self.assertRaises( 56 | ValueError, conformal_inf.confidence_intervals, alpha=case, **kwargs 57 | ) 58 | 59 | cases = [True, ["foo"], {"foo": "bar"}] 60 | for case in cases: 61 | with self.subTest(case=case): 62 | self.assertRaises( 63 | TypeError, conformal_inf.confidence_intervals, alpha=case, **kwargs 64 | ) 65 | 66 | def test_max_iter(self): 67 | kwargs = { 68 | "alpha": self.alpha, 69 | "scm": self.scm, 70 | "Z0": self.Z0, 71 | "Z1": self.Z1, 72 | "pre_periods": self.pre_periods, 73 | "post_periods": self.post_periods, 74 | "tol": self.tol, 75 | "step_sz": self.step_sz, 76 | "step_sz_div": self.step_sz_div, 77 | "verbose": self.verbose, 78 | } 79 | 80 | conformal_inf = ConformalInference() 81 | 82 | cases = [-17, 0] 83 | for case in cases: 84 | with self.subTest(case=case): 85 | self.assertRaises( 86 | ValueError, 87 | conformal_inf.confidence_intervals, 88 | max_iter=case, 89 | **kwargs 90 | ) 91 | 92 | cases = [5.2, 10.0] 93 | for case in cases: 94 | with self.subTest(case=case): 95 | self.assertRaises( 96 | TypeError, 97 | conformal_inf.confidence_intervals, 98 | max_iter=case, 99 | **kwargs 100 | ) 101 | 102 | def test_tol(self): 103 | kwargs = { 104 | "alpha": self.alpha, 105 | "scm": self.scm, 106 | "Z0": self.Z0, 107 | "Z1": self.Z1, 108 | "pre_periods": self.pre_periods, 109 | "post_periods": self.post_periods, 110 | "max_iter": self.max_iter, 111 | "step_sz": self.step_sz, 112 | "step_sz_div": self.step_sz_div, 113 | "verbose": self.verbose, 114 | } 115 | 116 | conformal_inf = ConformalInference() 117 | 118 | cases = [-4.2, 0.0] 119 | for case in cases: 120 | with self.subTest(case=case): 121 | self.assertRaises( 122 | ValueError, 
conformal_inf.confidence_intervals, tol=case, **kwargs 123 | ) 124 | 125 | cases = [-4, 0] 126 | for case in cases: 127 | with self.subTest(case=case): 128 | self.assertRaises( 129 | TypeError, conformal_inf.confidence_intervals, tol=case, **kwargs 130 | ) 131 | 132 | def test_step_sz(self): 133 | kwargs = { 134 | "alpha": self.alpha, 135 | "scm": self.scm, 136 | "Z0": self.Z0, 137 | "Z1": self.Z1, 138 | "pre_periods": self.pre_periods, 139 | "post_periods": self.post_periods, 140 | "tol": self.tol, 141 | "max_iter": self.max_iter, 142 | "step_sz_div": self.step_sz_div, 143 | "verbose": self.verbose, 144 | } 145 | 146 | conformal_inf = ConformalInference() 147 | 148 | cases = [-4.2, 0.0] 149 | for case in cases: 150 | with self.subTest(case=case): 151 | self.assertRaises( 152 | ValueError, 153 | conformal_inf.confidence_intervals, 154 | step_sz=case, 155 | **kwargs 156 | ) 157 | 158 | cases = [-4, 0] 159 | for case in cases: 160 | with self.subTest(case=case): 161 | self.assertRaises( 162 | TypeError, 163 | conformal_inf.confidence_intervals, 164 | step_sz=case, 165 | **kwargs 166 | ) 167 | 168 | def test_step_sz_tol(self): 169 | kwargs = { 170 | "alpha": self.alpha, 171 | "scm": self.scm, 172 | "Z0": self.Z0, 173 | "Z1": self.Z1, 174 | "pre_periods": self.pre_periods, 175 | "post_periods": self.post_periods, 176 | "max_iter": self.max_iter, 177 | "step_sz_div": self.step_sz_div, 178 | "verbose": self.verbose, 179 | } 180 | 181 | conformal_inf = ConformalInference() 182 | 183 | # Step-size is less than tolerance 184 | self.assertRaises( 185 | ValueError, 186 | conformal_inf.confidence_intervals, 187 | tol=1.0, 188 | step_sz=0.1, 189 | **kwargs 190 | ) 191 | 192 | # Step-size = tolerance 193 | self.assertRaises( 194 | ValueError, 195 | conformal_inf.confidence_intervals, 196 | tol=1.0, 197 | step_sz=1.0, 198 | **kwargs 199 | ) 200 | 201 | def test_step_sz_guessing(self): 202 | kwargs = { 203 | "alpha": self.alpha, 204 | "scm": self.scm, 205 | "Z0": self.Z0, 206 | "Z1": self.Z1, 207 | "pre_periods": self.pre_periods, 208 | "post_periods": self.post_periods, 209 | "max_iter": self.max_iter, 210 | "step_sz_div": self.step_sz_div, 211 | "verbose": self.verbose, 212 | "scm_fit_args": {"X0": self.X0, "X1": self.X1}, 213 | } 214 | 215 | conformal_inf = ConformalInference() 216 | 217 | # No step-size and a big tolerance 218 | # (step-size guessing) 219 | _, n_c = self.Z0.shape 220 | self.scm.W = np.full(n_c, 1.0 / n_c) 221 | conformal_inf.confidence_intervals(tol=100.0, **kwargs) 222 | self.scm.W = None 223 | 224 | def test_step_sz_div(self): 225 | kwargs = { 226 | "alpha": self.alpha, 227 | "scm": self.scm, 228 | "Z0": self.Z0, 229 | "Z1": self.Z1, 230 | "pre_periods": self.pre_periods, 231 | "post_periods": self.post_periods, 232 | "tol": self.tol, 233 | "max_iter": self.max_iter, 234 | "step_sz": self.step_sz, 235 | "verbose": self.verbose, 236 | } 237 | 238 | conformal_inf = ConformalInference() 239 | 240 | cases = [-4.2, 0.0] 241 | for case in cases: 242 | with self.subTest(case=case): 243 | self.assertRaises( 244 | ValueError, 245 | conformal_inf.confidence_intervals, 246 | step_sz_div=case, 247 | **kwargs 248 | ) 249 | 250 | cases = [-4, 0] 251 | for case in cases: 252 | with self.subTest(case=case): 253 | self.assertRaises( 254 | TypeError, 255 | conformal_inf.confidence_intervals, 256 | step_sz_div=case, 257 | **kwargs 258 | ) 259 | 260 | def test_no_weights(self): 261 | kwargs = { 262 | "alpha": self.alpha, 263 | "scm": self.scm, 264 | "Z0": self.Z0, 265 | "Z1": self.Z1, 266 | 
"pre_periods": self.pre_periods, 267 | "post_periods": self.post_periods, 268 | "tol": self.tol, 269 | "max_iter": self.max_iter, 270 | "step_sz": self.step_sz, 271 | "verbose": self.verbose, 272 | } 273 | 274 | conformal_inf = ConformalInference() 275 | self.assertRaises(ValueError, conformal_inf.confidence_intervals, **kwargs) 276 | 277 | def test_root_search(self): 278 | cases_roots_x0 = [ 279 | ((-1, 3), 0.5), 280 | ((-1, 3), 1.0), 281 | ((-1, 3), 2.5), 282 | ((-1, 400), 0.5), 283 | ((-1, 400), 100), 284 | ((-1, 400), 399), 285 | ] 286 | cases_step_sz = [0.1, 1.0] 287 | 288 | ci = ConformalInference() 289 | tol = 0.01 290 | for case_root_x0 in cases_roots_x0: 291 | for case_step_sz in cases_step_sz: 292 | case = (case_root_x0, case_step_sz) 293 | with self.subTest(case=case): 294 | ((lower, upper), x0) = case_root_x0 295 | 296 | res = ci._root_search( 297 | fn=lambda x: (lower - x) * (x - upper), 298 | x0=x0, 299 | direction=-1, 300 | tol=tol, 301 | step_sz=case_step_sz, 302 | max_iter=100, 303 | ) 304 | self.assertAlmostEqual(lower, res, delta=tol) 305 | 306 | res = ci._root_search( 307 | fn=lambda x: (lower - x) * (x - upper), 308 | x0=x0, 309 | direction=1, 310 | tol=tol, 311 | step_sz=case_step_sz, 312 | max_iter=100, 313 | ) 314 | self.assertAlmostEqual(upper, res, delta=tol) 315 | 316 | self.assertRaises( 317 | Exception, 318 | ci._root_search, 319 | fn=lambda x: (-1 - x) * (x - 400), 320 | x0=200, 321 | direction=-1, 322 | tol=0.01, 323 | step_sz=1.0, 324 | max_iter=1, 325 | ) 326 | -------------------------------------------------------------------------------- /tests/test_linear_factor_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from pysyncon.generator import LinearFactorModel 6 | 7 | 8 | class TestLinearFactorModel(unittest.TestCase): 9 | def setUp(self): 10 | self.n_units = np.random.randint(low=10, high=20) 11 | self.n_observable = np.random.randint(low=10, high=15) 12 | self.n_unobservable = np.random.randint(low=10, high=15) 13 | self.n_periods_pre = np.random.randint(low=50, high=80) 14 | self.n_periods_post = np.random.randint(low=10, high=20) 15 | 16 | def test_matrix_dims(self): 17 | lfm = LinearFactorModel() 18 | X0, X1, Z0, Z1 = lfm.generate( 19 | n_units=self.n_units, 20 | n_observable=self.n_observable, 21 | n_unobservable=self.n_unobservable, 22 | n_periods_pre=self.n_periods_pre, 23 | n_periods_post=self.n_periods_post, 24 | ) 25 | 26 | self.assertEqual(X0.shape, (self.n_observable, self.n_units - 1)) 27 | self.assertEqual(X1.shape, (self.n_observable,)) 28 | self.assertEqual( 29 | Z0.shape, (self.n_periods_pre + self.n_periods_post, self.n_units - 1) 30 | ) 31 | self.assertEqual(Z1.shape, (self.n_periods_pre + self.n_periods_post,)) 32 | -------------------------------------------------------------------------------- /tests/test_penalized.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pysyncon 6 | 7 | 8 | class TestPenalizedSynth(unittest.TestCase): 9 | def setUp(self): 10 | self.foo = pd.DataFrame( 11 | { 12 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 13 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 14 | "dependent": np.random.random(12), 15 | "predictor1": np.random.random(12), 16 | "predictor2": np.random.random(12), 17 | } 18 | ) 19 | self.predictors = ["predictor1"] 20 | self.predictors_op = "mean" 21 | 
self.dependent = "dependent" 22 | self.unit_variable = "name" 23 | self.time_variable = "time" 24 | self.treatment_identifier = 1 25 | self.treatment_identifier_list = [1, 2] 26 | self.controls_identifier = [2, 3] 27 | self.controls_identifier_alt = [3] 28 | self.time_predictors_prior = [2, 3] 29 | self.time_optimize_ssr = [1, 2, 3] 30 | self.special_predictors = [ 31 | ("predictor1", [2], "mean"), 32 | ("predictor2", [1, 2], "median"), 33 | ("predictor2", [1, 2], "std"), 34 | ] 35 | self.custom_V = np.full(4, 1.0) 36 | 37 | def test_fit_treated(self): 38 | kwargs = { 39 | "foo": self.foo, 40 | "predictors": self.predictors, 41 | "predictors_op": self.predictors_op, 42 | "dependent": self.dependent, 43 | "unit_variable": self.unit_variable, 44 | "time_variable": self.time_variable, 45 | "time_predictors_prior": self.time_predictors_prior, 46 | "time_optimize_ssr": self.time_optimize_ssr, 47 | "special_predictors": self.special_predictors, 48 | } 49 | 50 | dataprep = pysyncon.Dataprep( 51 | treatment_identifier=self.treatment_identifier_list, 52 | controls_identifier=self.controls_identifier_alt, 53 | **kwargs, 54 | ) 55 | pen = pysyncon.PenalizedSynth() 56 | self.assertRaises(ValueError, pen.fit, dataprep) 57 | 58 | dataprep = pysyncon.Dataprep( 59 | treatment_identifier=self.treatment_identifier, 60 | controls_identifier=self.controls_identifier, 61 | **kwargs, 62 | ) 63 | pen = pysyncon.PenalizedSynth() 64 | try: 65 | pen.fit(dataprep) 66 | except Exception as e: 67 | self.fail(f"PenalizedSynth fit with single treated failed: {e}.") 68 | 69 | dataprep = pysyncon.Dataprep( 70 | treatment_identifier=[self.treatment_identifier], 71 | controls_identifier=self.controls_identifier, 72 | **kwargs, 73 | ) 74 | pen = pysyncon.PenalizedSynth() 75 | try: 76 | pen.fit(dataprep) 77 | except Exception as e: 78 | self.fail(f"PenalizedSynth fit with single treated in list failed: {e}.") 79 | 80 | def test_X0_X1_fit(self): 81 | pen = pysyncon.PenalizedSynth() 82 | 83 | # X1 needs to be pd.Series 84 | X0 = pd.DataFrame(np.random.rand(5, 5)) 85 | X1 = pd.DataFrame(np.random.rand(5, 2)) 86 | self.assertRaises(TypeError, pen.fit, X0=X0, X1=X1) 87 | 88 | # X1 needs to be pd.Series 89 | X0 = pd.DataFrame(np.random.rand(5, 5)) 90 | X1 = pd.DataFrame(np.random.rand(5, 1)) 91 | self.assertRaises(TypeError, pen.fit, X0=X0, X1=X1) 92 | 93 | def test_fit_no_data(self): 94 | pen = pysyncon.PenalizedSynth() 95 | self.assertRaises(ValueError, pen.fit) 96 | 97 | def test_fit_custom_V(self): 98 | kwargs = { 99 | "foo": self.foo, 100 | "predictors": self.predictors, 101 | "predictors_op": self.predictors_op, 102 | "dependent": self.dependent, 103 | "unit_variable": self.unit_variable, 104 | "time_variable": self.time_variable, 105 | "treatment_identifier": self.treatment_identifier, 106 | "controls_identifier": self.controls_identifier, 107 | "time_predictors_prior": self.time_predictors_prior, 108 | "time_optimize_ssr": self.time_optimize_ssr, 109 | "special_predictors": self.special_predictors, 110 | } 111 | 112 | dataprep = pysyncon.Dataprep(**kwargs) 113 | pen = pysyncon.PenalizedSynth() 114 | try: 115 | pen.fit(dataprep=dataprep, custom_V=self.custom_V) 116 | except Exception as e: 117 | self.fail(f"PenalizedSynth fit failed with custom_V: {e}") 118 | -------------------------------------------------------------------------------- /tests/test_penalized_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import 
Dataprep, PenalizedSynth 5 | 6 | 7 | class TestPenalizedBasque(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/basque.csv") 10 | self.dataprep = Dataprep( 11 | foo=df, 12 | predictors=[ 13 | "school.illit", 14 | "school.prim", 15 | "school.med", 16 | "school.high", 17 | "school.post.high", 18 | "invest", 19 | ], 20 | predictors_op="mean", 21 | time_predictors_prior=range(1964, 1970), 22 | special_predictors=[ 23 | ("gdpcap", range(1960, 1970), "mean"), 24 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 25 | ("sec.energy", range(1961, 1970, 2), "mean"), 26 | ("sec.industry", range(1961, 1970, 2), "mean"), 27 | ("sec.construction", range(1961, 1970, 2), "mean"), 28 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 29 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 30 | ("popdens", [1969], "mean"), 31 | ], 32 | dependent="gdpcap", 33 | unit_variable="regionname", 34 | time_variable="year", 35 | treatment_identifier="Basque Country (Pais Vasco)", 36 | controls_identifier=[ 37 | "Aragon", 38 | "Baleares (Islas)", 39 | "Andalucia", 40 | "Canarias", 41 | "Cantabria", 42 | "Castilla Y Leon", 43 | "Castilla-La Mancha", 44 | "Cataluna", 45 | "Comunidad Valenciana", 46 | "Extremadura", 47 | "Galicia", 48 | "Madrid (Comunidad De)", 49 | "Murcia (Region de)", 50 | "Navarra (Comunidad Foral De)", 51 | "Principado De Asturias", 52 | "Rioja (La)", 53 | "Spain (Espana)", 54 | ], 55 | time_optimize_ssr=range(1960, 1970), 56 | ) 57 | self.lambda_ = 0.01 58 | self.weights = { 59 | "Aragon": 0.0, 60 | "Baleares (Islas)": 0.0, 61 | "Andalucia": 0.0, 62 | "Canarias": 0.0, 63 | "Cantabria": 0.241, 64 | "Castilla Y Leon": 0.0, 65 | "Castilla-La Mancha": 0.0, 66 | "Cataluna": 0.759, 67 | "Comunidad Valenciana": 0.0, 68 | "Extremadura": 0.0, 69 | "Galicia": 0.0, 70 | "Madrid (Comunidad De)": 0.0, 71 | "Murcia (Region de)": 0.0, 72 | "Navarra (Comunidad Foral De)": 0.0, 73 | "Principado De Asturias": 0.0, 74 | "Rioja (La)": 0.0, 75 | "Spain (Espana)": 0.0, 76 | } 77 | 78 | def test_weights(self): 79 | robust = PenalizedSynth() 80 | robust.fit(dataprep=self.dataprep, lambda_=self.lambda_) 81 | 82 | weights = pd.Series(self.weights, name="weights") 83 | # Allow a tolerance of 2.5% 84 | pd.testing.assert_series_equal( 85 | weights, robust.weights(round=9), check_exact=False, atol=0.025 86 | ) 87 | -------------------------------------------------------------------------------- /tests/test_robust.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pysyncon 6 | 7 | 8 | class TestRobustSynth(unittest.TestCase): 9 | def setUp(self): 10 | self.foo = pd.DataFrame( 11 | { 12 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 13 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 14 | "dependent": np.random.random(12), 15 | "predictor1": np.random.random(12), 16 | "predictor2": np.random.random(12), 17 | } 18 | ) 19 | self.predictors = ["predictor1"] 20 | self.predictors_op = "mean" 21 | self.dependent = "dependent" 22 | self.unit_variable = "name" 23 | self.time_variable = "time" 24 | self.treatment_identifier = 1 25 | self.treatment_identifier_list = [1, 2] 26 | self.controls_identifier = [2, 3] 27 | self.controls_identifier_alt = [3] 28 | self.time_predictors_prior = [2, 3] 29 | self.time_optimize_ssr = [1, 2, 3] 30 | self.special_predictors = [ 31 | ("predictor1", [2], "mean"), 32 | ("predictor2", [1, 2], "median"), 33 | ("predictor2", [1, 2], "std"), 34 | ] 35 | 
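# `lambda_` and `sv_count` below parametrise RobustSynth: presumably the
# ridge-regression penalty and the number of singular values retained when
# de-noising the donor matrix (cf. the robust synthetic control method of
# Amjad, Shah & Shen), respectively.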
self.lambda_ = 0.01 36 | self.sv_count = 1 37 | 38 | def test_fit_treated(self): 39 | kwargs = { 40 | "foo": self.foo, 41 | "predictors": self.predictors, 42 | "predictors_op": self.predictors_op, 43 | "dependent": self.dependent, 44 | "unit_variable": self.unit_variable, 45 | "time_variable": self.time_variable, 46 | "time_predictors_prior": self.time_predictors_prior, 47 | "time_optimize_ssr": self.time_optimize_ssr, 48 | "special_predictors": self.special_predictors, 49 | } 50 | 51 | dataprep = pysyncon.Dataprep( 52 | treatment_identifier=self.treatment_identifier_list, 53 | controls_identifier=self.controls_identifier_alt, 54 | **kwargs, 55 | ) 56 | robust = pysyncon.RobustSynth() 57 | self.assertRaises( 58 | ValueError, 59 | robust.fit, 60 | dataprep, 61 | lambda_=self.lambda_, 62 | sv_count=self.sv_count, 63 | ) 64 | 65 | dataprep = pysyncon.Dataprep( 66 | treatment_identifier=self.treatment_identifier, 67 | controls_identifier=self.controls_identifier, 68 | **kwargs, 69 | ) 70 | robust = pysyncon.RobustSynth() 71 | try: 72 | robust.fit(dataprep, lambda_=self.lambda_, sv_count=self.sv_count) 73 | except Exception as e: 74 | self.fail(f"RobustSynth fit with single treated failed: {e}.") 75 | 76 | dataprep = pysyncon.Dataprep( 77 | treatment_identifier=[self.treatment_identifier], 78 | controls_identifier=self.controls_identifier, 79 | **kwargs, 80 | ) 81 | robust = pysyncon.RobustSynth() 82 | try: 83 | robust.fit(dataprep, lambda_=self.lambda_, sv_count=self.sv_count) 84 | except Exception as e: 85 | self.fail(f"RobustSynth fit with single treated in list failed: {e}.") 86 | -------------------------------------------------------------------------------- /tests/test_robust_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, RobustSynth 5 | 6 | 7 | class TestRobustBasque(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/basque.csv") 10 | self.dataprep = Dataprep( 11 | foo=df, 12 | predictors=[ 13 | "school.illit", 14 | "school.prim", 15 | "school.med", 16 | "school.high", 17 | "school.post.high", 18 | "invest", 19 | ], 20 | predictors_op="mean", 21 | time_predictors_prior=range(1964, 1970), 22 | special_predictors=[ 23 | ("gdpcap", range(1960, 1970), "mean"), 24 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 25 | ("sec.energy", range(1961, 1970, 2), "mean"), 26 | ("sec.industry", range(1961, 1970, 2), "mean"), 27 | ("sec.construction", range(1961, 1970, 2), "mean"), 28 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 29 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 30 | ("popdens", [1969], "mean"), 31 | ], 32 | dependent="gdpcap", 33 | unit_variable="regionname", 34 | time_variable="year", 35 | treatment_identifier="Basque Country (Pais Vasco)", 36 | controls_identifier=[ 37 | "Aragon", 38 | "Baleares (Islas)", 39 | "Andalucia", 40 | "Canarias", 41 | "Cantabria", 42 | "Castilla Y Leon", 43 | "Castilla-La Mancha", 44 | "Cataluna", 45 | "Comunidad Valenciana", 46 | "Extremadura", 47 | "Galicia", 48 | "Madrid (Comunidad De)", 49 | "Murcia (Region de)", 50 | "Navarra (Comunidad Foral De)", 51 | "Principado De Asturias", 52 | "Rioja (La)", 53 | ], 54 | time_optimize_ssr=range(1960, 1970), 55 | ) 56 | self.lambda_ = 0.1 57 | self.sv_count = 2 58 | self.weights = { 59 | "Aragon": 0.042750725, 60 | "Baleares (Islas)": 0.095687916, 61 | "Andalucia": 0.05471977, 62 | "Canarias": 0.029348893, 63 | "Cantabria": 0.131449835, 64 | 
"Castilla Y Leon": 0.00534905, 65 | "Castilla-La Mancha": -0.023989253, 66 | "Cataluna": 0.172766943, 67 | "Comunidad Valenciana": 0.098502043, 68 | "Extremadura": -0.024916194, 69 | "Galicia": 0.000285705, 70 | "Madrid (Comunidad De)": 0.306908016, 71 | "Murcia (Region de)": 0.037554988, 72 | "Navarra (Comunidad Foral De)": 0.042127484, 73 | "Principado De Asturias": 0.144568216, 74 | "Rioja (La)": 0.018474723, 75 | } 76 | 77 | def test_weights(self): 78 | robust = RobustSynth() 79 | robust.fit(dataprep=self.dataprep, lambda_=self.lambda_, sv_count=self.sv_count) 80 | 81 | weights = pd.Series(self.weights, name="weights") 82 | # Allow a tolerance of 2.5% 83 | pd.testing.assert_series_equal( 84 | weights, robust.weights(round=9), check_exact=False, atol=0.025 85 | ) 86 | -------------------------------------------------------------------------------- /tests/test_synth.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import pysyncon 7 | 8 | 9 | class TestSynth(unittest.TestCase): 10 | def setUp(self): 11 | self.foo = pd.DataFrame( 12 | { 13 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 14 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 15 | "dependent": np.random.random(12), 16 | "predictor1": np.random.random(12), 17 | "predictor2": np.random.random(12), 18 | } 19 | ) 20 | self.predictors = ["predictor1"] 21 | self.predictors_op = "mean" 22 | self.dependent = "dependent" 23 | self.unit_variable = "name" 24 | self.time_variable = "time" 25 | self.treatment_identifier = 1 26 | self.treatment_identifier_list = [1, 2] 27 | self.controls_identifier = [2, 3] 28 | self.controls_identifier_alt = [3] 29 | self.time_predictors_prior = [2, 3] 30 | self.time_optimize_ssr = [1, 2, 3] 31 | self.special_predictors = [ 32 | ("predictor1", [2], "mean"), 33 | ("predictor2", [1, 2], "median"), 34 | ("predictor2", [1, 2], "std"), 35 | ] 36 | 37 | def test_fit_treated(self): 38 | kwargs = { 39 | "foo": self.foo, 40 | "predictors": self.predictors, 41 | "predictors_op": self.predictors_op, 42 | "dependent": self.dependent, 43 | "unit_variable": self.unit_variable, 44 | "time_variable": self.time_variable, 45 | "time_predictors_prior": self.time_predictors_prior, 46 | "time_optimize_ssr": self.time_optimize_ssr, 47 | "special_predictors": self.special_predictors, 48 | } 49 | dataprep = pysyncon.Dataprep( 50 | treatment_identifier=self.treatment_identifier_list, 51 | controls_identifier=self.controls_identifier_alt, 52 | **kwargs, 53 | ) 54 | synth = pysyncon.Synth() 55 | self.assertRaises(ValueError, synth.fit, dataprep) 56 | 57 | dataprep = pysyncon.Dataprep( 58 | treatment_identifier=self.treatment_identifier, 59 | controls_identifier=self.controls_identifier, 60 | **kwargs, 61 | ) 62 | synth = pysyncon.Synth() 63 | 64 | # Run with normal controls list 65 | synth.fit(dataprep) 66 | 67 | dataprep = pysyncon.Dataprep( 68 | treatment_identifier=[self.treatment_identifier], 69 | controls_identifier=self.controls_identifier, 70 | **kwargs, 71 | ) 72 | synth = pysyncon.Synth() 73 | 74 | # Run with a list of treatment identifiers 75 | synth.fit(dataprep) 76 | 77 | def test_X0_X1_fit(self): 78 | synth = pysyncon.Synth() 79 | 80 | # Neither dataprep nor matrices set 81 | self.assertRaises(ValueError, synth.fit) 82 | 83 | # X1 needs to be pd.Series 84 | X0 = pd.DataFrame(np.random.rand(5, 5)) 85 | X1 = pd.DataFrame(np.random.rand(5, 2)) 86 | Z0 = 
pd.DataFrame(np.random.rand(5, 5)) 87 | Z1 = pd.DataFrame(np.random.rand(5, 2)) 88 | self.assertRaises(TypeError, synth.fit, X0=X0, X1=X1, Z0=Z0, Z1=Z1) 89 | 90 | # X1 needs to be pd.Series 91 | X0 = pd.DataFrame(np.random.rand(5, 5)) 92 | X1 = pd.DataFrame(np.random.rand(5, 1)) 93 | Z0 = pd.DataFrame(np.random.rand(5, 5)) 94 | Z1 = pd.DataFrame(np.random.rand(5, 1)) 95 | self.assertRaises(TypeError, synth.fit, X0=X0, X1=X1, Z0=Z0, Z1=Z1) 96 | 97 | @patch("pysyncon.base.plt") 98 | def test_path_plot(self, mock_plt: Mock): 99 | kwargs = { 100 | "foo": self.foo, 101 | "predictors": self.predictors, 102 | "predictors_op": self.predictors_op, 103 | "dependent": self.dependent, 104 | "unit_variable": self.unit_variable, 105 | "time_variable": self.time_variable, 106 | "treatment_identifier": self.treatment_identifier, 107 | "controls_identifier": self.controls_identifier, 108 | "time_predictors_prior": self.time_predictors_prior, 109 | "time_optimize_ssr": self.time_optimize_ssr, 110 | "special_predictors": self.special_predictors, 111 | } 112 | 113 | dataprep = pysyncon.Dataprep(**kwargs) 114 | synth = pysyncon.Synth() 115 | # No weight matrix set 116 | self.assertRaises(ValueError, synth.path_plot) 117 | 118 | X0, X1 = dataprep.make_covariate_mats() 119 | Z0, Z1 = dataprep.make_outcome_mats() 120 | synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 121 | # No Dataprep object available 122 | self.assertRaises(ValueError, synth.path_plot) 123 | 124 | synth.fit(dataprep=dataprep) 125 | synth.path_plot() 126 | 127 | self.assertEqual(mock_plt.plot.call_count, 2) 128 | first_call, second_call = mock_plt.plot.call_args_list 129 | 130 | _, first_call_kwargs = first_call 131 | self.assertEqual(first_call_kwargs["color"], "black") 132 | self.assertEqual(first_call_kwargs["linewidth"], 1) 133 | self.assertEqual(first_call_kwargs["label"], dataprep.treatment_identifier) 134 | 135 | _, second_call_kwargs = second_call 136 | self.assertEqual(second_call_kwargs["color"], "black") 137 | self.assertEqual(second_call_kwargs["linewidth"], 1) 138 | self.assertEqual(second_call_kwargs["linestyle"], "dashed") 139 | self.assertEqual(second_call_kwargs["label"], "Synthetic") 140 | 141 | mock_plt.axvline.assert_not_called() 142 | mock_plt.legend.assert_called() 143 | mock_plt.grid.assert_called_with(True) 144 | mock_plt.show.assert_called() 145 | 146 | synth.path_plot(treatment_time=3) 147 | mock_plt.axvline.assert_called_once() 148 | 149 | _, kwargs = mock_plt.axvline.call_args 150 | self.assertEqual(kwargs["x"], 3) 151 | self.assertEqual(kwargs["ymin"], 0.05) 152 | self.assertEqual(kwargs["ymax"], 0.95) 153 | self.assertEqual(kwargs["linestyle"], "dashed") 154 | 155 | @patch("pysyncon.base.plt") 156 | def test_gaps_plot(self, mock_plt: Mock): 157 | kwargs = { 158 | "foo": self.foo, 159 | "predictors": self.predictors, 160 | "predictors_op": self.predictors_op, 161 | "dependent": self.dependent, 162 | "unit_variable": self.unit_variable, 163 | "time_variable": self.time_variable, 164 | "treatment_identifier": self.treatment_identifier, 165 | "controls_identifier": self.controls_identifier, 166 | "time_predictors_prior": self.time_predictors_prior, 167 | "time_optimize_ssr": self.time_optimize_ssr, 168 | "special_predictors": self.special_predictors, 169 | } 170 | 171 | dataprep = pysyncon.Dataprep(**kwargs) 172 | synth = pysyncon.Synth() 173 | # No weight matrix set 174 | self.assertRaises(ValueError, synth.gaps_plot) 175 | 176 | X0, X1 = dataprep.make_covariate_mats() 177 | Z0, Z1 = dataprep.make_outcome_mats() 178 | 
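# Fitting from the raw matrices (X0/X1 covariates, Z0/Z1 outcomes) leaves
# synth.dataprep unset, so the plotting helper below should raise.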
synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 179 | # No Dataprep object available 180 | self.assertRaises(ValueError, synth.gaps_plot) 181 | 182 | synth.fit(dataprep=dataprep) 183 | synth.gaps_plot() 184 | 185 | self.assertEqual(mock_plt.plot.call_count, 1) 186 | _, kwargs = mock_plt.plot.call_args 187 | 188 | self.assertEqual(kwargs["color"], "black") 189 | self.assertEqual(kwargs["linewidth"], 1) 190 | 191 | mock_plt.axvline.assert_not_called() 192 | mock_plt.grid.assert_called_with(True) 193 | mock_plt.show.assert_called() 194 | 195 | synth.gaps_plot(treatment_time=3) 196 | mock_plt.axvline.assert_called_once() 197 | 198 | _, kwargs = mock_plt.axvline.call_args 199 | self.assertEqual(kwargs["x"], 3) 200 | self.assertEqual(kwargs["ymin"], 0.05) 201 | self.assertEqual(kwargs["ymax"], 0.95) 202 | self.assertEqual(kwargs["linestyle"], "dashed") 203 | 204 | def test_weight(self): 205 | synth = pysyncon.Synth() 206 | # No weight matrix set 207 | self.assertRaises(ValueError, synth.weights) 208 | 209 | def test_summary(self): 210 | kwargs = { 211 | "foo": self.foo, 212 | "predictors": self.predictors, 213 | "predictors_op": self.predictors_op, 214 | "dependent": self.dependent, 215 | "unit_variable": self.unit_variable, 216 | "time_variable": self.time_variable, 217 | "treatment_identifier": self.treatment_identifier, 218 | "controls_identifier": self.controls_identifier, 219 | "time_predictors_prior": self.time_predictors_prior, 220 | "time_optimize_ssr": self.time_optimize_ssr, 221 | "special_predictors": self.special_predictors, 222 | } 223 | 224 | dataprep = pysyncon.Dataprep(**kwargs) 225 | synth = pysyncon.Synth() 226 | # No weight matrix set 227 | self.assertRaises(ValueError, synth.summary) 228 | X0, X1 = dataprep.make_covariate_mats() 229 | Z0, Z1 = dataprep.make_outcome_mats() 230 | synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 231 | # No Dataprep object available 232 | self.assertRaises(ValueError, synth.summary) 233 | 234 | synth.V = None 235 | # No V matrix available 236 | self.assertRaises(ValueError, synth.summary) 237 | 238 | def test_att(self): 239 | synth = pysyncon.Synth() 240 | # No weight matrix set 241 | self.assertRaises(ValueError, synth.att, range(1)) 242 | 243 | def test_metrics(self): 244 | kwargs = { 245 | "foo": self.foo, 246 | "predictors": self.predictors, 247 | "predictors_op": self.predictors_op, 248 | "dependent": self.dependent, 249 | "unit_variable": self.unit_variable, 250 | "time_variable": self.time_variable, 251 | "treatment_identifier": self.treatment_identifier, 252 | "controls_identifier": self.controls_identifier, 253 | "time_predictors_prior": self.time_predictors_prior, 254 | "time_optimize_ssr": self.time_optimize_ssr, 255 | "special_predictors": self.special_predictors, 256 | } 257 | 258 | dataprep = pysyncon.Dataprep(**kwargs) 259 | synth = pysyncon.Synth() 260 | 261 | X0, X1 = dataprep.make_covariate_mats() 262 | Z0, Z1 = dataprep.make_outcome_mats() 263 | synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 264 | # No Dataprep object available 265 | self.assertRaises(ValueError, synth.mspe) 266 | self.assertRaises(ValueError, synth.mape) 267 | self.assertRaises(ValueError, synth.mae) 268 | 269 | del synth 270 | 271 | synth = pysyncon.Synth() 272 | synth.dataprep = dataprep 273 | # No weights available/fit has not been run yet 274 | self.assertRaises(ValueError, synth.mspe) 275 | self.assertRaises(ValueError, synth.mape) 276 | self.assertRaises(ValueError, synth.mae) 277 | 278 | def test_confidence_intervals(self): 279 | kwargs = { 280 | "foo": self.foo, 281 | "predictors":
self.predictors, 282 | "predictors_op": self.predictors_op, 283 | "dependent": self.dependent, 284 | "unit_variable": self.unit_variable, 285 | "time_variable": self.time_variable, 286 | "treatment_identifier": self.treatment_identifier, 287 | "controls_identifier": self.controls_identifier, 288 | "time_predictors_prior": self.time_predictors_prior, 289 | "time_optimize_ssr": self.time_optimize_ssr, 290 | "special_predictors": self.special_predictors, 291 | } 292 | 293 | dataprep = pysyncon.Dataprep(**kwargs) 294 | synth = pysyncon.Synth() 295 | synth.fit(dataprep=dataprep) 296 | 297 | # Bad option 298 | self.assertRaises( 299 | ValueError, 300 | synth.confidence_interval, 301 | alpha=0.5, 302 | time_periods=[4], 303 | tol=0.01, 304 | method="foo", 305 | ) 306 | 307 | # Run with dataprep supplied 308 | synth.confidence_interval( 309 | alpha=0.5, time_periods=[4], dataprep=dataprep, tol=0.01 310 | ) 311 | 312 | # Too few time periods for alpha value 313 | self.assertRaises( 314 | ValueError, 315 | synth.confidence_interval, 316 | alpha=0.05, 317 | time_periods=[4], 318 | tol=0.01, 319 | dataprep=dataprep, 320 | ) 321 | 322 | # Run without dataprep supplied 323 | synth.confidence_interval(alpha=0.5, time_periods=[4], tol=0.01) 324 | 325 | # Too few time periods for alpha value 326 | self.assertRaises( 327 | ValueError, 328 | synth.confidence_interval, 329 | alpha=0.05, 330 | time_periods=[4], 331 | tol=0.01, 332 | ) 333 | 334 | # Without dataprep supplied or matrices 335 | synth.dataprep = None 336 | self.assertRaises( 337 | ValueError, synth.confidence_interval, alpha=0.5, time_periods=[4], tol=0.01 338 | ) 339 | 340 | # No pre-periods supplied 341 | synth.dataprep = None 342 | X0, X1 = dataprep.make_covariate_mats() 343 | Z0, Z1 = dataprep.make_outcome_mats(time_period=[1, 2, 3, 4]) 344 | self.assertRaises( 345 | ValueError, 346 | synth.confidence_interval, 347 | alpha=0.5, 348 | time_periods=[4], 349 | tol=0.01, 350 | X0=X0, 351 | X1=X1, 352 | Z0=Z0, 353 | Z1=Z1, 354 | ) 355 | 356 | # Bad alpha value 357 | self.assertRaises( 358 | ValueError, 359 | synth.confidence_interval, 360 | alpha=0.05, 361 | time_periods=[4], 362 | pre_periods=[1, 2, 3], 363 | tol=0.01, 364 | X0=X0, 365 | X1=X1, 366 | Z0=Z0, 367 | Z1=Z1, 368 | ) 369 | 370 | # Dataframes supplied instead of series 371 | X1 = X1.to_frame() 372 | Z1 = Z1.to_frame() 373 | self.assertRaises( 374 | TypeError, 375 | synth.confidence_interval, 376 | alpha=0.5, 377 | time_periods=[4], 378 | pre_periods=[1, 2, 3], 379 | tol=0.01, 380 | X0=X0, 381 | X1=X1, 382 | Z0=Z0, 383 | Z1=Z1, 384 | ) 385 | -------------------------------------------------------------------------------- /tests/test_synth_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, Synth 5 | from pysyncon.utils import PlaceboTest 6 | 7 | 8 | class TestSynthBasque(unittest.TestCase): 9 | def setUp(self): 10 | df = pd.read_csv("./data/basque.csv") 11 | self.dataprep = Dataprep( 12 | foo=df, 13 | predictors=[ 14 | "school.illit", 15 | "school.prim", 16 | "school.med", 17 | "school.high", 18 | "school.post.high", 19 | "invest", 20 | ], 21 | predictors_op="mean", 22 | time_predictors_prior=range(1964, 1970), 23 | special_predictors=[ 24 | ("gdpcap", range(1960, 1970), "mean"), 25 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 26 | ("sec.energy", range(1961, 1970, 2), "mean"), 27 | ("sec.industry", range(1961, 1970, 2), "mean"), 28 | ("sec.construction", 
range(1961, 1970, 2), "mean"), 29 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 30 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 31 | ("popdens", [1969], "mean"), 32 | ], 33 | dependent="gdpcap", 34 | unit_variable="regionname", 35 | time_variable="year", 36 | treatment_identifier="Basque Country (Pais Vasco)", 37 | controls_identifier=[ 38 | "Spain (Espana)", 39 | "Andalucia", 40 | "Aragon", 41 | "Principado De Asturias", 42 | "Baleares (Islas)", 43 | "Canarias", 44 | "Cantabria", 45 | "Castilla Y Leon", 46 | "Castilla-La Mancha", 47 | "Cataluna", 48 | "Comunidad Valenciana", 49 | "Extremadura", 50 | "Galicia", 51 | "Madrid (Comunidad De)", 52 | "Murcia (Region de)", 53 | "Navarra (Comunidad Foral De)", 54 | "Rioja (La)", 55 | ], 56 | time_optimize_ssr=range(1960, 1970), 57 | ) 58 | self.optim_method = "Nelder-Mead" 59 | self.optim_initial = "equal" 60 | self.weights = { 61 | "Spain (Espana)": 0.0, 62 | "Andalucia": 0.0, 63 | "Aragon": 0.0, 64 | "Principado De Asturias": 0.0, 65 | "Baleares (Islas)": 0.0, 66 | "Canarias": 0.0, 67 | "Cantabria": 0.0, 68 | "Castilla Y Leon": 0.0, 69 | "Castilla-La Mancha": 0.0, 70 | "Cataluna": 0.850816306, 71 | "Comunidad Valenciana": 0.0, 72 | "Extremadura": 0.0, 73 | "Galicia": 0.0, 74 | "Madrid (Comunidad De)": 0.149183694, 75 | "Murcia (Region de)": 0.0, 76 | "Navarra (Comunidad Foral De)": 0.0, 77 | "Rioja (La)": 0.0, 78 | } 79 | self.placebo_gaps = { 80 | "Cataluna": { 81 | 1960.0: 0.203808058, 82 | 1961.0: 0.22013128, 83 | 1962.0: 0.263867425, 84 | 1963.0: 0.305086227, 85 | 1964.0: 0.307812892, 86 | 1965.0: 0.310500949, 87 | 1966.0: 0.369694004, 88 | 1967.0: 0.423575362, 89 | 1968.0: 0.458736716, 90 | 1969.0: 0.488697369, 91 | 1970.0: 0.492355223, 92 | }, 93 | "Madrid (Comunidad De)": { 94 | 1960.0: 0.927170193, 95 | 1961.0: 1.066511653, 96 | 1962.0: 1.011029922, 97 | 1963.0: 0.950455684, 98 | 1964.0: 0.945846094, 99 | 1965.0: 0.930053083, 100 | 1966.0: 0.772220243, 101 | 1967.0: 0.614648344, 102 | 1968.0: 0.557832902, 103 | 1969.0: 0.491439776, 104 | 1970.0: 0.441262212, 105 | }, 106 | "Andalucia": { 107 | 1960.0: -0.005071144, 108 | 1961.0: 0.002029757, 109 | 1962.0: -0.002976465, 110 | 1963.0: -0.008368432, 111 | 1964.0: -0.012947738, 112 | 1965.0: -0.018273511, 113 | 1966.0: -0.002324632, 114 | 1967.0: 0.012943551, 115 | 1968.0: 0.009046557, 116 | 1969.0: 0.004579814, 117 | 1970.0: 0.013673678, 118 | }, 119 | } 120 | self.summary = pd.DataFrame( 121 | data=[ 122 | [7.26559110e-02, 3.98884646e01, 2.56336977e02, 3.23825543e02], 123 | [1.19777358e-01, 1.03174230e03, 2.73010720e03, 2.18245335e03], 124 | [3.48611100e-03, 9.03586680e01, 2.23340172e02, 1.48864075e02], 125 | [1.02189247e-01, 2.57275251e01, 6.34368045e01, 4.71326627e01], 126 | [1.08267860e-02, 1.34797198e01, 3.61534897e01, 2.61630325e01], 127 | [5.32110000e-05, 2.46473831e01, 2.15826359e01, 2.14454579e01], 128 | [1.17260969e-01, 5.28546845e00, 5.27078346e00, 3.58401509e00], 129 | [6.33926060e-02, 6.84399996e00, 6.17934020e00, 2.10581177e01], 130 | [1.55350772e-01, 4.10600004e00, 2.75975796e00, 5.25223529e00], 131 | [9.58688000e-02, 4.50820000e01, 3.76359420e01, 2.26702353e01], 132 | [5.30811070e-02, 6.15000000e00, 6.95245150e00, 7.27400001e00], 133 | [1.63475200e-03, 3.37540001e01, 4.11037607e01, 3.66458824e01], 134 | [2.37097130e-02, 4.07200012e00, 5.37134427e00, 7.10294116e00], 135 | [1.80712657e-01, 2.46889999e02, 1.96283316e02, 9.74682350e01], 136 | ], 137 | columns=["V", "treated", "synthetic", "sample mean"], 138 | index=[ 139 | "school.illit", 140 | 
"school.prim", 141 | "school.med", 142 | "school.high", 143 | "school.post.high", 144 | "invest", 145 | "special.1.gdpcap", 146 | "special.2.sec.agriculture", 147 | "special.3.sec.energy", 148 | "special.4.sec.industry", 149 | "special.5.sec.construction", 150 | "special.6.sec.services.venta", 151 | "special.7.sec.services.nonventa", 152 | "special.8.popdens", 153 | ], 154 | ) 155 | self.treatment_time = 1975 156 | self.pvalue = 0.16666666666666666 157 | self.att = {"att": -0.6995647842110987, "se": 0.07078092130438395} 158 | self.att_time_period = range(1975, 1998) 159 | self.mspe = 0.008864544955047298 160 | self.mape = 0.016928135318837897 161 | self.mae = 0.08777554288632104 162 | 163 | def test_weights(self): 164 | synth = Synth() 165 | synth.fit( 166 | dataprep=self.dataprep, 167 | optim_method=self.optim_method, 168 | optim_initial=self.optim_initial, 169 | ) 170 | weights = pd.Series(self.weights, name="weights") 171 | # Allow a tolerance of 2.5% 172 | pd.testing.assert_series_equal( 173 | weights, synth.weights(round=9), check_exact=False, atol=0.025 174 | ) 175 | pd.testing.assert_frame_equal( 176 | self.summary, synth.summary(round=9), check_exact=False, atol=0.025 177 | ) 178 | 179 | def test_placebo_weights(self): 180 | synth = Synth() 181 | placebo_test = PlaceboTest() 182 | placebo_test.fit( 183 | dataprep=self.dataprep, 184 | scm=synth, 185 | scm_options={ 186 | "optim_method": self.optim_method, 187 | "optim_initial": self.optim_initial, 188 | }, 189 | ) 190 | 191 | placebo_gaps = pd.DataFrame.from_dict(self.placebo_gaps).rename_axis( 192 | index="year" 193 | ) 194 | regions = self.placebo_gaps.keys() 195 | years = list(self.placebo_gaps["Cataluna"].keys()) 196 | pd.testing.assert_frame_equal( 197 | placebo_gaps, 198 | placebo_test.gaps[regions].loc[years], 199 | check_exact=False, 200 | atol=0.025, 201 | ) 202 | self.assertAlmostEqual( 203 | self.pvalue, 204 | placebo_test.pvalue(treatment_time=self.treatment_time), 205 | places=3, 206 | ) 207 | 208 | def test_att(self): 209 | synth = Synth() 210 | synth.fit( 211 | dataprep=self.dataprep, 212 | optim_method=self.optim_method, 213 | optim_initial=self.optim_initial, 214 | ) 215 | synth_att = synth.att(time_period=self.att_time_period) 216 | 217 | # Allow a tolerance of 2.5% 218 | att_perc_delta = abs(1.0 - self.att["att"] / synth_att["att"]) 219 | self.assertLessEqual(att_perc_delta, 0.025) 220 | 221 | # Allow a tolerance of 2.5% 222 | se_perc_delta = abs(1.0 - self.att["se"] / synth_att["se"]) 223 | self.assertLessEqual(se_perc_delta, 0.025) 224 | 225 | def test_metric_values(self): 226 | synth = Synth() 227 | synth.fit( 228 | dataprep=self.dataprep, 229 | optim_method=self.optim_method, 230 | optim_initial=self.optim_initial, 231 | ) 232 | 233 | # Allow a tolerance of 2.5% 234 | mspe_perc_delta = abs(1.0 - self.mspe / synth.mspe()) 235 | self.assertLessEqual(mspe_perc_delta, 0.025) 236 | mape_perc_delta = abs(1.0 - self.mape / synth.mape()) 237 | self.assertLessEqual(mape_perc_delta, 0.025) 238 | mae_perc_delta = abs(1.0 - self.mae / synth.mae()) 239 | self.assertLessEqual(mae_perc_delta, 0.025) 240 | -------------------------------------------------------------------------------- /tests/test_synth_germany.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, Synth 5 | 6 | 7 | class TestSynthGermany(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/germany.csv") 10 | dataprep_train = 
Dataprep( 11 | foo=df, 12 | predictors=["gdp", "trade", "infrate"], 13 | predictors_op="mean", 14 | time_predictors_prior=range(1971, 1981), 15 | special_predictors=[ 16 | ("industry", range(1971, 1981), "mean"), 17 | ("schooling", [1970, 1975], "mean"), 18 | ("invest70", [1980], "mean"), 19 | ], 20 | dependent="gdp", 21 | unit_variable="country", 22 | time_variable="year", 23 | treatment_identifier="West Germany", 24 | controls_identifier=[ 25 | "USA", 26 | "UK", 27 | "Austria", 28 | "Belgium", 29 | "Denmark", 30 | "France", 31 | "Italy", 32 | "Netherlands", 33 | "Norway", 34 | "Switzerland", 35 | "Japan", 36 | "Greece", 37 | "Portugal", 38 | "Spain", 39 | "Australia", 40 | "New Zealand", 41 | ], 42 | time_optimize_ssr=range(1981, 1991), 43 | ) 44 | synth_train = Synth() 45 | synth_train.fit( 46 | dataprep=dataprep_train, optim_method="Nelder-Mead", optim_initial="equal" 47 | ) 48 | self.custom_V = synth_train.V 49 | 50 | self.dataprep = Dataprep( 51 | foo=df, 52 | predictors=["gdp", "trade", "infrate"], 53 | predictors_op="mean", 54 | time_predictors_prior=range(1981, 1991), 55 | special_predictors=[ 56 | ("industry", range(1981, 1991), "mean"), 57 | ("schooling", [1980, 1985], "mean"), 58 | ("invest80", [1980], "mean"), 59 | ], 60 | dependent="gdp", 61 | unit_variable="country", 62 | time_variable="year", 63 | treatment_identifier="West Germany", 64 | controls_identifier=[ 65 | "USA", 66 | "UK", 67 | "Austria", 68 | "Belgium", 69 | "Denmark", 70 | "France", 71 | "Italy", 72 | "Netherlands", 73 | "Norway", 74 | "Switzerland", 75 | "Japan", 76 | "Greece", 77 | "Portugal", 78 | "Spain", 79 | "Australia", 80 | "New Zealand", 81 | ], 82 | time_optimize_ssr=range(1960, 1990), 83 | ) 84 | 85 | self.optim_method = "Nelder-Mead" 86 | self.optim_initial = "equal" 87 | self.weights = { 88 | "USA": 0.21624982, 89 | "UK": 0.0, 90 | "Austria": 0.414522077, 91 | "Belgium": 0.0, 92 | "Denmark": 0.0, 93 | "France": 0.0, 94 | "Italy": 0.0, 95 | "Netherlands": 0.09841208, 96 | "Norway": 0.0, 97 | "Switzerland": 0.107654851, 98 | "Japan": 0.163161172, 99 | "Greece": 0.0, 100 | "Portugal": 0.0, 101 | "Spain": 0.0, 102 | "Australia": 0.0, 103 | "New Zealand": 0.0, 104 | } 105 | self.att = {"att": -1555.1346777620479, "se": 317.6469306023242} 106 | self.att_time_period = range(1990, 2004) 107 | self.cis = { 108 | "value": { 109 | 1991: 279.09685975333196, 110 | 1992: 99.76203427529981, 111 | 1993: -631.5437231770848, 112 | 1994: -1050.2679900905205, 113 | 1995: -1205.2549226793199, 114 | 1996: -1467.2491625958974, 115 | 1997: -1954.3741689815615, 116 | 1998: -2008.3960300490326, 117 | 1999: -2160.627036515649, 118 | 2000: -2620.7330909274606, 119 | }, 120 | "lower_ci": { 121 | 1991: 43.148688105431994, 122 | 1992: -136.18613737260014, 123 | 1993: -867.4918948249846, 124 | 1994: -1286.2161617384206, 125 | 1995: -1441.20309432722, 126 | 1996: -1703.1973342437975, 127 | 1997: -2190.3223406294615, 128 | 1998: -2244.3442016969325, 129 | 1999: -2396.5752081635487, 130 | 2000: -2856.6812625753605, 131 | }, 132 | "upper_ci": { 133 | 1991: 515.0450314012319, 134 | 1992: 335.7102059231998, 135 | 1993: -395.59555152918483, 136 | 1994: -814.3198184426207, 137 | 1995: -969.3067510314198, 138 | 1996: -1231.3009909479972, 139 | 1997: -1718.4259973336614, 140 | 1998: -1772.4478584011324, 141 | 1999: -1924.6788648677486, 142 | 2000: -2384.7849192795607, 143 | }, 144 | } 145 | self.ci_args = { 146 | "alpha": 0.05, 147 | "time_periods": [ 148 | 1991, 149 | 1992, 150 | 1993, 151 | 1994, 152 | 1995, 153 | 1996, 154 | 1997, 155 | 

    def test_weights(self):
        synth = Synth()
        synth.fit(
            dataprep=self.dataprep,
            optim_method=self.optim_method,
            optim_initial=self.optim_initial,
            custom_V=self.custom_V,
        )
        weights = pd.Series(self.weights, name="weights")
        # Allow a tolerance of 2.5%
        pd.testing.assert_series_equal(
            weights, synth.weights(round=9), check_exact=False, atol=0.025
        )

    def test_att(self):
        synth = Synth()
        synth.fit(
            dataprep=self.dataprep,
            optim_method=self.optim_method,
            optim_initial=self.optim_initial,
            custom_V=self.custom_V,
        )
        synth_att = synth.att(time_period=self.att_time_period)

        # Allow a tolerance of 2.5%
        att_perc_delta = abs(1.0 - self.att["att"] / synth_att["att"])
        self.assertLessEqual(att_perc_delta, 0.025)

        # Allow a tolerance of 2.5%
        se_perc_delta = abs(1.0 - self.att["se"] / synth_att["se"])
        self.assertLessEqual(se_perc_delta, 0.025)

    def test_cis(self):
        synth = Synth()
        synth.fit(
            dataprep=self.dataprep,
            optim_method=self.optim_method,
            optim_initial=self.optim_initial,
            custom_V=self.custom_V,
        )

        cis = pd.DataFrame.from_dict(self.cis)
        cis.index.name = "time"
        # Allow a tolerance of 2.5%
        pd.testing.assert_frame_equal(
            cis,
            synth.confidence_interval(custom_V=self.custom_V, **self.ci_args),
            check_exact=False,
            atol=0.025,
        )
--------------------------------------------------------------------------------
/tests/test_synth_texas.py:
--------------------------------------------------------------------------------
import unittest
import pandas as pd

from pysyncon import Dataprep, Synth


class TestSynthTexas(unittest.TestCase):
    def setUp(self):
        df = pd.read_csv("./data/texas.csv")
        self.dataprep = Dataprep(
            foo=df,
            predictors=["income", "ur", "poverty"],
            predictors_op="mean",
            time_predictors_prior=range(1985, 1994),
            special_predictors=[
                ("bmprison", [1988], "mean"),
                ("bmprison", [1990], "mean"),
                ("bmprison", [1991], "mean"),
                ("bmprison", [1992], "mean"),
                ("alcohol", [1990], "mean"),
                ("aidscapita", [1990], "mean"),
                ("aidscapita", [1991], "mean"),
                ("black", [1990], "mean"),
                ("black", [1991], "mean"),
                ("black", [1992], "mean"),
                ("perc1519", [1990], "mean"),
            ],
            dependent="bmprison",
            unit_variable="state",
            time_variable="year",
            treatment_identifier="Texas",
            controls_identifier=[
                "Alabama",
                "Alaska",
                "Arizona",
                "Arkansas",
                "California",
                "Colorado",
                "Connecticut",
                "Delaware",
                "District of Columbia",
                "Florida",
                "Georgia",
                "Hawaii",
                "Idaho",
                "Illinois",
                "Indiana",
                "Iowa",
                "Kansas",
                "Kentucky",
                "Louisiana",
                "Maine",
                "Maryland",
                "Massachusetts",
                "Michigan",
                "Minnesota",
                "Mississippi",
                "Missouri",
                "Montana",
                "Nebraska",
                "Nevada",
                "New Hampshire",
                "New Jersey",
                "New Mexico",
                "New York",
                "North Carolina",
                "North Dakota",
                "Ohio",
                "Oklahoma",
                "Oregon",
                "Pennsylvania",
                "Rhode Island",
                "South Carolina",
                "South Dakota",
                "Tennessee",
                "Utah",
                "Vermont",
                "Virginia",
                "Washington",
                "West Virginia",
                "Wisconsin",
                "Wyoming",
            ],
            time_optimize_ssr=range(1985, 1994),
        )
        self.optim_method = "BFGS"
        self.optim_initial = "ols"
        self.weights = {
            "Alabama": 0.0,
            "Alaska": 0.0,
            "Arizona": 0.0,
            "Arkansas": 0.0,
            "California": 0.407651414,
            "Colorado": 0.0,
            "Connecticut": 0.0,
            "Delaware": 0.0,
            "District of Columbia": 0.0,
            "Florida": 0.110543548,
            "Georgia": 0.0,
            "Hawaii": 0.0,
            "Idaho": 0.0,
            "Illinois": 0.36027434,
            "Indiana": 0.0,
            "Iowa": 0.0,
            "Kansas": 0.0,
            "Kentucky": 0.0,
            "Louisiana": 0.121530698,
            "Maine": 0.0,
            "Maryland": 0.0,
            "Massachusetts": 0.0,
            "Michigan": 0.0,
            "Minnesota": 0.0,
            "Mississippi": 0.0,
            "Missouri": 0.0,
            "Montana": 0.0,
            "Nebraska": 0.0,
            "Nevada": 0.0,
            "New Hampshire": 0.0,
            "New Jersey": 0.0,
            "New Mexico": 0.0,
            "New York": 0.0,
            "North Carolina": 0.0,
            "North Dakota": 0.0,
            "Ohio": 0.0,
            "Oklahoma": 0.0,
            "Oregon": 0.0,
            "Pennsylvania": 0.0,
            "Rhode Island": 0.0,
            "South Carolina": 0.0,
            "South Dakota": 0.0,
            "Tennessee": 0.0,
            "Utah": 0.0,
            "Vermont": 0.0,
            "Virginia": 0.0,
            "Washington": 0.0,
            "West Virginia": 0.0,
            "Wisconsin": 0.0,
            "Wyoming": 0.0,
        }
        self.att = {"att": 20339.375838131393, "se": 3190.4946788704715}
        self.att_time_period = range(1993, 2001)
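
    # Each single-year special predictor above, e.g. ("bmprison", [1988], "mean"),
    # pins that year's value of the series as a separate matching variable. A
    # purely illustrative sketch of generating such entries programmatically:
    #
    #     yearly = [("bmprison", [y], "mean") for y in (1988, 1990, 1991, 1992)]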
"Wisconsin", 82 | "Wyoming", 83 | ], 84 | time_optimize_ssr=range(1985, 1994), 85 | ) 86 | self.optim_method = "BFGS" 87 | self.optim_initial = "ols" 88 | self.weights = { 89 | "Alabama": 0.0, 90 | "Alaska": 0.0, 91 | "Arizona": 0.0, 92 | "Arkansas": 0.0, 93 | "California": 0.407651414, 94 | "Colorado": 0.0, 95 | "Connecticut": 0.0, 96 | "Delaware": 0.0, 97 | "District of Columbia": 0.0, 98 | "Florida": 0.110543548, 99 | "Georgia": 0.0, 100 | "Hawaii": 0.0, 101 | "Idaho": 0.0, 102 | "Illinois": 0.36027434, 103 | "Indiana": 0.0, 104 | "Iowa": 0.0, 105 | "Kansas": 0.0, 106 | "Kentucky": 0.0, 107 | "Louisiana": 0.121530698, 108 | "Maine": 0.0, 109 | "Maryland": 0.0, 110 | "Massachusetts": 0.0, 111 | "Michigan": 0.0, 112 | "Minnesota": 0.0, 113 | "Mississippi": 0.0, 114 | "Missouri": 0.0, 115 | "Montana": 0.0, 116 | "Nebraska": 0.0, 117 | "Nevada": 0.0, 118 | "New Hampshire": 0.0, 119 | "New Jersey": 0.0, 120 | "New Mexico": 0.0, 121 | "New York": 0.0, 122 | "North Carolina": 0.0, 123 | "North Dakota": 0.0, 124 | "Ohio": 0.0, 125 | "Oklahoma": 0.0, 126 | "Oregon": 0.0, 127 | "Pennsylvania": 0.0, 128 | "Rhode Island": 0.0, 129 | "South Carolina": 0.0, 130 | "South Dakota": 0.0, 131 | "Tennessee": 0.0, 132 | "Utah": 0.0, 133 | "Vermont": 0.0, 134 | "Virginia": 0.0, 135 | "Washington": 0.0, 136 | "West Virginia": 0.0, 137 | "Wisconsin": 0.0, 138 | "Wyoming": 0.0, 139 | } 140 | self.att = {"att": 20339.375838131393, "se": 3190.4946788704715} 141 | self.att_time_period = range(1993, 2001) 142 | 143 | def test_weights(self): 144 | synth = Synth() 145 | synth.fit( 146 | dataprep=self.dataprep, 147 | optim_method=self.optim_method, 148 | optim_initial=self.optim_initial, 149 | ) 150 | weights = pd.Series(self.weights, name="weights") 151 | # Allow a tolerance of 2.5% 152 | pd.testing.assert_series_equal( 153 | weights, synth.weights(round=9), check_exact=False, atol=0.025 154 | ) 155 | 156 | def test_att(self): 157 | synth = Synth() 158 | synth.fit( 159 | dataprep=self.dataprep, 160 | optim_method=self.optim_method, 161 | optim_initial=self.optim_initial, 162 | ) 163 | synth_att = synth.att(time_period=self.att_time_period) 164 | 165 | # Allow a tolerance of 2.5% 166 | att_perc_delta = abs(1.0 - self.att["att"] / synth_att["att"]) 167 | self.assertLessEqual(att_perc_delta, 0.025) 168 | 169 | # Allow a tolerance of 2.5% 170 | se_perc_delta = abs(1.0 - self.att["se"] / synth_att["se"]) 171 | self.assertLessEqual(se_perc_delta, 0.025) 172 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from pysyncon import Dataprep, Synth 7 | from pysyncon.utils import HoldoutSplitter, CrossValidationResult, PlaceboTest 8 | 9 | 10 | class TestHoldoutSplitter(unittest.TestCase): 11 | def test_values(self): 12 | cases = [(3, 3, 1), (3, 3, 2), (5, 1, 1), (5, 1, 2)] 13 | for case in cases: 14 | with self.subTest(case=case): 15 | rows, columns, holdout = case 16 | df = pd.DataFrame(np.random.random(size=(rows, columns))) 17 | ser = pd.Series(np.random.random(size=rows)) 18 | 19 | iter_len = 0 20 | for df_, df_h, ser_, ser_h in HoldoutSplitter( 21 | df=df, ser=ser, holdout_len=holdout 22 | ): 23 | self.assertIsInstance(df_, pd.DataFrame) 24 | pd.testing.assert_frame_equal( 25 | df_, 26 | df.drop(index=df.index[iter_len : iter_len + holdout,]), 27 | ) 28 | 29 | 


class TestCrossValidationResult(unittest.TestCase):
    def test_best_lambda(self):
        cases = [1, 2, 3, 10]
        for case in cases:
            with self.subTest(case=case):
                cv_result = CrossValidationResult(
                    lambdas=np.random.random(size=case),
                    errors_mean=np.random.random(size=case),
                    errors_se=np.random.random(size=case),
                )

                best_lambda = cv_result.best_lambda()
                min_mean = cv_result.errors_mean.min()
                min_mean_idx = cv_result.errors_mean.argmin()
                min_mean_se = cv_result.errors_se[min_mean_idx]
                self.assertEqual(
                    best_lambda,
                    cv_result.lambdas[cv_result.errors_mean <= min_mean + min_mean_se]
                    .max()
                    .item(),
                )

                best_lambda = cv_result.best_lambda(min_1se=False)
                min_mean_idx = cv_result.errors_mean.argmin()
                self.assertEqual(best_lambda, cv_result.lambdas[min_mean_idx].item())

    @patch("pysyncon.utils.plt")
    def test_result_plot(self, mock_plt: Mock):
        cv_result = CrossValidationResult(
            lambdas=np.random.random(size=10),
            errors_mean=np.random.random(size=10),
            errors_se=np.random.random(size=10),
        )
        cv_result.plot()

        self.assertEqual(mock_plt.errorbar.call_count, 1)
        _, kwargs = mock_plt.errorbar.call_args
        self.assertEqual(kwargs["ecolor"], "black")
        self.assertEqual(kwargs["capsize"], 2)

        mock_plt.xlabel.assert_called_with("Lambda")
        mock_plt.ylabel.assert_called_with("Mean error")
        mock_plt.xscale.assert_called_with("log")
        mock_plt.yscale.assert_called_with("log")
        mock_plt.title.assert_called_with("Cross validation result")
        mock_plt.grid.assert_called()
        mock_plt.show.assert_called()
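

def _one_se_lambda_demo() -> float:
    """Illustrative sketch only (helper name hypothetical, not used by the
    test suite): ``best_lambda`` applies the one-standard-error rule checked
    in the test above -- it returns the largest lambda whose mean error is
    within one standard error of the smallest mean error."""
    cv = CrossValidationResult(
        lambdas=np.array([0.1, 1.0, 10.0]),
        errors_mean=np.array([0.5, 0.4, 0.45]),
        errors_se=np.array([0.1, 0.1, 0.1]),
    )
    # Minimum mean error is 0.4 with SE 0.1, so any lambda with mean error
    # <= 0.5 qualifies; the largest such lambda is 10.0.
    return cv.best_lambda()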


class TestPlaceboTests(unittest.TestCase):
    def setUp(self):
        # 1 -> treated, (2, 3) -> controls
        self.dataprep = Dataprep(
            foo=pd.DataFrame(
                {
                    "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4],
                    "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
                    "dependent": np.random.random(12),
                    "predictor1": np.random.random(12),
                    "predictor2": np.random.random(12),
                }
            ),
            predictors=["predictor1"],
            predictors_op="mean",
            dependent="dependent",
            unit_variable="name",
            time_variable="time",
            treatment_identifier=1,
            controls_identifier=[2, 3],
            time_predictors_prior=[2, 3],
            time_optimize_ssr=[1, 2, 3],
            special_predictors=[
                ("predictor1", [2], "mean"),
                ("predictor2", [1, 2], "median"),
                ("predictor2", [1, 2], "std"),
            ],
        )
        self.synth = Synth()
        self.synth.fit(dataprep=self.dataprep)

        self.placebo_test = PlaceboTest()
        self.placebo_test.fit(dataprep=self.dataprep, scm=self.synth)

    @patch("pysyncon.utils.plt")
    def test_gaps_plot(self, mock_plt: Mock):
        self.placebo_test.gaps_plot()

        self.assertEqual(mock_plt.plot.call_count, 2)
        _, kwargs = mock_plt.plot.call_args
        self.assertEqual(kwargs["color"], "black")
        self.assertEqual(kwargs["alpha"], 1.0)
        mock_plt.axvline.assert_not_called()
        mock_plt.grid.assert_called()

    @patch("pysyncon.utils.plt")
    def test_gaps_plot_axvline(self, mock_plt: Mock):
        self.placebo_test.gaps_plot(treatment_time=3)

        mock_plt.axvline.assert_called()
        _, kwargs = mock_plt.axvline.call_args
        self.assertEqual(kwargs["ymin"], 0.05)
        self.assertEqual(kwargs["ymax"], 0.95)
        self.assertEqual(kwargs["linestyle"], "dashed")

    @patch("pysyncon.utils.plt")
    def test_gaps_plot_mspe_threshold(self, mock_plt: Mock):
        self.placebo_test.gaps_plot(treatment_time=3, mspe_threshold=1)
--------------------------------------------------------------------------------